注意
转到末尾下载完整的示例代码。
使用 Datashader 对 Fashion MNIST 数字数据集进行 UMAP
这是在 Fashion-MNIST 数据集上使用 UMAP 的一个简单示例。此示例的主要目标是演示 datashader 作为可视化 UMAP 结果的有效工具的用法。特别是,datashader 允许可视化非常大的数据集,在这种情况下,过度绘制可能是一个严重的问题。它支持按分类变量(如此示例所示)、连续变量或密度(如 datashader 示例中常见的那样)进行着色。
import umap
import numpy as np
import pandas as pd
import requests
import os
import datashader as ds
import datashader.utils as utils
import datashader.transfer_functions as tf
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="paper", style="white")
if not os.path.isfile("fashion-mnist.csv"):
csv_data = requests.get("https://www.openml.org/data/get_csv/18238735/phpnBqZGZ")
with open("fashion-mnist.csv", "w") as f:
f.write(csv_data.text)
source_df = pd.read_csv("fashion-mnist.csv")
data = source_df.iloc[:, :784].values.astype(np.float32)
target = source_df["class"].values
pal = [
"#9e0142",
"#d8434e",
"#f67a49",
"#fdbf6f",
"#feeda1",
"#f1f9a9",
"#bfe5a0",
"#74c7a5",
"#378ebb",
"#5e4fa2",
]
color_key = {str(d): c for d, c in enumerate(pal)}
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(data)
df = pd.DataFrame(embedding, columns=("x", "y"))
df["class"] = pd.Series([str(x) for x in target], dtype="category")
cvs = ds.Canvas(plot_width=400, plot_height=400)
agg = cvs.points(df, "x", "y", ds.count_cat("class"))
img = tf.shade(agg, color_key=color_key, how="eq_hist")
utils.export_image(img, filename="fashion-mnist", background="black")
image = plt.imread("fashion-mnist.png")
fig, ax = plt.subplots(figsize=(6, 6))
plt.imshow(image)
plt.setp(ax, xticks=[], yticks=[])
plt.title(
"Fashion MNIST data embedded\n"
"into two dimensions by UMAP\n"
"visualised with Datashader",
fontsize=12,
)
plt.show()