注意
转到末尾 下载完整示例代码。
在 Galaxy10SDSS 数据集上使用 UMAP
这是一个在 Galaxy10SDSS 数据集上使用 UMAP 的示例。此示例的主要目标是展示如何将监督学习作为一种有效工具,用于可视化和降维复杂数据。此外,还使用 hdbscan 对处理后的数据进行分类。
import numpy as np
import h5py
import matplotlib.pyplot as plt
import umap
import os
# from sklearn.model_selection import train_test_split
import math
import requests
# libraries for clustering
import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
if not os.path.isfile("Galaxy10.h5"):
url = "http://astro.utoronto.ca/~bovy/Galaxy10/Galaxy10.h5"
r = requests.get(url, allow_redirects=True)
open("Galaxy10.h5", "wb").write(r.content)
# To get the images and labels from file
with h5py.File("Galaxy10.h5", "r") as F:
images = np.array(F["images"])
labels = np.array(F["ans"])
X_train = np.empty([math.floor(len(labels) / 100), 14283], dtype=np.float64)
y_train = np.empty([math.floor(len(labels) / 100)], dtype=np.float64)
X_test = X_train
y_test = y_train
# Get a subset of the data
for i in range(math.floor(len(labels) / 100)):
X_train[i, :] = np.array(np.ndarray.flatten(images[i, :, :, :]), dtype=np.float64)
y_train[i] = labels[i]
X_test[i, :] = np.array(
np.ndarray.flatten(images[i + math.floor(len(labels) / 100), :, :, :]),
dtype=np.float64,
)
y_test[i] = labels[i + math.floor(len(labels) / 100)]
# Plot distribution
classes, frequency = np.unique(y_train, return_counts=True)
fig = plt.figure(1, figsize=(4, 4))
plt.clf()
plt.bar(classes, frequency)
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.title("Data Subset")
plt.savefig("galaxy10_subset.svg")
# 2D Embedding
## UMAP
reducer = umap.UMAP(
n_components=2, n_neighbors=20, random_state=42, transform_seed=42, verbose=False
)
reducer.fit(X_train)
galaxy10_umap = reducer.transform(X_train)
fig = plt.figure(1, figsize=(4, 4))
plt.clf()
plt.scatter(
galaxy10_umap[:, 0],
galaxy10_umap[:, 1],
c=y_train,
cmap=plt.cm.nipy_spectral,
edgecolor="k",
label=y_train,
)
plt.colorbar(boundaries=np.arange(11) - 0.5).set_ticks(np.arange(10))
plt.savefig("galaxy10_2D_umap.svg")
### UMAP - Supervised
reducer = umap.UMAP(
n_components=2, n_neighbors=15, random_state=42, transform_seed=42, verbose=False
)
reducer.fit(X_train, y_train)
galaxy10_umap_supervised = reducer.transform(X_train)
fig = plt.figure(1, figsize=(4, 4))
plt.clf()
plt.scatter(
galaxy10_umap_supervised[:, 0],
galaxy10_umap_supervised[:, 1],
c=y_train,
cmap=plt.cm.nipy_spectral,
edgecolor="k",
label=y_train,
)
plt.colorbar(boundaries=np.arange(11) - 0.5).set_ticks(np.arange(10))
plt.savefig("galaxy10_2D_umap_supervised.svg")
### UMAP - Supervised prediction
galaxy10_umap_supervised_prediction = reducer.transform(X_test)
fig = plt.figure(1, figsize=(4, 4))
plt.clf()
plt.scatter(
galaxy10_umap_supervised_prediction[:, 0],
galaxy10_umap_supervised_prediction[:, 1],
c=y_test,
cmap=plt.cm.nipy_spectral,
edgecolor="k",
label=y_test,
)
plt.colorbar(boundaries=np.arange(11) - 0.5).set_ticks(np.arange(10))
plt.savefig("galaxy10_2D_umap_supervised_prediction.svg")
# cluster the data
labels = hdbscan.HDBSCAN(
min_samples=10,
min_cluster_size=10,
).fit_predict(galaxy10_umap_supervised_prediction)
clustered = labels >= 0
fig = plt.figure(1, figsize=(4, 4))
plt.clf()
plt.scatter(
galaxy10_umap_supervised_prediction[~clustered, 0],
galaxy10_umap_supervised_prediction[~clustered, 1],
color=(0.5, 0.5, 0.5),
alpha=0.5,
)
plt.scatter(
galaxy10_umap_supervised_prediction[clustered, 0],
galaxy10_umap_supervised_prediction[clustered, 1],
c=y_test[clustered],
cmap=plt.cm.nipy_spectral,
edgecolor="k",
label=y_test[clustered],
)
plt.colorbar(boundaries=np.arange(11) - 0.5).set_ticks(np.arange(10))
plt.savefig("galaxy10_2D_umap_supervised_prediction_clustered.svg")
# Print out information on quality of clustering
print("2D Supervised Embedding with Clustering")
print(adjusted_rand_score(y_test, labels), adjusted_mutual_info_score(y_test, labels))
print(
adjusted_rand_score(y_test[clustered], labels[clustered]),
adjusted_mutual_info_score(y_test[clustered], labels[clustered]),
)
print(np.sum(clustered) / y_test.shape[0])
# 3D Embedding
## UMAP
reducer = umap.UMAP(
n_components=3, n_neighbors=20, random_state=42, transform_seed=42, verbose=False
)
reducer.fit(X_train)
galaxy10_umap = reducer.transform(X_train)
fig = plt.figure(1, figsize=(4, 4))
plt.clf()
ax = fig.add_subplot(projection="3d")
p = ax.scatter(
galaxy10_umap[:, 0],
galaxy10_umap[:, 1],
galaxy10_umap[:, 2],
c=y_train,
cmap=plt.cm.nipy_spectral,
edgecolor="k",
label=y_train,
)
fig.colorbar(p, ax=ax, boundaries=np.arange(11) - 0.5).set_ticks(np.arange(10))
plt.savefig("galaxy10_3D_umap.svg")
## UMAP - Supervised
reducer = umap.UMAP(
n_components=3, n_neighbors=20, random_state=42, transform_seed=42, verbose=False
)
reducer.fit(X_train, y_train)
galaxy10_umap_supervised = reducer.transform(X_train)
fig = plt.figure(1, figsize=(4, 4))
plt.clf()
ax = fig.add_subplot(projection="3d")
p = ax.scatter(
galaxy10_umap_supervised[:, 0],
galaxy10_umap_supervised[:, 1],
galaxy10_umap_supervised[:, 2],
c=y_train,
cmap=plt.cm.nipy_spectral,
edgecolor="k",
label=y_train,
)
fig.colorbar(p, ax=ax, boundaries=np.arange(11) - 0.5).set_ticks(np.arange(10))
plt.savefig("galaxy10_3D_umap_supervised.svg")
## UMAP - Supervised prediction
galaxy10_umap_supervised_prediction = reducer.transform(X_test)
fig = plt.figure(1, figsize=(4, 4))
plt.clf()
ax = fig.add_subplot(projection="3d")
p = ax.scatter(
galaxy10_umap_supervised_prediction[:, 0],
galaxy10_umap_supervised_prediction[:, 1],
galaxy10_umap_supervised_prediction[:, 2],
c=y_test,
cmap=plt.cm.nipy_spectral,
edgecolor="k",
label=y_test,
)
fig.colorbar(p, ax=ax, boundaries=np.arange(11) - 0.5).set_ticks(np.arange(10))
plt.savefig("galaxy10_3D_umap_supervised_prediction.svg")
# cluster the data
labels = hdbscan.HDBSCAN(
min_samples=10,
min_cluster_size=10,
).fit_predict(galaxy10_umap_supervised_prediction)
clustered = labels >= 0
fig = plt.figure(1, figsize=(4, 4))
plt.clf()
ax = fig.add_subplot(projection="3d")
q = ax.scatter(
galaxy10_umap_supervised_prediction[~clustered, 0],
galaxy10_umap_supervised_prediction[~clustered, 1],
galaxy10_umap_supervised_prediction[~clustered, 2],
color=(0.5, 0.5, 0.5),
alpha=0.5,
)
p = ax.scatter(
galaxy10_umap_supervised_prediction[clustered, 0],
galaxy10_umap_supervised_prediction[clustered, 1],
galaxy10_umap_supervised_prediction[clustered, 2],
c=y_test[clustered],
cmap=plt.cm.nipy_spectral,
edgecolor="k",
label=y_test[clustered],
)
fig.colorbar(p, ax=ax, boundaries=np.arange(11) - 0.5).set_ticks(np.arange(10))
plt.savefig("galaxy10_3D_umap_supervised_prediction_clustered.svg")
# Print out information on quality of clustering
print("3D Supervised Embedding with Clustering")
print(adjusted_rand_score(y_test, labels), adjusted_mutual_info_score(y_test, labels))
print(
adjusted_rand_score(y_test[clustered], labels[clustered]),
adjusted_mutual_info_score(y_test[clustered], labels[clustered]),
)
print(np.sum(clustered) / y_test.shape[0])
# Dimensions 4 to 25
for dimensions in range(4, 26):
reducer = umap.UMAP(
n_components=dimensions,
n_neighbors=20,
random_state=42,
transform_seed=42,
verbose=False,
)
reducer.fit(X_train, y_train)
galaxy10_umap_supervised = reducer.transform(X_train)
# UMAP - Supervised prediction
galaxy10_umap_supervised_prediction = reducer.transform(X_test)
# cluster the data
labels = hdbscan.HDBSCAN(
min_samples=10,
min_cluster_size=10,
).fit_predict(galaxy10_umap_supervised_prediction)
clustered = labels >= 0
# Print out information on quality of clustering
print(str(dimensions) + "D Supervised Embedding with Clustering")
print(
adjusted_rand_score(y_test, labels), adjusted_mutual_info_score(y_test, labels)
)
print(
adjusted_rand_score(y_test[clustered], labels[clustered]),
adjusted_mutual_info_score(y_test[clustered], labels[clustered]),
)
print(np.sum(clustered) / y_test.shape[0])