Commit 7b74d2c5 authored by Christof Kaufmann

Notebooks from applied-cs/data-science@152f2534

parent 657c213a
@@ -33,8 +33,10 @@ from scipy.optimize import linear_sum_assignment
 # # Install these from the terminal with:
 # mamba install hdbscan scikit-learn-extra
+# pip install hdbscan scikit-learn-extra
 # # Install these from within this script with:
-# !mamba install hdbscan scikit-learn-extra
+# !mamba install -y hdbscan scikit-learn-extra
+# !pip install hdbscan scikit-learn-extra
 from sklearn_extra.cluster import CommonNNClustering
 # from sklearn.cluster import HDBSCAN  # does not have soft clustering and tree plot
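These optional packages are only needed for CommonNN and the richer HDBSCAN features. A minimal smoke test, assuming both installs succeeded; the toy data and parameter values below are illustrative, not taken from the notebook:

import numpy as np
import hdbscan
from sklearn.datasets import make_blobs
from sklearn_extra.cluster import CommonNNClustering

X_demo, _ = make_blobs(n_samples=300, centers=3, random_state=0)
# both estimators follow the usual scikit-learn fit_predict API
print(CommonNNClustering(eps=1.0, min_samples=5).fit_predict(X_demo)[:10])
print(hdbscan.HDBSCAN(min_cluster_size=15).fit_predict(X_demo)[:10])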
@@ -45,9 +47,10 @@ import hdbscan
 # NOTE: use this only if in a jupyter environment.
 # %matplotlib widget
 # NOTE: use this only from spyder (or try from local VS Code)
-%matplotlib auto
+# %matplotlib auto

-# %% generate data with different densities and structure
+# %% data + helper functions
+# generate data with different densities and structure
 np.random.seed(42)
 n = 2000
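The data-generation lines themselves are collapsed in this diff. A sketch of what they presumably do, assuming make_blobs/make_moons and reusing the names (y_blobs, y_moons, X, y) visible in the next hunk header; the exact centers, cluster_std and noise values are guesses:

from sklearn.datasets import make_blobs, make_moons

# n is defined in the script above; parameter values are assumptions
X_blobs, y_blobs = make_blobs(n_samples=n, centers=3,
                              cluster_std=[0.5, 1.5, 0.3], random_state=42)
X_moons, y_moons = make_moons(n_samples=n // 2, noise=0.05, random_state=42)
X = np.vstack((X_blobs, X_moons))
y = np.hstack((y_blobs, y_moons + max(y_blobs) + 1))  # as in the hunk header below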
@@ -64,27 +67,31 @@ y = np.hstack((y_blobs, y_moons + max(y_blobs) + 1))
 cm = mpl.cm.Set3

-# %% match label to ground truth function to match colors optimally
-# also known as "Hungarian algorithm"
+# match label to ground truth function to match colors optimally (also known as "Hungarian algorithm")
 def match_labels(labels, return_mapper=False):
     noise = labels == -1
-    # get unique values and range labels starting from 0 or -1
+    # get unique values and range labels starting from 0 or -1 without gaps from merged clusters
     u, range_labels = np.unique(labels, return_inverse=True)
     num_clusters = len(u)
     if np.any(noise):
         num_clusters -= 1
         range_labels -= 1
+    # map cluster range labels to ground truth labels
     conf = confusion_matrix(range_labels[~noise], y[~noise])[:num_clusters]
     _, mapper = linear_sum_assignment(-conf)
     new_labels = mapper[range_labels]
-    new_labels[noise] = -1  # preserve noise
+    # preserve noise
+    new_labels[noise] = -1
     if return_mapper:
         return new_labels, mapper
     return new_labels

-# %% helper for cluster hist
+# helper for cluster hist plots
 def draw_cluster_hist(ax, labels, y=None):
     ax.clear()
     ax.set_yticks([])
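match_labels only permutes cluster ids so that colors line up with the ground truth. A tiny, made-up illustration of the Hungarian step (linear_sum_assignment on the negated confusion matrix); the numbers are invented:

import numpy as np
from scipy.optimize import linear_sum_assignment

conf_demo = np.array([[ 2, 95,  3],   # predicted cluster 0 vs. true classes
                      [90,  5,  5],   # predicted cluster 1
                      [ 4,  6, 90]])  # predicted cluster 2
_, mapper = linear_sum_assignment(-conf_demo)  # maximize matched counts
print(mapper)                          # [1 0 2]: cluster 0 -> class 1, cluster 1 -> class 0
print(mapper[np.array([0, 0, 1, 2])])  # relabeled predictions: [1 1 0 2]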
@@ -122,7 +129,7 @@ plt.show()
 # %% plot with k-means labels
 plt.close('all')
-fig_kmeans = plt.figure('k-Means', figsize=(8, 11))
+fig_kmeans = plt.figure('k-Means', figsize=(8, 8))
 ax_kmeans = fig_kmeans.add_subplot(2, 1, 1)
 sc_kmeans = ax_kmeans.scatter(*X.T, alpha=0.8)
 ax_kmeans.set_xticks([])
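The actual k-means fit and the widget callbacks are collapsed here. A sketch of the core step, assuming the objects defined above (X, cm, sc_kmeans, match_labels) and guessing 5 clusters (3 blobs + 2 moons):

from sklearn.cluster import KMeans

labels_km = KMeans(n_clusters=5, n_init=10, random_state=42).fit_predict(X)
labels_km = match_labels(labels_km)                   # align cluster ids with ground truth
sc_kmeans.set_facecolor(cm((labels_km + 1) % cm.N))   # same color scheme as the edge colors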
@@ -177,7 +184,7 @@ plt.show()
 # %% plot with DBSCAN labels
 plt.close('all')
-fig_dbscan = plt.figure('DBSCAN', figsize=(8, 11))
+fig_dbscan = plt.figure('DBSCAN', figsize=(8, 8))
 ax_dbscan = fig_dbscan.add_subplot(2, 1, 1)
 sc_dbscan = ax_dbscan.scatter(*X.T, alpha=0.8)
 sc_dbscan.set_edgecolor(cm((y + 1) % cm.N))
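The DBSCAN fitting code is likewise collapsed. A sketch with illustrative eps/min_samples values (not necessarily the notebook's):

from sklearn.cluster import DBSCAN

labels_db = match_labels(DBSCAN(eps=0.3, min_samples=10).fit_predict(X))
# noise keeps label -1, so (label + 1) % cm.N maps it to color index 0
sc_dbscan.set_facecolor(cm((labels_db + 1) % cm.N))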
@@ -263,7 +270,7 @@ plt.show()
 # %% plot with OPTICS (DBSCAN or Xi) labels
 plt.close('all')
-fig_optics = plt.figure('OPTICS', figsize=(8, 11))
+fig_optics = plt.figure('OPTICS', figsize=(8, 8))
 ax_optics_reachability = fig_optics.add_subplot(3, 1, 1)
 sc_optics_reachability = ax_optics_reachability.scatter([], [], alpha=0.6)
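The OPTICS computation that feeds the reachability subplot is collapsed. A sketch of what it presumably does, fitting once and then deriving either Xi-based or DBSCAN-style labels from the same fit (parameter values are illustrative):

from sklearn.cluster import OPTICS, cluster_optics_dbscan

opt = OPTICS(min_samples=10, xi=0.05).fit(X)
reach = opt.reachability_[opt.ordering_]             # reachability in visit order (first value is inf)
sc_optics_reachability.set_offsets(np.c_[np.arange(len(reach)), reach])
labels_xi = match_labels(opt.labels_)                # Xi-based clusters
labels_db = cluster_optics_dbscan(                   # DBSCAN-style cut at a fixed eps
    reachability=opt.reachability_, core_distances=opt.core_distances_,
    ordering=opt.ordering_, eps=0.3)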
@@ -372,7 +379,7 @@ is_standalone_hdbscan = hasattr(HDBSCAN, 'generate_prediction_data')
 subplots = 3 if is_standalone_hdbscan else 2
 button_labels = ['EoM/Leaf', 'Soft'] if is_standalone_hdbscan else ['EoM/Leaf']
-fig_hdbscan = plt.figure('HDBSCAN', figsize=(8, 11))
+fig_hdbscan = plt.figure('HDBSCAN', figsize=(8, 8))
 ax_hdbscan = fig_hdbscan.add_subplot(subplots, 1, 1)
 sc_hdbscan = ax_hdbscan.scatter(*X.T, alpha=0.8)
...
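The HDBSCAN fitting and the EoM/Leaf/Soft button callbacks are collapsed in this diff. A sketch of the underlying calls, assuming the standalone hdbscan package is available (min_cluster_size is illustrative):

clusterer = hdbscan.HDBSCAN(min_cluster_size=30, cluster_selection_method='eom',  # or 'leaf'
                            prediction_data=True).fit(X)
labels_hd = match_labels(clusterer.labels_)
if is_standalone_hdbscan:
    # soft clustering: per-point membership strengths for every cluster
    membership = hdbscan.all_points_membership_vectors(clusterer)
    soft_labels = membership.argmax(axis=1)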