From 7b74d2c51b914545973180cd0ae9d01bf5480037 Mon Sep 17 00:00:00 2001 From: Christof Kaufmann <christof.kaufmann@hs-bochum.de> Date: Fri, 7 Jun 2024 12:03:58 +0000 Subject: [PATCH] Notebooks from applied-cs/data-science@152f2534 --- 06-clustering/demo/clustering_demo.py | 31 ++++++++++++++++----------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/06-clustering/demo/clustering_demo.py b/06-clustering/demo/clustering_demo.py index 95fa5db..a770258 100644 --- a/06-clustering/demo/clustering_demo.py +++ b/06-clustering/demo/clustering_demo.py @@ -33,8 +33,10 @@ from scipy.optimize import linear_sum_assignment # # Install these with from terminal with: # mamba install hdbscan scikit-learn-extra +# pip install hdbscan scikit-learn-extra # # Install these with from this script with: -# !mamba install hdbscan scikit-learn-extra +# !mamba install -y hdbscan scikit-learn-extra +# !pip install hdbscan scikit-learn-extra from sklearn_extra.cluster import CommonNNClustering # from sklearn.cluster import HDBSCAN # does not have soft clustering and tree plot @@ -45,9 +47,10 @@ import hdbscan # NOTE: use this only if in a jupyter environment. # %matplotlib widget # NOTE: use this only from spyder (or try from local VS Code) -%matplotlib auto +# %matplotlib auto -# %% generate data with different densities and structure +# %% data + helper functions +# generate data with different densities and structure np.random.seed(42) n = 2000 @@ -64,27 +67,31 @@ y = np.hstack((y_blobs, y_moons + max(y_blobs) + 1)) cm = mpl.cm.Set3 -# %% match label to ground truth function to match colors optimally -# also knows as "Hungarian algorithm" +# match label to ground truth function to match colors optimally (also knows as "Hungarian algorithm") def match_labels(labels, return_mapper=False): noise = labels == -1 - # get unique values and range labels starting from 0 or -1 + # get unique values and range labels starting from 0 or -1 without gaps from merged clusters u, range_labels = np.unique(labels, return_inverse=True) num_clusters = len(u) if np.any(noise): num_clusters -= 1 range_labels -= 1 + # map cluster range labels to ground truth labels conf = confusion_matrix(range_labels[~noise], y[~noise])[:num_clusters] _, mapper = linear_sum_assignment(-conf) new_labels = mapper[range_labels] - new_labels[noise] = -1 # preserve noise + + # preserve noise + new_labels[noise] = -1 + if return_mapper: return new_labels, mapper return new_labels -# %% helper for cluster hist + +# helper for cluster hist plots def draw_cluster_hist(ax, labels, y=None): ax.clear() ax.set_yticks([]) @@ -122,7 +129,7 @@ plt.show() # %% plot with k-means labels plt.close('all') -fig_kmeans = plt.figure('k-Means', figsize=(8, 11)) +fig_kmeans = plt.figure('k-Means', figsize=(8, 8)) ax_kmeans = fig_kmeans.add_subplot(2, 1, 1) sc_kmeans = ax_kmeans.scatter(*X.T, alpha=0.8) ax_kmeans.set_xticks([]) @@ -177,7 +184,7 @@ plt.show() # %% plot with DBSCAN labels plt.close('all') -fig_dbscan = plt.figure('DBSCAN', figsize=(8, 11)) +fig_dbscan = plt.figure('DBSCAN', figsize=(8, 8)) ax_dbscan = fig_dbscan.add_subplot(2, 1, 1) sc_dbscan = ax_dbscan.scatter(*X.T, alpha=0.8) sc_dbscan.set_edgecolor(cm((y + 1) % cm.N)) @@ -263,7 +270,7 @@ plt.show() # %% plot with OPTICS (DBSCAN or Xi) labels plt.close('all') -fig_optics = plt.figure('OPTICS', figsize=(8, 11)) +fig_optics = plt.figure('OPTICS', figsize=(8, 8)) ax_optics_reachability = fig_optics.add_subplot(3, 1, 1) sc_optics_reachability = ax_optics_reachability.scatter([], [], alpha=0.6) @@ -372,7 +379,7 @@ is_standalone_hdbscan = hasattr(HDBSCAN, 'generate_prediction_data') subplots = 3 if is_standalone_hdbscan else 2 button_labels = ['EoM/Leaf', 'Soft'] if is_standalone_hdbscan else ['EoM/Leaf'] -fig_hdbscan = plt.figure('HDBSCAN', figsize=(8, 11)) +fig_hdbscan = plt.figure('HDBSCAN', figsize=(8, 8)) ax_hdbscan = fig_hdbscan.add_subplot(subplots, 1, 1) sc_hdbscan = ax_hdbscan.scatter(*X.T, alpha=0.8) -- GitLab