From 7b74d2c51b914545973180cd0ae9d01bf5480037 Mon Sep 17 00:00:00 2001
From: Christof Kaufmann <christof.kaufmann@hs-bochum.de>
Date: Fri, 7 Jun 2024 12:03:58 +0000
Subject: [PATCH] Notebooks from applied-cs/data-science@152f2534

---
 06-clustering/demo/clustering_demo.py | 31 ++++++++++++++++-----------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/06-clustering/demo/clustering_demo.py b/06-clustering/demo/clustering_demo.py
index 95fa5db..a770258 100644
--- a/06-clustering/demo/clustering_demo.py
+++ b/06-clustering/demo/clustering_demo.py
@@ -33,8 +33,10 @@ from scipy.optimize import linear_sum_assignment
 
 # # Install these with from terminal with:
 # mamba install hdbscan scikit-learn-extra
+# pip   install hdbscan scikit-learn-extra
 # # Install these with from this script with:
-# !mamba install hdbscan scikit-learn-extra
+# !mamba install -y hdbscan scikit-learn-extra
+# !pip   install    hdbscan scikit-learn-extra
 
 from sklearn_extra.cluster import CommonNNClustering
 # from sklearn.cluster import HDBSCAN  # does not have soft clustering and tree plot
@@ -45,9 +47,10 @@ import hdbscan
 # NOTE: use this only if in a jupyter environment.
 # %matplotlib widget
 # NOTE: use this only from spyder (or try from local VS Code)
-%matplotlib auto
+# %matplotlib auto
 
-# %% generate data with different densities and structure
+# %% data + helper functions
+# generate data with different densities and structure
 np.random.seed(42)
 n = 2000
 
@@ -64,27 +67,31 @@ y = np.hstack((y_blobs, y_moons + max(y_blobs) + 1))
 cm = mpl.cm.Set3
 
 
-# %% match label to ground truth function to match colors optimally
-# also knows as "Hungarian algorithm"
+# match label to ground truth function to match colors optimally (also known as "Hungarian algorithm")
 def match_labels(labels, return_mapper=False):
     noise = labels == -1
 
-    # get unique values and range labels starting from 0 or -1
+    # get unique values and range labels starting from 0 or -1 without gaps from merged clusters
     u, range_labels = np.unique(labels, return_inverse=True)
     num_clusters = len(u)
     if np.any(noise):
         num_clusters -= 1
         range_labels -= 1
 
+    # map cluster range labels to ground truth labels
     conf = confusion_matrix(range_labels[~noise], y[~noise])[:num_clusters]
     _, mapper = linear_sum_assignment(-conf)
     new_labels = mapper[range_labels]
-    new_labels[noise] = -1  # preserve noise
+
+    # preserve noise
+    new_labels[noise] = -1
+
     if return_mapper:
         return new_labels, mapper
     return new_labels
 
-# %% helper for cluster hist
+
+# helper for cluster hist plots
 def draw_cluster_hist(ax, labels, y=None):
     ax.clear()
     ax.set_yticks([])
@@ -122,7 +129,7 @@ plt.show()
 # %% plot with k-means labels
 plt.close('all')
 
-fig_kmeans = plt.figure('k-Means', figsize=(8, 11))
+fig_kmeans = plt.figure('k-Means', figsize=(8, 8))
 ax_kmeans = fig_kmeans.add_subplot(2, 1, 1)
 sc_kmeans = ax_kmeans.scatter(*X.T, alpha=0.8)
 ax_kmeans.set_xticks([])
@@ -177,7 +184,7 @@ plt.show()
 # %% plot with DBSCAN labels
 plt.close('all')
 
-fig_dbscan = plt.figure('DBSCAN', figsize=(8, 11))
+fig_dbscan = plt.figure('DBSCAN', figsize=(8, 8))
 ax_dbscan = fig_dbscan.add_subplot(2, 1, 1)
 sc_dbscan = ax_dbscan.scatter(*X.T, alpha=0.8)
 sc_dbscan.set_edgecolor(cm((y + 1) % cm.N))
@@ -263,7 +270,7 @@ plt.show()
 # %% plot with OPTICS (DBSCAN or Xi) labels
 plt.close('all')
 
-fig_optics = plt.figure('OPTICS', figsize=(8, 11))
+fig_optics = plt.figure('OPTICS', figsize=(8, 8))
 
 ax_optics_reachability = fig_optics.add_subplot(3, 1, 1)
 sc_optics_reachability = ax_optics_reachability.scatter([], [], alpha=0.6)
@@ -372,7 +379,7 @@ is_standalone_hdbscan = hasattr(HDBSCAN, 'generate_prediction_data')
 subplots = 3 if is_standalone_hdbscan else 2
 button_labels = ['EoM/Leaf', 'Soft'] if is_standalone_hdbscan else ['EoM/Leaf']
 
-fig_hdbscan = plt.figure('HDBSCAN', figsize=(8, 11))
+fig_hdbscan = plt.figure('HDBSCAN', figsize=(8, 8))
 
 ax_hdbscan = fig_hdbscan.add_subplot(subplots, 1, 1)
 sc_hdbscan = ax_hdbscan.scatter(*X.T, alpha=0.8)
-- 
GitLab