Commit 7b74d2c5 authored by Christof Kaufmann

Notebooks from applied-cs/data-science@152f2534

parent 657c213a
@@ -33,8 +33,10 @@ from scipy.optimize import linear_sum_assignment
 # # Install these from the terminal with:
 # mamba install hdbscan scikit-learn-extra
+# pip install hdbscan scikit-learn-extra
 # # Install these from within this script with:
-# !mamba install hdbscan scikit-learn-extra
+# !mamba install -y hdbscan scikit-learn-extra
+# !pip install hdbscan scikit-learn-extra
 from sklearn_extra.cluster import CommonNNClustering
 # from sklearn.cluster import HDBSCAN  # does not have soft clustering and tree plot
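These optional packages are only needed for CommonNN and the richer HDBSCAN features. A minimal smoke test, assuming both installs succeeded; the toy data and parameter values below are illustrative, not taken from the notebook:

import numpy as np
import hdbscan
from sklearn.datasets import make_blobs
from sklearn_extra.cluster import CommonNNClustering

X_demo, _ = make_blobs(n_samples=300, centers=3, random_state=0)
# both estimators follow the usual scikit-learn fit_predict API
print(CommonNNClustering(eps=1.0, min_samples=5).fit_predict(X_demo)[:10])
print(hdbscan.HDBSCAN(min_cluster_size=15).fit_predict(X_demo)[:10])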
@@ -45,9 +47,10 @@ import hdbscan
 # NOTE: use this only if in a jupyter environment.
 # %matplotlib widget
 # NOTE: use this only from spyder (or try from local VS Code)
-%matplotlib auto
+# %matplotlib auto

-# %% generate data with different densities and structure
+# %% data + helper functions
+# generate data with different densities and structure
 np.random.seed(42)
 n = 2000
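The data-generation lines themselves are collapsed in this diff. A sketch of what they presumably do, assuming make_blobs/make_moons and reusing the names (y_blobs, y_moons, X, y) visible in the next hunk header; the exact centers, cluster_std and noise values are guesses:

from sklearn.datasets import make_blobs, make_moons

# n is defined in the script above; parameter values are assumptions
X_blobs, y_blobs = make_blobs(n_samples=n, centers=3,
                              cluster_std=[0.5, 1.5, 0.3], random_state=42)
X_moons, y_moons = make_moons(n_samples=n // 2, noise=0.05, random_state=42)
X = np.vstack((X_blobs, X_moons))
y = np.hstack((y_blobs, y_moons + max(y_blobs) + 1))  # as in the hunk header below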
@@ -64,27 +67,31 @@ y = np.hstack((y_blobs, y_moons + max(y_blobs) + 1))
 cm = mpl.cm.Set3

-# %% match label to ground truth function to match colors optimally
-# also known as "Hungarian algorithm"
+# match label to ground truth function to match colors optimally (also known as "Hungarian algorithm")
 def match_labels(labels, return_mapper=False):
     noise = labels == -1
-    # get unique values and range labels starting from 0 or -1
+    # get unique values and range labels starting from 0 or -1 without gaps from merged clusters
     u, range_labels = np.unique(labels, return_inverse=True)
     num_clusters = len(u)
     if np.any(noise):
         num_clusters -= 1
         range_labels -= 1
+    # map cluster range labels to ground truth labels
     conf = confusion_matrix(range_labels[~noise], y[~noise])[:num_clusters]
     _, mapper = linear_sum_assignment(-conf)
     new_labels = mapper[range_labels]
-    new_labels[noise] = -1  # preserve noise
+    # preserve noise
+    new_labels[noise] = -1
     if return_mapper:
         return new_labels, mapper
     return new_labels

-# %% helper for cluster hist
+# helper for cluster hist plots
 def draw_cluster_hist(ax, labels, y=None):
     ax.clear()
     ax.set_yticks([])
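match_labels only permutes cluster ids so that colors line up with the ground truth. A tiny, made-up illustration of the Hungarian step (linear_sum_assignment on the negated confusion matrix); the numbers are invented:

import numpy as np
from scipy.optimize import linear_sum_assignment

conf_demo = np.array([[ 2, 95,  3],   # predicted cluster 0 vs. true classes
                      [90,  5,  5],   # predicted cluster 1
                      [ 4,  6, 90]])  # predicted cluster 2
_, mapper = linear_sum_assignment(-conf_demo)  # maximize matched counts
print(mapper)                          # [1 0 2]: cluster 0 -> class 1, cluster 1 -> class 0
print(mapper[np.array([0, 0, 1, 2])])  # relabeled predictions: [1 1 0 2]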
@@ -122,7 +129,7 @@ plt.show()
 # %% plot with k-means labels
 plt.close('all')
-fig_kmeans = plt.figure('k-Means', figsize=(8, 11))
+fig_kmeans = plt.figure('k-Means', figsize=(8, 8))
 ax_kmeans = fig_kmeans.add_subplot(2, 1, 1)
 sc_kmeans = ax_kmeans.scatter(*X.T, alpha=0.8)
 ax_kmeans.set_xticks([])
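The actual k-means fit and the widget callbacks are collapsed here. A sketch of the core step, assuming the objects defined above (X, cm, sc_kmeans, match_labels) and guessing 5 clusters (3 blobs + 2 moons):

from sklearn.cluster import KMeans

labels_km = KMeans(n_clusters=5, n_init=10, random_state=42).fit_predict(X)
labels_km = match_labels(labels_km)                   # align cluster ids with ground truth
sc_kmeans.set_facecolor(cm((labels_km + 1) % cm.N))   # same color scheme as the edge colors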
@@ -177,7 +184,7 @@ plt.show()
 # %% plot with DBSCAN labels
 plt.close('all')
-fig_dbscan = plt.figure('DBSCAN', figsize=(8, 11))
+fig_dbscan = plt.figure('DBSCAN', figsize=(8, 8))
 ax_dbscan = fig_dbscan.add_subplot(2, 1, 1)
 sc_dbscan = ax_dbscan.scatter(*X.T, alpha=0.8)
 sc_dbscan.set_edgecolor(cm((y + 1) % cm.N))
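The DBSCAN fitting code is likewise collapsed. A sketch with illustrative eps/min_samples values (not necessarily the notebook's):

from sklearn.cluster import DBSCAN

labels_db = match_labels(DBSCAN(eps=0.3, min_samples=10).fit_predict(X))
# noise keeps label -1, so (label + 1) % cm.N maps it to color index 0
sc_dbscan.set_facecolor(cm((labels_db + 1) % cm.N))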
@@ -263,7 +270,7 @@ plt.show()
 # %% plot with OPTICS (DBSCAN or Xi) labels
 plt.close('all')
-fig_optics = plt.figure('OPTICS', figsize=(8, 11))
+fig_optics = plt.figure('OPTICS', figsize=(8, 8))
 ax_optics_reachability = fig_optics.add_subplot(3, 1, 1)
 sc_optics_reachability = ax_optics_reachability.scatter([], [], alpha=0.6)
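The OPTICS computation that feeds the reachability subplot is collapsed. A sketch of what it presumably does, fitting once and then deriving either Xi-based or DBSCAN-style labels from the same fit (parameter values are illustrative):

from sklearn.cluster import OPTICS, cluster_optics_dbscan

opt = OPTICS(min_samples=10, xi=0.05).fit(X)
reach = opt.reachability_[opt.ordering_]             # reachability in visit order (first value is inf)
sc_optics_reachability.set_offsets(np.c_[np.arange(len(reach)), reach])
labels_xi = match_labels(opt.labels_)                # Xi-based clusters
labels_db = cluster_optics_dbscan(                   # DBSCAN-style cut at a fixed eps
    reachability=opt.reachability_, core_distances=opt.core_distances_,
    ordering=opt.ordering_, eps=0.3)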
@@ -372,7 +379,7 @@ is_standalone_hdbscan = hasattr(HDBSCAN, 'generate_prediction_data')
 subplots = 3 if is_standalone_hdbscan else 2
 button_labels = ['EoM/Leaf', 'Soft'] if is_standalone_hdbscan else ['EoM/Leaf']
-fig_hdbscan = plt.figure('HDBSCAN', figsize=(8, 11))
+fig_hdbscan = plt.figure('HDBSCAN', figsize=(8, 8))
 ax_hdbscan = fig_hdbscan.add_subplot(subplots, 1, 1)
 sc_hdbscan = ax_hdbscan.scatter(*X.T, alpha=0.8)
...
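The HDBSCAN fitting and the EoM/Leaf/Soft button callbacks are collapsed in this diff. A sketch of the underlying calls, assuming the standalone hdbscan package is available (min_cluster_size is illustrative):

clusterer = hdbscan.HDBSCAN(min_cluster_size=30, cluster_selection_method='eom',  # or 'leaf'
                            prediction_data=True).fit(X)
labels_hd = match_labels(clusterer.labels_)
if is_standalone_hdbscan:
    # soft clustering: per-point membership strengths for every cluster
    membership = hdbscan.all_points_membership_vectors(clusterer)
    soft_labels = membership.argmax(axis=1)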