diff --git a/06-clustering/01-inertia-sol.ipynb b/06-clustering/01-inertia-sol.ipynb index a8f8855e01c056b827594e8b96fc485535651815..7244e4633b111bb09bde2eb8f95fec472796053a 100644 --- a/06-clustering/01-inertia-sol.ipynb +++ b/06-clustering/01-inertia-sol.ipynb @@ -50,10 +50,10 @@ "axs[0].set_box_aspect(1)\n", "axs[1].set_box_aspect(1)\n", "axs[0].set_title('y1')\n", - "axs[0].set_title('y2')\n", + "axs[1].set_title('y2')\n", "plt.show()" ], - "id": "0004-2738953bffe1a6a1bf6958471b829f9f4cf355ce4d038f06451f55017ae" + "id": "0004-d297955758e8b5d26fc993f2509a3463420456b867ec1e4c37c5103fc38" }, { "cell_type": "markdown", diff --git a/06-clustering/01-inertia.ipynb b/06-clustering/01-inertia.ipynb index 157e1cef310333e9609802921ffc07a50c61894b..d4c3f73857c91563a3d486c1f89bf637a8a8ab5e 100644 --- a/06-clustering/01-inertia.ipynb +++ b/06-clustering/01-inertia.ipynb @@ -50,10 +50,10 @@ "axs[0].set_box_aspect(1)\n", "axs[1].set_box_aspect(1)\n", "axs[0].set_title('y1')\n", - "axs[0].set_title('y2')\n", + "axs[1].set_title('y2')\n", "plt.show()" ], - "id": "0004-2738953bffe1a6a1bf6958471b829f9f4cf355ce4d038f06451f55017ae" + "id": "0004-d297955758e8b5d26fc993f2509a3463420456b867ec1e4c37c5103fc38" }, { "cell_type": "markdown", diff --git a/06-clustering/02-elbow.ipynb b/06-clustering/02-elbow.ipynb index 6e32caeb17befc3073bd947788ccf1ca6783d3a3..d837410b2e020c7ae860ded950b484bc4e817c6a 100644 --- a/06-clustering/02-elbow.ipynb +++ b/06-clustering/02-elbow.ipynb @@ -57,14 +57,18 @@ " # TODO: collect inertias for clusterings with varying k\n", "\n", " inertias = np.array(inertias)\n", - " reductions = inertias[:-1] / inertias[1:]\n", + " log_reductions = np.log(inertias[:-1] / inertias[1:])\n", + " improvement = log_reductions / inertias[1:]\n", + " improvement /= np.max(improvement)\n", "\n", - " fig, axs = plt.subplots(1, 2, figsize=(10, 6))\n", + " fig, axs = plt.subplots(1, 2, figsize=(9, 4))\n", " axs[0].scatter(X[:, 0], X[:, 1])\n", " axs[1].plot(ks[1:], inertias[1:], c='gray')\n", - " axs[1].scatter(ks[1:], inertias[1:], c=np.log(reductions), cmap='brg')" + " axs[1].scatter(ks[1:], inertias[1:], c=log_reductions, cmap='brg', s=improvement * 100)\n", + " axs[0].set_title('data')\n", + " axs[1].set_title('elbow curve')" ], - "id": "0004-9700539d6d88f68b0c24b391384a0c6d7a6d7d3b58e751ed98bf47a2aff" + "id": "0004-4a8cf34210f0a0eb2c0610864778096a779afaef4059f1dd9d525dfb2f7" }, { "cell_type": "markdown", diff --git a/06-clustering/04-knn_consistency-sol.ipynb b/06-clustering/04-knn_consistency-sol.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c56974979e0550d4d8b5c3751ba9a2e330cc35da --- /dev/null +++ b/06-clustering/04-knn_consistency-sol.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# $k$-Nearest Neighbors Consistency\n", + "\n", + "Implementieren Sie den $k$-Nearest Neighbors Consistency Index, wie in\n", + "den Folien dargestellt. Gehen Sie dazu wie folgt vor:\n", + "\n", + "1. Finden Sie z. B. mit `NearestNeighbors` die Indizes der $k$ nächsten\n", + " Nachbarn (den Abstand benötigen Sie nicht).\n", + "2. Ermitteln Sie die Labels der Nachbarn mittels komplexer Indizierung.\n", + "3. Vergleichen Sie die Labels der Nachbarn mit dem Label des Punktes\n", + " indem Sie Broadcasting verwenden.\n", + "4. Mitteln Sie das so erhaltene Boolesche Array um den vereinfachten\n", + " $k$-Nearest Neighbors Consistency Index zu erhalten.\n", + "\n", + "Im Start-Code sind bereits drei Datensätze und Labels von $k$-Means und\n", + "DBSCAN vorbereitet. Dazu gibt es noch ein sehr kleines Dataset zum\n", + "Entwickeln und Debuggen:" + ], + "id": "0003-d14573b13f63851bad185642dc2e93b9c80f73c473f33bbf0d0950acea9" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "style": "python" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "from sklearn.cluster import KMeans, DBSCAN\n", + "from sklearn.datasets import make_blobs, make_moons, make_circles\n", + "from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier\n", + "\n", + "X0, y0_true = make_blobs(n_samples=[100, 100, 400, 400], random_state=1)\n", + "X1, y1_true = make_moons(n_samples=1000, noise=0.1, random_state=42)\n", + "X2, y2_true = make_circles(n_samples=1000, noise=0.1, factor=0.45, random_state=42)\n", + "X3, y3_true = make_blobs(n_samples=[10, 10, 10, 5], random_state=1) # for debugging\n", + "\n", + "kmeans = KMeans(n_clusters=4, n_init=5)\n", + "y0_kmeans = kmeans.fit_predict(X0)\n", + "kmeans = KMeans(n_clusters=2, n_init=5)\n", + "y1_kmeans = kmeans.fit_predict(X1)\n", + "y2_kmeans = kmeans.fit_predict(X2)\n", + "\n", + "# put\n", + "def label_noise_inplace(X, labels):\n", + " noise = labels == -1\n", + " if noise.sum() == 0:\n", + " return\n", + "\n", + " knn = KNeighborsClassifier(weights='distance')\n", + " labels[noise] = knn.fit(X[~noise], labels[~noise]).predict(X[noise])\n", + "\n", + "dbscan = DBSCAN(eps=0.67, min_samples=15)\n", + "y0_dbscan = dbscan.fit_predict(X0)\n", + "label_noise_inplace(X0, y0_dbscan)\n", + "\n", + "dbscan = DBSCAN(eps=0.13)\n", + "y1_dbscan = dbscan.fit_predict(X1)\n", + "label_noise_inplace(X1, y1_dbscan)\n", + "\n", + "dbscan = DBSCAN(eps=0.136, min_samples=10)\n", + "y2_dbscan = dbscan.fit_predict(X2)\n", + "label_noise_inplace(X2, y2_dbscan)\n", + "\n", + "datasets = [\n", + " (X0, y0_kmeans, y0_dbscan),\n", + " (X1, y1_kmeans, y1_dbscan),\n", + " (X2, y2_kmeans, y2_dbscan),\n", + " (X3, y3_true, y3_true),\n", + "]\n", + "\n", + "# use this tiny dataset for development:\n", + "X, labels = X3, y3_true" + ], + "id": "0004-a87e934fff935b3d9b3bdbe2a3d30560b71ffad3832a9f9941c0eef14fd" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Hier ein Plot der Daten:" + ], + "id": "0005-3d78bcf9122a2a1bb3e1681a8c95f29db1a5e142a429a286e2f70f5ab28" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axs = plt.subplots(4, 2, figsize=(10, 12))\n", + "for i, (X, labels_kmeans, labels_dbscan) in enumerate(datasets):\n", + " axs[i, 0].scatter(*X.T, c=labels_kmeans)\n", + " axs[i, 1].scatter(*X.T, c=labels_dbscan)" + ], + "id": "0006-8c9aee80745ef7b3faf39e7f00d84bdd25f9842fe443006be802f23e3eb" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Lösung\n", + "\n", + "Hier eine mögliche Lösung mit etwas Bonusmaterial:" + ], + "id": "0008-f347ec7bc5a1dc9006bb5ef781ffed9c0da1374a907d33e9db132def21f" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "style": "python" + }, + "outputs": [], + "source": [ + "records = []\n", + "nn = NearestNeighbors(n_neighbors=7)\n", + "for i, (X, labels_kmeans, labels_dbscan) in enumerate([(X0, y0_kmeans, y0_dbscan), (X1, y1_kmeans, y1_dbscan), (X2, y2_kmeans, y2_dbscan)]):\n", + " for alg, labels in [('k-Means', labels_kmeans), ('DBSCAN', labels_dbscan)]:\n", + " nn.fit(X)\n", + " ind = nn.kneighbors(X, return_distance=False)\n", + " ind = ind[:, 1:]\n", + " neighbor_labels = labels[ind]\n", + "\n", + " simple_index = np.mean(neighbor_labels == labels[:, np.newaxis])\n", + "\n", + " # BONUS:\n", + " # - balance index\n", + " # - number of points with at least one bad neighbor\n", + " # - number of points with at more bad than good neighbors\n", + "\n", + " # 0.8 means 80% of the neighbors are in the same cluster\n", + " consistency_coefs = np.mean(neighbor_labels == labels[:, np.newaxis], axis=1)\n", + " balanced_index = np.mean([consistency_coefs[labels == l].mean() for l in range(labels.max() + 1)])\n", + " records.append((f'X{i}', alg, simple_index, balanced_index, np.sum(consistency_coefs < 1), np.sum(consistency_coefs < 0.5)))\n", + " # print(f'Dataset {i}: {name} yields {simple_index=}, {balanced_index=}, weak points: {np.sum(consistency_coefs < 1)}, bad points: {np.sum(consistency_coefs < 0.5)}')\n", + "\n", + "results = pd.DataFrame.from_records(records, columns=['dataset', 'algorithm', 'simple index', 'balanced index', '#weak points', '#bad points'])" + ], + "id": "0009-27747b1b528f9299d2d0a3e098bf1886a232ebd808b9bff0a79ea0c9f29" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Hier die Ausgabe der Ergebnisse:" + ], + "id": "0010-313e4ca932367cdbcd9dc27121e6fb697edb3fc0d3df0c926d3d4529a7c" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " dataset algorithm simple index balanced index #weak points #bad points\n", + "0 X0 k-Means 0.990833 0.987580 27 3\n", + "1 X0 DBSCAN 0.989667 0.989178 29 3\n", + "2 X1 k-Means 0.986000 0.985978 35 7\n", + "3 X1 DBSCAN 0.998500 0.998500 5 0\n", + "4 X2 k-Means 0.983500 0.983511 49 5\n", + "5 X2 DBSCAN 0.994500 0.994500 18 1" + ] + } + ], + "source": [ + "results" + ], + "id": "0011-42bc3186ded72786ba27e9ea6d2da240fb9fd3fe79b479ecf8e734b2850" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Und als Scatter-Plot:" + ], + "id": "0012-f48025ee575f7131a052e51b64abd55fbfd8f995f92843da305c1cf523c" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.scatterplot(results, x='dataset', y='simple index', hue='algorithm')" + ], + "id": "0013-ceb94852f9cf57c630717d4ca4582975623230f1f1e7958573077d27e01" + } + ], + "nbformat": 4, + "nbformat_minor": 5, + "metadata": {} +} diff --git a/06-clustering/04-knn_consistency.ipynb b/06-clustering/04-knn_consistency.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..d8cf72ffe39827748affc7274dcf3f6508f3e1f4 --- /dev/null +++ b/06-clustering/04-knn_consistency.ipynb @@ -0,0 +1,129 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# $k$-Nearest Neighbors Consistency\n", + "\n", + "Implementieren Sie den $k$-Nearest Neighbors Consistency Index, wie in\n", + "den Folien dargestellt. Gehen Sie dazu wie folgt vor:\n", + "\n", + "1. Finden Sie z. B. mit `NearestNeighbors` die Indizes der $k$ nächsten\n", + " Nachbarn (den Abstand benötigen Sie nicht).\n", + "2. Ermitteln Sie die Labels der Nachbarn mittels komplexer Indizierung.\n", + "3. Vergleichen Sie die Labels der Nachbarn mit dem Label des Punktes\n", + " indem Sie Broadcasting verwenden.\n", + "4. Mitteln Sie das so erhaltene Boolesche Array um den vereinfachten\n", + " $k$-Nearest Neighbors Consistency Index zu erhalten.\n", + "\n", + "Im Start-Code sind bereits drei Datensätze und Labels von $k$-Means und\n", + "DBSCAN vorbereitet. Dazu gibt es noch ein sehr kleines Dataset zum\n", + "Entwickeln und Debuggen:" + ], + "id": "0003-d14573b13f63851bad185642dc2e93b9c80f73c473f33bbf0d0950acea9" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "style": "python" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "from sklearn.cluster import KMeans, DBSCAN\n", + "from sklearn.datasets import make_blobs, make_moons, make_circles\n", + "from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier\n", + "\n", + "X0, y0_true = make_blobs(n_samples=[100, 100, 400, 400], random_state=1)\n", + "X1, y1_true = make_moons(n_samples=1000, noise=0.1, random_state=42)\n", + "X2, y2_true = make_circles(n_samples=1000, noise=0.1, factor=0.45, random_state=42)\n", + "X3, y3_true = make_blobs(n_samples=[10, 10, 10, 5], random_state=1) # for debugging\n", + "\n", + "kmeans = KMeans(n_clusters=4, n_init=5)\n", + "y0_kmeans = kmeans.fit_predict(X0)\n", + "kmeans = KMeans(n_clusters=2, n_init=5)\n", + "y1_kmeans = kmeans.fit_predict(X1)\n", + "y2_kmeans = kmeans.fit_predict(X2)\n", + "\n", + "# put\n", + "def label_noise_inplace(X, labels):\n", + " noise = labels == -1\n", + " if noise.sum() == 0:\n", + " return\n", + "\n", + " knn = KNeighborsClassifier(weights='distance')\n", + " labels[noise] = knn.fit(X[~noise], labels[~noise]).predict(X[noise])\n", + "\n", + "dbscan = DBSCAN(eps=0.67, min_samples=15)\n", + "y0_dbscan = dbscan.fit_predict(X0)\n", + "label_noise_inplace(X0, y0_dbscan)\n", + "\n", + "dbscan = DBSCAN(eps=0.13)\n", + "y1_dbscan = dbscan.fit_predict(X1)\n", + "label_noise_inplace(X1, y1_dbscan)\n", + "\n", + "dbscan = DBSCAN(eps=0.136, min_samples=10)\n", + "y2_dbscan = dbscan.fit_predict(X2)\n", + "label_noise_inplace(X2, y2_dbscan)\n", + "\n", + "datasets = [\n", + " (X0, y0_kmeans, y0_dbscan),\n", + " (X1, y1_kmeans, y1_dbscan),\n", + " (X2, y2_kmeans, y2_dbscan),\n", + " (X3, y3_true, y3_true),\n", + "]\n", + "\n", + "# use this tiny dataset for development:\n", + "X, labels = X3, y3_true" + ], + "id": "0004-a87e934fff935b3d9b3bdbe2a3d30560b71ffad3832a9f9941c0eef14fd" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Hier ein Plot der Daten:" + ], + "id": "0005-3d78bcf9122a2a1bb3e1681a8c95f29db1a5e142a429a286e2f70f5ab28" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axs = plt.subplots(4, 2, figsize=(10, 12))\n", + "for i, (X, labels_kmeans, labels_dbscan) in enumerate(datasets):\n", + " axs[i, 0].scatter(*X.T, c=labels_kmeans)\n", + " axs[i, 1].scatter(*X.T, c=labels_dbscan)" + ], + "id": "0006-8c9aee80745ef7b3faf39e7f00d84bdd25f9842fe443006be802f23e3eb" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Hier Ihr Code:" + ], + "id": "0007-4f90926ab6960e65168fdff6f5e151194200dd69b468fb4eeb302fa84f1" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "style": "python" + }, + "outputs": [], + "source": [], + "id": "0008-44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + } + ], + "nbformat": 4, + "nbformat_minor": 5, + "metadata": {} +} diff --git a/06-clustering/demo/clustering_demo.py b/06-clustering/demo/clustering_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..95fa5dba19036efe0d96848559feef1813f0d725 --- /dev/null +++ b/06-clustering/demo/clustering_demo.py @@ -0,0 +1,493 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""Interactive Demonstration of Different Clustering Algorithms on a Hard Problem + +This demonstrates with interactive plots with sliders and check boxes: + +- k-Means, optionally with bisecting behavior +- DBSCAN / Common Nearest Neighbor Clustering +- OPTICS with either DBSCAN or Xi clustering +- HDBSCAN, combined with Excess of Mass (EoM) or leaf cluster selection. Also + a with DBSCAN epsilon cutoff can be set or soft clustering can be used to + reassign noise points. + +Prerequisites: +scikit-learn, matplotlib, scipy, scikit-learn-extra, hdbscan +If you only need to install the latter two packages, use: + + mamba install hdbscan scikit-learn-extra +""" + +# %% imports +import matplotlib as mpl +import matplotlib.pyplot as plt +from matplotlib.widgets import Slider, CheckButtons +import numpy as np + +from sklearn.datasets import make_blobs, make_moons +from sklearn.preprocessing import StandardScaler +from sklearn.cluster import DBSCAN, compute_optics_graph, cluster_optics_dbscan, cluster_optics_xi, KMeans, BisectingKMeans +from sklearn.neighbors import KNeighborsClassifier +from sklearn.metrics import confusion_matrix +from scipy.optimize import linear_sum_assignment + +# # Install these with from terminal with: +# mamba install hdbscan scikit-learn-extra +# # Install these with from this script with: +# !mamba install hdbscan scikit-learn-extra + +from sklearn_extra.cluster import CommonNNClustering +# from sklearn.cluster import HDBSCAN # does not have soft clustering and tree plot +from hdbscan import HDBSCAN +import hdbscan + +# INTERACTIVE GUI ELEMENTS (BUTTONS, SLIDERS ETC.) +# NOTE: use this only if in a jupyter environment. +# %matplotlib widget +# NOTE: use this only from spyder (or try from local VS Code) +%matplotlib auto + +# %% generate data with different densities and structure +np.random.seed(42) +n = 2000 + +blobs, y_blobs = make_blobs(n_samples=2000, cluster_std=[0.25, 1, 4, 6], centers=4, random_state=12) +blobs = StandardScaler().fit_transform(blobs) + +moons, y_moons = make_moons(n_samples=250, noise=0.1, random_state=42) +moons -= [2, 2] + +X = np.vstack((blobs, moons)) +y = np.hstack((y_blobs, y_moons + max(y_blobs) + 1)) + +# common color map for all plots +cm = mpl.cm.Set3 + + +# %% match label to ground truth function to match colors optimally +# also knows as "Hungarian algorithm" +def match_labels(labels, return_mapper=False): + noise = labels == -1 + + # get unique values and range labels starting from 0 or -1 + u, range_labels = np.unique(labels, return_inverse=True) + num_clusters = len(u) + if np.any(noise): + num_clusters -= 1 + range_labels -= 1 + + conf = confusion_matrix(range_labels[~noise], y[~noise])[:num_clusters] + _, mapper = linear_sum_assignment(-conf) + new_labels = mapper[range_labels] + new_labels[noise] = -1 # preserve noise + if return_mapper: + return new_labels, mapper + return new_labels + +# %% helper for cluster hist +def draw_cluster_hist(ax, labels, y=None): + ax.clear() + ax.set_yticks([]) + ax.set_title('Cluster histogram') + + min_label = labels.min() + + n_clusters = labels.max() + 1 + N, bins, patches = ax.hist(labels, bins=n_clusters - min_label, range=(min_label - 0.5, n_clusters - 0.5)) + for i, p in enumerate(patches, start=min_label): + p.set_facecolor(cm((i + 1) % cm.N)) + + if y is None: + return + + n_real_clusters = y.max() + 1 + N, bins, patches_real = ax.hist(y, bins=n_real_clusters, range=(-0.5, n_real_clusters - 0.5), edgecolor='k', rwidth=0.2) + for i, p in enumerate(patches_real): + p.set_facecolor(cm((i + 1) % cm.N)) + + +# %% plot with original y labels + +fig_orig = plt.figure('Ground Truth') +ax_orig = fig_orig.add_subplot() +sc_orig = ax_orig.scatter(*X.T, alpha=0.6) +sc_orig.set_facecolor(cm((y + 1) % cm.N)) +ax_orig.set_title('Original Labels as Generated') +clusters, counts = np.unique(y, return_counts=True) +print('Original cluster sizes:') +print(counts) +plt.show() + + +# %% plot with k-means labels +plt.close('all') + +fig_kmeans = plt.figure('k-Means', figsize=(8, 11)) +ax_kmeans = fig_kmeans.add_subplot(2, 1, 1) +sc_kmeans = ax_kmeans.scatter(*X.T, alpha=0.8) +ax_kmeans.set_xticks([]) +ax_kmeans.set_yticks([]) + +fig_kmeans.subplots_adjust(left=0.1, top=0.95, right=0.99) + +ax_kmeans_slider = fig_kmeans.add_axes([0.03, 0.1, 0.0225, 0.85]) +kmeans_slider = Slider(ax=ax_kmeans_slider, label="k", orientation="vertical", + valmin=1, valmax=20, valinit=6, valstep=1) + +ax_kmeans_show = fig_kmeans.add_axes([0.99 - 0.3, 0.95 - 0.05, 0.3, 0.05]) +kmeans_show_button = CheckButtons(ax=ax_kmeans_show, labels=['Show Ground Truth']) + +# ax_kmeans_button = fig_kmeans.add_axes([0.03, 0.03, 0.1, 0.1]) +ax_kmeans_button = fig_kmeans.add_axes([0.99 - 0.1, 0.95 - 0.05 - 0.05, 0.1, 0.05]) +kmeans_button = CheckButtons(ax=ax_kmeans_button, labels=['bisect']) + +ax_kmeans_bars = fig_kmeans.add_subplot(2, 1, 2) +ax_kmeans_bars.bar([0,1], [0, 1]) +ax_kmeans_bars.set_xticks([]) + +def update_kmeans(val=None): + if kmeans_button.get_status()[0]: + alg = BisectingKMeans(n_clusters=int(kmeans_slider.val)) + ax_kmeans.set_title('Bisecting k-Means++') + else: + alg = KMeans(n_clusters=int(kmeans_slider.val), n_init=10) + ax_kmeans.set_title('k-Means++') + + labels = alg.fit_predict(X) + matched_labels = match_labels(labels) + + sc_kmeans.set_facecolor(cm((matched_labels + 1) % cm.N)) + if kmeans_show_button.get_status()[0]: + draw_cluster_hist(ax_kmeans_bars, matched_labels, y) + sc_kmeans.set_edgecolor(cm((y + 1) % cm.N)) + else: + draw_cluster_hist(ax_kmeans_bars, matched_labels) + sc_kmeans.set_edgecolor(cm((matched_labels + 1) % cm.N)) + + fig_kmeans.canvas.draw_idle() + + +kmeans_slider.on_changed(update_kmeans) +kmeans_button.on_clicked(update_kmeans) +kmeans_show_button.on_clicked(update_kmeans) +update_kmeans() +plt.show() + + +# %% plot with DBSCAN labels +plt.close('all') + +fig_dbscan = plt.figure('DBSCAN', figsize=(8, 11)) +ax_dbscan = fig_dbscan.add_subplot(2, 1, 1) +sc_dbscan = ax_dbscan.scatter(*X.T, alpha=0.8) +sc_dbscan.set_edgecolor(cm((y + 1) % cm.N)) +ax_dbscan.set_xticks([]) +ax_dbscan.set_yticks([]) + +fig_dbscan.subplots_adjust(left=0.1, top=0.95, right=0.99) + +ax_dbscan_eps_slider = fig_dbscan.add_axes([0.03, 0.1, 0.0225, 0.75]) +dbscan_eps_slider = Slider(ax=ax_dbscan_eps_slider, label="eps", orientation="vertical", + valmin=0.01, valmax=0.7, valinit=0.25) + +ax_dbscan_samples_slider = fig_dbscan.add_axes([0.07, 0.15, 0.0225, 0.75]) +dbscan_samples_slider = Slider(ax=ax_dbscan_samples_slider, label='min_samples', orientation="vertical", + valmin=2, valmax=70, valinit=12, valstep=1) + +ax_dbscan_show = fig_dbscan.add_axes([0.99 - 0.3, 0.95 - 0.05, 0.3, 0.05]) +dbscan_show_button = CheckButtons(ax=ax_dbscan_show, labels=['Show Ground Truth']) + +ax_dbscan_button = fig_dbscan.add_axes([0.99 - 0.25, 0.95 - 0.05 - 0.05, 0.25, 0.05]) +dbscan_button = CheckButtons(ax=ax_dbscan_button, labels=['DBSCAN / CNNC']) + +ax_dbscan_bars = fig_dbscan.add_subplot(2, 1, 2) +ax_dbscan_bars.bar([0,1], [0, 1]) +ax_dbscan_bars.set_xticks([]) + +# merge small clusters to the nearest large cluster (useful for CommonNNClustering) +def merge_small_clusters(X, labels, limit=15, inplace=False): + noise_labels = labels == -1 + clusters, counts = np.unique(labels[~noise_labels], return_counts=True) + if min(counts) > limit: + return labels + + small_clusters = np.zeros(labels.shape, dtype=bool) + for cluster, count in zip(clusters, counts): + if count <= limit: + small_clusters |= labels == cluster + + large_clusters = (~small_clusters) & (~noise_labels) + knn = KNeighborsClassifier(n_neighbors=1) + knn.fit(X[large_clusters], labels[large_clusters]) + merge_labels = knn.predict(X[small_clusters]) + + if not inplace: + labels = labels.copy() + + labels[small_clusters] = merge_labels + return labels + + +def update_dbscan(val=None): + if dbscan_button.get_status()[0]: + alg = CommonNNClustering(eps=dbscan_eps_slider.val, min_samples=int(dbscan_samples_slider.val)) + alg_name = 'Common Nearest Neighbor Clustering' + else: + alg = DBSCAN(eps=dbscan_eps_slider.val, min_samples=int(dbscan_samples_slider.val)) + alg_name = 'DBSCAN' + + ax_dbscan.set_title(alg_name) + labels = alg.fit_predict(X) + merge_small_clusters(X, labels, inplace=True) + matched_labels = match_labels(labels) + + sc_dbscan.set_facecolor(cm((matched_labels + 1) % cm.N)) + sc_dbscan.set_sizes(list(map(lambda l: 10 if l < 0 else 50, matched_labels))) + if dbscan_show_button.get_status()[0]: + draw_cluster_hist(ax_dbscan_bars, matched_labels, y) + sc_dbscan.set_edgecolor(cm((y + 1) % cm.N)) + else: + draw_cluster_hist(ax_dbscan_bars, matched_labels) + sc_dbscan.set_edgecolor(cm((matched_labels + 1) % cm.N)) + fig_dbscan.canvas.draw_idle() + + +dbscan_eps_slider.on_changed(update_dbscan) +dbscan_samples_slider.on_changed(update_dbscan) +dbscan_button.on_clicked(update_dbscan) +dbscan_show_button.on_clicked(update_dbscan) +update_dbscan() +plt.show() + + +# %% plot with OPTICS (DBSCAN or Xi) labels +plt.close('all') + +fig_optics = plt.figure('OPTICS', figsize=(8, 11)) + +ax_optics_reachability = fig_optics.add_subplot(3, 1, 1) +sc_optics_reachability = ax_optics_reachability.scatter([], [], alpha=0.6) +ax_optics_reachability.set_xlim(0, len(y)) +X_optics_reachability = np.arange(len(y)) +ax_optics_reachability.set_title('Reachability Plot') + +ax_optics = fig_optics.add_subplot(3, 1, 2) +sc_optics = ax_optics.scatter(*X.T, alpha=0.8) +sc_optics.set_edgecolor(cm((y + 1) % cm.N)) +ax_optics.set_xticks([]) +ax_optics.set_yticks([]) + +fig_optics.subplots_adjust(left=0.15, top=0.95, right=0.99) +ax_optics_eps_slider = fig_optics.add_axes([0.03, 0.1, 0.0225, 0.75]) +optics_eps_slider = Slider(ax=ax_optics_eps_slider, label="eps", orientation="vertical", + valmin=0.01, valmax=0.7, valinit=0.05) + +ax_optics_samples_slider = fig_optics.add_axes([0.07, 0.15, 0.0225, 0.75]) +optics_samples_slider = Slider(ax=ax_optics_samples_slider, label='min_samples', orientation="vertical", + valmin=2, valmax=70, valinit=12, valstep=1) + +ax_optics_pos = ax_optics.get_position() +ax_optics_button = fig_optics.add_axes([0.15, ax_optics_pos.y0, 0.2, 0.05]) +optics_button = CheckButtons(ax=ax_optics_button, labels=['DBSCAN/Xi']) + +ax_optics_show = fig_optics.add_axes([0.99 - 0.3, ax_optics_pos.y1 - 0.05, 0.3, 0.05]) +optics_show_button = CheckButtons(ax=ax_optics_show, labels=['Show Ground Truth']) + +optics_graph = {} + +ax_optics_bars = fig_optics.add_subplot(3, 1, 3) +ax_optics_bars.bar([0,1], [0, 1]) +ax_optics_bars.set_xticks([]) + + +def update_optics_min_samples(val=None): + res = compute_optics_graph(X, min_samples=int(optics_samples_slider.val), + max_eps=optics_eps_slider.valmax, n_jobs=8, + metric='minkowski', p=2, metric_params=None, algorithm='auto', leaf_size=30) # defaults + optics_graph['ordering'] = res[0] + optics_graph['core_distances'] = res[1] + optics_graph['reachability'] = res[2] + optics_graph['predecessor'] = res[3] + + r = optics_graph['reachability'][optics_graph['ordering']] + sc_optics_reachability.set_offsets(np.c_[X_optics_reachability, r]) + ax_optics_reachability.set_ylim(0, np.max(r[np.isfinite(r)])) + + update_optics_clustering() + + +def update_optics_clustering(val=None): + if optics_button.get_status()[0]: + labels = cluster_optics_dbscan(reachability=optics_graph['reachability'], + core_distances=optics_graph['core_distances'], + ordering=optics_graph['ordering'], + eps=optics_eps_slider.val) + alg_name = 'OPTICS → DBSCAN' + ax_optics_eps_slider.texts[0].set_text('eps') + update_optics_clustering.eps_line.set_ydata([optics_eps_slider.val]) + update_optics_clustering.eps_line.set_visible(True) + else: + labels = cluster_optics_xi(reachability=optics_graph['reachability'], + predecessor=optics_graph['predecessor'], + ordering=optics_graph['ordering'], + xi=optics_eps_slider.val, + min_cluster_size=0.1, + min_samples=int(optics_samples_slider.val)) + labels = labels[0] + alg_name = 'OPTICS → Xi' + ax_optics_eps_slider.texts[0].set_text('Xi') + update_optics_clustering.eps_line.set_visible(False) + + ax_optics.set_title(alg_name) + + matched_labels = match_labels(labels) + labels_r = matched_labels[optics_graph['ordering']] + sc_optics_reachability.set_facecolor(cm((labels_r + 1) % cm.N)) + sc_optics_reachability.set_sizes(list(map(lambda l: 10 if l < 0 else 50, labels_r))) + sc_optics.set_facecolor(cm((matched_labels + 1) % cm.N)) + sc_optics.set_sizes(list(map(lambda l: 10 if l < 0 else 50, matched_labels))) + if optics_show_button.get_status()[0]: + draw_cluster_hist(ax_optics_bars, matched_labels, y) + sc_optics.set_edgecolor(cm((y + 1) % cm.N)) + else: + draw_cluster_hist(ax_optics_bars, matched_labels) + sc_optics.set_edgecolor(cm((matched_labels + 1) % cm.N)) + fig_optics.canvas.draw_idle() + + +update_optics_clustering.eps_line = ax_optics_reachability.axhline(y=optics_eps_slider.val, color="gray", linestyle='--') + +optics_eps_slider.on_changed(update_optics_clustering) +optics_samples_slider.on_changed(update_optics_min_samples) +optics_button.on_clicked(update_optics_clustering) +optics_show_button.on_clicked(update_optics_clustering) +update_optics_min_samples() +update_optics_clustering() +plt.show() + +# %% plot with HDBSCAN labels +plt.close('all') + +is_standalone_hdbscan = hasattr(HDBSCAN, 'generate_prediction_data') +subplots = 3 if is_standalone_hdbscan else 2 +button_labels = ['EoM/Leaf', 'Soft'] if is_standalone_hdbscan else ['EoM/Leaf'] + +fig_hdbscan = plt.figure('HDBSCAN', figsize=(8, 11)) + +ax_hdbscan = fig_hdbscan.add_subplot(subplots, 1, 1) +sc_hdbscan = ax_hdbscan.scatter(*X.T, alpha=0.8) +sc_hdbscan.set_edgecolor(cm((y + 1) % cm.N)) +ax_hdbscan.set_xticks([]) +ax_hdbscan.set_yticks([]) + +fig_hdbscan.subplots_adjust(left=0.15, top=0.95, right=0.99) + +ax_hdbscan_eps_slider = fig_hdbscan.add_axes([0.03, 0.1, 0.0225, 0.75]) +hdbscan_eps_slider = Slider(ax=ax_hdbscan_eps_slider, label="eps", orientation="vertical", + valmin=0.0, valmax=0.7, valinit=0.0) + +ax_hdbscan_samples_slider = fig_hdbscan.add_axes([0.07, 0.15, 0.0225, 0.75]) +hdbscan_samples_slider = Slider(ax=ax_hdbscan_samples_slider, label='min_samples', orientation="vertical", + valmin=2, valmax=70, valinit=6, valstep=1) + +if is_standalone_hdbscan: + ax_hdbscan_tree = fig_hdbscan.add_subplot(subplots, 1, 3) + ax_hdbscan_tree.set_title('Condensed Tree Plot') + +ax_hdbscan_pos = ax_hdbscan.get_position() +ax_hdbscan_button = fig_hdbscan.add_axes([0.15, ax_hdbscan_pos.y0, 0.15, 0.1]) +hdbscan_button = CheckButtons(ax=ax_hdbscan_button, labels=button_labels) + +ax_hdbscan_show = fig_hdbscan.add_axes([0.99 - 0.3, ax_hdbscan_pos.y1 - 0.05, 0.3, 0.05]) +hdbscan_show_button = CheckButtons(ax=ax_hdbscan_show, labels=['Show Ground Truth']) + +ax_hdbscan_bars = fig_hdbscan.add_subplot(subplots, 1, 2) +ax_hdbscan_bars.bar([0,1], [0, 1]) +ax_hdbscan_bars.set_xticks([]) + +if is_standalone_hdbscan: + def update_hdbscan(val=None): + if hdbscan_button.get_status()[0]: + csm = 'Leaf' + else: + csm = 'EoM' + + issoft = hdbscan_button.get_status()[1] + + alg = HDBSCAN(min_samples=int(hdbscan_samples_slider.val), + min_cluster_size=40, + cluster_selection_epsilon=0.0 if issoft else float(hdbscan_eps_slider.val), + cluster_selection_method=csm.lower(), + allow_single_cluster=True, + prediction_data=True) + labels = alg.fit_predict(X) + + # soft clustering + if issoft: + soft = ' with Soft Clustering' + probas = hdbscan.prediction.all_points_membership_vectors(alg.fit(X)) + soft_labels = np.argmax(probas, axis=1) + + good = probas[range(len(labels)), labels] > hdbscan_eps_slider.val # 0.05 for 5% + soft_labels[~good] = -1 + + noise = labels == -1 + labels[noise] = soft_labels[noise] + ax_hdbscan_eps_slider.texts[0].set_text('max prob') + descr = f'and soft clustering probability cutoff at {hdbscan_eps_slider.val * 100:.1f}%' + else: + ax_hdbscan_eps_slider.texts[0].set_text('eps') + soft = '' + descr = f'and cluster_selection_epsilon={hdbscan_eps_slider.val}' + + ax_hdbscan.set_title(f'HDBSCAN with {csm} Cluster Selection Method{soft}') + matched_labels, mapper = match_labels(labels, return_mapper=True) + mapped_colors = np.array(cm.colors[1:])[mapper] + + ax_hdbscan_tree.clear() + alg.condensed_tree_.plot(axis=ax_hdbscan_tree, colorbar=False, select_clusters=True, selection_palette=mapped_colors) + + sc_hdbscan.set_facecolor(cm((matched_labels + 1) % cm.N)) + sc_hdbscan.set_sizes(list(map(lambda l: 10 if l < 0 else 50, matched_labels))) + if hdbscan_show_button.get_status()[0]: + draw_cluster_hist(ax_hdbscan_bars, matched_labels, y) + sc_hdbscan.set_edgecolor(cm((y + 1) % cm.N)) + else: + draw_cluster_hist(ax_hdbscan_bars, matched_labels) + sc_hdbscan.set_edgecolor(cm((matched_labels + 1) % cm.N)) + fig_hdbscan.canvas.draw_idle() + +else: + def update_hdbscan(val=None): + if hdbscan_button.get_status()[0]: + csm = 'Leaf' + else: + csm = 'EoM' + + alg = HDBSCAN(min_samples=int(hdbscan_samples_slider.val), + min_cluster_size=40, + cluster_selection_epsilon=hdbscan_eps_slider.val, + cluster_selection_method=csm.lower(), + allow_single_cluster=True) + labels = alg.fit_predict(X) + + ax_hdbscan.set_title(f'HDBSCAN with {csm} Cluster Selection Method') + matched_labels = match_labels(labels) + + sc_hdbscan.set_facecolor(cm((matched_labels + 1) % cm.N)) + sc_hdbscan.set_sizes(list(map(lambda l: 10 if l < 0 else 50, matched_labels))) + if hdbscan_show_button.get_status()[0]: + draw_cluster_hist(ax_hdbscan_bars, matched_labels, y) + sc_hdbscan.set_edgecolor(cm((y + 1) % cm.N)) + else: + draw_cluster_hist(ax_hdbscan_bars, matched_labels) + sc_hdbscan.set_edgecolor(cm((matched_labels + 1) % cm.N)) + fig_hdbscan.canvas.draw_idle() + + +hdbscan_eps_slider.on_changed(update_hdbscan) +hdbscan_samples_slider.on_changed(update_hdbscan) +hdbscan_button.on_clicked(update_hdbscan) +hdbscan_show_button.on_clicked(update_hdbscan) +update_hdbscan() +plt.show()