From 17e18cedd15df96f1ec6727cb859ff47ce790654 Mon Sep 17 00:00:00 2001
From: Christof Kaufmann <christof.kaufmann@hs-bochum.de>
Date: Mon, 19 May 2025 10:20:12 +0000
Subject: [PATCH] Notebooks from applied-cs/data-science@94f58a16

---
 06-clustering/folien-code/folien-code.ipynb   | 97 ++++++++++++++++++-
 06-clustering/folien-code/folien-code.py      | 77 ++++++++++++++-
 06-clustering/solutions/01-inertia-sol.ipynb  | 23 +++--
 .../solutions/folien-code/folien-code.ipynb   | 97 ++++++++++++++++++-
 .../solutions/folien-code/folien-code.py      | 77 ++++++++++++++-
 5 files changed, 358 insertions(+), 13 deletions(-)

diff --git a/06-clustering/folien-code/folien-code.ipynb b/06-clustering/folien-code/folien-code.ipynb
index bf1d372..f677350 100644
--- a/06-clustering/folien-code/folien-code.ipynb
+++ b/06-clustering/folien-code/folien-code.ipynb
@@ -17,14 +17,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import timeit\n",
     "import pandas as pd\n",
     "import numpy as np\n",
     "import matplotlib.pyplot as plt\n",
     "from IPython.display import display\n",
     "import seaborn as sns\n",
-    "from sklearn.cluster import KMeans, DBSCAN\n",
+    "from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering\n",
     "from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris\n",
     "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.neighbors import NearestNeighbors\n",
     "import keras\n",
     "sns.set_theme()\n"
    ]
@@ -299,7 +301,98 @@
     "ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainbow', legend='full')\n",
     "# ax.set_ylim(bottom=0)\n",
     "ax.set_ylim(bottom=0, top=10)     # zoom in y-Achse\n",
-    "ax.set_xlim(left=0.1, right=0.2)  # zoom in x-Achse\n"
+    "ax.set_xlim(left=0.1, right=0.2)  # zoom in x-Achse\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "n = 2\n",
+    "rng = np.random.default_rng()\n",
+    "datasets = {\n",
+    "    'normal': rng.standard_normal(size=(2000, n), dtype='float32'),\n",
+    "    'uniform': rng.random(size=(2000, n), dtype='float32'),\n",
+    "}\n",
+    "\n",
+    "for name, X in datasets.items():\n",
+    "    nn = NearestNeighbors(n_neighbors=5, n_jobs=1)\n",
+    "    nn.fit(X)\n",
+    "    dists, neighbors = nn.kneighbors(X)\n",
+    "    max_dists = dists.max(axis=1)\n",
+    "\n",
+    "    fig, ax = plt.subplots(figsize=(3, 3))\n",
+    "    ax.scatter(*X.T, c=-max_dists, alpha=0.7) # neg to invert color map\n",
+    "    ax.set_aspect('equal')\n",
+    "    ax.set_xlabel('$x_1$')\n",
+    "    ax.set_ylabel('$x_2$')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_bins = 21\n",
+    "cmap = plt.colormaps['viridis'].resampled(n_bins)\n",
+    "n_feat = [2, 5, 10, 20, 50, 100]\n",
+    "for n in n_feat:\n",
+    "    rng = np.random.default_rng(42)\n",
+    "    datasets = {\n",
+    "        'normal': rng.standard_normal(size=(50_000, n), dtype='float32'),\n",
+    "        'uniform': rng.random(size=(50_000, n), dtype='float32'),\n",
+    "    }\n",
+    "\n",
+    "    for name, X in datasets.items():\n",
+    "        nn = NearestNeighbors(n_neighbors=5, n_jobs=-1)  # -1: use all available cores\n",
+    "\n",
+    "        start = timeit.default_timer()\n",
+    "        nn.fit(X)\n",
+    "        dists, neighbors = nn.kneighbors(X)\n",
+    "        max_dists = dists.max(axis=1)\n",
+    "        end = timeit.default_timer()\n",
+    "        print(f'{name=}, {n=}, runtime: {end - start} s')\n",
+    "\n",
+    "        fig, ax = plt.subplots(figsize=(7, 5))\n",
+    "        data_range = np.quantile(max_dists, [0.02, 0.98])\n",
+    "        N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range)\n",
+    "        for i, p in enumerate(patches):\n",
+    "            p.set_facecolor(cmap(n_bins - 1 - i))\n",
+    "        ax.set_xlim(data_range)\n",
+    "        ax.set_xlabel('Max distances to 4 neighbors')\n",
+    "        ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions')\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n = 20\n",
+    "X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False)\n",
+    "X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]]))\n",
+    "\n",
+    "# without outlier\n",
+    "iteration = 18\n",
+    "agg = AgglomerativeClustering(n_clusters=n - iteration, linkage='single')  # n - iteration clusters remain\n",
+    "c = agg.fit_predict(X)\n",
+    "plt.figure()\n",
+    "plt.scatter(*X.T, c=c)\n",
+    "plt.axis('square')\n",
+    "\n",
+    "# with outlier\n",
+    "agg = AgglomerativeClustering(n_clusters=3, linkage='single')\n",
+    "c = agg.fit_predict(X_outlier)\n",
+    "plt.figure()\n",
+    "plt.scatter(*X_outlier.T, c=c)\n",
+    "plt.axis('square')\n"
    ]
   }
  ],
diff --git a/06-clustering/folien-code/folien-code.py b/06-clustering/folien-code/folien-code.py
index 2f4a972..c21328f 100644
--- a/06-clustering/folien-code/folien-code.py
+++ b/06-clustering/folien-code/folien-code.py
@@ -4,14 +4,16 @@
 # Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten.
 
 # %% import
+import timeit
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 from IPython.display import display
 import seaborn as sns
-from sklearn.cluster import KMeans, DBSCAN
+from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
 from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris
 from sklearn.model_selection import train_test_split
+from sklearn.neighbors import NearestNeighbors
 import keras
 sns.set_theme()
 
@@ -215,3 +217,76 @@ ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainb
 # ax.set_ylim(bottom=0)
 ax.set_ylim(bottom=0, top=10)     # zoom in y-Achse
 ax.set_xlim(left=0.1, right=0.2)  # zoom in x-Achse
+
+
+# %% Dichteverteilungen
+# Local-density demo: color 2000 random 2-D samples by their largest returned neighbor distance.
+n = 2  # number of features (columns) per dataset
+rng = np.random.default_rng()  # unseeded, so the samples differ on every run
+datasets = {
+    'normal': rng.standard_normal(size=(2000, n), dtype='float32'),
+    'uniform': rng.random(size=(2000, n), dtype='float32'),
+}
+
+for name, X in datasets.items():
+    nn = NearestNeighbors(n_neighbors=5, n_jobs=1)
+    nn.fit(X)
+    dists, neighbors = nn.kneighbors(X)  # query the same data the index was fitted on
+    max_dists = dists.max(axis=1)  # per sample: largest of its 5 returned distances
+
+    fig, ax = plt.subplots(figsize=(3, 3))
+    ax.scatter(*X.T, c=-max_dists, alpha=0.7) # neg to invert color map
+    ax.set_aspect('equal')
+    ax.set_xlabel('$x_1$')
+    ax.set_ylabel('$x_2$')
+
+# %% Dichteverteilungen in hochdimensionalen Räumen
+n_bins = 21
+cmap = plt.colormaps['viridis'].resampled(n_bins)  # one discrete color per histogram bin
+n_feat = [2, 5, 10, 20, 50, 100]  # feature counts (dimensions) to compare
+for n in n_feat:  # one histogram per distribution and dimensionality
+    rng = np.random.default_rng(42)  # re-seeded for every n, so each run draws the same samples
+    datasets = {
+        'normal': rng.standard_normal(size=(50_000, n), dtype='float32'),
+        'uniform': rng.random(size=(50_000, n), dtype='float32'),
+    }
+
+    for name, X in datasets.items():
+        nn = NearestNeighbors(n_neighbors=5, n_jobs=-1)  # -1: use all available cores
+        # time the index build and the neighbor query together
+        start = timeit.default_timer()
+        nn.fit(X)
+        dists, neighbors = nn.kneighbors(X)
+        max_dists = dists.max(axis=1)  # per sample: largest of its 5 returned distances
+        end = timeit.default_timer()
+        print(f'{name=}, {n=}, runtime: {end - start} s')
+
+        fig, ax = plt.subplots(figsize=(7, 5))
+        data_range = np.quantile(max_dists, [0.02, 0.98])  # restrict the histogram to the 2%..98% quantile range
+        N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range)
+        for i, p in enumerate(patches):
+            p.set_facecolor(cmap(n_bins - 1 - i))  # reversed color order: first bin gets the last color
+        ax.set_xlim(data_range)
+        ax.set_xlabel('Max distances to 4 neighbors')
+        ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions')
+
+
+# %% Single-Linkage Verhalten
+n = 20  # number of samples drawn from the two-moons generator
+X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False)
+X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]]))  # same data plus two hand-placed extra points
+
+# without outlier
+iteration = 18
+agg = AgglomerativeClustering(n_clusters=n - iteration, linkage='single')  # n - iteration clusters remain
+c = agg.fit_predict(X)
+plt.figure()
+plt.scatter(*X.T, c=c)
+plt.axis('square')
+
+# with outlier
+agg = AgglomerativeClustering(n_clusters=3, linkage='single')
+c = agg.fit_predict(X_outlier)
+plt.figure()
+plt.scatter(*X_outlier.T, c=c)
+plt.axis('square')
diff --git a/06-clustering/solutions/01-inertia-sol.ipynb b/06-clustering/solutions/01-inertia-sol.ipynb
index 7244e46..bad96f8 100644
--- a/06-clustering/solutions/01-inertia-sol.ipynb
+++ b/06-clustering/solutions/01-inertia-sol.ipynb
@@ -131,13 +131,15 @@
     "die Repräsentanten für alle Cluster `mu` im Prinzip analog zum ersten\n",
     "Ansatz. Dann berechnen wir die quadrierte Distanzmatrix\n",
     "`sqr_dist_matrix`, analog zu den Folien (nur halt ohne Wurzel).\n",
-    "Anschließend wählen wir per Indizierung die Abstände der Samples, die zu\n",
-    "Cluster `j` gehören, zu $\\mu_j$ (Spalte `j`) und summieren sie auf. Das\n",
-    "wird in der `list`-Comprehension für jedes Cluster `j` gemacht und diese\n",
-    "Trägheitsmomente werden zum Gesamtträgheitsmoment `total_inertia`\n",
-    "aufsummiert."
+    "Anschließend wählen wir entweder per Indizierung die Abstände der\n",
+    "Samples, die zu Cluster `j` gehören, zu $\\mu_j$ (Spalte `j`) und\n",
+    "summieren sie auf. Das wird in der `list`-Comprehension für jedes\n",
+    "Cluster `j` gemacht und diese Trägheitsmomente werden zum\n",
+    "Gesamtträgheitsmoment `total_inertia` aufsummiert. Oder wir machen das\n",
+    "per komplexer Indizierung mit Indexpaaren. Hierbei wählt `y` jeweils die\n",
+    "richtige Spalte aus."
    ],
-   "id": "0011-f01157f3b95bd8a3730718503771b0bfdc3571b74ca13b9939f17eb58ab"
+   "id": "0011-2e0e1e9b6d0bd37fa78d2d2bdab3c699c3267bc9df7f2fdb3e03cc42823"
   },
   {
    "cell_type": "code",
@@ -153,10 +155,17 @@
     "    mu = np.array([X[y == j].mean(axis=0) for j in range(k)])\n",
     "    diff = X[:, np.newaxis, :] - mu[np.newaxis, :, :]\n",
     "    sqr_dist_matrix = np.sum(diff ** 2, axis=2)\n",
+    "\n",
+    "    # entweder mit list Comprehension\n",
     "    total_inertia = sum([np.sum(sqr_dist_matrix[y == j, j]) for j in range(k)])\n",
+    "\n",
+    "    # oder paarweiser Indizierung\n",
+    "    all_rows = np.arange(len(y))\n",
+    "    total_inertia = np.sum(sqr_dist_matrix[all_rows, y])\n",
+    "\n",
     "    inertia_comp.append(total_inertia)"
    ],
-   "id": "0012-fe2cf201c04462c6164ddc58d877a6d58d84dc101e5c31b244951544183"
+   "id": "0012-9050cec2f330532f97b66a58127c4417c52d272f1b2007d7f2c90548730"
   },
   {
    "cell_type": "code",
diff --git a/06-clustering/solutions/folien-code/folien-code.ipynb b/06-clustering/solutions/folien-code/folien-code.ipynb
index bf1d372..f677350 100644
--- a/06-clustering/solutions/folien-code/folien-code.ipynb
+++ b/06-clustering/solutions/folien-code/folien-code.ipynb
@@ -17,14 +17,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import timeit\n",
     "import pandas as pd\n",
     "import numpy as np\n",
     "import matplotlib.pyplot as plt\n",
     "from IPython.display import display\n",
     "import seaborn as sns\n",
-    "from sklearn.cluster import KMeans, DBSCAN\n",
+    "from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering\n",
     "from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris\n",
     "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.neighbors import NearestNeighbors\n",
     "import keras\n",
     "sns.set_theme()\n"
    ]
@@ -299,7 +301,98 @@
     "ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainbow', legend='full')\n",
     "# ax.set_ylim(bottom=0)\n",
     "ax.set_ylim(bottom=0, top=10)     # zoom in y-Achse\n",
-    "ax.set_xlim(left=0.1, right=0.2)  # zoom in x-Achse\n"
+    "ax.set_xlim(left=0.1, right=0.2)  # zoom in x-Achse\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "n = 2\n",
+    "rng = np.random.default_rng()\n",
+    "datasets = {\n",
+    "    'normal': rng.standard_normal(size=(2000, n), dtype='float32'),\n",
+    "    'uniform': rng.random(size=(2000, n), dtype='float32'),\n",
+    "}\n",
+    "\n",
+    "for name, X in datasets.items():\n",
+    "    nn = NearestNeighbors(n_neighbors=5, n_jobs=1)\n",
+    "    nn.fit(X)\n",
+    "    dists, neighbors = nn.kneighbors(X)\n",
+    "    max_dists = dists.max(axis=1)\n",
+    "\n",
+    "    fig, ax = plt.subplots(figsize=(3, 3))\n",
+    "    ax.scatter(*X.T, c=-max_dists, alpha=0.7) # neg to invert color map\n",
+    "    ax.set_aspect('equal')\n",
+    "    ax.set_xlabel('$x_1$')\n",
+    "    ax.set_ylabel('$x_2$')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_bins = 21\n",
+    "cmap = plt.colormaps['viridis'].resampled(n_bins)\n",
+    "n_feat = [2, 5, 10, 20, 50, 100]\n",
+    "for n in n_feat:\n",
+    "    rng = np.random.default_rng(42)\n",
+    "    datasets = {\n",
+    "        'normal': rng.standard_normal(size=(50_000, n), dtype='float32'),\n",
+    "        'uniform': rng.random(size=(50_000, n), dtype='float32'),\n",
+    "    }\n",
+    "\n",
+    "    for name, X in datasets.items():\n",
+    "        nn = NearestNeighbors(n_neighbors=5, n_jobs=-1)  # -1: use all available cores\n",
+    "\n",
+    "        start = timeit.default_timer()\n",
+    "        nn.fit(X)\n",
+    "        dists, neighbors = nn.kneighbors(X)\n",
+    "        max_dists = dists.max(axis=1)\n",
+    "        end = timeit.default_timer()\n",
+    "        print(f'{name=}, {n=}, runtime: {end - start} s')\n",
+    "\n",
+    "        fig, ax = plt.subplots(figsize=(7, 5))\n",
+    "        data_range = np.quantile(max_dists, [0.02, 0.98])\n",
+    "        N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range)\n",
+    "        for i, p in enumerate(patches):\n",
+    "            p.set_facecolor(cmap(n_bins - 1 - i))\n",
+    "        ax.set_xlim(data_range)\n",
+    "        ax.set_xlabel('Max distances to 4 neighbors')\n",
+    "        ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions')\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n = 20\n",
+    "X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False)\n",
+    "X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]]))\n",
+    "\n",
+    "# without outlier\n",
+    "iteration = 18\n",
+    "agg = AgglomerativeClustering(n_clusters=n - iteration, linkage='single')  # n - iteration clusters remain\n",
+    "c = agg.fit_predict(X)\n",
+    "plt.figure()\n",
+    "plt.scatter(*X.T, c=c)\n",
+    "plt.axis('square')\n",
+    "\n",
+    "# with outlier\n",
+    "agg = AgglomerativeClustering(n_clusters=3, linkage='single')\n",
+    "c = agg.fit_predict(X_outlier)\n",
+    "plt.figure()\n",
+    "plt.scatter(*X_outlier.T, c=c)\n",
+    "plt.axis('square')\n"
    ]
   }
  ],
diff --git a/06-clustering/solutions/folien-code/folien-code.py b/06-clustering/solutions/folien-code/folien-code.py
index 2f4a972..c21328f 100644
--- a/06-clustering/solutions/folien-code/folien-code.py
+++ b/06-clustering/solutions/folien-code/folien-code.py
@@ -4,14 +4,16 @@
 # Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten.
 
 # %% import
+import timeit
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 from IPython.display import display
 import seaborn as sns
-from sklearn.cluster import KMeans, DBSCAN
+from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
 from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris
 from sklearn.model_selection import train_test_split
+from sklearn.neighbors import NearestNeighbors
 import keras
 sns.set_theme()
 
@@ -215,3 +217,76 @@ ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainb
 # ax.set_ylim(bottom=0)
 ax.set_ylim(bottom=0, top=10)     # zoom in y-Achse
 ax.set_xlim(left=0.1, right=0.2)  # zoom in x-Achse
+
+
+# %% Dichteverteilungen
+# Local-density demo: color 2000 random 2-D samples by their largest returned neighbor distance.
+n = 2  # number of features (columns) per dataset
+rng = np.random.default_rng()  # unseeded, so the samples differ on every run
+datasets = {
+    'normal': rng.standard_normal(size=(2000, n), dtype='float32'),
+    'uniform': rng.random(size=(2000, n), dtype='float32'),
+}
+
+for name, X in datasets.items():
+    nn = NearestNeighbors(n_neighbors=5, n_jobs=1)
+    nn.fit(X)
+    dists, neighbors = nn.kneighbors(X)  # query the same data the index was fitted on
+    max_dists = dists.max(axis=1)  # per sample: largest of its 5 returned distances
+
+    fig, ax = plt.subplots(figsize=(3, 3))
+    ax.scatter(*X.T, c=-max_dists, alpha=0.7) # neg to invert color map
+    ax.set_aspect('equal')
+    ax.set_xlabel('$x_1$')
+    ax.set_ylabel('$x_2$')
+
+# %% Dichteverteilungen in hochdimensionalen Räumen
+n_bins = 21
+cmap = plt.colormaps['viridis'].resampled(n_bins)  # one discrete color per histogram bin
+n_feat = [2, 5, 10, 20, 50, 100]  # feature counts (dimensions) to compare
+for n in n_feat:  # one histogram per distribution and dimensionality
+    rng = np.random.default_rng(42)  # re-seeded for every n, so each run draws the same samples
+    datasets = {
+        'normal': rng.standard_normal(size=(50_000, n), dtype='float32'),
+        'uniform': rng.random(size=(50_000, n), dtype='float32'),
+    }
+
+    for name, X in datasets.items():
+        nn = NearestNeighbors(n_neighbors=5, n_jobs=-1)  # -1: use all available cores
+        # time the index build and the neighbor query together
+        start = timeit.default_timer()
+        nn.fit(X)
+        dists, neighbors = nn.kneighbors(X)
+        max_dists = dists.max(axis=1)  # per sample: largest of its 5 returned distances
+        end = timeit.default_timer()
+        print(f'{name=}, {n=}, runtime: {end - start} s')
+
+        fig, ax = plt.subplots(figsize=(7, 5))
+        data_range = np.quantile(max_dists, [0.02, 0.98])  # restrict the histogram to the 2%..98% quantile range
+        N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range)
+        for i, p in enumerate(patches):
+            p.set_facecolor(cmap(n_bins - 1 - i))  # reversed color order: first bin gets the last color
+        ax.set_xlim(data_range)
+        ax.set_xlabel('Max distances to 4 neighbors')
+        ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions')
+
+
+# %% Single-Linkage Verhalten
+n = 20  # number of samples drawn from the two-moons generator
+X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False)
+X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]]))  # same data plus two hand-placed extra points
+
+# without outlier
+iteration = 18
+agg = AgglomerativeClustering(n_clusters=n - iteration, linkage='single')  # n - iteration clusters remain
+c = agg.fit_predict(X)
+plt.figure()
+plt.scatter(*X.T, c=c)
+plt.axis('square')
+
+# with outlier
+agg = AgglomerativeClustering(n_clusters=3, linkage='single')
+c = agg.fit_predict(X_outlier)
+plt.figure()
+plt.scatter(*X_outlier.T, c=c)
+plt.axis('square')
-- 
GitLab