Skip to content
Snippets Groups Projects
Commit 17e18ced authored by Christof Kaufmann's avatar Christof Kaufmann
Browse files

Notebooks from applied-cs/data-science@94f58a16

parent 96d4d671
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Code zu Folien # Code zu Folien
Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten. Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
import timeit
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from IPython.display import display from IPython.display import display
import seaborn as sns import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import keras import keras
sns.set_theme() sns.set_theme()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
def plot_clusters(X, labels, centers=None, ax=None, grid=True, xlabel='$x_1$', ylabel='$x_2$'): def plot_clusters(X, labels, centers=None, ax=None, grid=True, xlabel='$x_1$', ylabel='$x_2$'):
if ax is None: if ax is None:
fig, ax = plt.subplots() fig, ax = plt.subplots()
markers=['*', 's', '^', 'D', 'o', '<', 'h', 'H', 'X', 'd', 'P', '.'] markers=['*', 's', '^', 'D', 'o', '<', 'h', 'H', 'X', 'd', 'P', '.']
colorName = ['teal','orange','firebrick','dodgerblue', 'b', 'y', 'k'] colorName = ['teal','orange','firebrick','dodgerblue', 'b', 'y', 'k']
for i in np.unique(labels): for i in np.unique(labels):
groupi = labels == i groupi = labels == i
ax.scatter(X[groupi, 0], X[groupi, 1], s=120 if i == 0 else 60, alpha=0.2, marker=markers[i], c=colorName[i]) ax.scatter(X[groupi, 0], X[groupi, 1], s=120 if i == 0 else 60, alpha=0.2, marker=markers[i], c=colorName[i])
if centers is not None: if centers is not None:
for i in range(len(centers)): for i in range(len(centers)):
ax.scatter(centers[i, 0], centers[i, 1], s=200 if i == 0 else 100, ax.scatter(centers[i, 0], centers[i, 1], s=200 if i == 0 else 100,
marker=markers[i], c='black', edgecolors=colorName[i]) marker=markers[i], c='black', edgecolors=colorName[i])
ax.scatter(centers[i, 0], centers[i, 1], s=12, c=colorName[i]) ax.scatter(centers[i, 0], centers[i, 1], s=12, c=colorName[i])
ax.set_xlabel(xlabel) ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel) ax.set_ylabel(ylabel)
if grid: if grid:
ax.grid(True) ax.grid(True)
return ax return ax
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
rng = np.random.default_rng() rng = np.random.default_rng()
X, y = make_blobs(n_samples=[100, 100, 400, 400], random_state=1) X, y = make_blobs(n_samples=[100, 100, 400, 400], random_state=1)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolors='k', alpha=0.5) plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolors='k', alpha=0.5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
k = 4 k = 4
# NumPy: # NumPy:
# mu = rng.choice(X, size=k, replace=False) # zufällige Startwerte bei NumPy-Array # mu = rng.choice(X, size=k, replace=False) # zufällige Startwerte bei NumPy-Array
mu = X[[4, 7, 100, 102], :] # nicht all zu schlechte Startwerte von Folie "k-Means Algorithmus – Ablauf" mu = X[[4, 7, 100, 102], :] # nicht all zu schlechte Startwerte von Folie "k-Means Algorithmus – Ablauf"
# mu = X[[28, 732, 926, 966], :] # schlechte Startwerte von Folie "Zufällige Startwerte ⇒ nicht deterministisch" # mu = X[[28, 732, 926, 966], :] # schlechte Startwerte von Folie "Zufällige Startwerte ⇒ nicht deterministisch"
# mu = X[[100, 50, 39, 42], :] # schlechte Startwerte von Folie "Mehrere Durchläufe", mittlere Abbildung # mu = X[[100, 50, 39, 42], :] # schlechte Startwerte von Folie "Mehrere Durchläufe", mittlere Abbildung
# # Pandas: # # Pandas:
# mu = X.sample(k, ignore_index=True, random_state=0) # zufällige Startwerte bei Pandas-DataFrame # mu = X.sample(k, ignore_index=True, random_state=0) # zufällige Startwerte bei Pandas-DataFrame
print(mu.shape) print(mu.shape)
print(X.shape) print(X.shape)
plot_clusters(X, np.full_like(y, 6), mu) plot_clusters(X, np.full_like(y, 6), mu)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
diff = X[:, np.newaxis, :] - mu[np.newaxis, :, :] diff = X[:, np.newaxis, :] - mu[np.newaxis, :, :]
dist_matrix = np.sqrt(np.sum(diff ** 2, axis=2)) dist_matrix = np.sqrt(np.sum(diff ** 2, axis=2))
print(dist_matrix.shape) print(dist_matrix.shape)
labels = np.argmin(dist_matrix, axis=1) # Zuordnung der Samples labels = np.argmin(dist_matrix, axis=1) # Zuordnung der Samples
mu = np.array([X[labels == j].mean(axis=0) for j in range(k)]) mu = np.array([X[labels == j].mean(axis=0) for j in range(k)])
plot_clusters(X, labels, mu) plot_clusters(X, labels, mu)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
a = np.array([10, 20, 30]).reshape(3, 1) a = np.array([10, 20, 30]).reshape(3, 1)
b = np.array([1, 2, 3]) b = np.array([1, 2, 3])
display(a + b) display(a + b)
print(a.shape) print(a.shape)
print(' ', b.shape) print(' ', b.shape)
print((a + b).shape) print((a + b).shape)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
X_iris = load_iris()['data'] X_iris = load_iris()['data']
x_test = [5.5, 2.5, 5, 1.5] x_test = [5.5, 2.5, 5, 1.5]
diff = X_iris - x_test diff = X_iris - x_test
display(diff) display(diff)
print(X_iris.shape) print(X_iris.shape)
print(' ', np.shape(x_test)) print(' ', np.shape(x_test))
print(diff.shape) print(diff.shape)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
(X_train, y_train), (X_test, y_test) = keras.datasets.cifar10.load_data() (X_train, y_train), (X_test, y_test) = keras.datasets.cifar10.load_data()
ship = X_test[1] / 255 ship = X_test[1] / 255
scale = np.array([0.1, 1, 0.7]) scale = np.array([0.1, 1, 0.7])
result = ship * scale result = ship * scale
plt.figure() plt.figure()
plt.imshow(ship) plt.imshow(ship)
plt.figure() plt.figure()
plt.imshow(result) plt.imshow(result)
print(ship.shape) print(ship.shape)
print(' ', scale.shape) print(' ', scale.shape)
print(result.shape) print(result.shape)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
X, y = load_breast_cancer(return_X_y=True) X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y) X_train, X_test, y_train, y_test = train_test_split(X, y)
# # Scikit-Learns predict-Methode gibt 1D-Array zurück # # Scikit-Learns predict-Methode gibt 1D-Array zurück
# from sklearn.neighbors import KNeighborsClassifier # from sklearn.neighbors import KNeighborsClassifier
# model = KNeighborsClassifier() # model = KNeighborsClassifier()
# model.fit(X_train, y_train) # model.fit(X_train, y_train)
# Keras predict-Methode gibt 2D-Spalten-Array zurück # Keras predict-Methode gibt 2D-Spalten-Array zurück
from keras import Sequential, Input from keras import Sequential, Input
Dense = keras.layers.Dense Dense = keras.layers.Dense
model = Sequential() model = Sequential()
model.add(Input(X_train.shape[1:])) model.add(Input(X_train.shape[1:]))
model.add(Dense(50, activation='tanh')) model.add(Dense(50, activation='tanh'))
model.add(Dense(1, activation='sigmoid')) model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, batch_size=71, verbose=False) model.fit(X_train, y_train, epochs=20, batch_size=71, verbose=False)
y_p = model.predict(X_test, verbose=False) # 2D y_p = model.predict(X_test, verbose=False) # 2D
err = y_p - y_test err = y_p - y_test
mae = np.mean(np.abs(err)) # keine Fehlermeldung, aber falscher Wert mae = np.mean(np.abs(err)) # keine Fehlermeldung, aber falscher Wert
print(f'ohne squeeze: {y_p.shape=}, {err.shape=}, {mae=:.3}') print(f'ohne squeeze: {y_p.shape=}, {err.shape=}, {mae=:.3}')
y_p = model.predict(X_test, verbose=False).squeeze() # auf 1D reduzieren y_p = model.predict(X_test, verbose=False).squeeze() # auf 1D reduzieren
err = y_p - y_test err = y_p - y_test
mae = np.mean(np.abs(err)) # richtiger Wert mae = np.mean(np.abs(err)) # richtiger Wert
print(f'mit squeeze: {y_p.shape=}, {err.shape=}, {mae=:.3}') print(f'mit squeeze: {y_p.shape=}, {err.shape=}, {mae=:.3}')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
from sklearn.neighbors import NearestNeighbors from sklearn.neighbors import NearestNeighbors
X = np.array([ X = np.array([
[-4, 3], [-4, 3],
[0, 3], [0, 3],
[0, 0], [0, 0],
[3, 0], [3, 0],
[4, 0], [4, 0],
[4, 1], [4, 1],
]) ])
plt.scatter(*X.T, c=range(len(X))) plt.scatter(*X.T, c=range(len(X)))
plt.axis('equal') plt.axis('equal')
nn = NearestNeighbors() nn = NearestNeighbors()
nn.fit(X) nn.fit(X)
dists, indices = nn.kneighbors(X, 3) dists, indices = nn.kneighbors(X, 3)
display(dists[:, 1:].round(2)) display(dists[:, 1:].round(2))
display(indices[:, 1:]) display(indices[:, 1:])
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
X = np.array([ X = np.array([
[0.0, 0.0], [0.0, 0.0],
[0.25, 0.5], [0.25, 0.5],
[1.0, 1.0], [1.0, 1.0],
[1.5, 0.75], [1.5, 0.75],
[0.0, 1.5], [0.0, 1.5],
[2.5, 0.5], [2.5, 0.5],
[4.5, 1.0], [4.5, 1.0],
[5.0, 2.0], [5.0, 2.0],
]) ])
knn = NearestNeighbors(n_neighbors=3) knn = NearestNeighbors(n_neighbors=3)
knn.fit(X) knn.fit(X)
dists, neighbors = knn.kneighbors(X) dists, neighbors = knn.kneighbors(X)
max_dists = dists.max(axis=1) max_dists = dists.max(axis=1)
plt.hist(max_dists, bins=5, range=[0.5, 3], edgecolor='w') plt.hist(max_dists, bins=5, range=[0.5, 3], edgecolor='w')
plt.xlabel(f'Max distances to {dists.shape[1] - 1} neighbors') plt.xlabel(f'Max distances to {dists.shape[1] - 1} neighbors')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
X, _ = make_moons(n_samples=480, noise=0.02) X, _ = make_moons(n_samples=480, noise=0.02)
knn = NearestNeighbors(n_neighbors=5) knn = NearestNeighbors(n_neighbors=5)
knn.fit(X) knn.fit(X)
dists, neighbors = knn.kneighbors(X) dists, neighbors = knn.kneighbors(X)
max_dists = dists.max(axis=1) max_dists = dists.max(axis=1)
plt.hist(max_dists, bins=20, edgecolor='w') plt.hist(max_dists, bins=20, edgecolor='w')
plt.xlabel(f'Max distances to {dists.shape[1] - 1} neighbors') plt.xlabel(f'Max distances to {dists.shape[1] - 1} neighbors')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# data # data
X, y = make_circles(1000, noise=0.075, factor=0.5, random_state=42) X, y = make_circles(1000, noise=0.075, factor=0.5, random_state=42)
plt.scatter(*X.T, c='gray', edgecolors='k') plt.scatter(*X.T, c='gray', edgecolors='k')
plt.axis('equal') plt.axis('equal')
# eps-k data # eps-k data
min_samples_range = np.arange(2, 10) min_samples_range = np.arange(2, 10)
eps_range = np.arange(0.01, 0.2, 0.001) eps_range = np.arange(0.01, 0.2, 0.001)
records = [] records = []
for min_samples in min_samples_range: for min_samples in min_samples_range:
for eps in eps_range: for eps in eps_range:
clusterer = DBSCAN(eps=eps, min_samples=min_samples) clusterer = DBSCAN(eps=eps, min_samples=min_samples)
labels = clusterer.fit_predict(X) labels = clusterer.fit_predict(X)
noise_lvl = np.mean(labels < 0) noise_lvl = np.mean(labels < 0)
n_clusters = labels.max() + 1 n_clusters = labels.max() + 1
records.append((min_samples, eps, n_clusters, noise_lvl)) records.append((min_samples, eps, n_clusters, noise_lvl))
df = pd.DataFrame.from_records(records, columns=['min_samples', 'eps', 'n_clusters', 'noise_lvl']) df = pd.DataFrame.from_records(records, columns=['min_samples', 'eps', 'n_clusters', 'noise_lvl'])
df_clusters = df.pivot(index='eps', columns='min_samples', values='n_clusters') df_clusters = df.pivot(index='eps', columns='min_samples', values='n_clusters')
# eps-k plot # eps-k plot
plt.figure() plt.figure()
ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainbow', legend='full') ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainbow', legend='full')
# ax.set_ylim(bottom=0) # ax.set_ylim(bottom=0)
ax.set_ylim(bottom=0, top=10) # zoom in y-Achse ax.set_ylim(bottom=0, top=10) # zoom in y-Achse
ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse
```
%% Cell type:code id: tags:
```
# Visualize local density via nearest-neighbor distances in 2-D.
# (Indentation of the loop body restored — it was lost in extraction.)
n = 2  # number of features
rng = np.random.default_rng()
# two point clouds with different density profiles
datasets = {
    'normal': rng.standard_normal(size=(2000, n), dtype='float32'),
    'uniform': rng.random(size=(2000, n), dtype='float32'),
}
for name, X in datasets.items():
    # n_neighbors=5 includes the query point itself, so max_dists is the
    # distance to the 4th-nearest *other* neighbor
    nn = NearestNeighbors(n_neighbors=5, n_jobs=1)
    nn.fit(X)
    dists, neighbors = nn.kneighbors(X)
    max_dists = dists.max(axis=1)
    fig, ax = plt.subplots(figsize=(3, 3))
    ax.scatter(*X.T, c=-max_dists, alpha=0.7)  # neg to invert color map
    ax.set_aspect('equal')
    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
```
%% Cell type:code id: tags:
```
# Histograms of 4th-neighbor distances for increasing dimensionality —
# illustrates the curse of dimensionality (distances concentrate).
# (Loop-body indentation restored — it was lost in extraction.)
n_bins = 21
cmap = plt.colormaps['viridis'].resampled(n_bins)
n_feat = [2, 5, 10, 20, 50, 100]
for n in n_feat:
    rng = np.random.default_rng(42)  # fixed seed: comparable data for every n
    datasets = {
        'normal': rng.standard_normal(size=(50_000, n), dtype='float32'),
        'uniform': rng.random(size=(50_000, n), dtype='float32'),
    }
    for name, X in datasets.items():
        # NOTE(review): n_jobs=13 looks arbitrary — confirm intent (n_jobs=-1 for all cores?)
        nn = NearestNeighbors(n_neighbors=5, n_jobs=13)
        start = timeit.default_timer()
        nn.fit(X)
        dists, neighbors = nn.kneighbors(X)
        max_dists = dists.max(axis=1)
        end = timeit.default_timer()
        print(f'{name=}, {n=}, runtime: {end - start} s')
        fig, ax = plt.subplots(figsize=(7, 5))
        # clip to the 2%–98% quantile range to suppress outlier bars
        data_range = np.quantile(max_dists, [0.02, 0.98])
        N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range)
        for i, p in enumerate(patches):
            p.set_facecolor(cmap(n_bins - 1 - i))  # reversed colormap across bars
        ax.set_xlim(data_range)
        ax.set_xlabel('Max distances to 4 neighbors')
        ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions')
```
%% Cell type:code id: tags:
```
# Demonstrate single-linkage agglomerative clustering, with and without outliers.
n = 20
# shuffle=False keeps samples ordered along the two half-moons
X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False)
X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]]))
# without outlier
iteration = 18  # after 18 merges of 20 points, 2 clusters remain
agg = AgglomerativeClustering(n_clusters=20 - iteration, linkage='single')
c = agg.fit_predict(X)
plt.figure()
plt.scatter(*X.T, c=c)
plt.axis('square')
# with outlier — presumably to show single linkage splitting off the two
# added points as their own cluster; verify against the rendered figure
agg = AgglomerativeClustering(n_clusters=3, linkage='single')
c = agg.fit_predict(X_outlier)
plt.figure()
plt.scatter(*X_outlier.T, c=c)
plt.axis('square')
``` ```
......
...@@ -4,14 +4,16 @@ ...@@ -4,14 +4,16 @@
# Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten. # Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten.
# %% import # %% import
import timeit
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from IPython.display import display from IPython.display import display
import seaborn as sns import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import keras import keras
sns.set_theme() sns.set_theme()
...@@ -215,3 +217,76 @@ ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainb ...@@ -215,3 +217,76 @@ ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainb
# ax.set_ylim(bottom=0) # ax.set_ylim(bottom=0)
ax.set_ylim(bottom=0, top=10) # zoom in y-Achse ax.set_ylim(bottom=0, top=10) # zoom in y-Achse
ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse
# %% density distributions
# Visualize local density via nearest-neighbor distances in 2-D.
# (Indentation of the loop body restored — it was lost in extraction.)
n = 2  # number of features
rng = np.random.default_rng()
# two point clouds with different density profiles
datasets = {
    'normal': rng.standard_normal(size=(2000, n), dtype='float32'),
    'uniform': rng.random(size=(2000, n), dtype='float32'),
}
for name, X in datasets.items():
    # n_neighbors=5 includes the query point itself, so max_dists is the
    # distance to the 4th-nearest *other* neighbor
    nn = NearestNeighbors(n_neighbors=5, n_jobs=1)
    nn.fit(X)
    dists, neighbors = nn.kneighbors(X)
    max_dists = dists.max(axis=1)
    fig, ax = plt.subplots(figsize=(3, 3))
    ax.scatter(*X.T, c=-max_dists, alpha=0.7)  # neg to invert color map
    ax.set_aspect('equal')
    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
# %% density distributions in high-dimensional spaces
# Histograms of 4th-neighbor distances for increasing dimensionality —
# illustrates the curse of dimensionality (distances concentrate).
# (Loop-body indentation restored — it was lost in extraction.)
n_bins = 21
cmap = plt.colormaps['viridis'].resampled(n_bins)
n_feat = [2, 5, 10, 20, 50, 100]
for n in n_feat:
    rng = np.random.default_rng(42)  # fixed seed: comparable data for every n
    datasets = {
        'normal': rng.standard_normal(size=(50_000, n), dtype='float32'),
        'uniform': rng.random(size=(50_000, n), dtype='float32'),
    }
    for name, X in datasets.items():
        # NOTE(review): n_jobs=13 looks arbitrary — confirm intent (n_jobs=-1 for all cores?)
        nn = NearestNeighbors(n_neighbors=5, n_jobs=13)
        start = timeit.default_timer()
        nn.fit(X)
        dists, neighbors = nn.kneighbors(X)
        max_dists = dists.max(axis=1)
        end = timeit.default_timer()
        print(f'{name=}, {n=}, runtime: {end - start} s')
        fig, ax = plt.subplots(figsize=(7, 5))
        # clip to the 2%–98% quantile range to suppress outlier bars
        data_range = np.quantile(max_dists, [0.02, 0.98])
        N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range)
        for i, p in enumerate(patches):
            p.set_facecolor(cmap(n_bins - 1 - i))  # reversed colormap across bars
        ax.set_xlim(data_range)
        ax.set_xlabel('Max distances to 4 neighbors')
        ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions')
# %% single-linkage behavior
# Demonstrate single-linkage agglomerative clustering, with and without outliers.
n = 20
# shuffle=False keeps samples ordered along the two half-moons
X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False)
X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]]))
# without outlier
iteration = 18  # after 18 merges of 20 points, 2 clusters remain
agg = AgglomerativeClustering(n_clusters=20 - iteration, linkage='single')
c = agg.fit_predict(X)
plt.figure()
plt.scatter(*X.T, c=c)
plt.axis('square')
# with outlier — presumably to show single linkage splitting off the two
# added points as their own cluster; verify against the rendered figure
agg = AgglomerativeClustering(n_clusters=3, linkage='single')
c = agg.fit_predict(X_outlier)
plt.figure()
plt.scatter(*X_outlier.T, c=c)
plt.axis('square')
%% Cell type:markdown id:0001-a3ae08f5a4a259687b176e7ee5d142ca2207e5ace1aada9d73aa8abe333 tags: %% Cell type:markdown id:0001-a3ae08f5a4a259687b176e7ee5d142ca2207e5ace1aada9d73aa8abe333 tags:
# Trägheitsmoment # Trägheitsmoment
Gegeben sind folgende Daten: Gegeben sind folgende Daten:
%% Cell type:code id:0002-c8ceb98996d092532358b219fb962ede51c887f4bbaf30a4a2a9fa39712 tags: %% Cell type:code id:0002-c8ceb98996d092532358b219fb962ede51c887f4bbaf30a4a2a9fa39712 tags:
``` ```
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from sklearn.datasets import make_circles from sklearn.datasets import make_circles
X, y1 = make_circles(1000, noise=0.1, factor=0.55) X, y1 = make_circles(1000, noise=0.1, factor=0.55)
y2 = np.asarray(X[:, 0] > 0, dtype=int) y2 = np.asarray(X[:, 0] > 0, dtype=int)
``` ```
%% Cell type:markdown id:0003-3d78bcf9122a2a1bb3e1681a8c95f29db1a5e142a429a286e2f70f5ab28 tags: %% Cell type:markdown id:0003-3d78bcf9122a2a1bb3e1681a8c95f29db1a5e142a429a286e2f70f5ab28 tags:
Hier ein Plot der Daten: Hier ein Plot der Daten:
%% Cell type:code id:0004-d297955758e8b5d26fc993f2509a3463420456b867ec1e4c37c5103fc38 tags: %% Cell type:code id:0004-d297955758e8b5d26fc993f2509a3463420456b867ec1e4c37c5103fc38 tags:
``` ```
fig, axs = plt.subplots(1, 2, sharey=True) fig, axs = plt.subplots(1, 2, sharey=True)
axs[0].scatter(X[:, 0], X[:, 1], c=y1) axs[0].scatter(X[:, 0], X[:, 1], c=y1)
axs[1].scatter(X[:, 0], X[:, 1], c=y2) axs[1].scatter(X[:, 0], X[:, 1], c=y2)
axs[0].set_box_aspect(1) axs[0].set_box_aspect(1)
axs[1].set_box_aspect(1) axs[1].set_box_aspect(1)
axs[0].set_title('y1') axs[0].set_title('y1')
axs[1].set_title('y2') axs[1].set_title('y2')
plt.show() plt.show()
``` ```
%% Cell type:markdown id:0008-2d3d0ac1dc306d99dffc90565a9e0335c9d741efdf70aa228dc53c54c66 tags: %% Cell type:markdown id:0008-2d3d0ac1dc306d99dffc90565a9e0335c9d741efdf70aa228dc53c54c66 tags:
Dabei stellen `y1` die tatsächlich gewünschte, aber nicht konvexe Dabei stellen `y1` die tatsächlich gewünschte, aber nicht konvexe
Clusterung und `y2` eine konvexe Clusterung dar. Berechnen Sie das Clusterung und `y2` eine konvexe Clusterung dar. Berechnen Sie das
Gesamtträgheitsmoment. Beachten Sie dabei, dass Sie für beide Gesamtträgheitsmoment. Beachten Sie dabei, dass Sie für beide
Clusterungen nicht die Formel für einen konvergierten $k$-Means Clusterungen nicht die Formel für einen konvergierten $k$-Means
Algorithmus verwenden können. Hier Ihr Code: Algorithmus verwenden können. Hier Ihr Code:
## Lösung ## Lösung
Wir geben zwei Lösungsvorschläge an. In beiden wird jeweils das Wir geben zwei Lösungsvorschläge an. In beiden wird jeweils das
Trägheitsmoment von `y` berechnet, was in einer äußeren `for`-Schleife Trägheitsmoment von `y` berechnet, was in einer äußeren `for`-Schleife
im ersten Durchlauf `y1` und im zweiten `y2` ist. im ersten Durchlauf `y1` und im zweiten `y2` ist.
Im ersten Ansatz durchlaufen wir mit `j` die Cluster und betrachten mit Im ersten Ansatz durchlaufen wir mit `j` die Cluster und betrachten mit
`X_j` nur die Samples, die zu Cluster `j` gehören. Damit berechnen wir `X_j` nur die Samples, die zu Cluster `j` gehören. Damit berechnen wir
dessen Repräsentanten `mu_j` und bilden die Differenzen von `X_j` zu dessen Repräsentanten `mu_j` und bilden die Differenzen von `X_j` zu
deren Repräsentanten. Wenn man die Differenzen quadriert und zeilenweise deren Repräsentanten. Wenn man die Differenzen quadriert und zeilenweise
aufsummiert, erhält man die Distanzen. Wenn man die wiederrum aufsummiert, erhält man die Distanzen. Wenn man die wiederrum
aufsummiert, erhält man das Trägheitsmoment für Cluster `j`. Daher kann aufsummiert, erhält man das Trägheitsmoment für Cluster `j`. Daher kann
man auch direkt über beide Achsen summieren. man auch direkt über beide Achsen summieren.
%% Cell type:code id:0009-2a402c7b8cd559ae7dcb8bb3ec30f9029fb26c37ee966558437ace37014 tags: %% Cell type:code id:0009-2a402c7b8cd559ae7dcb8bb3ec30f9029fb26c37ee966558437ace37014 tags:
``` ```
inertia_loop = [] inertia_loop = []
for y in [y1, y2]: for y in [y1, y2]:
total_inertia = 0 total_inertia = 0
for j in range(max(y) + 1): for j in range(max(y) + 1):
X_j = X[y == j] X_j = X[y == j]
mu_j = X_j.mean(axis=0) mu_j = X_j.mean(axis=0)
diff = X_j - mu_j diff = X_j - mu_j
inertia = np.sum(diff ** 2) inertia = np.sum(diff ** 2)
total_inertia += inertia total_inertia += inertia
inertia_loop.append(total_inertia) inertia_loop.append(total_inertia)
``` ```
%% Cell type:code id:0010-312845f479b003bfe4db2fc5e3c7f7b7f42c593739d43473d785c47586a tags: %% Cell type:code id:0010-312845f479b003bfe4db2fc5e3c7f7b7f42c593739d43473d785c47586a tags:
``` ```
inertia_loop inertia_loop
``` ```
%% Output %% Output
[663.5525861044789, 420.48555691708304] [663.5525861044789, 420.48555691708304]
%% Cell type:markdown id:0011-f01157f3b95bd8a3730718503771b0bfdc3571b74ca13b9939f17eb58ab tags: %% Cell type:markdown id:0011-2e0e1e9b6d0bd37fa78d2d2bdab3c699c3267bc9df7f2fdb3e03cc42823 tags:
Im zweiten Ansatz arbeiten wir ohne (innere) `for`-Schleife und Im zweiten Ansatz arbeiten wir ohne (innere) `for`-Schleife und
verwenden anstatt dessen `list`-Comprehensions. Zunächst berechnen wir verwenden anstatt dessen `list`-Comprehensions. Zunächst berechnen wir
die Repräsentanten für alle Cluster `mu` im Prinzip analog zum ersten die Repräsentanten für alle Cluster `mu` im Prinzip analog zum ersten
Ansatz. Dann berechnen wir die quadrierte Distanzmatrix Ansatz. Dann berechnen wir die quadrierte Distanzmatrix
`sqr_dist_matrix`, analog zu den Folien (nur halt ohne Wurzel). `sqr_dist_matrix`, analog zu den Folien (nur halt ohne Wurzel).
Anschließend wählen wir per Indizierung die Abstände der Samples, die zu Anschließend wählen wir entweder per Indizierung die Abstände der
Cluster `j` gehören, zu $\mu_j$ (Spalte `j`) und summieren sie auf. Das Samples, die zu Cluster `j` gehören, zu $\mu_j$ (Spalte `j`) und
wird in der `list`-Comprehension für jedes Cluster `j` gemacht und diese summieren sie auf. Das wird in der `list`-Comprehension für jedes
Trägheitsmomente werden zum Gesamtträgheitsmoment `total_inertia` Cluster `j` gemacht und diese Trägheitsmomente werden zum
aufsummiert. Gesamtträgheitsmoment `total_inertia` aufsummiert. Oder wir machen das
per komplexer Indizierung mit Indexpaaren. Hierbei wählt `y` jeweils die
richtige Spalte aus.
%% Cell type:code id:0012-fe2cf201c04462c6164ddc58d877a6d58d84dc101e5c31b244951544183 tags: %% Cell type:code id:0012-9050cec2f330532f97b66a58127c4417c52d272f1b2007d7f2c90548730 tags:
``` ```
inertia_comp = [] inertia_comp = []
for y in [y1, y2]: for y in [y1, y2]:
k = max(y) + 1 k = max(y) + 1
mu = np.array([X[y == j].mean(axis=0) for j in range(k)]) mu = np.array([X[y == j].mean(axis=0) for j in range(k)])
diff = X[:, np.newaxis, :] - mu[np.newaxis, :, :] diff = X[:, np.newaxis, :] - mu[np.newaxis, :, :]
sqr_dist_matrix = np.sum(diff ** 2, axis=2) sqr_dist_matrix = np.sum(diff ** 2, axis=2)
# entweder mit list Comprehension
total_inertia = sum([np.sum(sqr_dist_matrix[y == j, j]) for j in range(k)]) total_inertia = sum([np.sum(sqr_dist_matrix[y == j, j]) for j in range(k)])
# oder paarweiser Indizierung
all_rows = np.arange(len(y))
total_inertia = np.sum(sqr_dist_matrix[all_rows, y])
inertia_comp.append(total_inertia) inertia_comp.append(total_inertia)
``` ```
%% Cell type:code id:0013-54434e150ebc898aa6628247bbd58911180b2dd5b07d06ab7fd7306f86b tags: %% Cell type:code id:0013-54434e150ebc898aa6628247bbd58911180b2dd5b07d06ab7fd7306f86b tags:
``` ```
inertia_comp inertia_comp
``` ```
%% Output %% Output
[663.552586104479, 420.48555691708304] [663.552586104479, 420.48555691708304]
......
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Code zu Folien # Code zu Folien
Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten. Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
import timeit
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from IPython.display import display from IPython.display import display
import seaborn as sns import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import keras import keras
sns.set_theme() sns.set_theme()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
def plot_clusters(X, labels, centers=None, ax=None, grid=True, xlabel='$x_1$', ylabel='$x_2$'): def plot_clusters(X, labels, centers=None, ax=None, grid=True, xlabel='$x_1$', ylabel='$x_2$'):
if ax is None: if ax is None:
fig, ax = plt.subplots() fig, ax = plt.subplots()
markers=['*', 's', '^', 'D', 'o', '<', 'h', 'H', 'X', 'd', 'P', '.'] markers=['*', 's', '^', 'D', 'o', '<', 'h', 'H', 'X', 'd', 'P', '.']
colorName = ['teal','orange','firebrick','dodgerblue', 'b', 'y', 'k'] colorName = ['teal','orange','firebrick','dodgerblue', 'b', 'y', 'k']
for i in np.unique(labels): for i in np.unique(labels):
groupi = labels == i groupi = labels == i
ax.scatter(X[groupi, 0], X[groupi, 1], s=120 if i == 0 else 60, alpha=0.2, marker=markers[i], c=colorName[i]) ax.scatter(X[groupi, 0], X[groupi, 1], s=120 if i == 0 else 60, alpha=0.2, marker=markers[i], c=colorName[i])
if centers is not None: if centers is not None:
for i in range(len(centers)): for i in range(len(centers)):
ax.scatter(centers[i, 0], centers[i, 1], s=200 if i == 0 else 100, ax.scatter(centers[i, 0], centers[i, 1], s=200 if i == 0 else 100,
marker=markers[i], c='black', edgecolors=colorName[i]) marker=markers[i], c='black', edgecolors=colorName[i])
ax.scatter(centers[i, 0], centers[i, 1], s=12, c=colorName[i]) ax.scatter(centers[i, 0], centers[i, 1], s=12, c=colorName[i])
ax.set_xlabel(xlabel) ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel) ax.set_ylabel(ylabel)
if grid: if grid:
ax.grid(True) ax.grid(True)
return ax return ax
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
rng = np.random.default_rng() rng = np.random.default_rng()
X, y = make_blobs(n_samples=[100, 100, 400, 400], random_state=1) X, y = make_blobs(n_samples=[100, 100, 400, 400], random_state=1)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolors='k', alpha=0.5) plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolors='k', alpha=0.5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
k = 4 k = 4
# NumPy: # NumPy:
# mu = rng.choice(X, size=k, replace=False) # zufällige Startwerte bei NumPy-Array # mu = rng.choice(X, size=k, replace=False) # zufällige Startwerte bei NumPy-Array
mu = X[[4, 7, 100, 102], :] # nicht all zu schlechte Startwerte von Folie "k-Means Algorithmus – Ablauf" mu = X[[4, 7, 100, 102], :] # nicht all zu schlechte Startwerte von Folie "k-Means Algorithmus – Ablauf"
# mu = X[[28, 732, 926, 966], :] # schlechte Startwerte von Folie "Zufällige Startwerte ⇒ nicht deterministisch" # mu = X[[28, 732, 926, 966], :] # schlechte Startwerte von Folie "Zufällige Startwerte ⇒ nicht deterministisch"
# mu = X[[100, 50, 39, 42], :] # schlechte Startwerte von Folie "Mehrere Durchläufe", mittlere Abbildung # mu = X[[100, 50, 39, 42], :] # schlechte Startwerte von Folie "Mehrere Durchläufe", mittlere Abbildung
# # Pandas: # # Pandas:
# mu = X.sample(k, ignore_index=True, random_state=0) # zufällige Startwerte bei Pandas-DataFrame # mu = X.sample(k, ignore_index=True, random_state=0) # zufällige Startwerte bei Pandas-DataFrame
print(mu.shape) print(mu.shape)
print(X.shape) print(X.shape)
plot_clusters(X, np.full_like(y, 6), mu) plot_clusters(X, np.full_like(y, 6), mu)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
diff = X[:, np.newaxis, :] - mu[np.newaxis, :, :] diff = X[:, np.newaxis, :] - mu[np.newaxis, :, :]
dist_matrix = np.sqrt(np.sum(diff ** 2, axis=2)) dist_matrix = np.sqrt(np.sum(diff ** 2, axis=2))
print(dist_matrix.shape) print(dist_matrix.shape)
labels = np.argmin(dist_matrix, axis=1) # Zuordnung der Samples labels = np.argmin(dist_matrix, axis=1) # Zuordnung der Samples
mu = np.array([X[labels == j].mean(axis=0) for j in range(k)]) mu = np.array([X[labels == j].mean(axis=0) for j in range(k)])
plot_clusters(X, labels, mu) plot_clusters(X, labels, mu)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
a = np.array([10, 20, 30]).reshape(3, 1) a = np.array([10, 20, 30]).reshape(3, 1)
b = np.array([1, 2, 3]) b = np.array([1, 2, 3])
display(a + b) display(a + b)
print(a.shape) print(a.shape)
print(' ', b.shape) print(' ', b.shape)
print((a + b).shape) print((a + b).shape)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
X_iris = load_iris()['data'] X_iris = load_iris()['data']
x_test = [5.5, 2.5, 5, 1.5] x_test = [5.5, 2.5, 5, 1.5]
diff = X_iris - x_test diff = X_iris - x_test
display(diff) display(diff)
print(X_iris.shape) print(X_iris.shape)
print(' ', np.shape(x_test)) print(' ', np.shape(x_test))
print(diff.shape) print(diff.shape)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
(X_train, y_train), (X_test, y_test) = keras.datasets.cifar10.load_data() (X_train, y_train), (X_test, y_test) = keras.datasets.cifar10.load_data()
ship = X_test[1] / 255 ship = X_test[1] / 255
scale = np.array([0.1, 1, 0.7]) scale = np.array([0.1, 1, 0.7])
result = ship * scale result = ship * scale
plt.figure() plt.figure()
plt.imshow(ship) plt.imshow(ship)
plt.figure() plt.figure()
plt.imshow(result) plt.imshow(result)
print(ship.shape) print(ship.shape)
print(' ', scale.shape) print(' ', scale.shape)
print(result.shape) print(result.shape)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# Demonstrates a broadcasting pitfall: Keras' predict returns a 2D column
# array, so subtracting a 1D target vector silently broadcasts to 2D and
# produces a wrong mean absolute error. squeeze() fixes the shape.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)
# # scikit-learn's predict method returns a 1D array
# from sklearn.neighbors import KNeighborsClassifier
# model = KNeighborsClassifier()
# model.fit(X_train, y_train)
# Keras' predict method returns a 2D column array
from keras import Sequential, Input
Dense = keras.layers.Dense
model = Sequential()
model.add(Input(X_train.shape[1:]))
model.add(Dense(50, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, batch_size=71, verbose=False)
y_p = model.predict(X_test, verbose=False)  # 2D: shape (n, 1)
err = y_p - y_test  # broadcasts (n, 1) - (n,) to (n, n)!
mae = np.mean(np.abs(err))  # no error raised, but the value is wrong
print(f'ohne squeeze: {y_p.shape=}, {err.shape=}, {mae=:.3}')
y_p = model.predict(X_test, verbose=False).squeeze()  # reduce to 1D
err = y_p - y_test
mae = np.mean(np.abs(err))  # correct value
print(f'mit squeeze: {y_p.shape=}, {err.shape=}, {mae=:.3}')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
from sklearn.neighbors import NearestNeighbors

# Six hand-picked points; query the 3 nearest neighbors of each point.
X = np.array([
    [-4, 3],
    [0, 3],
    [0, 0],
    [3, 0],
    [4, 0],
    [4, 1],
])
plt.scatter(*X.T, c=range(len(X)))
plt.axis('equal')
nn = NearestNeighbors().fit(X)  # fit returns the estimator itself
dists, indices = nn.kneighbors(X, 3)
# Column 0 is each point itself (distance 0), so drop it.
display(dists[:, 1:].round(2))
display(indices[:, 1:])
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# Histogram of each point's distance to its farthest returned neighbor.
# kneighbors on the training points returns the point itself first, so
# with n_neighbors=3 the max distance is that to the 2nd-nearest other point.
X = np.array([
    [0.0, 0.0],
    [0.25, 0.5],
    [1.0, 1.0],
    [1.5, 0.75],
    [0.0, 1.5],
    [2.5, 0.5],
    [4.5, 1.0],
    [5.0, 2.0],
])
knn = NearestNeighbors(n_neighbors=3).fit(X)
dists, neighbors = knn.kneighbors(X)
max_dists = dists.max(axis=1)
plt.hist(max_dists, bins=5, range=[0.5, 3], edgecolor='w')
plt.xlabel(f'Max distances to {dists.shape[1] - 1} neighbors')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# Same neighbor-distance histogram on two-moons data; the distribution of
# these distances is a common heuristic for choosing DBSCAN's eps.
X, _ = make_moons(n_samples=480, noise=0.02)
knn = NearestNeighbors(n_neighbors=5).fit(X)
dists, neighbors = knn.kneighbors(X)
max_dists = dists.max(axis=1)
plt.hist(max_dists, bins=20, edgecolor='w')
plt.xlabel(f'Max distances to {dists.shape[1] - 1} neighbors')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# data: concentric rings to study how DBSCAN's eps and min_samples interact
X, y = make_circles(1000, noise=0.075, factor=0.5, random_state=42)
plt.scatter(*X.T, c='gray', edgecolors='k')
plt.axis('equal')
# eps-k data: grid over (min_samples, eps), record cluster count and noise level
min_samples_range = np.arange(2, 10)
eps_range = np.arange(0.01, 0.2, 0.001)
records = []
for min_samples in min_samples_range:
    for eps in eps_range:
        clusterer = DBSCAN(eps=eps, min_samples=min_samples)
        labels = clusterer.fit_predict(X)
        noise_lvl = np.mean(labels < 0)  # fraction labeled -1 (noise)
        n_clusters = labels.max() + 1    # cluster labels run 0..k-1
        records.append((min_samples, eps, n_clusters, noise_lvl))
df = pd.DataFrame.from_records(records, columns=['min_samples', 'eps', 'n_clusters', 'noise_lvl'])
df_clusters = df.pivot(index='eps', columns='min_samples', values='n_clusters')
# eps-k plot: one curve per min_samples value
plt.figure()
ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainbow', legend='full')
# ax.set_ylim(bottom=0)
ax.set_ylim(bottom=0, top=10)     # zoom on the y axis
ax.set_xlim(left=0.1, right=0.2)  # zoom on the x axis
```
%% Cell type:code id: tags:
```
# Color each point by its 4th-nearest-neighbor distance as a local-density
# proxy: normal data is densest in the middle, uniform data is even.
n = 2
rng = np.random.default_rng()
datasets = {
    'normal': rng.standard_normal(size=(2000, n), dtype='float32'),
    'uniform': rng.random(size=(2000, n), dtype='float32'),
}
for name, X in datasets.items():
    nn = NearestNeighbors(n_neighbors=5, n_jobs=1).fit(X)
    dists, neighbors = nn.kneighbors(X)
    # each point is its own nearest neighbor, so this is the distance
    # to the 4th-nearest *other* point
    max_dists = dists.max(axis=1)
    fig, ax = plt.subplots(figsize=(3, 3))
    ax.scatter(*X.T, c=-max_dists, alpha=0.7)  # neg to invert color map
    ax.set_aspect('equal')
    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
```
%% Cell type:code id: tags:
```
# Curse of dimensionality: neighbor-distance histograms narrow and shift as
# the feature count grows, for both normal and uniform data.
n_bins = 21
cmap = plt.colormaps['viridis'].resampled(n_bins)
n_feat = [2, 5, 10, 20, 50, 100]
for n in n_feat:
    rng = np.random.default_rng(42)
    datasets = {
        'normal': rng.standard_normal(size=(50_000, n), dtype='float32'),
        'uniform': rng.random(size=(50_000, n), dtype='float32'),
    }
    for name, X in datasets.items():
        # NOTE(review): n_jobs=13 looks machine-specific — confirm intent
        nn = NearestNeighbors(n_neighbors=5, n_jobs=13)
        start = timeit.default_timer()
        nn.fit(X)
        dists, neighbors = nn.kneighbors(X)
        max_dists = dists.max(axis=1)
        end = timeit.default_timer()
        print(f'{name=}, {n=}, runtime: {end - start} s')
        fig, ax = plt.subplots(figsize=(7, 5))
        # clip the plotted range to the 2%-98% quantiles to hide outliers
        data_range = np.quantile(max_dists, [0.02, 0.98])
        N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range)
        for idx, patch in enumerate(patches):
            patch.set_facecolor(cmap(n_bins - 1 - idx))  # reversed colormap
        ax.set_xlim(data_range)
        ax.set_xlabel('Max distances to 4 neighbors')
        ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions')
```
%% Cell type:code id: tags:
```
# Single-linkage behavior on two moons: the chaining effect merges clusters
# point by point; adding outliers makes single linkage isolate them.
n = 20
X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False)
X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]]))
# without outlier: after `iteration` merges, n - iteration clusters remain
# (was hard-coded `20 - iteration`; tied to n so the sample count can vary)
iteration = 18
agg = AgglomerativeClustering(n_clusters=n - iteration, linkage='single')
c = agg.fit_predict(X)
plt.figure()
plt.scatter(*X.T, c=c)
plt.axis('square')
# with outlier: the two far-away points end up in clusters of their own
agg = AgglomerativeClustering(n_clusters=3, linkage='single')
c = agg.fit_predict(X_outlier)
plt.figure()
plt.scatter(*X_outlier.T, c=c)
plt.axis('square')
``` ```
......
...@@ -4,14 +4,16 @@ ...@@ -4,14 +4,16 @@
# Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten. # Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten.
# %% import # %% import
import timeit
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from IPython.display import display from IPython.display import display
import seaborn as sns import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import keras import keras
sns.set_theme() sns.set_theme()
...@@ -215,3 +217,76 @@ ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainb ...@@ -215,3 +217,76 @@ ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainb
# ax.set_ylim(bottom=0) # ax.set_ylim(bottom=0)
ax.set_ylim(bottom=0, top=10) # zoom in y-Achse ax.set_ylim(bottom=0, top=10) # zoom in y-Achse
ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse
# %% density distributions
# Color each point by its 4th-nearest-neighbor distance as a local-density
# proxy: normal data is densest in the middle, uniform data is even.
n = 2
rng = np.random.default_rng()
datasets = {
    'normal': rng.standard_normal(size=(2000, n), dtype='float32'),
    'uniform': rng.random(size=(2000, n), dtype='float32'),
}
for name, X in datasets.items():
    nn = NearestNeighbors(n_neighbors=5, n_jobs=1).fit(X)
    dists, neighbors = nn.kneighbors(X)
    # each point is its own nearest neighbor, so this is the distance
    # to the 4th-nearest *other* point
    max_dists = dists.max(axis=1)
    fig, ax = plt.subplots(figsize=(3, 3))
    ax.scatter(*X.T, c=-max_dists, alpha=0.7)  # neg to invert color map
    ax.set_aspect('equal')
    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
# %% density distributions in high-dimensional spaces
# Curse of dimensionality: neighbor-distance histograms narrow and shift as
# the feature count grows, for both normal and uniform data.
n_bins = 21
cmap = plt.colormaps['viridis'].resampled(n_bins)
n_feat = [2, 5, 10, 20, 50, 100]
for n in n_feat:
    rng = np.random.default_rng(42)
    datasets = {
        'normal': rng.standard_normal(size=(50_000, n), dtype='float32'),
        'uniform': rng.random(size=(50_000, n), dtype='float32'),
    }
    for name, X in datasets.items():
        # NOTE(review): n_jobs=13 looks machine-specific — confirm intent
        nn = NearestNeighbors(n_neighbors=5, n_jobs=13)
        start = timeit.default_timer()
        nn.fit(X)
        dists, neighbors = nn.kneighbors(X)
        max_dists = dists.max(axis=1)
        end = timeit.default_timer()
        print(f'{name=}, {n=}, runtime: {end - start} s')
        fig, ax = plt.subplots(figsize=(7, 5))
        # clip the plotted range to the 2%-98% quantiles to hide outliers
        data_range = np.quantile(max_dists, [0.02, 0.98])
        N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range)
        for idx, patch in enumerate(patches):
            patch.set_facecolor(cmap(n_bins - 1 - idx))  # reversed colormap
        ax.set_xlim(data_range)
        ax.set_xlabel('Max distances to 4 neighbors')
        ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions')
# %% single-linkage behavior
# Single-linkage clustering on two moons: the chaining effect merges clusters
# point by point; adding outliers makes single linkage isolate them.
n = 20
X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False)
X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]]))
# without outlier: after `iteration` merges, n - iteration clusters remain
# (was hard-coded `20 - iteration`; tied to n so the sample count can vary)
iteration = 18
agg = AgglomerativeClustering(n_clusters=n - iteration, linkage='single')
c = agg.fit_predict(X)
plt.figure()
plt.scatter(*X.T, c=c)
plt.axis('square')
# with outlier: the two far-away points end up in clusters of their own
agg = AgglomerativeClustering(n_clusters=3, linkage='single')
c = agg.fit_predict(X_outlier)
plt.figure()
plt.scatter(*X_outlier.T, c=c)
plt.axis('square')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment