From 17e18cedd15df96f1ec6727cb859ff47ce790654 Mon Sep 17 00:00:00 2001 From: Christof Kaufmann <christof.kaufmann@hs-bochum.de> Date: Mon, 19 May 2025 10:20:12 +0000 Subject: [PATCH] Notebooks from applied-cs/data-science@94f58a16 --- 06-clustering/folien-code/folien-code.ipynb | 97 ++++++++++++++++++- 06-clustering/folien-code/folien-code.py | 77 ++++++++++++++- 06-clustering/solutions/01-inertia-sol.ipynb | 23 +++-- .../solutions/folien-code/folien-code.ipynb | 97 ++++++++++++++++++- .../solutions/folien-code/folien-code.py | 77 ++++++++++++++- 5 files changed, 358 insertions(+), 13 deletions(-) diff --git a/06-clustering/folien-code/folien-code.ipynb b/06-clustering/folien-code/folien-code.ipynb index bf1d372..f677350 100644 --- a/06-clustering/folien-code/folien-code.ipynb +++ b/06-clustering/folien-code/folien-code.ipynb @@ -17,14 +17,16 @@ "metadata": {}, "outputs": [], "source": [ + "import timeit\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from IPython.display import display\n", "import seaborn as sns\n", - "from sklearn.cluster import KMeans, DBSCAN\n", + "from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering\n", "from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris\n", "from sklearn.model_selection import train_test_split\n", + "from sklearn.neighbors import NearestNeighbors\n", "import keras\n", "sns.set_theme()\n" ] @@ -299,7 +301,98 @@ "ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainbow', legend='full')\n", "# ax.set_ylim(bottom=0)\n", "ax.set_ylim(bottom=0, top=10) # zoom in y-Achse\n", - "ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse\n" + "ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "n = 2\n", + "rng = np.random.default_rng()\n", + "datasets = {\n", + " 'normal': rng.standard_normal(size=(2000, n), dtype='float32'),\n", + " 'uniform': rng.random(size=(2000, n), dtype='float32'),\n", + "}\n", + "\n", + "for name, X in datasets.items():\n", + " nn = NearestNeighbors(n_neighbors=5, n_jobs=1)\n", + " nn.fit(X)\n", + " dists, neighbors = nn.kneighbors(X)\n", + " max_dists = dists.max(axis=1)\n", + "\n", + " fig, ax = plt.subplots(figsize=(3, 3))\n", + " ax.scatter(*X.T, c=-max_dists, alpha=0.7) # neg to invert color map\n", + " ax.set_aspect('equal')\n", + " ax.set_xlabel('$x_1$')\n", + " ax.set_ylabel('$x_2$')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_bins = 21\n", + "cmap = plt.colormaps['viridis'].resampled(n_bins)\n", + "n_feat = [2, 5, 10, 20, 50, 100]\n", + "for n in n_feat:\n", + " rng = np.random.default_rng(42)\n", + " datasets = {\n", + " 'normal': rng.standard_normal(size=(50_000, n), dtype='float32'),\n", + " 'uniform': rng.random(size=(50_000, n), dtype='float32'),\n", + " }\n", + "\n", + " for name, X in datasets.items():\n", + " nn = NearestNeighbors(n_neighbors=5, n_jobs=13)\n", + "\n", + " start = timeit.default_timer()\n", + " nn.fit(X)\n", + " dists, neighbors = nn.kneighbors(X)\n", + " max_dists = dists.max(axis=1)\n", + " end = timeit.default_timer()\n", + " print(f'{name=}, {n=}, runtime: {end - start} s')\n", + "\n", + " fig, ax = plt.subplots(figsize=(7, 5))\n", + " data_range = np.quantile(max_dists, [0.02, 0.98])\n", + " N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range)\n", + " for i, p in enumerate(patches):\n", + " p.set_facecolor(cmap(n_bins - 1 - i))\n", + " ax.set_xlim(data_range)\n", + " ax.set_xlabel('Max distances to 4 neighbors')\n", + " ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions')\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n = 20\n", + "X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False)\n", + "X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]]))\n", + "\n", + "# without outlier\n", + "iteration = 18\n", + "agg = AgglomerativeClustering(n_clusters=20 - iteration, linkage='single')\n", + "c = agg.fit_predict(X)\n", + "plt.figure()\n", + "plt.scatter(*X.T, c=c)\n", + "plt.axis('square')\n", + "\n", + "# with outlier\n", + "agg = AgglomerativeClustering(n_clusters=3, linkage='single')\n", + "c = agg.fit_predict(X_outlier)\n", + "plt.figure()\n", + "plt.scatter(*X_outlier.T, c=c)\n", + "plt.axis('square')\n" ] } ], diff --git a/06-clustering/folien-code/folien-code.py b/06-clustering/folien-code/folien-code.py index 2f4a972..c21328f 100644 --- a/06-clustering/folien-code/folien-code.py +++ b/06-clustering/folien-code/folien-code.py @@ -4,14 +4,16 @@ # Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten. # %% import +import timeit import pandas as pd import numpy as np import matplotlib.pyplot as plt from IPython.display import display import seaborn as sns -from sklearn.cluster import KMeans, DBSCAN +from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris from sklearn.model_selection import train_test_split +from sklearn.neighbors import NearestNeighbors import keras sns.set_theme() @@ -215,3 +217,76 @@ ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainb # ax.set_ylim(bottom=0) ax.set_ylim(bottom=0, top=10) # zoom in y-Achse ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse + + +# %% Dichteverteilungen + +n = 2 +rng = np.random.default_rng() +datasets = { + 'normal': rng.standard_normal(size=(2000, n), dtype='float32'), + 'uniform': rng.random(size=(2000, n), dtype='float32'), +} + +for name, X in datasets.items(): + nn = NearestNeighbors(n_neighbors=5, n_jobs=1) + nn.fit(X) + dists, neighbors = nn.kneighbors(X) + max_dists = dists.max(axis=1) + + fig, ax = plt.subplots(figsize=(3, 3)) + ax.scatter(*X.T, c=-max_dists, alpha=0.7) # neg to invert color map + ax.set_aspect('equal') + ax.set_xlabel('$x_1$') + ax.set_ylabel('$x_2$') + +# %% Dichteverteilungen in hochdimensionalen Räumen +n_bins = 21 +cmap = plt.colormaps['viridis'].resampled(n_bins) +n_feat = [2, 5, 10, 20, 50, 100] +for n in n_feat: + rng = np.random.default_rng(42) + datasets = { + 'normal': rng.standard_normal(size=(50_000, n), dtype='float32'), + 'uniform': rng.random(size=(50_000, n), dtype='float32'), + } + + for name, X in datasets.items(): + nn = NearestNeighbors(n_neighbors=5, n_jobs=13) + + start = timeit.default_timer() + nn.fit(X) + dists, neighbors = nn.kneighbors(X) + max_dists = dists.max(axis=1) + end = timeit.default_timer() + print(f'{name=}, {n=}, runtime: {end - start} s') + + fig, ax = plt.subplots(figsize=(7, 5)) + data_range = np.quantile(max_dists, [0.02, 0.98]) + N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range) + for i, p in enumerate(patches): + p.set_facecolor(cmap(n_bins - 1 - i)) + ax.set_xlim(data_range) + ax.set_xlabel('Max distances to 4 neighbors') + ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions') + + +# %% Single-Linkage Verhalten +n = 20 +X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False) +X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]])) + +# without outlier +iteration = 18 +agg = AgglomerativeClustering(n_clusters=20 - iteration, linkage='single') +c = agg.fit_predict(X) +plt.figure() +plt.scatter(*X.T, c=c) +plt.axis('square') + +# with outlier +agg = AgglomerativeClustering(n_clusters=3, linkage='single') +c = agg.fit_predict(X_outlier) +plt.figure() +plt.scatter(*X_outlier.T, c=c) +plt.axis('square') diff --git a/06-clustering/solutions/01-inertia-sol.ipynb b/06-clustering/solutions/01-inertia-sol.ipynb index 7244e46..bad96f8 100644 --- a/06-clustering/solutions/01-inertia-sol.ipynb +++ b/06-clustering/solutions/01-inertia-sol.ipynb @@ -131,13 +131,15 @@ "die Repräsentanten für alle Cluster `mu` im Prinzip analog zum ersten\n", "Ansatz. Dann berechnen wir die quadrierte Distanzmatrix\n", "`sqr_dist_matrix`, analog zu den Folien (nur halt ohne Wurzel).\n", - "Anschließend wählen wir per Indizierung die Abstände der Samples, die zu\n", - "Cluster `j` gehören, zu $\\mu_j$ (Spalte `j`) und summieren sie auf. Das\n", - "wird in der `list`-Comprehension für jedes Cluster `j` gemacht und diese\n", - "Trägheitsmomente werden zum Gesamtträgheitsmoment `total_inertia`\n", - "aufsummiert." + "Anschließend wählen wir entweder per Indizierung die Abstände der\n", + "Samples, die zu Cluster `j` gehören, zu $\\mu_j$ (Spalte `j`) und\n", + "summieren sie auf. Das wird in der `list`-Comprehension für jedes\n", + "Cluster `j` gemacht und diese Trägheitsmomente werden zum\n", + "Gesamtträgheitsmoment `total_inertia` aufsummiert. Oder wir machen das\n", + "per komplexer Indizierung mit Indexpaaren. Hierbei wählt `y` jeweils die\n", + "richtige Spalte aus." ], - "id": "0011-f01157f3b95bd8a3730718503771b0bfdc3571b74ca13b9939f17eb58ab" + "id": "0011-2e0e1e9b6d0bd37fa78d2d2bdab3c699c3267bc9df7f2fdb3e03cc42823" }, { "cell_type": "code", @@ -153,10 +155,17 @@ " mu = np.array([X[y == j].mean(axis=0) for j in range(k)])\n", " diff = X[:, np.newaxis, :] - mu[np.newaxis, :, :]\n", " sqr_dist_matrix = np.sum(diff ** 2, axis=2)\n", + "\n", + " # entweder mit list Comprehension\n", " total_inertia = sum([np.sum(sqr_dist_matrix[y == j, j]) for j in range(k)])\n", + "\n", + " # oder paarweiser Indizierung\n", + " all_rows = np.arange(len(y))\n", + " total_inertia = np.sum(sqr_dist_matrix[all_rows, y])\n", + "\n", " inertia_comp.append(total_inertia)" ], - "id": "0012-fe2cf201c04462c6164ddc58d877a6d58d84dc101e5c31b244951544183" + "id": "0012-9050cec2f330532f97b66a58127c4417c52d272f1b2007d7f2c90548730" }, { "cell_type": "code", diff --git a/06-clustering/solutions/folien-code/folien-code.ipynb b/06-clustering/solutions/folien-code/folien-code.ipynb index bf1d372..f677350 100644 --- a/06-clustering/solutions/folien-code/folien-code.ipynb +++ b/06-clustering/solutions/folien-code/folien-code.ipynb @@ -17,14 +17,16 @@ "metadata": {}, "outputs": [], "source": [ + "import timeit\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from IPython.display import display\n", "import seaborn as sns\n", - "from sklearn.cluster import KMeans, DBSCAN\n", + "from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering\n", "from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris\n", "from sklearn.model_selection import train_test_split\n", + "from sklearn.neighbors import NearestNeighbors\n", "import keras\n", "sns.set_theme()\n" ] @@ -299,7 +301,98 @@ "ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainbow', legend='full')\n", "# ax.set_ylim(bottom=0)\n", "ax.set_ylim(bottom=0, top=10) # zoom in y-Achse\n", - "ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse\n" + "ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "n = 2\n", + "rng = np.random.default_rng()\n", + "datasets = {\n", + " 'normal': rng.standard_normal(size=(2000, n), dtype='float32'),\n", + " 'uniform': rng.random(size=(2000, n), dtype='float32'),\n", + "}\n", + "\n", + "for name, X in datasets.items():\n", + " nn = NearestNeighbors(n_neighbors=5, n_jobs=1)\n", + " nn.fit(X)\n", + " dists, neighbors = nn.kneighbors(X)\n", + " max_dists = dists.max(axis=1)\n", + "\n", + " fig, ax = plt.subplots(figsize=(3, 3))\n", + " ax.scatter(*X.T, c=-max_dists, alpha=0.7) # neg to invert color map\n", + " ax.set_aspect('equal')\n", + " ax.set_xlabel('$x_1$')\n", + " ax.set_ylabel('$x_2$')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_bins = 21\n", + "cmap = plt.colormaps['viridis'].resampled(n_bins)\n", + "n_feat = [2, 5, 10, 20, 50, 100]\n", + "for n in n_feat:\n", + " rng = np.random.default_rng(42)\n", + " datasets = {\n", + " 'normal': rng.standard_normal(size=(50_000, n), dtype='float32'),\n", + " 'uniform': rng.random(size=(50_000, n), dtype='float32'),\n", + " }\n", + "\n", + " for name, X in datasets.items():\n", + " nn = NearestNeighbors(n_neighbors=5, n_jobs=13)\n", + "\n", + " start = timeit.default_timer()\n", + " nn.fit(X)\n", + " dists, neighbors = nn.kneighbors(X)\n", + " max_dists = dists.max(axis=1)\n", + " end = timeit.default_timer()\n", + " print(f'{name=}, {n=}, runtime: {end - start} s')\n", + "\n", + " fig, ax = plt.subplots(figsize=(7, 5))\n", + " data_range = np.quantile(max_dists, [0.02, 0.98])\n", + " N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range)\n", + " for i, p in enumerate(patches):\n", + " p.set_facecolor(cmap(n_bins - 1 - i))\n", + " ax.set_xlim(data_range)\n", + " ax.set_xlabel('Max distances to 4 neighbors')\n", + " ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions')\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n = 20\n", + "X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False)\n", + "X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]]))\n", + "\n", + "# without outlier\n", + "iteration = 18\n", + "agg = AgglomerativeClustering(n_clusters=20 - iteration, linkage='single')\n", + "c = agg.fit_predict(X)\n", + "plt.figure()\n", + "plt.scatter(*X.T, c=c)\n", + "plt.axis('square')\n", + "\n", + "# with outlier\n", + "agg = AgglomerativeClustering(n_clusters=3, linkage='single')\n", + "c = agg.fit_predict(X_outlier)\n", + "plt.figure()\n", + "plt.scatter(*X_outlier.T, c=c)\n", + "plt.axis('square')\n" ] } ], diff --git a/06-clustering/solutions/folien-code/folien-code.py b/06-clustering/solutions/folien-code/folien-code.py index 2f4a972..c21328f 100644 --- a/06-clustering/solutions/folien-code/folien-code.py +++ b/06-clustering/solutions/folien-code/folien-code.py @@ -4,14 +4,16 @@ # Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten. # %% import +import timeit import pandas as pd import numpy as np import matplotlib.pyplot as plt from IPython.display import display import seaborn as sns -from sklearn.cluster import KMeans, DBSCAN +from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris from sklearn.model_selection import train_test_split +from sklearn.neighbors import NearestNeighbors import keras sns.set_theme() @@ -215,3 +217,76 @@ ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainb # ax.set_ylim(bottom=0) ax.set_ylim(bottom=0, top=10) # zoom in y-Achse ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse + + +# %% Dichteverteilungen + +n = 2 +rng = np.random.default_rng() +datasets = { + 'normal': rng.standard_normal(size=(2000, n), dtype='float32'), + 'uniform': rng.random(size=(2000, n), dtype='float32'), +} + +for name, X in datasets.items(): + nn = NearestNeighbors(n_neighbors=5, n_jobs=1) + nn.fit(X) + dists, neighbors = nn.kneighbors(X) + max_dists = dists.max(axis=1) + + fig, ax = plt.subplots(figsize=(3, 3)) + ax.scatter(*X.T, c=-max_dists, alpha=0.7) # neg to invert color map + ax.set_aspect('equal') + ax.set_xlabel('$x_1$') + ax.set_ylabel('$x_2$') + +# %% Dichteverteilungen in hochdimensionalen Räumen +n_bins = 21 +cmap = plt.colormaps['viridis'].resampled(n_bins) +n_feat = [2, 5, 10, 20, 50, 100] +for n in n_feat: + rng = np.random.default_rng(42) + datasets = { + 'normal': rng.standard_normal(size=(50_000, n), dtype='float32'), + 'uniform': rng.random(size=(50_000, n), dtype='float32'), + } + + for name, X in datasets.items(): + nn = NearestNeighbors(n_neighbors=5, n_jobs=13) + + start = timeit.default_timer() + nn.fit(X) + dists, neighbors = nn.kneighbors(X) + max_dists = dists.max(axis=1) + end = timeit.default_timer() + print(f'{name=}, {n=}, runtime: {end - start} s') + + fig, ax = plt.subplots(figsize=(7, 5)) + data_range = np.quantile(max_dists, [0.02, 0.98]) + N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range) + for i, p in enumerate(patches): + p.set_facecolor(cmap(n_bins - 1 - i)) + ax.set_xlim(data_range) + ax.set_xlabel('Max distances to 4 neighbors') + ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions') + + +# %% Single-Linkage Verhalten +n = 20 +X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False) +X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]])) + +# without outlier +iteration = 18 +agg = AgglomerativeClustering(n_clusters=20 - iteration, linkage='single') +c = agg.fit_predict(X) +plt.figure() +plt.scatter(*X.T, c=c) +plt.axis('square') + +# with outlier +agg = AgglomerativeClustering(n_clusters=3, linkage='single') +c = agg.fit_predict(X_outlier) +plt.figure() +plt.scatter(*X_outlier.T, c=c) +plt.axis('square') -- GitLab