Skip to content
Snippets Groups Projects
Commit 17e18ced authored by Christof Kaufmann's avatar Christof Kaufmann
Browse files

Notebooks from applied-cs/data-science@94f58a16

parent 96d4d671
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Code zu Folien # Code zu Folien
Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten. Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
import timeit
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from IPython.display import display from IPython.display import display
import seaborn as sns import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import keras import keras
sns.set_theme() sns.set_theme()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
def plot_clusters(X, labels, centers=None, ax=None, grid=True, xlabel='$x_1$', ylabel='$x_2$'): def plot_clusters(X, labels, centers=None, ax=None, grid=True, xlabel='$x_1$', ylabel='$x_2$'):
if ax is None: if ax is None:
fig, ax = plt.subplots() fig, ax = plt.subplots()
markers=['*', 's', '^', 'D', 'o', '<', 'h', 'H', 'X', 'd', 'P', '.'] markers=['*', 's', '^', 'D', 'o', '<', 'h', 'H', 'X', 'd', 'P', '.']
colorName = ['teal','orange','firebrick','dodgerblue', 'b', 'y', 'k'] colorName = ['teal','orange','firebrick','dodgerblue', 'b', 'y', 'k']
for i in np.unique(labels): for i in np.unique(labels):
groupi = labels == i groupi = labels == i
ax.scatter(X[groupi, 0], X[groupi, 1], s=120 if i == 0 else 60, alpha=0.2, marker=markers[i], c=colorName[i]) ax.scatter(X[groupi, 0], X[groupi, 1], s=120 if i == 0 else 60, alpha=0.2, marker=markers[i], c=colorName[i])
if centers is not None: if centers is not None:
for i in range(len(centers)): for i in range(len(centers)):
ax.scatter(centers[i, 0], centers[i, 1], s=200 if i == 0 else 100, ax.scatter(centers[i, 0], centers[i, 1], s=200 if i == 0 else 100,
marker=markers[i], c='black', edgecolors=colorName[i]) marker=markers[i], c='black', edgecolors=colorName[i])
ax.scatter(centers[i, 0], centers[i, 1], s=12, c=colorName[i]) ax.scatter(centers[i, 0], centers[i, 1], s=12, c=colorName[i])
ax.set_xlabel(xlabel) ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel) ax.set_ylabel(ylabel)
if grid: if grid:
ax.grid(True) ax.grid(True)
return ax return ax
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
rng = np.random.default_rng() rng = np.random.default_rng()
X, y = make_blobs(n_samples=[100, 100, 400, 400], random_state=1) X, y = make_blobs(n_samples=[100, 100, 400, 400], random_state=1)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolors='k', alpha=0.5) plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolors='k', alpha=0.5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
k = 4 k = 4
# NumPy: # NumPy:
# mu = rng.choice(X, size=k, replace=False) # zufällige Startwerte bei NumPy-Array # mu = rng.choice(X, size=k, replace=False) # zufällige Startwerte bei NumPy-Array
mu = X[[4, 7, 100, 102], :] # nicht all zu schlechte Startwerte von Folie "k-Means Algorithmus – Ablauf" mu = X[[4, 7, 100, 102], :] # nicht all zu schlechte Startwerte von Folie "k-Means Algorithmus – Ablauf"
# mu = X[[28, 732, 926, 966], :] # schlechte Startwerte von Folie "Zufällige Startwerte ⇒ nicht deterministisch" # mu = X[[28, 732, 926, 966], :] # schlechte Startwerte von Folie "Zufällige Startwerte ⇒ nicht deterministisch"
# mu = X[[100, 50, 39, 42], :] # schlechte Startwerte von Folie "Mehrere Durchläufe", mittlere Abbildung # mu = X[[100, 50, 39, 42], :] # schlechte Startwerte von Folie "Mehrere Durchläufe", mittlere Abbildung
# # Pandas: # # Pandas:
# mu = X.sample(k, ignore_index=True, random_state=0) # zufällige Startwerte bei Pandas-DataFrame # mu = X.sample(k, ignore_index=True, random_state=0) # zufällige Startwerte bei Pandas-DataFrame
print(mu.shape) print(mu.shape)
print(X.shape) print(X.shape)
plot_clusters(X, np.full_like(y, 6), mu) plot_clusters(X, np.full_like(y, 6), mu)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
diff = X[:, np.newaxis, :] - mu[np.newaxis, :, :] diff = X[:, np.newaxis, :] - mu[np.newaxis, :, :]
dist_matrix = np.sqrt(np.sum(diff ** 2, axis=2)) dist_matrix = np.sqrt(np.sum(diff ** 2, axis=2))
print(dist_matrix.shape) print(dist_matrix.shape)
labels = np.argmin(dist_matrix, axis=1) # Zuordnung der Samples labels = np.argmin(dist_matrix, axis=1) # Zuordnung der Samples
mu = np.array([X[labels == j].mean(axis=0) for j in range(k)]) mu = np.array([X[labels == j].mean(axis=0) for j in range(k)])
plot_clusters(X, labels, mu) plot_clusters(X, labels, mu)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
a = np.array([10, 20, 30]).reshape(3, 1) a = np.array([10, 20, 30]).reshape(3, 1)
b = np.array([1, 2, 3]) b = np.array([1, 2, 3])
display(a + b) display(a + b)
print(a.shape) print(a.shape)
print(' ', b.shape) print(' ', b.shape)
print((a + b).shape) print((a + b).shape)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
X_iris = load_iris()['data'] X_iris = load_iris()['data']
x_test = [5.5, 2.5, 5, 1.5] x_test = [5.5, 2.5, 5, 1.5]
diff = X_iris - x_test diff = X_iris - x_test
display(diff) display(diff)
print(X_iris.shape) print(X_iris.shape)
print(' ', np.shape(x_test)) print(' ', np.shape(x_test))
print(diff.shape) print(diff.shape)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
(X_train, y_train), (X_test, y_test) = keras.datasets.cifar10.load_data() (X_train, y_train), (X_test, y_test) = keras.datasets.cifar10.load_data()
ship = X_test[1] / 255 ship = X_test[1] / 255
scale = np.array([0.1, 1, 0.7]) scale = np.array([0.1, 1, 0.7])
result = ship * scale result = ship * scale
plt.figure() plt.figure()
plt.imshow(ship) plt.imshow(ship)
plt.figure() plt.figure()
plt.imshow(result) plt.imshow(result)
print(ship.shape) print(ship.shape)
print(' ', scale.shape) print(' ', scale.shape)
print(result.shape) print(result.shape)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
X, y = load_breast_cancer(return_X_y=True) X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y) X_train, X_test, y_train, y_test = train_test_split(X, y)
# # Scikit-Learns predict-Methode gibt 1D-Array zurück # # Scikit-Learns predict-Methode gibt 1D-Array zurück
# from sklearn.neighbors import KNeighborsClassifier # from sklearn.neighbors import KNeighborsClassifier
# model = KNeighborsClassifier() # model = KNeighborsClassifier()
# model.fit(X_train, y_train) # model.fit(X_train, y_train)
# Keras predict-Methode gibt 2D-Spalten-Array zurück # Keras predict-Methode gibt 2D-Spalten-Array zurück
from keras import Sequential, Input from keras import Sequential, Input
Dense = keras.layers.Dense Dense = keras.layers.Dense
model = Sequential() model = Sequential()
model.add(Input(X_train.shape[1:])) model.add(Input(X_train.shape[1:]))
model.add(Dense(50, activation='tanh')) model.add(Dense(50, activation='tanh'))
model.add(Dense(1, activation='sigmoid')) model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, batch_size=71, verbose=False) model.fit(X_train, y_train, epochs=20, batch_size=71, verbose=False)
y_p = model.predict(X_test, verbose=False) # 2D y_p = model.predict(X_test, verbose=False) # 2D
err = y_p - y_test err = y_p - y_test
mae = np.mean(np.abs(err)) # keine Fehlermeldung, aber falscher Wert mae = np.mean(np.abs(err)) # keine Fehlermeldung, aber falscher Wert
print(f'ohne squeeze: {y_p.shape=}, {err.shape=}, {mae=:.3}') print(f'ohne squeeze: {y_p.shape=}, {err.shape=}, {mae=:.3}')
y_p = model.predict(X_test, verbose=False).squeeze() # auf 1D reduzieren y_p = model.predict(X_test, verbose=False).squeeze() # auf 1D reduzieren
err = y_p - y_test err = y_p - y_test
mae = np.mean(np.abs(err)) # richtiger Wert mae = np.mean(np.abs(err)) # richtiger Wert
print(f'mit squeeze: {y_p.shape=}, {err.shape=}, {mae=:.3}') print(f'mit squeeze: {y_p.shape=}, {err.shape=}, {mae=:.3}')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
from sklearn.neighbors import NearestNeighbors from sklearn.neighbors import NearestNeighbors
X = np.array([ X = np.array([
[-4, 3], [-4, 3],
[0, 3], [0, 3],
[0, 0], [0, 0],
[3, 0], [3, 0],
[4, 0], [4, 0],
[4, 1], [4, 1],
]) ])
plt.scatter(*X.T, c=range(len(X))) plt.scatter(*X.T, c=range(len(X)))
plt.axis('equal') plt.axis('equal')
nn = NearestNeighbors() nn = NearestNeighbors()
nn.fit(X) nn.fit(X)
dists, indices = nn.kneighbors(X, 3) dists, indices = nn.kneighbors(X, 3)
display(dists[:, 1:].round(2)) display(dists[:, 1:].round(2))
display(indices[:, 1:]) display(indices[:, 1:])
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
X = np.array([ X = np.array([
[0.0, 0.0], [0.0, 0.0],
[0.25, 0.5], [0.25, 0.5],
[1.0, 1.0], [1.0, 1.0],
[1.5, 0.75], [1.5, 0.75],
[0.0, 1.5], [0.0, 1.5],
[2.5, 0.5], [2.5, 0.5],
[4.5, 1.0], [4.5, 1.0],
[5.0, 2.0], [5.0, 2.0],
]) ])
knn = NearestNeighbors(n_neighbors=3) knn = NearestNeighbors(n_neighbors=3)
knn.fit(X) knn.fit(X)
dists, neighbors = knn.kneighbors(X) dists, neighbors = knn.kneighbors(X)
max_dists = dists.max(axis=1) max_dists = dists.max(axis=1)
plt.hist(max_dists, bins=5, range=[0.5, 3], edgecolor='w') plt.hist(max_dists, bins=5, range=[0.5, 3], edgecolor='w')
plt.xlabel(f'Max distances to {dists.shape[1] - 1} neighbors') plt.xlabel(f'Max distances to {dists.shape[1] - 1} neighbors')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
X, _ = make_moons(n_samples=480, noise=0.02) X, _ = make_moons(n_samples=480, noise=0.02)
knn = NearestNeighbors(n_neighbors=5) knn = NearestNeighbors(n_neighbors=5)
knn.fit(X) knn.fit(X)
dists, neighbors = knn.kneighbors(X) dists, neighbors = knn.kneighbors(X)
max_dists = dists.max(axis=1) max_dists = dists.max(axis=1)
plt.hist(max_dists, bins=20, edgecolor='w') plt.hist(max_dists, bins=20, edgecolor='w')
plt.xlabel(f'Max distances to {dists.shape[1] - 1} neighbors') plt.xlabel(f'Max distances to {dists.shape[1] - 1} neighbors')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# data # data
X, y = make_circles(1000, noise=0.075, factor=0.5, random_state=42) X, y = make_circles(1000, noise=0.075, factor=0.5, random_state=42)
plt.scatter(*X.T, c='gray', edgecolors='k') plt.scatter(*X.T, c='gray', edgecolors='k')
plt.axis('equal') plt.axis('equal')
# eps-k data # eps-k data
min_samples_range = np.arange(2, 10) min_samples_range = np.arange(2, 10)
eps_range = np.arange(0.01, 0.2, 0.001) eps_range = np.arange(0.01, 0.2, 0.001)
records = [] records = []
for min_samples in min_samples_range: for min_samples in min_samples_range:
for eps in eps_range: for eps in eps_range:
clusterer = DBSCAN(eps=eps, min_samples=min_samples) clusterer = DBSCAN(eps=eps, min_samples=min_samples)
labels = clusterer.fit_predict(X) labels = clusterer.fit_predict(X)
noise_lvl = np.mean(labels < 0) noise_lvl = np.mean(labels < 0)
n_clusters = labels.max() + 1 n_clusters = labels.max() + 1
records.append((min_samples, eps, n_clusters, noise_lvl)) records.append((min_samples, eps, n_clusters, noise_lvl))
df = pd.DataFrame.from_records(records, columns=['min_samples', 'eps', 'n_clusters', 'noise_lvl']) df = pd.DataFrame.from_records(records, columns=['min_samples', 'eps', 'n_clusters', 'noise_lvl'])
df_clusters = df.pivot(index='eps', columns='min_samples', values='n_clusters') df_clusters = df.pivot(index='eps', columns='min_samples', values='n_clusters')
# eps-k plot # eps-k plot
plt.figure() plt.figure()
ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainbow', legend='full') ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainbow', legend='full')
# ax.set_ylim(bottom=0) # ax.set_ylim(bottom=0)
ax.set_ylim(bottom=0, top=10) # zoom in y-Achse ax.set_ylim(bottom=0, top=10) # zoom in y-Achse
ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse
```
%% Cell type:code id: tags:
```
# Visualize local density via nearest-neighbor distances in 2-D.
# (Indentation of the loop body restored — it was lost in extraction.)
n = 2  # number of features
rng = np.random.default_rng()
# two point clouds with different density profiles
datasets = {
    'normal': rng.standard_normal(size=(2000, n), dtype='float32'),
    'uniform': rng.random(size=(2000, n), dtype='float32'),
}
for name, X in datasets.items():
    # n_neighbors=5 includes the query point itself, so max_dists is the
    # distance to the 4th-nearest *other* neighbor
    nn = NearestNeighbors(n_neighbors=5, n_jobs=1)
    nn.fit(X)
    dists, neighbors = nn.kneighbors(X)
    max_dists = dists.max(axis=1)
    fig, ax = plt.subplots(figsize=(3, 3))
    ax.scatter(*X.T, c=-max_dists, alpha=0.7)  # neg to invert color map
    ax.set_aspect('equal')
    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
```
%% Cell type:code id: tags:
```
# Histograms of 4th-neighbor distances for increasing dimensionality —
# illustrates the curse of dimensionality (distances concentrate).
# (Loop-body indentation restored — it was lost in extraction.)
n_bins = 21
cmap = plt.colormaps['viridis'].resampled(n_bins)
n_feat = [2, 5, 10, 20, 50, 100]
for n in n_feat:
    rng = np.random.default_rng(42)  # fixed seed: comparable data for every n
    datasets = {
        'normal': rng.standard_normal(size=(50_000, n), dtype='float32'),
        'uniform': rng.random(size=(50_000, n), dtype='float32'),
    }
    for name, X in datasets.items():
        # NOTE(review): n_jobs=13 looks arbitrary — confirm intent (n_jobs=-1 for all cores?)
        nn = NearestNeighbors(n_neighbors=5, n_jobs=13)
        start = timeit.default_timer()
        nn.fit(X)
        dists, neighbors = nn.kneighbors(X)
        max_dists = dists.max(axis=1)
        end = timeit.default_timer()
        print(f'{name=}, {n=}, runtime: {end - start} s')
        fig, ax = plt.subplots(figsize=(7, 5))
        # clip to the 2%–98% quantile range to suppress outlier bars
        data_range = np.quantile(max_dists, [0.02, 0.98])
        N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range)
        for i, p in enumerate(patches):
            p.set_facecolor(cmap(n_bins - 1 - i))  # reversed colormap across bars
        ax.set_xlim(data_range)
        ax.set_xlabel('Max distances to 4 neighbors')
        ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions')
```
%% Cell type:code id: tags:
```
# Demonstrate single-linkage agglomerative clustering, with and without outliers.
n = 20
# shuffle=False keeps samples ordered along the two half-moons
X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False)
X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]]))
# without outlier
iteration = 18  # after 18 merges of 20 points, 2 clusters remain
agg = AgglomerativeClustering(n_clusters=20 - iteration, linkage='single')
c = agg.fit_predict(X)
plt.figure()
plt.scatter(*X.T, c=c)
plt.axis('square')
# with outlier — presumably to show single linkage splitting off the two
# added points as their own cluster; verify against the rendered figure
agg = AgglomerativeClustering(n_clusters=3, linkage='single')
c = agg.fit_predict(X_outlier)
plt.figure()
plt.scatter(*X_outlier.T, c=c)
plt.axis('square')
``` ```
......
...@@ -4,14 +4,16 @@ ...@@ -4,14 +4,16 @@
# Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten. # Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten.
# %% import # %% import
import timeit
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from IPython.display import display from IPython.display import display
import seaborn as sns import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import keras import keras
sns.set_theme() sns.set_theme()
...@@ -215,3 +217,76 @@ ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainb ...@@ -215,3 +217,76 @@ ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainb
# ax.set_ylim(bottom=0) # ax.set_ylim(bottom=0)
ax.set_ylim(bottom=0, top=10) # zoom in y-Achse ax.set_ylim(bottom=0, top=10) # zoom in y-Achse
ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse
# %% density distributions
# Visualize local density via nearest-neighbor distances in 2-D.
# (Indentation of the loop body restored — it was lost in extraction.)
n = 2  # number of features
rng = np.random.default_rng()
# two point clouds with different density profiles
datasets = {
    'normal': rng.standard_normal(size=(2000, n), dtype='float32'),
    'uniform': rng.random(size=(2000, n), dtype='float32'),
}
for name, X in datasets.items():
    # n_neighbors=5 includes the query point itself, so max_dists is the
    # distance to the 4th-nearest *other* neighbor
    nn = NearestNeighbors(n_neighbors=5, n_jobs=1)
    nn.fit(X)
    dists, neighbors = nn.kneighbors(X)
    max_dists = dists.max(axis=1)
    fig, ax = plt.subplots(figsize=(3, 3))
    ax.scatter(*X.T, c=-max_dists, alpha=0.7)  # neg to invert color map
    ax.set_aspect('equal')
    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
# %% density distributions in high-dimensional spaces
# Histograms of 4th-neighbor distances for increasing dimensionality —
# illustrates the curse of dimensionality (distances concentrate).
# (Loop-body indentation restored — it was lost in extraction.)
n_bins = 21
cmap = plt.colormaps['viridis'].resampled(n_bins)
n_feat = [2, 5, 10, 20, 50, 100]
for n in n_feat:
    rng = np.random.default_rng(42)  # fixed seed: comparable data for every n
    datasets = {
        'normal': rng.standard_normal(size=(50_000, n), dtype='float32'),
        'uniform': rng.random(size=(50_000, n), dtype='float32'),
    }
    for name, X in datasets.items():
        # NOTE(review): n_jobs=13 looks arbitrary — confirm intent (n_jobs=-1 for all cores?)
        nn = NearestNeighbors(n_neighbors=5, n_jobs=13)
        start = timeit.default_timer()
        nn.fit(X)
        dists, neighbors = nn.kneighbors(X)
        max_dists = dists.max(axis=1)
        end = timeit.default_timer()
        print(f'{name=}, {n=}, runtime: {end - start} s')
        fig, ax = plt.subplots(figsize=(7, 5))
        # clip to the 2%–98% quantile range to suppress outlier bars
        data_range = np.quantile(max_dists, [0.02, 0.98])
        N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range)
        for i, p in enumerate(patches):
            p.set_facecolor(cmap(n_bins - 1 - i))  # reversed colormap across bars
        ax.set_xlim(data_range)
        ax.set_xlabel('Max distances to 4 neighbors')
        ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions')
# %% single-linkage behavior
# Demonstrate single-linkage agglomerative clustering, with and without outliers.
n = 20
# shuffle=False keeps samples ordered along the two half-moons
X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False)
X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]]))
# without outlier
iteration = 18  # after 18 merges of 20 points, 2 clusters remain
agg = AgglomerativeClustering(n_clusters=20 - iteration, linkage='single')
c = agg.fit_predict(X)
plt.figure()
plt.scatter(*X.T, c=c)
plt.axis('square')
# with outlier — presumably to show single linkage splitting off the two
# added points as their own cluster; verify against the rendered figure
agg = AgglomerativeClustering(n_clusters=3, linkage='single')
c = agg.fit_predict(X_outlier)
plt.figure()
plt.scatter(*X_outlier.T, c=c)
plt.axis('square')
%% Cell type:markdown id:0001-a3ae08f5a4a259687b176e7ee5d142ca2207e5ace1aada9d73aa8abe333 tags: %% Cell type:markdown id:0001-a3ae08f5a4a259687b176e7ee5d142ca2207e5ace1aada9d73aa8abe333 tags:
# Trägheitsmoment # Trägheitsmoment
Gegeben sind folgende Daten: Gegeben sind folgende Daten:
%% Cell type:code id:0002-c8ceb98996d092532358b219fb962ede51c887f4bbaf30a4a2a9fa39712 tags: %% Cell type:code id:0002-c8ceb98996d092532358b219fb962ede51c887f4bbaf30a4a2a9fa39712 tags:
``` ```
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from sklearn.datasets import make_circles from sklearn.datasets import make_circles
X, y1 = make_circles(1000, noise=0.1, factor=0.55) X, y1 = make_circles(1000, noise=0.1, factor=0.55)
y2 = np.asarray(X[:, 0] > 0, dtype=int) y2 = np.asarray(X[:, 0] > 0, dtype=int)
``` ```
%% Cell type:markdown id:0003-3d78bcf9122a2a1bb3e1681a8c95f29db1a5e142a429a286e2f70f5ab28 tags: %% Cell type:markdown id:0003-3d78bcf9122a2a1bb3e1681a8c95f29db1a5e142a429a286e2f70f5ab28 tags:
Hier ein Plot der Daten: Hier ein Plot der Daten:
%% Cell type:code id:0004-d297955758e8b5d26fc993f2509a3463420456b867ec1e4c37c5103fc38 tags: %% Cell type:code id:0004-d297955758e8b5d26fc993f2509a3463420456b867ec1e4c37c5103fc38 tags:
``` ```
fig, axs = plt.subplots(1, 2, sharey=True) fig, axs = plt.subplots(1, 2, sharey=True)
axs[0].scatter(X[:, 0], X[:, 1], c=y1) axs[0].scatter(X[:, 0], X[:, 1], c=y1)
axs[1].scatter(X[:, 0], X[:, 1], c=y2) axs[1].scatter(X[:, 0], X[:, 1], c=y2)
axs[0].set_box_aspect(1) axs[0].set_box_aspect(1)
axs[1].set_box_aspect(1) axs[1].set_box_aspect(1)
axs[0].set_title('y1') axs[0].set_title('y1')
axs[1].set_title('y2') axs[1].set_title('y2')
plt.show() plt.show()
``` ```
%% Cell type:markdown id:0008-2d3d0ac1dc306d99dffc90565a9e0335c9d741efdf70aa228dc53c54c66 tags: %% Cell type:markdown id:0008-2d3d0ac1dc306d99dffc90565a9e0335c9d741efdf70aa228dc53c54c66 tags:
Dabei stellen `y1` die tatsächlich gewünschte, aber nicht konvexe Dabei stellen `y1` die tatsächlich gewünschte, aber nicht konvexe
Clusterung und `y2` eine konvexe Clusterung dar. Berechnen Sie das Clusterung und `y2` eine konvexe Clusterung dar. Berechnen Sie das
Gesamtträgheitsmoment. Beachten Sie dabei, dass Sie für beide Gesamtträgheitsmoment. Beachten Sie dabei, dass Sie für beide
Clusterungen nicht die Formel für einen konvergierten $k$-Means Clusterungen nicht die Formel für einen konvergierten $k$-Means
Algorithmus verwenden können. Hier Ihr Code: Algorithmus verwenden können. Hier Ihr Code:
## Lösung ## Lösung
Wir geben zwei Lösungsvorschläge an. In beiden wird jeweils das Wir geben zwei Lösungsvorschläge an. In beiden wird jeweils das
Trägheitsmoment von `y` berechnet, was in einer äußeren `for`-Schleife Trägheitsmoment von `y` berechnet, was in einer äußeren `for`-Schleife
im ersten Durchlauf `y1` und im zweiten `y2` ist. im ersten Durchlauf `y1` und im zweiten `y2` ist.
Im ersten Ansatz durchlaufen wir mit `j` die Cluster und betrachten mit Im ersten Ansatz durchlaufen wir mit `j` die Cluster und betrachten mit
`X_j` nur die Samples, die zu Cluster `j` gehören. Damit berechnen wir `X_j` nur die Samples, die zu Cluster `j` gehören. Damit berechnen wir
dessen Repräsentanten `mu_j` und bilden die Differenzen von `X_j` zu dessen Repräsentanten `mu_j` und bilden die Differenzen von `X_j` zu
deren Repräsentanten. Wenn man die Differenzen quadriert und zeilenweise deren Repräsentanten. Wenn man die Differenzen quadriert und zeilenweise
aufsummiert, erhält man die Distanzen. Wenn man die wiederrum aufsummiert, erhält man die Distanzen. Wenn man die wiederrum
aufsummiert, erhält man das Trägheitsmoment für Cluster `j`. Daher kann aufsummiert, erhält man das Trägheitsmoment für Cluster `j`. Daher kann
man auch direkt über beide Achsen summieren. man auch direkt über beide Achsen summieren.
%% Cell type:code id:0009-2a402c7b8cd559ae7dcb8bb3ec30f9029fb26c37ee966558437ace37014 tags: %% Cell type:code id:0009-2a402c7b8cd559ae7dcb8bb3ec30f9029fb26c37ee966558437ace37014 tags:
``` ```
inertia_loop = [] inertia_loop = []
for y in [y1, y2]: for y in [y1, y2]:
total_inertia = 0 total_inertia = 0
for j in range(max(y) + 1): for j in range(max(y) + 1):
X_j = X[y == j] X_j = X[y == j]
mu_j = X_j.mean(axis=0) mu_j = X_j.mean(axis=0)
diff = X_j - mu_j diff = X_j - mu_j
inertia = np.sum(diff ** 2) inertia = np.sum(diff ** 2)
total_inertia += inertia total_inertia += inertia
inertia_loop.append(total_inertia) inertia_loop.append(total_inertia)
``` ```
%% Cell type:code id:0010-312845f479b003bfe4db2fc5e3c7f7b7f42c593739d43473d785c47586a tags: %% Cell type:code id:0010-312845f479b003bfe4db2fc5e3c7f7b7f42c593739d43473d785c47586a tags:
``` ```
inertia_loop inertia_loop
``` ```
%% Output %% Output
[663.5525861044789, 420.48555691708304] [663.5525861044789, 420.48555691708304]
%% Cell type:markdown id:0011-f01157f3b95bd8a3730718503771b0bfdc3571b74ca13b9939f17eb58ab tags: %% Cell type:markdown id:0011-2e0e1e9b6d0bd37fa78d2d2bdab3c699c3267bc9df7f2fdb3e03cc42823 tags:
Im zweiten Ansatz arbeiten wir ohne (innere) `for`-Schleife und Im zweiten Ansatz arbeiten wir ohne (innere) `for`-Schleife und
verwenden anstatt dessen `list`-Comprehensions. Zunächst berechnen wir verwenden anstatt dessen `list`-Comprehensions. Zunächst berechnen wir
die Repräsentanten für alle Cluster `mu` im Prinzip analog zum ersten die Repräsentanten für alle Cluster `mu` im Prinzip analog zum ersten
Ansatz. Dann berechnen wir die quadrierte Distanzmatrix Ansatz. Dann berechnen wir die quadrierte Distanzmatrix
`sqr_dist_matrix`, analog zu den Folien (nur halt ohne Wurzel). `sqr_dist_matrix`, analog zu den Folien (nur halt ohne Wurzel).
Anschließend wählen wir per Indizierung die Abstände der Samples, die zu Anschließend wählen wir entweder per Indizierung die Abstände der
Cluster `j` gehören, zu $\mu_j$ (Spalte `j`) und summieren sie auf. Das Samples, die zu Cluster `j` gehören, zu $\mu_j$ (Spalte `j`) und
wird in der `list`-Comprehension für jedes Cluster `j` gemacht und diese summieren sie auf. Das wird in der `list`-Comprehension für jedes
Trägheitsmomente werden zum Gesamtträgheitsmoment `total_inertia` Cluster `j` gemacht und diese Trägheitsmomente werden zum
aufsummiert. Gesamtträgheitsmoment `total_inertia` aufsummiert. Oder wir machen das
per komplexer Indizierung mit Indexpaaren. Hierbei wählt `y` jeweils die
richtige Spalte aus.
%% Cell type:code id:0012-fe2cf201c04462c6164ddc58d877a6d58d84dc101e5c31b244951544183 tags: %% Cell type:code id:0012-9050cec2f330532f97b66a58127c4417c52d272f1b2007d7f2c90548730 tags:
``` ```
inertia_comp = [] inertia_comp = []
for y in [y1, y2]: for y in [y1, y2]:
k = max(y) + 1 k = max(y) + 1
mu = np.array([X[y == j].mean(axis=0) for j in range(k)]) mu = np.array([X[y == j].mean(axis=0) for j in range(k)])
diff = X[:, np.newaxis, :] - mu[np.newaxis, :, :] diff = X[:, np.newaxis, :] - mu[np.newaxis, :, :]
sqr_dist_matrix = np.sum(diff ** 2, axis=2) sqr_dist_matrix = np.sum(diff ** 2, axis=2)
# entweder mit list Comprehension
total_inertia = sum([np.sum(sqr_dist_matrix[y == j, j]) for j in range(k)]) total_inertia = sum([np.sum(sqr_dist_matrix[y == j, j]) for j in range(k)])
# oder paarweiser Indizierung
all_rows = np.arange(len(y))
total_inertia = np.sum(sqr_dist_matrix[all_rows, y])
inertia_comp.append(total_inertia) inertia_comp.append(total_inertia)
``` ```
%% Cell type:code id:0013-54434e150ebc898aa6628247bbd58911180b2dd5b07d06ab7fd7306f86b tags: %% Cell type:code id:0013-54434e150ebc898aa6628247bbd58911180b2dd5b07d06ab7fd7306f86b tags:
``` ```
inertia_comp inertia_comp
``` ```
%% Output %% Output
[663.552586104479, 420.48555691708304] [663.552586104479, 420.48555691708304]
......
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Code zu Folien # Code zu Folien
Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten. Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
import timeit
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from IPython.display import display from IPython.display import display
import seaborn as sns import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import keras import keras
sns.set_theme() sns.set_theme()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
def plot_clusters(X, labels, centers=None, ax=None, grid=True, xlabel='$x_1$', ylabel='$x_2$'): def plot_clusters(X, labels, centers=None, ax=None, grid=True, xlabel='$x_1$', ylabel='$x_2$'):
if ax is None: if ax is None:
fig, ax = plt.subplots() fig, ax = plt.subplots()
markers=['*', 's', '^', 'D', 'o', '<', 'h', 'H', 'X', 'd', 'P', '.'] markers=['*', 's', '^', 'D', 'o', '<', 'h', 'H', 'X', 'd', 'P', '.']
colorName = ['teal','orange','firebrick','dodgerblue', 'b', 'y', 'k'] colorName = ['teal','orange','firebrick','dodgerblue', 'b', 'y', 'k']
for i in np.unique(labels): for i in np.unique(labels):
groupi = labels == i groupi = labels == i
ax.scatter(X[groupi, 0], X[groupi, 1], s=120 if i == 0 else 60, alpha=0.2, marker=markers[i], c=colorName[i]) ax.scatter(X[groupi, 0], X[groupi, 1], s=120 if i == 0 else 60, alpha=0.2, marker=markers[i], c=colorName[i])
if centers is not None: if centers is not None:
for i in range(len(centers)): for i in range(len(centers)):
ax.scatter(centers[i, 0], centers[i, 1], s=200 if i == 0 else 100, ax.scatter(centers[i, 0], centers[i, 1], s=200 if i == 0 else 100,
marker=markers[i], c='black', edgecolors=colorName[i]) marker=markers[i], c='black', edgecolors=colorName[i])
ax.scatter(centers[i, 0], centers[i, 1], s=12, c=colorName[i]) ax.scatter(centers[i, 0], centers[i, 1], s=12, c=colorName[i])
ax.set_xlabel(xlabel) ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel) ax.set_ylabel(ylabel)
if grid: if grid:
ax.grid(True) ax.grid(True)
return ax return ax
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
rng = np.random.default_rng() rng = np.random.default_rng()
X, y = make_blobs(n_samples=[100, 100, 400, 400], random_state=1) X, y = make_blobs(n_samples=[100, 100, 400, 400], random_state=1)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolors='k', alpha=0.5) plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolors='k', alpha=0.5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
k = 4 k = 4
# NumPy: # NumPy:
# mu = rng.choice(X, size=k, replace=False) # zufällige Startwerte bei NumPy-Array # mu = rng.choice(X, size=k, replace=False) # zufällige Startwerte bei NumPy-Array
mu = X[[4, 7, 100, 102], :] # nicht all zu schlechte Startwerte von Folie "k-Means Algorithmus – Ablauf" mu = X[[4, 7, 100, 102], :] # nicht all zu schlechte Startwerte von Folie "k-Means Algorithmus – Ablauf"
# mu = X[[28, 732, 926, 966], :] # schlechte Startwerte von Folie "Zufällige Startwerte ⇒ nicht deterministisch" # mu = X[[28, 732, 926, 966], :] # schlechte Startwerte von Folie "Zufällige Startwerte ⇒ nicht deterministisch"
# mu = X[[100, 50, 39, 42], :] # schlechte Startwerte von Folie "Mehrere Durchläufe", mittlere Abbildung # mu = X[[100, 50, 39, 42], :] # schlechte Startwerte von Folie "Mehrere Durchläufe", mittlere Abbildung
# # Pandas: # # Pandas:
# mu = X.sample(k, ignore_index=True, random_state=0) # zufällige Startwerte bei Pandas-DataFrame # mu = X.sample(k, ignore_index=True, random_state=0) # zufällige Startwerte bei Pandas-DataFrame
print(mu.shape) print(mu.shape)
print(X.shape) print(X.shape)
plot_clusters(X, np.full_like(y, 6), mu) plot_clusters(X, np.full_like(y, 6), mu)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
diff = X[:, np.newaxis, :] - mu[np.newaxis, :, :] diff = X[:, np.newaxis, :] - mu[np.newaxis, :, :]
dist_matrix = np.sqrt(np.sum(diff ** 2, axis=2)) dist_matrix = np.sqrt(np.sum(diff ** 2, axis=2))
print(dist_matrix.shape) print(dist_matrix.shape)
labels = np.argmin(dist_matrix, axis=1) # Zuordnung der Samples labels = np.argmin(dist_matrix, axis=1) # Zuordnung der Samples
mu = np.array([X[labels == j].mean(axis=0) for j in range(k)]) mu = np.array([X[labels == j].mean(axis=0) for j in range(k)])
plot_clusters(X, labels, mu) plot_clusters(X, labels, mu)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
a = np.array([10, 20, 30]).reshape(3, 1) a = np.array([10, 20, 30]).reshape(3, 1)
b = np.array([1, 2, 3]) b = np.array([1, 2, 3])
display(a + b) display(a + b)
print(a.shape) print(a.shape)
print(' ', b.shape) print(' ', b.shape)
print((a + b).shape) print((a + b).shape)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
X_iris = load_iris()['data'] X_iris = load_iris()['data']
x_test = [5.5, 2.5, 5, 1.5] x_test = [5.5, 2.5, 5, 1.5]
diff = X_iris - x_test diff = X_iris - x_test
display(diff) display(diff)
print(X_iris.shape) print(X_iris.shape)
print(' ', np.shape(x_test)) print(' ', np.shape(x_test))
print(diff.shape) print(diff.shape)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
(X_train, y_train), (X_test, y_test) = keras.datasets.cifar10.load_data() (X_train, y_train), (X_test, y_test) = keras.datasets.cifar10.load_data()
ship = X_test[1] / 255 ship = X_test[1] / 255
scale = np.array([0.1, 1, 0.7]) scale = np.array([0.1, 1, 0.7])
result = ship * scale result = ship * scale
plt.figure() plt.figure()
plt.imshow(ship) plt.imshow(ship)
plt.figure() plt.figure()
plt.imshow(result) plt.imshow(result)
print(ship.shape) print(ship.shape)
print(' ', scale.shape) print(' ', scale.shape)
print(result.shape) print(result.shape)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# Demonstrates a broadcasting pitfall: Keras' predict returns a 2D column
# array, so subtracting a 1D target vector silently broadcasts to 2D and
# produces a wrong mean absolute error. squeeze() fixes the shape.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)
# # scikit-learn's predict method returns a 1D array
# from sklearn.neighbors import KNeighborsClassifier
# model = KNeighborsClassifier()
# model.fit(X_train, y_train)
# Keras' predict method returns a 2D column array
from keras import Sequential, Input
Dense = keras.layers.Dense
model = Sequential()
model.add(Input(X_train.shape[1:]))
model.add(Dense(50, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, batch_size=71, verbose=False)
y_p = model.predict(X_test, verbose=False)  # 2D: shape (n, 1)
err = y_p - y_test  # broadcasts (n, 1) - (n,) to (n, n)!
mae = np.mean(np.abs(err))  # no error raised, but the value is wrong
print(f'ohne squeeze: {y_p.shape=}, {err.shape=}, {mae=:.3}')
y_p = model.predict(X_test, verbose=False).squeeze()  # reduce to 1D
err = y_p - y_test
mae = np.mean(np.abs(err))  # correct value
print(f'mit squeeze: {y_p.shape=}, {err.shape=}, {mae=:.3}')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
from sklearn.neighbors import NearestNeighbors

# Six hand-picked points; query the 3 nearest neighbors of each point.
X = np.array([
    [-4, 3],
    [0, 3],
    [0, 0],
    [3, 0],
    [4, 0],
    [4, 1],
])
plt.scatter(*X.T, c=range(len(X)))
plt.axis('equal')
nn = NearestNeighbors().fit(X)  # fit returns the estimator itself
dists, indices = nn.kneighbors(X, 3)
# Column 0 is each point itself (distance 0), so drop it.
display(dists[:, 1:].round(2))
display(indices[:, 1:])
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# Histogram of each point's distance to its farthest returned neighbor.
# kneighbors on the training points returns the point itself first, so
# with n_neighbors=3 the max distance is that to the 2nd-nearest other point.
X = np.array([
    [0.0, 0.0],
    [0.25, 0.5],
    [1.0, 1.0],
    [1.5, 0.75],
    [0.0, 1.5],
    [2.5, 0.5],
    [4.5, 1.0],
    [5.0, 2.0],
])
knn = NearestNeighbors(n_neighbors=3).fit(X)
dists, neighbors = knn.kneighbors(X)
max_dists = dists.max(axis=1)
plt.hist(max_dists, bins=5, range=[0.5, 3], edgecolor='w')
plt.xlabel(f'Max distances to {dists.shape[1] - 1} neighbors')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# Same neighbor-distance histogram on two-moons data; the distribution of
# these distances is a common heuristic for choosing DBSCAN's eps.
X, _ = make_moons(n_samples=480, noise=0.02)
knn = NearestNeighbors(n_neighbors=5).fit(X)
dists, neighbors = knn.kneighbors(X)
max_dists = dists.max(axis=1)
plt.hist(max_dists, bins=20, edgecolor='w')
plt.xlabel(f'Max distances to {dists.shape[1] - 1} neighbors')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# data: concentric rings to study how DBSCAN's eps and min_samples interact
X, y = make_circles(1000, noise=0.075, factor=0.5, random_state=42)
plt.scatter(*X.T, c='gray', edgecolors='k')
plt.axis('equal')
# eps-k data: grid over (min_samples, eps), record cluster count and noise level
min_samples_range = np.arange(2, 10)
eps_range = np.arange(0.01, 0.2, 0.001)
records = []
for min_samples in min_samples_range:
    for eps in eps_range:
        clusterer = DBSCAN(eps=eps, min_samples=min_samples)
        labels = clusterer.fit_predict(X)
        noise_lvl = np.mean(labels < 0)  # fraction labeled -1 (noise)
        n_clusters = labels.max() + 1    # cluster labels run 0..k-1
        records.append((min_samples, eps, n_clusters, noise_lvl))
df = pd.DataFrame.from_records(records, columns=['min_samples', 'eps', 'n_clusters', 'noise_lvl'])
df_clusters = df.pivot(index='eps', columns='min_samples', values='n_clusters')
# eps-k plot: one curve per min_samples value
plt.figure()
ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainbow', legend='full')
# ax.set_ylim(bottom=0)
ax.set_ylim(bottom=0, top=10)     # zoom on the y axis
ax.set_xlim(left=0.1, right=0.2)  # zoom on the x axis
```
%% Cell type:code id: tags:
```
# Color each point by its 4th-nearest-neighbor distance as a local-density
# proxy: normal data is densest in the middle, uniform data is even.
n = 2
rng = np.random.default_rng()
datasets = {
    'normal': rng.standard_normal(size=(2000, n), dtype='float32'),
    'uniform': rng.random(size=(2000, n), dtype='float32'),
}
for name, X in datasets.items():
    nn = NearestNeighbors(n_neighbors=5, n_jobs=1).fit(X)
    dists, neighbors = nn.kneighbors(X)
    # each point is its own nearest neighbor, so this is the distance
    # to the 4th-nearest *other* point
    max_dists = dists.max(axis=1)
    fig, ax = plt.subplots(figsize=(3, 3))
    ax.scatter(*X.T, c=-max_dists, alpha=0.7)  # neg to invert color map
    ax.set_aspect('equal')
    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
```
%% Cell type:code id: tags:
```
# Curse of dimensionality: neighbor-distance histograms narrow and shift as
# the feature count grows, for both normal and uniform data.
n_bins = 21
cmap = plt.colormaps['viridis'].resampled(n_bins)
n_feat = [2, 5, 10, 20, 50, 100]
for n in n_feat:
    rng = np.random.default_rng(42)
    datasets = {
        'normal': rng.standard_normal(size=(50_000, n), dtype='float32'),
        'uniform': rng.random(size=(50_000, n), dtype='float32'),
    }
    for name, X in datasets.items():
        # NOTE(review): n_jobs=13 looks machine-specific — confirm intent
        nn = NearestNeighbors(n_neighbors=5, n_jobs=13)
        start = timeit.default_timer()
        nn.fit(X)
        dists, neighbors = nn.kneighbors(X)
        max_dists = dists.max(axis=1)
        end = timeit.default_timer()
        print(f'{name=}, {n=}, runtime: {end - start} s')
        fig, ax = plt.subplots(figsize=(7, 5))
        # clip the plotted range to the 2%-98% quantiles to hide outliers
        data_range = np.quantile(max_dists, [0.02, 0.98])
        N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range)
        for idx, patch in enumerate(patches):
            patch.set_facecolor(cmap(n_bins - 1 - idx))  # reversed colormap
        ax.set_xlim(data_range)
        ax.set_xlabel('Max distances to 4 neighbors')
        ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions')
```
%% Cell type:code id: tags:
```
# Single-linkage behavior on two moons: the chaining effect merges clusters
# point by point; adding outliers makes single linkage isolate them.
n = 20
X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False)
X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]]))
# without outlier: after `iteration` merges, n - iteration clusters remain
# (was hard-coded `20 - iteration`; tied to n so the sample count can vary)
iteration = 18
agg = AgglomerativeClustering(n_clusters=n - iteration, linkage='single')
c = agg.fit_predict(X)
plt.figure()
plt.scatter(*X.T, c=c)
plt.axis('square')
# with outlier: the two far-away points end up in clusters of their own
agg = AgglomerativeClustering(n_clusters=3, linkage='single')
c = agg.fit_predict(X_outlier)
plt.figure()
plt.scatter(*X_outlier.T, c=c)
plt.axis('square')
``` ```
......
...@@ -4,14 +4,16 @@ ...@@ -4,14 +4,16 @@
# Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten. # Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Clustering" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten.
# %% import # %% import
import timeit
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from IPython.display import display from IPython.display import display
import seaborn as sns import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris from sklearn.datasets import make_blobs, make_circles, make_moons, load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import keras import keras
sns.set_theme() sns.set_theme()
...@@ -215,3 +217,76 @@ ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainb ...@@ -215,3 +217,76 @@ ax = sns.lineplot(df, x='eps', y='n_clusters', hue='min_samples', palette='rainb
# ax.set_ylim(bottom=0) # ax.set_ylim(bottom=0)
ax.set_ylim(bottom=0, top=10) # zoom in y-Achse ax.set_ylim(bottom=0, top=10) # zoom in y-Achse
ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse ax.set_xlim(left=0.1, right=0.2) # zoom in x-Achse
# %% density distributions
# Color each point by its 4th-nearest-neighbor distance as a local-density
# proxy: normal data is densest in the middle, uniform data is even.
n = 2
rng = np.random.default_rng()
datasets = {
    'normal': rng.standard_normal(size=(2000, n), dtype='float32'),
    'uniform': rng.random(size=(2000, n), dtype='float32'),
}
for name, X in datasets.items():
    nn = NearestNeighbors(n_neighbors=5, n_jobs=1).fit(X)
    dists, neighbors = nn.kneighbors(X)
    # each point is its own nearest neighbor, so this is the distance
    # to the 4th-nearest *other* point
    max_dists = dists.max(axis=1)
    fig, ax = plt.subplots(figsize=(3, 3))
    ax.scatter(*X.T, c=-max_dists, alpha=0.7)  # neg to invert color map
    ax.set_aspect('equal')
    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
# %% density distributions in high-dimensional spaces
# Curse of dimensionality: neighbor-distance histograms narrow and shift as
# the feature count grows, for both normal and uniform data.
n_bins = 21
cmap = plt.colormaps['viridis'].resampled(n_bins)
n_feat = [2, 5, 10, 20, 50, 100]
for n in n_feat:
    rng = np.random.default_rng(42)
    datasets = {
        'normal': rng.standard_normal(size=(50_000, n), dtype='float32'),
        'uniform': rng.random(size=(50_000, n), dtype='float32'),
    }
    for name, X in datasets.items():
        # NOTE(review): n_jobs=13 looks machine-specific — confirm intent
        nn = NearestNeighbors(n_neighbors=5, n_jobs=13)
        start = timeit.default_timer()
        nn.fit(X)
        dists, neighbors = nn.kneighbors(X)
        max_dists = dists.max(axis=1)
        end = timeit.default_timer()
        print(f'{name=}, {n=}, runtime: {end - start} s')
        fig, ax = plt.subplots(figsize=(7, 5))
        # clip the plotted range to the 2%-98% quantiles to hide outliers
        data_range = np.quantile(max_dists, [0.02, 0.98])
        N, bins, patches = ax.hist(max_dists, bins=n_bins, edgecolor='w', range=data_range)
        for idx, patch in enumerate(patches):
            patch.set_facecolor(cmap(n_bins - 1 - idx))  # reversed colormap
        ax.set_xlim(data_range)
        ax.set_xlabel('Max distances to 4 neighbors')
        ax.set_title(f'Density Histogram for {name} distribution in {n} dimensions')
# %% single-linkage behavior
# Single-linkage clustering on two moons: the chaining effect merges clusters
# point by point; adding outliers makes single linkage isolate them.
n = 20
X, y = make_moons(n_samples=n, noise=0.05, random_state=4, shuffle=False)
X_outlier = np.vstack((X, [[1.2, 0.8], [1.6, 0.95]]))
# without outlier: after `iteration` merges, n - iteration clusters remain
# (was hard-coded `20 - iteration`; tied to n so the sample count can vary)
iteration = 18
agg = AgglomerativeClustering(n_clusters=n - iteration, linkage='single')
c = agg.fit_predict(X)
plt.figure()
plt.scatter(*X.T, c=c)
plt.axis('square')
# with outlier: the two far-away points end up in clusters of their own
agg = AgglomerativeClustering(n_clusters=3, linkage='single')
c = agg.fit_predict(X_outlier)
plt.figure()
plt.scatter(*X_outlier.T, c=c)
plt.axis('square')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment