Skip to content
Snippets Groups Projects
Commit 20469b87 authored by Christof Kaufmann's avatar Christof Kaufmann
Browse files

Notebooks from applied-cs/data-science@bfb5dcac

parent 17e18ced
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id:0003-63fa5c481c78bbdf737c04550a38235a19937a13865b411b2a2f4b0f35b tags:
# Kettenglied Cluster
Im Startcode werden die Kettenglied-Daten geladen.
- Versuchen Sie mit einem Plot über $\varepsilon$ herauszukriegen, wie
viele Cluster es gibt
- Clustern Sie die Daten anschließend mit `DBSCAN` mit den gefundenen
Parametern. Speichern Sie die Labels in `labels`.
Hier ist Ihr Startcode
%% Cell type:code id:0004-172ec8d8337c3342d2de94c7d0b95e0d58501261b0b2b053c81d4df3ebb tags:
```
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import DBSCAN, HDBSCAN
from plotly import graph_objects as go
# Chain-link dataset: one 3D point per row, comma-separated.
X = np.loadtxt('chain.csv', delimiter=',')
```
%% Cell type:markdown id:0006-d062c7158819e9aa5c6c8293c56a7e567d66fc80cdc9e14a22c7dbcbf3f tags:
## Tests
Wir geben hier die Anzahl gefundener Cluster und den Anteil an Rauschen
aus.
%% Cell type:code id:0007-e7e44b3c247a9f8e162855e55d1ae41b0da010167820ecc81de8d505942 tags:
```
# Report the number of clusters found and the fraction of noise points
# (DBSCAN/HDBSCAN mark noise with the label -1).
noise_lvl = (labels < 0).mean()
n_clusters = int(labels.max()) + 1
print(f'n_clusters: {n_clusters}, noise_lvl: {noise_lvl:.1%}')
```
%% Output
n_clusters: 18, noise_lvl: 0.0%
%% Cell type:markdown id:0008-02aa760645262825d3fe0fac5bed63b93cdb9024bd5568fbd9c82870756 tags:
Außerdem plotten wir die Daten mit den gefundenen Labels.
%% Cell type:code id:0009-b26d9437b5037dc028a307b8f9e18e3423e9bdd9d750ce887210662533a tags:
```
# Interactive 3D scatter of the points, colored by their cluster label.
scatter = go.Scatter3d(
    x=X[:, 0],
    y=X[:, 1],
    z=X[:, 2],
    mode='markers',
    marker=dict(size=3, color=labels),
    hovertemplate='label: %{marker.color}<br>x: %{x}<br>y: %{y}<br>z: %{z}<extra></extra>',
)
fig = go.Figure(data=[scatter])
fig.show()
```
This diff is collapsed.
...@@ -32,13 +32,12 @@ from sklearn.metrics import confusion_matrix ...@@ -32,13 +32,12 @@ from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment from scipy.optimize import linear_sum_assignment
# # Install these with from terminal with: # # Install these with from terminal with:
# mamba install hdbscan scikit-learn-extra # mamba install hdbscan
# pip install hdbscan scikit-learn-extra # pip install hdbscan
# # Install these with from this script with: # # Install these with from this script with:
# !mamba install -y hdbscan scikit-learn-extra # !mamba install -y hdbscan
# !pip install hdbscan scikit-learn-extra # !pip install hdbscan
from sklearn_extra.cluster import CommonNNClustering
# from sklearn.cluster import HDBSCAN # does not have soft clustering and tree plot # from sklearn.cluster import HDBSCAN # does not have soft clustering and tree plot
from hdbscan import HDBSCAN from hdbscan import HDBSCAN
import hdbscan import hdbscan
...@@ -188,6 +187,7 @@ fig_dbscan = plt.figure('DBSCAN', figsize=(8, 8)) ...@@ -188,6 +187,7 @@ fig_dbscan = plt.figure('DBSCAN', figsize=(8, 8))
ax_dbscan = fig_dbscan.add_subplot(2, 1, 1) ax_dbscan = fig_dbscan.add_subplot(2, 1, 1)
sc_dbscan = ax_dbscan.scatter(*X.T, alpha=0.8) sc_dbscan = ax_dbscan.scatter(*X.T, alpha=0.8)
sc_dbscan.set_edgecolor(cm((y + 1) % cm.N)) sc_dbscan.set_edgecolor(cm((y + 1) % cm.N))
ax_dbscan.set_title('DBSCAN')
ax_dbscan.set_xticks([]) ax_dbscan.set_xticks([])
ax_dbscan.set_yticks([]) ax_dbscan.set_yticks([])
...@@ -204,14 +204,11 @@ dbscan_samples_slider = Slider(ax=ax_dbscan_samples_slider, label='min_samples', ...@@ -204,14 +204,11 @@ dbscan_samples_slider = Slider(ax=ax_dbscan_samples_slider, label='min_samples',
ax_dbscan_show = fig_dbscan.add_axes([0.99 - 0.3, 0.95 - 0.05, 0.3, 0.05]) ax_dbscan_show = fig_dbscan.add_axes([0.99 - 0.3, 0.95 - 0.05, 0.3, 0.05])
dbscan_show_button = CheckButtons(ax=ax_dbscan_show, labels=['Show Ground Truth']) dbscan_show_button = CheckButtons(ax=ax_dbscan_show, labels=['Show Ground Truth'])
ax_dbscan_button = fig_dbscan.add_axes([0.99 - 0.25, 0.95 - 0.05 - 0.05, 0.25, 0.05])
dbscan_button = CheckButtons(ax=ax_dbscan_button, labels=['DBSCAN / CNNC'])
ax_dbscan_bars = fig_dbscan.add_subplot(2, 1, 2) ax_dbscan_bars = fig_dbscan.add_subplot(2, 1, 2)
ax_dbscan_bars.bar([0,1], [0, 1]) ax_dbscan_bars.bar([0,1], [0, 1])
ax_dbscan_bars.set_xticks([]) ax_dbscan_bars.set_xticks([])
# merge small clusters to the nearest large cluster (useful for CommonNNClustering) # merge small clusters to the nearest large cluster (was useful for CommonNNClustering)
def merge_small_clusters(X, labels, limit=15, inplace=False): def merge_small_clusters(X, labels, limit=15, inplace=False):
noise_labels = labels == -1 noise_labels = labels == -1
clusters, counts = np.unique(labels[~noise_labels], return_counts=True) clusters, counts = np.unique(labels[~noise_labels], return_counts=True)
...@@ -236,14 +233,7 @@ def merge_small_clusters(X, labels, limit=15, inplace=False): ...@@ -236,14 +233,7 @@ def merge_small_clusters(X, labels, limit=15, inplace=False):
def update_dbscan(val=None): def update_dbscan(val=None):
if dbscan_button.get_status()[0]:
alg = CommonNNClustering(eps=dbscan_eps_slider.val, min_samples=int(dbscan_samples_slider.val))
alg_name = 'Common Nearest Neighbor Clustering'
else:
alg = DBSCAN(eps=dbscan_eps_slider.val, min_samples=int(dbscan_samples_slider.val)) alg = DBSCAN(eps=dbscan_eps_slider.val, min_samples=int(dbscan_samples_slider.val))
alg_name = 'DBSCAN'
ax_dbscan.set_title(alg_name)
labels = alg.fit_predict(X) labels = alg.fit_predict(X)
merge_small_clusters(X, labels, inplace=True) merge_small_clusters(X, labels, inplace=True)
matched_labels = match_labels(labels) matched_labels = match_labels(labels)
...@@ -261,7 +251,6 @@ def update_dbscan(val=None): ...@@ -261,7 +251,6 @@ def update_dbscan(val=None):
dbscan_eps_slider.on_changed(update_dbscan) dbscan_eps_slider.on_changed(update_dbscan)
dbscan_samples_slider.on_changed(update_dbscan) dbscan_samples_slider.on_changed(update_dbscan)
dbscan_button.on_clicked(update_dbscan)
dbscan_show_button.on_clicked(update_dbscan) dbscan_show_button.on_clicked(update_dbscan)
update_dbscan() update_dbscan()
plt.show() plt.show()
......
%% Cell type:markdown id:0003-63fa5c481c78bbdf737c04550a38235a19937a13865b411b2a2f4b0f35b tags:
# Kettenglied Cluster
Im Startcode werden die Kettenglied-Daten geladen.
- Versuchen Sie mit einem Plot über $\varepsilon$ herauszukriegen, wie
viele Cluster es gibt
- Clustern Sie die Daten anschließend mit `DBSCAN` mit den gefundenen
Parametern. Speichern Sie die Labels in `labels`.
Hier ist Ihr Startcode
%% Cell type:code id:0004-172ec8d8337c3342d2de94c7d0b95e0d58501261b0b2b053c81d4df3ebb tags:
```
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import DBSCAN, HDBSCAN
from plotly import graph_objects as go
# Chain-link dataset: one 3D point per row, comma-separated.
X = np.loadtxt('chain.csv', delimiter=',')
```
%% Cell type:markdown id:0006-1e1baaf19132e8389263dc7101c2161704a6858a2a3509b8b9784e04a23 tags:
## Lösung
Zunächst definieren wir die Bereiche für die Parameter `eps` und
`min_samples`. Für jede Kombination von `eps` und `min_samples` führen
wir `DBSCAN` aus und speichern die Anzahl der gefundenen Cluster
zusammen mit `eps` und `min_samples` in einer Liste. Diese Liste wandeln
wir anschließend in ein DataFrame um.
%% Cell type:code id:0007-03594f919e9a4e5881b9b1e6714a3597c78e3e46cebe932b3748e93124f tags:
```
# Grid search over DBSCAN hyper-parameters: for every (min_samples, eps)
# combination record the resulting number of clusters and the noise fraction.
min_samples_range = np.arange(2, 10)
eps_range = np.arange(0.16, 0.39, 0.02)
records = []
for min_samples in min_samples_range:
    for eps in eps_range:
        clusterer = DBSCAN(eps=eps, min_samples=min_samples)
        labels = clusterer.fit_predict(X)
        # label -1 marks noise; max label + 1 is the cluster count
        records.append((min_samples, eps, labels.max() + 1, np.mean(labels < 0)))
df = pd.DataFrame.from_records(records, columns=['min_samples', 'eps', 'n_clusters', 'noise_lvl'])
```
%% Cell type:markdown id:0008-1c28662b4a9b50bbbcf045db605e433969f0abb64225270c4c3d8df12b3 tags:
Dann lässt sich das z. B. mit Seaborns `lineplot` plotten. Wir
verschieben die y-Achse um den Wert von `min_samples` und multiplizieren
mit einem Faktor, damit die Kurven nicht übereinander liegen. Außerdem
setzen wir die Grenzen der x- und y-Achse, damit wir uns auf den
interessanten Bereich konzentrieren.
%% Cell type:code id:0009-18c3413612249d6d54e5becb70cce1c4583afe097ca75b232a62eadb882 tags:
```
# Plot n_clusters over eps, one curve per min_samples value. Each curve is
# vertically offset a little so curves with identical values don't overlap.
plt.figure()
df_plt = df.copy()
offset = (df_plt['min_samples'] - 5.5) * 0.07
df_plt['n_clusters'] += offset
ax = sns.lineplot(df_plt, x='eps', y='n_clusters', hue='min_samples', palette='rainbow', legend='full')
# restrict both axes to the interesting region
ax.set_ylim(bottom=0, top=25)
ax.set_xlim(left=0.16, right=0.38)
```
%% Cell type:markdown id:0010-e08d4bcdda9e99f64a3fde59e80ff5c65cfce2c7645844a3edf9115ae2d tags:
Alternativ lassen sich die Daten auch als Heatmap darstellen. Hierbei
ist `vmax` auf 25 gesetzt, damit die Ausreißer die Farben nicht
beeinflussen. Außerdem setzen wir `annot=True`, damit die Anzahl der
Cluster in die Zellen geschrieben wird.
%% Cell type:code id:0011-dfcc852ce6bddb3c8367b16667188a4905ce5926e84840a6a47458df55d tags:
```
# Same sweep as a heatmap: eps as rows, min_samples as columns,
# cell value = number of clusters found.
plt.figure()
df_clusters = df.pivot(index='eps', columns='min_samples', values='n_clusters')
sns.heatmap(
    df_clusters,
    vmax=25,                # cap the color scale so outliers don't wash out the palette
    annot=True,             # write the cluster counts into the cells
    fmt='d',
    cbar_kws={'label': 'n_clusters'},
    yticklabels=df_clusters.index.values.round(2),
)
```
%% Cell type:markdown id:0012-11a2a41b09d2c97379afb6ee828f545b345a9f604a9e223c2274921b3ce tags:
Wir sehen, dass ein `eps` zwischen 0.22 und 0.32 gewählt werden kann und
wir 18 Cluster erhalten sollten. Wir wählen `eps=0.32`, um möglichst
wenig Rauschen zu erhalten, sowie `min_samples=5` und führen `DBSCAN` aus.
Die Labels werden in der Variable `labels` gespeichert.
%% Cell type:code id:0013-a4adbdcc1b97ce2d982515e21498cee223de6225681964c629986667b42 tags:
```
# Final clustering with the parameters found in the sweep; eps=0.32 keeps
# the noise fraction minimal while still yielding 18 clusters.
clusterer = DBSCAN(min_samples=5, eps=0.32)
labels = clusterer.fit(X).labels_
```
%% Cell type:markdown id:0014-93102c00a0113d350231e3596456f2b8581bfdade70ec4ad4e642129496 tags:
Alternativ können wir auch `HDBSCAN` verwenden. Hierbei ist die Wahl von
`min_samples` nicht so wichtig, da `HDBSCAN` die Parameter automatisch
anpasst. Wir verwenden hier den Standardwert von 5. In der Tat erhalten
wir mit `HDBSCAN` ohne Einstellung der Parameter die gleiche Anzahl an
Clustern wie mit `DBSCAN`.
%% Cell type:code id:0015-10e195334248e3113087be5a111beaf458fbde346a80448174f3230cea8 tags:
```
# Alternative: HDBSCAN with default parameters yields the same cluster count
# without manual eps tuning.
clusterer = HDBSCAN()
labels = clusterer.fit_predict(X)
```
%% Cell type:markdown id:0017-d062c7158819e9aa5c6c8293c56a7e567d66fc80cdc9e14a22c7dbcbf3f tags:
## Tests
Wir geben hier die Anzahl gefundener Cluster und den Anteil an Rauschen
aus.
%% Cell type:code id:0018-e7e44b3c247a9f8e162855e55d1ae41b0da010167820ecc81de8d505942 tags:
```
# Report the number of clusters found and the fraction of noise points
# (noise points carry the label -1).
noise_lvl = np.count_nonzero(labels < 0) / labels.size
n_clusters = 1 + labels.max()
print(f'n_clusters: {n_clusters}, noise_lvl: {noise_lvl:.1%}')
```
%% Output
n_clusters: 18, noise_lvl: 0.0%
%% Cell type:markdown id:0019-02aa760645262825d3fe0fac5bed63b93cdb9024bd5568fbd9c82870756 tags:
Außerdem plotten wir die Daten mit den gefundenen Labels.
%% Cell type:code id:0020-b26d9437b5037dc028a307b8f9e18e3423e9bdd9d750ce887210662533a tags:
```
# Interactive 3D scatter of the points, colored by their cluster label.
fig = go.Figure(data=[
    go.Scatter3d(x=X[:, 0], y=X[:, 1], z=X[:, 2], marker=dict(size=3, color=labels), mode='markers', hovertemplate='label: %{marker.color}<br>x: %{x}<br>y: %{y}<br>z: %{z}<extra></extra>')
])
fig.show()
```
This diff is collapsed.
...@@ -32,13 +32,12 @@ from sklearn.metrics import confusion_matrix ...@@ -32,13 +32,12 @@ from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment from scipy.optimize import linear_sum_assignment
# # Install these with from terminal with: # # Install these with from terminal with:
# mamba install hdbscan scikit-learn-extra # mamba install hdbscan
# pip install hdbscan scikit-learn-extra # pip install hdbscan
# # Install these with from this script with: # # Install these with from this script with:
# !mamba install -y hdbscan scikit-learn-extra # !mamba install -y hdbscan
# !pip install hdbscan scikit-learn-extra # !pip install hdbscan
from sklearn_extra.cluster import CommonNNClustering
# from sklearn.cluster import HDBSCAN # does not have soft clustering and tree plot # from sklearn.cluster import HDBSCAN # does not have soft clustering and tree plot
from hdbscan import HDBSCAN from hdbscan import HDBSCAN
import hdbscan import hdbscan
...@@ -188,6 +187,7 @@ fig_dbscan = plt.figure('DBSCAN', figsize=(8, 8)) ...@@ -188,6 +187,7 @@ fig_dbscan = plt.figure('DBSCAN', figsize=(8, 8))
ax_dbscan = fig_dbscan.add_subplot(2, 1, 1) ax_dbscan = fig_dbscan.add_subplot(2, 1, 1)
sc_dbscan = ax_dbscan.scatter(*X.T, alpha=0.8) sc_dbscan = ax_dbscan.scatter(*X.T, alpha=0.8)
sc_dbscan.set_edgecolor(cm((y + 1) % cm.N)) sc_dbscan.set_edgecolor(cm((y + 1) % cm.N))
ax_dbscan.set_title('DBSCAN')
ax_dbscan.set_xticks([]) ax_dbscan.set_xticks([])
ax_dbscan.set_yticks([]) ax_dbscan.set_yticks([])
...@@ -204,14 +204,11 @@ dbscan_samples_slider = Slider(ax=ax_dbscan_samples_slider, label='min_samples', ...@@ -204,14 +204,11 @@ dbscan_samples_slider = Slider(ax=ax_dbscan_samples_slider, label='min_samples',
ax_dbscan_show = fig_dbscan.add_axes([0.99 - 0.3, 0.95 - 0.05, 0.3, 0.05]) ax_dbscan_show = fig_dbscan.add_axes([0.99 - 0.3, 0.95 - 0.05, 0.3, 0.05])
dbscan_show_button = CheckButtons(ax=ax_dbscan_show, labels=['Show Ground Truth']) dbscan_show_button = CheckButtons(ax=ax_dbscan_show, labels=['Show Ground Truth'])
ax_dbscan_button = fig_dbscan.add_axes([0.99 - 0.25, 0.95 - 0.05 - 0.05, 0.25, 0.05])
dbscan_button = CheckButtons(ax=ax_dbscan_button, labels=['DBSCAN / CNNC'])
ax_dbscan_bars = fig_dbscan.add_subplot(2, 1, 2) ax_dbscan_bars = fig_dbscan.add_subplot(2, 1, 2)
ax_dbscan_bars.bar([0,1], [0, 1]) ax_dbscan_bars.bar([0,1], [0, 1])
ax_dbscan_bars.set_xticks([]) ax_dbscan_bars.set_xticks([])
# merge small clusters to the nearest large cluster (useful for CommonNNClustering) # merge small clusters to the nearest large cluster (was useful for CommonNNClustering)
def merge_small_clusters(X, labels, limit=15, inplace=False): def merge_small_clusters(X, labels, limit=15, inplace=False):
noise_labels = labels == -1 noise_labels = labels == -1
clusters, counts = np.unique(labels[~noise_labels], return_counts=True) clusters, counts = np.unique(labels[~noise_labels], return_counts=True)
...@@ -236,14 +233,7 @@ def merge_small_clusters(X, labels, limit=15, inplace=False): ...@@ -236,14 +233,7 @@ def merge_small_clusters(X, labels, limit=15, inplace=False):
def update_dbscan(val=None): def update_dbscan(val=None):
if dbscan_button.get_status()[0]:
alg = CommonNNClustering(eps=dbscan_eps_slider.val, min_samples=int(dbscan_samples_slider.val))
alg_name = 'Common Nearest Neighbor Clustering'
else:
alg = DBSCAN(eps=dbscan_eps_slider.val, min_samples=int(dbscan_samples_slider.val)) alg = DBSCAN(eps=dbscan_eps_slider.val, min_samples=int(dbscan_samples_slider.val))
alg_name = 'DBSCAN'
ax_dbscan.set_title(alg_name)
labels = alg.fit_predict(X) labels = alg.fit_predict(X)
merge_small_clusters(X, labels, inplace=True) merge_small_clusters(X, labels, inplace=True)
matched_labels = match_labels(labels) matched_labels = match_labels(labels)
...@@ -261,7 +251,6 @@ def update_dbscan(val=None): ...@@ -261,7 +251,6 @@ def update_dbscan(val=None):
dbscan_eps_slider.on_changed(update_dbscan) dbscan_eps_slider.on_changed(update_dbscan)
dbscan_samples_slider.on_changed(update_dbscan) dbscan_samples_slider.on_changed(update_dbscan)
dbscan_button.on_clicked(update_dbscan)
dbscan_show_button.on_clicked(update_dbscan) dbscan_show_button.on_clicked(update_dbscan)
update_dbscan() update_dbscan()
plt.show() plt.show()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment