# Reconstructed from a git patch whose line breaks had been lost:
#   commit 2baf5d16e780c2704f3aa553fe3e678ee604ea20
#   From: Christof Kaufmann <christof.kaufmann@hs-bochum.de>
#   Date: Wed, 8 May 2024 06:45:48 +0000
#   Subject: [PATCH] Upload Code (Python Script)
#   File: 04-pandas-und-seaborn/folien-code.py (new file)

# %% [markdown]
# # Code for the slides
#
# This script / Jupyter notebook contains the code that is also shown on the slides "Pandas & Seaborn". Use it to prepare, to follow along, or to review afterwards.

# %% import Pandas
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display


# %% Iris Flower Dataset
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
df = pd.read_csv(url)

# offline alternative:
# from sklearn.datasets import load_iris
# df = pd.concat(load_iris(return_X_y=True, as_frame=True), axis='columns')
# df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
# df['species'] = df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

# Store the class label as a categorical; later cells rely on `.cat.codes`.
df['species'] = df['species'].astype('category')
df


# %% Information
print(df.shape)
print(df.columns)
print(df.dtypes)
print(df.index)
df.info()


# %% Statistical overview
display(df.describe())
display(df.describe(exclude='number'))


# %% Pie chart
counts = df['species'].value_counts()
display(counts)

# `autopct` receives the bound `str.format` of the template as its callable.
counts.plot.pie(startangle=60, autopct='{:.2f}%'.format)
plt.ylabel('species')


# %% Boxplot
df.boxplot(column='petal_length', by='species')


# %% Boxplots of all features
fig, axs = plt.subplots(2, 2, sharey=False)  # independent y axes
pd.plotting.boxplot(df, by='species', ax=axs)  # pass the 2x2 axes grid
for ax in axs.ravel():  # remove the x labels
    ax.set_xlabel('')
fig.tight_layout()


# %% Violin plot
import seaborn as sns
sns.violinplot(hue='species', y='petal_length', data=df)


# %% Scatter plots
df.plot.scatter(x='petal_length', y='petal_width', c='species', colormap='viridis', alpha=0.7)


# %% Pair plot
sns.pairplot(df, hue='species', plot_kws={'alpha': 0.5})


# %% Parallel coordinates plot, unscaled
pd.plotting.parallel_coordinates(df, 'species', colormap='viridis', alpha=.5)


# %% Parallel coordinates plot, normalised
from sklearn.preprocessing import minmax_scale
num_cols = df.columns.drop('species')
df_scaled = df.copy()
df_scaled[num_cols] = minmax_scale(df[num_cols])
pd.plotting.parallel_coordinates(df_scaled, 'species', colormap='viridis', alpha=.5)


# %% Parallel coordinates plot, custom code from https://stackoverflow.com/a/60401570/2414411
import numpy as np
from matplotlib.path import Path
import matplotlib.patches as patches

ys = df.drop(columns='species')
ynames = ys.columns
ys = ys.to_numpy()
n_axes = ys.shape[1]  # one vertical axis per numeric column
ymins = ys.min(axis=0)
ymaxs = ys.max(axis=0)
dys = ymaxs - ymins
ymins -= dys * 0.05  # add 5% padding below and above
ymaxs += dys * 0.05

# reverse axis 1 to have less crossings
# ymaxs[1], ymins[1] = ymins[1], ymaxs[1]
# dys = ymaxs - ymins

# transform all data to be compatible with the main axis
zs = np.zeros_like(ys)
zs[:, 0] = ys[:, 0]
zs[:, 1:] = (ys[:, 1:] - ymins[1:]) / dys[1:] * dys[0] + ymins[0]

fig, host = plt.subplots(figsize=(10, 4))

# The host axes carries the first column; every other column gets a twin axes
# whose right spine is moved to that column's x position.
axes = [host] + [host.twinx() for _ in range(n_axes - 1)]
for i, ax in enumerate(axes):
    ax.set_ylim(ymins[i], ymaxs[i])
    ax.spines['top'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    if ax is not host:
        ax.spines['left'].set_visible(False)
        ax.yaxis.set_ticks_position('right')
        ax.spines["right"].set_position(("axes", i / (n_axes - 1)))

host.set_xlim(0, n_axes - 1)
host.set_xticks(range(n_axes))
host.set_xticklabels(ynames, fontsize=14)
host.tick_params(axis='x', which='major', pad=7)
host.spines['right'].set_visible(False)
host.xaxis.tick_top()
# host.set_title('Parallel Coordinates Plot — Iris', fontsize=18, pad=12)

# Labels, colours and legend handles are all indexed by the category code, so
# the labels are taken from `.cat.categories` (same order as the codes) rather
# than from `.unique()` (order of first appearance in the data).
target_names = df['species'].cat.categories
target = df['species'].cat.codes.to_numpy()  # positional, independent of df.index
# One evenly spaced viridis colour per class; for three classes these are the
# lookup-table entries 0, 128 and 255.
colors = plt.cm.viridis(np.linspace(0, 1, len(target_names)))
legend_handles = [None] * len(target_names)

# The x coordinates and path codes are identical for every row, so build them once.
# Each axis contributes its point three times (the first and last copies are
# trimmed below), which gives 3 * n_axes - 2 vertices spaced 1/3 apart.
xs = np.linspace(0, n_axes - 1, n_axes * 3 - 2, endpoint=True)
codes = [Path.MOVETO] + [Path.CURVE4] * (len(xs) - 1)
for j in range(ys.shape[0]):
    # create bezier curves
    verts = list(zip(xs, np.repeat(zs[j, :], 3)[1:-1]))
    path = Path(verts, codes)
    patch = patches.PathPatch(path, facecolor='none',
                              lw=2, alpha=0.5, edgecolor=colors[target[j]])
    legend_handles[target[j]] = patch  # the last patch drawn for a class represents it
    host.add_patch(patch)
host.legend(legend_handles, target_names,
            loc='lower center', bbox_to_anchor=(0.5, -0.18),
            ncol=len(target_names), fancybox=True, shadow=True)


# %% Parallel coordinates plot with Plotly Express
import plotly.express as px
# fig = px.parallel_coordinates(df, color="species", labels={'species': tuple('ABC')})
fig = px.parallel_coordinates(df, color=df["species"].cat.codes)
# Rename the last dimension, which is otherwise not labelled with the column name.
fig.data[0]['dimensions'][-1]['label'] = 'species'
fig.show()


# %% Slicing
# `.loc` slices are label based and include both end points.
cp = df.copy()
cp.loc[1, 'sepal_width'] = 1
cp.loc[0:2, 'petal_length'] = 2
cp.loc[0, 'sepal_width':'petal_width'] = 3
cp.loc[1:, 'sepal_length'] = 4
cp.loc[:2, :'sepal_width'] = 5
cp.loc[:49, :].to_csv('iris-setosa.csv')
cp


# %% Fancy indexing
display(df.loc[[0, 149, 2], 'petal_width'])

part = df.loc[[0, 149, 2], ['petal_width', 'sepal_width']]
part


# %% integer location
display(part.iloc[1, -1])
display(part.iloc[:2, -1])
display(part.iloc[[0, 1], [0, 1]])


# %% Boolean indexing
pw = part.loc[:, 'petal_width'] <= 1
sw = part.loc[:, 'sepal_width'] < 3.5
display(pw)
display(sw)
display(~sw)
display(part.loc[pw & sw])
display(part.loc[pw | ~sw])
display(part.loc[pw ^ sw])


# %% Dropping data
display(part.drop(index=149, columns='petal_width'))
display(part.drop(index=[149, 0]))


# %% Adding individual data
part.loc[3] = [2, 6]  # assigning to a new row label appends a row
display(part)
part.loc[:, 'weight'] = [1, 2, 3, 4]  # assigning to a new column label appends a column
display(part)


# %% Combining DataFrames
a = part.drop(index=3)
b = df.loc[:2, ['petal_length', 'petal_width']]
display(a)
display(b)
display(pd.concat((a, b), axis='columns'))
display(pd.concat((a, b), axis='index'))


# %% Categorical data
df['species']
df['species'].info()


# %% Statistical functions
X = df.drop(columns='species')
y = df['species']

display(X.mean())
display(y.value_counts())


# %% Grouping
species_means = X.groupby(y).mean()
display(species_means)

# Squared distance of each class mean to the point [6, 3, 2, 0.5].
diff = species_means - [6, 3, 2, 0.5]
(diff**2).sum(axis='columns')