Upload Code für Folien

aef86389 · Christof Kaufmann · eafb749e · aef86389
Commit aef86389 authored May 8, 2024 by Christof Kaufmann
--- a/04-pandas-und-seaborn/folien-code.ipynb
+++ b/04-pandas-und-seaborn/folien-code.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    " # Code zu Folien\n",
+    "\n",
+    "\n",
+    "\n",
+    " Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien \"Pandas & Seaborn\" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% import Pandas\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from IPython.display import display\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Iris Flower Dataset\n",
+    "url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'\n",
+    "df = pd.read_csv(url)\n",
+    "\n",
+    "# offline-Alternative:\n",
+    "# from sklearn.datasets import load_iris\n",
+    "# df = pd.concat(load_iris(return_X_y=True, as_frame=True), axis='columns')\n",
+    "# df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']\n",
+    "# df['species'] = df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})\n",
+    "\n",
+    "df['species'] = df['species'].astype('category')\n",
+    "df\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Informationen\n",
+    "print(df.shape)\n",
+    "print(df.columns)\n",
+    "print(df.dtypes)\n",
+    "print(df.index)\n",
+    "df.info()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Statistischer Überblick\n",
+    "display(df.describe())\n",
+    "display(df.describe(exclude='number'))\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Kuchendiagramm\n",
+    "counts = df['species'].value_counts()\n",
+    "display(counts)\n",
+    "\n",
+    "counts.plot.pie(startangle=60, autopct='{:.2f}%'.format)\n",
+    "plt.ylabel('species')\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Boxplot\n",
+    "df.boxplot(column='petal_length', by='species')\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Boxplots aller Features\n",
+    "fig, axs = plt.subplots(2, 2, sharey=False)    # y-Achsen unabhängig\n",
+    "pd.plotting.boxplot(df, by='species', ax=axs)  # übergebe axs\n",
+    "[ax.set_xlabel('') for ax in axs.ravel()]      # entferne x-Labels\n",
+    "fig.tight_layout()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Violinenplot\n",
+    "import seaborn as sns\n",
+    "sns.violinplot(hue='species', y='petal_length', data=df)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Scatterplots\n",
+    "df.plot.scatter(x='petal_length', y='petal_width', c='species', colormap='viridis', alpha=0.7)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Pair Plot\n",
+    "sns.pairplot(df, hue='species', plot_kws={'alpha': 0.5})\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Parallele Koordinaten Plot, unskaliert\n",
+    "pd.plotting.parallel_coordinates(df, 'species', colormap='viridis', alpha=.5)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Parallele Koordinaten Plot, normiert\n",
+    "from sklearn.preprocessing import minmax_scale\n",
+    "num_cols = df.columns.drop('species')\n",
+    "df_scaled = df.copy()\n",
+    "df_scaled[num_cols] = minmax_scale(df[num_cols])\n",
+    "pd.plotting.parallel_coordinates(df_scaled, 'species', colormap='viridis', alpha=.5)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Parallel coordinates plot, custom code from https://stackoverflow.com/a/60401570/2414411\n",
+    "# Draws one y axis per numeric feature and one Bezier curve per row of df,\n",
+    "# coloured by species.\n",
+    "import numpy as np\n",
+    "from matplotlib.path import Path\n",
+    "import matplotlib.patches as patches\n",
+    "\n",
+    "ys = df.drop(columns='species')\n",
+    "ynames = ys.columns\n",
+    "ys = ys.to_numpy()\n",
+    "n_axes = ys.shape[1]  # one parallel axis per feature column\n",
+    "ymins = ys.min(axis=0)\n",
+    "ymaxs = ys.max(axis=0)\n",
+    "dys = ymaxs - ymins\n",
+    "ymins -= dys * 0.05  # add 5% padding below and above\n",
+    "ymaxs += dys * 0.05\n",
+    "\n",
+    "# reverse axis 1 to have less crossings\n",
+    "# ymaxs[1], ymins[1] = ymins[1], ymaxs[1]\n",
+    "# dys = ymaxs - ymins\n",
+    "\n",
+    "# transform all data to be compatible with the main axis\n",
+    "zs = np.zeros_like(ys)\n",
+    "zs[:, 0] = ys[:, 0]\n",
+    "zs[:, 1:] = (ys[:, 1:] - ymins[1:]) / dys[1:] * dys[0] + ymins[0]\n",
+    "\n",
+    "fig, host = plt.subplots(figsize=(10, 4))\n",
+    "\n",
+    "axes = [host] + [host.twinx() for _ in range(n_axes - 1)]\n",
+    "for i, ax in enumerate(axes):\n",
+    "    ax.set_ylim(ymins[i], ymaxs[i])\n",
+    "    ax.spines['top'].set_visible(False)\n",
+    "    ax.spines['bottom'].set_visible(False)\n",
+    "    if ax is not host:\n",
+    "        ax.spines['left'].set_visible(False)\n",
+    "        ax.yaxis.set_ticks_position('right')\n",
+    "        ax.spines[\"right\"].set_position((\"axes\", i / (n_axes - 1)))\n",
+    "\n",
+    "host.set_xlim(0, n_axes - 1)\n",
+    "host.set_xticks(range(n_axes))\n",
+    "host.set_xticklabels(ynames, fontsize=14)\n",
+    "host.tick_params(axis='x', which='major', pad=7)\n",
+    "host.spines['right'].set_visible(False)\n",
+    "host.xaxis.tick_top()\n",
+    "# host.set_title('Parallel Coordinates Plot — Iris', fontsize=18, pad=12)\n",
+    "\n",
+    "# cat.codes index into cat.categories, so the legend labels are taken from the\n",
+    "# categories; unique() returns order of appearance, which may differ.\n",
+    "target_names = list(df['species'].cat.categories)\n",
+    "target = df['species'].cat.codes.to_numpy()  # positional, independent of df.index\n",
+    "# one colour per category, spread evenly over the colormap\n",
+    "colors = plt.cm.viridis(np.linspace(0, 1, len(target_names)))\n",
+    "legend_handles = [None] * len(target_names)\n",
+    "\n",
+    "# Bezier control points: one vertex on every axis plus two between each pair of\n",
+    "# neighbouring axes, i.e. 3 * n_axes - 2 x positions. They depend only on the\n",
+    "# number of axes (not on the number of rows), so they are computed once.\n",
+    "xs = np.linspace(0, n_axes - 1, n_axes * 3 - 2, endpoint=True)\n",
+    "codes = [Path.MOVETO] + [Path.CURVE4] * (len(xs) - 1)\n",
+    "for j in range(ys.shape[0]):\n",
+    "    # repeat every y value three times, dropping the first and last copy\n",
+    "    verts = list(zip(xs, np.repeat(zs[j, :], 3)[1:-1]))\n",
+    "    path = Path(verts, codes)\n",
+    "    patch = patches.PathPatch(path, facecolor='none',\n",
+    "                              lw=2, alpha=0.5, edgecolor=colors[target[j]])\n",
+    "    legend_handles[target[j]] = patch  # the last patch of each class serves as handle\n",
+    "    host.add_patch(patch)\n",
+    "host.legend(legend_handles, target_names,\n",
+    "            loc='lower center', bbox_to_anchor=(0.5, -0.18),\n",
+    "            ncol=len(target_names), fancybox=True, shadow=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Parallele Koordinaten Plot mit Plotly Express\n",
+    "import plotly.express as px\n",
+    "# fig = px.parallel_coordinates(df, color=\"species\", labels={'species': tuple('ABC')})\n",
+    "fig = px.parallel_coordinates(df, color=df[\"species\"].cat.codes)\n",
+    "fig.data[0]['dimensions'][-1]['label'] = 'species'\n",
+    "fig.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Slicing\n",
+    "cp = df.copy()\n",
+    "cp.loc[1, 'sepal_width'] = 1\n",
+    "cp.loc[0:2, 'petal_length'] = 2\n",
+    "cp.loc[0, 'sepal_width':'petal_width'] = 3\n",
+    "cp.loc[1:, 'sepal_length'] = 4\n",
+    "cp.loc[:2, :'sepal_width'] = 5\n",
+    "cp.loc[:49, :].to_csv('iris-setosa.csv')\n",
+    "cp\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% komplexe Indizierung\n",
+    "display(df.loc[[0, 149, 2], 'petal_width'])\n",
+    "\n",
+    "part = df.loc[[0, 149, 2], ['petal_width', 'sepal_width']]\n",
+    "part\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% integer location\n",
+    "display(part.iloc[1, -1])\n",
+    "display(part.iloc[:2, -1])\n",
+    "display(part.iloc[[0, 1], [0, 1]])\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% boolesche Indizierung\n",
+    "pw = part.loc[:, 'petal_width'] <= 1\n",
+    "sw = part.loc[:, 'sepal_width'] < 3.5\n",
+    "display(pw)\n",
+    "display(sw)\n",
+    "display(~sw)\n",
+    "display(part.loc[pw & sw])\n",
+    "display(part.loc[pw | ~sw])\n",
+    "display(part.loc[pw ^ sw])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Daten fallen lassen\n",
+    "display(part.drop(index=149, columns='petal_width'))\n",
+    "display(part.drop(index=[149, 0]))\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% einzelne Daten hinzufügen\n",
+    "part.loc[3] = [2, 6]\n",
+    "display(part)\n",
+    "part.loc[:, 'weight'] = [1, 2, 3, 4]\n",
+    "display(part)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% DataFrames zusammenführen\n",
+    "a = part.drop(index=3)\n",
+    "b = df.loc[:2, ['petal_length', 'petal_width']]\n",
+    "display(a)\n",
+    "display(b)\n",
+    "display(pd.concat((a, b), axis='columns'))\n",
+    "display(pd.concat((a, b), axis='index'))\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Categorical data\n",
+    "# A bare expression is only rendered when it is the last statement of a cell,\n",
+    "# so the Series has to be shown explicitly with display().\n",
+    "display(df['species'])\n",
+    "df['species'].info()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Statistische Funktionen\n",
+    "X = df.drop(columns='species')\n",
+    "y = df['species']\n",
+    "\n",
+    "display(X.mean())\n",
+    "display(y.value_counts())\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Grouping\n",
+    "# Mean of every feature per species. y is categorical, so observed is passed\n",
+    "# explicitly: relying on the default triggers a FutureWarning in recent pandas\n",
+    "# versions because that default is changing.\n",
+    "species_means = X.groupby(y, observed=True).mean()\n",
+    "display(species_means)\n",
+    "\n",
+    "# Sum of squared differences between each species mean and the point\n",
+    "# (6, 3, 2, 0.5), given in the column order of X.\n",
+    "diff = species_means - [6, 3, 2, 0.5]\n",
+    "(diff**2).sum(axis='columns')\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": 3
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
+%% Cell type:markdown id: tags:
+
+ # Code zu Folien
+
+
+
+ Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Pandas & Seaborn" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten.
+
+%% Cell type:code id: tags:
+
+``` 
+# %% import Pandas
+import pandas as pd
+import matplotlib.pyplot as plt
+from IPython.display import display
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Iris Flower Dataset
+url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
+df = pd.read_csv(url)
+
+# offline-Alternative:
+# from sklearn.datasets import load_iris
+# df = pd.concat(load_iris(return_X_y=True, as_frame=True), axis='columns')
+# df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
+# df['species'] = df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
+
+df['species'] = df['species'].astype('category')
+df
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Informationen
+print(df.shape)
+print(df.columns)
+print(df.dtypes)
+print(df.index)
+df.info()
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Statistischer Überblick
+display(df.describe())
+display(df.describe(exclude='number'))
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Kuchendiagramm
+counts = df['species'].value_counts()
+display(counts)
+
+counts.plot.pie(startangle=60, autopct='{:.2f}%'.format)
+plt.ylabel('species')
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Boxplot
+# Box plot of the petal_length column, grouped by species.
+df.boxplot(column='petal_length', by='species')
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Boxplots aller Features
+fig, axs = plt.subplots(2, 2, sharey=False)    # y-Achsen unabhängig
+pd.plotting.boxplot(df, by='species', ax=axs)  # übergebe axs
+[ax.set_xlabel('') for ax in axs.ravel()]      # entferne x-Labels
+fig.tight_layout()
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Violin plot
+# seaborn is imported in this cell, where it is first used.
+import seaborn as sns
+# Violin plot of petal_length with species mapped to hue.
+sns.violinplot(hue='species', y='petal_length', data=df)
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Scatter plots
+# petal_length against petal_width; the colour of each point is taken from the
+# species column through the viridis colormap, drawn with alpha=0.7.
+df.plot.scatter(x='petal_length', y='petal_width', c='species', colormap='viridis', alpha=0.7)
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Pair plot
+# Pair plot of df coloured by species; plot_kws hands alpha=0.5 on to the
+# underlying plotting calls.
+sns.pairplot(df, hue='species', plot_kws={'alpha': 0.5})
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Parallel coordinates plot, unscaled
+# Uses the raw feature values of df with 'species' as the class column.
+pd.plotting.parallel_coordinates(df, 'species', colormap='viridis', alpha=.5)
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Parallel coordinates plot, normalised
+from sklearn.preprocessing import minmax_scale
+# All column labels except the class column.
+num_cols = df.columns.drop('species')
+# Scale on a copy so that df keeps its original values.
+df_scaled = df.copy()
+df_scaled[num_cols] = minmax_scale(df[num_cols])
+pd.plotting.parallel_coordinates(df_scaled, 'species', colormap='viridis', alpha=.5)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Parallele Koordinaten Plot, custom Code from https://stackoverflow.com/a/60401570/2414411
+import numpy as np
+from matplotlib.path import Path
+import matplotlib.patches as patches
+
+ys = df.drop(columns='species')
+ynames = ys.columns
+ys = ys.to_numpy()
+ymins = ys.min(axis=0)
+ymaxs = ys.max(axis=0)
+dys = ymaxs - ymins
+ymins -= dys * 0.05  # add 5% padding below and above
+ymaxs += dys * 0.05
+
+# reverse axis 1 to have less crossings
+# ymaxs[1], ymins[1] = ymins[1], ymaxs[1]
+# dys = ymaxs - ymins
+
+# transform all data to be compatible with the main axis
+zs = np.zeros_like(ys)
+zs[:, 0] = ys[:, 0]
+zs[:, 1:] = (ys[:, 1:] - ymins[1:]) / dys[1:] * dys[0] + ymins[0]
+
+fig, host = plt.subplots(figsize=(10, 4))
+
+axes = [host] + [host.twinx() for i in range(ys.shape[1] - 1)]
+for i, ax in enumerate(axes):
+    ax.set_ylim(ymins[i], ymaxs[i])
+    ax.spines['top'].set_visible(False)
+    ax.spines['bottom'].set_visible(False)
+    if ax != host:
+        ax.spines['left'].set_visible(False)
+        ax.yaxis.set_ticks_position('right')
+        ax.spines["right"].set_position(("axes", i / (ys.shape[1] - 1)))
+
+host.set_xlim(0, ys.shape[1] - 1)
+host.set_xticks(range(ys.shape[1]))
+host.set_xticklabels(ynames, fontsize=14)
+host.tick_params(axis='x', which='major', pad=7)
+host.spines['right'].set_visible(False)
+host.xaxis.tick_top()
+# host.set_title('Parallel Coordinates Plot — Iris', fontsize=18, pad=12)
+
+colors = plt.cm.viridis([0, 128, 255])
+target_names = df['species'].unique()
+target = df['species'].cat.codes
+legend_handles = [None for _ in target_names]
+for j in range(ys.shape[0]):
+    # create bezier curves
+    verts = list(zip([x for x in np.linspace(0, len(ys) - 1, len(ys) * 3 - 2, endpoint=True)],
+                     np.repeat(zs[j, :], 3)[1:-1]))
+    codes = [Path.MOVETO] + [Path.CURVE4 for _ in range(len(verts) - 1)]
+    path = Path(verts, codes)
+    patch = patches.PathPatch(path, facecolor='none',
+                              lw=2, alpha=0.5, edgecolor=colors[target[j]])
+    legend_handles[target[j]] = patch
+    host.add_patch(patch)
+host.legend(legend_handles, target_names,
+            loc='lower center', bbox_to_anchor=(0.5, -0.18),
+            ncol=len(target_names), fancybox=True, shadow=True)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Parallel coordinates plot with Plotly Express
+import plotly.express as px
+# fig = px.parallel_coordinates(df, color="species", labels={'species': tuple('ABC')})
+# Colour the lines by the integer category codes of the species column.
+fig = px.parallel_coordinates(df, color=df["species"].cat.codes)
+# Set the label of the last dimension of the first trace to 'species'.
+fig.data[0]['dimensions'][-1]['label'] = 'species'
+fig.show()
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Slicing
+# All assignments are made on a copy, so df itself is not modified.
+cp = df.copy()
+cp.loc[1, 'sepal_width'] = 1                # a single cell
+cp.loc[0:2, 'petal_length'] = 2             # rows 0 through 2 (label slices include the end)
+cp.loc[0, 'sepal_width':'petal_width'] = 3  # row 0, columns sepal_width through petal_width
+cp.loc[1:, 'sepal_length'] = 4              # all rows from label 1 onwards
+cp.loc[:2, :'sepal_width'] = 5              # rows up to label 2, columns up to sepal_width
+# Write the rows up to label 49 of the modified copy to a CSV file.
+cp.loc[:49, :].to_csv('iris-setosa.csv')
+cp
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Complex indexing
+# Select rows by a list of labels, in the order given in the list.
+display(df.loc[[0, 149, 2], 'petal_width'])
+
+# The same rows with a list of columns; `part` is reused by the following cells.
+part = df.loc[[0, 149, 2], ['petal_width', 'sepal_width']]
+part
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Integer location
+# iloc selects by position instead of by label.
+display(part.iloc[1, -1])            # second row, last column
+display(part.iloc[:2, -1])           # first two rows, last column
+display(part.iloc[[0, 1], [0, 1]])   # rows and columns at positions 0 and 1
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Boolean indexing
+# Element-wise comparisons produce boolean masks.
+pw = part.loc[:, 'petal_width'] <= 1
+sw = part.loc[:, 'sepal_width'] < 3.5
+display(pw)
+display(sw)
+display(~sw)                 # negated mask
+display(part.loc[pw & sw])   # rows where both masks are True
+display(part.loc[pw | ~sw])  # rows where pw is True or sw is False
+display(part.loc[pw ^ sw])   # rows where exactly one of the masks is True
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Dropping data
+# The results of drop() are only displayed here; `part` is not reassigned.
+display(part.drop(index=149, columns='petal_width'))  # one row and one column
+display(part.drop(index=[149, 0]))                    # several rows
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Adding individual data
+# Assigning to a row label that does not exist yet appends a new row.
+# NOTE(review): this cell modifies `part` in place; running it a second time
+# presumably fails because the two-element row no longer matches the three
+# columns -- confirm, and re-run the cell that creates `part` first.
+part.loc[3] = [2, 6]
+display(part)
+# Assigning to a column label that does not exist yet adds a new column.
+part.loc[:, 'weight'] = [1, 2, 3, 4]
+display(part)
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Combining DataFrames
+a = part.drop(index=3)                           # `part` without the row labelled 3
+b = df.loc[:2, ['petal_length', 'petal_width']]  # rows up to label 2, two columns of df
+display(a)
+display(b)
+# Concatenate side by side, then on top of each other.
+display(pd.concat((a, b), axis='columns'))
+display(pd.concat((a, b), axis='index'))
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Kategoriale Daten
+df['species']
+df['species'].info()
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Statistical functions
+# Split df into the remaining columns (X) and the species column (y).
+X = df.drop(columns='species')
+y = df['species']
+
+display(X.mean())          # mean of every column of X
+display(y.value_counts())  # number of rows per species
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# %% Gruppierung
+species_means = X.groupby(y).mean()
+display(species_means)
+
+diff = species_means - [6, 3, 2, 0.5]
+(diff**2).sum(axis='columns')
+```