diff --git a/04-pandas-und-seaborn/folien-code.ipynb b/04-pandas-und-seaborn/folien-code.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..888a25d0560572dc01899315ccf48858bf2378b9
--- /dev/null
+++ b/04-pandas-und-seaborn/folien-code.ipynb
@@ -0,0 +1,403 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Code zu Folien\n",
+    "\n",
+    "Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien \"Pandas & Seaborn\" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Import pandas\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from IPython.display import display"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Iris Flower Dataset\n",
+    "url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'\n",
+    "df = pd.read_csv(url)\n",
+    "\n",
+    "# Offline alternative:\n",
+    "# from sklearn.datasets import load_iris\n",
+    "# df = pd.concat(load_iris(return_X_y=True, as_frame=True), axis='columns')\n",
+    "# df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']\n",
+    "# df['species'] = df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})\n",
+    "\n",
+    "df['species'] = df['species'].astype('category')\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Information\n",
+    "print(df.shape)\n",
+    "print(df.columns)\n",
+    "print(df.dtypes)\n",
+    "print(df.index)\n",
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Statistical overview\n",
+    "display(df.describe())\n",
+    "display(df.describe(exclude='number'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Pie chart\n",
+    "counts = df['species'].value_counts()\n",
+    "display(counts)\n",
+    "\n",
+    "counts.plot.pie(startangle=60, autopct='{:.2f}%'.format)\n",
+    "plt.ylabel('species')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Box plot\n",
+    "df.boxplot(column='petal_length', by='species')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Box plots of all features\n",
+    "fig, axs = plt.subplots(2, 2, sharey=False)  # independent y-axes\n",
+    "pd.plotting.boxplot(df, by='species', ax=axs)  # draw into the prepared axes\n",
+    "for ax in axs.ravel():  # remove the x-labels\n",
+    "    ax.set_xlabel('')\n",
+    "fig.tight_layout()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Violin plot\n",
+    "import seaborn as sns\n",
+    "sns.violinplot(hue='species', y='petal_length', data=df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Scatter plots\n",
+    "df.plot.scatter(x='petal_length', y='petal_width', c='species', colormap='viridis', alpha=0.7)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Pair plot\n",
+    "sns.pairplot(df, hue='species', plot_kws={'alpha': 0.5})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Parallel coordinates plot, unscaled\n",
+    "pd.plotting.parallel_coordinates(df, 'species', colormap='viridis', alpha=.5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Parallel coordinates plot, normalized\n",
+    "from sklearn.preprocessing import minmax_scale\n",
+    "num_cols = df.columns.drop('species')\n",
+    "df_scaled = df.copy()\n",
+    "df_scaled[num_cols] = minmax_scale(df[num_cols])\n",
+    "pd.plotting.parallel_coordinates(df_scaled, 'species', colormap='viridis', alpha=.5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Parallel coordinates plot, custom code from https://stackoverflow.com/a/60401570/2414411\n",
+    "import numpy as np\n",
+    "from matplotlib.path import Path\n",
+    "import matplotlib.patches as patches\n",
+    "\n",
+    "ys = df.drop(columns='species')\n",
+    "ynames = ys.columns\n",
+    "ys = ys.to_numpy()\n",
+    "n_axes = ys.shape[1]  # one parallel axis per feature column\n",
+    "ymins = ys.min(axis=0)\n",
+    "ymaxs = ys.max(axis=0)\n",
+    "dys = ymaxs - ymins\n",
+    "ymins -= dys * 0.05  # add 5% padding below and above\n",
+    "ymaxs += dys * 0.05\n",
+    "\n",
+    "# reverse axis 1 to have less crossings\n",
+    "# ymaxs[1], ymins[1] = ymins[1], ymaxs[1]\n",
+    "# dys = ymaxs - ymins\n",
+    "\n",
+    "# transform all data to be compatible with the main axis\n",
+    "zs = np.zeros_like(ys)\n",
+    "zs[:, 0] = ys[:, 0]\n",
+    "zs[:, 1:] = (ys[:, 1:] - ymins[1:]) / dys[1:] * dys[0] + ymins[0]\n",
+    "\n",
+    "fig, host = plt.subplots(figsize=(10, 4))\n",
+    "\n",
+    "axes = [host] + [host.twinx() for _ in range(n_axes - 1)]\n",
+    "for i, ax in enumerate(axes):\n",
+    "    ax.set_ylim(ymins[i], ymaxs[i])\n",
+    "    ax.spines['top'].set_visible(False)\n",
+    "    ax.spines['bottom'].set_visible(False)\n",
+    "    if ax is not host:\n",
+    "        ax.spines['left'].set_visible(False)\n",
+    "        ax.yaxis.set_ticks_position('right')\n",
+    "        ax.spines[\"right\"].set_position((\"axes\", i / (n_axes - 1)))\n",
+    "\n",
+    "host.set_xlim(0, n_axes - 1)\n",
+    "host.set_xticks(range(n_axes))\n",
+    "host.set_xticklabels(ynames, fontsize=14)\n",
+    "host.tick_params(axis='x', which='major', pad=7)\n",
+    "host.spines['right'].set_visible(False)\n",
+    "host.xaxis.tick_top()\n",
+    "# host.set_title('Parallel Coordinates Plot — Iris', fontsize=18, pad=12)\n",
+    "\n",
+    "# use the category order so names, colors and legend handles line up with the integer codes\n",
+    "target_names = df['species'].cat.categories\n",
+    "target = df['species'].cat.codes.to_numpy()\n",
+    "colors = plt.cm.viridis(np.linspace(0, 1, len(target_names)))\n",
+    "legend_handles = [None for _ in target_names]\n",
+    "# x-positions of the Bezier control points: each axis plus two points in between\n",
+    "xs = np.linspace(0, n_axes - 1, n_axes * 3 - 2)\n",
+    "for j in range(ys.shape[0]):\n",
+    "    # create bezier curves\n",
+    "    verts = list(zip(xs, np.repeat(zs[j, :], 3)[1:-1]))\n",
+    "    codes = [Path.MOVETO] + [Path.CURVE4 for _ in range(len(verts) - 1)]\n",
+    "    path = Path(verts, codes)\n",
+    "    patch = patches.PathPatch(path, facecolor='none',\n",
+    "                              lw=2, alpha=0.5, edgecolor=colors[target[j]])\n",
+    "    legend_handles[target[j]] = patch\n",
+    "    host.add_patch(patch)\n",
+    "host.legend(legend_handles, target_names,\n",
+    "            loc='lower center', bbox_to_anchor=(0.5, -0.18),\n",
+    "            ncol=len(target_names), fancybox=True, shadow=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Parallel coordinates plot with Plotly Express\n",
+    "import plotly.express as px\n",
+    "# fig = px.parallel_coordinates(df, color=\"species\", labels={'species': tuple('ABC')})\n",
+    "fig = px.parallel_coordinates(df, color=df[\"species\"].cat.codes)\n",
+    "fig.data[0]['dimensions'][-1]['label'] = 'species'\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Slicing\n",
+    "# .loc slices are label-based and include both end points\n",
+    "cp = df.copy()\n",
+    "cp.loc[1, 'sepal_width'] = 1\n",
+    "cp.loc[0:2, 'petal_length'] = 2\n",
+    "cp.loc[0, 'sepal_width':'petal_width'] = 3\n",
+    "cp.loc[1:, 'sepal_length'] = 4\n",
+    "cp.loc[:2, :'sepal_width'] = 5\n",
+    "cp.loc[:49, :].to_csv('iris-setosa.csv')\n",
+    "cp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Complex indexing\n",
+    "display(df.loc[[0, 149, 2], 'petal_width'])\n",
+    "\n",
+    "part = df.loc[[0, 149, 2], ['petal_width', 'sepal_width']]\n",
+    "part"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Integer location\n",
+    "display(part.iloc[1, -1])\n",
+    "display(part.iloc[:2, -1])\n",
+    "display(part.iloc[[0, 1], [0, 1]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Boolean indexing\n",
+    "pw = part.loc[:, 'petal_width'] <= 1\n",
+    "sw = part.loc[:, 'sepal_width'] < 3.5\n",
+    "display(pw)\n",
+    "display(sw)\n",
+    "display(~sw)\n",
+    "display(part.loc[pw & sw])\n",
+    "display(part.loc[pw | ~sw])\n",
+    "display(part.loc[pw ^ sw])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Dropping data\n",
+    "display(part.drop(index=149, columns='petal_width'))\n",
+    "display(part.drop(index=[149, 0]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Adding individual data\n",
+    "part.loc[3] = [2, 6]\n",
+    "display(part)\n",
+    "part.loc[:, 'weight'] = [1, 2, 3, 4]\n",
+    "display(part)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Combining DataFrames\n",
+    "a = part.drop(index=3)\n",
+    "b = df.loc[:2, ['petal_length', 'petal_width']]\n",
+    "display(a)\n",
+    "display(b)\n",
+    "display(pd.concat((a, b), axis='columns'))\n",
+    "display(pd.concat((a, b), axis='index'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Categorical data\n",
+    "display(df['species'])\n",
+    "df['species'].info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Statistical functions\n",
+    "X = df.drop(columns='species')\n",
+    "y = df['species']\n",
+    "\n",
+    "display(X.mean())\n",
+    "display(y.value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %% Grouping\n",
+    "species_means = X.groupby(y, observed=True).mean()\n",
+    "display(species_means)\n",
+    "\n",
+    "# squared Euclidean distance of each species mean to the point (6, 3, 2, 0.5)\n",
+    "diff = species_means - [6, 3, 2, 0.5]\n",
+    "(diff**2).sum(axis='columns')"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": 3
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}