Skip to content
Snippets Groups Projects
Select Git revision
  • ae9fb663d29a87c98e59d9f4aff45ca6e1da879d
  • main default protected
  • increase_steps
  • auto_mutation
4 results

walker.py

Blame
  • folien-code.py 6.03 KiB
    # %% [markdown]
    # # Code zu Folien
    #
    # Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Pandas & Seaborn" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten.
    
    # %% import Pandas
    import pandas as pd
    import matplotlib.pyplot as plt
    from IPython.display import display
    
    
    # %% Iris Flower Dataset
    # Read the iris CSV from the seaborn-data repository (requires network access).
    url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
    df = pd.read_csv(url)
    
    # Offline alternative:
    # from sklearn.datasets import load_iris
    # df = pd.concat(load_iris(return_X_y=True, as_frame=True), axis='columns')
    # df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
    # df['species'] = df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
    
    # Convert the species column to the categorical dtype; later cells use its `.cat` accessor.
    df['species'] = df['species'].astype('category')
    df
    
    
    # %% Information
    # Print the structural attributes of the frame one after another,
    # then the summary produced by DataFrame.info().
    for attribute in ('shape', 'columns', 'dtypes', 'index'):
        print(getattr(df, attribute))
    df.info()
    
    
    # %% Statistical overview
    # First the default describe() summary, then the one that excludes numeric columns.
    numeric_summary = df.describe()
    non_numeric_summary = df.describe(exclude='number')
    display(numeric_summary)
    display(non_numeric_summary)
    
    
    # %% Pie chart
    # Number of rows per species.
    counts = df['species'].value_counts()
    display(counts)
    
    # Each wedge is labelled with its percentage, formatted with two decimals.
    counts.plot.pie(
        autopct=lambda pct: '{:.2f}%'.format(pct),
        startangle=60,
    )
    plt.ylabel('species')
    
    
    # %% Boxplot
    # Petal length, one box per species.
    df.boxplot(column='petal_length', by='species')
    
    
    # %% Boxplots of all features
    fig, axs = plt.subplots(2, 2, sharey=False)    # independent y-axes
    pd.plotting.boxplot(df, by='species', ax=axs)  # pass the prepared axes
    # Plain loop instead of a throw-away list comprehension: the calls are made
    # only for their side effect of clearing the x-labels.
    for ax in axs.ravel():
        ax.set_xlabel('')
    fig.tight_layout()
    
    # %% Violin plot
    import seaborn as sns
    # NOTE(review): only `hue` and `y` are passed (no `x`) — presumably this relies on a
    # seaborn version that accepts `hue` without `x`; confirm against the installed version.
    sns.violinplot(hue='species', y='petal_length', data=df)
    
    
    # %% Scatter plots
    # Petal length against petal width; the species column is passed as the color argument.
    df.plot.scatter(x='petal_length', y='petal_width', c='species', colormap='viridis', alpha=0.7)
    
    
    # %% Pair plot
    # Pair plot of the frame with species as hue; alpha is handed on through plot_kws.
    sns.pairplot(df, hue='species', plot_kws={'alpha': 0.5})
    
    
    # %% Parallel coordinates plot, unscaled
    # One line per row, colored by species, on the raw feature values.
    pd.plotting.parallel_coordinates(
        df,
        class_column='species',
        colormap='viridis',
        alpha=.5,
    )
    
    
    # %% Parallel coordinates plot, normalized
    from sklearn.preprocessing import minmax_scale
    # Same plot on a copy whose non-species columns were passed through minmax_scale.
    num_cols = df.columns.drop('species')
    df_scaled = df.copy()
    df_scaled[num_cols] = minmax_scale(df_scaled[num_cols])
    pd.plotting.parallel_coordinates(
        df_scaled,
        class_column='species',
        colormap='viridis',
        alpha=.5,
    )
    
    # %% Parallel coordinates plot, custom code from https://stackoverflow.com/a/60401570/2414411
    import numpy as np
    from matplotlib.path import Path
    import matplotlib.patches as patches
    
    # Every non-species column becomes one vertical axis with its own value range.
    ys = df.drop(columns='species')
    ynames = ys.columns
    ys = ys.to_numpy()
    n_axes = ys.shape[1]
    ymins = ys.min(axis=0)
    ymaxs = ys.max(axis=0)
    dys = ymaxs - ymins
    ymins -= dys * 0.05  # add 5% padding below and above
    ymaxs += dys * 0.05
    
    # reverse axis 1 to have less crossings
    # ymaxs[1], ymins[1] = ymins[1], ymaxs[1]
    # dys = ymaxs - ymins
    
    # transform all data to be compatible with the main axis
    zs = np.zeros_like(ys)
    zs[:, 0] = ys[:, 0]
    zs[:, 1:] = (ys[:, 1:] - ymins[1:]) / dys[1:] * dys[0] + ymins[0]
    
    fig, host = plt.subplots(figsize=(10, 4))
    
    # The host axes carries column 0; each further column gets a twin y-axis whose
    # right spine is moved to that column's x-position.
    axes = [host] + [host.twinx() for _ in range(n_axes - 1)]
    for i, ax in enumerate(axes):
        ax.set_ylim(ymins[i], ymaxs[i])
        ax.spines['top'].set_visible(False)
        ax.spines['bottom'].set_visible(False)
        if ax is not host:
            ax.spines['left'].set_visible(False)
            ax.yaxis.set_ticks_position('right')
            ax.spines["right"].set_position(("axes", i / (n_axes - 1)))
    
    host.set_xlim(0, n_axes - 1)
    host.set_xticks(range(n_axes))
    host.set_xticklabels(ynames, fontsize=14)
    host.tick_params(axis='x', which='major', pad=7)
    host.spines['right'].set_visible(False)
    host.xaxis.tick_top()
    # host.set_title('Parallel Coordinates Plot — Iris', fontsize=18, pad=12)
    
    # Labels come from the categories (not from order of appearance) so that
    # position k in target_names, colors and legend_handles always refers to code k.
    target_names = df['species'].cat.categories
    target = df['species'].cat.codes.to_numpy()  # positional access, independent of the index
    # One evenly spaced viridis color per category; for three categories these are
    # the lookup-table entries 0, 128 and 255.
    colors = plt.cm.viridis(np.linspace(0, 1, len(target_names)))
    legend_handles = [None] * len(target_names)
    
    # The x-positions and path codes are the same for every row, so build them once.
    # Three control points per axis, minus the two dropped at the ends.
    curve_xs = np.linspace(0, n_axes - 1, n_axes * 3 - 2, endpoint=True)
    codes = [Path.MOVETO] + [Path.CURVE4] * (len(curve_xs) - 1)
    for row, code in zip(zs, target):
        # create bezier curves: repeating each y three times flattens the curve at every axis
        verts = list(zip(curve_xs, np.repeat(row, 3)[1:-1]))
        path = Path(verts, codes)
        patch = patches.PathPatch(path, facecolor='none',
                                  lw=2, alpha=0.5, edgecolor=colors[code])
        legend_handles[code] = patch  # the last patch drawn for a category represents it
        host.add_patch(patch)
    
    # Categories without any row have no patch; leave them out of the legend.
    legend_entries = [(handle, name)
                      for handle, name in zip(legend_handles, target_names)
                      if handle is not None]
    host.legend([handle for handle, _ in legend_entries],
                [name for _, name in legend_entries],
                loc='lower center', bbox_to_anchor=(0.5, -0.18),
                ncol=len(legend_entries), fancybox=True, shadow=True)
    
    # %% Parallel coordinates plot with Plotly Express
    import plotly.express as px
    # fig = px.parallel_coordinates(df, color="species", labels={'species': tuple('ABC')})
    # The integer category codes of the species column are passed as the color values.
    fig = px.parallel_coordinates(df, color=df["species"].cat.codes)
    # Overwrite the label of the trace's last dimension with 'species'
    # (presumably that dimension is the color-code column — TODO confirm).
    fig.data[0]['dimensions'][-1]['label'] = 'species'
    fig.show()
    
    # %% Slicing
    # All assignments go to a copy, so df itself is not modified by this cell.
    # The assignments overlap, so their order determines the final values.
    cp = df.copy()
    cp.loc[1, 'sepal_width'] = 1  # single cell: row label 1, one column
    cp.loc[0:2, 'petal_length'] = 2  # slice of row labels, one column
    cp.loc[0, 'sepal_width':'petal_width'] = 3  # one row, slice of column labels
    cp.loc[1:, 'sepal_length'] = 4  # open-ended row slice starting at label 1
    cp.loc[:2, :'sepal_width'] = 5  # slices on both axes, open at the start
    cp.loc[:49, :].to_csv('iris-setosa.csv')  # write the selected rows to a CSV file
    cp
    
    # %% Complex indexing
    # A list of row labels (in the given order) combined with a single column ...
    display(df.loc[[0, 149, 2], 'petal_width'])
    
    # ... and combined with a list of columns; `part` is reused by the following cells.
    part = df.loc[[0, 149, 2], ['petal_width', 'sepal_width']]
    part
    
    
    # %% Integer location
    display(part.iloc[1, -1])  # two integer positions
    display(part.iloc[:2, -1])  # positional row slice, last column
    display(part.iloc[[0, 1], [0, 1]])  # lists of positions on both axes
    
    
    # %% Boolean indexing
    # Comparisons on a column yield boolean masks ...
    pw = part.loc[:, 'petal_width'] <= 1
    sw = part.loc[:, 'sepal_width'] < 3.5
    display(pw)
    display(sw)
    display(~sw)  # negated mask
    # ... which can be combined with &, | and ^ and passed to .loc as a row selector.
    display(part.loc[pw & sw])
    display(part.loc[pw | ~sw])
    display(part.loc[pw ^ sw])
    
    # %% Dropping data
    # The results of drop() are only displayed; `part` is not reassigned here.
    display(part.drop(index=149, columns='petal_width'))
    display(part.drop(index=[149, 0]))
    
    
    # %% Adding individual data
    part.loc[3] = [2, 6]  # assign a list of two values to row label 3
    display(part)
    part.loc[:, 'weight'] = [1, 2, 3, 4]  # assign a list of four values to column 'weight'
    display(part)
    
    
    # %% Combining DataFrames
    a = part.drop(index=3)
    b = df.loc[:2, ['petal_length', 'petal_width']]
    display(a)
    display(b)
    # The same two frames concatenated along the column axis and along the index axis.
    display(pd.concat((a, b), axis='columns'))
    display(pd.concat((a, b), axis='index'))
    
    
    # %% Categorical data
    # Use display() explicitly: a bare expression that is not the last statement
    # of the cell produces no output.
    display(df['species'])
    df['species'].info()
    
    
    # %% Statistical functions
    X = df.drop(columns='species')  # all columns except species
    y = df['species']               # the species column
    
    display(X.mean())
    display(y.value_counts())
    
    
    # %% Grouping
    # y is categorical. `observed` is spelled out so the result does not depend on the
    # pandas version's default for categorical group keys and no deprecation warning
    # about that default is raised.
    species_means = X.groupby(y, observed=False).mean()
    display(species_means)
    
    # Sum of squared differences between each group's means and the values
    # [6, 3, 2, 0.5]; the list is matched to the columns by position.
    diff = species_means - [6, 3, 2, 0.5]
    (diff**2).sum(axis='columns')