Skip to content
Snippets Groups Projects
Commit 2baf5d16 authored by Christof Kaufmann's avatar Christof Kaufmann
Browse files

Upload Code (Python Script)

parent aef86389
No related branches found
No related tags found
No related merge requests found
# %% [markdown]
# # Code zu Folien
#
# Dieses Skript bzw. Jupyter-Notebook enthält den Code, der auch auf den Folien "Pandas & Seaborn" enthalten ist. Zum Vorbereiten, Mitmachen oder Nacharbeiten.
# %% import Pandas
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
# %% Iris Flower Dataset
# Fetch the iris table as CSV from the seaborn-data repository on GitHub.
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
df = pd.read_csv(url)
# Offline alternative (builds the same column layout from scikit-learn):
# from sklearn.datasets import load_iris
# df = pd.concat(load_iris(return_X_y=True, as_frame=True), axis='columns')
# df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
# df['species'] = df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
# Store the class label as a categorical column instead of plain strings.
df = df.astype({'species': 'category'})
df
# %% Information
# Shape, column labels, dtypes and index, each printed on its own line,
# followed by the summary that DataFrame.info() prints.
print(df.shape, df.columns, df.dtypes, df.index, sep='\n')
df.info()
# %% Statistical overview
# First the summary of the numeric columns, then the summary of the
# non-numeric ones (here: the categorical species column).
display(
    df.describe(),
    df.describe(exclude='number'),
)
# %% Pie chart
# Number of rows per species, shown as a table and as a pie chart.
counts = df['species'].value_counts()
display(counts)
# autopct receives each wedge's percentage and formats it with two decimals.
counts.plot(kind='pie', startangle=60, autopct='{:.2f}%'.format)
plt.gca().set_ylabel('species')
# %% Boxplot
# Petal length, one box per species.
df.boxplot(column='petal_length', by='species')
# %% Boxplots of all features
fig, axs = plt.subplots(2, 2, sharey=False)  # independent y-axes
pd.plotting.boxplot(df, by='species', ax=axs)  # draw into the prepared 2x2 grid
# Remove the x-labels; a plain loop is used because the calls are made only
# for their side effect.
for ax in axs.ravel():
    ax.set_xlabel('')
fig.tight_layout()
# %% Violin plot
import seaborn as sns
# Petal length distribution, grouped and coloured by species.
sns.violinplot(data=df, y='petal_length', hue='species')
# %% Scatter plots
# Petal length against petal width; the species column selects the colour.
df.plot(
    kind='scatter',
    x='petal_length',
    y='petal_width',
    c='species',
    colormap='viridis',
    alpha=0.7,
)
# %% Pair plot
# Pairwise plots of the features, coloured by species, with translucent markers.
sns.pairplot(df, hue='species', plot_kws=dict(alpha=0.5))
# %% Parallel coordinates plot, unscaled
# All features share one y-axis here, so their raw value ranges are compared directly.
pd.plotting.parallel_coordinates(
    df, class_column='species', colormap='viridis', alpha=0.5)
# %% Parallel coordinates plot, normalised
from sklearn.preprocessing import minmax_scale
# Every column except the class label is treated as a numeric feature.
num_cols = df.columns.drop('species')
# Rescale the features on a copy so the original DataFrame keeps its raw values.
df_scaled = df.copy()
df_scaled[num_cols] = minmax_scale(df_scaled[num_cols])
pd.plotting.parallel_coordinates(
    df_scaled, class_column='species', colormap='viridis', alpha=0.5)
# %% Parallel coordinates plot, custom code from https://stackoverflow.com/a/60401570/2414411
import numpy as np
from matplotlib.path import Path
import matplotlib.patches as patches

# Every numeric feature gets its own y-axis with its own scale; every row of
# the DataFrame is drawn as a smooth Bezier curve across those axes, coloured
# by species.
ys = df.drop(columns='species')
ynames = ys.columns
ys = ys.to_numpy()
n_axes = ys.shape[1]  # one parallel axis per feature column

ymins = ys.min(axis=0)
ymaxs = ys.max(axis=0)
dys = ymaxs - ymins
ymins -= dys * 0.05  # add 5% padding below and above
ymaxs += dys * 0.05
# reverse axis 1 to have less crossings
# ymaxs[1], ymins[1] = ymins[1], ymaxs[1]
# dys = ymaxs - ymins

# transform all data to be compatible with the main axis
# (dys still holds the unpadded ranges; only the ratio dys[0] / dys[i] enters
# the formula, and symmetric padding scales both by the same factor)
zs = np.zeros_like(ys)
zs[:, 0] = ys[:, 0]
zs[:, 1:] = (ys[:, 1:] - ymins[1:]) / dys[1:] * dys[0] + ymins[0]

fig, host = plt.subplots(figsize=(10, 4))
axes = [host] + [host.twinx() for _ in range(n_axes - 1)]
for i, ax in enumerate(axes):
    ax.set_ylim(ymins[i], ymaxs[i])
    ax.spines['top'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    if ax != host:
        # twin axes: hide the left spine and move the right one to the
        # x-position of feature i
        ax.spines['left'].set_visible(False)
        ax.yaxis.set_ticks_position('right')
        ax.spines["right"].set_position(("axes", i / (n_axes - 1)))

host.set_xlim(0, n_axes - 1)
host.set_xticks(range(n_axes))
host.set_xticklabels(ynames, fontsize=14)
host.tick_params(axis='x', which='major', pad=7)
host.spines['right'].set_visible(False)
host.xaxis.tick_top()
# host.set_title('Parallel Coordinates Plot — Iris', fontsize=18, pad=12)

# Labels come from the category list so that position k matches category code k;
# unique() would return them in order of first appearance instead.
target_names = list(df['species'].cat.categories)
# Plain array of codes, so lookups are positional and independent of the index.
target = df['species'].cat.codes.to_numpy()
# One evenly spaced viridis colour per category (LUT entries 0, 128, 255 for three).
colors = plt.cm.viridis(np.linspace(0, 1, len(target_names)))
legend_handles = [None] * len(target_names)

# create bezier curves
# x-coordinates of the control vertices: one on every axis and two in between
curve_xs = np.linspace(0, n_axes - 1, n_axes * 3 - 2, endpoint=True)
codes = [Path.MOVETO] + [Path.CURVE4] * (len(curve_xs) - 1)
for z_row, code in zip(zs, target):
    # y-coordinates: every value three times, the first and last only twice
    verts = list(zip(curve_xs, np.repeat(z_row, 3)[1:-1]))
    path = Path(verts, codes)
    patch = patches.PathPatch(path, facecolor='none',
                              lw=2, alpha=0.5, edgecolor=colors[code])
    # keep one patch per category as its legend handle (the last one drawn wins)
    legend_handles[code] = patch
    host.add_patch(patch)
host.legend(legend_handles, target_names,
            loc='lower center', bbox_to_anchor=(0.5, -0.18),
            ncol=len(target_names), fancybox=True, shadow=True)
# %% Parallel coordinates plot with Plotly Express
import plotly.express as px
# fig = px.parallel_coordinates(df, color="species", labels={'species': tuple('ABC')})
# Colour the lines by the integer category codes of the species column.
fig = px.parallel_coordinates(
    df,
    color=df["species"].cat.codes,
)
# Relabel the last axis of the trace as 'species'.
# NOTE(review): presumably that axis holds the colour codes passed above — confirm.
fig.data[0].dimensions[-1].label = 'species'
fig.show()
# %% Slicing
# Label-based assignment with .loc, done on a copy so that df keeps its values.
# The statements run in order and later ones overwrite cells set by earlier ones.
cp = df.copy()
cp.loc[1, 'sepal_width'] = 1  # single cell: row label 1, one column
cp.loc[0:2, 'petal_length'] = 2  # label slices include the end: rows 0, 1 and 2
cp.loc[0, 'sepal_width':'petal_width'] = 3  # column slice by name, both ends included
cp.loc[1:, 'sepal_length'] = 4  # open-ended slice: row label 1 through the last row
cp.loc[:2, :'sepal_width'] = 5  # rows up to label 2, columns up to 'sepal_width'
# Write the rows up to label 49 of the modified copy to a CSV file.
# NOTE(review): the file name suggests these rows are the setosa samples — confirm against the data.
cp.loc[:49, :].to_csv('iris-setosa.csv')
cp
# %% Complex indexing
# A list of row labels selects exactly those rows, in the order given.
row_labels = [0, 149, 2]
display(df.loc[row_labels, 'petal_width'])
# With a list of column labels as well, the result is a small DataFrame.
part = df.loc[row_labels, ['petal_width', 'sepal_width']]
part
# %% Integer location
# .iloc addresses by position, not by label: a single cell, the first two
# rows of the last column, and a 2x2 corner selected with position lists.
display(
    part.iloc[1, -1],
    part.iloc[:2, -1],
    part.iloc[[0, 1], [0, 1]],
)
# %% Boolean indexing
# Two boolean masks, one per column condition.
pw = part['petal_width'] <= 1
sw = part['sepal_width'] < 3.5
# The masks themselves, plus the negation of the second one.
display(pw, sw, ~sw)
# Row selection with combined masks: and, or (with negation), exclusive or.
display(
    part.loc[pw & sw],
    part.loc[pw | ~sw],
    part.loc[pw ^ sw],
)
# %% Dropping data
# drop() returns a new DataFrame and leaves `part` unchanged: first one row
# and one column are removed together, then two rows.
display(
    part.drop(index=149, columns='petal_width'),
    part.drop(index=[149, 0]),
)
# %% Adding individual data
# Assigning to a row label that does not exist yet appends it as a new row;
# the list supplies one value per existing column.
part.loc[3] = [2, 6]
display(part)
# Assigning to a new column label adds the column; the list supplies one
# value for each of the four rows.
part.loc[:, 'weight'] = [1, 2, 3, 4]
display(part)
# %% Combining DataFrames
# Two frames whose row and column labels only partly overlap.
a = part.drop(index=3)
b = df.loc[:2, ['petal_length', 'petal_width']]
display(a, b)
# Side by side (rows aligned on the index), then stacked (columns aligned on
# their names).
display(
    pd.concat((a, b), axis='columns'),
    pd.concat((a, b), axis='index'),
)
# %% Categorical data
# Only the last expression of a cell is rendered automatically, so the column
# is shown explicitly before its info() summary is printed.
display(df['species'])
df['species'].info()
# %% Statistical functions
# Split the table into the feature columns X and the class label y.
X = df.drop(columns=['species'])
y = df['species']
# Column-wise mean of the features, then the number of rows per class.
display(
    X.mean(),
    y.value_counts(),
)
# %% Grouping
# Mean of every feature per species. y is categorical, so `observed` is passed
# explicitly: newer pandas versions warn when it is left at its default.
species_means = X.groupby(y, observed=True).mean()
display(species_means)
# Subtracting a list applies one reference value per feature column to every row.
diff = species_means - [6, 3, 2, 0.5]
# Squared Euclidean distance of each species mean to that reference point.
(diff**2).sum(axis='columns')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment