Added Random Forest, Data Preparation & Hyperparameter Optimization

59b5d825 · Luca Sander · 17f2da77 · 59b5d825 · 59b5d825 · 59b5d825
Commit 59b5d825 authored Mar 21, 2021 by Luca Sander
--- a/Random Forest/.vscode/settings.json
+++ b/Random Forest/.vscode/settings.json
+{
+    "python.pythonPath": "/usr/bin/python3.8"
+}
\ No newline at end of file
--- a/Random Forest/data.csv
+++ b/Random Forest/data.csv
--- a/Random Forest/db_dump.csv
+++ b/Random Forest/db_dump.csv
--- a/Random Forest/grid_search.py
+++ b/Random Forest/grid_search.py
+from sklearn.model_selection import train_test_split
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_absolute_percentage_error
+from sklearn.model_selection import GridSearchCV
+# Exhaustive hyperparameter search for a RandomForestRegressor on data.csv.
+#
+# Load the prepared table; the header row is skipped so every column parses
+# as a number.
+dataset = np.genfromtxt('data.csv', delimiter=',', skip_header=1)
+# Column 3 is the regression target; the nine remaining columns are features.
+x = dataset[:, [0, 1, 2, 4, 5, 6, 7, 8, 9]]
+y = dataset[:, 3]
+# Hold back 25% of the rows for the final evaluation (fixed seed for
+# reproducibility).
+x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
+param_grid = {
+    'bootstrap': [True, False],
+    'max_depth': [25, 50, 75, 100, None],
+    # 1.0 means "consider all features at every split", which is what the
+    # former "auto" option did for regressors. The "auto" string itself was
+    # deprecated in scikit-learn 1.1 and is rejected from 1.3 on, while the
+    # float form is accepted by old and new releases alike.
+    'max_features': [1.0, "sqrt", "log2"],
+    'min_samples_leaf': [1, 3, 5],
+    'min_samples_split': [2, 4, 6],
+    'n_estimators': [100, 250, 500, 750, 1000],
+    # Single-valued entry: pins the seed of every candidate forest.
+    'random_state': [42]
+}
+rf = RandomForestRegressor()
+# 3-fold cross-validation over the full grid, using all available cores.
+grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
+grid_search.fit(x_train, y_train)
+print(grid_search.best_params_)
+# Evaluate the best candidate on the held-out rows.
+y_predict = grid_search.best_estimator_.predict(x_test)
+# "Accuracy" here is 100 * (1 - MAPE), i.e. the complement of the mean
+# absolute percentage error, expressed in percent.
+accuracy = (1 - mean_absolute_percentage_error(y_test, y_predict)) * 100
+print("Accuracy: " + str(accuracy) + '%')
--- a/Random Forest/prepare_data.py
+++ b/Random Forest/prepare_data.py
+import pandas as pd
+# Build data.csv from the raw database dump: keep the columns needed for the
+# model and replace the date column with separate numeric date/time parts.
+df = pd.read_csv("db_dump.csv", delimiter=';', usecols=[
+                 'Streckenvorhersage.Dauer', 'Streckenvorhersage.StartortID', 'Streckenvorhersage.ZielortID', 'Streckenvorhersage.Entfernung', 'Streckenvorhersage.Datum'])
+# Parse every date exactly once and reuse the resulting Timestamp objects for
+# all derived columns, instead of re-parsing the same value for each of the
+# six fields.
+timestamps = [pd.Timestamp(d) for d in df['Streckenvorhersage.Datum']]
+df['Streckenvorhersage.Year'] = [ts.year for ts in timestamps]
+df['Streckenvorhersage.Month'] = [ts.month for ts in timestamps]
+df['Streckenvorhersage.Day'] = [ts.day for ts in timestamps]
+df['Streckenvorhersage.DayOfWeek'] = [ts.day_of_week for ts in timestamps]
+df['Streckenvorhersage.Hour'] = [ts.hour for ts in timestamps]
+df['Streckenvorhersage.Minutes'] = [ts.minute for ts in timestamps]
+# The original date column is replaced by the numeric parts above.
+df.drop(['Streckenvorhersage.Datum'], axis=1, inplace=True)
+# One-Hot Encoding (currently disabled)
+# df = pd.concat(
+#     [df, pd.get_dummies(df['Streckenvorhersage.DayOfWeek'])], axis=1)
+# Write without the index so the file contains only the data columns.
+df.to_csv("data.csv", index=False)
--- a/Random Forest/random_forrest.py
+++ b/Random Forest/random_forrest.py
+from sklearn.model_selection import train_test_split
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error
+from pprint import pprint
+import matplotlib.pyplot as plt
+# Train a default random forest on data.csv, then report feature importances
+# and error metrics on a held-out split.
+dataset = np.genfromtxt('data.csv', delimiter=',', skip_header=1)
+# Column 3 is the target; columns 1, 2 and 4-9 are the features.
+y = dataset[:, 3]
+x = dataset[:, [1, 2, 4, 5, 6, 7, 8, 9]]
+x_train, x_test, y_train, y_test = train_test_split(
+    x, y, test_size=0.25, random_state=42
+)
+
+# Fit with default hyperparameters (fixed seed) and show the settings in use.
+model = RandomForestRegressor(random_state=42)
+model.fit(x_train, y_train)
+pprint(model.get_params())
+
+# Feature importance, in percent, rounded to two decimals.
+# The labels are paired positionally with the selected feature columns.
+feature_list = ["ZielortID","Entfernung","Jahr","Monat","Tag","Wochentag","Stunde","Minute"]
+importances = np.round(100 * model.feature_importances_, decimals=2)
+importance_by_feature = list(zip(feature_list, importances))
+pprint(importance_by_feature)
+
+# Bar chart of the importances; the window is only opened at the very end so
+# the textual report below is printed first.
+positions = list(range(len(importances)))
+plt.bar(positions, importances, orientation='vertical')
+plt.xticks(positions, feature_list, rotation='vertical')
+plt.ylabel('Importance')
+plt.xlabel('Feature')
+plt.title('Feature Analyse')
+
+# Predictions on the held-out rows.
+y_predict = model.predict(x_test)
+
+# "Accuracy" is reported as 100 * (1 - mean absolute percentage error).
+accuracy = (1 - mean_absolute_percentage_error(y_test, y_predict)) * 100
+print("Accuracy: ", accuracy, '%', sep="")
+
+# Mean absolute error, in the unit of the target column.
+print(mean_absolute_error(y_test, y_predict))
+
+# Mean of the held-out target values.
+print("Average Time: ", np.average(y_test), " Sekunden", sep="")
+
+# Mean of the second selected feature column (labelled "Entfernung" above).
+print("Average Distance: ", np.average(x_test[:, 1]), " Meter", sep="")
+
+plt.show()
\ No newline at end of file