Skip to content
Snippets Groups Projects
Commit 03b48bfb authored by Luca Sander's avatar Luca Sander
Browse files

Added One-Hot-Encoding & Removed Distance from Feature Set

parent 59b5d825
No related branches found
No related tags found
No related merge requests found
Source diff could not be displayed: it is too large. Options to address this: view the blob.
...@@ -5,19 +5,19 @@ from sklearn.ensemble import RandomForestRegressor ...@@ -5,19 +5,19 @@ from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV

# Feature matrix produced by the preprocessing script (one-hot encoded,
# distance column excluded from the feature set).
dataset = np.genfromtxt('Random Forest/data.csv', delimiter=',', skip_header=1)

# Column 0 is the target (Dauer / trip duration); columns 2..25 are the
# features (calendar fields + DayOfWeek and ZielortID dummy columns).
# Column 1 (Entfernung / distance) is deliberately left out.
x = dataset[:, np.arange(2, 26)]
y = dataset[:, 0]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# Reduced grid: earlier runs fixed most hyper-parameters, only max_depth
# is still searched; n_estimators pinned to the best value found (1000).
param_grid = {
    #'bootstrap': [True, False],
    'max_depth': [25, 50, 75, 100, None],
    #'max_features': ["auto", "sqrt", "log2"],
    #'min_samples_leaf': [1, 3, 5],
    #'min_samples_split': [2, 4, 6],
    'n_estimators': [1000],
    'random_state': [42]
}
......
import pandas as pd

# Raw database dump uses ';' as separator; load only the columns the
# model needs (target, distance, destination id, timestamp).
df = pd.read_csv("Random Forest/db_dump.csv", delimiter=';', usecols=[
    'Streckenvorhersage.Dauer', 'Streckenvorhersage.Entfernung', 'Streckenvorhersage.ZielortID', 'Streckenvorhersage.Datum'])

# Fix the column order so the target (Dauer) ends up in column 0 and the
# distance (Entfernung) in column 1 of the exported data.csv.
df = df.reindex(columns=['Streckenvorhersage.Dauer', 'Streckenvorhersage.Entfernung',
                         'Streckenvorhersage.ZielortID', 'Streckenvorhersage.Datum'])

# Derive the year as a calendar feature from the timestamp column.
df['Streckenvorhersage.Year'] = [pd.Timestamp(
    d).year for d in df['Streckenvorhersage.Datum']]
...@@ -19,7 +22,13 @@ df['Streckenvorhersage.Minutes'] = [pd.Timestamp( ...@@ -19,7 +22,13 @@ df['Streckenvorhersage.Minutes'] = [pd.Timestamp(
# The raw timestamp is no longer needed once the calendar fields exist.
df.drop(['Streckenvorhersage.Datum'], axis=1, inplace=True)

# One-Hot Encoding of the weekday (prefix keeps the dummy columns identifiable).
df = pd.concat(
    [df, pd.get_dummies(df['Streckenvorhersage.DayOfWeek'], prefix='DayOfWeek')], axis=1)
df.drop(['Streckenvorhersage.DayOfWeek'], axis=1, inplace=True)

# One-Hot Encoding ZielortID (destination id is categorical, not ordinal).
df = pd.concat(
    [df, pd.get_dummies(df['Streckenvorhersage.ZielortID'], prefix='ZielortID')], axis=1)
df.drop(['Streckenvorhersage.ZielortID'], axis=1, inplace=True)

# Export the encoded feature matrix for the training/tuning scripts.
df.to_csv("Random Forest/data.csv", index=False)
...@@ -6,19 +6,43 @@ from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error ...@@ -6,19 +6,43 @@ from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error
from pprint import pprint
import matplotlib.pyplot as plt

# Encoded feature matrix produced by the preprocessing script.
dataset = np.genfromtxt('Random Forest/data.csv', delimiter=',', skip_header=1)

# Column 0 = target duration; columns 2..25 = features (distance in
# column 1 is excluded from the feature set).
x = dataset[:, np.arange(2, 26)]
y = dataset[:, 0]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

#model = RandomForestRegressor(random_state=42, n_estimators=750, max_depth=50)
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)
#pprint(model.get_params())

# Accuracy in Percent (1 - MAPE, expressed as a percentage)
y_predict = model.predict(x_test)
accuracy = (1 - mean_absolute_percentage_error(y_test, y_predict)) * 100
print("Accuracy: " + str(accuracy) + '%')

# Feature Importance without Distance.
# The per-column importances of the one-hot dummy columns are summed back
# into a single value per original categorical feature:
#   cols 0..4  -> Jahr, Monat, Tag, Stunde, Minute
#   cols 5..11 -> 7 DayOfWeek dummies
#   cols 12..  -> ZielortID dummies
pprint(model.feature_importances_)
feature_list = ["Jahr", "Monat", "Tag", "Stunde", "Minute", "Wochentag", "ZielortID"]
importances = model.feature_importances_[0:5]
wochentag_importance = np.sum(model.feature_importances_[5:12])
importances = np.append(importances, wochentag_importance)
zielort_importance = np.sum(model.feature_importances_[12:])
importances = np.append(importances, zielort_importance)

# Feature Importance with Distance (variant kept for reference; offsets
# shift by one because Entfernung occupies the first feature column).
#feature_list = ["Entfernung","Jahr","Monat","Tag","Stunde","Minute","Wochentag", "ZielortID"]
#importances = model.feature_importances_[0:6]
#wochentag_importance = np.sum(model.feature_importances_[6:13])
#importances = np.append(importances,wochentag_importance)
#zielort_importance = np.sum(model.feature_importances_[13:])
#importances = np.append(importances,zielort_importance)

importances = np.round(importances * 100, decimals=2)
#pprint(np.round(model.feature_importances_ * 100,decimals=2))
zipped = list(zip(feature_list, importances))
pprint(zipped)
...@@ -30,11 +54,6 @@ plt.ylabel('Importance'); ...@@ -30,11 +54,6 @@ plt.ylabel('Importance');
# Axis labels / title for the feature-importance bar chart built above.
plt.xlabel('Feature');
plt.title('Feature Analyse');

# Absolute Error (mean absolute error in seconds, same unit as the target)
print(mean_absolute_error(y_test, y_predict))
...@@ -42,6 +61,5 @@ print(mean_absolute_error(y_test,y_predict)) ...@@ -42,6 +61,5 @@ print(mean_absolute_error(y_test,y_predict))
# Context for the MAE above: average trip duration in the test set.
print("Average Time: " + str(np.average(y_test)) + " Sekunden")

# Average Distance — distance is no longer part of x, so it is read from
# column 1 of the raw dataset instead of the feature matrix.
print("Average Distance: " + str(np.average(dataset[:,1])) + " Meter")

plt.show()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment