Skip to content
Snippets Groups Projects
Commit 03b48bfb authored by Luca Sander's avatar Luca Sander
Browse files

Added One-Hot-Encoding & Removed Distance from Feature Set

parent 59b5d825
No related branches found
No related tags found
No related merge requests found
Source diff could not be displayed: it is too large. Options to address this: view the blob.
...@@ -5,19 +5,19 @@ from sklearn.ensemble import RandomForestRegressor ...@@ -5,19 +5,19 @@ from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV

# Feature matrix produced by the preprocessing script (one-hot encoded,
# distance column excluded from the feature set).
dataset = np.genfromtxt('Random Forest/data.csv', delimiter=',', skip_header=1)

# Column 0 is the target (Dauer / trip duration); columns 2..25 are the
# features (calendar fields + DayOfWeek and ZielortID dummy columns).
# Column 1 (Entfernung / distance) is deliberately left out.
x = dataset[:, np.arange(2, 26)]
y = dataset[:, 0]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# Reduced grid: earlier runs fixed most hyper-parameters, only max_depth
# is still searched; n_estimators pinned to the best value found (1000).
param_grid = {
    #'bootstrap': [True, False],
    'max_depth': [25, 50, 75, 100, None],
    #'max_features': ["auto", "sqrt", "log2"],
    #'min_samples_leaf': [1, 3, 5],
    #'min_samples_split': [2, 4, 6],
    'n_estimators': [1000],
    'random_state': [42]
}
......
import pandas as pd

# Raw database dump uses ';' as separator; load only the columns the
# model needs (target, distance, destination id, timestamp).
df = pd.read_csv("Random Forest/db_dump.csv", delimiter=';', usecols=[
    'Streckenvorhersage.Dauer', 'Streckenvorhersage.Entfernung', 'Streckenvorhersage.ZielortID', 'Streckenvorhersage.Datum'])

# Fix the column order so the target (Dauer) ends up in column 0 and the
# distance (Entfernung) in column 1 of the exported data.csv.
df = df.reindex(columns=['Streckenvorhersage.Dauer', 'Streckenvorhersage.Entfernung',
                         'Streckenvorhersage.ZielortID', 'Streckenvorhersage.Datum'])

# Derive the year as a calendar feature from the timestamp column.
df['Streckenvorhersage.Year'] = [pd.Timestamp(
    d).year for d in df['Streckenvorhersage.Datum']]
...@@ -19,7 +22,13 @@ df['Streckenvorhersage.Minutes'] = [pd.Timestamp( ...@@ -19,7 +22,13 @@ df['Streckenvorhersage.Minutes'] = [pd.Timestamp(
# The raw timestamp is no longer needed once the calendar fields exist.
df.drop(['Streckenvorhersage.Datum'], axis=1, inplace=True)

# One-Hot Encoding of the weekday (prefix keeps the dummy columns identifiable).
df = pd.concat(
    [df, pd.get_dummies(df['Streckenvorhersage.DayOfWeek'], prefix='DayOfWeek')], axis=1)
df.drop(['Streckenvorhersage.DayOfWeek'], axis=1, inplace=True)

# One-Hot Encoding ZielortID (destination id is categorical, not ordinal).
df = pd.concat(
    [df, pd.get_dummies(df['Streckenvorhersage.ZielortID'], prefix='ZielortID')], axis=1)
df.drop(['Streckenvorhersage.ZielortID'], axis=1, inplace=True)

# Export the encoded feature matrix for the training/tuning scripts.
df.to_csv("Random Forest/data.csv", index=False)
...@@ -6,19 +6,43 @@ from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error ...@@ -6,19 +6,43 @@ from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error
from pprint import pprint
import matplotlib.pyplot as plt

# Encoded feature matrix produced by the preprocessing script.
dataset = np.genfromtxt('Random Forest/data.csv', delimiter=',', skip_header=1)

# Column 0 = target duration; columns 2..25 = features (distance in
# column 1 is excluded from the feature set).
x = dataset[:, np.arange(2, 26)]
y = dataset[:, 0]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

#model = RandomForestRegressor(random_state=42, n_estimators=750, max_depth=50)
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)
#pprint(model.get_params())

# Accuracy in Percent (1 - MAPE, expressed as a percentage)
y_predict = model.predict(x_test)
accuracy = (1 - mean_absolute_percentage_error(y_test, y_predict)) * 100
print("Accuracy: " + str(accuracy) + '%')

# Feature Importance without Distance.
# The per-column importances of the one-hot dummy columns are summed back
# into a single value per original categorical feature:
#   cols 0..4  -> Jahr, Monat, Tag, Stunde, Minute
#   cols 5..11 -> 7 DayOfWeek dummies
#   cols 12..  -> ZielortID dummies
pprint(model.feature_importances_)
feature_list = ["Jahr", "Monat", "Tag", "Stunde", "Minute", "Wochentag", "ZielortID"]
importances = model.feature_importances_[0:5]
wochentag_importance = np.sum(model.feature_importances_[5:12])
importances = np.append(importances, wochentag_importance)
zielort_importance = np.sum(model.feature_importances_[12:])
importances = np.append(importances, zielort_importance)

# Feature Importance with Distance (variant kept for reference; offsets
# shift by one because Entfernung occupies the first feature column).
#feature_list = ["Entfernung","Jahr","Monat","Tag","Stunde","Minute","Wochentag", "ZielortID"]
#importances = model.feature_importances_[0:6]
#wochentag_importance = np.sum(model.feature_importances_[6:13])
#importances = np.append(importances,wochentag_importance)
#zielort_importance = np.sum(model.feature_importances_[13:])
#importances = np.append(importances,zielort_importance)

importances = np.round(importances * 100, decimals=2)
#pprint(np.round(model.feature_importances_ * 100,decimals=2))
zipped = list(zip(feature_list, importances))
pprint(zipped)
...@@ -30,11 +54,6 @@ plt.ylabel('Importance'); ...@@ -30,11 +54,6 @@ plt.ylabel('Importance');
# Axis labels / title for the feature-importance bar chart built above.
plt.xlabel('Feature');
plt.title('Feature Analyse');

# Absolute Error (mean absolute error in seconds, same unit as the target)
print(mean_absolute_error(y_test, y_predict))
...@@ -42,6 +61,5 @@ print(mean_absolute_error(y_test,y_predict)) ...@@ -42,6 +61,5 @@ print(mean_absolute_error(y_test,y_predict))
# Context for the MAE above: average trip duration in the test set.
print("Average Time: " + str(np.average(y_test)) + " Sekunden")

# Average Distance — distance is no longer part of x, so it is read from
# column 1 of the raw dataset instead of the feature matrix.
print("Average Distance: " + str(np.average(dataset[:,1])) + " Meter")

plt.show()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment