Skip to content
Snippets Groups Projects
Commit 59b5d825 authored by Luca Sander's avatar Luca Sander
Browse files

Added Random Forest, Data Preparation & Hyperparameter Optimization

parent 17f2da77
No related branches found
No related tags found
No related merge requests found
{
"python.pythonPath": "/usr/bin/python3.8"
}
\ No newline at end of file
Source diff could not be displayed: it is too large. Options to address this: view the blob.
Source diff could not be displayed: it is too large. Options to address this: view the blob.
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
# Hyperparameter search for a random-forest regressor on the prepared data.
#
# Reads 'data.csv', holds out 25% of the rows as a test set, runs an
# exhaustive 3-fold cross-validated grid search on the remaining rows and
# prints the best parameter combination plus a MAPE-based score on the
# held-out rows.

# The header row is skipped so that every remaining cell parses as a float.
dataset = np.genfromtxt('data.csv', delimiter=',', skip_header=1)

# Column 3 is the regression target; all other columns (0-2 and 4-9) are
# used as features.
# NOTE(review): column 3 is presumably the trip duration written by the data
# preparation script - confirm against the column order of data.csv.
x = dataset[:, [0, 1, 2, 4, 5, 6, 7, 8, 9]]
y = dataset[:, 3]

# Fixed seed so the split (and therefore the reported score) is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

param_grid = {
    'bootstrap': [True, False],
    'max_depth': [25, 50, 75, 100, None],
    # 1.0 means "consider all features at every split". That is what the
    # legacy "auto" alias meant for regressors; "auto" was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it makes every candidate
    # using it fail to fit. The float form behaves the same on old and new
    # releases.
    'max_features': [1.0, "sqrt", "log2"],
    'min_samples_leaf': [1, 3, 5],
    'min_samples_split': [2, 4, 6],
    'n_estimators': [100, 250, 500, 750, 1000],
    # Single-valued entry: pins the forest's seed for every candidate.
    'random_state': [42],
}

rf = RandomForestRegressor()
# n_jobs=-1 uses all available cores; verbose=2 logs every individual fit.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

# best_estimator_ is the winning configuration refitted on the full training
# split; evaluate it on the rows the search never saw.
y_predict = grid_search.best_estimator_.predict(x_test)

# "Accuracy" here is defined as 100 * (1 - MAPE); it can become negative when
# the mean absolute percentage error exceeds 100%.
accuracy = (1 - mean_absolute_percentage_error(y_test, y_predict)) * 100
print("Accuracy: " + str(accuracy) + '%')
import pandas as pd
# Data preparation: turn the raw database dump into a purely numeric table.
#
# Reads the five needed columns from 'db_dump.csv' (semicolon separated),
# expands the date column into six integer calendar components, drops the
# original date column and writes the result to 'data.csv'.

DATE_COLUMN = 'Streckenvorhersage.Datum'

df = pd.read_csv(
    "db_dump.csv",
    delimiter=';',
    usecols=[
        'Streckenvorhersage.Dauer',
        'Streckenvorhersage.StartortID',
        'Streckenvorhersage.ZielortID',
        'Streckenvorhersage.Entfernung',
        DATE_COLUMN,
    ],
)

# Parse every date value exactly once and reuse the Timestamp objects for all
# derived columns instead of re-parsing the whole column per component.
timestamps = [pd.Timestamp(d) for d in df[DATE_COLUMN]]

# (output column, Timestamp attribute) pairs. The order is significant: the
# columns are appended in this order and the scripts that read data.csv
# select features by column position.
DATE_PARTS = (
    ('Streckenvorhersage.Year', 'year'),
    ('Streckenvorhersage.Month', 'month'),
    ('Streckenvorhersage.Day', 'day'),
    ('Streckenvorhersage.DayOfWeek', 'day_of_week'),
    ('Streckenvorhersage.Hour', 'hour'),
    ('Streckenvorhersage.Minutes', 'minute'),
)
for column_name, attribute in DATE_PARTS:
    df[column_name] = [getattr(ts, attribute) for ts in timestamps]

# The raw date is fully represented by the derived columns; remove it so the
# output contains numeric columns only.
df = df.drop(columns=[DATE_COLUMN])

# One-Hot Encoding (disabled)
# df = pd.concat(
#     [df, pd.get_dummies(df['Streckenvorhersage.DayOfWeek'])], axis=1)

df.to_csv("data.csv", index=False)
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error
from pprint import pprint
import matplotlib.pyplot as plt
# Baseline random-forest run with default hyperparameters.
#
# Trains on 75% of 'data.csv', prints the model parameters, the per-feature
# importances and a few error metrics on the held-out 25%, then shows a bar
# chart of the importances.

# Skip the header row; everything else is parsed as floats.
dataset = np.genfromtxt('data.csv', delimiter=',', skip_header=1)

# Target is column 3; features are columns 1, 2 and 4-9 (column 0 is left out).
x = dataset[:, [1, 2, 4, 5, 6, 7, 8, 9]]
y = dataset[:, 3]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)
pprint(model.get_params())

# Feature importance, expressed in percent and rounded to two decimals.
# The labels are listed in the same order as the selected feature columns.
feature_list = [
    "ZielortID",
    "Entfernung",
    "Jahr",
    "Monat",
    "Tag",
    "Wochentag",
    "Stunde",
    "Minute",
]
importances = np.round(model.feature_importances_ * 100, decimals=2)
zipped = list(zip(feature_list, importances))
pprint(zipped)

# Score in percent, defined as 100 * (1 - MAPE) on the held-out rows.
y_predict = model.predict(x_test)
accuracy = (1 - mean_absolute_percentage_error(y_test, y_predict)) * 100
print(f"Accuracy: {accuracy}%")

# Mean absolute error, in the unit of the target column.
print(mean_absolute_error(y_test, y_predict))

# Mean of the held-out target values.
print(f"Average Time: {np.average(y_test)} Sekunden")

# Mean of the second selected feature (labelled "Entfernung" above).
print(f"Average Distance: {np.average(x_test[:, 1])} Meter")

# Bar chart of the importances: one bar per feature, labels rotated so the
# names do not overlap.
x_values = [*range(len(importances))]
plt.bar(x_values, importances, orientation='vertical')
plt.xticks(x_values, feature_list, rotation='vertical')
plt.ylabel('Importance')
plt.xlabel('Feature')
plt.title('Feature Analyse')
plt.show()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment