Commit 03834d5e authored by Luca Sander's avatar Luca Sander

Random Forest, Hyperparameter Tuning, Feature Importance

parent 17f2da77
Branches main
{
"python.pythonPath": "/usr/bin/python3.8"
}
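Note: newer releases of the VS Code Python extension deprecate python.pythonPath in favour of python.defaultInterpreterPath; with a recent extension, the equivalent setting would be:

{
    "python.defaultInterpreterPath": "/usr/bin/python3.8"
}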
Source diffs for two files could not be displayed: they are too large (view the blobs instead).
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
# data.csv is produced by the preprocessing script below; column 3 (the Dauer
# target, i.e. the trip duration) is split off as y
dataset = np.genfromtxt('data.csv', delimiter=',', skip_header=1)
x = dataset[:, [0, 1, 2, 4, 5, 6, 7, 8, 9]]
y = dataset[:, 3]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
# Parameter grid for the exhaustive search; only random_state is pinned here,
# so the remaining hyperparameters keep their defaults
param_grid = {
    'random_state': [42]
}
# Create the base model to tune
rf = RandomForestRegressor()
# Exhaustive grid search over param_grid, using 3-fold cross-validation
# and all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)
y_predict = grid_search.best_estimator_.predict(x_test)
# Report (1 - MAPE) as a percentage "accuracy" score
accuracy = (1 - mean_absolute_percentage_error(y_test, y_predict)) * 100
print(f"Accuracy: {accuracy}%")
{'bootstrap': True, 'max_depth': 100, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
Accuracy: 98.859838775608%
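The recorded best_params_ span far more hyperparameters than the single-entry grid committed above, so this output presumably comes from a run with a larger grid. A minimal sketch of what a refined grid centred on those recorded values might look like (the specific values are assumptions, not the committed configuration):

# Hypothetical refined grid around the recorded optimum; values are assumptions
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 100, 120],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'n_estimators': [500, 1000],
    'random_state': [42],
}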
import pandas as pd
df = pd.read_csv("db_dump.csv", delimiter=';', usecols=[
    'Streckenvorhersage.Dauer', 'Streckenvorhersage.StartortID', 'Streckenvorhersage.ZielortID',
    'Streckenvorhersage.Entfernung', 'Streckenvorhersage.Datum'])
# Parse the timestamp column once, then derive the date/time features from it
datum = pd.to_datetime(df['Streckenvorhersage.Datum'])
df['Streckenvorhersage.Year'] = datum.dt.year
df['Streckenvorhersage.Month'] = datum.dt.month
df['Streckenvorhersage.Day'] = datum.dt.day
df['Streckenvorhersage.DayOfWeek'] = datum.dt.dayofweek
df['Streckenvorhersage.Hour'] = datum.dt.hour
df['Streckenvorhersage.Minutes'] = datum.dt.minute
df.drop(['Streckenvorhersage.Datum'], axis=1, inplace=True)
# An earlier attempt one-hot encoded DayOfWeek with pd.get_dummies and renamed
# the dummy columns to the German weekday names (Montag..Sonntag) before
# dropping the integer column; it was abandoned in favour of the plain integer
# feature. A completed sketch follows after this script.
print(df.columns)
df.to_csv("data.csv", index=False)
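For reference, a minimal sketch of how the abandoned one-hot encoding could be completed, continuing from the script above and assuming all seven weekday values occur in the data (the weekday names come from the commented-out attempt; the completion itself is hypothetical, not the committed code):

# Hypothetical completion of the abandoned one-hot encoding, not the committed code
tage = ['Montag', 'Dienstag', 'Mittwoch', 'Donnerstag', 'Freitag', 'Samstag', 'Sonntag']
dummies = pd.get_dummies(df['Streckenvorhersage.DayOfWeek'])
dummies.columns = [tage[i] for i in dummies.columns]  # dayofweek: Montag=0 .. Sonntag=6
df = pd.concat([df.drop(columns=['Streckenvorhersage.DayOfWeek']), dummies], axis=1)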
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from pprint import pprint
import matplotlib.pyplot as plt
dataset = np.genfromtxt('data.csv', delimiter=',', skip_header=1)
x = dataset[:,[0,1,2,4,5,6,7,8,9]]
y = dataset[:,3]
print(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
model = RandomForestRegressor(random_state=42)
model.fit(x_train,y_train)
pprint(model.get_params())
# Feature names in the column order of data.csv (column 3, the Dauer target, is excluded)
feature_list = ["StartortID", "ZielortID", "Entfernung", "Year", "Month", "Day", "DayOfWeek", "Hour", "Minutes"]
importances = np.round(model.feature_importances_ * 100, decimals=2)
zipped = list(zip(feature_list,importances))
pprint(zipped)
#plt.style.use('fivethirtyeight')
x_values = list(range(len(importances)))
plt.figure(figsize=(10, 10))
plt.bar(x_values, importances)
plt.xticks(x_values, feature_list, rotation='vertical')
plt.ylabel('Importance')
plt.xlabel('Feature')
plt.title('Feature Analysis')
plt.show()
y_predict = model.predict(x_test)
# Report (1 - MAPE) as a percentage "accuracy" score
accuracy = (1 - mean_absolute_percentage_error(y_test, y_predict)) * 100
print(f"Accuracy: {accuracy}%")
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
dataset = np.genfromtxt('data.csv', delimiter=',', skip_header=1)
x = dataset[:,[0,1,2,4,5,6,7,8,9]]
y = dataset[:,3]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
# Number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=1000, num=5)]
# Number of features to consider at every split
# ('auto' is deprecated in newer scikit-learn; for a regressor it meant all features)
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100,
                               cv=3, verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model
rf_random.fit(x_train, y_train)
# Next step: build a refined parameter grid around these results (see the grid search script)
print(rf_random.best_params_)
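Unlike the grid search script, this one never evaluates the tuned model on the held-out split. A sketch of that evaluation, mirroring the other script rather than anything committed here (mean_absolute_percentage_error is already imported):

# Evaluate the best estimator from the random search on the test split (sketch)
y_predict = rf_random.best_estimator_.predict(x_test)
accuracy = (1 - mean_absolute_percentage_error(y_test, y_predict)) * 100
print(f"Accuracy: {accuracy}%")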