Skip to content
Snippets Groups Projects
Commit 24d87cef authored by Christoph Olberding's avatar Christoph Olberding
Browse files

Update Imputer/compute_forest.py, Imputer/data_filled(Forest).csv files

parent d5a9bdbb
Branches
No related tags found
No related merge requests found
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 29 19:08:44 2021
@author: Christoph
"""
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from randomForest import randomForestRegression
# Seed NumPy's global random number generator.
np.random.seed(42)
# Turn off pandas' chained-assignment warning (the default would be 'warn').
pd.options.mode.chained_assignment = None

# Load the unfilled table and drop its first column.
data = pd.read_csv('data_unfilled.csv')
data = data.iloc[:, 1:]

_FEATURES = [
    'Streckenvorhersage.ZielortID',
    'Streckenvorhersage.StartortID',
    'time',
    'weekday',
]
_TARGET = 'Streckenvorhersage.Dauer'

# Complete rows (duration != 0): used for training and for the held-out split.
data_cmpl = data.loc[data[_TARGET] != 0]
X_cmpl = data_cmpl[_FEATURES]
Y_cmpl = data_cmpl[_TARGET]
X_cmpl_train, X_cmpl_test, y_cmpl_train, y_cmpl_test = train_test_split(
    X_cmpl,
    Y_cmpl,
    test_size=0.2,
)

# Incomplete rows (duration == 0): the rows whose duration is imputed later.
data_incmpl = data.loc[data[_TARGET] == 0]
X_incmpl = data_incmpl[_FEATURES]
Y_incmpl = data_incmpl[_TARGET]
# Baseline: predict the duration with a network fitted on the complete rows only.
myANN = Sequential()
myANN.add(Dense(80, activation='relu', input_dim=X_cmpl.shape[1]))
for _units in (50, 30):
    myANN.add(Dense(_units, activation='relu'))
myANN.add(Dense(1, activation='linear'))
myANN.compile(loss='mean_squared_error', optimizer='adam')
myANN.fit(
    X_cmpl_train,
    y_cmpl_train,
    epochs=100,
    shuffle=True,
    verbose=False,
)

# Mean absolute error on the held-out complete rows.
# NOTE(review): the printed label mentions missing data, but the error is
# computed on X_cmpl_test / y_cmpl_test.
yp = np.squeeze(myANN.predict(X_cmpl_test))
yDiff = yp - y_cmpl_test
_mean_abs_error = np.mean(np.abs(yDiff))
print('Mittlere Abweichung auf fehlende Daten: %e ' % (_mean_abs_error))
# Impute the missing durations with a random forest fitted on the complete
# training rows.
myForestgdp = randomForestRegression(noOfTrees=25, minLeafNodeSize=5, threshold=2)
myForestgdp.fit(X_cmpl_train.to_numpy(), y_cmpl_train.to_numpy())
yp = myForestgdp.predict(X_incmpl.to_numpy())
# Alternative left by the original author: impute with the network instead,
# i.e. myANN.predict(X_incmpl).

# squeeze() turns a single prediction into a 0-d array, which pd.DataFrame
# cannot turn into a column; atleast_1d keeps it a length-1 vector.
yp = np.atleast_1d(np.squeeze(yp))

# Carry over the index of the incomplete rows so every imputed value keeps
# the label of the feature row it was predicted from.
Y_incmpl = pd.DataFrame(
    data=yp,
    columns=['Streckenvorhersage.Dauer'],
    index=X_incmpl.index,
)
# Build the enlarged training set: complete training rows followed by the
# imputed rows. A fresh train_test_split over the complete data was tried
# here by the original author but is disabled; the earlier split is reused.
y_train = pd.DataFrame(data=y_cmpl_train, columns=['Streckenvorhersage.Dauer'])
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows in
# the same order.
X_train = pd.concat([X_cmpl_train, X_incmpl])
y_train = pd.concat([y_train, Y_incmpl])
# Same architecture as the baseline, rebuilt from scratch so no weights carry over.
myANN = Sequential()
_layer_stack = [
    Dense(80, activation='relu', input_dim=X_cmpl.shape[1]),
    Dense(50, activation='relu'),
    Dense(30, activation='relu'),
    Dense(1, activation='linear'),
]
for _layer in _layer_stack:
    myANN.add(_layer)
myANN.compile(loss='mean_squared_error', optimizer='adam')

# Train the network on the training set that includes the filled-in rows.
myANN.fit(
    X_train,
    y_train,
    epochs=100,
    shuffle=True,
    verbose=False,
)

# Mean absolute error on the same held-out complete rows as the baseline.
yp = np.squeeze(myANN.predict(X_cmpl_test))
yDiff = yp - y_cmpl_test
_mean_abs_error = np.mean(np.abs(yDiff))
print('Mittlere Abweichung mit aufgefüllten Daten(simuliert Forest): %e ' % (_mean_abs_error))
# Rebuild the full table from the training rows (including the imputed ones)
# plus the held-out rows, then attach the duration column.
y_cmpl_test = pd.DataFrame(data=y_cmpl_test, columns=['Streckenvorhersage.Dauer'])
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows in
# the same order.
X_all = pd.concat([X_train, X_cmpl_test])
y_all = pd.concat([y_train, y_cmpl_test])
data = X_all
# Convert to a flat array so the values are assigned by row position,
# independent of y_all's index labels.
y_all = np.asarray(y_all).ravel()
data['Streckenvorhersage.Dauer'] = y_all
# Save the filled data to a new CSV file.
data.to_csv('data_filled(Forest).csv')
\ No newline at end of file
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment