# -*- coding: utf-8 -*-
"""
Impute missing 'Streckenvorhersage.Dauer' values with a small dense network.

Rows of ``data_unfilled.csv`` whose target column equals 0 are treated as
missing. The script

1. trains a network on the complete rows and prints its mean absolute error
   on a 20 % hold-out,
2. predicts the target for the incomplete rows,
3. retrains a fresh network on 65 % of the complete rows plus the imputed
   rows and prints its mean absolute error on the remaining 35 %,
4. writes all rows (with the imputed targets) to ``data_filled(ANN).csv``.

Created on Fri Feb 26 12:58:53 2021

@author: Christoph

"""
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split

# Seeds NumPy's global RNG, which train_test_split draws from when no
# random_state is passed. TensorFlow's own RNG is not seeded here, so the
# trained weights (and the printed errors) can still vary between runs.
np.random.seed(42)

FEATURES = ['Streckenvorhersage.ZielortID', 'Streckenvorhersage.StartortID', 'time']
TARGET = 'Streckenvorhersage.Dauer'


def _build_model(n_features):
    """Return a compiled 80-50-30-1 ReLU regression network (MSE loss, Adam)."""
    model = Sequential()
    model.add(Dense(80, activation='relu', input_dim=n_features))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


def _predict_flat(model, features):
    """Return the model's predictions for *features* as a 1-D array.

    ``reshape(-1)`` is used instead of ``np.squeeze`` so that a single-row
    input still yields a length-1 array rather than a 0-d scalar, and an
    empty input short-circuits without calling ``predict`` at all.
    """
    if len(features) == 0:
        return np.empty(0)
    return np.asarray(model.predict(features)).reshape(-1)


data = pd.read_csv('data_unfilled.csv')
# Drop the first CSV column; only the remaining columns are used below.
data = data.iloc[:, 1:]

# Complete rows (target != 0): used for training and evaluation.
data_cmpl = data.loc[data[TARGET] != 0]
X_cmpl = data_cmpl[FEATURES]
Y_cmpl = data_cmpl[TARGET]
X_cmpl_train, X_cmpl_test, y_cmpl_train, y_cmpl_test = train_test_split(
    X_cmpl, Y_cmpl, test_size=0.2)

# Incomplete rows (target == 0): these are the rows to impute.
data_incmpl = data.loc[data[TARGET] == 0]
X_incmpl = data_incmpl[FEATURES]
Y_incmpl = data_incmpl[TARGET]

# Stage 1: fit on the complete rows and measure the hold-out error.
myANN = _build_model(X_cmpl.shape[1])
myANN.fit(X_cmpl_train, y_cmpl_train, epochs=100, shuffle=True, verbose=False)
yp = _predict_flat(myANN, X_cmpl_test)

yDiff = yp - y_cmpl_test
print('Mittlere Abweichung auf fehlende Daten: %e ' % (np.mean(np.abs(yDiff))))

# Stage 2: impute the target for the incomplete rows. The frame is built on
# X_incmpl's index so features and imputed targets keep matching row labels.
yp = _predict_flat(myANN, X_incmpl)
Y_incmpl = pd.DataFrame({TARGET: yp}, index=X_incmpl.index)

# Stage 3: retrain on a fresh split of the complete rows plus the imputed
# rows, then evaluate on held-out complete rows only.
X_train, X_test, y_train, y_test = train_test_split(X_cmpl, Y_cmpl, test_size=0.35)
y_train = y_train.to_frame(name=TARGET)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
X_train = pd.concat([X_train, X_incmpl])
y_train = pd.concat([y_train, Y_incmpl])

myANN = _build_model(X_cmpl.shape[1])
myANN.fit(X_train, y_train, epochs=100, shuffle=True, verbose=False)
yp = _predict_flat(myANN, X_test)

yDiff = yp - y_test
print('Mittlere Abweichung mit aufgefüllten Daten(simuliert): %e ' % (np.mean(np.abs(yDiff))))

# Stage 4: reassemble every row (train + imputed + test) and write it out.
# X_all and y_all were concatenated in the same order, so the target values
# are attached positionally.
y_test = y_test.to_frame(name=TARGET)
X_all = pd.concat([X_train, X_test])
y_all = pd.concat([y_train, y_test])
data = X_all
y_all = y_all[TARGET].to_numpy()
data[TARGET] = y_all
data.to_csv('data_filled(ANN).csv')