Skip to content
Snippets Groups Projects
Select Git revision
  • 345418521e5ee89f8156255d8cdcbd8132895c53
  • main default protected
2 results

compute_missing.py

Blame
  • compute_missing.py 2.54 KiB
    # -*- coding: utf-8 -*-
    """
    Created on Fri Feb 26 12:58:53 2021
    
    @author: Christoph
    
    """
    import pandas as pd
    import numpy as np
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense
    from sklearn.model_selection import train_test_split
    # Seed NumPy's global RNG. The train_test_split calls in this script pass
    # no random_state, so they draw from this global state.
    # NOTE(review): no TensorFlow seed is set in this file, so the network
    # training itself is presumably still non-deterministic -- confirm.
    np.random.seed(42)

    data = pd.read_csv('data_unfilled.csv')
    # Keep every column except the first one.
    # NOTE(review): presumably the first column is a saved row index -- confirm.
    data = data.iloc[:, 1:]

    feature_cols = [
        'Streckenvorhersage.ZielortID',
        'Streckenvorhersage.StartortID',
        'time',
    ]
    target_col = 'Streckenvorhersage.Dauer'

    # Complete rows (duration != 0): used for training and evaluation.
    is_complete = data[target_col] != 0
    data_cmpl = data.loc[is_complete]
    X_cmpl = data_cmpl[feature_cols]
    Y_cmpl = data_cmpl[target_col]
    X_cmpl_train, X_cmpl_test, y_cmpl_train, y_cmpl_test = train_test_split(
        X_cmpl, Y_cmpl, test_size=0.2
    )

    # Incomplete rows: a duration of exactly 0 is treated as "missing".
    is_missing = data[target_col] == 0
    data_incmpl = data.loc[is_missing]
    X_incmpl = data_incmpl[feature_cols]
    Y_incmpl = data_incmpl[target_col]
    
    
    # Predict the duration from the rows whose duration is known.
    # Network: three ReLU hidden layers (80/50/30 units) and one linear output
    # unit, trained with mean squared error and the Adam optimizer.
    myANN = Sequential([
        Dense(80, activation='relu', input_dim=X_cmpl.shape[1]),
        Dense(50, activation='relu'),
        Dense(30, activation='relu'),
        Dense(1, activation='linear'),
    ])
    myANN.compile(loss='mean_squared_error', optimizer='adam')
    myANN.fit(
        X_cmpl_train,
        y_cmpl_train,
        epochs=100,
        shuffle=True,
        verbose=False,
    )

    # The single output unit yields one column per sample; squeeze it away so
    # the subtraction below is element-wise against the 1-D target.
    yp = np.squeeze(myANN.predict(X_cmpl_test))

    # Mean absolute error on the held-out 20% of the complete rows.
    # NOTE(review): the printed label says "fehlende Daten" (missing data),
    # but the error is measured on held-out *complete* rows.
    yDiff = yp - y_cmpl_test
    mean_abs_error = np.mean(np.abs(yDiff))
    print('Mittlere Abweichung auf fehlende Daten: %e ' % (mean_abs_error))
    
    
    
    # Impute the missing durations with the model trained on the complete rows.
    # reshape(-1) is used instead of np.squeeze so the result stays 1-D even
    # when there is exactly one incomplete row; squeeze would collapse that to
    # a 0-d scalar, which the DataFrame constructor rejects.
    yp = np.asarray(myANN.predict(X_incmpl)).reshape(-1)
    # Carry X_incmpl's index so each imputed target keeps the row label of the
    # features it was predicted from.
    Y_incmpl = pd.DataFrame(
        data=yp,
        columns=['Streckenvorhersage.Dauer'],
        index=X_incmpl.index,
    )

    # Fresh split of the complete rows: 65% go into the augmented training
    # set, 35% are held out to evaluate the model trained on filled-in data.
    X_train, X_test, y_train, y_test = train_test_split(X_cmpl, Y_cmpl, test_size=0.35)
    y_train = y_train.to_frame(name='Streckenvorhersage.Dauer')

    # Append the imputed rows to the training data. DataFrame.append was
    # removed in pandas 2.0; pd.concat is the supported replacement and keeps
    # the same row order (original training rows first, imputed rows after).
    X_train = pd.concat([X_train, X_incmpl])
    y_train = pd.concat([y_train, Y_incmpl])
    
    
    # Train a fresh network with the same architecture on the augmented
    # training set (X_train / y_train) and evaluate it on X_test / y_test.
    hidden_units = (50, 30)
    layers = [Dense(80, activation='relu', input_dim=X_cmpl.shape[1])]
    for units in hidden_units:
        layers.append(Dense(units, activation='relu'))
    layers.append(Dense(1, activation='linear'))

    myANN = Sequential(layers)
    myANN.compile(loss='mean_squared_error', optimizer='adam')
    myANN.fit(
        X_train,
        y_train,
        epochs=100,
        shuffle=True,
        verbose=False,
    )

    # Drop the single output column so predictions line up element-wise with
    # the 1-D target.
    yp = np.squeeze(myANN.predict(X_test))

    # Mean absolute error of the model trained on the filled-in data.
    yDiff = yp - y_test
    mean_abs_error = np.mean(np.abs(yDiff))
    print('Mittlere Abweichung mit aufgefüllten Daten(simuliert): %e ' % (mean_abs_error))
    
    
    # Reassemble the full data set (training rows including the imputed ones,
    # followed by the held-out rows) and write it to disk.
    y_test = y_test.to_frame(name='Streckenvorhersage.Dauer')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement and preserves the row order of its inputs.
    X_all = pd.concat([X_train, X_test])
    y_all = pd.concat([y_train, y_test])
    data = X_all
    # Features and targets were concatenated in the same order, so the target
    # column is attached by position (as a flat 1-D array), not by index label.
    y_all = np.asarray(y_all).ravel()
    data['Streckenvorhersage.Dauer'] = y_all
    data.to_csv('data_filled(ANN).csv')