#!/usr/bin/python3
import json
import sys
import urllib.parse
import urllib.request
import time as sleepTimer
import mariadb
import requests
from requests.exceptions import HTTPError
from datetime import datetime, time

GOOGLE_URL = "https://maps.googleapis.com/maps/api/distancematrix/json"
GOOGLE_API_KEY_FRED = ""
GOOGLE_API_KEY_CHRIS = ""

# Rush-hour windows: poll every 5 minutes inside them, every 30 minutes outside.
RUSHHOUR_MORNING_START = time(6)
RUSHHOUR_MORNING_END = time(9)
RUSHHOUR_EVENING_START = time(16)
RUSHHOUR_EVENING_END = time(19)

DB_USER = ""
DB_PASSWORD = ""
DB_SERVERNAME = ""
DB_PORT = 3307
DB_DATABASE = "Stauvorhersage"


def get_google_maps_db_id():
    """Look up the ID of the "Google Maps" entry in the Datenquelle table."""
    # Connect to MariaDB Platform
    try:
        with mariadb.connect(
                user=DB_USER,
                password=DB_PASSWORD,
                host=DB_SERVERNAME,
                port=DB_PORT,
                database=DB_DATABASE
        ) as conn:
            cursor = conn.cursor()
            cursor.execute('SELECT ID FROM Stauvorhersage.Datenquelle AS q WHERE q.Name="Google Maps"')
            for row in cursor:
                print(row)
                quelle_db_id = row[0]
            return quelle_db_id
    except mariadb.Error as e:
        print(f"Error connecting to MariaDB Platform: {e}")
        sys.exit(1)


def get_origin_and_destinationlist():
    """Read the origin (Ort with ID 1) and all destinations (ID > 1) from the database."""
    origin = None
    destinationList = []
    # Connect to MariaDB Platform
    try:
        with mariadb.connect(
                user=DB_USER,
                password=DB_PASSWORD,
                host=DB_SERVERNAME,
                port=DB_PORT,
                database=DB_DATABASE
        ) as conn:
            cursor = conn.cursor()
            cursor.execute(
                'SELECT ID, Name, Strasse, Hausnummer, PLZ, Ort, PlusCode FROM Stauvorhersage.Ort AS o WHERE o.ID="1"')
            for (ID, Name, Strasse, Hausnummer, PLZ, Ort, PlusCode) in cursor:
                print([ID, Name, Strasse, Hausnummer, PLZ, Ort, PlusCode])
                origin = [ID, Name, Strasse, Hausnummer, PLZ, Ort, PlusCode]
            cursor.execute(
                'SELECT ID, Name, Strasse, Hausnummer, PLZ, Ort, PlusCode FROM Stauvorhersage.Ort AS o WHERE o.ID>"1"')
            for (ID, Name, Strasse, Hausnummer, PLZ, Ort, PlusCode) in cursor:
                print([ID, Name, Strasse, Hausnummer, PLZ, Ort, PlusCode])
                destinationList.append([ID, Name, Strasse, Hausnummer, PLZ, Ort, PlusCode])
            print(destinationList)
            return origin, destinationList
    except mariadb.Error as e:
        print(f"Error connecting to MariaDB Platform: {e}")
        sys.exit(1)


def get_google_matrix(origins, destination, api_key):
    """Query the Distance Matrix API via requests (run_maps currently builds the URL itself)."""
    try:
        response = requests.get(
            GOOGLE_URL,
            params={"units": "metric",
                    "origins": origins,
                    "destinations": destination,
                    "departure_time": "now",
                    "key": api_key},
        )
        response.raise_for_status()
    except HTTPError as http_error:
        raise Exception(f"HTTP Error occurred: {http_error}")
    except Exception as error:
        raise Exception(f"Other Error occurred: {error}")
    else:
        print("Success: " + str(response.status_code))
        return response.json()


def in_between(now, start, end):
    return start <= now < end


def run_maps(quelle_db_id, origin, destinationList, api_key):
    # Use the Plus Code (index 6) when one is stored, otherwise build a street address.
    if origin[6] == "":
        originrequest = origin[2] + " " + origin[3] + "," + origin[4] + "," + origin[5]
    else:
        originrequest = origin[6]
    originrequest = urllib.parse.quote_plus(originrequest)
    for destination in destinationList:
        if destination[6] == "":
            destinationrequest = destination[2] + " " + destination[3] + "," + destination[4] + "," + destination[5]
        else:
            destinationrequest = destination[6]
        destinationrequest = urllib.parse.quote_plus(destinationrequest)
        print("Query - from: " + str(origin[1]) + " to: " + str(destination[1]))
        api_query = GOOGLE_URL + "?units=metric&origins=" + originrequest + "&destinations=" + destinationrequest + "&departure_time=now&key=" + api_key
        print(api_query)
        with urllib.request.urlopen(api_query) as url:
            try:
                data = json.loads(url.read().decode())
                print(data)
                # data = json.loads('{"destination_addresses": ["C7XC+37 Bochum, Germany"], "origin_addresses": ["8XG9+X4 Heiligenhaus, Germany"], "rows": [{"elements": [{"distance": {"text": "44.6 km", "value": 44648}, "duration": {"text": "40 mins", "value": 2397}, "duration_in_traffic": {"text": "38 mins", "value": 2290}, "status": "OK"}]}], "status": "OK"}')
                print("From: " + data['origin_addresses'][0])
                print("To: " + data['destination_addresses'][0])
                print("Distance: " + data['rows'][0]['elements'][0]['distance']['text'])
                print("Duration: " + data['rows'][0]['elements'][0]['duration']['text'])
                print("Duration in traffic: " + data['rows'][0]['elements'][0]['duration_in_traffic']['text'])
                status = data['status']
                print("Status: " + status)
                if status == "OK":
                    # Store the result in the Streckenvorhersage table.
                    try:
                        with mariadb.connect(
                                user=DB_USER,
                                password=DB_PASSWORD,
                                host=DB_SERVERNAME,
                                port=DB_PORT,
                                database=DB_DATABASE
                        ) as conn:
                            cursor = conn.cursor()
                            query = "INSERT INTO Streckenvorhersage (StartortID, ZielortID, Entfernung, Dauer, Datenquelle) VALUES (%d, %d, %d, %d, %d)" % (
                                origin[0], destination[0], data['rows'][0]['elements'][0]['distance']['value'],
                                data['rows'][0]['elements'][0]['duration_in_traffic']['value'], quelle_db_id)
                            print(query)
                            cursor.execute(query)
                            conn.commit()
                    except mariadb.Error as e:
                        print(f"Error connecting to MariaDB Platform: {e}")
            except Exception as error:
                print(f"Error while querying Google API: {error}")


if __name__ == "__main__":
    quelle_maps_db_id = get_google_maps_db_id()
    origin, destinationList = get_origin_and_destinationlist()
    while True:
        run_maps(quelle_maps_db_id, origin, destinationList[:6], GOOGLE_API_KEY_FRED)  # always 6 destinations per key
        run_maps(quelle_maps_db_id, origin, destinationList[6:], GOOGLE_API_KEY_CHRIS)
        if in_between(datetime.now().time(), RUSHHOUR_MORNING_START, RUSHHOUR_MORNING_END) or in_between(
                datetime.now().time(), RUSHHOUR_EVENING_START, RUSHHOUR_EVENING_END):
            sleepTimer.sleep(60 * 5)
        else:
            sleepTimer.sleep(60 * 30)
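The INSERT above interpolates its integer values with %-formatting. MariaDB Connector/Python also accepts qmark ("?") placeholders, so a parameterized variant of the same statement can be sketched as below; the helper name store_prediction is illustrative, the columns are the ones used above.

# Parameterized variant of the INSERT used in run_maps; store_prediction is an
# illustrative helper name, the columns come from the script above.
import mariadb

def store_prediction(start_id, ziel_id, entfernung, dauer, quelle_id):
    with mariadb.connect(user=DB_USER, password=DB_PASSWORD, host=DB_SERVERNAME,
                         port=DB_PORT, database=DB_DATABASE) as conn:
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO Streckenvorhersage (StartortID, ZielortID, Entfernung, Dauer, Datenquelle) "
            "VALUES (?, ?, ?, ?, ?)",
            (start_id, ziel_id, entfernung, dauer, quelle_id))
        conn.commit()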
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from randomForest import randomForestRegression  # local random-forest implementation

np.random.seed(42)
pd.options.mode.chained_assignment = None  # default='warn'

data = pd.read_csv('data_unfilled.csv')
data = data.iloc[:, 1:]
# complete rows for training:
data_cmpl = data.loc[data['Streckenvorhersage.Dauer'] != 0]
X_cmpl = data_cmpl[['Streckenvorhersage.ZielortID', 'Streckenvorhersage.StartortID', 'time', 'weekday']]
Y_cmpl = data_cmpl['Streckenvorhersage.Dauer']
X_cmpl_train, X_cmpl_test, y_cmpl_train, y_cmpl_test = train_test_split(X_cmpl, Y_cmpl, test_size=0.2)
# missing rows to be imputed:
data_incmpl = data.loc[data['Streckenvorhersage.Dauer'] == 0]
X_incmpl = data_incmpl[['Streckenvorhersage.ZielortID', 'Streckenvorhersage.StartortID', 'time', 'weekday']]
Y_incmpl = data_incmpl['Streckenvorhersage.Dauer']

# prediction from the existing data
myANN = Sequential()
myANN.add(Dense(80, activation='relu', input_dim=X_cmpl.shape[1]))
myANN.add(Dense(50, activation='relu'))
myANN.add(Dense(30, activation='relu'))
myANN.add(Dense(1, activation='linear'))
myANN.compile(loss='mean_squared_error', optimizer='adam')
myANN.fit(X_cmpl_train, y_cmpl_train, epochs=100, shuffle=True, verbose=False)
yp = myANN.predict(X_cmpl_test)
yp = np.squeeze(yp)
yDiff = yp - y_cmpl_test
print('Mean deviation on missing data: %e ' % (np.mean(np.abs(yDiff))))  # mean error

# impute Dauer for the missing rows with the random forest
myForestgdp = randomForestRegression(noOfTrees=25, minLeafNodeSize=5, threshold=2)
myForestgdp.fit(X_cmpl_train.to_numpy(), y_cmpl_train.to_numpy())
yp = myForestgdp.predict(X_incmpl.to_numpy())
#yp = myANN.predict(X_incmpl)
yp = np.squeeze(yp)
Y_incmpl = pd.DataFrame(data=yp, columns=['Streckenvorhersage.Dauer'])

# build a larger training set from the original plus the imputed rows
#X_train, X_test, y_train, y_test = train_test_split(X_cmpl, Y_cmpl, test_size=0.2)
y_train = pd.DataFrame(data=y_cmpl_train, columns=['Streckenvorhersage.Dauer'])
X_train = pd.concat([X_cmpl_train, X_incmpl])
y_train = pd.concat([y_train, Y_incmpl])

myANN = Sequential()
myANN.add(Dense(80, activation='relu', input_dim=X_cmpl.shape[1]))
myANN.add(Dense(50, activation='relu'))
myANN.add(Dense(30, activation='relu'))
myANN.add(Dense(1, activation='linear'))
myANN.compile(loss='mean_squared_error', optimizer='adam')
# train the ANN on the filled data
myANN.fit(X_train, y_train, epochs=100, shuffle=True, verbose=False)
yp = myANN.predict(X_cmpl_test)
yp = np.squeeze(yp)
yDiff = yp - y_cmpl_test
print('Mean deviation with filled data (forest-imputed): %e ' % (np.mean(np.abs(yDiff))))

# write the filled data back into one DataFrame
y_cmpl_test = pd.DataFrame(data=y_cmpl_test, columns=['Streckenvorhersage.Dauer'])
X_all = pd.concat([X_train, X_cmpl_test])
y_all = pd.concat([y_train, y_cmpl_test])
data = X_all
y_all = np.asarray(y_all)
data['Streckenvorhersage.Dauer'] = y_all
data.to_csv('data_filled(Forest).csv')  # save the filled data to a new csv
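The randomForest module imported above is local to the repository; only its constructor arguments and its fit/predict usage are visible here. For readers without that module, a stand-in with the same interface can be sketched on top of scikit-learn. The parameter mapping below (noOfTrees to n_estimators, minLeafNodeSize to min_samples_leaf) is an assumption, and threshold has no direct sklearn counterpart.

# Hypothetical stand-in for the local randomForest module: same constructor
# signature and fit/predict interface as used above, built on scikit-learn.
# The parameter mapping is assumed, not taken from the project.
from sklearn.ensemble import RandomForestRegressor

class randomForestRegression:
    def __init__(self, noOfTrees=25, minLeafNodeSize=5, threshold=2):
        # threshold has no sklearn counterpart; kept only for interface parity
        self.threshold = threshold
        self.model = RandomForestRegressor(n_estimators=noOfTrees,
                                           min_samples_leaf=minLeafNodeSize)

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        return self.model.predict(X)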
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 26 12:58:53 2021
@author: Christoph
"""
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split

np.random.seed(42)
data = pd.read_csv('data_unfilled.csv')
data = data.iloc[:, 1:]
# complete rows for training:
data_cmpl = data.loc[data['Streckenvorhersage.Dauer'] != 0]
X_cmpl = data_cmpl[['Streckenvorhersage.ZielortID', 'Streckenvorhersage.StartortID', 'time', 'weekday']]
Y_cmpl = data_cmpl['Streckenvorhersage.Dauer']
X_cmpl_train, X_cmpl_test, y_cmpl_train, y_cmpl_test = train_test_split(X_cmpl, Y_cmpl, test_size=0.2)
# missing rows to be imputed:
data_incmpl = data.loc[data['Streckenvorhersage.Dauer'] == 0]
X_incmpl = data_incmpl[['Streckenvorhersage.ZielortID', 'Streckenvorhersage.StartortID', 'time', 'weekday']]
Y_incmpl = data_incmpl['Streckenvorhersage.Dauer']

# prediction from the existing data
myANN = Sequential()
myANN.add(Dense(80, activation='relu', input_dim=X_cmpl.shape[1]))
myANN.add(Dense(50, activation='relu'))
myANN.add(Dense(30, activation='relu'))
myANN.add(Dense(1, activation='linear'))
myANN.compile(loss='mean_squared_error', optimizer='adam')
myANN.fit(X_cmpl_train, y_cmpl_train, epochs=100, shuffle=True, verbose=False)
yp = myANN.predict(X_cmpl_test)
yp = np.squeeze(yp)
yDiff = yp - y_cmpl_test
print('Mean deviation on missing data: %e ' % (np.mean(np.abs(yDiff))))  # mean error

# impute Dauer for the missing rows with the ANN
yp = myANN.predict(X_incmpl)
yp = np.squeeze(yp)
Y_incmpl = pd.DataFrame(data=yp, columns=['Streckenvorhersage.Dauer'])

# build a larger training set from the original plus the imputed rows
##X_train, X_test, y_train, y_test = train_test_split(X_cmpl, Y_cmpl, test_size=0.2)
y_train = pd.DataFrame(data=y_cmpl_train, columns=['Streckenvorhersage.Dauer'])
X_train = pd.concat([X_cmpl_train, X_incmpl])
y_train = pd.concat([y_train, Y_incmpl])

myANN = Sequential()
myANN.add(Dense(80, activation='relu', input_dim=X_cmpl.shape[1]))
myANN.add(Dense(50, activation='relu'))
myANN.add(Dense(30, activation='relu'))
myANN.add(Dense(1, activation='linear'))
myANN.compile(loss='mean_squared_error', optimizer='adam')
# train the ANN on the filled data
myANN.fit(X_train, y_train, epochs=100, shuffle=True, verbose=False)
yp = myANN.predict(X_cmpl_test)
yp = np.squeeze(yp)
yDiff = yp - y_cmpl_test
print('Mean deviation with filled data (simulated): %e ' % (np.mean(np.abs(yDiff))))

# write the filled data back into one DataFrame
y_test = pd.DataFrame(data=y_cmpl_test, columns=['Streckenvorhersage.Dauer'])
X_all = pd.concat([X_train, X_cmpl_test])
y_all = pd.concat([y_train, y_test])
data = X_all
y_all = np.asarray(y_all)
data['Streckenvorhersage.Dauer'] = y_all
data.to_csv('data_filled(ANN).csv')  # save the filled data to a new csv
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 26 12:57:45 2021
@author: Christoph
"""
import pandas as pd
import numpy as np

data = pd.read_csv('data.csv')  # read in the relevant data
data = data.iloc[:, 1:]  # drop the old index column
timeinterval = [0]  # list of the time intervals
minutes = [0]
indexes = []
time = 0
minute = 0
# walk through the whole day and compute from which second on each polling interval starts
# (36 half-hour off-peak slots plus 72 five-minute rush-hour slots cover the full 24 h)
for h in range(108):
    h = h + 1
    if (h < 13 or (h > 48 and h < 63) or h > 98):
        time = time + (30 * 60)
@@ -31,11 +24,11 @@
timeinterval = np.asarray(timeinterval)
minutes = np.asarray(minutes)
indexes = np.asarray(indexes)
for h in range(np.asarray(timeinterval).shape[0] - 1):
    # map every seconds-since-midnight value to its interval index
    data.loc[(data['time'] > timeinterval[h]) & (data['time'] <= timeinterval[h + 1]), 'time'] = indexes[h]
d = data.copy()
# add the rows that were never recorded
for e in data['day_index'].unique():  # for every distinct day index
    day_data = data.loc[data['day_index'] == e]  # only the single day
    for h in indexes:  # for every time index of the day
@@ -46,7 +39,7 @@ for e in data['day_index'].unique():
                if (h == l):
                    exists = True  # the time slot exists
        if (exists == False):
            new_row = {'Streckenvorhersage.ZielortID': k, 'Streckenvorhersage.StartortID': 1, 'Streckenvorhersage.Dauer': 0, 'time': h, 'day_index': e, 'weekday': (data.loc[data['day_index'] == e]).loc[(data.loc[data['day_index'] == e]).index[0], 'weekday']}
            data = pd.concat([data, pd.DataFrame([new_row])], ignore_index=True)
data.to_csv('data_unfilled.csv')  # save the data, padded with zero rows for the missing entries, as a new csv
# -*- coding: utf-8 -*-
"""
Created on Mon Mar  1 10:06:55 2021
@author: Christoph
"""
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split

np.random.seed(42)
data = pd.read_csv('data_unfilled.csv')
data = data.iloc[:, 1:]
# complete rows for training:
data_cmpl = data.loc[data['Streckenvorhersage.Dauer'] != 0]
X_cmpl = data_cmpl[['Streckenvorhersage.ZielortID', 'Streckenvorhersage.StartortID', 'time', 'weekday']]
Y_cmpl = data_cmpl['Streckenvorhersage.Dauer']
X_cmpl_train, X_cmpl_test, y_cmpl_train, y_cmpl_test = train_test_split(X_cmpl, Y_cmpl, test_size=0.2)
# missing rows to be imputed:
data_incmpl = data.loc[data['Streckenvorhersage.Dauer'] == 0]
X_incmpl = data_incmpl[['Streckenvorhersage.ZielortID', 'Streckenvorhersage.StartortID', 'time', 'weekday']]
Y_incmpl = data_incmpl['Streckenvorhersage.Dauer']

# prediction from the existing data
myANN = Sequential()
myANN.add(Dense(80, activation='relu', input_dim=X_cmpl.shape[1]))
myANN.add(Dense(50, activation='relu'))
myANN.add(Dense(30, activation='relu'))
myANN.add(Dense(1, activation='linear'))
myANN.compile(loss='mean_squared_error', optimizer='adam')
myANN.fit(X_cmpl_train, y_cmpl_train, epochs=100, shuffle=True, verbose=False)
yp = myANN.predict(X_cmpl_test)
yp = np.squeeze(yp)
yDiff = yp - y_cmpl_test
print('Mean deviation on missing data: %e ' % (np.mean(np.abs(yDiff))))

# impute Dauer for each missing row with the mean duration of the original rows in the same time slot
for index, row in data_incmpl.iterrows():
    relevant_data = data_cmpl[data_cmpl['time'] == row['time']]
    data_incmpl.loc[index, 'Streckenvorhersage.Dauer'] = relevant_data['Streckenvorhersage.Dauer'].mean()
data_incmpl = data_incmpl[~np.isnan(data_incmpl['Streckenvorhersage.Dauer'])]  # drop slots without reference data
X_incmpl = data_incmpl[['Streckenvorhersage.ZielortID', 'Streckenvorhersage.StartortID', 'time', 'weekday']]
Y_incmpl = data_incmpl['Streckenvorhersage.Dauer']

# build a larger training set from the original plus the imputed rows
#X_train, X_test, y_train, y_test = train_test_split(X_cmpl, Y_cmpl, test_size=0.2)
#y_train = pd.DataFrame(data=y_train, columns=['Streckenvorhersage.Dauer'])
#X_train = pd.concat([X_train, X_incmpl])
Y_incmpl = pd.DataFrame(data=Y_incmpl, columns=['Streckenvorhersage.Dauer'])
#y_train = pd.concat([y_train, Y_incmpl])
y_train = pd.DataFrame(data=y_cmpl_train, columns=['Streckenvorhersage.Dauer'])
X_train = pd.concat([X_cmpl_train, X_incmpl])
y_train = pd.concat([y_train, Y_incmpl])

myANN = Sequential()
myANN.add(Dense(80, activation='relu', input_dim=X_cmpl.shape[1]))
myANN.add(Dense(50, activation='relu'))
myANN.add(Dense(30, activation='relu'))
myANN.add(Dense(1, activation='linear'))
myANN.compile(loss='mean_squared_error', optimizer='adam')
# train the ANN on the filled data
myANN.fit(X_train, y_train, epochs=100, shuffle=True, verbose=False)
yp = myANN.predict(X_cmpl_test)
yp = np.squeeze(yp)
yDiff = yp - y_cmpl_test
print('Mean deviation with filled data (mean): %e ' % (np.mean(np.abs(yDiff))))

# write the filled data back into one DataFrame
y_cmpl_test = pd.DataFrame(data=y_cmpl_test, columns=['Streckenvorhersage.Dauer'])
X_all = pd.concat([X_train, X_cmpl_test])
y_all = pd.concat([y_train, y_cmpl_test])
data = X_all
y_all = np.asarray(y_all)
data['Streckenvorhersage.Dauer'] = y_all
data.to_csv('data_filled(mean).csv')  # save the filled data to a new csv
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split

np.random.seed(42)
data = pd.read_csv('data_unfilled.csv')
data = data.iloc[:, 1:]
# complete rows for training:
data_cmpl = data.loc[data['Streckenvorhersage.Dauer'] != 0]
X_cmpl = data_cmpl[['Streckenvorhersage.ZielortID', 'Streckenvorhersage.StartortID', 'time', 'weekday']]
Y_cmpl = data_cmpl['Streckenvorhersage.Dauer']
X_cmpl_train, X_cmpl_test, y_cmpl_train, y_cmpl_test = train_test_split(X_cmpl, Y_cmpl, test_size=0.2)
# missing rows to be imputed:
data_incmpl = data.loc[data['Streckenvorhersage.Dauer'] == 0]
X_incmpl = data_incmpl[['Streckenvorhersage.ZielortID', 'Streckenvorhersage.StartortID', 'time', 'weekday']]
Y_incmpl = data_incmpl['Streckenvorhersage.Dauer']

# prediction from the existing data
myANN = Sequential()
myANN.add(Dense(80, activation='relu', input_dim=X_cmpl.shape[1]))
myANN.add(Dense(50, activation='relu'))
myANN.add(Dense(30, activation='relu'))
myANN.add(Dense(1, activation='linear'))
myANN.compile(loss='mean_squared_error', optimizer='adam')
myANN.fit(X_cmpl_train, y_cmpl_train, epochs=100, shuffle=True, verbose=False)
yp = myANN.predict(X_cmpl_test)
yp = np.squeeze(yp)
yDiff = yp - y_cmpl_test
print('Mean deviation on missing data: %e ' % (np.mean(np.abs(yDiff))))

# impute Dauer for each missing row with the median duration of the original rows in the same time slot
for index, row in data_incmpl.iterrows():
    relevant_data = data_cmpl[data_cmpl['time'] == row['time']]
    data_incmpl.loc[index, 'Streckenvorhersage.Dauer'] = relevant_data['Streckenvorhersage.Dauer'].median()
data_incmpl = data_incmpl[~np.isnan(data_incmpl['Streckenvorhersage.Dauer'])]  # drop slots without reference data
X_incmpl = data_incmpl[['Streckenvorhersage.ZielortID', 'Streckenvorhersage.StartortID', 'time', 'weekday']]
Y_incmpl = data_incmpl['Streckenvorhersage.Dauer']

# build a larger training set from the original plus the imputed rows
#X_train, X_test, y_train, y_test = train_test_split(X_cmpl, Y_cmpl, test_size=0.2)
#y_train = pd.DataFrame(data=y_train, columns=['Streckenvorhersage.Dauer'])
#X_train = pd.concat([X_train, X_incmpl])
Y_incmpl = pd.DataFrame(data=Y_incmpl, columns=['Streckenvorhersage.Dauer'])
#y_train = pd.concat([y_train, Y_incmpl])
y_train = pd.DataFrame(data=y_cmpl_train, columns=['Streckenvorhersage.Dauer'])
X_train = pd.concat([X_cmpl_train, X_incmpl])
y_train = pd.concat([y_train, Y_incmpl])

myANN = Sequential()
myANN.add(Dense(80, activation='relu', input_dim=X_cmpl.shape[1]))
myANN.add(Dense(50, activation='relu'))
myANN.add(Dense(30, activation='relu'))
myANN.add(Dense(1, activation='linear'))
myANN.compile(loss='mean_squared_error', optimizer='adam')
# train the ANN on the filled data
myANN.fit(X_train, y_train, epochs=100, shuffle=True, verbose=False)
yp = myANN.predict(X_cmpl_test)
yp = np.squeeze(yp)
yDiff = yp - y_cmpl_test
print('Mean deviation with filled data (median): %e ' % (np.mean(np.abs(yDiff))))

# write the filled data back into one DataFrame
y_test = pd.DataFrame(data=y_cmpl_test, columns=['Streckenvorhersage.Dauer'])
X_all = pd.concat([X_train, X_cmpl_test])
y_all = pd.concat([y_train, y_test])
data = X_all
y_all = np.asarray(y_all)
data['Streckenvorhersage.Dauer'] = y_all
data.to_csv('data_filled(median).csv')  # save the filled data to a new csv
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 26 11:19:34 2021
@author: Christoph
"""
import pandas as pd
import numpy as np

data = pd.read_csv('db_dump.csv', delimiter=';')  # read the csv dump into a pandas DataFrame
relevantData = pd.DataFrame.copy(data[['Streckenvorhersage.ZielortID', 'Streckenvorhersage.StartortID', 'Streckenvorhersage.Dauer']])  # keep the relevant columns
helpSeries = pd.DataFrame(np.zeros(relevantData.shape[0]))  # a zero column used to initialise the added columns
# add columns for the time of day, an index for the day, and the weekday
relevantData['time'] = helpSeries
relevantData['day_index'] = helpSeries
relevantData['weekday'] = helpSeries
dates = data['Streckenvorhersage.Datum']  # pull all date entries into a helper Series for easier handling
for d in range(dates.shape[0]):
    toTest = dates[d]  # split the date into its parts and add up the total seconds
    hours = toTest[11:13]
    minutes = toTest[14:16]
    seconds = int(toTest[17:19]) + int(hours) * 60 * 60 + int(minutes) * 60
    day = toTest[0:2]
    month = toTest[3:5]
    year = toTest[6:10]
    day_index = int(day) + int(month) * 31 + int(year) * 49
    relevantData.at[d, 'time'] = seconds
    relevantData.at[d, 'day_index'] = day_index
    relevantData.at[d, 'weekday'] = pd.to_datetime(year + '-' + month + '-' + day).dayofweek
relevantData.to_csv('data.csv')  # write a new csv containing only the relevant data
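The index positions used in the slicing above imply timestamps of the form DD.MM.YYYY HH:MM:SS; that format is inferred from the code, not stated in the dump. A quick check with a made-up value:

# Quick check of the slicing with a made-up timestamp (format assumed from the
# index positions above: DD.MM.YYYY HH:MM:SS).
toTest = "26.02.2021 07:15:30"
assert toTest[0:2] == "26"      # day
assert toTest[3:5] == "02"      # month
assert toTest[6:10] == "2021"   # year
assert toTest[11:13] == "07"    # hours
assert toTest[14:16] == "15"    # minutes
assert toTest[17:19] == "30"    # seconds
print(int(toTest[17:19]) + int(toTest[11:13]) * 3600 + int(toTest[14:16]) * 60)  # 26130 seconds since midnight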
{
"python.pythonPath": "/usr/bin/python3.8"
}
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error

# load the prepared feature matrix; column 0 is the duration target,
# column 1 the distance, columns 2-25 the encoded features
dataset = np.genfromtxt('Random Forest/data.csv', delimiter=',', skip_header=1)
x = dataset[:, np.arange(2, 26)]
y = dataset[:, 0]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

param_grid = {
    #'bootstrap': [True, False],
    'max_depth': [25, 50, 75, 100, None],
    #'max_features': ["auto", "sqrt", "log2"],
    #'min_samples_leaf': [1, 3, 5],
    #'min_samples_split': [2, 4, 6],
    'n_estimators': [1000],
    'random_state': [42]
}
rf = RandomForestRegressor()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)
y_predict = grid_search.best_estimator_.predict(x_test)
accuracy = (1 - mean_absolute_percentage_error(y_test, y_predict)) * 100
print("Accuracy: " + str(accuracy) + '%')
import pandas as pd

# Build the Random Forest feature table from the raw database dump:
# keep duration, distance, destination, and timestamp, then expand the
# timestamp into calendar features.
df = pd.read_csv("Random Forest/db_dump.csv", delimiter=';', usecols=[
    'Streckenvorhersage.Dauer', 'Streckenvorhersage.Entfernung', 'Streckenvorhersage.ZielortID', 'Streckenvorhersage.Datum'])
df = df.reindex(columns=['Streckenvorhersage.Dauer', 'Streckenvorhersage.Entfernung',
'Streckenvorhersage.ZielortID', 'Streckenvorhersage.Datum'])
df['Streckenvorhersage.Year'] = [pd.Timestamp(
d).year for d in df['Streckenvorhersage.Datum']]
df['Streckenvorhersage.Month'] = [pd.Timestamp(
d).month for d in df['Streckenvorhersage.Datum']]
df['Streckenvorhersage.Day'] = [pd.Timestamp(
d).day for d in df['Streckenvorhersage.Datum']]
df['Streckenvorhersage.DayOfWeek'] = [pd.Timestamp(
d).day_of_week for d in df['Streckenvorhersage.Datum']]
df['Streckenvorhersage.Hour'] = [pd.Timestamp(
d).hour for d in df['Streckenvorhersage.Datum']]
df['Streckenvorhersage.Minutes'] = [pd.Timestamp(
d).minute for d in df['Streckenvorhersage.Datum']]
df.drop(['Streckenvorhersage.Datum'], axis=1, inplace=True)
# One-Hot Encoding
df = pd.concat(
[df, pd.get_dummies(df['Streckenvorhersage.DayOfWeek'], prefix='DayOfWeek')], axis=1)
df.drop(['Streckenvorhersage.DayOfWeek'], axis=1, inplace=True)
# One-Hot Encoding ZielortID
df = pd.concat(
[df, pd.get_dummies(df['Streckenvorhersage.ZielortID'], prefix='ZielortID')], axis=1)
df.drop(['Streckenvorhersage.ZielortID'], axis=1, inplace=True)
df.to_csv("Random Forest/data.csv", index=False)
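The per-column list comprehensions above parse every timestamp once per feature. An equivalent single-pass variant can be sketched with pd.to_datetime and the .dt accessor; the column names below match the script above, the helper name expand_datum is illustrative.

# Equivalent timestamp expansion in one pass, sketched with the .dt accessor;
# column names match the script above, expand_datum is an illustrative name.
import pandas as pd

def expand_datum(df: pd.DataFrame) -> pd.DataFrame:
    ts = pd.to_datetime(df['Streckenvorhersage.Datum'])
    df['Streckenvorhersage.Year'] = ts.dt.year
    df['Streckenvorhersage.Month'] = ts.dt.month
    df['Streckenvorhersage.Day'] = ts.dt.day
    df['Streckenvorhersage.DayOfWeek'] = ts.dt.dayofweek
    df['Streckenvorhersage.Hour'] = ts.dt.hour
    df['Streckenvorhersage.Minutes'] = ts.dt.minute
    return df.drop(columns=['Streckenvorhersage.Datum'])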
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from pprint import pprint
import matplotlib.pyplot as plt

# column 0 is the duration target, column 1 the distance,
# columns 2-25 the encoded features
dataset = np.genfromtxt('Random Forest/data.csv', delimiter=',', skip_header=1)
x = dataset[:, np.arange(2, 26)]
y = dataset[:, 0]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

#model = RandomForestRegressor(random_state=42, n_estimators=750, max_depth=50)
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)
#pprint(model.get_params())

# Accuracy in percent
y_predict = model.predict(x_test)
accuracy = (1 - mean_absolute_percentage_error(y_test, y_predict)) * 100
print("Accuracy: " + str(accuracy) + '%')

# Feature importance without distance: collapse the one-hot weekday and
# destination columns back into single scores
pprint(model.feature_importances_)
feature_list = ["Year", "Month", "Day", "Hour", "Minute", "Weekday", "ZielortID"]
importances = model.feature_importances_[0:5]
wochentag_importance = np.sum(model.feature_importances_[5:12])
importances = np.append(importances, wochentag_importance)
zielort_importance = np.sum(model.feature_importances_[12:])
importances = np.append(importances, zielort_importance)

# Feature importance with distance
#feature_list = ["Distance", "Year", "Month", "Day", "Hour", "Minute", "Weekday", "ZielortID"]
#importances = model.feature_importances_[0:6]
#wochentag_importance = np.sum(model.feature_importances_[6:13])
#importances = np.append(importances, wochentag_importance)
#zielort_importance = np.sum(model.feature_importances_[13:])
#importances = np.append(importances, zielort_importance)

importances = np.round(importances * 100, decimals=2)
#pprint(np.round(model.feature_importances_ * 100, decimals=2))
zipped = list(zip(feature_list, importances))
pprint(zipped)

# Feature importance plot
x_values = list(range(len(importances)))
plt.bar(x_values, importances)
plt.xticks(x_values, feature_list, rotation='vertical')
plt.ylabel('Importance')
plt.xlabel('Feature')
plt.title('Feature Analysis')

# Mean absolute error
print(mean_absolute_error(y_test, y_predict))
# Average duration
print("Average Time: " + str(np.average(y_test)) + " seconds")
# Average distance
print("Average Distance: " + str(np.average(dataset[:, 1])) + " meters")
#plt.show()