Commit 63bb3dba authored by Silas Dohm

renamed files, worked on hd5 cnn

parent e4ea3122
......@@ -85,7 +85,7 @@ print("Der Durschnitt der Textlaenge ist: %d"%(np.mean(X)))
\end{lstlisting}
The required libraries are imported in lines 0 and 1.
In line 2, X is defined as an empty list.
In line 3, we iterate over every line of the dataset as before.
......@@ -93,7 +93,7 @@ In der Schleife parsen wir in Zeile 4 die aktuelle Zeile des Datensatzes und Kon
After the loop has finished, we obtain a list containing all review lengths.
In line 6 we use the NumPy function median to output the median review length. Then, in line 7, we output the average using the NumPy function mean.
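As the listing is only partially shown here, the following is merely a sketch of such a length computation; the word-level tokenization and the variable data\_path (assumed to be defined earlier) are assumptions rather than the original code:
\begin{lstlisting}
import json
import numpy as np
X = []
for line in open(data_path, encoding="utf8"):
    review = json.loads(line)
    X.append(len(review["text"].split()))
print("The median of the text length is: %d" % (np.median(X)))
print("The average of the text length is: %d" % (np.mean(X)))
\end{lstlisting}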
\wip{
The median of the text length is: 78
The average of the text length is: 110.
......
\subsection{Mean Vector Classification Model}
%Average-vector classification model ??
The Word2Vec model forms a vector space in which similar words lie close to
each other.
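The corresponding classification code is not shown here; as a sketch (the variable model for the gensim Word2Vec model and the use of simple\_preprocess are assumptions), the feature vector of a review under this approach is simply the mean of its word vectors:
\begin{lstlisting}
import numpy as np
from gensim import utils

def mean_vector(model, text):
    # average the Word2Vec vectors of all tokens found in the vocabulary
    vecs = [model.wv[w] for w in utils.simple_preprocess(text) if w in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.wv.vector_size)
\end{lstlisting}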
......@@ -189,7 +190,6 @@ confusion_matrix(Y_test,y_pred,normalize='true')
\end{center}
\caption{Confusion matrix with class weighting}
\end{table}
The same model without class weighting achieves an accuracy
of $85.7\%$; however, looking at the confusion matrix in Table \ref{tab:conf_no_w},
one can see that only $27\%$ of the neutral reviews were classified correctly there.
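The training code itself is not shown here; the following sketch only illustrates the comparison, with a logistic regression as a stand-in classifier and X\_train, Y\_train, X\_test, Y\_test assumed to hold the mean vectors and labels:
\begin{lstlisting}
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# with class weighting: the neutral class is weighted up
weighted = LogisticRegression(max_iter=1000, class_weight='balanced')
weighted.fit(X_train, Y_train)
print(confusion_matrix(Y_test, weighted.predict(X_test), normalize='true'))

# without class weighting: higher overall accuracy, but only a small
# fraction of the neutral reviews is classified correctly
unweighted = LogisticRegression(max_iter=1000)
unweighted.fit(X_train, Y_train)
print(confusion_matrix(Y_test, unweighted.predict(X_test), normalize='true'))
\end{lstlisting}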
......
......@@ -58,14 +58,13 @@
%The min, mid and max values
\newcommand*{\MinNumber}{0.0}%
\newcommand*{\MidNumber}{0.5}%
\newcommand*{\MaxNumber}{1.0}%
%Apply the gradient macro
\usepackage{xstring}
\newcommand{\ApplyGradient}[1]{%
\IfDecimal{#1}{
\pgfmathsetmacro{\PercentColor}{max(min(70.0*(#1 - \MinNumber)/(\MaxNumber-\MinNumber),100.0),0.00)} %
......@@ -75,9 +74,9 @@
}
\newcolumntype{R}{>{\collectcell\ApplyGradient}c<{\endcollectcell}}
\renewcommand{\arraystretch}{1.5}
\setlength{\fboxsep}{3mm} % box size
\setlength{\tabcolsep}{5pt}
%-----------------------
\newcommand{\noteable}[1]{\textit{#1}}
......
#%%
data_path = "E:\\downloads\\yelp_dataset\\yelp_dataset\\yelp_academic_dataset_review.json"
data_path = "G:\\ml\\yelp_academic_dataset_review.json"
#%% data-structure
for index, line in enumerate(open(data_path, encoding="utf8")):
    if index > 1:
......
if __name__ == '__main__':
    #%%
    import numpy as np
    from gensim import utils
    import json
    import h5py
    from w2v_yelp_model import getWordVecModel
    #%% w2v model
    model_path = "D:\\ml\\full_yelp_w2v_model"
    model = getWordVecModel(model_path)
    pathJoel = "C:\\Tsubasaya E\\Uni\\ML\\Hausarbeit\\Dataset\\"
    pathSilas = "G:\\ml\\"
    path = pathSilas
    data_path = "E:\\downloads\\yelp_dataset\\yelp_dataset\\yelp_academic_dataset_review.json"
    def getSentenceVectorCNN(sentence):
        # map a review to a fixed 72x100 matrix: up to 72 tokens, each
        # replaced by its 100-dimensional word2vec vector, zero-padded
        split = utils.simple_preprocess(sentence)
        wordVecs = np.zeros((72, 100))
        i = 0
        for word in split:
            if i == 72: break
            try:
                wordVecs[i] = model.wv[word]
                i += 1
            except KeyError:
                pass  # word not in the word2vec vocabulary
        if np.all(wordVecs[5:] == 0):
            raise Exception('not enough words found in w2v model')
        return wordVecs
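    # Reviews for which fewer than six tokens were found in the word2vec
    # vocabulary are rejected by the check above, so near-empty inputs
    # never reach the CNN training data.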
    #%%
    import h5py
    import json
    i = 0
    with h5py.File(path + "w2vCNN.hdf5", "w") as hf:
        chunkSize = 10**4
        trainChunk = int(chunkSize * 0.6)
        valTestChunk = int(chunkSize * 0.2)
        xTrain = []
        yTrain = []
        xVal = []
        yVal = []
        xTest = []
        yTest = []
        index = 0
        for line in open(data_path, encoding="utf8"):
            json_line = json.loads(line)
            #X Data
            xData = []
            try:
                xData = getSentenceVectorCNN(json_line["text"])
            except Exception:
                continue  # skip reviews with too few words in the w2v vocabulary
            y = float(json_line["stars"])
            if y < 3:
                yData = 0
            elif y == 3:
                yData = 1
            else:
                yData = 2
            if index == chunkSize:
                # first full chunk: create resizable datasets (maxshape=None along axis 0)
                XTrain = hf.create_dataset("XTrain", data=xTrain, maxshape=(None, 72, 100), chunks=(trainChunk, 72, 100))
                YTrain = hf.create_dataset("YTrain", data=yTrain, maxshape=(None,), chunks=(trainChunk,))
                XVal = hf.create_dataset("XVal", data=xVal, maxshape=(None, 72, 100), chunks=(valTestChunk, 72, 100))
                YVal = hf.create_dataset("YVal", data=yVal, maxshape=(None,), chunks=(valTestChunk,))
                XTest = hf.create_dataset("XTest", data=xTest, maxshape=(None, 72, 100), chunks=(valTestChunk, 72, 100))
                YTest = hf.create_dataset("YTest", data=yTest, maxshape=(None,), chunks=(valTestChunk,))
                #reset Buffer-Data
                xTrain = []
                yTrain = []
                xVal = []
                yVal = []
                xTest = []
                yTest = []
            if index % chunkSize == 0 and index > chunkSize:
                # every further full chunk: append the buffers to the datasets
                XTrain.resize(XTrain.shape[0]+trainChunk, axis=0)
                XTrain[-trainChunk:] = xTrain
                YTrain.resize(YTrain.shape[0]+trainChunk, axis=0)
                YTrain[-trainChunk:] = yTrain
                XVal.resize(XVal.shape[0]+valTestChunk, axis=0)
                XVal[-valTestChunk:] = xVal
                YVal.resize(YVal.shape[0]+valTestChunk, axis=0)
                YVal[-valTestChunk:] = yVal
                XTest.resize(XTest.shape[0]+valTestChunk, axis=0)
                XTest[-valTestChunk:] = xTest
                YTest.resize(YTest.shape[0]+valTestChunk, axis=0)
                YTest[-valTestChunk:] = yTest
                #reset Buffer-Data
                xTrain = []
                yTrain = []
                xVal = []
                yVal = []
                xTest = []
                yTest = []
                print(index)
            # 3-1-1 cycle: 60% train, 20% validation, 20% test
            if i < 3:
                xTrain.append(xData)
                yTrain.append(yData)
            elif i == 3:
                xVal.append(xData)
                yVal.append(yData)
            else:
                xTest.append(xData)
                yTest.append(yData)
                i = -1
            i += 1
            index += 1
        # flush whatever is left in the buffers; the remainder is smaller than
        # a full chunk, so append each buffer by its actual length
        for ds, buf in ((XTrain, xTrain), (YTrain, yTrain), (XVal, xVal),
                        (YVal, yVal), (XTest, xTest), (YTest, yTest)):
            if len(buf) > 0:
                ds.resize(ds.shape[0] + len(buf), axis=0)
                ds[-len(buf):] = buf
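# Resulting file layout (as written above): w2vCNN.hdf5 contains six
# resizable datasets -- XTrain/YTrain, XVal/YVal, XTest/YTest -- holding
# roughly 60/20/20 percent of the accepted reviews; each X row is a
# 72x100 matrix of word vectors, each Y entry a label in {0, 1, 2}.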
#%%
import h5py
def hdf5Generator(filePath, batch_size, dataSet):
    with h5py.File(filePath, 'r') as hf:
        L = len(hf["X" + dataSet])
        while True:
            batch_start = 0
            batch_end = batch_size
            while batch_start < L:
                limit = min(batch_end, L)
                X = hf["X" + dataSet][batch_start:limit]
                Y = hf["Y" + dataSet][batch_start:limit]
                yield (X, Y)  # a tuple with two numpy arrays with batch_size samples
                batch_start += batch_size
                batch_end += batch_size
\ No newline at end of file
#%%
from gensim import utils
import pandas as pd
import json
pathJoel = "C:\\Tsubasaya E\\Uni\\ML\\Hausarbeit\\Dataset\\"
#pathSilas = "C:\\Users\\sls21\\Documents\\Uni\\word2vec\\"
pathSilas = "G:\\ml\\"
path = pathSilas
model_path = "D:\\ml\\full_yelp_w2v_model"
#data_path ="C:\\Users\\sls21\\Documents\\Uni\\word2vec\\sample.json"
data_path ="E:\\downloads\\yelp_dataset\\yelp_dataset\\yelp_academic_dataset_review.json"
#%% w2v model
from w2v_yelp_model import getWordVecModel
model = getWordVecModel(model_path)
#%%
import numpy as np
import h5py
def getSentenceVectorCNN(sentence):
    split = utils.simple_preprocess(sentence)
    wordVecs = np.zeros((72, 100))
    i = 0
    for word in split:
        if i == 72: break
        try:
            wordVecs[i] = model.wv[word]
            i += 1
        except KeyError:
            pass  # word not in the word2vec vocabulary
    #if wordVecs == np.zeros((72,100)): #maybe don't allow sentences with fewer than n word vectors
    #    raise Exception('words not found in w2v model')
    return wordVecs
corpus_path = path + "sample.json"
i = 0
with h5py.File(path + "w2vCNN.hdf5", "w") as hf:
    chunkSize = 10**4
    trainChunk = int(chunkSize * 0.6)
    valTestChunk = int(chunkSize * 0.2)
    xTrain = []
    yTrain = []
    xVal = []
    yVal = []
    xTest = []
    yTest = []
    for index, line in enumerate(open(data_path, encoding="utf8")):
        json_line = json.loads(line)
        #X Data
        xData = []
        try:
            xData = getSentenceVectorCNN(json_line["text"])
        except Exception:
            continue
        y = float(json_line["stars"])
        if y < 3:
            yData = 0
        elif y == 3:
            yData = 1
        else:
            yData = 2
        if index == chunkSize:
            XTrain = hf.create_dataset("XTrain", data=xTrain, maxshape=(None, 72, 100), chunks=(trainChunk, 72, 100))
            YTrain = hf.create_dataset("YTrain", data=yTrain, maxshape=(None,), chunks=(trainChunk,))
            XVal = hf.create_dataset("XVal", data=xVal, maxshape=(None, 72, 100), chunks=(valTestChunk, 72, 100))
            YVal = hf.create_dataset("YVal", data=yVal, maxshape=(None,), chunks=(valTestChunk,))
            XTest = hf.create_dataset("XTest", data=xTest, maxshape=(None, 72, 100), chunks=(valTestChunk, 72, 100))
            YTest = hf.create_dataset("YTest", data=yTest, maxshape=(None,), chunks=(valTestChunk,))
            #reset Buffer-Data
            xTrain = []
            yTrain = []
            xVal = []
            yVal = []
            xTest = []
            yTest = []
        if index % chunkSize == 0 and index > chunkSize:
            # index counts every input line, including skipped reviews, so the
            # buffers are usually smaller than a full chunk; append them by
            # their actual length
            XTrain.resize(XTrain.shape[0] + len(xTrain), axis=0)
            XTrain[-len(xTrain):] = xTrain
            YTrain.resize(YTrain.shape[0] + len(yTrain), axis=0)
            YTrain[-len(yTrain):] = yTrain
            XVal.resize(XVal.shape[0] + len(xVal), axis=0)
            XVal[-len(xVal):] = xVal
            YVal.resize(YVal.shape[0] + len(yVal), axis=0)
            YVal[-len(yVal):] = yVal
            XTest.resize(XTest.shape[0] + len(xTest), axis=0)
            XTest[-len(xTest):] = xTest
            YTest.resize(YTest.shape[0] + len(yTest), axis=0)
            YTest[-len(yTest):] = yTest
            #reset Buffer-Data
            xTrain = []
            yTrain = []
            xVal = []
            yVal = []
            xTest = []
            yTest = []
            print(index)
        if i < 3:
            xTrain.append(xData)
            yTrain.append(yData)
        elif i == 3:
            xVal.append(xData)
            yVal.append(yData)
        else:
            xTest.append(xData)
            yTest.append(yData)
            i = -1
        i += 1
    # flush the remaining buffers by their actual length
    for ds, buf in ((XTrain, xTrain), (YTrain, yTrain), (XVal, xVal),
                    (YVal, yVal), (XTest, xTest), (YTest, yTest)):
        if len(buf) > 0:
            ds.resize(ds.shape[0] + len(buf), axis=0)
            ds[-len(buf):] = buf
#%%
import h5py
def hdf5Generator(filePath, batch_size, dataSet):
    with h5py.File(filePath, 'r') as hf:
        L = len(hf["X" + dataSet])
        while True:
            batch_start = 0
            batch_end = batch_size
            while batch_start < L:
                limit = min(batch_end, L)
                X = hf["X" + dataSet][batch_start:limit]
                Y = hf["Y" + dataSet][batch_start:limit]
                yield (X, Y)  # a tuple with two numpy arrays with batch_size samples
                batch_start += batch_size
                batch_end += batch_size
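# The generator above loops over the requested split ("Train", "Val" or
# "Test") indefinitely and yields (X, Y) batches, which is what the Keras
# fit() calls below expect when steps_per_epoch is given.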
#%% CNN
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.layers import Conv1D,MaxPooling1D,GlobalMaxPooling1D
from tensorflow import keras
modelNN = Sequential()
modelNN.add(Conv1D(32, 7, activation='relu', input_shape=(72, 100)))
modelNN.add(Conv1D(32, 7, activation='relu'))
#modelNN.add(GlobalMaxPooling1D())
modelNN.add(Flatten())
modelNN.add(Dense(512,activation='relu'))
modelNN.add(Dense(128,activation='relu'))
#modelNN.add(Dense(50,activation='relu',input_dim=X[0].size))
modelNN.add(Dense(10,activation='relu'))
modelNN.add(Dense(3,activation='softmax'))
modelNN.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=["sparse_categorical_accuracy"])
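# Architecture summary: two 1-D convolutions slide over the 72-token
# sequence (100 features per token), the result is flattened and passed
# through dense layers down to a 3-way softmax; sparse categorical
# crossentropy matches the integer labels 0/1/2 written to the HDF5 file.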
#%%
num_rows = 340000
batchSize = 512
steps = num_rows // batchSize  # whole batches per epoch
#early stop
earlystop = keras.callbacks.EarlyStopping(monitor='sparse_categorical_accuracy', patience=10, verbose=False, restore_best_weights=True)
cbList = [earlystop]  # note: currently not passed to fit() below
trainData = hdf5Generator(path + "w2vCNN.hdf5", batchSize, "Train")
valData = hdf5Generator(path + "w2vCNN.hdf5", batchSize, "Val")
hist = modelNN.fit(trainData, validation_data=valData, epochs=12, steps_per_epoch=steps, validation_steps=steps)
#hist = modelNN.fit(hdf5Generator("vectors.hdf5", batchSize),epochs=15, steps_per_epoch=steps)
#hist = modelNN.fit(hdf5Generator("vectors.hdf5", batchSize),epochs=15,batch_size=batchSize,callbacks=cbList)
#modelNN.fit(train,epochs=12,validation_data=val,batch_size=batchSize,steps_per_epoch= num_rows/batchSize,callbacks=cbList,validation_steps=num_rows/batchSize)
# %%
#%% CNN
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.layers import Conv1D,MaxPooling1D,GlobalMaxPooling1D
from tensorflow import keras
modelNN = Sequential()
modelNN.add(Conv1D(32, 7, activation='relu', input_shape=(72, 100)))
modelNN.add(Conv1D(32, 7, activation='relu'))
#modelNN.add(GlobalMaxPooling1D())
modelNN.add(Flatten())
modelNN.add(Dense(512,activation='relu'))
modelNN.add(Dense(128,activation='relu'))
#modelNN.add(Dense(50,activation='relu',input_dim=X[0].size))
modelNN.add(Dense(10,activation='relu'))
modelNN.add(Dense(3,activation='softmax'))
modelNN.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=["sparse_categorical_accuracy"])
#%%
from hdf5 import hdf5Generator
path = "G:\\ml\\"
num_rows = 340000
batchSize = 512
steps = num_rows // batchSize  # whole batches per epoch
#early stop
earlystop = keras.callbacks.EarlyStopping(monitor='sparse_categorical_accuracy', patience=10, verbose=False, restore_best_weights=True)
cbList = [earlystop]  # note: currently not passed to fit() below
trainData = hdf5Generator(path + "w2vCNN.hdf5", batchSize, "Train")
valData = hdf5Generator(path + "w2vCNN.hdf5", batchSize, "Val")
hist = modelNN.fit(trainData, validation_data=valData, epochs=12, steps_per_epoch=steps, validation_steps=steps)
#hist = modelNN.fit(hdf5Generator("vectors.hdf5", batchSize),epochs=15, steps_per_epoch=steps)
#hist = modelNN.fit(hdf5Generator("vectors.hdf5", batchSize),epochs=15,batch_size=batchSize,callbacks=cbList)
#modelNN.fit(train,epochs=12,validation_data=val,batch_size=batchSize,steps_per_epoch= num_rows/batchSize,callbacks=cbList,validation_steps=num_rows/batchSize)
# %%
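#%% (added sketch) evaluating on the held-out "Test" split; this cell is an
# assumption, not part of the original script: the test split is roughly one
# third the size of the training split, so the step count below is only an
# estimate and may need adjusting.
testData = hdf5Generator(path + "w2vCNN.hdf5", batchSize, "Test")
testSteps = max(1, (num_rows // 3) // batchSize)
testLoss, testAcc = modelNN.evaluate(testData, steps=testSteps)
print("test sparse_categorical_accuracy: %.3f" % testAcc)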
File moved
File moved