# hdf5.py
#%%
import numpy as np
from gensim import utils
import json
import h5py
from w2v_yelp_model import getWordVecModel
#%% w2v model
model_path = "D:\\ml\\full_yelp_w2v_model"
model = getWordVecModel(model_path)
# per-machine base paths for the generated HDF5 file
pathJoel = "C:\\Tsubasaya E\\Uni\\ML\\Hausarbeit\\Dataset\\"
pathSilas = "G:\\ml\\"
path = pathSilas
data_path = "E:\\downloads\\yelp_dataset\\yelp_dataset\\yelp_academic_dataset_review.json"
#data_path = "D:\\ml\\data\\sample1.json"
def getSentenceVectorCNN(sentence):
    """Map a review to a fixed 72x100 matrix of word vectors (zero-padded)."""
    split = utils.simple_preprocess(sentence)
    wordVecs = np.zeros((72, 100))
    i = 0
    for word in split:
        if i == 72:  # truncate long reviews to the first 72 known words
            break
        try:
            wordVecs[i] = model.wv[word]
            i += 1
        except KeyError:  # skip words missing from the w2v vocabulary
            pass
    if np.all(wordVecs[5:] == 0):  # five or fewer known words: reject the review
        raise ValueError('not enough words found in w2v model')
    return wordVecs
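
#%% quick sanity check of getSentenceVectorCNN (illustrative only: the sample
# sentence is made up and this assumes the w2v model cell above has been run)
sampleVec = getSentenceVectorCNN("The food was great and the staff was friendly")
print(sampleVec.shape)  # (72, 100): 72 word slots, 100-dim vectors, zero-padded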
#%%
import h5py
import json
i = 0
with h5py.File(path + "w2vCNN.hdf5", "w") as hf:
    chunkSize = 10_000
    trainChunk = int(chunkSize * 0.6)    # 60% train
    valTestChunk = int(chunkSize * 0.2)  # 20% validation, 20% test
    xTrain, yTrain = [], []
    xVal, yVal = [], []
    xTest, yTest = [], []
    index = 0
    for line in open(data_path, encoding="utf8"):
        json_line = json.loads(line)
        # X data: skip reviews with too few words in the w2v vocabulary
        try:
            xData = getSentenceVectorCNN(json_line["text"])
        except ValueError:
            continue
        # Y data: map stars to three classes (0: negative, 1: neutral, 2: positive)
        y = float(json_line["stars"])
        if y < 3:
            yData = 0
        elif y == 3:
            yData = 1
        else:
            yData = 2
        # first full chunk: create the resizable datasets from the buffers
        if index == chunkSize:
            XTrain = hf.create_dataset("XTrain", data=xTrain, maxshape=(None, 72, 100), chunks=(trainChunk, 72, 100))
            YTrain = hf.create_dataset("YTrain", data=yTrain, maxshape=(None,), chunks=(trainChunk,))
            XVal = hf.create_dataset("XVal", data=xVal, maxshape=(None, 72, 100), chunks=(valTestChunk, 72, 100))
            YVal = hf.create_dataset("YVal", data=yVal, maxshape=(None,), chunks=(valTestChunk,))
            XTest = hf.create_dataset("XTest", data=xTest, maxshape=(None, 72, 100), chunks=(valTestChunk, 72, 100))
            YTest = hf.create_dataset("YTest", data=yTest, maxshape=(None,), chunks=(valTestChunk,))
            # reset buffers
            xTrain, yTrain = [], []
            xVal, yVal = [], []
            xTest, yTest = [], []
        # every later full chunk: grow the datasets and append the buffers
        if index % chunkSize == 0 and index > chunkSize:
            XTrain.resize(XTrain.shape[0] + trainChunk, axis=0)
            XTrain[-trainChunk:] = xTrain
            YTrain.resize(YTrain.shape[0] + trainChunk, axis=0)
            YTrain[-trainChunk:] = yTrain
            XVal.resize(XVal.shape[0] + valTestChunk, axis=0)
            XVal[-valTestChunk:] = xVal
            YVal.resize(YVal.shape[0] + valTestChunk, axis=0)
            YVal[-valTestChunk:] = yVal
            XTest.resize(XTest.shape[0] + valTestChunk, axis=0)
            XTest[-valTestChunk:] = xTest
            YTest.resize(YTest.shape[0] + valTestChunk, axis=0)
            YTest[-valTestChunk:] = yTest
            # reset buffers
            xTrain, yTrain = [], []
            xVal, yVal = [], []
            xTest, yTest = [], []
            print(index)
        # 3-1-1 round robin over i yields the 60/20/20 split
        if i < 3:
            xTrain.append(xData)
            yTrain.append(yData)
        elif i == 3:
            xVal.append(xData)
            yVal.append(yData)
        else:
            xTest.append(xData)
            yTest.append(yData)
            i = -1
        i += 1
        index += 1
    # final flush: write whatever is left in the buffers
    trainChunk = len(xTrain)
    if trainChunk != 0:
        XTrain.resize(XTrain.shape[0] + trainChunk, axis=0)
        XTrain[-trainChunk:] = xTrain
        YTrain.resize(YTrain.shape[0] + trainChunk, axis=0)
        YTrain[-trainChunk:] = yTrain
    valTestChunk = len(xVal)
    if valTestChunk != 0:
        XVal.resize(XVal.shape[0] + valTestChunk, axis=0)
        XVal[-valTestChunk:] = xVal
        YVal.resize(YVal.shape[0] + valTestChunk, axis=0)
        YVal[-valTestChunk:] = yVal
    valTestChunk = len(xTest)
    if valTestChunk != 0:
        XTest.resize(XTest.shape[0] + valTestChunk, axis=0)
        XTest[-valTestChunk:] = xTest
        YTest.resize(YTest.shape[0] + valTestChunk, axis=0)
        YTest[-valTestChunk:] = yTest
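
#%% verify the written file (illustrative: assumes the writer cell above ran
# long enough to create all six datasets)
with h5py.File(path + "w2vCNN.hdf5", "r") as hf:
    for name in ("XTrain", "YTrain", "XVal", "YVal", "XTest", "YTest"):
        print(name, hf[name].shape)  # X sets: (n, 72, 100); Y sets: (n,)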
#%%
import h5py
def hdf5Generator(filePath, batch_size, dataSet, loop=True):
    """Yield (X, Y) batches from the given HDF5 split ("Train", "Val" or "Test")."""
    with h5py.File(filePath, 'r') as hf:
        L = len(hf["X" + dataSet])
        while True:
            batch_start = 0
            batch_end = batch_size
            while batch_end <= L:  # a trailing partial batch is dropped
                X = hf["X" + dataSet][batch_start:batch_end]
                Y = hf["Y" + dataSet][batch_start:batch_end]
                yield (X, Y)  # a tuple of two numpy arrays with batch_size samples
                batch_start += batch_size
                batch_end += batch_size
            if not loop:
                break
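
#%% example use of the generator (illustrative: batch size 32 is arbitrary; the
# yielded tuples can also be passed to e.g. Keras' model.fit with steps_per_epoch)
gen = hdf5Generator(path + "w2vCNN.hdf5", 32, "Train", loop=False)
X, Y = next(gen)
print(X.shape, Y.shape)  # (32, 72, 100) (32,)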