Skip to content
Snippets Groups Projects
Select Git revision
  • 2016ws
  • 2024ws default
  • 2023ws
  • 2022ws
  • 2021ws
  • 2020ws
  • 2018ws
  • 2019ws
  • 2017ws
9 results

orbit-1.c

Blame
  • hdf5.py 5.50 KiB
    from gensim.utils import chunkize
    if __name__ == '__main__':
    #%%
        import numpy as np
        from gensim import utils
        import json
        import h5py
        from w2v_yelp_model import getWordVecModel

    #%% w2v model
        # Load the pre-trained word2vec model used to embed the review text.
        model_path = "D:\\ml\\full_yelp_w2v_model"
        model = getWordVecModel(model_path)
        # Machine-specific locations; point `path` at whichever machine runs this.
        # (fixed: was a doubled assignment `pathJoel = pathJoel = ...`)
        pathJoel = "C:\\Tsubasaya E\\Uni\\ML\\Hausarbeit\\Dataset\\"
        pathSilas = "G:\\ml\\"
        path = pathSilas
        data_path ="E:\\downloads\\yelp_dataset\\yelp_dataset\\yelp_academic_dataset_review.json"
        #data_path = "D:\\ml\\data\\sample1.json"
    
        def getSentenceVectorCNN(sentence):
            """Embed a sentence as a fixed-size (72, 100) matrix of word vectors.

            Tokenizes with gensim's simple_preprocess and looks each token up in
            the module-global word2vec `model`, stacking up to the first 72 hits;
            rows for unmatched/missing tokens stay zero.

            Raises:
                Exception: if rows 5.. are still all zero, i.e. fewer than 6
                    tokens were found in the word2vec vocabulary.
            """
            tokens = utils.simple_preprocess(sentence)
            wordVecs = np.zeros((72, 100))
            i = 0
            for word in tokens:
                if i == 72:  # CNN input is capped at 72 tokens
                    break
                try:
                    wordVecs[i] = model.wv[word]
                    i += 1
                except KeyError:
                    # Token is out of the w2v vocabulary; skip it.
                    # (was a bare `except:` that also hid real errors)
                    pass
            if np.all(wordVecs[5:] == 0):
                raise Exception('not enough words found in w2v model')
            return wordVecs
    
    #%%
        # Stream the Yelp review JSON, embed each review with getSentenceVectorCNN,
        # and write a 60/20/20 train/val/test split into one HDF5 file in fixed-size
        # chunks so the whole dataset never has to fit in memory.
        # NOTE(review): h5py and json are re-imported here (cell-style script);
        # both are already imported in the first cell.
        import h5py
        import json
        # i cycles 0..4 per record: 0-2 -> train, 3 -> val, 4 -> test (60/20/20).
        i = 0
        with h5py.File(path + "w2vCNN.hdf5", "w") as hf:
            # Buffer ~10k embedded records between HDF5 writes.
            chunkSize = 10E3
            trainChunk = int(chunkSize * 0.6)    # 6000 train rows per flush
            valTestChunk = int(chunkSize * 0.2)  # 2000 val / 2000 test rows per flush
            # In-memory buffers, flushed to the file every chunkSize records.
            xTrain = []
            yTrain = []
            xVal = []
            yVal = []
            xTest = []
            yTest = []


            # index counts successfully embedded records only (reviews rejected by
            # getSentenceVectorCNN are skipped and do not advance it).
            index = 0
            for line in open(data_path,encoding="utf8"):
                json_line = json.loads(line)
                #X Data
                xData = []
                try:
                    xData = getSentenceVectorCNN(json_line["text"])
                except:
                    # Review had too few in-vocabulary words; drop it.
                    continue
                # Y data: bucket the star rating into three classes
                # (<3 -> 0 negative, ==3 -> 1 neutral, >3 -> 2 positive).
                y = float(json_line["stars"])
                if(y <3):
                    yData = 0
                elif(y==3):
                    yData = 1
                else:
                    yData = 2

                # First flush: create the six datasets from the first full buffers,
                # resizable along axis 0 (maxshape None) so later chunks can append.
                # NOTE(review): if the input yields fewer than chunkSize usable
                # records these datasets are never created and the final flush
                # below fails with a NameError — confirm inputs are large enough.
                if index == chunkSize:
                    XTrain = hf.create_dataset("XTrain", data=xTrain, maxshape=(None, 72, 100), chunks=(trainChunk, 72, 100))
                    YTrain = hf.create_dataset("YTrain", data=yTrain, maxshape=(None,), chunks=(trainChunk,))

                    XVal = hf.create_dataset("XVal", data=xVal, maxshape=(None, 72, 100), chunks=(valTestChunk, 72, 100))
                    YVal = hf.create_dataset("YVal", data=yVal, maxshape=(None,), chunks=(valTestChunk,))

                    XTest = hf.create_dataset("XTest", data=xTest, maxshape=(None, 72, 100), chunks=(valTestChunk, 72, 100))
                    YTest = hf.create_dataset("YTest", data=yTest, maxshape=(None,), chunks=(valTestChunk,))

                    #reset Buffer-Data
                    xTrain = []
                    yTrain = []
                    xVal = []
                    yVal = []
                    xTest = []
                    yTest = []

                # Subsequent flushes: grow each dataset by one chunk and write the
                # buffered rows into the newly appended tail slice.
                if index % chunkSize == 0 and index > chunkSize:
                    XTrain.resize(XTrain.shape[0]+trainChunk, axis=0)
                    XTrain[-trainChunk:] = xTrain

                    YTrain.resize(YTrain.shape[0]+trainChunk, axis=0)
                    YTrain[-trainChunk:] = yTrain

                    XVal.resize(XVal.shape[0]+valTestChunk, axis=0)
                    XVal[-valTestChunk:] = xVal

                    YVal.resize(YVal.shape[0]+valTestChunk, axis=0)
                    YVal[-valTestChunk:] = yVal        

                    XTest.resize(XTest.shape[0]+valTestChunk, axis=0)
                    XTest[-valTestChunk:] = xTest

                    YTest.resize(YTest.shape[0]+valTestChunk, axis=0)
                    YTest[-valTestChunk:] = yTest

                    #reset Buffer-Data
                    xTrain = []
                    yTrain = []
                    xVal = []
                    yVal = []
                    xTest = []
                    yTest = []

                    print(index)  # progress: records written so far

                # Round-robin split: i == 0,1,2 -> train, 3 -> val, otherwise test;
                # i is reset to -1 in the test branch so the `i += 1` below restarts
                # the 5-record cycle at 0.
                if i < 3:
                    xTrain.append(xData)
                    yTrain.append(yData)
                elif i == 3:
                    xVal.append(xData)
                    yVal.append(yData)
                else:
                    xTest.append(xData)
                    yTest.append(yData)
                    i = -1
                i += 1
                index +=1

            # Final flush: append whatever partial buffers remain after the loop
            # (chunk variables are reused here as "rows remaining" counts).
            trainChunk = len(xTrain)
            if trainChunk != 0:

                XTrain.resize(XTrain.shape[0]+trainChunk, axis=0)
                XTrain[-trainChunk:] = xTrain

                YTrain.resize(YTrain.shape[0]+trainChunk, axis=0)
                YTrain[-trainChunk:] = yTrain

            valTestChunk = len(xVal)
            if valTestChunk != 0:

                XVal.resize(XVal.shape[0]+valTestChunk, axis=0)
                XVal[-valTestChunk:] = xVal

                YVal.resize(YVal.shape[0]+valTestChunk, axis=0)
                YVal[-valTestChunk:] = yVal

            valTestChunk = len(xTest)
            if valTestChunk != 0:

                XTest.resize(XTest.shape[0]+valTestChunk, axis=0)
                XTest[-valTestChunk:] = xTest

                YTest.resize(YTest.shape[0]+valTestChunk, axis=0)
                YTest[-valTestChunk:] = yTest
    
    #%%
    import h5py
    def hdf5Generator(filePath, batch_size, dataSet, loop=True):
        """Yield (X, Y) batches of `batch_size` samples from an HDF5 file.

        Reads the paired datasets "X<dataSet>" / "Y<dataSet>" (e.g. dataSet="Train")
        sequentially. A trailing partial batch (fewer than batch_size samples) is
        intentionally dropped so every yielded batch has a fixed size. When `loop`
        is True the generator restarts from the beginning forever (Keras-style);
        otherwise it stops after one pass.

        Yields:
            tuple: (X, Y) — two numpy arrays with batch_size samples each.
        """
        with h5py.File(filePath, 'r') as hf:
            L = len(hf["X" + dataSet])
            while True:
                batch_start = 0
                batch_end = batch_size
                # `<=` so the last *full* batch is yielded when L is an exact
                # multiple of batch_size (the original `<` silently dropped it).
                while batch_end <= L:
                    X = hf["X" + dataSet][batch_start:batch_end]
                    Y = hf["Y" + dataSet][batch_start:batch_end]
                    yield (X, Y)

                    batch_start += batch_size
                    batch_end += batch_size
                if not loop:
                    break