diff --git a/DeepQLearning/agentMemoryDQN.py b/DeepQLearning/agentMemoryDQN.py
index efa250f02ada9a322dc012cb2222365928cffb51..3a3eb864c0cc4c710095b77161e6ba84a0eb2319 100644
--- a/DeepQLearning/agentMemoryDQN.py
+++ b/DeepQLearning/agentMemoryDQN.py
@@ -1,45 +1,46 @@
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 
 class agentMemory(object):
     def __init__(self, memSize, input_shape, action_shape):
         self.memSize = memSize
-        self.mCounter= 0
-        self._stateM = np.zeros((self.memSize, 1, 24))
-        self._nextstateM = np.zeros((self.memSize, 1, 24))
-        self._actionM = np.zeros((self.memSize, *action_shape), dtype=np.int32)
-        self._rewardM = np.zeros(self.memSize)
-        self._doneM = np.zeros(self.memSize, dtype=np.uint8)
+        self.mCounter = 0
+        self._stateM = np.zeros((self.memSize, 1, 24))
+        self._nextstateM = np.zeros((self.memSize, 1, 24))
+        self._actionM = np.zeros((self.memSize, *action_shape), dtype=np.int32)
+        self._rewardM = np.zeros(self.memSize)
+        self._doneM = np.zeros(self.memSize, dtype=np.uint8)
 
     def addMemory(self, state, action, reward, nextState, done):
-        idx = self.mCounter% self.memSize
-        self.mCounter+= 1
-        self._stateM[idx] = state
-        self._nextstateM [idx] = nextState
-        self._actionM [idx] = action
-        self._rewardM[idx] = reward
-        self._doneM[idx] = done
-
+        idx = self.mCounter % self.memSize
+        self.mCounter += 1
+        self._stateM[idx] = state
+        self._nextstateM[idx] = nextState
+        self._actionM[idx] = action
+        self._rewardM[idx] = reward
+        self._doneM[idx] = done
+
     def getBatch(self, bSize):
-        maxMem = min(self.mCounter, self.memSize)
-        batchIdx = np.random.choice(maxMem, bSize, replace=False)
-        states = self._stateM[batchIdx]
-        actions = self._actionM [batchIdx]
-        rewards = self._rewardM[batchIdx]
-        nextStates = self._nextstateM [batchIdx]
-        done = self._doneM[batchIdx]
+        maxMem = min(self.mCounter, self.memSize)
+        batchIdx = np.random.choice(maxMem, bSize, replace=False)
+        states = self._stateM[batchIdx]
+        actions = self._actionM[batchIdx]
+        rewards = self._rewardM[batchIdx]
+        nextStates = self._nextstateM[batchIdx]
+        done = self._doneM[batchIdx]
         return states, actions, rewards, nextStates, done
-
-    def showMemory(self,no):
-        print('Memory No.',no,' with memory counter',self.mCounter )
-        print('Reward:',self._rewardM[no])
+
+    def showMemory(self, no):
+        print('Memory No.', no, ' with memory counter', self.mCounter)
+        print('Reward:', self._rewardM[no])
         print('Action', self._actionM[no])
         print('Done', self._doneM[no])
         fig = plt.figure()
         for i in range(4):
-            ax = fig.add_subplot(1,4,i+1)
-            ax.imshow(self._stateM[no,:,:,i])
+            ax = fig.add_subplot(1, 4, i + 1)
+            ax.imshow(self._stateM[no, :, :, i])
         fig = plt.figure()
         for i in range(4):
-            ax = fig.add_subplot(1,4,i+1)
-            ax.imshow(self._nextstateM[no,:,:,i])
+            ax = fig.add_subplot(1, 4, i + 1)
+            ax.imshow(self._nextstateM[no, :, :, i])
diff --git a/DeepQLearning/averageRewards.png b/DeepQLearning/averageRewards.png
new file mode 100644
index 0000000000000000000000000000000000000000..21aeb85a6be3319c245a734a509e3e4e46366312
Binary files /dev/null and b/DeepQLearning/averageRewards.png differ
diff --git a/DeepQLearning/config.py b/DeepQLearning/config.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/DeepQLearning/dqnAgent.py b/DeepQLearning/dqnAgent.py
index c28c90a143566c83c355070b8e807d660fef54ab..b6e3ea4b6250883f3c9054111271860084efa770 100644
--- a/DeepQLearning/dqnAgent.py
+++ b/DeepQLearning/dqnAgent.py
@@ -1,32 +1,36 @@
 import time
+
 import numpy as np
-from agentMemoryDQN import agentMemory
-from tensorflow.keras.layers import Dense, Activation, Conv2D, Flatten
+from tensorflow.keras.layers import Dense
 from tensorflow.keras.models import Sequential, load_model
 from tensorflow.keras.optimizers import Adam
 
+from agentMemoryDQN import agentMemory
+
+
 def qFunctionNN(lr, outputs, inputs):
     QFunction = Sequential()
     QFunction.add(Dense(24, activation='relu', input_dim=inputs))
-    QFunction.add(Dense(24, activation='relu') )
+    QFunction.add(Dense(24, activation='relu'))
     QFunction.add(Dense(outputs, activation='linear'))
     QFunction.compile(optimizer=Adam(lr=lr), loss='mean_squared_error')
     return QFunction
+
 
 class dqnAgent(object):
     def __init__(self, lr, gamma, actions, vareps, bSize, observations,
-                 epsDec=0.0, epsMin=0.01, memSize=10000, name='Alan', bins = 7):
+                 epsDec=0.0, epsMin=0.01, memSize=10000, name='Alan', bins=7):
         self.actions = actions
-        self.gamma = gamma
-        self.vareps = vareps
-        self.epsDec = epsDec
-        self.epsMin = epsMin
-        self.bSize = bSize
+        self.gamma = gamma
+        self.vareps = vareps
+        self.epsDec = epsDec
+        self.epsMin = epsMin
+        self.bSize = bSize
         self.bins = bins
-        self.memory = agentMemory(memSize, [1,24], [actions])
-        self.Q = qFunctionNN(lr, actions, observations)
-        self.name = name
-        self.steps = 0
+        self.memory = agentMemory(memSize, [1, 24], [actions])
+        self.Q = qFunctionNN(lr, actions, observations)
+        self.name = name
+        self.steps = 0
 
     def addMemory(self, state, action, reward, nextState, done):
         self.memory.addMemory(state, action, reward, nextState, done)
@@ -41,7 +45,7 @@ class dqnAgent(object):
 
     def round_bins(self, x, bins):
         round_fact = (bins - 1) / 2
-        return np.around(x*round_fact)/round_fact
+        return np.around(x * round_fact) / round_fact
 
     def learn(self):
         start = time.time()
@@ -52,18 +56,18 @@ class dqnAgent(object):
             next_action = np.amax(self.Q.predict(nextState[i]))
             target = r[i]
             if not done[i]:
-                target = (1.0 - 0.1) * r[i] + 0.1 * self.gamma*next_action
+                target = (1.0 - 0.1) * r[i] + 0.1 * self.gamma * next_action
             y = self.Q.predict(state[i])
             y[0] = target
-            history = self.Q.fit(x=state[i], y=y, verbose=0, epochs=1)
+            self.Q.fit(x=state[i], y=y, verbose=0, epochs=1)
         self.steps += 1
         print("learn time: ", time.time() - start)
 
 
     def saveCNNs(self):
-        fname = self.name+'.h5'
-        self.Q.save('Q'+fname)
+        fname = self.name + '.h5'
+        self.Q.save('Q' + fname)
 
     def loadCNNs(self):
-        fname = self.name+'.h5'
-        self.Q = load_model('Q'+fname)
\ No newline at end of file
+        fname = self.name + '.h5'
+        self.Q = load_model('Q' + fname)
diff --git a/DeepQLearning/main.py b/DeepQLearning/main.py
index 370e2ba990e05be46c65dad8fa1c18a9e83b3bed..73595036d219e438ce4fe7de2977d410e2e140a8 100644
--- a/DeepQLearning/main.py
+++ b/DeepQLearning/main.py
@@ -1,10 +1,10 @@
+import time
+
+import gym
 import numpy as np
 from tqdm import tqdm
-from dqnAgent import dqnAgent
-import gym
-import time
-from config import *
 
+from dqnAgent import dqnAgent
 
 env = gym.make('BipedalWalker-v3')
 nmb_of_actions = env.action_space.shape[0]
@@ -13,13 +13,14 @@ marvin = dqnAgent(gamma=0.99, vareps=1.0, lr=0.001, observations=nmb_of_obs,
                   actions=nmb_of_actions, memSize=25000,
                   epsMin=0.05, bSize=16, epsDec=0.999, bins=7)
 
-rewards = []; epsHistory = []
+rewards = [];
+epsHistory = []
 avg_rewards = []
 steps = 0
 verbose = False
 best_total_reward = -1000
 
-progress = tqdm(range(10000),desc='Training',unit=' episode')
+progress = tqdm(range(10000), desc='Training', unit=' episode')
 for epoche in progress:
     done = False
     observation = env.reset()
@@ -34,7 +35,7 @@ for epoche in progress:
         obs = obs.reshape(1, -1)
         totalReward += reward
         marvin.addMemory(observation, action, reward, obs, int(done))
-        if verbose : env.render()
+        if verbose: env.render()
         observation = obs
 
         ep_rewards.append(reward)
@@ -48,7 +49,7 @@ for epoche in progress:
 
     marvin.learn()
 
-    marvin.vareps *= marvin.epsDec 
+    marvin.vareps *= marvin.epsDec
     if marvin.vareps < marvin.epsMin:
         marvin.vareps = marvin.epsMin
 
@@ -56,15 +57,15 @@ for epoche in progress:
     epsHistory.append(marvin.vareps)
     movingAvr = np.mean(rewards[-20:])
     avg_rewards.append(movingAvr)
-    msg =' Training r='+str(totalReward)
-    msg +=' vareps='+ str(round(marvin.vareps,ndigits=2))
-    msg += ' avg='+str(movingAvr)
+    msg = ' Training r=' + str(totalReward)
+    msg += ' vareps=' + str(round(marvin.vareps, ndigits=2))
+    msg += ' avg=' + str(movingAvr)
     progress.set_description(msg)
-    if epoche % 10 == 0: 
+    if epoche % 10 == 0:
         np.save("eps.npy", np.array(epsHistory))
         np.save("total_rewards.npy", np.array(rewards))
         np.save("avg_rewards.npy", np.array(avg_rewards))
-    if movingAvr>300: break # solve condition
+    if movingAvr > 300: break # solve condition
 
 marvin.vareps = 0
 done = False
@@ -77,9 +78,6 @@ while not done:
     obs, reward, done, info = env.step(action)
    obs = obs.reshape(1, -1)
     totalReward += reward
-    #marvin.addMemory(observation, action_ind, reward, obs, int(done))
-    env.render() 
+    # marvin.addMemory(observation, action_ind, reward, obs, int(done))
+    env.render()
     observation = obs
-
-
-
\ No newline at end of file
diff --git a/DeepQLearning/show_agent.py b/DeepQLearning/show_agent.py
index 5496929f22803295979307d9f260af31b7be21d1..3c99443d0716101189fe47a52eec7771f902712e 100644
--- a/DeepQLearning/show_agent.py
+++ b/DeepQLearning/show_agent.py
@@ -1,18 +1,17 @@
+import gym
+import matplotlib.pyplot as plt
 import numpy as np
-from tqdm import tqdm
+
 from dqnAgent import dqnAgent
-import matplotlib.pyplot as plt
-import gym
 
 
 env = gym.make('BipedalWalker-v3')
 nmb_of_actions = env.action_space.shape[0]
 observation = env.reset()
 marvin = dqnAgent(gamma=0.99, vareps=0.0, lr=0.001,
                   observations=len(observation), actions=nmb_of_actions, memSize=25000,
-                  epsMin=0.02, bSize=32, replace=1000, epsDec=0.001)
+                  epsMin=0.02, bSize=32, epsDec=0.001)
 marvin.loadCNNs()
-
 total_rewards = np.load("total_rewards.npy", allow_pickle=True)
 eps = np.load("eps.npy", allow_pickle=True)
 avg_rewards = np.load("avg_rewards.npy", allow_pickle=True)
@@ -27,7 +26,7 @@ plt.ylabel('Reward')
 plt.plot(avg_rewards, c='k')
 x = np.arange(len(avg_rewards))
 m, b = np.polyfit(x, avg_rewards, 1)
-plt.plot(x, m*x + b)
+plt.plot(x, m * x + b)
 plt.figure()
 plt.title('Epsilon')
 plt.plot(eps, c='k')
@@ -41,4 +40,4 @@ for i in range(10):
     action = marvin.getAction(observation)
     obs, reward, done, info = env.step(action)
     obs = obs.reshape(1, -1)
-    env.render()
\ No newline at end of file
+    env.render()
diff --git a/requirements.txt b/requirements.txt
index 0c85215b839de1a44c1b47e1fc37c70c6b9032b7..f1eb1c0700b456e7528501523d68d32fb9a21185 100644
Binary files a/requirements.txt and b/requirements.txt differ