Commit 9e09122b authored by Tobias Döring

Added deep q learning

parent e1336696
File added: agentMemoryDQN.py
import numpy as np
import matplotlib.pyplot as plt
class agentMemory(object):
    def __init__(self, memSize, input_shape, action_shape):
        self.memSize = memSize
        self.mCounter = 0
        # Preallocate ring buffers; input_shape is the stored observation
        # shape (a 1x24 vector for BipedalWalker-v3) rather than a
        # hardcoded (1, 24).
        self._stateM = np.zeros((self.memSize, *input_shape))
        self._nextstateM = np.zeros((self.memSize, *input_shape))
        # Actions are binned floats in [-1, 1], so store them as floats
        # (an integer dtype would truncate them to 0 or +/-1).
        self._actionM = np.zeros((self.memSize, *action_shape), dtype=np.float32)
        self._rewardM = np.zeros(self.memSize)
        self._doneM = np.zeros(self.memSize, dtype=np.uint8)
    def addMemory(self, state, action, reward, nextState, done):
        # Ring buffer: once full, the oldest entries are overwritten.
        idx = self.mCounter % self.memSize
        self.mCounter += 1
        self._stateM[idx] = state
        self._nextstateM[idx] = nextState
        self._actionM[idx] = action
        self._rewardM[idx] = reward
        self._doneM[idx] = done
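
    # Worked example: with memSize=10000 the 10001st call computes
    # idx = 10000 % 10000 = 0 and overwrites the oldest transition, so the
    # buffer always keeps the most recent memSize transitions.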
    def getBatch(self, bSize):
        # Sample a batch of transitions uniformly, without replacement,
        # from the filled part of the buffer.
        maxMem = min(self.mCounter, self.memSize)
        batchIdx = np.random.choice(maxMem, bSize, replace=False)
        states = self._stateM[batchIdx]
        actions = self._actionM[batchIdx]
        rewards = self._rewardM[batchIdx]
        nextStates = self._nextstateM[batchIdx]
        done = self._doneM[batchIdx]
        return states, actions, rewards, nextStates, done
    def showMemory(self, no):
        print('Memory No.', no, 'with memory counter', self.mCounter)
        print('Reward:', self._rewardM[no])
        print('Action:', self._actionM[no])
        print('Done:', self._doneM[no])
        # Stored states are 1x24 observation vectors, not stacked image
        # frames, so plot the vectors directly.
        fig, (ax1, ax2) = plt.subplots(2, 1)
        ax1.plot(self._stateM[no, 0])
        ax1.set_title('state')
        ax2.plot(self._nextstateM[no, 0])
        ax2.set_title('next state')
        plt.show()
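
# A minimal self-test sketch, assuming the BipedalWalker-v3 sizes used
# above: a 1x24 observation and a 4-dimensional action.
if __name__ == '__main__':
    mem = agentMemory(memSize=100, input_shape=[1, 24], action_shape=[4])
    for _ in range(32):
        mem.addMemory(np.random.randn(1, 24), np.random.uniform(-1, 1, 4),
                      reward=0.5, nextState=np.random.randn(1, 24), done=0)
    states, actions, rewards, nextStates, done = mem.getBatch(16)
    print(states.shape, actions.shape)  # (16, 1, 24) (16, 4)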
File added: dqnAgent.py
import time
import numpy as np
from agentMemoryDQN import agentMemory
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
def qFunctionNN(lr, outputs, inputs):
    # Two hidden layers of 24 ReLU units with a linear output head, so the
    # network can predict unbounded Q-values; trained with MSE loss.
    QFunction = Sequential()
    QFunction.add(Dense(24, activation='relu', input_dim=inputs))
    QFunction.add(Dense(24, activation='relu'))
    QFunction.add(Dense(outputs, activation='linear'))
    QFunction.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')
    return QFunction
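
# A quick shape check, as a sketch (assuming the BipedalWalker-v3 sizes:
# 24 observation entries, 4 action components):
#   q = qFunctionNN(lr=0.001, outputs=4, inputs=24)
#   q.predict(np.zeros((1, 24))).shape  # -> (1, 4)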
class dqnAgent(object):
    def __init__(self, lr, gamma, actions, vareps, bSize, observations,
                 epsDec=0.0, epsMin=0.01, memSize=10000, name='Alan', bins=7):
        self.actions = actions
        self.gamma = gamma
        self.vareps = vareps
        self.epsDec = epsDec
        self.epsMin = epsMin
        self.bSize = bSize
        self.bins = bins
        # Store observations with their actual shape instead of a
        # hardcoded [1, 24].
        self.memory = agentMemory(memSize, [1, observations], [actions])
        self.Q = qFunctionNN(lr, actions, observations)
        self.name = name
        self.steps = 0
    def addMemory(self, state, action, reward, nextState, done):
        self.memory.addMemory(state, action, reward, nextState, done)

    def getAction(self, observation):
        # Epsilon-greedy: with probability vareps take a random action
        # (one value per action dimension); otherwise act greedily on the
        # network's Q-value prediction.
        if np.random.random() < self.vareps:
            action = np.random.uniform(-1, 1, self.actions)
        else:
            action = self.Q.predict(observation)[0]
        # Discretize each action component onto a fixed grid of bins values.
        action = self.round_bins(action, self.bins)
        return action

    def round_bins(self, x, bins):
        round_fact = (bins - 1) / 2
        return np.around(x * round_fact) / round_fact
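
    # Worked example: with bins=7, round_fact = (7 - 1) / 2 = 3, so each
    # component is snapped to the grid {-1, -2/3, -1/3, 0, 1/3, 2/3, 1};
    # e.g. round_bins(0.40, 7) = round(1.2) / 3 = 1/3.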
    def learn(self):
        start = time.time()
        # Train only once the replay buffer holds at least one full batch.
        if self.memory.mCounter > self.bSize:
            state, action, r, nextState, done = self.memory.getBatch(self.bSize)
            for i in range(self.bSize):
                # Bootstrap: value of the best next action under the current network.
                next_action = np.amax(self.Q.predict(nextState[i]))
                target = r[i]
                if not done[i]:
                    # Soft target: blend the reward with the discounted
                    # bootstrap estimate at weight 0.1 (instead of the
                    # standard DQN target r + gamma * max Q).
                    target = (1.0 - 0.1) * r[i] + 0.1 * self.gamma * next_action
                y = self.Q.predict(state[i])
                # The scalar target is broadcast across all outputs of this sample.
                y[0] = target
                self.Q.fit(x=state[i], y=y, verbose=0, epochs=1)
            self.steps += 1
        print("learn time: ", time.time() - start)
    def saveCNNs(self):
        fname = self.name + '.h5'
        self.Q.save('Q' + fname)

    def loadCNNs(self):
        fname = self.name + '.h5'
        self.Q = load_model('Q' + fname)
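
# A minimal usage sketch (assuming the BipedalWalker-v3 sizes: 24
# observations, 4 actions):
#   marvin = dqnAgent(lr=0.001, gamma=0.99, actions=4, vareps=1.0,
#                     bSize=16, observations=24)
#   a = marvin.getAction(np.zeros((1, 24)))  # 4 binned values in [-1, 1]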
File added
import numpy as np
from tqdm import tqdm
from dqnAgent import dqnAgent
import gym
import time
from config import *
env = gym.make('BipedalWalker-v3')
nmb_of_actions = env.action_space.shape[0]
nmb_of_obs = env.observation_space.shape[0]
marvin = dqnAgent(gamma=0.99, vareps=1.0, lr=0.001,
                  observations=nmb_of_obs, actions=nmb_of_actions,
                  memSize=25000, epsMin=0.05, bSize=16, epsDec=0.999, bins=7)
rewards = []
epsHistory = []
avg_rewards = []
steps = 0
verbose = False
best_total_reward = -1000
progress = tqdm(range(10000), desc='Training', unit=' episode')
for epoche in progress:
    done = False
    observation = env.reset()
    observation = observation.reshape(1, -1)
    totalReward = 0
    ep_rewards = []
    start = time.time()
    while not done:
        steps += 1
        action = marvin.getAction(observation)
        obs, reward, done, info = env.step(action)
        obs = obs.reshape(1, -1)
        totalReward += reward
        marvin.addMemory(observation, action, reward, obs, int(done))
        if verbose:
            env.render()
        observation = obs
        ep_rewards.append(reward)
        # Cut the episode short if the walker has not earned a reward
        # above 0.1 in the last 50 steps (i.e. it is stuck).
        if len(ep_rewards) > 50 and max(ep_rewards[-50:]) <= 0.1:
            break
    # Checkpoint the network whenever a new best episode return appears.
    if totalReward > best_total_reward:
        marvin.saveCNNs()
        best_total_reward = totalReward
        print("new best walker found")
    marvin.learn()
    # Multiplicative epsilon decay, clipped at epsMin.
    marvin.vareps *= marvin.epsDec
    if marvin.vareps < marvin.epsMin:
        marvin.vareps = marvin.epsMin
    rewards.append(totalReward)
    epsHistory.append(marvin.vareps)
    movingAvr = np.mean(rewards[-20:])
    avg_rewards.append(movingAvr)
    msg = ' Training r=' + str(totalReward)
    msg += ' vareps=' + str(round(marvin.vareps, ndigits=2))
    msg += ' avg=' + str(movingAvr)
    progress.set_description(msg)
    # Save the learning curves every 10 episodes.
    if epoche % 10 == 0:
        np.save("eps.npy", np.array(epsHistory))
        np.save("total_rewards.npy", np.array(rewards))
        np.save("avg_rewards.npy", np.array(avg_rewards))
    if movingAvr > 300:
        break  # solve condition
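
# Back-of-envelope decay check: with epsDec=0.999 the exploration rate
# falls from 1.0 to epsMin=0.05 after about ln(0.05)/ln(0.999) ≈ 2995
# episodes, so roughly the first 3000 episodes retain some exploration.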
# Final greedy rollout: exploration off, render the learned walker.
marvin.vareps = 0
done = False
observation = env.reset()
observation = observation.reshape(1, -1)
totalReward = 0
while not done:
    steps += 1
    action = marvin.getAction(observation)
    obs, reward, done, info = env.step(action)
    obs = obs.reshape(1, -1)
    totalReward += reward
    env.render()
    observation = obs
File added
import numpy as np
from dqnAgent import dqnAgent
import matplotlib.pyplot as plt
import gym

env = gym.make('BipedalWalker-v3')
nmb_of_actions = env.action_space.shape[0]
observation = env.reset()
# vareps=0.0: the loaded agent acts purely greedily during evaluation.
marvin = dqnAgent(gamma=0.99, vareps=0.0, lr=0.001,
                  observations=len(observation), actions=nmb_of_actions,
                  memSize=25000, epsMin=0.02, bSize=32, epsDec=0.001)
marvin.loadCNNs()
total_rewards = np.load("total_rewards.npy", allow_pickle=True)
eps = np.load("eps.npy", allow_pickle=True)
avg_rewards = np.load("avg_rewards.npy", allow_pickle=True)
plt.figure()
plt.title('Total Rewards')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.plot(total_rewards, c='k')
plt.figure()
plt.title('Average Rewards')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.plot(avg_rewards, c='k')
# Overlay a linear trend fitted to the moving-average rewards.
x = np.arange(len(avg_rewards))
m, b = np.polyfit(x, avg_rewards, 1)
plt.plot(x, m * x + b)
plt.figure()
plt.title('Epsilon')
plt.plot(eps, c='k')
plt.show()
# Render ten greedy evaluation episodes with the loaded network.
for i in range(10):
    done = False
    observation = env.reset()
    observation = observation.reshape(1, -1)
    while not done:
        action = marvin.getAction(observation)
        obs, reward, done, info = env.step(action)
        obs = obs.reshape(1, -1)
        env.render()
        observation = obs