diff --git a/DeepQLearning/QAlan.h5 b/DeepQLearning/QAlan.h5
new file mode 100644
index 0000000000000000000000000000000000000000..2da24aae343566dab91d78aa207a4b4442a5c758
Binary files /dev/null and b/DeepQLearning/QAlan.h5 differ
diff --git a/DeepQLearning/agentMemoryDQN.py b/DeepQLearning/agentMemoryDQN.py
new file mode 100644
index 0000000000000000000000000000000000000000..efa250f02ada9a322dc012cb2222365928cffb51
--- /dev/null
+++ b/DeepQLearning/agentMemoryDQN.py
@@ -0,0 +1,47 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+class agentMemory(object):
+    """Ring-buffer replay memory for transitions (s, a, r, s', done)."""
+    def __init__(self, memSize, input_shape, action_shape):
+        self.memSize = memSize
+        self.mCounter = 0
+        self._stateM = np.zeros((self.memSize, *input_shape))
+        self._nextstateM = np.zeros((self.memSize, *input_shape))
+        # Binned actions are still continuous-valued, so store them as floats.
+        self._actionM = np.zeros((self.memSize, *action_shape), dtype=np.float32)
+        self._rewardM = np.zeros(self.memSize)
+        self._doneM = np.zeros(self.memSize, dtype=np.uint8)
+
+    def addMemory(self, state, action, reward, nextState, done):
+        # Overwrite the oldest entry once the buffer is full.
+        idx = self.mCounter % self.memSize
+        self.mCounter += 1
+        self._stateM[idx] = state
+        self._nextstateM[idx] = nextState
+        self._actionM[idx] = action
+        self._rewardM[idx] = reward
+        self._doneM[idx] = done
+
+    def getBatch(self, bSize):
+        # Sample uniformly from the filled part of the buffer.
+        maxMem = min(self.mCounter, self.memSize)
+        batchIdx = np.random.choice(maxMem, bSize, replace=False)
+        states = self._stateM[batchIdx]
+        actions = self._actionM[batchIdx]
+        rewards = self._rewardM[batchIdx]
+        nextStates = self._nextstateM[batchIdx]
+        done = self._doneM[batchIdx]
+        return states, actions, rewards, nextStates, done
+
+    def showMemory(self, no):
+        print('Memory No.', no, ' with memory counter', self.mCounter)
+        print('Reward:', self._rewardM[no])
+        print('Action', self._actionM[no])
+        print('Done', self._doneM[no])
+        # States are 24-dimensional observation vectors, so plot them directly.
+        fig, (ax1, ax2) = plt.subplots(1, 2)
+        ax1.set_title('state')
+        ax1.plot(self._stateM[no].flatten())
+        ax2.set_title('next state')
+        ax2.plot(self._nextstateM[no].flatten())
diff --git a/DeepQLearning/avg_rewards.npy b/DeepQLearning/avg_rewards.npy
new file mode 100644
index 0000000000000000000000000000000000000000..b29c2e13eb4e751404fd5dbb5a4e24454952c747
Binary files /dev/null and b/DeepQLearning/avg_rewards.npy differ
diff --git a/DeepQLearning/config.py b/DeepQLearning/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/DeepQLearning/dqnAgent.py b/DeepQLearning/dqnAgent.py
new file mode 100644
index 0000000000000000000000000000000000000000..c28c90a143566c83c355070b8e807d660fef54ab
--- /dev/null
+++ b/DeepQLearning/dqnAgent.py
@@ -0,0 +1,81 @@
+import time
+import numpy as np
+from agentMemoryDQN import agentMemory
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.models import Sequential, load_model
+from tensorflow.keras.optimizers import Adam
+
+def qFunctionNN(lr, outputs, inputs):
+    # Small fully connected network: observation vector in, one value per
+    # action dimension out.
+    QFunction = Sequential()
+    QFunction.add(Dense(24, activation='relu', input_dim=inputs))
+    QFunction.add(Dense(24, activation='relu'))
+    QFunction.add(Dense(outputs, activation='linear'))
+    QFunction.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')
+    return QFunction
+
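+# Design note (an observation, not part of the original code): a textbook DQN
+# over this binned action space would enumerate all bins**4 joint actions and
+# emit one Q-value per combination, e.g. the 7**4 = 2401 tuples produced by
+# itertools.product(np.linspace(-1, 1, bins), repeat=4). The network above
+# instead predicts one continuous value per joint and snaps it to a bin.
+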
+class dqnAgent(object):
+    def __init__(self, lr, gamma, actions, vareps, bSize, observations,
+                 epsDec=0.0, epsMin=0.01, memSize=10000, name='Alan', bins=7):
+        self.actions = actions
+        self.gamma = gamma
+        self.vareps = vareps
+        self.epsDec = epsDec
+        self.epsMin = epsMin
+        self.bSize = bSize
+        self.bins = bins
+        self.memory = agentMemory(memSize, [1, 24], [actions])
+        self.Q = qFunctionNN(lr, actions, observations)
+        self.name = name
+        self.steps = 0
+
+    def addMemory(self, state, action, reward, nextState, done):
+        self.memory.addMemory(state, action, reward, nextState, done)
+
+    def getAction(self, observation):
+        # Epsilon-greedy: a random action vector with probability vareps,
+        # otherwise the network's prediction; both are snapped to bins.
+        if np.random.random() < self.vareps:
+            action = np.random.uniform(-1, 1, self.actions)
+        else:
+            action = self.Q.predict(observation)[0]
+        action = self.round_bins(action, self.bins)
+        return action
+
+    def round_bins(self, x, bins):
+        # Round each component of x in [-1, 1] to one of `bins` levels.
+        round_fact = (bins - 1) / 2
+        return np.around(x * round_fact) / round_fact
+
+    def learn(self):
+        start = time.time()
+        if self.memory.mCounter > self.bSize:
+            state, action, r, nextState, done = self.memory.getBatch(self.bSize)
+            for i in range(self.bSize):
+                # Bootstrap with the largest predicted output for the next
+                # state; the 0.9/0.1 weighting softens the update.
+                next_action = np.amax(self.Q.predict(nextState[i]))
+                target = r[i]
+                if not done[i]:
+                    target = (1.0 - 0.1) * r[i] + 0.1 * self.gamma * next_action
+                # Broadcasting assigns the scalar target to every output.
+                y = self.Q.predict(state[i])
+                y[0] = target
+                self.Q.fit(x=state[i], y=y, verbose=0, epochs=1)
+            self.steps += 1
+        print("learn time: ", time.time() - start)
+
+    def saveCNNs(self):
+        fname = self.name + '.h5'
+        self.Q.save('Q' + fname)
+
+    def loadCNNs(self):
+        fname = self.name + '.h5'
+        self.Q = load_model('Q' + fname)
\ No newline at end of file
diff --git a/DeepQLearning/eps.npy b/DeepQLearning/eps.npy
new file mode 100644
index 0000000000000000000000000000000000000000..71b0b9ae479a32785e4724abbbdf2d70b6998b56
Binary files /dev/null and b/DeepQLearning/eps.npy differ
diff --git a/DeepQLearning/main.py b/DeepQLearning/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..370e2ba990e05be46c65dad8fa1c18a9e83b3bed
--- /dev/null
+++ b/DeepQLearning/main.py
@@ -0,0 +1,88 @@
+import numpy as np
+from tqdm import tqdm
+from dqnAgent import dqnAgent
+import gym
+import time
+
+from config import *
+
+env = gym.make('BipedalWalker-v3')
+nmb_of_actions = env.action_space.shape[0]
+nmb_of_obs = env.observation_space.shape[0]
+marvin = dqnAgent(gamma=0.99, vareps=1.0, lr=0.001,
+                  observations=nmb_of_obs, actions=nmb_of_actions, memSize=25000,
+                  epsMin=0.05, bSize=16, epsDec=0.999, bins=7)
+
+rewards = []
+epsHistory = []
+avg_rewards = []
+steps = 0
+verbose = False
+best_total_reward = -1000
+
+progress = tqdm(range(10000), desc='Training', unit=' episode')
+for episode in progress:
+    done = False
+    observation = env.reset()
+    observation = observation.reshape(1, -1)
+    totalReward = 0
+    ep_rewards = []
+    start = time.time()
+    while not done:
+        steps += 1
+        action = marvin.getAction(observation)
+        obs, reward, done, info = env.step(action)
+        obs = obs.reshape(1, -1)
+        totalReward += reward
+        marvin.addMemory(observation, action, reward, obs, int(done))
+        if verbose: env.render()
+        observation = obs
+
+        # Abort the episode early if the walker has stalled: no reward
+        # above 0.1 anywhere in the last 50 steps.
+        ep_rewards.append(reward)
+        if len(ep_rewards) > 50 and max(ep_rewards[-50:]) <= 0.1:
+            break
+
+    if totalReward > best_total_reward:
+        marvin.saveCNNs()
+        best_total_reward = totalReward
+        print("new best walker found")
+
+    marvin.learn()
+
+    # Multiplicative epsilon decay, clipped at epsMin.
+    marvin.vareps *= marvin.epsDec
+    if marvin.vareps < marvin.epsMin:
+        marvin.vareps = marvin.epsMin
+
+    rewards.append(totalReward)
+    epsHistory.append(marvin.vareps)
+    movingAvr = np.mean(rewards[-20:])
+    avg_rewards.append(movingAvr)
+    msg = ' Training r=' + str(totalReward)
+    msg += ' vareps=' + str(round(marvin.vareps, ndigits=2))
+    msg += ' avg=' + str(movingAvr)
+    progress.set_description(msg)
+    if episode % 10 == 0:
+        np.save("eps.npy", np.array(epsHistory))
+        np.save("total_rewards.npy", np.array(rewards))
+        np.save("avg_rewards.npy", np.array(avg_rewards))
+    if movingAvr > 300:
+        break  # solve condition: BipedalWalker counts as solved at 300
+
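+# Final greedy rollout: with vareps = 0 the agent always takes the network's
+# predicted action, so this renders the best policy found during training.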
+marvin.vareps = 0
+done = False
+observation = env.reset()
+observation = observation.reshape(1, -1)
+totalReward = 0
+while not done:
+    steps += 1
+    action = marvin.getAction(observation)
+    obs, reward, done, info = env.step(action)
+    obs = obs.reshape(1, -1)
+    totalReward += reward
+    env.render()
+    observation = obs
\ No newline at end of file
diff --git a/DeepQLearning/show_agent.py b/DeepQLearning/show_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..5496929f22803295979307d9f260af31b7be21d1
--- /dev/null
+++ b/DeepQLearning/show_agent.py
@@ -0,0 +1,46 @@
+import numpy as np
+from dqnAgent import dqnAgent
+import matplotlib.pyplot as plt
+import gym
+
+env = gym.make('BipedalWalker-v3')
+nmb_of_actions = env.action_space.shape[0]
+observation = env.reset()
+# vareps=0: the agent acts greedily, so the loaded network is shown as-is.
+marvin = dqnAgent(gamma=0.99, vareps=0.0, lr=0.001,
+                  observations=len(observation), actions=nmb_of_actions, memSize=25000,
+                  epsMin=0.02, bSize=32, epsDec=0.001)
+marvin.loadCNNs()
+
+total_rewards = np.load("total_rewards.npy", allow_pickle=True)
+eps = np.load("eps.npy", allow_pickle=True)
+avg_rewards = np.load("avg_rewards.npy", allow_pickle=True)
+
+plt.figure()
+plt.title('Total Rewards')
+plt.plot(total_rewards, c='k')
+plt.figure()
+plt.title('Average Rewards')
+plt.xlabel('Episode')
+plt.ylabel('Reward')
+plt.plot(avg_rewards, c='k')
+# Linear trend line over the moving-average rewards.
+x = np.arange(len(avg_rewards))
+m, b = np.polyfit(x, avg_rewards, 1)
+plt.plot(x, m*x + b)
+plt.figure()
+plt.title('Epsilon')
+plt.plot(eps, c='k')
+plt.show()
+
+# Render ten greedy episodes with the loaded network.
+for i in range(10):
+    done = False
+    observation = env.reset()
+    observation = observation.reshape(1, -1)
+    while not done:
+        action = marvin.getAction(observation)
+        obs, reward, done, info = env.step(action)
+        obs = obs.reshape(1, -1)
+        env.render()
+        observation = obs
\ No newline at end of file
diff --git a/DeepQLearning/total_rewards.npy b/DeepQLearning/total_rewards.npy
new file mode 100644
index 0000000000000000000000000000000000000000..8e6ed028a4bc03d2a03067b68a934b45f8e3d65c
Binary files /dev/null and b/DeepQLearning/total_rewards.npy differ
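
A quick way to sanity-check the saved network outside of the training scripts (a minimal sketch; it assumes the QAlan.h5 file written by dqnAgent.saveCNNs() above sits in the working directory):

    import numpy as np
    from tensorflow.keras.models import load_model

    Q = load_model('QAlan.h5')   # weights saved by dqnAgent.saveCNNs()
    obs = np.zeros((1, 24))      # placeholder 24-dim BipedalWalker observation
    print(Q.predict(obs))        # four linear outputs, one per joint torque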