Commit 9e09122b authored by Tobias Döring

Added deep q learning

parent e1336696
File added: agentMemoryDQN.py
import numpy as np
import matplotlib.pyplot as plt
class agentMemory(object):
    def __init__(self, memSize, input_shape, action_shape):
        self.memSize = memSize
        self.mCounter = 0
        # Preallocate ring buffers; input_shape is the stored observation
        # shape (a 1x24 vector for BipedalWalker-v3) rather than a
        # hardcoded (1, 24).
        self._stateM = np.zeros((self.memSize, *input_shape))
        self._nextstateM = np.zeros((self.memSize, *input_shape))
        # Actions are binned floats in [-1, 1], so store them as floats
        # (an integer dtype would truncate them to 0 or +/-1).
        self._actionM = np.zeros((self.memSize, *action_shape), dtype=np.float32)
        self._rewardM = np.zeros(self.memSize)
        self._doneM = np.zeros(self.memSize, dtype=np.uint8)
    def addMemory(self, state, action, reward, nextState, done):
        # Ring buffer: once full, the oldest entries are overwritten.
        idx = self.mCounter % self.memSize
        self.mCounter += 1
        self._stateM[idx] = state
        self._nextstateM[idx] = nextState
        self._actionM[idx] = action
        self._rewardM[idx] = reward
        self._doneM[idx] = done
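
    # Worked example: with memSize=10000 the 10001st call computes
    # idx = 10000 % 10000 = 0 and overwrites the oldest transition, so the
    # buffer always keeps the most recent memSize transitions.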
    def getBatch(self, bSize):
        # Sample a batch of transitions uniformly, without replacement,
        # from the filled part of the buffer.
        maxMem = min(self.mCounter, self.memSize)
        batchIdx = np.random.choice(maxMem, bSize, replace=False)
        states = self._stateM[batchIdx]
        actions = self._actionM[batchIdx]
        rewards = self._rewardM[batchIdx]
        nextStates = self._nextstateM[batchIdx]
        done = self._doneM[batchIdx]
        return states, actions, rewards, nextStates, done
    def showMemory(self, no):
        print('Memory No.', no, 'with memory counter', self.mCounter)
        print('Reward:', self._rewardM[no])
        print('Action:', self._actionM[no])
        print('Done:', self._doneM[no])
        # Stored states are 1x24 observation vectors, not stacked image
        # frames, so plot the vectors directly.
        fig, (ax1, ax2) = plt.subplots(2, 1)
        ax1.plot(self._stateM[no, 0])
        ax1.set_title('state')
        ax2.plot(self._nextstateM[no, 0])
        ax2.set_title('next state')
        plt.show()
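
# A minimal self-test sketch, assuming the BipedalWalker-v3 sizes used
# above: a 1x24 observation and a 4-dimensional action.
if __name__ == '__main__':
    mem = agentMemory(memSize=100, input_shape=[1, 24], action_shape=[4])
    for _ in range(32):
        mem.addMemory(np.random.randn(1, 24), np.random.uniform(-1, 1, 4),
                      reward=0.5, nextState=np.random.randn(1, 24), done=0)
    states, actions, rewards, nextStates, done = mem.getBatch(16)
    print(states.shape, actions.shape)  # (16, 1, 24) (16, 4)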
File added: dqnAgent.py
import time
import numpy as np
from agentMemoryDQN import agentMemory
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
def qFunctionNN(lr, outputs, inputs):
    # Two hidden layers of 24 ReLU units with a linear output head, so the
    # network can predict unbounded Q-values; trained with MSE loss.
    QFunction = Sequential()
    QFunction.add(Dense(24, activation='relu', input_dim=inputs))
    QFunction.add(Dense(24, activation='relu'))
    QFunction.add(Dense(outputs, activation='linear'))
    QFunction.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')
    return QFunction
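
# A quick shape check, as a sketch (assuming the BipedalWalker-v3 sizes:
# 24 observation entries, 4 action components):
#   q = qFunctionNN(lr=0.001, outputs=4, inputs=24)
#   q.predict(np.zeros((1, 24))).shape  # -> (1, 4)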
class dqnAgent(object):
    def __init__(self, lr, gamma, actions, vareps, bSize, observations,
                 epsDec=0.0, epsMin=0.01, memSize=10000, name='Alan', bins=7):
        self.actions = actions
        self.gamma = gamma
        self.vareps = vareps
        self.epsDec = epsDec
        self.epsMin = epsMin
        self.bSize = bSize
        self.bins = bins
        # Store observations with their actual shape instead of a
        # hardcoded [1, 24].
        self.memory = agentMemory(memSize, [1, observations], [actions])
        self.Q = qFunctionNN(lr, actions, observations)
        self.name = name
        self.steps = 0
    def addMemory(self, state, action, reward, nextState, done):
        self.memory.addMemory(state, action, reward, nextState, done)

    def getAction(self, observation):
        # Epsilon-greedy: with probability vareps take a random action
        # (one value per action dimension); otherwise act greedily on the
        # network's Q-value prediction.
        if np.random.random() < self.vareps:
            action = np.random.uniform(-1, 1, self.actions)
        else:
            action = self.Q.predict(observation)[0]
        # Discretize each action component onto a fixed grid of bins values.
        action = self.round_bins(action, self.bins)
        return action

    def round_bins(self, x, bins):
        round_fact = (bins - 1) / 2
        return np.around(x * round_fact) / round_fact
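
    # Worked example: with bins=7, round_fact = (7 - 1) / 2 = 3, so each
    # component is snapped to the grid {-1, -2/3, -1/3, 0, 1/3, 2/3, 1};
    # e.g. round_bins(0.40, 7) = round(1.2) / 3 = 1/3.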
    def learn(self):
        start = time.time()
        # Train only once the replay buffer holds at least one full batch.
        if self.memory.mCounter > self.bSize:
            state, action, r, nextState, done = self.memory.getBatch(self.bSize)
            for i in range(self.bSize):
                # Bootstrap: value of the best next action under the current network.
                next_action = np.amax(self.Q.predict(nextState[i]))
                target = r[i]
                if not done[i]:
                    # Soft target: blend the reward with the discounted
                    # bootstrap estimate at weight 0.1 (instead of the
                    # standard DQN target r + gamma * max Q).
                    target = (1.0 - 0.1) * r[i] + 0.1 * self.gamma * next_action
                y = self.Q.predict(state[i])
                # The scalar target is broadcast across all outputs of this sample.
                y[0] = target
                self.Q.fit(x=state[i], y=y, verbose=0, epochs=1)
            self.steps += 1
        print("learn time: ", time.time() - start)
    def saveCNNs(self):
        fname = self.name + '.h5'
        self.Q.save('Q' + fname)

    def loadCNNs(self):
        fname = self.name + '.h5'
        self.Q = load_model('Q' + fname)
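
# A minimal usage sketch (assuming the BipedalWalker-v3 sizes: 24
# observations, 4 actions):
#   marvin = dqnAgent(lr=0.001, gamma=0.99, actions=4, vareps=1.0,
#                     bSize=16, observations=24)
#   a = marvin.getAction(np.zeros((1, 24)))  # 4 binned values in [-1, 1]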
File added
import numpy as np
from tqdm import tqdm
from dqnAgent import dqnAgent
import gym
import time
from config import *
env = gym.make('BipedalWalker-v3')
nmb_of_actions = env.action_space.shape[0]
nmb_of_obs = env.observation_space.shape[0]
marvin = dqnAgent(gamma=0.99, vareps=1.0, lr=0.001,
                  observations=nmb_of_obs, actions=nmb_of_actions,
                  memSize=25000, epsMin=0.05, bSize=16, epsDec=0.999, bins=7)
rewards = []
epsHistory = []
avg_rewards = []
steps = 0
verbose = False
best_total_reward = -1000
progress = tqdm(range(10000), desc='Training', unit=' episode')
for epoche in progress:
    done = False
    observation = env.reset()
    observation = observation.reshape(1, -1)
    totalReward = 0
    ep_rewards = []
    start = time.time()
    while not done:
        steps += 1
        action = marvin.getAction(observation)
        obs, reward, done, info = env.step(action)
        obs = obs.reshape(1, -1)
        totalReward += reward
        marvin.addMemory(observation, action, reward, obs, int(done))
        if verbose:
            env.render()
        observation = obs
        ep_rewards.append(reward)
        # Cut the episode short if the walker has not earned a reward
        # above 0.1 in the last 50 steps (i.e. it is stuck).
        if len(ep_rewards) > 50 and max(ep_rewards[-50:]) <= 0.1:
            break
    # Checkpoint the network whenever a new best episode return appears.
    if totalReward > best_total_reward:
        marvin.saveCNNs()
        best_total_reward = totalReward
        print("new best walker found")
    marvin.learn()
    # Multiplicative epsilon decay, clipped at epsMin.
    marvin.vareps *= marvin.epsDec
    if marvin.vareps < marvin.epsMin:
        marvin.vareps = marvin.epsMin
    rewards.append(totalReward)
    epsHistory.append(marvin.vareps)
    movingAvr = np.mean(rewards[-20:])
    avg_rewards.append(movingAvr)
    msg = ' Training r=' + str(totalReward)
    msg += ' vareps=' + str(round(marvin.vareps, ndigits=2))
    msg += ' avg=' + str(movingAvr)
    progress.set_description(msg)
    # Save the learning curves every 10 episodes.
    if epoche % 10 == 0:
        np.save("eps.npy", np.array(epsHistory))
        np.save("total_rewards.npy", np.array(rewards))
        np.save("avg_rewards.npy", np.array(avg_rewards))
    if movingAvr > 300:
        break  # solve condition
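
# Back-of-envelope decay check: with epsDec=0.999 the exploration rate
# falls from 1.0 to epsMin=0.05 after about ln(0.05)/ln(0.999) ≈ 2995
# episodes, so roughly the first 3000 episodes retain some exploration.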
# Final greedy rollout: exploration off, render the learned walker.
marvin.vareps = 0
done = False
observation = env.reset()
observation = observation.reshape(1, -1)
totalReward = 0
while not done:
    steps += 1
    action = marvin.getAction(observation)
    obs, reward, done, info = env.step(action)
    obs = obs.reshape(1, -1)
    totalReward += reward
    env.render()
    observation = obs
File added
import numpy as np
from dqnAgent import dqnAgent
import matplotlib.pyplot as plt
import gym

env = gym.make('BipedalWalker-v3')
nmb_of_actions = env.action_space.shape[0]
observation = env.reset()
# vareps=0.0: the loaded agent acts purely greedily during evaluation.
marvin = dqnAgent(gamma=0.99, vareps=0.0, lr=0.001,
                  observations=len(observation), actions=nmb_of_actions,
                  memSize=25000, epsMin=0.02, bSize=32, epsDec=0.001)
marvin.loadCNNs()
total_rewards = np.load("total_rewards.npy", allow_pickle=True)
eps = np.load("eps.npy", allow_pickle=True)
avg_rewards = np.load("avg_rewards.npy", allow_pickle=True)
plt.figure()
plt.title('Total Rewards')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.plot(total_rewards, c='k')
plt.figure()
plt.title('Average Rewards')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.plot(avg_rewards, c='k')
# Overlay a linear trend fitted to the moving-average rewards.
x = np.arange(len(avg_rewards))
m, b = np.polyfit(x, avg_rewards, 1)
plt.plot(x, m * x + b)
plt.figure()
plt.title('Epsilon')
plt.plot(eps, c='k')
plt.show()
# Render ten greedy evaluation episodes with the loaded network.
for i in range(10):
    done = False
    observation = env.reset()
    observation = observation.reshape(1, -1)
    while not done:
        action = marvin.getAction(observation)
        obs, reward, done, info = env.step(action)
        obs = obs.reshape(1, -1)
        env.render()
        observation = obs