diff --git a/.gitignore b/.gitignore
index 43f67a2b2f59be6440652d7544d6fca3c545a65b..42640483c31d4089cf5c8a964709196beb5d7ec9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,3 @@
 __pycache__
 saved_agents
-*.png
\ No newline at end of file
+*.png
diff --git a/agents.py b/agents.py
index edc276c6cf11a3003ec494f9db6870af456794dc..ede5a6d42aedadb185958ad5df6acb89ba233f85 100644
--- a/agents.py
+++ b/agents.py
@@ -78,7 +78,7 @@ class DQAgent(QAgent):
         return np.argmax(action_values[0])

     def learn(self, offline=False):
-        for _ in range(2):
+        for _ in range(3):
             if np.random.rand() < 0.5:
                 temp = self.q
                 self.q = self.q2
@@ -87,8 +87,8 @@ class DQAgent(QAgent):
         epochs = 1
         if offline:
             batch_size = 4096
-        if len(self.memory.history) < self.online_batch_size:
-            return 0.0
+        if len(self.memory.history) < batch_size:
+            return
         states, actions, rewards, following_states, dones = self.memory.get_batch(batch_size)
         q_max_hat = rewards + self.gamma * (np.amax(self.q2.predict_on_batch(following_states), axis=1)) * (1-dones)
         y = self.q.predict_on_batch(states)
diff --git a/environment_wrapper.py b/environment_wrapper.py
index a9e19205890dc2be581d5e92cc19e1ba9a94a930..f624a2132529e8b87ba554a00376ba9a77fb6285 100644
--- a/environment_wrapper.py
+++ b/environment_wrapper.py
@@ -76,11 +76,11 @@ def run(environment, agent, episodes, render=True, learn=True):

     return score_history, avg_score_history

-def process_logs(avg_score_history, loss, title="Title"):
+def process_logs(avg_score_history, loss, title="Title", render=False):
     """ Plot the log history """
     plt.plot([i+1 for i in range(0, len(loss), 2)], loss[::2])
     plt.plot([i+1 for i in range(0, len(avg_score_history), 2)], avg_score_history[::2], '--')
     plt.title(title)
-    plt.show()
     plt.savefig(title + '.png', format="png")
-
+    if render:
+        plt.show()
diff --git a/main.py b/main.py
index daea0e605118df5365fd8b42e1d4b775cd8c115a..254f2b6bbde25c54cf3ca32e7c1cbb035374e2e1 100644
--- a/main.py
+++ b/main.py
@@ -6,7 +6,7 @@ import os
 import atexit

 import gym
-from agents import QAgent
+from agents import DQAgent as QAgent
 import environment_wrapper as ew

 # Allow GPU usage or force tensorflow to use the CPU.
@@ -21,12 +21,14 @@ if __name__ == '__main__':
     env = gym.make('LunarLander-v2')

     # 2. Create a learning agent
-    marvin = QAgent(env.action_space.n, env.observation_space.shape[0], 'from_scratch')
+    marvin = QAgent(env.action_space.n, env.observation_space.shape[0], 'FromScratchDouble')

     # (2.5) *optional* Load agent memory and/or net from disk.
-    LOAD_MEMORIES = False
+    agnt = 'agent'
     LOAD_ANN = False
-    marvin.load('saved_agents/agent/agent', net=LOAD_ANN, memory=LOAD_MEMORIES)
+    LOAD_MEMORIES = False
+    if LOAD_ANN or LOAD_MEMORIES:
+        marvin.load('saved_agents/' + agnt + '/' + agnt, net=LOAD_ANN, memory=LOAD_MEMORIES)

     # 3. Set your configurations for the run.
     RENDER = False
@@ -60,5 +62,4 @@ if __name__ == '__main__':
     marvin.save(SAVE_PATH)

     # Show the result of the run.
-    if RENDER:
-        ew.process_logs(avg_score, loss, title=marvin.name)
+    ew.process_logs(avg_score, loss, title=marvin.name, render=RENDER)
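
Note on the agents.py hunks: DQAgent.learn() flips a coin, swaps the roles of self.q and self.q2, and bootstraps the target (q_max_hat) from the network that is not being trained in this pass. Below is a minimal tabular NumPy sketch of that double-Q update pattern; the toy state/action sizes, learning rate, and batch layout are illustrative assumptions, not the repo's actual API.

import numpy as np

# Sketch only: two value estimators, a coin flip deciding which one
# is updated, and the *other* one providing the bootstrap target.
N_STATES, N_ACTIONS = 8, 4   # toy sizes (assumed, not from the repo)
GAMMA = 0.99
LR = 0.1

rng = np.random.default_rng(0)
q = np.zeros((N_STATES, N_ACTIONS))
q2 = np.zeros((N_STATES, N_ACTIONS))

def double_q_update(batch):
    """Apply one double-Q update to a batch of transitions."""
    global q, q2
    # Coin flip: swap the estimators' roles, mirroring the
    # `if np.random.rand() < 0.5` swap in DQAgent.learn().
    if rng.random() < 0.5:
        q, q2 = q2, q
    states, actions, rewards, next_states, dones = batch
    # Bootstrap target from the *other* estimator, like `q_max_hat`
    # in the diff: r + gamma * max_a q2(s', a) * (1 - done).
    q_max_hat = rewards + GAMMA * q2[next_states].max(axis=1) * (1 - dones)
    # Move q toward the target for the taken actions only.
    q[states, actions] += LR * (q_max_hat - q[states, actions])

# Fake batch of 5 transitions in the (s, a, r, s', done) layout that
# memory.get_batch() appears to return (an assumption here).
batch = (
    rng.integers(0, N_STATES, 5),         # states
    rng.integers(0, N_ACTIONS, 5),        # actions
    rng.normal(size=5),                   # rewards
    rng.integers(0, N_STATES, 5),         # following states
    rng.integers(0, 2, 5).astype(float),  # dones
)
double_q_update(batch)

Updating one estimator with the other's value estimates is what damps the maximization bias of single-estimator Q-learning, and the coin flip keeps both estimators trained equally often on average.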