Commit ab185eb3 authored by Armin Co

DoubleQAgent uses both networks to predict the action.

parent ad48f62a
@@ -33,22 +33,22 @@ class QAgent:
         if offline:
             batch_size = 4096
-        if len(self.memory.history) < self.online_batch_size:
+        if len(self.memory.history) < batch_size:
             return
         states, actions, rewards, following_states, dones = self.memory.get_batch(
             batch_size)
-        targets = rewards + self.gamma * \
+        qMax = rewards + self.gamma * \
             (np.amax(self.q.predict_on_batch(following_states), axis=1)) * (1-dones)
-        q_targets = self.q.predict_on_batch(states)
+        y = self.q.predict_on_batch(states)
         idx = np.array([i for i in range(batch_size)])
-        q_targets[[idx], [actions]] = targets
+        y[[idx], [actions]] = qMax
         if offline:
-            history = self.q.net.fit(states, q_targets, epochs=2, verbose=0)
+            history = self.q.net.fit(states, y, epochs=2, verbose=0)
             loss = history.history['loss'][-1]
         else:
-            loss = self.q.fit(states, q_targets, epochs)
+            loss = self.q.fit(states, y, epochs)
         if self.epsilon > self.epsilon_min:
             self.epsilon *= self.epsilon_decay
         return loss
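For context, the target construction in this hunk can be sketched in isolation: for each sampled transition, only the entry of the action that was actually taken is overwritten with the Bellman target r + gamma * max_a Q(s', a), while all other entries keep the network's own prediction, so they contribute no error. The sketch below is not repository code; batch values and array names are illustrative stand-ins for the predict_on_batch results.

import numpy as np

# Minimal sketch (assumed data, not from the repository) of the target update above.
batch_size, n_actions, gamma = 4, 2, 0.99
rewards = np.array([1.0, 0.0, 0.5, 1.0])
dones = np.array([0, 0, 1, 0])
actions = np.array([0, 1, 1, 0])
q_next = np.random.rand(batch_size, n_actions)  # stand-in for q.predict_on_batch(following_states)
y = np.random.rand(batch_size, n_actions)       # stand-in for q.predict_on_batch(states)

# Bellman target, zeroing the bootstrap term on terminal transitions.
q_max = rewards + gamma * np.amax(q_next, axis=1) * (1 - dones)
idx = np.arange(batch_size)
y[idx, actions] = q_max                         # same effect as y[[idx], [actions]] = qMax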
@@ -59,10 +59,11 @@ class QAgent:
         self.q.save(path+'/' + self.name + '.net')
         self.memory.save(path+'/' + self.name + '.mem')
 
-    def load(self, path, net=False):
+    def load(self, path, net=True, memory=True):
         print(path)
         if net:
             self.q.load(path+'.net')
-        self.memory.load(path+'.mem')
+        if memory:
+            self.memory.load(path+'.mem')
 
 class DQAgent(QAgent):
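With the new signature, network weights and replay memory can be restored independently. A hypothetical call pattern is shown below; the constructor arguments and checkpoint prefix are made up for illustration and assume QAgent is importable from the project's module.

# Hypothetical usage sketch of the extended load() flags (paths and arguments are illustrative).
agent = QAgent(action_space=4, state_space=8, name='lunar')
agent.load('saved_agents/lunar', net=True, memory=False)  # restore weights only, skip the replay memory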
@@ -70,38 +71,34 @@ class DQAgent(QAgent):
         super().__init__(action_space, state_space, name)
         self.q2 = QNet(action_space, state_space)
 
-    def learn(self, offline=False):
+    def get_action(self, state):
+        if np.random.rand() <= self.epsilon:
+            return random.randrange(self.action_space)
+        action_values = (self.q.predict(state) + self.q2.predict(state)) / 2
+        return np.argmax(action_values[0])
+
+    def learn(self, offline=False):
         for _ in range(2):
             if np.random.rand() < 0.5:
                 temp = self.q
                 self.q = self.q2
                 self.q2 = temp
             batch_size = self.online_batch_size
             epochs = 1
             if offline:
                 batch_size = 4096
             if len(self.memory.history) < self.online_batch_size:
-                return
+                return 0.0
             states, actions, rewards, following_states, dones = self.memory.get_batch(batch_size)
-            targets = rewards + self.gamma * (np.amax(self.q2.predict_on_batch(following_states), axis=1)) * (1-dones)
-            q_targets = self.q.predict_on_batch(states)
+            q_max_hat = rewards + self.gamma * (np.amax(self.q2.predict_on_batch(following_states), axis=1)) * (1-dones)
+            y = self.q.predict_on_batch(states)
             idx = np.array([i for i in range(batch_size)])
-            q_targets[[idx], [actions]] = targets
+            y[[idx], [actions]] = q_max_hat
             if offline:
-                history = self.q.net.fit(states, q_targets, epochs=2, verbose=0)
+                history = self.q.net.fit(states, y, epochs=2, verbose=0)
                 loss = history.history['loss'][-1]
             else:
-                loss = self.q.fit(states, q_targets, epochs)
+                loss = self.q.fit(states, y, epochs)
             if self.epsilon > self.epsilon_min:
                 self.epsilon *= self.epsilon_decay
             return loss
\ No newline at end of file
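The new get_action method is what the commit message refers to: both networks score the state and the greedy action is taken over their mean, which is the standard way a double estimator reduces the overestimation bias of a single Q-network. The snippet below is a standalone sketch of that selection rule with made-up Q-value arrays standing in for self.q.predict(state) and self.q2.predict(state); it is not the repository's class.

import random
import numpy as np

# Minimal sketch of the averaged, epsilon-greedy action selection added in this commit.
# q1_values / q2_values stand in for the (1, action_space) outputs of the two networks.
def get_action(q1_values, q2_values, epsilon, action_space):
    if np.random.rand() <= epsilon:
        return random.randrange(action_space)       # explore with probability epsilon
    action_values = (q1_values + q2_values) / 2     # average both estimators
    return np.argmax(action_values[0])              # greedy action over the mean estimate

print(get_action(np.array([[0.1, 0.9]]), np.array([[0.2, 0.4]]), epsilon=0.05, action_space=2))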