agents.py

    import random
    import numpy as np
    from memory import Memory
    from networks import QNet
    
    class QAgent:
        """Deep Q-learning agent with an epsilon-greedy policy and replay memory."""
        gamma = 0.99               # discount factor for future rewards
        epsilon = 1.0              # initial exploration rate
        epsilon_min = 0.01         # lower bound for epsilon
        epsilon_decay = 0.9999     # multiplicative epsilon decay per learning step
        online_batch_size = 64     # batch size for online (per-step) updates
        action_space = 1           # overwritten in __init__ from the environment
        name = "Q"
        OFFLINE_BATCHSIZE = 2048   # batch size for offline updates between episodes
    
        def __init__(self, conf):
            # Build the Q-network and replay memory from the experiment config.
            self.q = QNet(conf)
            self.memory = Memory()
            self.action_space = conf.env.action_space.n
            self.name = conf.name
            self.epsilon_decay = conf.eps_decay
    
        def get_action(self, state):
            """Epsilon-greedy action selection."""
            if np.random.rand() <= self.epsilon:
                return random.randrange(self.action_space)
            action_values = self.q.predict(state)
            return np.argmax(action_values[0])
    
        def remember(self, state, action, reward, following_state, done):
            """Store a transition in the replay memory."""
            self.memory.add(state, action, reward, following_state, done)
    
        def learn(self, offline=False):
            """Update the Q-network from a batch of replayed transitions."""
            batch_size = self.online_batch_size
            epochs = 1

            if offline:
                batch_size = self.OFFLINE_BATCHSIZE

            # Wait until enough transitions are stored for a full batch.
            if len(self.memory.history) < batch_size:
                return

            states, actions, rewards, following_states, dones = self.memory.get_batch(
                batch_size)
            # Bellman target: r + gamma * max_a' Q(s', a'), zeroed for terminal states.
            qMax = rewards + self.gamma * \
                np.amax(self.q.predict_on_batch(following_states), axis=1) * (1 - dones)
            # Only the Q-value of the action actually taken is moved towards the target.
            y = self.q.predict_on_batch(states)
            idx = np.arange(batch_size)
            y[idx, actions] = qMax

            if offline:
                history = self.q.net.fit(states, y, epochs=2, verbose=0)
                loss = history.history['loss'][-1]
            else:
                loss = self.q.fit(states, y, epochs)
            # Decay exploration after each learning step.
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay
            return loss
    
        def save(self, path):
            path += "/" + self.name
            print(path)
            self.q.save(path + '/' + self.name + '.net')
            self.memory.save(path + '/' + self.name + '.mem')

        def load(self, path, net=True, memory=True):
            print(path)
            if net:
                self.q.load(path + '.net')
            if memory:
                self.memory.load(path + '.mem')
    
    class DQAgent(QAgent):
        """Double Q-learning agent: two Q-networks, averaged for action selection."""
        def __init__(self, conf):
            super().__init__(conf)
            self.q2 = QNet(conf)

        def get_action(self, state):
            """Epsilon-greedy selection over the averaged estimates of both networks."""
            if np.random.rand() <= self.epsilon:
                return random.randrange(self.action_space)
            action_values = (self.q.predict(state) + self.q2.predict(state)) / 2
            return np.argmax(action_values[0])
    
        def learn(self, offline=False):
            """Run two update passes, randomly swapping which network is trained."""
            for _ in range(2):
                # Randomly swap the roles of the two networks (double Q-learning).
                if np.random.rand() < 0.5:
                    self.q, self.q2 = self.q2, self.q
                batch_size = self.online_batch_size
                epochs = 1
                if offline:
                    batch_size = self.OFFLINE_BATCHSIZE
                if len(self.memory.history) < batch_size:
                    return
                states, actions, rewards, following_states, dones = self.memory.get_batch(
                    batch_size)
                # Targets are evaluated with the second network, updates go to the first.
                qMax = rewards + self.gamma * \
                    np.amax(self.q2.predict_on_batch(following_states), axis=1) * (1 - dones)
                y = self.q.predict_on_batch(states)
                idx = np.arange(batch_size)
                y[idx, actions] = qMax
                if offline:
                    history = self.q.net.fit(states, y, epochs=2, verbose=0)
                    loss = history.history['loss'][-1]
                else:
                    loss = self.q.fit(states, y, epochs)
                if self.epsilon > self.epsilon_min:
                    self.epsilon *= self.epsilon_decay
            return loss
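
For reference, a minimal sketch of how QAgent might be driven from a training script, assuming a classic Gym-style environment (reset returns the state, step returns a 4-tuple) and a small config object exposing the fields the constructor reads (env, name, eps_decay). TrainConfig, the CartPole environment, and the loop itself are illustrative assumptions, not part of this repository.

    # usage sketch -- illustrative only; TrainConfig, the CartPole env and this loop
    # are assumptions, not part of the repository.
    import gym
    from agents import QAgent

    class TrainConfig:
        """Hypothetical container for the fields QAgent.__init__ reads."""
        def __init__(self, env, name, eps_decay):
            self.env = env
            self.name = name
            self.eps_decay = eps_decay

    env = gym.make('CartPole-v1')
    agent = QAgent(TrainConfig(env, name='Q_cartpole', eps_decay=0.9999))

    for episode in range(100):
        state = env.reset()
        done = False
        while not done:
            # QNet.predict is assumed to expect a leading batch dimension.
            action = agent.get_action(state.reshape(1, -1))
            following_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, following_state, done)
            agent.learn()                  # small online update each step
            state = following_state
        agent.learn(offline=True)          # larger offline update between episodes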