# Agents.py

from pacman_env import *
from tqdm import tqdm
class QLearningAgent():
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Action-value estimates are stored in ``self.q``, a dict that maps each
    state seen so far to a list of ``num_actions`` values. Entries are created
    lazily (all zeros) the first time a state is encountered, so a state only
    has to be hashable.
    """

    def agent_init(self, agent_init_info):
        """Setup for the agent called when the experiment first starts.

        Args:
        agent_init_info (dict), the parameters used to initialize the agent. The dictionary contains:
        {
            num_actions (int): The number of actions,
            epsilon (float): The epsilon parameter for exploration,
            step_size (float): The step-size,
            discount (float): The discount factor,
            seed (int, optional): Seed for the agent's random generator;
                when absent the generator is seeded non-deterministically,
        }
        Any other key (e.g. ``num_states``) is ignored.
        """
        # Store the parameters provided in agent_init_info.
        self.num_actions = agent_init_info["num_actions"]
        self.epsilon = agent_init_info["epsilon"]
        self.step_size = agent_init_info["step_size"]
        self.discount = agent_init_info["discount"]
        self.rand_generator = np.random.RandomState(agent_init_info.get("seed"))

        # Action-value table: state -> list with one estimate per action.
        self.q = {}

    def _q_values(self, state):
        """Return the action-value list of `state`, creating it on first visit."""
        # Sized from num_actions so every selectable action has a slot.
        return self.q.setdefault(state, [0.0] * self.num_actions)

    def _select_action(self, q_values):
        """Pick an action epsilon-greedily with respect to `q_values`."""
        if self.rand_generator.rand() < self.epsilon:
            return self.rand_generator.randint(self.num_actions)  # explore
        return self.argmax(q_values)  # exploit

    def agent_start(self, state):
        """The first method called when the episode starts, called after
        the environment starts.
        Args:
            state (hashable): the initial state of the episode.
        Returns:
            action (int): the first action the agent takes.
        """
        action = self._select_action(self._q_values(state))

        self.prev_state = state
        self.prev_action = action
        return action

    def agent_step(self, reward, state):
        """A step taken by the agent.
        Args:
            reward (float): the reward received for taking the last action taken
            state (hashable): the state the agent ended up in after the
                last step.
        Returns:
            action (int): the action the agent is taking.
        """
        next_q = self._q_values(state)
        action = self._select_action(next_q)

        # Q-learning update: bootstrap from the largest action value of the
        # new state, whatever action was actually selected above. The target
        # is computed before the write, which matters when state == prev_state.
        prev_q = self.q[self.prev_state]
        td_target = reward + self.discount * max(next_q)
        prev_q[self.prev_action] += self.step_size * (td_target - prev_q[self.prev_action])

        self.prev_state = state
        self.prev_action = action
        return action

    def agent_end(self, reward):
        """Run when the agent terminates.
        Args:
            reward (float): the reward the agent received for entering the
                terminal state.
        """
        # No bootstrap term: the terminal state contributes no future value.
        prev_q = self.q[self.prev_state]
        prev_q[self.prev_action] += self.step_size * (reward - prev_q[self.prev_action])

    def argmax(self, q_values):
        """argmax with random tie-breaking
        Args:
            q_values (sequence of float): the action-values of one state
        Returns:
            action (int): an action with the highest value
        """
        top = float("-inf")
        ties = []

        for action, value in enumerate(q_values):
            if value > top:
                top = value
                ties = []

            if value == top:
                ties.append(action)

        # Always drawn through the generator, even for a single candidate,
        # so the random stream stays reproducible for a given seed.
        return self.rand_generator.choice(ties)

    def set_epsilon(self, value):
        """Replace the exploration rate used by later action selections."""
        self.epsilon = value

class SarsaAgent():
    """Tabular Expected Sarsa agent with epsilon-greedy action selection.

    Despite the class name, the update in `agent_step` bootstraps from the
    expectation of the next state's action values under the epsilon-greedy
    policy, not from the value of the single action that was sampled.

    Action-value estimates are stored in ``self.q``, a dict that maps each
    state seen so far to a list of ``num_actions`` values, created lazily
    (all zeros) on first visit.
    """

    def agent_init(self, agent_init_info):
        """Setup for the agent called when the experiment first starts.

        Args:
            agent_init_info (dict): must provide ``num_actions`` (int),
                ``epsilon`` (float), ``step_size`` (float) and ``discount``
                (float). ``seed`` (int) is optional; when absent the random
                generator is seeded non-deterministically.
        """
        self.num_actions = agent_init_info["num_actions"]
        self.epsilon = agent_init_info["epsilon"]
        self.step_size = agent_init_info["step_size"]
        self.discount = agent_init_info["discount"]
        self.rand_generator = np.random.RandomState(agent_init_info.get("seed"))
        # Action-value table: state -> list with one estimate per action.
        self.q = {}

    def _q_values(self, state):
        """Return the action-value list of `state`, creating it on first visit."""
        # Sized from num_actions so every selectable action has a slot.
        return self.q.setdefault(state, [0.0] * self.num_actions)

    def _select_action(self, q_values):
        """Pick an action epsilon-greedily with respect to `q_values`."""
        if self.rand_generator.rand() < self.epsilon:
            return self.rand_generator.randint(self.num_actions)  # explore
        return self.argmax(q_values)  # exploit

    def _expected_value(self, q_values):
        """Return the expectation of `q_values` under the epsilon-greedy policy."""
        best_q = max(q_values)
        greedy_count = sum(1 for value in q_values if value == best_q)
        # Every action receives the exploration share; the greedy actions
        # additionally split the remaining (1 - epsilon) mass evenly.
        explore_prob = self.epsilon / self.num_actions
        greedy_prob = (1 - self.epsilon) / greedy_count + explore_prob
        return sum(
            value * (greedy_prob if value == best_q else explore_prob)
            for value in q_values
        )

    def agent_start(self, state):
        """Choose the first action of an episode.

        Args:
            state (hashable): the initial state of the episode.
        Returns:
            action (int): the first action the agent takes.
        """
        action = self._select_action(self._q_values(state))
        self.prev_state = state
        self.prev_action = action
        return action

    def agent_step(self, reward, state):
        """Learn from the last transition and choose the next action.

        Args:
            reward (float): the reward received for the last action taken.
            state (hashable): the state reached after the last action.
        Returns:
            action (int): the action the agent is taking.
        """
        next_q = self._q_values(state)
        action = self._select_action(next_q)

        # The target is computed before the write, which matters when
        # state == prev_state.
        prev_q = self.q[self.prev_state]
        td_target = reward + self.discount * self._expected_value(next_q)
        prev_q[self.prev_action] += self.step_size * (td_target - prev_q[self.prev_action])

        self.prev_state = state
        self.prev_action = action
        return action

    def agent_end(self, reward):
        """Apply the final update of an episode.

        Args:
            reward (float): the reward received for entering the terminal state.
        """
        # No bootstrap term: the terminal state contributes no future value.
        prev_q = self.q[self.prev_state]
        prev_q[self.prev_action] += self.step_size * (reward - prev_q[self.prev_action])

    def argmax(self, q_values):
        """Return the index of a maximal entry, breaking ties at random.

        Args:
            q_values (sequence of float): the action-values of one state.
        Returns:
            action (int): an action with the highest value.
        """
        top = float("-inf")
        ties = []
        for action, value in enumerate(q_values):
            if value > top:
                top = value
                ties = []
            if value == top:
                ties.append(action)

        # Always drawn through the generator, even for a single candidate,
        # so the random stream stays reproducible for a given seed.
        return self.rand_generator.choice(ties)

    def set_epsilon(self, value):
        """Replace the exploration rate used by later action selections."""
        self.epsilon = value