Source code for pyqlearning.deep_q_learning

# -*- coding: utf-8 -*-
from abc import ABCMeta, abstractmethod
import numpy as np
from pyqlearning.function_approximator import FunctionApproximator


class DeepQLearning(metaclass=ABCMeta):
    '''
    Abstract base class to implement Deep Q-Learning.

    Reinforcement learning theory presents several issues from the perspective of
    deep learning theory (Mnih, V., et al. 2013). Firstly, deep learning applications
    have required large amounts of hand-labelled training data. Reinforcement learning
    algorithms, on the other hand, must be able to learn from a scalar reward signal
    that is frequently sparse, noisy, and delayed. The difference between the two
    theories lies not only in the type of data but also in the timing of observation.
    The delay between taking actions and receiving rewards, which can be thousands of
    timesteps long, seems particularly daunting when compared to the direct association
    between inputs and targets found in supervised learning.

    Another issue is that deep learning algorithms assume the data samples to be
    independent, while in reinforcement learning one typically encounters sequences
    of highly correlated states. Furthermore, in reinforcement learning the data
    distribution changes as the algorithm learns new behaviours, presenting aspects
    of recursive learning, which can be problematic for deep learning methods that
    assume a fixed underlying distribution.

    Increasing the complexity of states/actions is equivalent to increasing the number
    of combinations of states/actions. If the value function is continuous and the
    granularities of states/actions are extremely fine, a combinatorial explosion is
    encountered. In other words, the basic tabular approach is totally impractical,
    because the state/action-value function is estimated separately for each sequence,
    without any **generalisation**. Instead, it is common to use a **function
    approximator** to estimate the state/action-value function.

    Considering the many variable parts and functional extensions in the Deep
    Q-Learning paradigm from the perspective of commonality/variability analysis, and
    in order to practice object-oriented design, this abstract class defines the
    skeleton of a Deep Q-Learning algorithm in an operation, deferring some steps to
    client subclasses that implement concrete variant algorithms such as the Epsilon
    Deep Q-Network. This abstract class thus lets subclasses redefine certain steps of
    a Deep Q-Learning algorithm without changing the algorithm's structure. A minimal
    sketch of such a subclass is given at the end of this module.

    References:
        - Egorov, M. (2016). Multi-agent deep reinforcement learning. (URL: https://pdfs.semanticscholar.org/dd98/9d94613f439c05725bad958929357e365084.pdf)
        - Gupta, J. K., Egorov, M., & Kochenderfer, M. (2017, May). Cooperative multi-agent control using deep reinforcement learning. In International Conference on Autonomous Agents and Multiagent Systems (pp. 66-83). Springer, Cham.
        - Mnih, V., Kavukcuoglu, K., Silver, D., Graves, A., Antonoglou, I., Wierstra, D., & Riedmiller, M. (2013). Playing Atari with deep reinforcement learning. arXiv preprint arXiv:1312.5602.
    '''

    # Learning rate.
    __alpha_value = 0.1

    # Discount factor.
    __gamma_value = 0.5

    # The logs of predicted and real Q-Values.
    __q_logs_arr = np.array([])

    def __init__(self, function_approximator):
        '''
        Init.

        Args:
            function_approximator:     is-a `FunctionApproximator`.
        '''
        if isinstance(function_approximator, FunctionApproximator):
            self.__function_approximator = function_approximator
        else:
            raise TypeError()

        self.t = 1
        self.__q_logs_arr = np.array([])

    def learn(self, state_arr, limit=1000):
        '''
        Learning and searching the optimal solution.

        Args:
            state_arr:      `np.ndarray` of initial state.
            limit:          The maximum number of iterative updates based on value iteration algorithms.
        '''
        while self.t <= limit:
            # Draw samples of next possible actions from any distribution.
            next_action_arr = self.extract_possible_actions(state_arr)
            # Infer Q-Values.
            predicted_q_arr = self.__function_approximator.inference_q(next_action_arr)

            # Set `np.ndarray` of rewards and next Q-Values.
            reward_value_arr = np.empty((next_action_arr.shape[0], 1))
            next_max_q_arr = np.empty((next_action_arr.shape[0], 1))
            for i in range(reward_value_arr.shape[0]):
                # Observe reward values.
                reward_value_arr[i] = self.observe_reward_value(state_arr, next_action_arr[i])
                # Infer the maximum Q-Value in the next time step.
                next_next_action_arr = self.extract_possible_actions(next_action_arr[i])
                next_max_q_arr[i] = self.__function_approximator.inference_q(next_next_action_arr).max()

            # Select action.
            action_arr, predicted_q = self.select_action(next_action_arr, predicted_q_arr)
            # Update real Q-Values.
            real_q_arr = self.update_q(
                predicted_q_arr,
                reward_value_arr,
                next_max_q_arr
            )
            # The real Q-Value corresponding to the predicted Q-Value of the selected action.
            real_q = real_q_arr[np.where(predicted_q_arr == predicted_q)[0][0]]

            # Log the pair of predicted and real Q-Values.
            if self.__q_logs_arr.shape[0] > 0:
                self.__q_logs_arr = np.r_[
                    self.__q_logs_arr,
                    np.array([predicted_q, real_q]).reshape(1, 2)
                ]
            else:
                self.__q_logs_arr = np.array([predicted_q, real_q]).reshape(1, 2)

            # Learn Q-Values.
            self.learn_q(predicted_q_arr, real_q_arr)
            # Update state.
            state_arr = self.update_state(state_arr, action_arr)

            # Episode.
            self.t += 1

            # Check the end flag.
            end_flag = self.check_the_end_flag(state_arr)
            if end_flag is True:
                break
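
    # Note: `self.q_logs_arr` accumulates one row `(predicted Q-Value, real Q-Value)`
    # per time step of `learn`, which can be used to monitor how closely the
    # function approximator tracks the updated targets.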

    @abstractmethod
    def extract_possible_actions(self, state_arr):
        '''
        Extract possible actions.

        Args:
            state_arr:  `np.ndarray` of state.

        Returns:
            `np.ndarray` of actions.
        '''
        raise NotImplementedError("This method must be implemented.")

    @abstractmethod
    def select_action(self, next_action_arr, next_q_arr):
        '''
        Select action by Q(state, action).

        Args:
            next_action_arr:    `np.ndarray` of actions.
            next_q_arr:         `np.ndarray` of Q-Values.

        Returns:
            Tuple(`np.ndarray` of action, Q-Value)
        '''
        raise NotImplementedError("This method must be implemented.")

    @abstractmethod
    def observe_reward_value(self, state_arr, action_arr):
        '''
        Compute the reward value.

        Args:
            state_arr:      `np.ndarray` of state.
            action_arr:     `np.ndarray` of action.

        Returns:
            Reward value.
        '''
        raise NotImplementedError("This method must be implemented.")

    def update_q(self, predicted_q_arr, reward_value_arr, next_max_q_arr):
        '''
        Update Q.

        Args:
            predicted_q_arr:    `np.ndarray` of predicted Q-Values.
            reward_value_arr:   `np.ndarray` of reward values.
            next_max_q_arr:     `np.ndarray` of maximum Q-Values in the next time step.

        Returns:
            `np.ndarray` of real Q-Values.
        '''
        # Update Q-Value.
        return predicted_q_arr + (self.alpha_value * (reward_value_arr + (self.gamma_value * next_max_q_arr) - predicted_q_arr))
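
    # The update performed by `update_q` above is the standard Q-learning rule,
    # with `alpha_value` as the learning rate and `gamma_value` as the discount
    # factor:
    #
    #     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_{a'} Q(s', a') - Q(s, a))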

    def learn_q(self, predicted_q_arr, real_q_arr):
        '''
        Learn Q with the function approximator.

        Args:
            predicted_q_arr:    `np.ndarray` of predicted Q-Values.
            real_q_arr:         `np.ndarray` of real Q-Values.
        '''
        # Learn updated Q-Value.
        self.__function_approximator.learn_q(predicted_q_arr, real_q_arr)

    def update_state(self, state_arr, action_arr):
        '''
        Update state.

        This method can be overridden for concrete use cases.

        Args:
            state_arr:      `np.ndarray` of state in `self.t`.
            action_arr:     `np.ndarray` of action in `self.t`.

        Returns:
            `np.ndarray` of state in `self.t+1`.
        '''
        return action_arr

    def check_the_end_flag(self, state_arr):
        '''
        Check the end flag.

        If this method returns `True`, the learning ends. By default, the learning
        is never stopped; this method should be overridden for concrete use cases.

        Args:
            state_arr:  `np.ndarray` of state in `self.t`.

        Returns:
            bool
        '''
        # By default, the learning is never stopped.
        return False

    def get_function_approximator(self):
        ''' getter '''
        return self.__function_approximator

    def set_function_approximator(self, value):
        ''' setter '''
        if isinstance(value, FunctionApproximator):
            self.__function_approximator = value
        else:
            raise TypeError()

    function_approximator = property(get_function_approximator, set_function_approximator)

    def get_alpha_value(self):
        '''
        getter
        Learning rate.
        '''
        if isinstance(self.__alpha_value, float) is False:
            raise TypeError("The type of __alpha_value must be float.")
        return self.__alpha_value

    def set_alpha_value(self, value):
        '''
        setter
        Learning rate.
        '''
        if isinstance(value, float) is False:
            raise TypeError("The type of __alpha_value must be float.")
        self.__alpha_value = value

    alpha_value = property(get_alpha_value, set_alpha_value)

    def get_gamma_value(self):
        '''
        getter
        Gamma value.
        '''
        if isinstance(self.__gamma_value, float) is False:
            raise TypeError("The type of __gamma_value must be float.")
        return self.__gamma_value

    def set_gamma_value(self, value):
        '''
        setter
        Gamma value.
        '''
        if isinstance(value, float) is False:
            raise TypeError("The type of __gamma_value must be float.")
        self.__gamma_value = value

    gamma_value = property(get_gamma_value, set_gamma_value)

    def get_q_logs_arr(self):
        ''' getter '''
        return self.__q_logs_arr

    def set_q_logs_arr(self, values):
        ''' setter '''
        raise TypeError("The `q_logs_arr` must be read-only.")

    q_logs_arr = property(get_q_logs_arr, set_q_logs_arr)
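

# The following is not part of the library: it is a minimal, hypothetical sketch of
# how a client subclass might fill in the deferred steps of the template method
# above. The toy environment (states and actions represented as one-hot
# `np.ndarray`s), the greedy selection strategy, and the name `ToyDeepQLearning`
# are assumptions made only for illustration; a concrete variant such as the
# Epsilon Deep Q-Network would refine `select_action` with an exploration policy.

class ToyDeepQLearning(DeepQLearning):
    '''
    Hypothetical concrete subclass, shown only as a usage sketch.
    '''

    def extract_possible_actions(self, state_arr):
        # Assumption: every one-hot vector with the same length as the current
        # state is a possible action (interpreted here as a candidate next state).
        return np.eye(state_arr.shape[0])

    def observe_reward_value(self, state_arr, action_arr):
        # Assumption: a reward of 1.0 is observed only when the last cell is reached.
        return 1.0 if action_arr[-1] == 1 else 0.0

    def select_action(self, next_action_arr, next_q_arr):
        # Greedy selection: the action whose predicted Q-Value is maximal.
        key = next_q_arr.argmax()
        return next_action_arr[key], next_q_arr.flatten()[key]


# Usage sketch (assuming `my_function_approximator` is a concrete `FunctionApproximator`):
#
#     deep_q = ToyDeepQLearning(my_function_approximator)
#     deep_q.alpha_value = 0.01
#     deep_q.gamma_value = 0.9
#     deep_q.learn(state_arr=np.eye(5)[0], limit=100)
#     print(deep_q.q_logs_arr.shape)  # -> (number of time steps, 2)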