Source code for pyqlearning.q_learning

# -*- coding: utf-8 -*-
from abc import ABCMeta, abstractmethod
import pandas as pd
import numpy as np
import random


class QLearning(metaclass=ABCMeta):
    '''
    Abstract base class and `Template Method Pattern` of Q-Learning.

    According to the Reinforcement Learning problem settings, Q-Learning is a kind of
    Temporal Difference learning (TD Learning) that can be considered as a hybrid of
    the Monte Carlo method and Dynamic Programming. Like the Monte Carlo method,
    TD Learning can learn from experience without a model of the environment. And, like
    Dynamic Programming, this learning algorithm is a functional extension of bootstrapping.

    In this library, Q-Learning is divided into Epsilon Greedy Q-Learning and
    Boltzmann Q-Learning. These algorithms are functionally equivalent, but their
    structures should be conceptually distinguished.

    Considering the many variable parts and functional extensions in the Q-Learning
    paradigm from the perspective of commonality/variability analysis, in order to
    practice object-oriented design, this abstract class defines the skeleton of a
    Q-Learning algorithm in an operation, deferring some steps to client subclasses
    that implement concrete variants such as Epsilon Greedy Q-Learning and
    Boltzmann Q-Learning. This abstract class lets subclasses redefine certain steps
    of a Q-Learning algorithm without changing the algorithm's structure.

    References:
        - Agrawal, S., & Goyal, N. (2011). Analysis of Thompson sampling for the multi-armed bandit problem. arXiv preprint arXiv:1111.1797.
        - Bubeck, S., & Cesa-Bianchi, N. (2012). Regret analysis of stochastic and nonstochastic multi-armed bandit problems. arXiv preprint arXiv:1204.5721.
        - Chapelle, O., & Li, L. (2011). An empirical evaluation of Thompson sampling. In Advances in neural information processing systems (pp. 2249-2257).
        - Du, K. L., & Swamy, M. N. S. (2016). Search and optimization by metaheuristics (p. 434). New York City: Springer.
        - Kaufmann, E., Cappe, O., & Garivier, A. (2012). On Bayesian upper confidence bounds for bandit problems. In International Conference on Artificial Intelligence and Statistics (pp. 592-600).
        - Mnih, V., Kavukcuoglu, K., Silver, D., Graves, A., Antonoglou, I., Wierstra, D., & Riedmiller, M. (2013). Playing Atari with deep reinforcement learning. arXiv preprint arXiv:1312.5602.
        - Richard Sutton and Andrew Barto (1998). Reinforcement Learning. MIT Press.
        - Watkins, C. J. C. H. (1989). Learning from delayed rewards (Doctoral dissertation, University of Cambridge).
        - Watkins, C. J., & Dayan, P. (1992). Q-learning. Machine learning, 8(3-4), 279-292.
        - White, J. (2012). Bandit algorithms for website optimization. O'Reilly Media, Inc.
    '''

    # Learning rate.
    __alpha_value = 0.1
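    # NOTE: `learn` is the template method of this class. Concrete subclasses supply
    # the variant steps by implementing the abstract hooks `select_action`,
    # `extract_possible_actions`, and `observe_reward_value`, and may optionally
    # override `update_state`, `normalize_q_value`, `normalize_r_value`,
    # `check_the_end_flag`, and `visualize_learning_result`.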
    def get_alpha_value(self):
        '''
        getter
        Learning rate.
        '''
        if isinstance(self.__alpha_value, float) is False:
            raise TypeError("The type of __alpha_value must be float.")
        return self.__alpha_value

    def set_alpha_value(self, value):
        '''
        setter
        Learning rate.
        '''
        if isinstance(value, float) is False:
            raise TypeError("The type of __alpha_value must be float.")
        self.__alpha_value = value

    alpha_value = property(get_alpha_value, set_alpha_value)

    # Gamma value.
    __gamma_value = 0.5

    def get_gamma_value(self):
        '''
        getter
        Gamma value.
        '''
        if isinstance(self.__gamma_value, float) is False:
            raise TypeError("The type of __gamma_value must be float.")
        return self.__gamma_value

    def set_gamma_value(self, value):
        '''
        setter
        Gamma value.
        '''
        if isinstance(value, float) is False:
            raise TypeError("The type of __gamma_value must be float.")
        self.__gamma_value = value

    gamma_value = property(get_gamma_value, set_gamma_value)

    # Q(state, action)
    __q_df = None

    def get_q_df(self):
        ''' getter '''
        if isinstance(self.__q_df, pd.DataFrame) is False and self.__q_df is not None:
            raise TypeError("The type of `__q_df` must be `pd.DataFrame`.")
        return self.__q_df

    def set_q_df(self, value):
        ''' setter '''
        if isinstance(value, pd.DataFrame) is False and value is not None:
            raise TypeError("The type of `__q_df` must be `pd.DataFrame`.")
        self.__q_df = value

    q_df = property(get_q_df, set_q_df)
    def extract_q_df(self, state_key, action_key):
        '''
        Extract Q-Value from `self.q_df`.

        Args:
            state_key:      The key of state.
            action_key:     The key of action.

        Returns:
            Q-Value.
        '''
        q = 0.0
        if self.q_df is None:
            self.save_q_df(state_key, action_key, q)
            return q

        q_df = self.q_df[self.q_df.state_key == state_key]
        q_df = q_df[q_df.action_key == action_key]
        if q_df.shape[0]:
            q = float(q_df["q_value"].values[0])
        else:
            self.save_q_df(state_key, action_key, q)
        return q
    def save_q_df(self, state_key, action_key, q_value):
        '''
        Insert or update Q-Value in `self.q_df`.

        Args:
            state_key:      State.
            action_key:     Action.
            q_value:        Q-Value.

        Exceptions:
            TypeError:      If the type of `q_value` is not float.
        '''
        if isinstance(q_value, float) is False:
            raise TypeError("The type of q_value must be float.")

        new_q_df = pd.DataFrame([(state_key, action_key, q_value)], columns=["state_key", "action_key", "q_value"])
        if self.q_df is not None:
            self.q_df = pd.concat([new_q_df, self.q_df])
            self.q_df = self.q_df.drop_duplicates(["state_key", "action_key"])
        else:
            self.q_df = new_q_df
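    # For illustration only (state, action, and Q-Values below are hypothetical):
    # after `save_q_df("s1", "a1", 0.5)` and `save_q_df("s1", "a2", 0.25)`,
    # `self.q_df` holds one row per (state_key, action_key) pair:
    #
    #     state_key  action_key  q_value
    #     s1         a1          0.50
    #     s1         a2          0.25
    #
    # Because the new row is concatenated in front of the existing frame before
    # `drop_duplicates(["state_key", "action_key"])`, saving an existing pair again
    # keeps the newest Q-Value, i.e. the save acts as an update.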
    # R(state)
    __r_df = None

    def get_r_df(self):
        ''' getter '''
        if isinstance(self.__r_df, pd.DataFrame) is False and self.__r_df is not None:
            raise TypeError("The type of `__r_df` must be `pd.DataFrame`.")
        return self.__r_df
    def set_r_df(self, value):
        ''' setter '''
        if isinstance(value, pd.DataFrame) is False and value is not None:
            raise TypeError("The type of `__r_df` must be `pd.DataFrame`.")
        self.__r_df = value
    r_df = property(get_r_df, set_r_df)
    def extract_r_df(self, state_key, r_value, action_key=None):
        '''
        Extract R-Value from `self.r_df`.

        Args:
            state_key:      The key of state.
            r_value:        R-Value (Reward).
            action_key:     The key of action if it is necessary for the parameter of the value function.

        Returns:
            R-Value.

        Exceptions:
            TypeError:      If the type of `r_value` is not float.
        '''
        if isinstance(r_value, float) is False:
            raise TypeError("The type of r_value must be float.")

        r = 0.0
        if self.r_df is None:
            self.save_r_df(state_key, r, action_key)
            return r

        r_df = self.r_df[self.r_df.state_key == state_key]
        if action_key is not None:
            r_df = r_df[r_df.action_key == action_key]
        if r_df.shape[0]:
            r = float(r_df["r_value"].values[0])
        else:
            self.save_r_df(state_key, r, action_key)
        return r
    def save_r_df(self, state_key, r_value, action_key=None):
        '''
        Insert or update R-Value in `self.r_df`.

        Args:
            state_key:      The key of state.
            r_value:        R-Value (Reward).
            action_key:     The key of action if it is necessary for the parameter of the value function.

        Exceptions:
            TypeError:      If the type of `r_value` is not float.
        '''
        if isinstance(r_value, float) is False:
            raise TypeError("The type of r_value must be float.")

        if action_key is not None:
            add_r_df = pd.DataFrame([(state_key, action_key, r_value)], columns=["state_key", "action_key", "r_value"])
        else:
            add_r_df = pd.DataFrame([(state_key, r_value)], columns=["state_key", "r_value"])

        if self.r_df is not None:
            self.r_df = pd.concat([add_r_df, self.r_df])
            if action_key is not None:
                self.r_df = self.r_df.drop_duplicates(["state_key", "action_key"])
            else:
                self.r_df = self.r_df.drop_duplicates(["state_key"])
        else:
            self.r_df = add_r_df
    # Time.
    __t = 0

    def get_t(self):
        '''
        getter
        Time.
        '''
        if isinstance(self.__t, int) is False:
            raise TypeError("The type of __t must be int.")
        return self.__t

    def set_t(self, value):
        '''
        setter
        Time.
        '''
        if isinstance(value, int) is False:
            raise TypeError("The type of __t must be int.")
        self.__t = value

    t = property(get_t, set_t)
    def learn(self, state_key, limit=1000):
        '''
        Learning and searching the optimal solution.

        Args:
            state_key:      Initial state.
            limit:          The maximum number of iterative updates based on value iteration algorithms.
        '''
        self.t = 1
        while self.t <= limit:
            next_action_list = self.extract_possible_actions(state_key)
            if len(next_action_list):
                action_key = self.select_action(
                    state_key=state_key,
                    next_action_list=next_action_list
                )
                reward_value = self.observe_reward_value(state_key, action_key)

            if len(next_action_list):
                # Max-Q-Value in next action time.
                next_state_key = self.update_state(
                    state_key=state_key,
                    action_key=action_key
                )
                next_next_action_list = self.extract_possible_actions(next_state_key)
                next_action_key = self.predict_next_action(next_state_key, next_next_action_list)
                next_max_q = self.extract_q_df(next_state_key, next_action_key)

                # Update Q-Value.
                self.update_q(
                    state_key=state_key,
                    action_key=action_key,
                    reward_value=reward_value,
                    next_max_q=next_max_q
                )
                # Update State.
                state_key = next_state_key

            # Normalize.
            self.normalize_q_value()
            self.normalize_r_value()

            # Vis.
            self.visualize_learning_result(state_key)
            # Check.
            if self.check_the_end_flag(state_key) is True:
                break

            # Episode.
            self.t += 1
    @abstractmethod
    def select_action(self, state_key, next_action_list):
        '''
        Select action by Q(state, action).

        Abstract method for concrete use cases.

        Args:
            state_key:              The key of state.
            next_action_list:       The possible actions in `self.t+1`. If the length of this list is zero, all actions should be possible.

        Returns:
            The key of action.
        '''
        raise NotImplementedError("This method must be implemented.")
    @abstractmethod
    def extract_possible_actions(self, state_key):
        '''
        Extract the list of the possible actions in `self.t+1`.

        Abstract method for concrete use cases.

        Args:
            state_key:      The key of state in `self.t+1`.

        Returns:
            `list` of the possible actions in `self.t+1`.
        '''
        raise NotImplementedError("This method must be implemented.")
    @abstractmethod
    def observe_reward_value(self, state_key, action_key):
        '''
        Compute the reward value.

        Args:
            state_key:      The key of state.
            action_key:     The key of action.

        Returns:
            Reward value.
        '''
        raise NotImplementedError("This method must be implemented.")
    def update_q(self, state_key, action_key, reward_value, next_max_q):
        '''
        Update Q-Value.

        Args:
            state_key:      The key of state.
            action_key:     The key of action.
            reward_value:   R-Value (Reward).
            next_max_q:     Maximum Q-Value.
        '''
        # Now Q-Value.
        q = self.extract_q_df(state_key, action_key)
        # Update Q-Value.
        new_q = q + self.alpha_value * (reward_value + (self.gamma_value * next_max_q) - q)
        # Save updated Q-Value.
        self.save_q_df(state_key, action_key, new_q)
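    # The assignment above is the standard Q-Learning update rule
    # (Watkins & Dayan, 1992):
    #
    #     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
    #
    # where `next_max_q` stands for max_a' Q(s', a'), computed in the `learn`
    # loop via `predict_next_action` and `extract_q_df`.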
    def predict_next_action(self, state_key, next_action_list):
        '''
        Predict next action by Q-Learning.

        Args:
            state_key:              The key of state in `self.t+1`.
            next_action_list:       The possible actions in `self.t+1`.

        Returns:
            The key of action.
        '''
        if self.q_df is not None:
            next_action_q_df = self.q_df[self.q_df.state_key == state_key]
            next_action_q_df = next_action_q_df[next_action_q_df.action_key.isin(next_action_list)]
            if next_action_q_df.shape[0] == 0:
                return random.choice(next_action_list)
            else:
                if next_action_q_df.shape[0] == 1:
                    max_q_action = next_action_q_df["action_key"].values[0]
                else:
                    next_action_q_df = next_action_q_df.sort_values(by=["q_value"], ascending=False)
                    max_q_action = next_action_q_df.iloc[0, :]["action_key"]
                return max_q_action
        else:
            return random.choice(next_action_list)
    def update_state(self, state_key, action_key):
        '''
        Update state.

        This method can be overridden for concrete use cases.

        Args:
            state_key:      The key of state in `self.t`.
            action_key:     The key of action in `self.t`.

        Returns:
            The key of state in `self.t+1`.
        '''
        return action_key
    def normalize_q_value(self):
        '''
        Normalize Q-Value.

        This method should be overridden for concrete use cases.
        This method is called at each learning step.

        For example:
            self.q_df.q_value = self.q_df.q_value / self.q_df.q_value.sum()
        '''
        pass

    def normalize_r_value(self):
        '''
        Normalize R-Value.

        This method should be overridden for concrete use cases.
        This method is called at each learning step.

        For example:
            self.r_df.r_value = self.r_df.r_value / self.r_df.r_value.sum()
        '''
        pass
    def check_the_end_flag(self, state_key):
        '''
        Check the end flag.

        If this method returns `True`, the learning stops.
        By default, the learning can not be stopped.

        This method should be overridden for concrete use cases.

        Args:
            state_key:      The key of state in `self.t`.

        Returns:
            bool
        '''
        # As a rule, the learning can not be stopped.
        return False
    def visualize_learning_result(self, state_key):
        '''
        Visualize learning result.

        This method should be overridden for concrete use cases.
        This method is called at the end of each learning step.

        Args:
            state_key:      The key of state in `self.t`.
        '''
        pass
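# A minimal usage sketch (illustrative only, not part of this module), assuming
# `QLearning` is imported from `pyqlearning.q_learning`. The hypothetical subclass
# `TinyChainQLearning` walks a toy chain of states 0..4 and is rewarded for reaching
# state 4; all names and values below are assumptions for illustration.
#
#     from pyqlearning.q_learning import QLearning
#
#     class TinyChainQLearning(QLearning):
#         def extract_possible_actions(self, state_key):
#             # Actions are the neighbouring states on the chain.
#             return [s for s in (state_key - 1, state_key + 1) if 0 <= s <= 4]
#
#         def select_action(self, state_key, next_action_list):
#             # Greedy selection via the inherited Q-Value table.
#             return self.predict_next_action(state_key, next_action_list)
#
#         def observe_reward_value(self, state_key, action_key):
#             # Reward the transition into the terminal state of the chain.
#             return 1.0 if action_key == 4 else 0.0
#
#     agent = TinyChainQLearning()
#     agent.alpha_value = 0.9
#     agent.gamma_value = 0.9
#     agent.learn(state_key=0, limit=100)
#     print(agent.q_df)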