Source code for pysummarization.similarityfilter.encoder_decoder_clustering

# -*- coding: utf-8 -*-
from logging import getLogger, StreamHandler, NullHandler, DEBUG, ERROR
import numpy as np
from pysummarization.nlp_base import NlpBase
from pysummarization.tokenizabledoc.mecab_tokenizer import MeCabTokenizer
from pysummarization.similarity_filter import SimilarityFilter
from pysummarization.clusterabledoc.k_means import KMeans
from pysummarization.vectorizablesentence.encoder_decoder import EncoderDecoder


[docs]class EncoderDecoderClustering(SimilarityFilter):
    '''
    Concrete class for filtering mutually similar sentences.
    '''
    
    def __init__(
        self,
        document=None,
        tokenizable_doc=None,
        hidden_neuron_count=200,
        epochs=100,
        batch_size=100,
        learning_rate=1e-05,
        learning_attenuate_rate=0.1,
        attenuate_epoch=50,
        bptt_tau=8,
        weight_limit=0.5,
        dropout_rate=0.5,
        test_size_rate=0.3,
        cluster_num=10,
        max_iter=100,
        debug_mode=False
    ):
        '''
        Init.
        
        Args:
            document:                       String of document.
            tokenizable_doc:                is-a `TokenizableDoc`.
            hidden_neuron_count:            The number of units in hidden layer.
            epochs:                         Epochs of Mini-batch.
            bath_size:                      Batch size of Mini-batch.
            learning_rate:                  Learning rate.
            learning_attenuate_rate:        Attenuate the `learning_rate` by a factor of this value every `attenuate_epoch`.
            attenuate_epoch:                Attenuate the `learning_rate` by a factor of `learning_attenuate_rate` every `attenuate_epoch`.
                                            Additionally, in relation to regularization,
                                            this class constrains weight matrixes every `attenuate_epoch`.

            bptt_tau:                       Refereed maxinum step `t` in Backpropagation Through Time(BPTT).
            weight_limit:                   Regularization for weights matrix
                                            to repeat multiplying the weights matrix and `0.9`
                                            until $\sum_{j=0}^{n}w_{ji}^2 < weight\_limit$.

            dropout_rate:                   The probability of dropout.
            test_size_rate:                 Size of Test data set. If this value is `0`, the 
            cluster_num:                    The number of clusters.
            max_iter:                       Maximum number of iterations.
            debug_mode:                     Debug mode or not.
        '''
        if debug_mode is True:
            logger = getLogger("pydbm")
            handler = StreamHandler()
            handler.setLevel(DEBUG)
            logger.setLevel(DEBUG)
            logger.addHandler(handler)

            logger = getLogger("pysummarization")
            handler = StreamHandler()
            handler.setLevel(DEBUG)
            logger.setLevel(DEBUG)
            logger.addHandler(handler)

        if document is not None:
            self.learn(
                document=document,
                tokenizable_doc=tokenizable_doc,
                hidden_neuron_count=hidden_neuron_count,
                epochs=epochs,
                batch_size=batch_size,
                learning_rate=learning_rate,
                learning_attenuate_rate=learning_attenuate_rate,
                attenuate_epoch=attenuate_epoch,
                bptt_tau=bptt_tau,
                weight_limit=weight_limit,
                dropout_rate=dropout_rate,
                test_size_rate=test_size_rate,
                cluster_num=cluster_num,
                max_iter=max_iter
            )

[docs]    def learn(
        self,
        document,
        tokenizable_doc=None,
        hidden_neuron_count=200,
        epochs=100,
        batch_size=100,
        learning_rate=1e-05,
        learning_attenuate_rate=0.1,
        attenuate_epoch=50,
        bptt_tau=8,
        weight_limit=0.5,
        dropout_rate=0.5,
        test_size_rate=0.3,
        cluster_num=10,
        max_iter=100
    ):
        '''
        Learning.
        
        Args:
            document:                       String of document.
            tokenizable_doc:                is-a `TokenizableDoc`.
            hidden_neuron_count:            The number of units in hidden layer.
            epochs:                         Epochs of Mini-batch.
            bath_size:                      Batch size of Mini-batch.
            learning_rate:                  Learning rate.
            learning_attenuate_rate:        Attenuate the `learning_rate` by a factor of this value every `attenuate_epoch`.
            attenuate_epoch:                Attenuate the `learning_rate` by a factor of `learning_attenuate_rate` every `attenuate_epoch`.
                                            Additionally, in relation to regularization,
                                            this class constrains weight matrixes every `attenuate_epoch`.

            bptt_tau:                       Refereed maxinum step `t` in Backpropagation Through Time(BPTT).
            weight_limit:                   Regularization for weights matrix
                                            to repeat multiplying the weights matrix and `0.9`
                                            until $\sum_{j=0}^{n}w_{ji}^2 < weight\_limit$.

            dropout_rate:                   The probability of dropout.
            test_size_rate:                 Size of Test data set. If this value is `0`, the 
            cluster_num:                    The number of clusters.
            max_iter:                       Maximum number of iterations.

        '''
        # The object of NLP.
        nlp_base = NlpBase()
        if tokenizable_doc is None:
            # Set tokenizer. This is japanese tokenizer with MeCab.
            nlp_base.tokenizable_doc = MeCabTokenizer()
        else:
            nlp_base.tokenizable_doc = tokenizable_doc

        sentence_list = nlp_base.listup_sentence(document)

        all_token_list = []
        for i in range(len(sentence_list)):
            nlp_base.tokenize(sentence_list[i])
            all_token_list.extend(nlp_base.token)
            sentence_list[i] = nlp_base.token

        token_master_list = list(set(all_token_list))
        vectorlizable_sentence = EncoderDecoder()
        vectorlizable_sentence.learn(
            sentence_list=sentence_list, 
            token_master_list=token_master_list,
            hidden_neuron_count=hidden_neuron_count,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            learning_attenuate_rate=learning_attenuate_rate,
            attenuate_epoch=attenuate_epoch,
            bptt_tau=bptt_tau,
            weight_limit=weight_limit,
            dropout_rate=dropout_rate,
            test_size_rate=test_size_rate
        )
        self.__vectorlizable_sentence = vectorlizable_sentence
        self.__token_master_list = token_master_list

        feature_arr = vectorlizable_sentence.vectorize(sentence_list)

        self.__clusterable_doc = KMeans(
            cluster_num=cluster_num,
            max_iter=max_iter,
            init_noise_arr=np.random.normal(size=feature_arr.shape)
        )
        self.__labeled_arr = self.__clusterable_doc.learn(feature_arr)
        self.__sentence_list = sentence_list
        self.__batch_size = batch_size

[docs]    def calculate(self, token_list_x, token_list_y):
        '''
        Calculate similarity with the so-called Cosine similarity of Tf-Idf vectors.
        
        Concrete method.
        
        Args:
            token_list_x:    [token, token, token, ...]
            token_list_y:    [token, token, token, ...]
        
        Returns:
            Similarity.
        '''
        if len(token_list_x) == 0 or len(token_list_y) == 0:
            return 0.0

        x_list = self.__sentence_list[:self.__batch_size-1]
        y_list = self.__sentence_list[:self.__batch_size-1]
        x_list.append(token_list_x)
        y_list.append(token_list_y)
        x_arr = self.__vectorlizable_sentence.vectorize(x_list)[-1]
        y_arr = self.__vectorlizable_sentence.vectorize(y_list)[-1]
        labeled_arr = self.__clusterable_doc.inference(np.r_[x_arr, y_arr])
        
        if labeled_arr[0] == labeled_arr[1]:
            return 1.0
        else:
            return 0.0

[docs]    def set_readonly(self, value):
        ''' setter '''
        raise TypeError()

[docs]    def get_labeled_arr(self):
        ''' getter '''
        return self.__labeled_arr
    
    labeled_arr = property(get_labeled_arr, set_readonly)

[docs]    def get_sentence_list(self):
        ''' getter '''
        return self.__sentence_list
    
    sentence_list = property(get_sentence_list, set_readonly)