Source code for pysummarization.similarityfilter.encoder_decoder_clustering

# -*- coding: utf-8 -*-
from logging import getLogger, StreamHandler, NullHandler, DEBUG, ERROR
import numpy as np
from pysummarization.nlp_base import NlpBase
from pysummarization.tokenizabledoc.mecab_tokenizer import MeCabTokenizer
from pysummarization.similarity_filter import SimilarityFilter
from pysummarization.clusterabledoc.k_means import KMeans
from pysummarization.vectorizablesentence.encoder_decoder import EncoderDecoder


class EncoderDecoderClustering(SimilarityFilter):
    '''
    Concrete class for filtering mutually similar sentences.
    '''

    def __init__(
        self,
        document=None,
        tokenizable_doc=None,
        hidden_neuron_count=200,
        epochs=100,
        batch_size=100,
        learning_rate=1e-05,
        learning_attenuate_rate=0.1,
        attenuate_epoch=50,
        bptt_tau=8,
        weight_limit=0.5,
        dropout_rate=0.5,
        test_size_rate=0.3,
        cluster_num=10,
        max_iter=100,
        debug_mode=False
    ):
        '''
        Init.

        Args:
            document:                   String of document.
            tokenizable_doc:            is-a `TokenizableDoc`.
            hidden_neuron_count:        The number of units in the hidden layer.
            epochs:                     Epochs of mini-batch training.
            batch_size:                 Batch size of mini-batch training.
            learning_rate:              Learning rate.
            learning_attenuate_rate:    Attenuate the `learning_rate` by a factor of this value every `attenuate_epoch`.
            attenuate_epoch:            Attenuate the `learning_rate` by a factor of `learning_attenuate_rate` every `attenuate_epoch`.
                                        Additionally, in relation to regularization, this class constrains the weight matrices every `attenuate_epoch`.
            bptt_tau:                   Referred maximum step `t` in Backpropagation Through Time (BPTT).
            weight_limit:               Regularization for the weight matrix: repeat multiplying the weight matrix by `0.9` until $\sum_{j=0}^{n}w_{ji}^2 < weight\_limit$.
            dropout_rate:               The probability of dropout.
            test_size_rate:             Size of the test data set. If this value is `0`, validation will not be executed.
            cluster_num:                The number of clusters.
            max_iter:                   Maximum number of iterations.
            debug_mode:                 Debug mode or not.
        '''
        if debug_mode is True:
            logger = getLogger("pydbm")
            handler = StreamHandler()
            handler.setLevel(DEBUG)
            logger.setLevel(DEBUG)
            logger.addHandler(handler)

            logger = getLogger("pysummarization")
            handler = StreamHandler()
            handler.setLevel(DEBUG)
            logger.setLevel(DEBUG)
            logger.addHandler(handler)

        if document is not None:
            self.learn(
                document=document,
                tokenizable_doc=tokenizable_doc,
                hidden_neuron_count=hidden_neuron_count,
                epochs=epochs,
                batch_size=batch_size,
                learning_rate=learning_rate,
                learning_attenuate_rate=learning_attenuate_rate,
                attenuate_epoch=attenuate_epoch,
                bptt_tau=bptt_tau,
                weight_limit=weight_limit,
                dropout_rate=dropout_rate,
                test_size_rate=test_size_rate,
                cluster_num=cluster_num,
                max_iter=max_iter
            )
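For reference, a minimal usage sketch (adapted from the library's documented examples; the `document` string and hyperparameter values are placeholders, not recommendations, and `similarity_limit` is assumed to be inherited from the `SimilarityFilter` base class). Passing `document` to the constructor triggers `learn` immediately:

from pysummarization.similarityfilter.encoder_decoder_clustering import EncoderDecoderClustering

# Placeholder document: a string of sentences to be filtered.
document = "..."

# Passing `document` makes `__init__` call `learn` at construction time.
similarity_filter = EncoderDecoderClustering(
    document=document,
    hidden_neuron_count=200,
    epochs=100,
    batch_size=100,
    cluster_num=10,
    max_iter=100,
    debug_mode=False
)
# If the computed similarity exceeds this limit, a sentence is cut off
# (`similarity_limit` is defined on the `SimilarityFilter` base class).
similarity_filter.similarity_limit = 0.25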
    def learn(
        self,
        document,
        tokenizable_doc=None,
        hidden_neuron_count=200,
        epochs=100,
        batch_size=100,
        learning_rate=1e-05,
        learning_attenuate_rate=0.1,
        attenuate_epoch=50,
        bptt_tau=8,
        weight_limit=0.5,
        dropout_rate=0.5,
        test_size_rate=0.3,
        cluster_num=10,
        max_iter=100
    ):
        '''
        Learning.

        Args:
            document:                   String of document.
            tokenizable_doc:            is-a `TokenizableDoc`.
            hidden_neuron_count:        The number of units in the hidden layer.
            epochs:                     Epochs of mini-batch training.
            batch_size:                 Batch size of mini-batch training.
            learning_rate:              Learning rate.
            learning_attenuate_rate:    Attenuate the `learning_rate` by a factor of this value every `attenuate_epoch`.
            attenuate_epoch:            Attenuate the `learning_rate` by a factor of `learning_attenuate_rate` every `attenuate_epoch`.
                                        Additionally, in relation to regularization, this class constrains the weight matrices every `attenuate_epoch`.
            bptt_tau:                   Referred maximum step `t` in Backpropagation Through Time (BPTT).
            weight_limit:               Regularization for the weight matrix: repeat multiplying the weight matrix by `0.9` until $\sum_{j=0}^{n}w_{ji}^2 < weight\_limit$.
            dropout_rate:               The probability of dropout.
            test_size_rate:             Size of the test data set. If this value is `0`, validation will not be executed.
            cluster_num:                The number of clusters.
            max_iter:                   Maximum number of iterations.
        '''
        # The object of NLP.
        nlp_base = NlpBase()
        if tokenizable_doc is None:
            # Set tokenizer. This is a Japanese tokenizer with MeCab.
            nlp_base.tokenizable_doc = MeCabTokenizer()
        else:
            nlp_base.tokenizable_doc = tokenizable_doc

        # Split the document into sentences and tokenize each sentence in place.
        sentence_list = nlp_base.listup_sentence(document)
        all_token_list = []
        for i in range(len(sentence_list)):
            nlp_base.tokenize(sentence_list[i])
            all_token_list.extend(nlp_base.token)
            sentence_list[i] = nlp_base.token

        # The vocabulary: every distinct token observed in the document.
        token_master_list = list(set(all_token_list))

        # Train the Encoder/Decoder and use its hidden states as sentence vectors.
        vectorlizable_sentence = EncoderDecoder()
        vectorlizable_sentence.learn(
            sentence_list=sentence_list,
            token_master_list=token_master_list,
            hidden_neuron_count=hidden_neuron_count,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            learning_attenuate_rate=learning_attenuate_rate,
            attenuate_epoch=attenuate_epoch,
            bptt_tau=bptt_tau,
            weight_limit=weight_limit,
            dropout_rate=dropout_rate,
            test_size_rate=test_size_rate
        )
        self.__vectorlizable_sentence = vectorlizable_sentence
        self.__token_master_list = token_master_list

        # Cluster the sentence feature vectors with K-Means.
        feature_arr = vectorlizable_sentence.vectorize(sentence_list)
        self.__clusterable_doc = KMeans(
            cluster_num=cluster_num,
            max_iter=max_iter,
            init_noise_arr=np.random.normal(size=feature_arr.shape)
        )
        self.__labeled_arr = self.__clusterable_doc.learn(feature_arr)

        self.__sentence_list = sentence_list
        self.__batch_size = batch_size
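A sketch of deferred training, assuming the default construction path: with `document=None`, `__init__` skips training, so `learn` can be called later. Here `SimpleTokenizer` (the library's whitespace-oriented tokenizer) is assumed to be available as in the project's examples, swapped in for the MeCab default:

from pysummarization.similarityfilter.encoder_decoder_clustering import EncoderDecoderClustering
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer

# With `document=None` (the default), `__init__` trains nothing yet.
similarity_filter = EncoderDecoderClustering()

# Train later with an English-oriented tokenizer instead of MeCab.
similarity_filter.learn(
    document="This is a pen. That is a dog. ...",  # placeholder text
    tokenizable_doc=SimpleTokenizer(),
    epochs=100,
    batch_size=100,
    cluster_num=10
)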
    def calculate(self, token_list_x, token_list_y):
        '''
        Calculate similarity by checking whether the two sentences are assigned
        to the same cluster of Encoder/Decoder feature vectors.

        Concrete method.

        Args:
            token_list_x:    [token, token, token, ...]
            token_list_y:    [token, token, token, ...]

        Returns:
            Similarity: `1.0` if both sentences belong to the same cluster, `0.0` otherwise.
        '''
        if len(token_list_x) == 0 or len(token_list_y) == 0:
            return 0.0

        # Pad each batch with already-seen sentences so that the batch size
        # matches the one used during training; the new sentence goes last.
        x_list = self.__sentence_list[:self.__batch_size-1]
        y_list = self.__sentence_list[:self.__batch_size-1]
        x_list.append(token_list_x)
        y_list.append(token_list_y)

        x_arr = self.__vectorlizable_sentence.vectorize(x_list)[-1]
        y_arr = self.__vectorlizable_sentence.vectorize(y_list)[-1]

        # Infer the cluster label of each feature vector and compare them.
        labeled_arr = self.__clusterable_doc.inference(np.r_[x_arr, y_arr])
        if labeled_arr[0] == labeled_arr[1]:
            return 1.0
        else:
            return 0.0
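A sketch of calling `calculate` directly on the trained `similarity_filter` from the sketches above (the token lists are illustrative; in practice they come from the tokenizer). Note that the result is binary, not graded:

token_list_x = ["natural", "language", "processing"]
token_list_y = ["natural", "language", "generation"]

# 1.0 if both sentences fall into the same K-Means cluster, else 0.0.
similarity = similarity_filter.calculate(token_list_x, token_list_y)
print(similarity)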
    def set_readonly(self, value):
        ''' setter '''
        raise TypeError()

    def get_labeled_arr(self):
        ''' getter '''
        return self.__labeled_arr

    labeled_arr = property(get_labeled_arr, set_readonly)

    def get_sentence_list(self):
        ''' getter '''
        return self.__sentence_list

    sentence_list = property(get_sentence_list, set_readonly)
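The two read-only properties expose the clustering result computed in `learn`; assigning to either raises `TypeError`. A sketch of grouping the tokenized sentences by their assigned cluster label, assuming the trained `similarity_filter` from the sketches above and assuming `labeled_arr` holds one label per sentence:

from collections import defaultdict

cluster_dict = defaultdict(list)
# `labeled_arr[i]` is the cluster label of `sentence_list[i]`.
for label, token_list in zip(similarity_filter.labeled_arr, similarity_filter.sentence_list):
    cluster_dict[int(label)].append(" ".join(token_list))

for label, sentences in cluster_dict.items():
    print(label, sentences)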