Source code for pysummarization.vectorizabletoken.thotvectorizer.dbm_t_hot_vectorizer

# -*- coding: utf-8 -*-
import numpy as np
from pysummarization.vectorizabletoken.t_hot_vectorizer import THotVectorizer
from pysummarization.computable_distance import ComputableDistance
from pysummarization.computabledistance.euclid_distance import EuclidDistance
# `StackedAutoEncoder` is-a `DeepBoltzmannMachine`.
from pydbm.dbm.deepboltzmannmachine.stacked_auto_encoder import StackedAutoEncoder
# The `Concrete Builder` in Builder Pattern.
from pydbm.dbm.builders.dbm_multi_layer_builder import DBMMultiLayerBuilder
# Contrastive Divergence for function approximation.
from pydbm.approximation.contrastive_divergence import ContrastiveDivergence
# Logistic Function as activation function.
from pydbm.activation.logistic_function import LogisticFunction


class DBMTHotVectorizer(THotVectorizer):
    '''
    Vectorize tokens by t-hot encoding.

    This class outputs dimension-reduced vectors computed by
    a Deep Boltzmann Machine used as a Stacked Auto-Encoder.
    '''

    # is-a `StackedAutoEncoder`.
    __dbm = None
    # is-a `ComputableDistance`.
    __computable_distance = None

    def pre_learn(
        self,
        hidden_n=100,
        training_count=1000,
        batch_size=10,
        learning_rate=1e-05,
        dbm=None
    ):
        '''
        Pre-learning.

        Args:
            hidden_n:           The number of units in the hidden layer.
            training_count:     The number of training iterations.
            batch_size:         Batch size in mini-batch training.
            learning_rate:      Learning rate.
            dbm:                A pre-built `StackedAutoEncoder`, or `None` to build a default one.
        '''
        if dbm is not None and isinstance(dbm, StackedAutoEncoder) is False:
            raise TypeError("The type of `dbm` must be `StackedAutoEncoder`.")

        vector_arr = np.array(super().vectorize(self.token_arr.tolist()))
        if dbm is None:
            # Setting objects for the activation functions.
            activation_list = [
                LogisticFunction(),
                LogisticFunction(),
                LogisticFunction()
            ]
            # Setting the objects for function approximation.
            approximation_list = [ContrastiveDivergence(), ContrastiveDivergence()]

            dbm = StackedAutoEncoder(
                DBMMultiLayerBuilder(),
                [vector_arr.shape[1], hidden_n, vector_arr.shape[1]],
                activation_list,
                approximation_list,
                learning_rate  # Setting the learning rate.
            )

        # Execute learning.
        dbm.learn(
            vector_arr,
            training_count=training_count,  # If the approximation is Contrastive Divergence, this parameter is `k` in the CD-k method.
            batch_size=batch_size,          # Batch size in mini-batch training.
            r_batch_size=-1,                # If `r_batch_size` > 0, `dbm.learn` performs a kind of recursive learning.
            sgd_flag=True
        )
        # A final full-batch pass so that the feature points returned by
        # `get_feature_point` cover every token in `token_arr`.
        dbm.learn(
            vector_arr,
            training_count=1,
            batch_size=vector_arr.shape[0],
            r_batch_size=-1,
            sgd_flag=True
        )
        self.__dbm = dbm

    def vectorize(self, token_list):
        '''
        Vectorize token list.

        Args:
            token_list:     The list of tokens.

        Returns:
            [vector of token, vector of token, vector of token, ...]
        '''
        return [self.__dbm_t_hot(token).tolist() for token in token_list]

    def tokenize(self, vector_list):
        '''
        Tokenize vectors, mapping each vector to its nearest token.

        Args:
            vector_list:    The list of vectors of tokens.

        Returns:
            The array of tokens.
        '''
        if self.computable_distance is None:
            self.computable_distance = EuclidDistance()
        vector_arr = np.array(vector_list)
        feature_arr = self.__dbm.get_feature_point(layer_number=0)
        key_arr = np.empty(vector_arr.shape[0], dtype=int)
        for i in range(vector_arr.shape[0]):
            # Distances from the i-th input vector to every token's feature point.
            distance_arr = self.computable_distance.compute(
                np.expand_dims(vector_arr[i], axis=0).repeat(feature_arr.shape[0], axis=0),
                feature_arr
            )
            # The index of the nearest feature point identifies the token.
            key_arr[i] = distance_arr.argmin(axis=0)
        return self.token_arr[key_arr]

    def __dbm_t_hot(self, token):
        # Look up the token's index and return its dimension-reduced feature point.
        key = self.token_arr.tolist().index(token)
        return self.__dbm.get_feature_point(layer_number=0)[key]

    def get_dbm(self):
        ''' getter for `dbm`. '''
        return self.__dbm

    def set_dbm(self, value):
        ''' setter for `dbm`. '''
        raise TypeError("This property must be read-only.")

    dbm = property(get_dbm, set_dbm)

    def get_computable_distance(self):
        ''' getter for `computable_distance`. '''
        return self.__computable_distance

    def set_computable_distance(self, value):
        ''' setter for `computable_distance`. '''
        if isinstance(value, ComputableDistance) is False:
            raise TypeError("The type of `value` must be `ComputableDistance`.")
        self.__computable_distance = value

    computable_distance = property(get_computable_distance, set_computable_distance)
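
# ----------------------------------------------------------------------
# A minimal usage sketch (illustrative; not part of the library source).
# It assumes that `DBMTHotVectorizer` inherits `THotVectorizer`'s
# constructor, which takes the token list to be vectorized, and that
# `pydbm` is installed.
if __name__ == "__main__":
    token_list = ["natural", "language", "processing", "and", "summarization"]
    vectorizer = DBMTHotVectorizer(token_list)
    # Pre-learning: train the Stacked Auto-Encoder on the t-hot vectors.
    vectorizer.pre_learn(hidden_n=10, training_count=100, batch_size=2)
    # Dimension-reduced feature vectors for the given tokens.
    vector_list = vectorizer.vectorize(["language", "summarization"])
    # Map each feature vector back to its nearest token.
    print(vectorizer.tokenize(vector_list))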