# Source code for pysummarization.vectorizabletoken.t_hot_vectorizer

# -*- coding: utf-8 -*-
import numpy as np
from pysummarization.vectorizable_token import VectorizableToken


class THotVectorizer(VectorizableToken):
    '''
    Vectorize tokens by t-hot (one-hot) Vectorizer.

    Each token is mapped to a vector whose length equals the number of
    unique tokens, with a single `1.0` at that token's index.
    '''

    def __init__(self, token_list):
        '''
        Initialize.

        Args:
            token_list:     The list of all tokens (duplicates are removed).
        '''
        self.__token_arr = np.array(list(set(token_list)))
        # Precompute token -> column index once, so vectorizing is O(1)
        # per token instead of rebuilding a Python list and scanning it
        # (O(n)) on every single lookup.
        self.__token_index_dict = {
            token: index for index, token in enumerate(self.__token_arr.tolist())
        }

    def vectorize(self, token_list):
        '''
        Vectorize token list.

        Args:
            token_list:     The list of tokens.

        Returns:
            [vector of token, vector of token, vector of token, ...]
        '''
        return [self.__t_hot(token).tolist() for token in token_list]

    def convert_tokens_into_matrix(self, token_list):
        '''
        Create matrix of sentences.

        Args:
            token_list:     The list of tokens.

        Returns:
            2-D `np.ndarray` of sentences.
            Each row means one-hot vectors of one sentence.
        '''
        return np.array(self.vectorize(token_list)).astype(np.float32)

    def tokenize(self, vector_list):
        '''
        Tokenize vector(s) back into token(s).

        Args:
            vector_list:    A single vector (1-D) or a list of vectors (2-D).

        Returns:
            A single token for a 1-D input, or an `np.ndarray` of tokens
            for a 2-D input (argmax of each row selects the token).
        '''
        vector_arr = np.array(vector_list)
        if vector_arr.ndim == 1:
            # Single vector: argmax yields a scalar index.
            key_arr = vector_arr.argmax()
        else:
            # Batch of vectors: one argmax per row.
            key_arr = vector_arr.argmax(axis=-1)
        return self.__token_arr[key_arr]

    def __t_hot(self, token):
        '''
        Build the t-hot (one-hot) `np.ndarray` (dtype float32) for one token.

        Args:
            token:  A token seen at construction time.

        Raises:
            ValueError: If `token` is not in the token list.
        '''
        # Allocate float32 directly rather than float64 + astype (one
        # allocation instead of two).
        arr = np.zeros(len(self.__token_arr), dtype=np.float32)
        try:
            key = self.__token_index_dict[token]
        except KeyError:
            # Preserve the original `list.index` failure mode (ValueError)
            # so existing callers catching ValueError keep working.
            raise ValueError("%r is not in the token list." % token)
        arr[key] = 1
        return arr

    def get_token_arr(self):
        ''' getter: the internal `np.ndarray` of unique tokens. '''
        return self.__token_arr

    def set_token_arr(self, value):
        ''' setter: the token array is read-only; always raises TypeError. '''
        raise TypeError("This property must be read-only.")

    # Read-only property over the unique-token array.
    token_arr = property(get_token_arr, set_token_arr)