Source code for pysummarization.nlp_base

# -*- coding: utf-8 -*-
from abc import ABCMeta, abstractmethod
from pysummarization.tokenizable_doc import TokenizableDoc


class NlpBase(object):
    '''
    The base class for NLP.
    '''

    # Object of tokenizer.
    __tokenizable_doc = None

    def get_tokenizable_doc(self):
        ''' getter '''
        if isinstance(self.__tokenizable_doc, TokenizableDoc):
            return self.__tokenizable_doc
        else:
            raise TypeError()

    def set_tokenizable_doc(self, value):
        ''' setter '''
        if isinstance(value, TokenizableDoc):
            self.__tokenizable_doc = value
        else:
            raise TypeError()

    tokenizable_doc = property(get_tokenizable_doc, set_tokenizable_doc)

    # Delimiters for self.listup_sentence.
    __delimiter_list = ["。", "\n"]

    def get_delimiter_list(self):
        ''' getter '''
        return self.__delimiter_list

    def set_delimiter_list(self, value):
        ''' setter '''
        self.__delimiter_list = value

    delimiter_list = property(get_delimiter_list, set_delimiter_list)

    # List of tokens.
    __token = []

    def get_token(self):
        ''' getter '''
        return self.__token

    def set_token(self, value):
        ''' setter '''
        self.__token = value

    token = property(get_token, set_token)

    def tokenize(self, data):
        '''
        Tokenize sentence and set the list of tokens to self.token.

        Args:
            data:   string.
        '''
        self.token = self.tokenizable_doc.tokenize(data)

    def listup_sentence(self, data, counter=0):
        '''
        Divide string into sentence list.

        Args:
            data:       string.
            counter:    recursive counter.

        Returns:
            List of sentences.
        '''
        delimiter = self.delimiter_list[counter]
        sentence_list = [
            sentence + delimiter
            for sentence in data.split(delimiter)
            if sentence != ""
        ]
        # Recursively split each fragment on the next delimiter in the list.
        if counter + 1 < len(self.delimiter_list):
            sentence_list_r = []
            for sentence in sentence_list:
                sentence_list_r.extend(self.listup_sentence(sentence, counter + 1))
            sentence_list = sentence_list_r
        return sentence_list
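
A minimal usage sketch (not part of the library source): NlpBase delegates tokenization to a TokenizableDoc object, so a concrete subclass must be supplied before calling tokenize. SimpleTokenizer below is a hypothetical whitespace tokenizer standing in for a real implementation such as a MeCab- or nltk-based one.

    from pysummarization.nlp_base import NlpBase
    from pysummarization.tokenizable_doc import TokenizableDoc


    class SimpleTokenizer(TokenizableDoc):
        ''' Hypothetical tokenizer for illustration: splits on whitespace. '''

        def tokenize(self, sentence_str):
            return sentence_str.split()


    nlp_base = NlpBase()
    nlp_base.tokenizable_doc = SimpleTokenizer()
    # Override the default Japanese delimiters with English ones.
    nlp_base.delimiter_list = [".", "\n"]

    document = "First sentence. Second sentence.\nThird sentence."
    sentence_list = nlp_base.listup_sentence(document)
    # -> ['First sentence.\n', ' Second sentence.\n', 'Third sentence.\n']
    # Note: listup_sentence re-appends each delimiter to every fragment,
    # so the trailing "\n" appears even where the input had none.

    nlp_base.tokenize(sentence_list[0])
    print(nlp_base.token)  # -> ['First', 'sentence.']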