# Source code for pysummarization.nlp_base
# -*- coding: utf-8 -*-
from abc import ABCMeta, abstractmethod
from pysummarization.tokenizable_doc import TokenizableDoc
class NlpBase(object):
    '''
    The base class for NLP.

    Holds a pluggable tokenizer (``tokenizable_doc``), the delimiters used
    to split documents into sentences (``delimiter_list``), and the most
    recently produced token list (``token``).
    '''

    # Object of tokenizer; must be a `TokenizableDoc`, validated in the accessors.
    __tokenizable_doc = None

    # Class-level defaults kept as a fallback for subclasses that skip
    # __init__; per-instance copies are made in __init__ so instances no
    # longer share one mutable list.
    # Delimiter for self.listup_sentence.
    __delimiter_list = ["。", "\n"]
    # List of tokens.
    __token = []

    def __init__(self):
        '''
        Init.

        Creates per-instance copies of the mutable defaults so that
        mutating one instance's state cannot leak into another instance.
        '''
        self.__token = []
        self.__delimiter_list = ["。", "\n"]

    def get_tokenizable_doc(self):
        ''' getter of the tokenizer object. '''
        if isinstance(self.__tokenizable_doc, TokenizableDoc):
            return self.__tokenizable_doc
        else:
            raise TypeError("The type of `tokenizable_doc` must be `TokenizableDoc`.")

    def set_tokenizable_doc(self, value):
        ''' setter of the tokenizer object. '''
        if isinstance(value, TokenizableDoc):
            self.__tokenizable_doc = value
        else:
            raise TypeError("The type of `tokenizable_doc` must be `TokenizableDoc`.")

    tokenizable_doc = property(get_tokenizable_doc, set_tokenizable_doc)

    def get_delimiter_list(self):
        ''' getter of the sentence delimiter list. '''
        return self.__delimiter_list

    def set_delimiter_list(self, value):
        ''' setter of the sentence delimiter list. '''
        self.__delimiter_list = value

    delimiter_list = property(get_delimiter_list, set_delimiter_list)

    def get_token(self):
        ''' getter of the token list. '''
        return self.__token

    def set_token(self, value):
        ''' setter of the token list. '''
        self.__token = value

    token = property(get_token, set_token)

    def tokenize(self, data):
        '''
        Tokenize sentence and set the list of tokens to self.token.

        Args:
            data:   string to tokenize.

        Raises:
            TypeError: If `tokenizable_doc` has not been set.
        '''
        self.token = self.tokenizable_doc.tokenize(data)

    def listup_sentence(self, data, counter=0):
        '''
        Divide string into sentence list, splitting on each delimiter in
        `delimiter_list` in turn (recursively).

        Args:
            data:       string.
            counter:    recursive counter (index into `delimiter_list`).

        Returns:
            List of sentences. Each sentence keeps the delimiter that
            terminated it; a trailing fragment that was not followed by a
            delimiter is returned as-is (no delimiter is fabricated onto it).
        '''
        delimiter = self.delimiter_list[counter]
        parts = data.split(delimiter)
        # Re-attach the delimiter only to the pieces that were actually
        # followed by it in `data`; the final piece never was.
        sentence_list = [part + delimiter for part in parts[:-1] if part != ""]
        if parts[-1] != "":
            sentence_list.append(parts[-1])

        # Recurse with the next delimiter, flattening the results.
        if counter + 1 < len(self.delimiter_list):
            sentence_list_r = []
            for sentence in sentence_list:
                sentence_list_r.extend(self.listup_sentence(sentence, counter + 1))
            sentence_list = sentence_list_r

        return sentence_list