Source code for pysummarization.tokenizabledoc.mecab_tokenizer

# -*- coding: utf-8 -*-
from pysummarization.tokenizable_doc import TokenizableDoc
import MeCab


class MeCabTokenizer(TokenizableDoc):
    '''
    Tokenize string.

    Japanese morphological analysis with MeCab.
    '''

    # Part-of-speech tags to keep when filtering (nouns, adjectives, verbs).
    __part_of_speech = ["名詞", "形容詞", "動詞"]

    def get_part_of_speech(self):
        ''' getter '''
        return self.__part_of_speech

    def set_part_of_speech(self, value):
        ''' setter '''
        self.__part_of_speech = value

    part_of_speech = property(get_part_of_speech, set_part_of_speech)

    def tokenize(self, sentence_str):
        '''
        Tokenize str.

        Args:
            sentence_str:   tokenized string.

        Returns:
            [token, token, token, ...]
        '''
        if len(self.part_of_speech) == 0:
            # No part-of-speech filter: plain word segmentation (wakati-gaki).
            mt = MeCab.Tagger("-Owakati")
            wordlist = mt.parse(sentence_str)
            token_list = wordlist.rstrip(" \n").split(" ")
            return token_list
        else:
            # Filter by part of speech and collect base forms.
            tagger = MeCab.Tagger(" -Ochasen")
            node = tagger.parseToNode(sentence_str)
            token_list = []
            while node:
                feature_list = node.feature.split(",")
                if feature_list[0] != "BOS/EOS":
                    if feature_list[0] in self.part_of_speech:
                        token = feature_list[6]
                        token_list.append(token)
                node = node.next
            return token_list
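
A minimal usage sketch follows. It is not part of the module above; it assumes MeCab with an IPAdic-style Japanese dictionary is installed, and the example sentence is purely illustrative.

# Minimal usage sketch (assumption: MeCab and an IPAdic-style dictionary are installed;
# the sentence below is only illustrative).
from pysummarization.tokenizabledoc.mecab_tokenizer import MeCabTokenizer

tokenizer = MeCabTokenizer()

# Default filter: keep base forms of nouns, adjectives, and verbs.
print(tokenizer.tokenize("自然言語処理は楽しい。"))

# Empty filter: fall back to plain -Owakati segmentation of surface forms.
tokenizer.part_of_speech = []
print(tokenizer.tokenize("自然言語処理は楽しい。"))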