Source code for pysummarization.similarity_filter
# -*- coding: utf-8 -*-
from abc import ABCMeta, abstractmethod
from pysummarization.nlp_base import NlpBase
[docs]class SimilarityFilter(metaclass=ABCMeta):
'''
Abstract class for filtering mutually similar sentences.
'''
# NlpBase
__nlp_base = None
[docs] def get_nlp_base(self):
''' getter '''
if isinstance(self.__nlp_base, NlpBase) is False:
raise TypeError("The type of self.__nlp_base must be NlpBase.")
return self.__nlp_base
[docs] def set_nlp_base(self, value):
''' setter '''
if isinstance(value, NlpBase) is False:
raise TypeError("The type of value must be NlpBase.")
self.__nlp_base = value
nlp_base = property(get_nlp_base, set_nlp_base)
# Cut off threshold.
__similarity_limit = 0.8
[docs] def get_similarity_limit(self):
''' getter '''
if isinstance(self.__similarity_limit, float) is False:
raise TypeError("__similarity_limit must be float.")
return self.__similarity_limit
[docs] def set_similarity_limit(self, value):
''' setter '''
if isinstance(value, float) is False:
raise TypeError("__similarity_limit must be float.")
self.__similarity_limit = value
similarity_limit = property(get_similarity_limit, set_similarity_limit)
[docs] @abstractmethod
def calculate(self, token_list_x, token_list_y):
'''
Calculate similarity.
Abstract method.
Args:
token_list_x: [token, token, token, ...]
token_list_y: [token, token, token, ...]
Returns:
Similarity.
'''
raise NotImplementedError("This method must be implemented.")
[docs] def unique(self, token_list_x, token_list_y):
'''
Remove duplicated elements.
Args:
token_list_x: [token, token, token, ...]
token_list_y: [token, token, token, ...]
Returns:
Tuple(token_list_x, token_list_y)
'''
x = set(list(token_list_x))
y = set(list(token_list_y))
return (x, y)
[docs] def count(self, token_list):
'''
Count the number of tokens in `token_list`.
Args:
token_list: The list of tokens.
Returns:
{token: the numbers}
'''
token_dict = {}
for token in token_list:
if token in token_dict:
token_dict[token] += 1
else:
token_dict[token] = 1
return token_dict
[docs] def similar_filter_r(self, sentence_list):
'''
Filter mutually similar sentences.
Args:
sentence_list: The list of sentences.
Returns:
The list of filtered sentences.
'''
result_list = []
recursive_list = []
try:
self.nlp_base.tokenize(sentence_list[0])
subject_token = self.nlp_base.token
result_list.append(sentence_list[0])
if len(sentence_list) > 1:
for i in range(len(sentence_list)):
if i > 0:
self.nlp_base.tokenize(sentence_list[i])
object_token = self.nlp_base.token
similarity = self.calculate(subject_token, object_token)
if similarity <= self.similarity_limit:
recursive_list.append(sentence_list[i])
if len(recursive_list) > 0:
result_list.extend(self.similar_filter_r(recursive_list))
except IndexError:
result_list = sentence_list
return result_list