Source code for pysummarization.nlpbase.auto_abstractor
import nltk
from pysummarization.nlp_base import NlpBase
from pysummarization.abstractable_doc import AbstractableDoc
from pysummarization.similarity_filter import SimilarityFilter
class AutoAbstractor(NlpBase):
    '''
    The object for automatic summarization.

    Each sentence is scored by how densely the document's most frequent
    tokens cluster inside it. Which scored sentences end up in the
    summary is decided by the `AbstractableDoc` passed to `summarize`.
    '''

    # Only top-n scored tokens must be set to target.
    __target_n = 100

    def get_target_n(self):
        ''' getter '''
        if not isinstance(self.__target_n, int):
            raise TypeError("The type of __target_n must be int.")
        return self.__target_n

    def set_target_n(self, value):
        ''' setter '''
        if not isinstance(value, int):
            raise TypeError("The type of __target_n must be int.")
        self.__target_n = value

    target_n = property(get_target_n, set_target_n)

    # Adjacent distance: two significant tokens belong to the same cluster
    # when the gap between their positions is strictly below this value.
    __cluster_threshold = 5

    def get_cluster_threshold(self):
        ''' getter '''
        if not isinstance(self.__cluster_threshold, int):
            raise TypeError("The type of __cluster_threshold must be int.")
        return self.__cluster_threshold

    def set_cluster_threshold(self, value):
        ''' setter '''
        if not isinstance(value, int):
            raise TypeError("The type of __cluster_threshold must be int.")
        self.__cluster_threshold = value

    cluster_threshold = property(get_cluster_threshold, set_cluster_threshold)

    # The number of returned sentences.
    # NOTE(review): not read anywhere in this class; presumably consumed
    # by callers or by AbstractableDoc implementations -- confirm.
    __top_sentences = 5

    def get_top_sentences(self):
        ''' getter '''
        if not isinstance(self.__top_sentences, int):
            raise TypeError("The type of __top_sentences must be int.")
        return self.__top_sentences

    def set_top_sentences(self, value):
        ''' setter '''
        if not isinstance(value, int):
            raise TypeError("The type of __top_sentences must be int.")
        self.__top_sentences = value

    top_sentences = property(get_top_sentences, set_top_sentences)

    def summarize(self, document, Abstractor, similarity_filter=None):
        '''
        Execute summarization.

        Args:
            document:           The target document.
            Abstractor:         The object of AbstractableDoc.
            similarity_filter:  The object of SimilarityFilter, or None
                                to skip filtering of similar sentences.

        Returns:
            dict data.
            - "summarize_result": The list of summarized sentences.,
            - "scoring_data":     The list of (sentence index, score)
                                  tuples returned by `Abstractor.filter`.

        Raises:
            TypeError: If an argument is not of the expected type.
        '''
        if not isinstance(document, str):
            raise TypeError("The type of document must be str.")
        if not isinstance(Abstractor, AbstractableDoc):
            raise TypeError("The type of Abstractor must be AbstractableDoc.")
        if similarity_filter is not None and not isinstance(similarity_filter, SimilarityFilter):
            raise TypeError("The type of similarity_filter must be SimilarityFilter.")

        normalized_sentences = self.listup_sentence(document)

        # for filtering similar sentences.
        if similarity_filter is not None:
            normalized_sentences = similarity_filter.similar_filter_r(normalized_sentences)

        # Token frequencies are computed over the whole document.
        self.tokenize(document)
        fdist = nltk.FreqDist(self.token)
        # `most_common` guarantees frequency order. Slicing `items()` does
        # not: on NLTK 3, FreqDist is a Counter and `items()` follows
        # insertion order, which would pick the first n distinct tokens.
        top_n_words = [word for word, _ in fdist.most_common(self.target_n)]

        scored_list = self.__closely_associated_score(normalized_sentences, top_n_words)
        filtered_list = Abstractor.filter(scored_list)
        result_list = [normalized_sentences[idx] for (idx, _) in filtered_list]

        return {
            "summarize_result": result_list,
            "scoring_data": filtered_list
        }

    def __closely_associated_score(self, normalized_sentences, top_n_words):
        '''
        Scoring the sentence with closely associations.

        For every sentence, the positions of the significant words are
        grouped into clusters of nearby positions. Each cluster scores
        (significant words)^2 / (cluster span), and the sentence receives
        the highest score among its clusters.

        Args:
            normalized_sentences:   The list of sentences.
            top_n_words:            Important (most frequent) words.

        Returns:
            The list of (sentence index, score) tuples. Sentences that
            contain none of `top_n_words` are omitted.
        '''
        threshold = self.cluster_threshold
        scores_list = []

        for sentence_idx, sentence in enumerate(normalized_sentences):
            self.tokenize(sentence)
            tokens = self.token

            # Position of the first occurrence of each significant word.
            word_idx = []
            for w in top_n_words:
                try:
                    word_idx.append(tokens.index(w))
                except ValueError:
                    # The word does not occur in this sentence.
                    continue

            if not word_idx:
                continue
            word_idx.sort()

            # Split the sorted positions into clusters: a new cluster starts
            # whenever the gap to the previous position reaches the threshold.
            clusters = [[word_idx[0]]]
            for position in word_idx[1:]:
                if position - clusters[-1][-1] < threshold:
                    clusters[-1].append(position)
                else:
                    clusters.append([position])

            # Record the best cluster score, not the score of whichever
            # cluster happened to be evaluated last.
            max_cluster_score = max(
                1.0 * len(c) * len(c) / (c[-1] - c[0] + 1)
                for c in clusters
            )
            scores_list.append((sentence_idx, max_cluster_score))

        return scores_list