Source code for pysummarization.nlpbase.auto_abstractor

import nltk
from pysummarization.nlp_base import NlpBase
from pysummarization.abstractable_doc import AbstractableDoc
from pysummarization.similarity_filter import SimilarityFilter

class AutoAbstractor(NlpBase):
    '''
    The object for automatic summarization.

    Scores each sentence by the density of frequent words it contains
    (Luhn-style significance clusters) and delegates the final selection
    of sentences to an `AbstractableDoc` filter object.
    '''

    # Only top-n scored tokens must be set to target.
    __target_n = 100

    def get_target_n(self):
        ''' getter '''
        if isinstance(self.__target_n, int) is False:
            raise TypeError("The type of __target_n must be int.")
        return self.__target_n

    def set_target_n(self, value):
        ''' setter '''
        if isinstance(value, int) is False:
            raise TypeError("The type of __target_n must be int.")
        self.__target_n = value

    target_n = property(get_target_n, set_target_n)

    # Adjacent distance: word positions closer than this are clustered together.
    __cluster_threshold = 5

    def get_cluster_threshold(self):
        ''' getter '''
        if isinstance(self.__cluster_threshold, int) is False:
            raise TypeError("The type of __cluster_threshold must be int.")
        return self.__cluster_threshold

    def set_cluster_threshold(self, value):
        ''' setter '''
        if isinstance(value, int) is False:
            raise TypeError("The type of __cluster_threshold must be int.")
        self.__cluster_threshold = value

    cluster_threshold = property(get_cluster_threshold, set_cluster_threshold)

    # The number of returned sentences.
    __top_sentences = 5

    def get_top_sentences(self):
        ''' getter '''
        if isinstance(self.__top_sentences, int) is False:
            raise TypeError("The type of __top_sentences must be int.")
        return self.__top_sentences

    def set_top_sentences(self, value):
        ''' setter '''
        if isinstance(value, int) is False:
            raise TypeError("The type of __top_sentences must be int.")
        self.__top_sentences = value

    top_sentences = property(get_top_sentences, set_top_sentences)

    def summarize(self, document, Abstractor, similarity_filter=None):
        '''
        Execute summarization.

        Args:
            document:           The target document.
            Abstractor:         The object of AbstractableDoc.
            similarity_filter:  The object of SimilarityFilter.

        Returns:
            dict data.
            - "summarize_result": The list of summarized sentences.,
            - "scoring_data":     The list of scores.

        Raises:
            TypeError: If any argument has the wrong type.
        '''
        if isinstance(document, str) is False:
            raise TypeError("The type of document must be str.")
        if isinstance(Abstractor, AbstractableDoc) is False:
            raise TypeError("The type of Abstractor must be AbstractableDoc.")
        if isinstance(similarity_filter, SimilarityFilter) is False and similarity_filter is not None:
            raise TypeError("The type of similarity_filter must be SimilarityFilter.")

        normalized_sentences = self.listup_sentence(document)

        # for filtering similar sentences.
        if similarity_filter is not None:
            normalized_sentences = similarity_filter.similar_filter_r(normalized_sentences)

        self.tokenize(document)
        words = self.token

        fdist = nltk.FreqDist(words)
        # Fix: take the n most frequent tokens. The previous
        # `[w[0] for w in fdist.items()][:n]` sliced the distribution in
        # dict insertion order, so "top-n" words were arbitrary rather
        # than the most frequent ones. FreqDist is a Counter subclass,
        # so most_common() sorts by descending frequency.
        top_n_words = [word for word, _ in fdist.most_common(self.target_n)]

        scored_list = self.__closely_associated_score(normalized_sentences, top_n_words)
        filtered_list = Abstractor.filter(scored_list)
        result_list = [normalized_sentences[idx] for (idx, score) in filtered_list]

        result_dict = {
            "summarize_result": result_list,
            "scoring_data": filtered_list
        }
        return result_dict

    def __closely_associated_score(self, normalized_sentences, top_n_words):
        '''
        Scoring the sentence with closely associations.

        For each sentence, positions of important words are grouped into
        clusters (gaps smaller than `cluster_threshold`), each cluster is
        scored as (significant words)^2 / (cluster span), and the best
        cluster's score becomes the sentence score.

        Args:
            normalized_sentences:   The list of sentences.
            top_n_words:            Important sentences.

        Returns:
            The list of scores.
        '''
        scores_list = []
        sentence_idx = -1

        for sentence in normalized_sentences:
            self.tokenize(sentence)
            sentence = self.token
            sentence_idx += 1

            word_idx = []
            for w in top_n_words:
                try:
                    word_idx.append(sentence.index(w))
                except ValueError:
                    pass

            word_idx.sort()
            # Sentences without any important word get no score entry.
            if len(word_idx) == 0:
                continue

            clusters = []
            cluster = [word_idx[0]]
            i = 1
            while i < len(word_idx):
                if word_idx[i] - word_idx[i - 1] < self.cluster_threshold:
                    cluster.append(word_idx[i])
                else:
                    clusters.append(cluster[:])
                    cluster = [word_idx[i]]
                i += 1
            clusters.append(cluster)

            max_cluster_score = 0
            for c in clusters:
                significant_words_in_cluster = len(c)
                total_words_in_cluster = c[-1] - c[0] + 1
                score = 1.0 * significant_words_in_cluster \
                    * significant_words_in_cluster / total_words_in_cluster

                if score > max_cluster_score:
                    max_cluster_score = score

            # Fix: append the maximum cluster score. The original appended
            # `score`, i.e. the score of whichever cluster happened to be
            # iterated last, discarding the max computed just above.
            scores_list.append((sentence_idx, max_cluster_score))

        return scores_list