Source code for pysummarization.similarityfilter.tfidf_cosine

# -*- coding: utf-8 -*-
import numpy as np
from pysummarization.similarity_filter import SimilarityFilter
from pysummarization.vectorizabletoken.tfidf_vectorizer import TfidfVectorizer


class TfIdfCosine(SimilarityFilter):
    '''
    Concrete class for filtering mutually similar sentences.

    Similarity between two token lists is measured as the cosine of
    the angle between their Tf-Idf vectors.
    '''

    def calculate(self, token_list_x, token_list_y):
        '''
        Calculate similarity with the so-called Cosine similarity of Tf-Idf vectors.

        Concrete method.

        Args:
            token_list_x:    [token, token, token, ...]
            token_list_y:    [token, token, token, ...]

        Returns:
            Similarity. `0.0` is returned when either token list is empty,
            when either vector has zero norm, or when the quotient is NaN.
        '''
        if len(token_list_x) == 0 or len(token_list_y) == 0:
            return 0.0

        # Vocabulary for the vectorizer: the distinct tokens of both lists.
        document_list = list(set(list(token_list_x) + list(token_list_y)))

        tfidf_vectorizer = TfidfVectorizer(document_list)
        vector_list_x = tfidf_vectorizer.vectorize(token_list_x)
        vector_list_y = tfidf_vectorizer.vectorize(token_list_y)

        # Zero-pad the shorter vector so both have the same length
        # before taking the dot product.
        length_diff = len(vector_list_x) - len(vector_list_y)
        if length_diff > 0:
            vector_list_y.extend([0.0] * length_diff)
        elif length_diff < 0:
            vector_list_x.extend([0.0] * -length_diff)

        dot_prod = np.dot(vector_list_x, vector_list_y)
        norm_x = np.linalg.norm(vector_list_x)
        norm_y = np.linalg.norm(vector_list_y)

        # NumPy scalar division by zero yields NaN/inf with a warning rather
        # than raising ZeroDivisionError, so guard the denominator explicitly.
        denominator = norm_x * norm_y
        if denominator == 0:
            return 0.0

        result = dot_prod / denominator
        # np.isnan returns numpy.bool_, so it must be tested by truthiness;
        # an identity comparison against `True` never matches.
        if np.isnan(result):
            return 0.0
        return result