Source code for pysummarization.readablewebpdf.web_pdf_reading

import os
import tempfile
import urllib.parse
import urllib.request
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

from pysummarization.readable_web_pdf import ReadableWebPDF


class WebPDFReading(ReadableWebPDF):
    '''
    Read the PDF.

    Extracts the text of a PDF document, given either its URL or a local
    file path, using pdfminer.
    '''

    def url_to_text(self, url):
        '''
        Download PDF file and transform its document to string.

        The document is downloaded into a temporary file owned by this
        method; that file is removed again once extraction has finished,
        whether it succeeded or raised.

        Args:
            url:    PDF url.

        Returns:
            string.
        '''
        fd, tmp_path = tempfile.mkstemp(suffix=".pdf")
        # Only the path is needed; `urlretrieve` reopens the file by name.
        os.close(fd)
        try:
            # An explicit target filename makes `urlretrieve` copy the data
            # into a file we own, so deleting it below can never touch a
            # file that belongs to the caller.
            urllib.request.urlretrieve(url, tmp_path)
            return self.path_to_text(tmp_path)
        finally:
            os.remove(tmp_path)

    def path_to_text(self, path):
        '''
        Transform local PDF file to string.

        Every line break is stripped from the extracted text, so the
        result is one continuous string.

        Args:
            path:   path to PDF file.

        Returns:
            string.
        '''
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()

        with StringIO() as retstr:
            device = TextConverter(
                rsrcmgr,
                retstr,
                codec='utf-8',
                laparams=laparams
            )
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                with open(path, 'rb') as fp:
                    # NOTE(review): an empty page-number set together with
                    # maxpages=0 is understood to select every page in
                    # pdfminer -- confirm against the installed version.
                    pages_data = PDFPage.get_pages(
                        fp,
                        set(),
                        maxpages=0,
                        password="",
                        caching=True,
                        check_extractable=True
                    )
                    for page in pages_data:
                        interpreter.process_page(page)
                # Read the buffer before the `with` block closes it.
                text = retstr.getvalue()
            finally:
                # Release the converter even when opening or parsing fails.
                device.close()

        return text.replace("\n", "")

    def is_pdf_url(self, url):
        '''
        Check PDF file format.

        The check looks only at the URL text: it is True when the URL
        itself, or its path component (ignoring any query string or
        fragment), ends with ``.pdf`` in any letter case.

        @TODO(chimera0): validation.

        Args:
            url:    URL

        Returns:
            True: PDF, False: not PDF
        '''
        if not isinstance(url, str):
            return False
        # Keeps the historical rule (URL ends with ".pdf") and additionally
        # accepts upper-case extensions.
        if url.lower().endswith(".pdf"):
            return True
        try:
            url_path = urllib.parse.urlparse(url).path
        except ValueError:
            # Malformed URL: it cannot be recognised as a PDF link.
            return False
        return url_path.lower().endswith(".pdf")
def main(argv=None):
    '''
    Command line entry point: print the first 300 characters of a web PDF.

    Args:
        argv:   argument list laid out like ``sys.argv`` (program name
                first, then the PDF url). Defaults to ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 2 when the url is missing.
    '''
    import sys

    args = sys.argv if argv is None else argv
    if len(args) < 2:
        # Without this guard a missing argument surfaced as a bare IndexError.
        program = args[0] if args else "web_pdf_reading"
        print("usage: " + program + " <pdf-url>", file=sys.stderr)
        return 2

    text = WebPDFReading().url_to_text(args[1])
    print(text[:300])
    return 0


if __name__ == "__main__":
    raise SystemExit(main())