# Source code for pysummarization.web_scraping

# -*- coding: utf-8 -*-
import urllib
import urllib.request
from time import sleep

from pyquery import PyQuery as pq

from pysummarization.readable_web_pdf import ReadableWebPDF


class WebScraping(object):
    '''
    Object of Web-scraping.

    Downloads a web page (or delegates to a `ReadableWebPDF` object when the
    URL points at a PDF) and returns its text content.

    This is only a demo.
    '''

    # List of scraped dom objects (selectors whose text is collected).
    __dom_object_list = ["body"]

    # List of not scraped dom objects (selectors removed before extraction).
    __remove_object_list = ["script", "style"]

    # Object of ReadableWebPdf. `None` disables the PDF branch in `scrape`.
    __readable_web_pdf = None

    def get_readable_web_pdf(self):
        '''
        getter

        Returns:
            The configured `ReadableWebPDF` object, or `None`.

        Raises:
            TypeError: If the stored value is neither `None` nor
                       a `ReadableWebPDF`.
        '''
        value = self.__readable_web_pdf
        if value is not None and not isinstance(value, ReadableWebPDF):
            raise TypeError("The type of __readable_web_pdf must be ReadableWebPDF.")
        return value

    def set_readable_web_pdf(self, value):
        '''
        setter

        Args:
            value:  A `ReadableWebPDF` object, or `None`.

        Raises:
            TypeError: If `value` is neither `None` nor a `ReadableWebPDF`.
        '''
        if value is not None and not isinstance(value, ReadableWebPDF):
            raise TypeError("The type of __readable_web_pdf must be ReadableWebPDF.")
        self.__readable_web_pdf = value

    readable_web_pdf = property(get_readable_web_pdf, set_readable_web_pdf)

    @staticmethod
    def _fetch_html(url):
        '''
        Download `url` and return the response body decoded as UTF-8.

        Args:
            url:    Web site url.

        Returns:
            The decoded response body. This is a string.
        '''
        # `urllib.request` must be imported explicitly: a bare `import urllib`
        # does not load the submodule.
        req = urllib.request.Request(url=url)
        with urllib.request.urlopen(req) as response:
            return response.read().decode('utf-8')

    def scrape(self, url):
        '''
        Execute Web-Scraping.
        The target dom objects are in self.__dom_object_list.

        Args:
            url:    Web site url.

        Returns:
            The result. this is a string.

        Raises:
            TypeError: If `url` is not a `str`.

        @TODO(chimera0): check URLs format.
        '''
        if not isinstance(url, str):
            raise TypeError("The type of url must be str.")

        pdf_reader = self.readable_web_pdf
        if pdf_reader is not None and pdf_reader.is_pdf_url(url) is True:
            # PDF documents are converted to text by the injected reader.
            web_data = pdf_reader.url_to_text(url)
        else:
            dom = pq(self._fetch_html(url))
            # Strip the non-content elements before the text is extracted.
            for remove_object in self.__remove_object_list:
                dom(remove_object).remove()
            web_data = "".join(
                dom(dom_object).text()
                for dom_object in self.__dom_object_list
            )

        # Pause one second per call; presumably a politeness delay between
        # consecutive requests -- confirm before removing.
        sleep(1)
        return web_data