Module backend.scraper.libs.lib_scraping
Scraping class for various scraping-related functions.
Methods
__init__: Initialize the Scraping object. __del__: Destructor for the Scraping object. encode_code: Encode code as base64. decode_code: Decode base64-encoded code. decode_picture: Decode base64-encoded picture. get_result_meta: Get metadata for a given URL. take_screenshot: Take a screenshot of the browser window. get_real_url: Get the real URL after any redirects.
Classes
class Scraping-
Initialize the Scraping object.
Expand source code
class Scraping:
    """Helper methods used while scraping result pages.

    The class stores no state of its own; every method works only on the
    arguments it receives.
    """

    def __init__(self):
        """Initialize the Scraping object (nothing is stored on the instance)."""

    def __del__(self):
        """Destructor for the Scraping object; prints a short notice."""
        print('Helper object destroyed')

    def encode_code(self, code):
        """
        Encode code as base64.

        Args:
            code (str): Code to encode.

        Returns:
            bytes: Base64-encoded code.
        """
        # Characters that cannot be encoded as UTF-8 are dropped ('ignore').
        raw = code.encode('utf-8', 'ignore')
        return base64.b64encode(raw)

    def decode_code(self, value):
        """
        Decode base64-encoded code.

        The decoded bytes are parsed with BeautifulSoup's "html.parser" and
        converted back to a string.

        Args:
            value (str): Base64-encoded code.

        Returns:
            str: Decoded code, or the literal "decoding error" if any step
            raised an exception.
        """
        # Deliberately best-effort: any failure is printed and replaced by a
        # marker string instead of being propagated to the caller.
        try:
            code_decoded = base64.b64decode(value)
            code_decoded = BeautifulSoup(code_decoded, "html.parser")
            code_decoded = str(code_decoded)
        except Exception as e:
            print(str(e))
            code_decoded = "decoding error"
        return code_decoded

    def decode_picture(self, value):
        """
        Convert a picture buffer to an ASCII string.

        Args:
            value: Object with a ``tobytes()`` method (for example a
                memoryview) holding ASCII-only data such as base64 text.

        Returns:
            str: The buffer contents decoded as ASCII. The base64 payload
            itself is not decoded.
        """
        return value.tobytes().decode('ascii')

    def get_result_meta(self, url):
        """
        Get metadata for a given URL.

        Args:
            url (str): URL to get metadata for.

        Returns:
            dict: ``{"ip": ..., "main": ...}`` where "ip" is the resolved IPv4
            address of the URL's host ("-1" if it cannot be resolved) and
            "main" is ``scheme://netloc/`` of the URL (the unchanged URL if it
            cannot be split).
        """
        ip = "-1"
        # Best-effort lookup: failures are printed and reported as "-1".
        try:
            # Resolve the bare hostname. netloc may also carry a port or
            # credentials ("user@host:8080"), which gethostbyname rejects.
            hostname = urlparse(url).hostname
            if hostname:
                ip = socket.gethostbyname(hostname)
        except Exception as e:
            print(str(e))
            ip = "-1"

        try:
            main = '{0.scheme}://{0.netloc}/'.format(urlsplit(url))
        except Exception as e:
            print(str(e))
            main = url

        return {"ip": ip, "main": main}

    def take_screenshot(self, driver):
        """
        Take a screenshot of the browser window.

        The screenshot is written to a uniquely named PNG file in the "tmp"
        folder below the current working directory, read back, and the file
        is removed again.

        Args:
            driver: WebDriver instance.

        Returns:
            bytes: Base64-encoded screenshot image.
        """
        screenshot_folder = os.path.join(os.path.abspath(os.getcwd()), "tmp")
        # Make sure the temporary folder exists before the driver writes to it.
        os.makedirs(screenshot_folder, exist_ok=True)
        screenshot_file = os.path.join(
            screenshot_folder, str(uuid.uuid1()) + ".png"
        )

        time.sleep(2)
        driver.maximize_window()  # maximize browser window for screenshot
        driver.save_screenshot(screenshot_file)

        try:
            with open(screenshot_file, 'rb') as f:
                return base64.b64encode(f.read())
        finally:
            # Remove the temporary file even if reading it failed.
            if os.path.exists(screenshot_file):
                os.remove(screenshot_file)

    @staticmethod
    def get_real_url(url, driver):
        """
        Get the real URL after any redirects.

        Declared as a static method so it can be called both as
        ``Scraping.get_real_url(url, driver)`` and on an instance.

        Args:
            url (str): URL to get the real URL for.
            driver: WebDriver instance. It is quit after the URL was read.

        Returns:
            str: Real URL after any redirects, or None if an exception was
            raised (the exception is printed).
        """
        try:
            driver.get(url)
            time.sleep(4)
            current_url = driver.current_url  # read real url (redirected url)
            driver.quit()
            return current_url
        except Exception as e:
            print(str(e))
            return None

# Methods
def decode_code(self, value)-
Decode base64-encoded code.
Args
value:str- Base64-encoded code.
Returns
str- Decoded code.
def decode_picture(self, value)-
Decode base64-encoded picture.
Args
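value:memoryview- Buffer exposing tobytes() (e.g. a memoryview) that holds the base64-encoded picture as ASCII bytes.
Returns
str- The buffer contents decoded as an ASCII string (the base64 text itself is not decoded).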
def encode_code(self, code)-
Encode code as base64.
Args
code:str- Code to encode.
Returns
bytes- Base64-encoded code.
def get_real_url(url, driver)-
Get the real URL after any redirects.
Args
url:str- URL to get the real URL for.
driver- WebDriver instance.
Returns
str- Real URL after any redirects; None if loading the page raised an exception.
def get_result_meta(self, url)-
Get metadata for a given URL.
Args
url:str- URL to get metadata for.
Returns
dict- Dictionary containing the metadata.
def take_screenshot(self, driver)-
Take a screenshot of the browser window.
Args
driver- WebDriver instance.
Returns
bytes- Base64-encoded screenshot image.