#!/usr/bin/env python3
"""Minimal recursive crawler: fetch a page, extract its text and links,
then follow links on the same domain that have not been visited yet.

Usage: ./crawler.py --url https://example.com/
"""
import argparse
import urllib.parse
from mimetypes import MimeTypes

import bs4
import html2text
import requests
import requests.exceptions

processed_links = []
domain = ''


def web_get(url: str) -> str:
    # Bail early if this is a PDF or some other non-HTML resource.
    mime = MimeTypes()
    mime_type = mime.guess_type(url)
    if mime_type[0] not in [None, 'text/html']:
        return ''
    try:
        get_result = requests.get(url, timeout=10)
    except requests.exceptions.RequestException:
        return ''
    if get_result.status_code != 200:
        return ''
    return get_result.text


def get_text(html: str) -> str:
    # Convert the page to plain text, keeping image alt text and dropping link markup.
    text_maker = html2text.HTML2Text()
    text_maker.images_to_alt = True
    text_maker.ignore_links = True
    return text_maker.handle(html)


def get_links(html: str) -> list:
    # Collect every unique, canonicalised href found in the page's anchors.
    soup = bs4.BeautifulSoup(html, 'html.parser')
    anchors = soup.find_all('a')
    links = []
    for anchor in anchors:
        link = make_links_canonical(anchor.get('href'))
        if link != '' and link not in links:
            links.append(link)
    return links


def get_internal_links(links: list) -> list:
    # Keep only links that stay on the starting domain.
    internal_links = []
    for link in links:
        if link.startswith(domain.rstrip('/')):
            internal_links.append(link)
    return internal_links


def make_links_canonical(link: str) -> str:
    # Turn relative links into absolute ones; discard empty or oversized hrefs.
    if link is None or link == '' or len(link) > 254:
        return ''
    parsed_link = urllib.parse.urlparse(str(link))
    if parsed_link.netloc == '':
        link = domain.rstrip('/') + str(parsed_link.path)
    return link


def index(text, links):
    # Placeholder for a real indexer; for now just print what was found.
    print(text)
    print(links)


def is_in_processed_links(link: str) -> bool:
    # A link counts as processed if any visited URL starts with its
    # scheme://netloc/path prefix.
    parsed_link = urllib.parse.urlparse(str(link))
    link_start = parsed_link.scheme + '://' + parsed_link.netloc + parsed_link.path
    for processed in processed_links:
        if processed.startswith(link_start):
            return True
    return False


def process_page(url: str):
    # Fetch and index one page, then recurse into unvisited internal links.
    html = web_get(url)
    if html == '':
        return
    text = get_text(html)
    links_to = get_links(html)
    index(text, links_to)
    internal_links = get_internal_links(links_to)
    for link in internal_links:
        if not is_in_processed_links(link):
            # Re-assemble the URL from its parsed parts so the separators
            # (';', '?', '#') are preserved.
            parsed_link = urllib.parse.urlparse(str(link))
            processed_link = urllib.parse.urlunparse(parsed_link)
            processed_links.append(processed_link)
            process_page(processed_link)


def main():
    global domain
    parser = argparse.ArgumentParser()
    parser.add_argument('--url', help='URL to start crawling from', required=True)
    args = parser.parse_args()
    url = args.url
    parsed_domain = urllib.parse.urlparse(url)
    domain = parsed_domain.scheme + '://' + parsed_domain.netloc
    processed_links.append(domain)
    process_page(domain)


if __name__ == '__main__':
    main()