From c21cab163298ecf04419ab2f52cb523c5d62012f Mon Sep 17 00:00:00 2001
From: Micke Nordin
Date: Tue, 9 Jan 2024 13:12:20 +0100
Subject: [PATCH] Add script

---
 crawler.py | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)
 create mode 100644 crawler.py

diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..284a3d5
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+import argparse
+import html2text
+import urllib.parse
+import requests
+import requests.exceptions
+import urllib.error
+import bs4
+from mimetypes import MimeTypes
+
+processed_links = []
+domain = ''
+
+
+def web_get(url: str) -> str:
+    # Let's bail early if this is a pdf or something
+    mime = MimeTypes()
+    mime_type = mime.guess_type(url)
+    if mime_type[0] not in [None, 'text/html']:
+        return ''
+    try:
+        get_result = requests.get(url)
+    except requests.exceptions.ConnectionError:
+        return ''
+    if get_result.status_code != 200:
+        return ''
+    final_result = get_result.text
+    return final_result
+
+
+def get_text(html: str) -> str:
+    # Convert HTML to plain text, dropping links and using alt text for images
+    text_maker = html2text.HTML2Text()
+    text_maker.images_to_alt = True
+    text_maker.ignore_links = True
+
+    return text_maker.handle(html)
+
+
+def get_links(html: str) -> list:
+    # Collect every canonicalized anchor target found on the page
+    soup = bs4.BeautifulSoup(html, 'html.parser')
+    anchors = soup.find_all('a')
+    links = []
+    for anchor in anchors:
+        link = make_links_canonical(anchor.get('href'))
+        if link != '' and link not in links:
+            links.append(link)
+    return links
+
+
+def get_internal_links(links: list) -> list:
+    # Keep only links that belong to the domain we are crawling
+    global domain
+    internal_links = []
+    for link in links:
+        if link.startswith(domain.rstrip('/')):
+            internal_links.append(link)
+    return internal_links
+
+
+def make_links_canonical(link: str) -> str:
+    # Turn relative links into absolute ones; drop empty or overly long links
+    global domain
+    if link is None or link == '' or len(link) > 254:
+        return ''
+    parsed_link = urllib.parse.urlparse(str(link))
+    if parsed_link.netloc == '':
+        link = domain.rstrip('/') + str(parsed_link.path)
+    return link
+
+
+def index(text, links):
+    # Placeholder indexer: just print what was found
+    print(text)
+    print(links)
+
+
+def is_in_processed_links(link: str) -> bool:
+    # Check whether this link (ignoring query and fragment) was already crawled
+    parsed_link = urllib.parse.urlparse(str(link))
+    link_start = parsed_link.scheme + '://' + parsed_link.netloc + parsed_link.path
+    for l in processed_links:
+        if l.startswith(link_start):
+            return True
+    return False
+
+
+def process_page(url: str):
+    # Fetch a page, index it and recurse into its internal links
+    html = web_get(url)
+    if html == '':
+        return
+    text = get_text(html)
+    links_to = get_links(html)
+    index(text, links_to)
+    internal_links = get_internal_links(links_to)
+    for link in internal_links:
+        if not is_in_processed_links(link):
+            parsed_link = urllib.parse.urlparse(str(link))
+            processed_link = urllib.parse.urlunparse(parsed_link)
+            processed_links.append(processed_link)
+            process_page(processed_link)
+
+
+def main():
+    global domain
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--url',
+                        help='URL to start crawling from',
+                        required=True)
+    args = parser.parse_args()
+    url = args.url
+    parsed_domain = urllib.parse.urlparse(url)
+    domain = parsed_domain.scheme + '://' + parsed_domain.netloc
+    processed_links.append(domain)
+    process_page(domain)
+
+
+if __name__ == '__main__':
+    main()
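
A quick way to exercise the script once the patch is applied (the URL below is only a placeholder; any reachable site whose pages link to each other will do):

    python3 crawler.py --url https://example.com

The crawler prints the extracted text and outgoing links of every page it visits and keeps following links on the same domain until no unvisited internal links remain.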