From c21cab163298ecf04419ab2f52cb523c5d62012f Mon Sep 17 00:00:00 2001
From: Micke Nordin
Date: Tue, 9 Jan 2024 13:12:20 +0100
Subject: [PATCH] Add script

---
 crawler.py | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)
 create mode 100644 crawler.py

diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..284a3d5
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+import argparse
+import html2text
+import urllib.parse
+import requests
+import requests.exceptions
+import urllib.error
+import bs4
+from mimetypes import MimeTypes
+
+processed_links = []
+domain = ''
+
+
+def web_get(url: str) -> str:
+    # Let's bail early if this is a pdf or something
+    mime = MimeTypes()
+    mime_type = mime.guess_type(url)
+    if mime_type[0] not in [None, 'text/html']:
+        return ''
+    try:
+        get_result = requests.get(url)
+    except requests.exceptions.ConnectionError:
+        return ''
+    if get_result.status_code != 200:
+        return ''
+    final_result = get_result.text
+    return final_result
+
+
+def get_text(html: str) -> str:
+    # Convert HTML to plain text, dropping links and using alt text for images
+    text_maker = html2text.HTML2Text()
+    text_maker.images_to_alt = True
+    text_maker.ignore_links = True
+
+    return text_maker.handle(html)
+
+
+def get_links(html: str) -> list:
+    # Collect every canonicalized anchor target found on the page
+    soup = bs4.BeautifulSoup(html, 'html.parser')
+    anchors = soup.find_all('a')
+    links = []
+    for anchor in anchors:
+        link = make_links_canonical(anchor.get('href'))
+        if link != '' and link not in links:
+            links.append(link)
+    return links
+
+
+def get_internal_links(links: list) -> list:
+    # Keep only links that belong to the domain we are crawling
+    global domain
+    internal_links = []
+    for link in links:
+        if link.startswith(domain.rstrip('/')):
+            internal_links.append(link)
+    return internal_links
+
+
+def make_links_canonical(link: str) -> str:
+    # Turn relative links into absolute ones; drop empty or overly long links
+    global domain
+    if link is None or link == '' or len(link) > 254:
+        return ''
+    parsed_link = urllib.parse.urlparse(str(link))
+    if parsed_link.netloc == '':
+        link = domain.rstrip('/') + str(parsed_link.path)
+    return link
+
+
+def index(text, links):
+    # Placeholder indexer: just print what was found
+    print(text)
+    print(links)
+
+
+def is_in_processed_links(link: str) -> bool:
+    # Check whether this link (ignoring query and fragment) was already crawled
+    parsed_link = urllib.parse.urlparse(str(link))
+    link_start = parsed_link.scheme + '://' + parsed_link.netloc + parsed_link.path
+    for l in processed_links:
+        if l.startswith(link_start):
+            return True
+    return False
+
+
+def process_page(url: str):
+    # Fetch a page, index it and recurse into its internal links
+    html = web_get(url)
+    if html == '':
+        return
+    text = get_text(html)
+    links_to = get_links(html)
+    index(text, links_to)
+    internal_links = get_internal_links(links_to)
+    for link in internal_links:
+        if not is_in_processed_links(link):
+            parsed_link = urllib.parse.urlparse(str(link))
+            processed_link = urllib.parse.urlunparse(parsed_link)
+            processed_links.append(processed_link)
+            process_page(processed_link)
+
+
+def main():
+    global domain
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--url',
+                        help='URL to start crawling from',
+                        required=True)
+    args = parser.parse_args()
+    url = args.url
+    parsed_domain = urllib.parse.urlparse(url)
+    domain = parsed_domain.scheme + '://' + parsed_domain.netloc
+    processed_links.append(domain)
+    process_page(domain)
+
+
+if __name__ == '__main__':
+    main()
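
A quick way to exercise the script once the patch is applied (the URL below is only a placeholder; any reachable site whose pages link to each other will do):

    python3 crawler.py --url https://example.com

The crawler prints the extracted text and outgoing links of every page it visits and keeps following links on the same domain until no unvisited internal links remain.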