main
Micke Nordin 12 months ago
parent a52e61aadc
commit c21cab1632
Signed by: micke
GPG Key ID: 0DA0A7A5708FE257

@ -0,0 +1,116 @@
#!/usr/bin/env python3
import argparse
import html2text
import urllib.parse
import requests
import requests.exceptions
import urllib.error
import bs4
from mimetypes import MimeTypes
# URLs that have already been handed to the crawler. main() seeds it with the
# site root and process_page() appends every link it decides to follow;
# is_in_processed_links() consults it to avoid fetching a page twice.
processed_links = []
# Scheme and host of the site being crawled, in the form 'scheme://netloc'.
# Empty until main() fills it in from the --url argument.
domain = ''
def web_get(url: str, timeout: float = 10.0) -> str:
    """Download *url* and return the response body as text.

    The function never raises for an unusable page; it returns an empty
    string instead, which callers treat as "nothing to process". That
    happens when:

    * the URL's file extension maps to a MIME type other than HTML,
    * the request fails for any reason reported by ``requests``
      (connection error, timeout, redirect loop, malformed URL, ...),
    * the final HTTP status code is not 200.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so a single stalled
            server would hang the whole crawl.

    Returns:
        The decoded body of the response, or ``''`` if the page is skipped.
    """
    # Let's bail early if this is a pdf or something
    mime_type, _encoding = MimeTypes().guess_type(url)
    if mime_type not in (None, 'text/html'):
        return ''
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # RequestException is the base class of ConnectionError, Timeout,
        # TooManyRedirects, InvalidURL and friends; none of them should
        # abort the crawl.
        return ''
    if response.status_code != 200:
        return ''
    return response.text
def get_text(html: str) -> str:
    """Render an HTML document as text using html2text.

    The converter is configured with ``ignore_links`` and
    ``images_to_alt`` switched on before the markup is handed to it.

    Args:
        html: Markup of the page.

    Returns:
        Whatever ``HTML2Text.handle`` produces for *html*.
    """
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.images_to_alt = True
    rendered = converter.handle(html)
    return rendered
def get_links(html: str) -> list:
    """Collect the distinct link targets of every ``<a>`` element in *html*.

    Each ``href`` is passed through make_links_canonical(); results that
    come back empty are dropped and duplicates are removed while keeping
    the order of first appearance.

    Args:
        html: Markup of the page.

    Returns:
        List of canonical link strings, without repeats.
    """
    document = bs4.BeautifulSoup(html, 'html.parser')
    candidates = (
        make_links_canonical(element.get('href'))
        for element in document.find_all('a')
    )
    # dict keys remember insertion order, which gives an ordered de-dup.
    distinct = dict.fromkeys(target for target in candidates if target != '')
    return list(distinct)
def get_internal_links(links: list) -> list:
    """Return the entries of *links* that belong to the crawled site.

    A link is internal when it is exactly the site prefix (the module-level
    ``domain`` without trailing slashes) or continues it with ``/``, ``?``
    or ``#``. Requiring one of those separators keeps look-alike hosts out:
    a bare prefix test would accept ``https://example.com.evil.org/``,
    ``https://example.com:8080/`` or ``https://example.com@evil.org/`` for
    the site ``https://example.com``.

    Args:
        links: Absolute link strings, e.g. as produced by get_links().

    Returns:
        A new list with the internal links, in their original order.
    """
    base = domain.rstrip('/')
    if not base:
        # No site configured, so there is nothing to compare against; keep
        # every link, as the plain prefix test always did in this case.
        return list(links)
    continuations = (base + '/', base + '?', base + '#')
    return [
        link for link in links
        if link == base or link.startswith(continuations)
    ]
def make_links_canonical(link: str) -> str:
    """Turn an ``href`` value into an absolute link, or ``''`` to skip it.

    * ``None``, empty values and links longer than 254 characters are
      skipped.
    * Links that already name a host are returned unchanged.
    * Links with a scheme but no host (``mailto:``, ``javascript:``,
      ``tel:`` ...) are skipped; they are not pages of the site.
    * Everything else is treated as a path on the crawled site and prefixed
      with the module-level ``domain``. Only the path is kept (query and
      fragment are dropped), and a missing leading slash is added so that
      ``about.html`` becomes ``<domain>/about.html`` rather than being glued
      onto the host name. The page the link was found on is not known
      here, so such paths are resolved against the site root.

    Args:
        link: Raw ``href`` attribute; may be ``None`` when the anchor has
            no ``href``.

    Returns:
        The absolute link, or ``''`` when the link should be ignored.
    """
    if not link or len(link) > 254:
        return ''
    parsed_link = urllib.parse.urlparse(str(link))
    if parsed_link.netloc:
        return link
    if parsed_link.scheme:
        # Scheme without host: mailto:, javascript:, tel:, data: ...
        return ''
    path = parsed_link.path
    if path and not path.startswith('/'):
        path = '/' + path
    return domain.rstrip('/') + path
def index(text, links):
    """Write a page's text, then its list of links, to standard output.

    Each value is printed on its own line.
    """
    print(text, links, sep='\n')
def _page_key(url: str) -> str:
    """Return the scheme://host/path identity of *url* (no query/fragment)."""
    parts = urllib.parse.urlparse(str(url))
    # An empty path and '/' name the same page, so normalise to '/'.
    return parts.scheme.lower() + '://' + parts.netloc.lower() + (parts.path or '/')


def is_in_processed_links(link: str) -> bool:
    """Tell whether the page behind *link* has already been scheduled.

    Two URLs count as the same page when scheme, host and path are equal;
    query string and fragment are ignored. The comparison is exact rather
    than a prefix match, so ``/a`` is not mistaken for an already processed
    ``/about``.

    Args:
        link: Absolute URL to look up.

    Returns:
        True if an entry of the module-level ``processed_links`` refers to
        the same page, False otherwise.
    """
    wanted = _page_key(link)
    return any(_page_key(seen) == wanted for seen in processed_links)


def _visit(url: str) -> list:
    """Fetch and index one page; return its internal links ([] if unusable)."""
    html = web_get(url)
    if html == '':
        return []
    text = get_text(html)
    links_to = get_links(html)
    index(text, links_to)
    return get_internal_links(links_to)


def process_page(url: str):
    """Crawl the site depth-first, starting at *url*.

    The start page is fetched and indexed unconditionally. Every internal
    link that is_in_processed_links() does not know yet is recorded in
    ``processed_links`` and then crawled in turn, before the remaining
    links of the page it was found on.

    The traversal keeps its own stack of per-page link iterators instead of
    recursing, so the depth of the site cannot exhaust Python's recursion
    limit. The URL that is recorded and fetched is the link without its
    fragment; path parameters and the query string stay intact.

    Args:
        url: Absolute URL of the first page to process.
    """
    # One iterator per page on the current path; the top one is the page
    # whose links are being walked right now.
    trail = [iter(_visit(url))]
    while trail:
        for link in trail[-1]:
            if is_in_processed_links(link):
                continue
            # A fragment never changes what the server returns; drop it.
            target = urllib.parse.urldefrag(str(link)).url
            processed_links.append(target)
            trail.append(iter(_visit(target)))
            # Descend into the new page first; the parent's iterator keeps
            # its position and is resumed once the child is exhausted.
            break
        else:
            trail.pop()
def main():
    """Parse the command line and crawl the site named by ``--url``.

    Only the scheme and host of ``--url`` are used: they are stored in the
    module-level ``domain``, recorded as already processed, and the crawl
    starts from that site root.

    A value that is not an absolute http(s) URL (for instance a bare
    ``example.com``) is rejected with a usage error instead of producing an
    unusable ``'://'`` domain and failing later inside the HTTP client.
    """
    global domain
    parser = argparse.ArgumentParser()
    parser.add_argument('--url',
                        help='URL to start crawling from',
                        required=True)
    args = parser.parse_args()
    parsed_domain = urllib.parse.urlparse(args.url)
    if parsed_domain.scheme not in ('http', 'https') or not parsed_domain.netloc:
        # parser.error() prints the usage message and exits with status 2.
        parser.error('--url must be an absolute http(s) URL, '
                     'e.g. https://example.com/')
    domain = parsed_domain.scheme + '://' + parsed_domain.netloc
    processed_links.append(domain)
    process_page(domain)


if __name__ == '__main__':
    main()
Loading…
Cancel
Save