parent
a52e61aadc
commit
c21cab1632
@@ -0,0 +1,116 @@
#!/usr/bin/env python3

import argparse
import urllib.parse
from mimetypes import MimeTypes

import bs4
import html2text
import requests
import requests.exceptions


# URLs that have already been crawled, so each page is visited only once.
processed_links = []
# scheme://netloc of the start URL; set in main() and used to keep the
# crawl on one site.
domain = ''


def web_get(url: str) -> str:
    # Bail out early if the URL points at a PDF or other non-HTML resource.
    mime = MimeTypes()
    mime_type = mime.guess_type(url)
    if mime_type[0] not in [None, 'text/html']:
        return ''
    try:
        # The timeout keeps a stalled server from hanging the crawl forever.
        get_result = requests.get(url, timeout=10)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        return ''
    if get_result.status_code != 200:
        return ''
    return get_result.text
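
# For example, mime.guess_type('https://example.com/doc.pdf') returns
# ('application/pdf', None), so web_get skips the download, while an
# extensionless page URL returns (None, None) and is fetched normally.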


def get_text(html: str) -> str:
    # Convert the page to plain text: images collapse to their alt text
    # and link markup is dropped.
    text_maker = html2text.HTML2Text()
    text_maker.images_to_alt = True
    text_maker.ignore_links = True

    return text_maker.handle(html)


def get_links(html: str) -> list:
    # Collect the canonical form of every unique href on the page.
    soup = bs4.BeautifulSoup(html, 'html.parser')
    anchors = soup.find_all('a')
    links = []
    for anchor in anchors:
        link = make_links_canonical(anchor.get('href'))
        if link != '' and link not in links:
            links.append(link)
    return links


def get_internal_links(links: list) -> list:
    # Keep only links that stay on the site being crawled.
    internal_links = []
    for link in links:
        if link.startswith(domain.rstrip('/')):
            internal_links.append(link)
    return internal_links


def make_links_canonical(link: str) -> str:
    # Resolve relative hrefs against the crawl domain; reject missing,
    # empty, or implausibly long links.
    if link is None or link == '' or len(link) > 254:
        return ''
    parsed_link = urllib.parse.urlparse(str(link))
    if parsed_link.netloc == '':
        # Join against the domain root so both '/about' and 'about'
        # resolve to an absolute URL.
        link = urllib.parse.urljoin(domain.rstrip('/') + '/', str(parsed_link.path))
    return link
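
# For example, with domain set to 'https://example.com', a relative href
# such as '/about' canonicalizes to 'https://example.com/about', while an
# absolute href like 'https://other.example/page' is returned unchanged.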


def index(text, links):
    # Stand-in indexer: just print the extracted text and links.
    print(text)
    print(links)


def is_in_processed_links(link: str) -> bool:
    # A link counts as processed when some already-crawled URL starts
    # with its scheme://netloc/path prefix.
    parsed_link = urllib.parse.urlparse(str(link))
    link_start = parsed_link.scheme + '://' + parsed_link.netloc + parsed_link.path
    for l in processed_links:
        if l.startswith(link_start):
            return True
    return False
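
# Note that the prefix match is loose: once 'https://example.com/about' has
# been crawled, is_in_processed_links('https://example.com/a') also reports
# True, so some distinct pages may be skipped.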


def process_page(url: str):
    # Fetch a page, index its text and links, then recurse into any
    # internal links that have not been visited yet.
    html = web_get(url)
    if html == '':
        return
    text = get_text(html)
    links_to = get_links(html)
    index(text, links_to)
    internal_links = get_internal_links(links_to)
    for link in internal_links:
        if not is_in_processed_links(link):
            # urlunparse keeps the separators (';', '?', '#') that plain
            # concatenation of the parsed parts would drop.
            processed_link = urllib.parse.urlunparse(urllib.parse.urlparse(str(link)))
            processed_links.append(processed_link)
            process_page(processed_link)
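
# process_page recurses once per unvisited internal link, so a large site
# can exhaust Python's default recursion limit (roughly 1000 frames). A
# hypothetical stack-based variant built on the same helpers would avoid
# that:
#
#     def process_page_iterative(start_url: str):
#         stack = [start_url]
#         while stack:
#             url = stack.pop()
#             html = web_get(url)
#             if html == '':
#                 continue
#             links_to = get_links(html)
#             index(get_text(html), links_to)
#             for link in get_internal_links(links_to):
#                 if not is_in_processed_links(link):
#                     processed_links.append(link)
#                     stack.append(link)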


def main():
    global domain
    parser = argparse.ArgumentParser()
    parser.add_argument('--url',
                        help='URL to start crawling from',
                        required=True)
    args = parser.parse_args()
    url = args.url
    # Remember the site root so the crawl stays on this domain.
    parsed_domain = urllib.parse.urlparse(url)
    domain = parsed_domain.scheme + '://' + parsed_domain.netloc
    processed_links.append(domain)
    process_page(domain)


if __name__ == '__main__':
    main()
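
# Example invocation, assuming this file is saved as crawler.py and the
# third-party packages (requests, html2text, beautifulsoup4) are installed:
#
#     python3 crawler.py --url https://example.com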