gemtext2html/src/convert/__init__.py

#!/usr/bin/env python3
"""This is a python module that will parse gemtext and convert it to html5
"""
# -*- coding: utf-8 -*-
import html
import sys


class GemParser:
    """Line-oriented state machine that converts gemtext into HTML5.

    Feed the parser one gemtext line at a time through ``parse_line`` or
    convert a whole file with ``get_document_from_gemfile``. Block level
    elements (paragraph, list, blockquote, preformatted text) stay open
    across calls and are closed when a later line no longer belongs to
    them, or by the caller through ``get_end_tag`` at the end of input.

    All text copied from the gemtext source is HTML-escaped before it is
    placed into the output.
    """

    # Tag templates, indexed by the integer values of ``State``.
    _START_TAGS = (
        '',
        '<p>\n',
        '<a href="{}">',
        '<pre{}>\n',
        '<h{}>',
        '<ul>\n',
        '<blockquote>\n',
    )
    _END_TAGS = (
        '',
        '</p>\n',
        '</a>\n',
        '</pre>\n',
        '</h{}>\n',
        '</ul>\n',
        '</blockquote>\n',
    )
    # HTML only defines the heading elements h1 through h6.
    _MAX_HEADING_LEVEL = 6

    def __init__(self):
        """Constructor for the GemParser class
        """
        self.mstate = State.INITIAL
        # No longer used by the parser; kept so that code which reads the
        # attribute keeps working.
        self.recurse = False

    def get_document_from_gemfile(self, filename):
        """Read a gemtext file line by line and convert it to html

        The parser state is reset before and after the conversion, so one
        parser instance can convert several files in a row.

        Args:
            filename (str): A file name corresponding to a file of gemtext

        Returns:
            str: A valid html5 document as string
        """
        self.mstate = State.INITIAL
        parts = [
            '<!DOCTYPE html>\n<html>\n<head>\n',
            '<title>gemtext2html</title>\n</head>\n<body>\n',
        ]
        with open(filename, encoding='utf-8') as gemtext:
            for mline in gemtext:
                rline = self.parse_line(mline)
                # Only drop empty fragments. A lone '\n' is a blank line
                # inside a preformatted block and has to be kept.
                if rline:
                    parts.append(rline)
        # Close whatever block is still open at the end of the file.
        parts.append(self.get_end_tag())
        self.mstate = State.INITIAL
        parts.append('</body>\n</html>\n')
        return ''.join(parts)

    def get_end_tag(self):
        """A subroutine that will emit the correct end tag for the state

        Returns:
            str: A html end tag; the heading entry still contains a ``{}``
            placeholder for the heading level
        """
        return self._END_TAGS[self.mstate]

    def get_start_tag(self):
        """A subroutine to emit the correct html start tag for the state

        Returns:
            str: A html start tag; the link, pre and heading entries still
            contain a ``{}`` placeholder to be filled in by the caller
        """
        return self._START_TAGS[self.mstate]

    def is_text(self, line):
        """A function that will check if this is a paragraph of text

        Args:
            line (str): A line of gemtext

        Returns:
            bool: True if the parser is in the initial state and the line
            is neither blank nor introduced by one of the gemtext tags,
            False otherwise
        """
        if line in ('', '\n') or self.mstate != State.INITIAL:
            return False
        return not line.startswith(
            (Tag.LINK, Tag.PRE, Tag.HEADING, Tag.UNORDERED, Tag.QUOTE))

    def parse_heading(self, line, level=0):
        """Convert a gemtext heading line into a html heading element

        Every leading heading marker raises the level by one. The result
        is clamped to the range 1-6 so that the emitted element is always
        a valid html heading. The parser state is not consulted.

        Args:
            line (str): A gemtext heading line
            level (int, optional): The heading level to start from.
                Defaults to 0.

        Returns:
            str: A html heading tag of the correct level
        """
        text = line
        while text.startswith(Tag.HEADING):
            text = text[len(Tag.HEADING):]
            level += 1
        level = max(1, min(level, self._MAX_HEADING_LEVEL))
        return '{}{}{}'.format(
            self._START_TAGS[State.HEADING].format(level),
            html.escape(text.strip(), quote=False),
            self._END_TAGS[State.HEADING].format(level))

    def parse_line(self, line):
        """Parse a single line of gemtext and return the matching html

        The parser state is updated so that block elements which span
        several lines are opened and closed at the right place.

        Args:
            line (str): A string of gemtext

        Returns:
            str: A string of html, empty for blank lines outside of
            preformatted text
        """
        # Inside preformatted text only the toggle line is special; every
        # other line, blank ones included, is copied through.
        if self.mstate == State.PRETEXT:
            if line.startswith(Tag.PRE):
                closing = self.get_end_tag()
                self.mstate = State.INITIAL
                return closing
            return html.escape(line, quote=False)

        # Blank lines produce no output and leave the open block untouched.
        if line in ('', '\n'):
            return ''

        starts_with_ul = line.startswith(Tag.UNORDERED)
        starts_with_quote = line.startswith(Tag.QUOTE)
        continues_block = (
            (self.mstate == State.UNORDERED and starts_with_ul)
            or (self.mstate == State.QUOTE and starts_with_quote))

        rstring = ''
        if not continues_block:
            # The line does not belong to the open block: close it (every
            # text line is its own paragraph, so a paragraph always ends
            # here) and continue from the initial state.
            if self.mstate in (State.TEXT, State.UNORDERED, State.QUOTE):
                rstring += self.get_end_tag()
            self.mstate = State.INITIAL

        # Start of preformatted text; the rest of the line is alt text.
        if line.startswith(Tag.PRE):
            alt = line[len(Tag.PRE):].strip()
            attribute = ''
            if alt:
                attribute = ' aria-label="{}"'.format(
                    html.escape(alt, quote=True))
            self.mstate = State.PRETEXT
            return rstring + self.get_start_tag().format(attribute)

        if line.startswith(Tag.HEADING):
            return rstring + self.parse_heading(line)

        if line.startswith(Tag.LINK):
            return rstring + self.parse_link(line)

        if starts_with_ul:
            if self.mstate != State.UNORDERED:
                self.mstate = State.UNORDERED
                rstring += self.get_start_tag()
            item = line[len(Tag.UNORDERED):].strip()
            return rstring + '<li>{}</li>\n'.format(
                html.escape(item, quote=False))

        if starts_with_quote:
            if self.mstate != State.QUOTE:
                self.mstate = State.QUOTE
                rstring += self.get_start_tag()
            return rstring + html.escape(line[len(Tag.QUOTE):], quote=False)

        # Anything else is a paragraph of plain text. Its end tag is
        # emitted when the next non-blank line (or the end of input) comes.
        self.mstate = State.TEXT
        return rstring + self.get_start_tag() + html.escape(line, quote=False)

    def parse_link(self, line):
        """This function will parse a gemtext link

        The text after the link marker is split on the first run of
        whitespace into the target and an optional label. Without a label
        the target itself is shown. The parser state is not consulted.

        Args:
            line (str): A link line of gemtext

        Returns:
            str: A html fragment with a <a> tag, or an empty string when
            the line contains no link target
        """
        parts = line[len(Tag.LINK):].strip().split(maxsplit=1)
        if not parts:
            return ''
        link = parts[0]
        anchor = parts[1] if len(parts) > 1 else link
        return '{}{}{}'.format(
            self._START_TAGS[State.LINK].format(html.escape(link, quote=True)),
            html.escape(anchor, quote=False),
            self._END_TAGS[State.LINK])


class State:
    """Integer codes naming the states of the parser's state machine.

    The codes are consecutive integers counting up from zero, in the
    order in which the names are listed.
    """
    (INITIAL,
     TEXT,
     LINK,
     PRETEXT,
     HEADING,
     UNORDERED,
     QUOTE) = range(7)


class Tag:
    """The prefixes that introduce each kind of gemtext line.
    """
    LINK, PRE, HEADING, UNORDERED, QUOTE = '=>', '```', '#', '*', '>'


# Command line entry point: convert the gemtext file named by the first
# argument and write the resulting html document to standard output.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # Report the missing argument instead of failing with an IndexError.
        print('usage: {} <gemtext-file>'.format(sys.argv[0]), file=sys.stderr)
        sys.exit(2)
    mparser: GemParser = GemParser()
    document = mparser.get_document_from_gemfile(sys.argv[1])
    print(document)