gemtext2html/src/gemtext2html/__init__.py

#!/usr/bin/env python3
"""A Python module that parses gemtext and converts it to an HTML5 document.
"""
# -*- coding: utf-8 -*-
import html
import sys

# Type aliases
# State: the integer code of a parser state; the codes are the attributes of
# StateEnum.
State = int
# Tag: the literal prefix that marks a gemtext line type; the prefixes are the
# attributes of TagEnum.
Tag = str


class GemParser:
    """Convert gemtext to HTML5 one line at a time.

    The parser is a small state machine. ``mstate`` records which block-level
    element (paragraph, preformatted block, list or quote) is currently open,
    so that the matching end tag can be emitted as soon as a line that does
    not belong to that element arrives. All text taken from the gemtext
    source is HTML-escaped before it is placed in the output.
    """

    # Tag templates indexed by state code, in the order INITIAL, TEXT, LINK,
    # PRETEXT, HEADING, UNORDERED, QUOTE. The '{}' placeholders are filled in
    # by the caller (link target, pre attributes, heading level).
    _START_TAGS = (
        '',
        '<p>\n',
        '<a href="{}">',
        '<pre{}>\n',
        '<h{}>',
        '<ul>\n',
        '<blockquote>\n',
    )
    _END_TAGS = (
        '',
        '</p>\n',
        '</a>\n',
        '</pre>\n',
        '</h{}>\n',
        '</ul>\n',
        '</blockquote>\n',
    )

    def __init__(self):
        """Constructor for the GemParser class
        """
        self.mstate: State = StateEnum.INITIAL
        # No longer consulted by parse_line; the attribute is kept so that
        # code reading or setting it keeps working.
        self.recurse: bool = False

    def get_document_from_gemfile(self, filename: str) -> str:
        """Read a gemtext file line by line and convert it to html

        The parser state is reset before and after the conversion, so one
        parser instance can convert several files in a row.

        Args:
            filename (str): A file name corresponding to a file of gemtext

        Returns:
            str: A valid html5 document as string
        """
        self.mstate = StateEnum.INITIAL
        self.recurse = False
        pieces: list[str] = [
            '<!DOCTYPE html>\n<html>\n<head>\n',
            '<title>gemtext2html</title>\n</head>\n<body>\n',
        ]
        # Gemtext is UTF-8; do not depend on the platform default encoding.
        with open(filename, encoding='utf-8') as gemtext:
            for mline in gemtext:
                in_pre: bool = self.mstate == StateEnum.PRETEXT
                rline: str = self.parse_line(mline)
                # Bare newlines are noise everywhere except inside a
                # preformatted block, where blank lines are content.
                if rline and (rline != '\n' or in_pre):
                    pieces.append(rline)
        # Close whatever block is still open at end of file.
        pieces.append(self.get_end_tag())
        self.mstate = StateEnum.INITIAL
        pieces.append('</body>\n</html>\n')
        return ''.join(pieces)

    def get_end_tag(self) -> str:
        """Emit the end tag that belongs to the current state

        Returns:
            str: A html end tag; the heading tag still contains a '{}'
                placeholder for the heading level
        """
        return self._END_TAGS[self.mstate]

    def get_start_tag(self) -> str:
        """Emit the start tag that belongs to the current state

        Returns:
            str: A html start tag; the link, pre and heading tags still
                contain a '{}' placeholder to be filled in by the caller
        """
        return self._START_TAGS[self.mstate]

    def is_text(self, line: str) -> bool:
        """Check whether a line would start a paragraph of text

        Args:
            line (str): A line of gemtext

        Returns:
            bool: True if the parser is in the initial state and the line is
                non-blank and carries none of the gemtext line prefixes,
                False otherwise
        """
        if line in ('', '\n') or self.mstate != StateEnum.INITIAL:
            return False
        return not line.startswith((TagEnum.LINK, TagEnum.PRE,
                                    TagEnum.HEADING, TagEnum.UNORDERED,
                                    TagEnum.QUOTE))

    def parse_heading(self, line: str, level: int = 0) -> str:
        """Convert a gemtext heading line to a html heading

        Every leading heading marker raises the heading level by one. The
        tags come from get_start_tag/get_end_tag, so the parser is expected
        to be in the HEADING state when this is called.

        Args:
            line (str): A gemtext heading line
            level (int, optional): The heading level to start from. Defaults to 0.

        Returns:
            str: A html heading tag of the correct level
        """
        while line.startswith(TagEnum.HEADING):
            line = line[len(TagEnum.HEADING):]
            level += 1
        return "{}{}{}".format(self.get_start_tag().format(level),
                               html.escape(line.strip(), quote=False),
                               self.get_end_tag().format(level))

    def parse_line(self, line: str) -> str:
        """Parse a single line of gemtext, update the parser state and
        return the corresponding html

        Args:
            line (string): A string of gemtext

        Returns:
            str: A string of html, empty for blank lines outside of a
                preformatted block
        """
        # Inside a preformatted block only the closing toggle is special;
        # everything else (blank lines included) is passed through.
        if self.mstate == StateEnum.PRETEXT:
            if line.startswith(TagEnum.PRE):
                closing: str = self.get_end_tag()
                self.mstate = StateEnum.INITIAL
                return closing
            return html.escape(line, quote=False)
        # Blank lines outside of pre produce nothing and leave the state alone.
        if line in ('', '\n'):
            return ''

        starts_with_ul: bool = line.startswith(TagEnum.UNORDERED)
        starts_with_quote: bool = line.startswith(TagEnum.QUOTE)
        rstring: str = ''
        # A paragraph is a single line, so it always ends here; a quote or a
        # list ends as soon as a line of a different type shows up.
        block_ended: bool = (
            self.mstate == StateEnum.TEXT
            or (self.mstate == StateEnum.QUOTE and not starts_with_quote)
            or (self.mstate == StateEnum.UNORDERED and not starts_with_ul))
        if block_ended:
            rstring = self.get_end_tag()
            self.mstate = StateEnum.INITIAL

        # This is a pre start tag
        if line.startswith(TagEnum.PRE):
            self.mstate = StateEnum.PRETEXT
            # Anything after the toggle marker is the alt text of the block.
            label: str = line[len(TagEnum.PRE):].strip()
            attribute: str = (' aria-label="{}"'.format(html.escape(label))
                              if label else '')
            return rstring + self.get_start_tag().format(attribute)
        # This is a heading
        if line.startswith(TagEnum.HEADING):
            self.mstate = StateEnum.HEADING
            rstring += self.parse_heading(line)
            self.mstate = StateEnum.INITIAL
            return rstring
        # This is a link
        if line.startswith(TagEnum.LINK):
            self.mstate = StateEnum.LINK
            rstring += self.parse_link(line)
            self.mstate = StateEnum.INITIAL
            return rstring
        # This is a list item, opening the list first if necessary
        if starts_with_ul:
            if self.mstate != StateEnum.UNORDERED:
                self.mstate = StateEnum.UNORDERED
                rstring += self.get_start_tag()
            item: str = line[len(TagEnum.UNORDERED):].strip()
            return rstring + '<li>{}</li>\n'.format(
                html.escape(item, quote=False))
        # This is a quote line, opening the blockquote first if necessary
        if starts_with_quote:
            if self.mstate != StateEnum.QUOTE:
                self.mstate = StateEnum.QUOTE
                rstring += self.get_start_tag()
            return rstring + html.escape(line[len(TagEnum.QUOTE):],
                                         quote=False)
        # Anything else is plain text and becomes its own paragraph; the end
        # tag is emitted by the next non-blank line or at end of file.
        self.mstate = StateEnum.TEXT
        return rstring + self.get_start_tag() + html.escape(line, quote=False)

    def parse_link(self, line: str) -> str:
        """Parse a gemtext link line

        The tags come from get_start_tag/get_end_tag, so the parser is
        expected to be in the LINK state when this is called.

        Args:
            line (str): A link line of gemtext

        Returns:
            str: A html fragment with a <a> tag; the link target is used as
                link text when the line has no label
        """
        # Skip the two-character '=>' marker, then split the target from the
        # label on the first run of whitespace so that the label keeps its
        # own spacing.
        parts: list[str] = line[2:].strip().split(None, 1)
        link: str = parts[0] if parts else ''
        anchor: str = parts[1] if len(parts) > 1 else link
        rline: str = self.get_start_tag().format(html.escape(link))
        rline += html.escape(anchor, quote=False)
        rline += self.get_end_tag()
        return rline


class StateEnum:
    """Integer codes for the states of the parser's state machine.

    The codes are consecutive integers starting at zero. GemParser uses the
    current code as an index into its start/end tag lists, so the order of
    the names below must match the order of those lists.
    """
    INITIAL: State    # 0
    TEXT: State       # 1
    LINK: State       # 2
    PRETEXT: State    # 3
    HEADING: State    # 4
    UNORDERED: State  # 5
    QUOTE: State      # 6

    (INITIAL, TEXT, LINK, PRETEXT,
     HEADING, UNORDERED, QUOTE) = range(7)


class TagEnum:
    """Line prefixes that identify the gemtext line types.

    A line belongs to a type when it starts with the corresponding prefix;
    lines that start with none of them are plain text.
    """
    LINK: Tag = '=>'
    PRE: Tag = '```'
    HEADING: Tag = '#'
    # Gemtext list items start with an asterisk followed by a space; a bare
    # '*' would turn lines such as "*emphasis*" into list items.
    UNORDERED: Tag = '* '
    QUOTE: Tag = '>'


if __name__ == '__main__':
    # Command-line entry point: convert the gemtext file named by the first
    # argument and print the resulting html document to standard output.
    if len(sys.argv) < 2:
        # A missing argument would otherwise surface as an IndexError
        # traceback; sys.exit with a string prints it to stderr and exits
        # with status 1.
        sys.exit('usage: {} <gemtext-file>'.format(sys.argv[0]))
    mparser: GemParser = GemParser()
    document: str = mparser.get_document_from_gemfile(sys.argv[1])
    print(document)