This is the initial commit

This is a working parser and converter of gemtext to HTML. It can properly parse and convert all parts of a gemtext document; however, there are still bugs and the HTML might not be 100% valid HTML5 yet. A known bug is that a list placed directly after a blockquote, without a blank line in between, will cause the quote to contain the first element of the list.
4 years ago · 3cd0ce594c
parent 0694a46f90
commit 3cd0ce594c
1 changed files with 160 additions and 0 deletions
--- a/src/convert.py
+++ b/src/convert.py
@ -0,0 +1,160 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import html
 import sys
 class State:
    """Numeric parser states.

    The values are consecutive integers starting at zero; GemParser uses the
    current state as an index when looking up the matching start/end tag.
    """
    (
        INITIAL,    # no block is open
        TEXT,       # paragraph
        LINK,       # link line
        PRETEXT,    # preformatted block
        HEADING,    # heading line
        UNORDERED,  # unordered list
        QUOTE,      # blockquote
    ) = range(7)
 class Tag:
    """Line prefixes that identify each gemtext line type."""
    # Link line: "=> URL optional label"
    LINK = "=>"
    # Fence that toggles a preformatted block on and off
    PRE = "```"
    # Heading marker (repeated for deeper levels)
    HEADING = "#"
    # Unordered list item
    UNORDERED = "*"
    # Quoted line
    QUOTE = ">"
 class GemParser:
    """Line-oriented converter from gemtext to HTML.

    Feed the document to ``parse_line`` one line at a time (in order), or use
    ``get_document_from_gemfile`` to convert a whole file. The parser keeps
    track of the block that is currently open (paragraph, list, blockquote or
    preformatted text) in ``mstate``; ``toggle`` is True while such a block
    is open.
    """

    # Tag templates indexed by the numeric State value. The order must mirror
    # State: INITIAL, TEXT, LINK, PRETEXT, HEADING, UNORDERED, QUOTE.
    _START_TAGS = (
        '',
        '<p>\n',
        '<a href="{}">',
        '<pre{}>\n',
        '<h{}>',
        '<ul>\n',
        '<blockquote>\n',
    )
    _END_TAGS = (
        '',
        '</p>\n',
        '</a>\n',
        '</pre>\n',
        '</h{}>\n',
        '</ul>\n',
        '</blockquote>\n',
    )
    # Gemtext only defines three heading levels; extra markers are heading text.
    _MAX_HEADING_LEVEL = 3

    def __init__(self):
        self.mstate = State.INITIAL
        self.toggle = False

    def parse_line(self, line: str) -> str:
        """Convert a single gemtext line to HTML.

        Each line is dispatched to exactly one handler, so a line can never
        be emitted twice and a closing tag produced while switching blocks is
        never overwritten.

        Args:
            line: One line of gemtext, with or without its trailing newline.

        Returns:
            The HTML for this line, including the end tag of the previously
            open block and/or the start tag of a new block where needed.
            May be the empty string (e.g. for a blank line outside a block).
        """
        # A fence line toggles preformatted mode, whatever block was open.
        if line.startswith(Tag.PRE):
            if self.mstate == State.PRETEXT:
                return self._close_block()
            # Everything after the three-character fence is the alt text.
            alt_text = line[len(Tag.PRE):].strip()
            label = ''
            if alt_text:
                label = ' aria-label="{}"'.format(
                    html.escape(alt_text, quote=True))
            out = self._close_block()
            self.mstate = State.PRETEXT
            self.toggle = True
            return out + self.get_start_tag().format(label)
        # Inside a preformatted block nothing is interpreted, only escaped.
        if self.mstate == State.PRETEXT:
            return html.escape(line, quote=False)
        if line.startswith(Tag.LINK):
            return self._close_block() + self.parse_link(line)
        if line.startswith(Tag.HEADING):
            return self._close_block() + self._parse_heading(line)
        if line.startswith(Tag.UNORDERED):
            item = html.escape(line[len(Tag.UNORDERED):].strip(), quote=False)
            return (self._open_block(State.UNORDERED)
                    + '<li>{}</li>\n'.format(item))
        if line.startswith(Tag.QUOTE):
            quoted = html.escape(line[len(Tag.QUOTE):], quote=False)
            return self._open_block(State.QUOTE) + quoted
        # Plain text or a blank line: either one ends the open block, and
        # every text line becomes its own paragraph.
        out = self._close_block()
        if self.is_text(line):
            out += self._open_block(State.TEXT)
            out += html.escape(line, quote=False)
        return out

    def parse_link(self, line: str) -> str:
        """Render a link line (``=> URL [label]``) as an anchor element.

        Args:
            line: A gemtext line that begins with the link marker.

        Returns:
            ``<a href="URL">label</a>`` followed by a newline; the URL is
            used as the label when none is given. Returns the empty string
            when the line contains no URL at all.
        """
        # Split on any run of whitespace, once, so the label keeps its spaces.
        parts = line[len(Tag.LINK):].split(None, 1)
        if not parts:
            return ''
        url = parts[0]
        label = parts[1].strip() if len(parts) > 1 else url
        return (self._START_TAGS[State.LINK].format(
                    html.escape(url, quote=True))
                + html.escape(label, quote=False)
                + self._END_TAGS[State.LINK])

    def is_text(self, line: str) -> bool:
        """Return True if *line* is plain text that may start a paragraph.

        That is the case when the line is not blank, no block is currently
        open (state is INITIAL) and the line carries none of the gemtext
        line-type markers.
        """
        if line in ('', '\n'):
            return False
        if self.mstate != State.INITIAL:
            return False
        markers = (Tag.LINK, Tag.PRE, Tag.HEADING, Tag.UNORDERED, Tag.QUOTE)
        return not line.startswith(markers)

    def get_start_tag(self) -> str:
        """Return the start-tag template for the current state."""
        return self._START_TAGS[self.mstate]

    def get_end_tag(self) -> str:
        """Return the end-tag template for the current state."""
        return self._END_TAGS[self.mstate]

    def get_document_from_gemfile(self, filename, title='gemtext2html'):
        """Convert the gemtext file *filename* into a complete HTML document.

        Args:
            filename: Path of the gemtext file; it is read as UTF-8.
            title: Text for the document's ``<title>`` element.

        Returns:
            The HTML document as a single string.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        parts = [
            '<!DOCTYPE html>\n<html>\n<head>\n<title>',
            html.escape(title, quote=False),
            '</title>\n</head>\n<body>\n',
        ]
        with open(filename, encoding='utf-8') as gemtext:
            parts.extend(self.parse_line(line) for line in gemtext)
        # Close a block left open at end of file and reset the state so the
        # parser can be reused for another document.
        parts.append(self._close_block())
        parts.append('</body>\n</html>\n')
        return ''.join(parts)

    def _close_block(self) -> str:
        """Reset to INITIAL and return the end tag of the block that was open."""
        closing = self.get_end_tag()
        self.mstate = State.INITIAL
        self.toggle = False
        return closing

    def _open_block(self, state) -> str:
        """Enter the multi-line block *state* and return the tags to get there.

        Returns the empty string when that block is already open; otherwise
        the end tag of the previous block (if any) plus the new start tag.
        """
        if self.mstate == state:
            return ''
        out = self._close_block()
        self.mstate = state
        self.toggle = True
        return out + self.get_start_tag()

    def _parse_heading(self, line: str) -> str:
        """Render a heading line as ``<h1>``..``<h3>`` according to its markers."""
        level = 0
        rest = line
        while level < self._MAX_HEADING_LEVEL and rest.startswith(Tag.HEADING):
            rest = rest[len(Tag.HEADING):]
            level += 1
        text = html.escape(rest.strip(), quote=False)
        return (self._START_TAGS[State.HEADING].format(level)
                + text
                + self._END_TAGS[State.HEADING].format(level))
 if __name__ == '__main__':
    # The gemtext file to convert is the first command-line argument; exit
    # with a usage message instead of an IndexError traceback if it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {} GEMTEXT_FILE'.format(sys.argv[0]))
    mparser: GemParser = GemParser()
    document = mparser.get_document_from_gemfile(sys.argv[1])
    # The converted HTML document goes to standard output.
    print(document)