This is the initial commit

This is a working parser and converter from gemtext to HTML. It can properly parse and convert all parts of a gemtext document; however, there are still bugs, and the HTML might not be 100% valid HTML5 yet. A known bug is that a list placed directly after a blockquote, without a blank line in between, will cause the quote to contain the first item of the list.
2021-04-25 23:33:59 +02:00 · 2021-04-25 23:33:59 +02:00 · 3cd0ce594c
commit 3cd0ce594c
parent 0694a46f90
1 changed files with 160 additions and 0 deletions
--- a/src/convert.py
+++ b/src/convert.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import html
+import sys
+
+
+class State:
+    """Numeric identifiers for the block the parser is currently inside.
+
+    The values are consecutive integers starting at zero; GemParser uses
+    them as indexes when looking up start and end tags, so their order
+    must stay in step with those tag tables.
+    """
+    (INITIAL, TEXT, LINK, PRETEXT,
+     HEADING, UNORDERED, QUOTE) = range(7)
+
+
+class Tag:
+    """Line prefixes that identify each gemtext line type."""
+    # Link line: the marker is followed by a URL and an optional label.
+    LINK = "=>"
+    # Fence that toggles preformatted text on and off.
+    PRE = "```"
+    # Heading marker.
+    HEADING = "#"
+    # Unordered list item marker.
+    UNORDERED = "*"
+    # Quote line marker.
+    QUOTE = ">"
+
+
+class GemParser:
+    """Stateful, line-by-line converter from gemtext to HTML.
+
+    Feed lines to ``parse_line`` in document order; each call returns the
+    HTML produced by that line. ``mstate`` holds the ``State`` constant of
+    the block that is currently open (``State.INITIAL`` when none is) and
+    ``toggle`` is True exactly while a block is open.
+    """
+
+    # Opening and closing markup, indexed by the numeric ``State`` value.
+    _START_TAGS = ('', '<p>\n', '<a href="{}">', '<pre{}>\n', '<h{}>',
+                   '<ul>\n', '<blockquote>\n')
+    _END_TAGS = ('', '</p>\n', '</a>\n', '</pre>\n', '</h{}>\n', '</ul>\n',
+                 '</blockquote>\n')
+    # Prefixes that mark a line as something other than plain text.
+    _LINE_TAGS = (Tag.LINK, Tag.PRE, Tag.HEADING, Tag.UNORDERED, Tag.QUOTE)
+    # Gemtext defines three heading levels; deeper markers are clamped.
+    _MAX_HEADING_LEVEL = 3
+
+    def __init__(self):
+        self.mstate = State.INITIAL
+        self.toggle = False
+
+    def _close_block(self):
+        """Return the end tag of the open block ('' if none) and reset."""
+        closing = self.get_end_tag()
+        self.mstate = State.INITIAL
+        self.toggle = False
+        return closing
+
+    def _open_block(self, state):
+        """Enter ``state`` and return the markup needed to get there.
+
+        Returns '' when that block is already open; otherwise closes the
+        block that is open (if any) and emits the new start tag.
+        """
+        if self.mstate == state:
+            return ''
+        opening = self._close_block()
+        self.mstate = state
+        self.toggle = True
+        return opening + self.get_start_tag()
+
+    def _parse_heading(self, line):
+        """Return a complete ``<hN>`` element for a heading line."""
+        marker_count = len(line) - len(line.lstrip(Tag.HEADING))
+        level = min(marker_count, self._MAX_HEADING_LEVEL)
+        text = html.escape(line[level:].strip(), quote=False)
+        return (self._START_TAGS[State.HEADING].format(level) + text
+                + self._END_TAGS[State.HEADING].format(level))
+
+    def parse_line(self, line):
+        """Convert one gemtext line to HTML and update the parser state.
+
+        Every branch returns immediately, so a line is handled by exactly
+        one rule and a block that ends is always closed before the markup
+        of the line that ended it.
+
+        :param line: one line of gemtext, with or without its newline.
+        :return: the HTML produced by this line (possibly '').
+        """
+        if line.startswith(Tag.PRE):
+            # A fence inside preformatted text ends it ...
+            if self.mstate == State.PRETEXT:
+                return self._close_block()
+            # ... anywhere else it starts it, closing any open block first.
+            rstring = self._close_block()
+            alt_text = line[len(Tag.PRE):].strip()
+            self.mstate = State.PRETEXT
+            self.toggle = True
+            label = ''
+            if alt_text:
+                label = ' aria-label="{}"'.format(html.escape(alt_text))
+            return rstring + self.get_start_tag().format(label)
+        # Inside preformatted text nothing is interpreted, only escaped.
+        if self.mstate == State.PRETEXT:
+            return html.escape(line, quote=False)
+        if line.startswith(Tag.LINK):
+            return self._close_block() + self.parse_link(line)
+        if line.startswith(Tag.HEADING):
+            return self._close_block() + self._parse_heading(line)
+        if line.startswith(Tag.UNORDERED):
+            item = html.escape(line[len(Tag.UNORDERED):].strip(), quote=False)
+            return (self._open_block(State.UNORDERED)
+                    + '<li>{}</li>\n'.format(item))
+        if line.startswith(Tag.QUOTE):
+            quoted = html.escape(line[len(Tag.QUOTE):].strip(), quote=False)
+            return self._open_block(State.QUOTE) + quoted + '\n'
+        # Plain text or a blank line: either way the open block ends here.
+        rstring = self._close_block()
+        if not self.is_text(line):
+            return rstring
+        # Each text line is its own paragraph; it is closed by the next
+        # line or at the end of the document.
+        self.mstate = State.TEXT
+        self.toggle = True
+        return (rstring + self.get_start_tag()
+                + html.escape(line.strip(), quote=False) + '\n')
+
+    def parse_link(self, line):
+        """Return an ``<a>`` element for a link line.
+
+        The first whitespace-separated field after the marker is the URL;
+        the rest of the line, with its inner spacing kept, is the label.
+        The URL is used as the label when none is given. A marker with no
+        URL is rendered as an escaped paragraph so the text is not lost.
+        """
+        fields = line[len(Tag.LINK):].split(None, 1)
+        if not fields:
+            return '<p>\n{}\n</p>\n'.format(
+                html.escape(line.strip(), quote=False))
+        link = fields[0]
+        anchor = fields[1].strip() if len(fields) > 1 else ''
+        if not anchor:
+            anchor = link
+        return (self._START_TAGS[State.LINK].format(html.escape(link))
+                + html.escape(anchor, quote=False)
+                + self._END_TAGS[State.LINK])
+
+    def is_text(self, line):
+        """Return True if ``line`` would start a paragraph right now.
+
+        That requires a line that is not blank (whitespace-only counts as
+        blank), no block currently open, and no gemtext line prefix.
+        """
+        return (bool(line.strip())
+                and self.mstate == State.INITIAL
+                and not line.startswith(self._LINE_TAGS))
+
+    def get_start_tag(self):
+        """Return the start tag template for the current ``mstate``."""
+        return self._START_TAGS[self.mstate]
+
+    def get_end_tag(self):
+        """Return the end tag template for the current ``mstate``."""
+        return self._END_TAGS[self.mstate]
+
+    def get_document_from_gemfile(self, filename):
+        """Read a gemtext file and return it as a complete HTML document.
+
+        :param filename: path of the gemtext file, read as UTF-8.
+        :return: the HTML document as a single string.
+        """
+        # Start clean so one parser instance can convert several files.
+        self.mstate = State.INITIAL
+        self.toggle = False
+        parts = ['<!DOCTYPE html>\n<html>\n<head>\n<meta charset="utf-8">\n'
+                 '<title>gemtext2html</title>\n</head>\n<body>\n']
+        with open(filename, encoding='utf-8') as gemtext:
+            parts.extend(self.parse_line(mline) for mline in gemtext)
+        # Close whatever block the last line left open.
+        parts.append(self._close_block())
+        parts.append('</body>\n</html>\n')
+        return ''.join(parts)
+
+
+if __name__ == '__main__':
+    # The input path is required; exit with a usage message instead of an
+    # IndexError traceback when it is missing. Extra arguments are ignored.
+    if len(sys.argv) < 2:
+        sys.exit('usage: {} GEMTEXT_FILE'.format(sys.argv[0]))
+    mparser: GemParser = GemParser()
+    document = mparser.get_document_from_gemfile(sys.argv[1])
+    print(document)