Working parser/converter

The parser now works as expected, with no known bugs. In line with the spec, each line of ordinary text is treated as a separate paragraph.
2021-04-26 22:35:47 +02:00 · 2021-04-26 22:35:47 +02:00 · aecbe362ad
commit aecbe362ad
parent 3cd0ce594c
3 changed files with 97 additions and 61 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,11 @@
+.idea/.name
+.idea/gemtext2html.iml
+.idea/misc.xml
+.idea/modules.xml
+.idea/vcs.xml
+.idea/inspectionProfiles/profiles_settings.xml
+.idea/inspectionProfiles/Project_Default.xml
+.vscode/launch.json
+.vscode/settings.json
+.vscode/.ropeproject/config.py
+.vscode/.ropeproject/objectdb
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -0,0 +1,5 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+/.idea/
+/.vscode/
--- a/src/convert.py
+++ b/src/convert.py
@@ -24,81 +24,94 @@ class Tag:
 class GemParser:
    def __init__(self):
        self.mstate = State.INITIAL
-        self.toggle = False
+        self.recurse = False

    def parse_line(self, line):
        rstring = str()
+        is_text = self.is_text(line)
+        starts_with_pre = line.startswith(Tag.PRE)
+        starts_with_ul = line.startswith(Tag.UNORDERED)
+        starts_with_quote = line.startswith(Tag.QUOTE)
+        if line == '\n' and self.mstate != State.PRETEXT:
+            return rstring
+        # This is blockquote end tag
+        if self.mstate == State.QUOTE and not starts_with_quote:
+            rstring += self.get_end_tag()
+            self.recurse = True
+        # This is paragraph end tag
+        if self.mstate == State.TEXT and not is_text:
+            rstring += self.get_end_tag()
+            self.recurse = True
+        # This is ul end tag
+        if self.mstate == State.UNORDERED and not starts_with_ul:
+            rstring += self.get_end_tag()
+            self.recurse = True
        # This is a pre start tag
-        if line.startswith(Tag.PRE) and not self.toggle:
+        if starts_with_pre and self.mstate != State.PRETEXT:
            self.mstate = State.PRETEXT
-            self.toggle = True
-            rstring = self.get_start_tag().format(' aria-label="{}"'.format(
+            rstring += self.get_start_tag().format(' aria-label="{}"'.format(
                line[4:].rstrip('\n')))
-        # This is a pre end tag
-        elif line.startswith(Tag.PRE):
-            rstring = self.get_end_tag()
-            self.mstate = State.INITIAL
-            self.toggle = False
+            return rstring
        # This is in pre formatted text
-        elif self.mstate == State.PRETEXT:
-            rstring = line
+        elif self.mstate == State.PRETEXT and not starts_with_pre:
+            rstring += line
+            return rstring
+        # This is a pre end tag
+        elif starts_with_pre and self.mstate == State.PRETEXT:
+            rstring += self.get_end_tag()
+            self.mstate = State.INITIAL
+            return rstring
        else:
-            # This is ul start tag
-            if line.startswith(Tag.UNORDERED) and not self.toggle:
-                self.mstate = State.UNORDERED
-                self.toggle = True
-                rstring = self.get_start_tag()
-                rstring += '<li>{}</li>\n'.format(line[1:].strip())
-            # This is in middle of unordered list
-            elif line.startswith(Tag.UNORDERED):
-                rstring += '<li>{}</li>\n'.format(line[1:].strip())
-            # This is ul end tag
-            elif self.mstate == State.UNORDERED and not line.startswith(
-                    Tag.UNORDERED):
-                rstring += self.get_end_tag()
-                self.mstate = State.INITIAL
-                self.toggle = False
-                # Recurse so we don't miss whats next
-                rstring += self.parse_line(line)
-            # This is blockquote start tag
-            if line.startswith(Tag.QUOTE) and not self.toggle:
-                self.mstate = State.QUOTE
-                self.toggle = True
-                rstring = self.get_start_tag()
-                rstring += line[1:]
-            # This is in quote
-            elif line.startswith(Tag.QUOTE):
-                rstring += line[1:]
-            # FIXME: If a list is placed directly after a quote there will ba a list item in the quote...
-            # This is blockquote end tag
-            elif self.mstate == State.QUOTE and not line.startswith(Tag.QUOTE):
-                rstring += self.get_end_tag()
-                self.mstate = State.INITIAL
-                self.toggle = False
-                # Recurse so we don't miss whats next
-                rstring += self.parse_line(line)
            # This is paragraph start tag
-            if self.is_text(line) and not self.toggle:
+            if is_text and self.mstate != State.TEXT:
                self.mstate = State.TEXT
-                self.toggle = True
-                rstring = self.get_start_tag()
+                rstring += self.get_start_tag()
                rstring += line
+                return rstring
            # This is in paragraph
-            elif self.is_text(line):
+            if is_text and self.mstate == State.TEXT:
                rstring += line
-            # This is paragraph end tag
-            elif self.mstate == State.TEXT:
-                rstring += self.get_end_tag()
+                return rstring
+            # This is a heading
+            if line.startswith(Tag.HEADING):
+                self.mstate = State.HEADING
+                rstring += self.parse_heading(line)
                self.mstate = State.INITIAL
-                self.toggle = False
-                # Recurse so we don't miss whats next
-                rstring += self.parse_line(line)
+                return rstring
            # This is a link
            if line.startswith(Tag.LINK):
                self.mstate = State.LINK
-                rstring = self.parse_link(line)
+                rstring += self.parse_link(line)
                self.mstate = State.INITIAL
-        return rstring
+                return rstring
+            # This is ul start tag
+            if starts_with_ul and not self.mstate == State.UNORDERED:
+                self.mstate = State.UNORDERED
+                rstring += self.get_start_tag()
+                rstring += '<li>{}</li>\n'.format(line[1:].strip())
+                return rstring
+            # This is in middle of unordered list
+            if starts_with_ul and self.mstate == State.UNORDERED:
+                rstring += '<li>{}</li>\n'.format(line[1:].strip())
+                return rstring
+            # This is blockquote start tag
+            if starts_with_quote and self.mstate != State.QUOTE:
+                self.mstate = State.QUOTE
+                rstring += self.get_start_tag()
+                rstring += line[1:]
+                return rstring
+            # This is in quote
+            if starts_with_quote:
+                rstring += line[1:]
+                return rstring
+        if self.recurse:
+            self.recurse = False
+            self.mstate = State.INITIAL
+            # Recurse so we don't miss whats next
+            return "{}{}".format(rstring, self.parse_line(line))
+        else:
+            print("We should never be here, the line is: {}".format(line))
+            sys.exit(1)

    def parse_link(self, line):
        linearr = line[2:].strip().split(' ')
@@ -146,13 +159,20 @@ class GemParser:
        with open(filename) as gemtext:
            mline: str = gemtext.readline()
            while mline:
-                rdocument += self.parse_line(mline)
+                rline = self.parse_line(mline)
+                if rline is not None and rline != str() and rline != '\n':
+                    rdocument += rline
                mline = gemtext.readline()
-            if self.mstate != State.INITIAL:
-                rdocument += self.get_end_tag()
-        rdocument += '</body>\n</html>\n'
+        rdocument += '{}</body>\n</html>\n'.format(self.get_end_tag())
        return rdocument

+    def parse_heading(self, line, level=0):
+        if line.startswith(Tag.HEADING):
+            return self.parse_heading(line[1:], level + 1)
+        else:
+            return "{}{}{}".format(self.get_start_tag().format(level), line.strip().rstrip('\n'),
+                                   self.get_end_tag().format(level))
+

 if __name__ == '__main__':
    mparser: GemParser = GemParser()