Working parser/converter

The parser now works as expected, with no known bugs. In keeping with the spec, each line of ordinary text is treated as a separate paragraph.
2021-04-26 22:35:47 +02:00 · 2021-04-26 22:35:47 +02:00 · aecbe362ad
commit aecbe362ad
parent 3cd0ce594c
3 changed files with 97 additions and 61 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,11 @@
 .idea/.name
 .idea/gemtext2html.iml
 .idea/misc.xml
 .idea/modules.xml
 .idea/vcs.xml
 .idea/inspectionProfiles/profiles_settings.xml
 .idea/inspectionProfiles/Project_Default.xml
 .vscode/launch.json
 .vscode/settings.json
 .vscode/.ropeproject/config.py
 .vscode/.ropeproject/objectdb
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -0,0 +1,5 @@
 # Default ignored files
 /shelf/
 /workspace.xml
 /.idea/
 /.vscode/
--- a/src/convert.py
+++ b/src/convert.py
@ -24,81 +24,94 @@ class Tag:
 class GemParser:
    def __init__(self):
        self.mstate = State.INITIAL
-        self.toggle = False
+        self.recurse = False
    def parse_line(self, line):
        rstring = str()
-        # This is a pre start tag
+        is_text = self.is_text(line)
-        if line.startswith(Tag.PRE) and not self.toggle:
+        starts_with_pre = line.startswith(Tag.PRE)
-            self.mstate = State.PRETEXT
+        starts_with_ul = line.startswith(Tag.UNORDERED)
-            self.toggle = True
+        starts_with_quote = line.startswith(Tag.QUOTE)
-            rstring = self.get_start_tag().format(' aria-label="{}"'.format(
+        if line == '\n' and self.mstate != State.PRETEXT:
-                line[4:].rstrip('\n')))
+            return rstring
        # This is a pre end tag
        elif line.startswith(Tag.PRE):
            rstring = self.get_end_tag()
            self.mstate = State.INITIAL
            self.toggle = False
        # This is in pre formatted text
        elif self.mstate == State.PRETEXT:
            rstring = line
        else:
            # This is ul start tag
            if line.startswith(Tag.UNORDERED) and not self.toggle:
                self.mstate = State.UNORDERED
                self.toggle = True
                rstring = self.get_start_tag()
                rstring += '<li>{}</li>\n'.format(line[1:].strip())
            # This is in middle of unordered list
            elif line.startswith(Tag.UNORDERED):
                rstring += '<li>{}</li>\n'.format(line[1:].strip())
            # This is ul end tag
            elif self.mstate == State.UNORDERED and not line.startswith(
                    Tag.UNORDERED):
                rstring += self.get_end_tag()
                self.mstate = State.INITIAL
                self.toggle = False
                # Recurse so we don't miss whats next
                rstring += self.parse_line(line)
            # This is blockquote start tag
            if line.startswith(Tag.QUOTE) and not self.toggle:
                self.mstate = State.QUOTE
                self.toggle = True
                rstring = self.get_start_tag()
                rstring += line[1:]
            # This is in quote
            elif line.startswith(Tag.QUOTE):
                rstring += line[1:]
            # FIXME: If a list is placed directly after a quote, there will be a list item in the quote...
        # This is blockquote end tag
-            elif self.mstate == State.QUOTE and not line.startswith(Tag.QUOTE):
+        if self.mstate == State.QUOTE and not starts_with_quote:
            rstring += self.get_end_tag()
-                self.mstate = State.INITIAL
+            self.recurse = True
                self.toggle = False
                # Recurse so we don't miss whats next
                rstring += self.parse_line(line)
            # This is paragraph start tag
            if self.is_text(line) and not self.toggle:
                self.mstate = State.TEXT
                self.toggle = True
                rstring = self.get_start_tag()
                rstring += line
            # This is in paragraph
            elif self.is_text(line):
                rstring += line
        # This is paragraph end tag
-            elif self.mstate == State.TEXT:
+        if self.mstate == State.TEXT and not is_text:
            rstring += self.get_end_tag()
            self.recurse = True
        # This is ul end tag
        if self.mstate == State.UNORDERED and not starts_with_ul:
            rstring += self.get_end_tag()
            self.recurse = True
        # This is a pre start tag
        if starts_with_pre and self.mstate != State.PRETEXT:
            self.mstate = State.PRETEXT
            rstring += self.get_start_tag().format(' aria-label="{}"'.format(
                line[4:].rstrip('\n')))
            return rstring
        # This is in pre formatted text
        elif self.mstate == State.PRETEXT and not starts_with_pre:
            rstring += line
            return rstring
        # This is a pre end tag
        elif starts_with_pre and self.mstate == State.PRETEXT:
            rstring += self.get_end_tag()
            self.mstate = State.INITIAL
-                self.toggle = False
+            return rstring
-                # Recurse so we don't miss whats next
+        else:
-                rstring += self.parse_line(line)
+            # This is paragraph start tag
            if is_text and self.mstate != State.TEXT:
                self.mstate = State.TEXT
                rstring += self.get_start_tag()
                rstring += line
                return rstring
            # This is in paragraph
            if is_text and self.mstate == State.TEXT:
                rstring += line
                return rstring
            # This is a heading
            if line.startswith(Tag.HEADING):
                self.mstate = State.HEADING
                rstring += self.parse_heading(line)
                self.mstate = State.INITIAL
                return rstring
            # This is a link
            if line.startswith(Tag.LINK):
                self.mstate = State.LINK
-                rstring = self.parse_link(line)
+                rstring += self.parse_link(line)
                self.mstate = State.INITIAL
                return rstring
            # This is ul start tag
            if starts_with_ul and not self.mstate == State.UNORDERED:
                self.mstate = State.UNORDERED
                rstring += self.get_start_tag()
                rstring += '<li>{}</li>\n'.format(line[1:].strip())
                return rstring
            # This is in middle of unordered list
            if starts_with_ul and self.mstate == State.UNORDERED:
                rstring += '<li>{}</li>\n'.format(line[1:].strip())
                return rstring
            # This is blockquote start tag
            if starts_with_quote and self.mstate != State.QUOTE:
                self.mstate = State.QUOTE
                rstring += self.get_start_tag()
                rstring += line[1:]
                return rstring
            # This is in quote
            if starts_with_quote:
                rstring += line[1:]
                return rstring
        if self.recurse:
            self.recurse = False
            self.mstate = State.INITIAL
            # Recurse so we don't miss whats next
            return "{}{}".format(rstring, self.parse_line(line))
        else:
            print("We should never be here, the line is: {}".format(line))
            sys.exit(1)
    def parse_link(self, line):
        linearr = line[2:].strip().split(' ')
@ -146,13 +159,20 @@ class GemParser:
        with open(filename) as gemtext:
            mline: str = gemtext.readline()
            while mline:
-                rdocument += self.parse_line(mline)
+                rline = self.parse_line(mline)
                if rline is not None and rline != str() and rline != '\n':
                    rdocument += rline
                mline = gemtext.readline()
-            if self.mstate != State.INITIAL:
+        rdocument += '{}</body>\n</html>\n'.format(self.get_end_tag())
                rdocument += self.get_end_tag()
        rdocument += '</body>\n</html>\n'
        return rdocument
    def parse_heading(self, line, level=0):
        if line.startswith(Tag.HEADING):
            return self.parse_heading(line[1:], level + 1)
        else:
            return "{}{}{}".format(self.get_start_tag().format(level), line.strip().rstrip('\n'),
                                   self.get_end_tag().format(level))
 if __name__ == '__main__':
    mparser: GemParser = GemParser()