diff options
author | tjpcc <tjp@ctrl-c.club> | 2023-01-13 10:50:30 -0700 |
---|---|---|
committer | tjpcc <tjp@ctrl-c.club> | 2023-01-13 10:50:30 -0700 |
commit | aa23984bc21f99e00b8552e928d23ef02bb745f3 (patch) | |
tree | b11e6489cde0bbfd21a4a4472e9fa837aa0abf25 /gemtext | |
parent | 13f553c965d0e8c1f609cbbbf66e81fc1450b850 (diff) |
Initial gemtext package.
Contains:
- gemtext AST (Document and line types)
- Parse from an io.Reader
- ParseLine a []byte
- doc comments on everything
- ParseLine tests for every line type
Still needs tests for Parse & Document.
Diffstat (limited to 'gemtext')
-rw-r--r-- | gemtext/parse.go | 154 | ||||
-rw-r--r-- | gemtext/parse_line_test.go | 271 | ||||
-rw-r--r-- | gemtext/types.go | 171 |
3 files changed, 596 insertions, 0 deletions
diff --git a/gemtext/parse.go b/gemtext/parse.go new file mode 100644 index 0000000..4a8c641 --- /dev/null +++ b/gemtext/parse.go @@ -0,0 +1,154 @@ +package gemtext + +import ( + "bufio" + "bytes" + "io" +) + +// Parse parses the full contents of an io.Reader into a gemtext.Document. +func Parse(input io.Reader) (Document, error) { + rdr := bufio.NewReader(input) + + var lines []Line + inPFT := false + + for { + raw, err := rdr.ReadBytes('\n') + if err != io.EOF && err != nil { + return nil, err + } + + var line Line + + if inPFT && (len(raw) < 3 || raw[0] != '`' || raw[1] != '`' || raw[2] != '`') { + line = PreformattedTextLine{raw: raw} + } else { + line = ParseLine(raw) + } + + if line.Type() == LineTypePreformatToggle { + if inPFT { + toggle := line.(PreformatToggleLine) + (&toggle).clearAlt() + line = toggle + } + + inPFT = !inPFT + } + + if line != nil { + lines = append(lines, line) + } + + if err == io.EOF { + break + } + } + + return Document(lines), nil +} + +// ParseLine parses a single line (including the trailing \n) into a gemtext.Line. 
+func ParseLine(line []byte) Line { + if len(line) == 0 { + return nil + } + + switch line[0] { + case '=': + if len(line) == 1 || line[1] != '>' { + break + } + return parseLinkLine(line) + case '`': + if len(line) < 3 || line[1] != '`' || line[2] != '`' { + break + } + return parsePreformatToggleLine(line) + case '#': + level := 1 + if len(line) > 1 && line[1] == '#' { + level += 1 + if len(line) > 2 && line[2] == '#' { + level += 1 + } + } + return parseHeadingLine(level, line) + case '*': + if len(line) == 1 || line[1] != ' ' { + break + } + return parseListItemLine(line) + case '>': + return parseQuoteLine(line) + } + + return TextLine{raw: line} +} + +func parseLinkLine(raw []byte) LinkLine { + line := LinkLine{raw: raw} + + // move past =>[<whitespace>] + raw = bytes.TrimLeft(raw[2:], " \t") + + // find the next space or tab + spIdx := bytes.IndexByte(raw, ' ') + tbIdx := bytes.IndexByte(raw, '\t') + idx := spIdx + if idx == -1 { + idx = tbIdx + } + if tbIdx >= 0 && tbIdx < idx { + idx = tbIdx + } + + if idx < 0 { + line.URL = bytes.TrimRight(raw, "\r\n") + return line + } + + line.URL = raw[:idx] + raw = raw[idx+1:] + + label := bytes.TrimRight(bytes.TrimLeft(raw, " \t"), "\r\n") + if len(label) > 0 { + line.Label = label + } + + return line +} + +func parsePreformatToggleLine(raw []byte) PreformatToggleLine { + line := PreformatToggleLine{raw: raw} + + raw = bytes.TrimRight(raw[3:], "\r\n") + if len(raw) > 0 { + line.AltText = raw + } + + return line +} + +func parseHeadingLine(level int, raw []byte) HeadingLine { + return HeadingLine{ + raw: raw, + lineType: LineTypeHeading1 - 1 + LineType(level), + Body: bytes.TrimRight(bytes.TrimLeft(raw[level:], " \t"), "\r\n"), + } +} + +func parseListItemLine(raw []byte) ListItemLine { + return ListItemLine{ + raw: raw, + Body: bytes.TrimRight(raw[2:], "\r\n"), + } +} + +func parseQuoteLine(raw []byte) QuoteLine { + return QuoteLine{ + raw: raw, + Body: bytes.TrimRight(raw[1:], "\r\n"), + } +} diff --git 
a/gemtext/parse_line_test.go b/gemtext/parse_line_test.go new file mode 100644 index 0000000..64c1bc7 --- /dev/null +++ b/gemtext/parse_line_test.go @@ -0,0 +1,271 @@ +package gemtext_test + +import ( + "testing" + + "tildegit.org/tjp/gus/gemtext" +) + +func TestParseLinkLine(t *testing.T) { + tests := []struct { + input string + url string + label string + }{ + { + input: "=> gemini.ctrl-c.club/~tjp/ home page\r\n", + url: "gemini.ctrl-c.club/~tjp/", + label: "home page", + }, + { + input: "=> gemi.dev/\n", + url: "gemi.dev/", + }, + { + input: "=> /gemlog/foobar 2023-01-13 - Foo Bar\n", + url: "/gemlog/foobar", + label: "2023-01-13 - Foo Bar", + }, + } + + for _, test := range tests { + t.Run(test.input, func(t *testing.T) { + line := gemtext.ParseLine([]byte(test.input)) + if line == nil { + t.Fatal("ParseLine() returned nil line") + } + if string(line.Raw()) != string(test.input) { + t.Error("Raw() does not match input") + } + + if line.Type() != gemtext.LineTypeLink { + t.Errorf("expected LineTypeLink, got %d", line.Type()) + } + link, ok := line.(gemtext.LinkLine) + if !ok { + t.Fatalf("expected a LinkLine, got %T", line) + } + + if string(link.URL) != test.url { + t.Errorf("expected url %q, got %q", test.url, string(link.URL)) + } + + if string(link.Label) != test.label { + t.Errorf("expected label %q, got %q", test.label, string(link.Label)) + } + }) + } +} + +func TestParsePreformatToggleLine(t *testing.T) { + tests := []struct { + input string + altText string + }{ + { + input: "```\n", + }, + { + input: "```some alt-text\r\n", + altText: "some alt-text", + }, + { + input: "``` leading space preserved\n", + altText: " leading space preserved", + }, + } + + for _, test := range tests { + t.Run(test.input, func(t *testing.T) { + line := gemtext.ParseLine([]byte(test.input)) + if line == nil { + t.Fatal("ParseLine() returned nil line") + } + if string(line.Raw()) != string(test.input) { + t.Error("Raw() does not match input") + } + + if line.Type() != 
gemtext.LineTypePreformatToggle { + t.Errorf("expected LineTypePreformatToggle, got %d", line.Type()) + } + toggle, ok := line.(gemtext.PreformatToggleLine) + if !ok { + t.Fatalf("expected a PreformatToggleLine, got %T", line) + } + + if string(toggle.AltText) != test.altText { + t.Errorf("expected alt-text %q, got %q", test.altText, string(toggle.AltText)) + } + }) + } +} + +func TestParseHeadingLine(t *testing.T) { + tests := []struct { + input string + lineType gemtext.LineType + body string + }{ + { + input: "# this is an H1\n", + lineType: gemtext.LineTypeHeading1, + body: "this is an H1", + }, + { + input: "## extra leading spaces\r\n", + lineType: gemtext.LineTypeHeading2, + body: "extra leading spaces", + }, + { + input: "##no leading space\n", + lineType: gemtext.LineTypeHeading2, + body: "no leading space", + }, + { + input: "#### there is no h4\n", + lineType: gemtext.LineTypeHeading3, + body: "# there is no h4", + }, + } + + for _, test := range tests { + t.Run(test.input, func(t *testing.T) { + line := gemtext.ParseLine([]byte(test.input)) + if line == nil { + t.Fatal("ParseLine() returned nil") + } + + if line.Type() != test.lineType { + t.Errorf("expected line type %d, got %d", test.lineType, line.Type()) + } + if string(line.Raw()) != test.input { + t.Error("line.Raw() does not match input") + } + + hdg, ok := line.(gemtext.HeadingLine) + if !ok { + t.Fatalf("expected HeadingLine, got a %T", line) + } + + if string(hdg.Body) != test.body { + t.Errorf("expected body %q, got %q", test.body, string(hdg.Body)) + } + }) + } +} + +func TestParseListItemLine(t *testing.T) { + tests := []struct { + input string + body string + }{ + { + input: "* this is a list item\r\n", + body: "this is a list item", + }, + { + input: "* more leading spaces\n", + body: " more leading spaces", + }, + } + + for _, test := range tests { + t.Run(test.input, func(t *testing.T) { + line := gemtext.ParseLine([]byte(test.input)) + if line == nil { + t.Fatal("ParseLine() returned 
nil") + } + + if line.Type() != gemtext.LineTypeListItem { + t.Errorf("expected LineTypeListItem, got %d", line.Type()) + } + if string(line.Raw()) != test.input { + t.Error("line.Raw() does not match input") + } + + li, ok := line.(gemtext.ListItemLine) + if !ok { + t.Fatalf("expected ListItemLine, got a %T", line) + } + + if string(li.Body) != test.body { + t.Errorf("expected body %q, got %q", test.body, string(li.Body)) + } + }) + } +} + +func TestParseQuoteLine(t *testing.T) { + tests := []struct { + input string + body string + }{ + { + input: ">a quote line\r\n", + body: "a quote line", + }, + { + input: "> with a leading space\n", + body: " with a leading space", + }, + { + input: "> more leading spaces\n", + body: " more leading spaces", + }, + } + + for _, test := range tests { + t.Run(test.input, func(t *testing.T) { + line := gemtext.ParseLine([]byte(test.input)) + if line == nil { + t.Fatal("ParseLine() returned nil") + } + + if line.Type() != gemtext.LineTypeQuote { + t.Errorf("expected LineTypeQuote, got %d", line.Type()) + } + if string(line.Raw()) != test.input { + t.Error("line.Raw() does not match input") + } + + qu, ok := line.(gemtext.QuoteLine) + if !ok { + t.Fatalf("expected QuoteLine , got a %T", line) + } + + if string(qu.Body) != test.body { + t.Errorf("expected body %q, got %q", test.body, string(qu.Body)) + } + }) + } +} + +func TestParseTextLine(t *testing.T) { + tests := []string { + "\n", + "simple text line\r\n", + " * an invalid list item\n", + "*another invalid list item\r\n", + } + + for _, test := range tests { + t.Run(test, func(t *testing.T) { + line := gemtext.ParseLine([]byte(test)) + if line == nil { + t.Fatal("ParseLine() returned nil") + } + + if line.Type() != gemtext.LineTypeText { + t.Errorf("expected LineTypeText, got %d", line.Type()) + } + if string(line.Raw()) != test { + t.Error("line.Raw() does not match input") + } + + _, ok := line.(gemtext.TextLine) + if !ok { + t.Fatalf("expected TextLine , got a %T", line) + } 
// File: gemtext/types.go (package gemtext).

// LineType represents the different types of lines in a gemtext document.
//
// The constants start at 1, so the zero value is not a valid line type.
type LineType int

const (
	// LineTypeText is the default case when nothing else matches.
	//
	// The line object is a TextLine.
	LineTypeText LineType = iota + 1

	// LineTypeLink is a link line.
	//
	//     =>[<ws>]<url>[<ws><label>][\r]\n
	//
	// The line object is a LinkLine.
	LineTypeLink

	// LineTypePreformatToggle switches the document between pre-formatted
	// text or not.
	//
	//     ```[<alt-text>][\r]\n
	//
	// The line object is a PreformatToggleLine.
	LineTypePreformatToggle

	// LineTypePreformattedText is any line between two PreformatToggles.
	//
	// The line object is a PreformattedTextLine.
	LineTypePreformattedText

	// LineTypeHeading1 is a top-level heading.
	//
	//     #[<ws>]<body>[\r]\n
	//
	// The line object is a HeadingLine.
	//
	// The three heading constants must stay consecutive and in this order:
	// the parser computes a heading's type as an offset from
	// LineTypeHeading1.
	LineTypeHeading1

	// LineTypeHeading2 is a second-level heading.
	//
	//     ##[<ws>]<body>[\r]\n
	//
	// The line object is a HeadingLine.
	LineTypeHeading2

	// LineTypeHeading3 is a third-level heading.
	//
	//     ###[<ws>]<body>[\r]\n
	//
	// The line object is a HeadingLine.
	LineTypeHeading3

	// LineTypeListItem is an unordered list item.
	//
	//     * <body>[\r]\n
	//
	// The line object is a ListItemLine.
	LineTypeListItem

	// LineTypeQuote is a quote line.
	//
	//     ><body>[\r]\n
	//
	// The line object is a QuoteLine.
	LineTypeQuote
)

// Line is the interface implemented by all specific line types.
//
// Many of those concrete implementation types have additional useful fields,
// so it can be a good idea to type-assert a Line to its concrete type based
// on the return value of the Type() method.
type Line interface {
	// Type returns the specific type of the gemtext line.
	Type() LineType

	// Raw reproduces the original bytes from the source reader.
	Raw() []byte
}

// Compile-time checks that every concrete line type satisfies Line.
var (
	_ Line = TextLine{}
	_ Line = LinkLine{}
	_ Line = PreformatToggleLine{}
	_ Line = PreformattedTextLine{}
	_ Line = HeadingLine{}
	_ Line = ListItemLine{}
	_ Line = QuoteLine{}
)

// Document is the list of lines that make up a full text/gemini resource.
type Document []Line

// TextLine is a line of LineTypeText.
type TextLine struct {
	raw []byte
}

// Type returns LineTypeText.
func (tl TextLine) Type() LineType { return LineTypeText }

// Raw returns the original bytes of the line.
func (tl TextLine) Raw() []byte { return tl.raw }

// LinkLine is a line of LineTypeLink.
type LinkLine struct {
	raw []byte

	// URL is the original bytes of the url portion of the line.
	//
	// It is not guaranteed to be a valid URL.
	URL []byte

	// Label is the label portion of the line.
	//
	// If there was no label it will always be nil, never []byte{}.
	Label []byte
}

// Type returns LineTypeLink.
func (ll LinkLine) Type() LineType { return LineTypeLink }

// Raw returns the original bytes of the line.
func (ll LinkLine) Raw() []byte { return ll.raw }

// PreformatToggleLine is a preformatted text toggle line.
type PreformatToggleLine struct {
	raw []byte

	// AltText contains the alt-text portion of the line.
	//
	// It will either have len() > 0 or be nil.
	//
	// If the line was parsed as part of a full document by Parse(),
	// and this is a *closing* toggle, any alt-text present will be
	// stripped and this will be nil. If the line was parsed by
	// ParseLine() no such correction is performed.
	AltText []byte
}

// Type returns LineTypePreformatToggle.
func (pt PreformatToggleLine) Type() LineType { return LineTypePreformatToggle }

// Raw returns the original bytes of the line, including any alt-text that
// was stripped from AltText.
func (pt PreformatToggleLine) Raw() []byte { return pt.raw }

// clearAlt drops the alt-text. It needs a pointer receiver because it
// mutates the line in place.
func (pt *PreformatToggleLine) clearAlt() { pt.AltText = nil }

// PreformattedTextLine represents a line between two toggles.
//
// It is never returned by ParseLine but can be part of a
// document parsed by Parse().
type PreformattedTextLine struct {
	raw []byte
}

// Type returns LineTypePreformattedText.
func (pl PreformattedTextLine) Type() LineType { return LineTypePreformattedText }

// Raw returns the original bytes of the line.
func (pl PreformattedTextLine) Raw() []byte { return pl.raw }

// HeadingLine is a line of LineTypeHeading[1,2,3].
type HeadingLine struct {
	raw      []byte
	lineType LineType

	// Body is the portion of the line with the header text.
	Body []byte
}

// Type returns the heading level's line type, as stored when the line was
// parsed.
func (hl HeadingLine) Type() LineType { return hl.lineType }

// Raw returns the original bytes of the line.
func (hl HeadingLine) Raw() []byte { return hl.raw }

// ListItemLine is a line of LineTypeListItem.
type ListItemLine struct {
	raw []byte

	// Body is the text of the list item.
	Body []byte
}

// Type returns LineTypeListItem.
func (li ListItemLine) Type() LineType { return LineTypeListItem }

// Raw returns the original bytes of the line.
func (li ListItemLine) Raw() []byte { return li.raw }

// QuoteLine is a line of LineTypeQuote.
type QuoteLine struct {
	raw []byte

	// Body is the text of the quote.
	Body []byte
}

// Type returns LineTypeQuote.
func (ql QuoteLine) Type() LineType { return LineTypeQuote }

// Raw returns the original bytes of the line.
func (ql QuoteLine) Raw() []byte { return ql.raw }