From bd152189b91694fc7cf2fae385e48f96163e01ef Mon Sep 17 00:00:00 2001 From: Johannes Kaufmann Date: Tue, 25 Feb 2025 12:02:30 +0100 Subject: [PATCH] plugin: added table plugin (#144) * table: first basic version of table plugin * table: abort table if problematic node (e.g. hr) is discovered * table: render caption below table * table: fill up missing cells * table: combine header row with rows array * table: replicate content to spanned cell * table: change header underline format * table: grow table for colspan & rowspan * table: fix order of applying modifications * table: add align to the underline * table: loop through cells directly * table: abort early for nested table & small cleanup * table: add more tests * table: make empty cell wider * table: add option WithSkipEmptyRows * table: add option WithHeaderPromotion * table: escape "|" character inside table * table: improve naming & docs for WithSpanCellBehavior * table: skip render for role="presentation" table * table: skip render when parent is problematic * table: fallback render newlines between rows * collapse: update testcases to use RenderRepresentation --- collapse/collapse_test.go | 563 +++++++++++++---- plugin/table/1_select.go | 50 ++ plugin/table/1_select_test.go | 218 +++++++ plugin/table/2_collect.go | 247 ++++++++ plugin/table/3_render.go | 105 +++ plugin/table/table.go | 108 ++++ plugin/table/table_test.go | 597 ++++++++++++++++++ plugin/table/testdata/.gitattributes | 4 + .../table/testdata/GoldenFiles/basics.in.html | 234 +++++++ .../table/testdata/GoldenFiles/basics.out.md | 80 +++ .../testdata/GoldenFiles/col_row_span.in.html | 62 ++ .../testdata/GoldenFiles/col_row_span.out.md | 22 + .../testdata/GoldenFiles/contents.in.html | 140 ++++ .../testdata/GoldenFiles/contents.out.md | 57 ++ .../table/testdata/GoldenFiles/email.in.html | 248 ++++++++ .../table/testdata/GoldenFiles/email.out.md | 7 + .../testdata/GoldenFiles/parents.in.html | 110 ++++ 
.../table/testdata/GoldenFiles/parents.out.md | 58 ++ plugin/table/utils.go | 187 ++++++ plugin/table/utils_test.go | 268 ++++++++ 20 files changed, 3235 insertions(+), 130 deletions(-) create mode 100644 plugin/table/1_select.go create mode 100644 plugin/table/1_select_test.go create mode 100644 plugin/table/2_collect.go create mode 100644 plugin/table/3_render.go create mode 100644 plugin/table/table.go create mode 100644 plugin/table/table_test.go create mode 100644 plugin/table/testdata/.gitattributes create mode 100644 plugin/table/testdata/GoldenFiles/basics.in.html create mode 100644 plugin/table/testdata/GoldenFiles/basics.out.md create mode 100644 plugin/table/testdata/GoldenFiles/col_row_span.in.html create mode 100644 plugin/table/testdata/GoldenFiles/col_row_span.out.md create mode 100644 plugin/table/testdata/GoldenFiles/contents.in.html create mode 100644 plugin/table/testdata/GoldenFiles/contents.out.md create mode 100644 plugin/table/testdata/GoldenFiles/email.in.html create mode 100644 plugin/table/testdata/GoldenFiles/email.out.md create mode 100644 plugin/table/testdata/GoldenFiles/parents.in.html create mode 100644 plugin/table/testdata/GoldenFiles/parents.out.md create mode 100644 plugin/table/utils.go create mode 100644 plugin/table/utils_test.go diff --git a/collapse/collapse_test.go b/collapse/collapse_test.go index 0b45b40..239c711 100644 --- a/collapse/collapse_test.go +++ b/collapse/collapse_test.go @@ -1,31 +1,13 @@ package collapse import ( - "bytes" "strings" "testing" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/tester" "golang.org/x/net/html" ) -func getBody(doc *html.Node) *html.Node { - var body *html.Node - - var finder func(*html.Node) - finder = func(node *html.Node) { - if node.Type == html.ElementNode && node.Data == "body" { - body = node - return - } - for child := node.FirstChild; child != nil; child = child.NextSibling { - finder(child) - } - } - finder(doc) - - return body -} - func TestCollapse_DocType(t 
*testing.T) { // The DOCTYPE gets removed input := ` ` @@ -37,16 +19,12 @@ func TestCollapse_DocType(t *testing.T) { Collapse(doc, nil) - var buf bytes.Buffer - err = html.Render(&buf, doc) - if err != nil { - t.Error(err) - } - - expected := ` ` - if buf.String() != expected { - t.Errorf("expected %q but got %q", expected, buf.String()) - } + tester.ExpectRepresentation(t, doc, "after", ` +#document +├─html +│ ├─head +│ ├─body + `) } func TestCollapse_NoFirstChild(t *testing.T) { @@ -57,16 +35,7 @@ func TestCollapse_NoFirstChild(t *testing.T) { Collapse(boldNode, nil) - var buf bytes.Buffer - err := html.Render(&buf, boldNode) - if err != nil { - t.Error(err) - } - - expected := `` - if buf.String() != expected { - t.Errorf("expected %q but got %q", expected, buf.String()) - } + tester.ExpectRepresentation(t, boldNode, "after", `strong`) } func TestCollapse_StartWithCode(t *testing.T) { @@ -82,16 +51,10 @@ func TestCollapse_StartWithCode(t *testing.T) { Collapse(codeNode, nil) - var buf bytes.Buffer - err := html.Render(&buf, codeNode) - if err != nil { - t.Error(err) - } - - expected := ` text ` - if buf.String() != expected { - t.Errorf("expected %q but got %q", expected, buf.String()) - } + tester.ExpectRepresentation(t, codeNode, "after", ` +code +├─#text " text " + `) } func TestCollapse_TwoTextNodes(t *testing.T) { @@ -113,16 +76,11 @@ func TestCollapse_TwoTextNodes(t *testing.T) { Collapse(node1, nil) - var buf bytes.Buffer - err := html.Render(&buf, node1) - if err != nil { - t.Error(err) - } - - expected := `a b` - if buf.String() != expected { - t.Errorf("expected %q but got %q", expected, buf.String()) - } + tester.ExpectRepresentation(t, node1, "after", ` +span +├─#text "a " +├─#text "b" + `) } func TestCollapse_LastTextIsEmpty(t *testing.T) { @@ -144,75 +102,276 @@ func TestCollapse_LastTextIsEmpty(t *testing.T) { Collapse(node1, nil) - var buf bytes.Buffer - err := html.Render(&buf, node1) - if err != nil { - t.Error(err) - } - - expected := `text` - 
if buf.String() != expected { - t.Errorf("expected %q but got %q", expected, buf.String()) - } + tester.ExpectRepresentation(t, node1, "after", ` +span +├─#text "text" + `) } func TestCollapse_Table(t *testing.T) { runs := []struct { - desc string - input string - expected string + desc string + input string + + expectedBefore string // optional + expectedAfter string }{ { - desc: "basic example", - input: "

Foo bar

Words

", - expected: "

Foo bar

Words

", + desc: "basic example", + input: "

Foo bar

Words

", + expectedBefore: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─#text "Foo bar" +│ │ ├─#text " " +│ │ ├─p +│ │ │ ├─#text "Words" +│ │ ├─#text " " + `, + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─#text "Foo bar" +│ │ ├─p +│ │ │ ├─#text "Words" + `, }, { - desc: "without whitespace", - input: "

SomeText

", - expected: "

SomeText

", + desc: "without whitespace", + input: "

SomeText

", + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─#text "Some" +│ │ │ ├─strong +│ │ │ │ ├─#text "Text" + `, }, { - desc: "with one space & space in paragraph", - input: "

Some text.

", - expected: "

Some text.

", + desc: "with one space & space in paragraph", + input: "

Some text.

", + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─#text "Some " +│ │ │ ├─strong +│ │ │ │ ├─#text "text." + `, }, { - desc: "with one space", - input: "

Some text.

", - expected: "

Some text.

", + desc: "with one space", + input: "

Some text.

", + expectedBefore: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─#text "Some" +│ │ │ ├─strong +│ │ │ │ ├─#text " text. " + `, + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─#text "Some" +│ │ │ ├─strong +│ │ │ │ ├─#text " text." + `, }, { - desc: "with three space", - input: "

Some text.

", - expected: "

Some text.

", + desc: "with three space", + input: "

Some text.

", + expectedBefore: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─#text "Some" +│ │ │ ├─strong +│ │ │ │ ├─#text " text. " + `, + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─#text "Some" +│ │ │ ├─strong +│ │ │ │ ├─#text " text." + `, }, { - desc: "with three space (at beginning of paragraph)", - input: "

text.

", - expected: "

text.

", + desc: "with three space (at beginning of paragraph)", + input: "

text.

", + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─strong +│ │ │ │ ├─#text "text." + `, }, { - desc: "with image between", - input: `

a b

`, - expected: `

a b

`, + desc: "with image between", + input: `

a b

`, + expectedBefore: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─strong +│ │ │ │ ├─#text " a " +│ │ │ ├─img (src="/img.png") +│ │ │ ├─strong +│ │ │ │ ├─#text " b " + `, + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─strong +│ │ │ │ ├─#text "a " +│ │ │ ├─img (src="/img.png") +│ │ │ ├─strong +│ │ │ │ ├─#text " b" + `, }, { - desc: "spans directly next to each other", - input: "

(Text A)(Text B)

", - expected: "

(Text A)(Text B)

", + desc: "spans directly next to each other", + input: "

(Text A)(Text B)

", + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─span +│ │ │ │ ├─#text "(Text A)" +│ │ │ ├─span +│ │ │ │ ├─#text "(Text B)" + `, + }, + { + desc: "spans with newline between each other", + input: "

\n(Text A)\n(Text B)\n

", + expectedBefore: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─#text "\n" +│ │ │ ├─span +│ │ │ │ ├─#text "(Text A)" +│ │ │ ├─#text "\n" +│ │ │ ├─span +│ │ │ │ ├─#text "(Text B)" +│ │ │ ├─#text "\n" + `, + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─span +│ │ │ │ ├─#text "(Text A)" +│ │ │ ├─#text " " +│ │ │ ├─span +│ │ │ │ ├─#text "(Text B)" +│ │ │ ├─#text "" + `, }, { - desc: "spans with newline between each other", - input: "

\n(Text A)\n(Text B)\n

", - expected: "

(Text A) (Text B)

", + desc: "spans with indentation", + input: ` +
+ A + B +
+
+ C +
+ `, + expectedBefore: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─div +│ │ │ ├─#text "\n\t\t\t\t" +│ │ │ ├─span +│ │ │ │ ├─#text "A" +│ │ │ ├─#text "\n\t\t\t\t" +│ │ │ ├─span +│ │ │ │ ├─#text "B" +│ │ │ ├─#text "\n\t\t\t" +│ │ ├─#text "\n\t\t\t" +│ │ ├─div +│ │ │ ├─#text "\n\t\t\t\t" +│ │ │ ├─span +│ │ │ │ ├─#text "C" +│ │ │ ├─#text "\n\t\t\t" +│ │ ├─#text "\n\t\t\t" + `, + + // TODO: are we expecting empty #text nodes??! + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─div +│ │ │ ├─span +│ │ │ │ ├─#text "A" +│ │ │ ├─#text " " +│ │ │ ├─span +│ │ │ │ ├─#text "B" +│ │ │ ├─#text "" +│ │ ├─div +│ │ │ ├─span +│ │ │ │ ├─#text "C" +│ │ │ ├─#text "" + `, }, { desc: "code with space", input: "

aaa

", - // Note: This is different thant the javascript implementation. + // Note: This is different than the javascript implementation. // We want the space to be preserved. - expected: "

aaa

", + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─code +│ │ │ │ ├─#text " " +│ │ │ ├─#text "aaa" + `, }, { desc: "#text in sample", @@ -227,42 +386,190 @@ func TestCollapse_Table(t *testing.T) { `, - expected: `

Browseor ask.

`, + expectedBefore: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─h2 +│ │ │ ├─#text "\n\t\t\t " +│ │ │ ├─div +│ │ │ │ ├─#text "\n\t\t\t\tBrowse\n\t\t\t\t" +│ │ │ │ ├─ul +│ │ │ │ │ ├─#text "\n\t\t\t\t " +│ │ │ │ │ ├─li +│ │ │ │ │ │ ├─a (href="/go") +│ │ │ │ │ │ │ ├─#text "go" +│ │ │ │ │ ├─#text "\n\t\t\t\t" +│ │ │ │ ├─#text "\n\t\t\t\tor " +│ │ │ │ ├─a (href="/ask") +│ │ │ │ │ ├─#text "ask" +│ │ │ │ ├─#text ".\n\t\t\t " +│ │ │ ├─#text "\n\t\t\t" +│ │ ├─#text "\n\t\t\t" + `, + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─h2 +│ │ │ ├─div +│ │ │ │ ├─#text "Browse" +│ │ │ │ ├─ul +│ │ │ │ │ ├─li +│ │ │ │ │ │ ├─a (href="/go") +│ │ │ │ │ │ │ ├─#text "go" +│ │ │ │ ├─#text "or " +│ │ │ │ ├─a (href="/ask") +│ │ │ │ │ ├─#text "ask" +│ │ │ │ ├─#text "." + `, }, // - - - - - - // { - desc: "mdn example: inline formatting context", - input: "

Hello \n\t\t\t\t World!\t

", - expected: "

Hello World!

", + desc: "mdn example: inline formatting context", + input: "

Hello \n\t\t\t\t World!\t

", // -> https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace + + expectedBefore: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─h1 +│ │ │ ├─#text " Hello \n\t\t\t\t" +│ │ │ ├─span +│ │ │ │ ├─#text " World!" +│ │ │ ├─#text "\t " + `, + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─h1 +│ │ │ ├─#text "Hello " +│ │ │ ├─span +│ │ │ │ ├─#text "World!" +│ │ │ ├─#text "" + `, }, { - desc: "mdn example: block formatting contexts", - input: "\n\t
Hello
\n\n
World!
\n", - expected: "
Hello
World!
", + desc: "mdn example: block formatting contexts", + input: "\n\t
Hello
\n\n
World!
\n", + expectedBefore: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─#text "\n\t" +│ │ ├─div +│ │ │ ├─#text " Hello " +│ │ ├─#text "\n\n " +│ │ ├─div +│ │ │ ├─#text " World! " +│ │ ├─#text " \n" + `, + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─div +│ │ │ ├─#text "Hello" +│ │ ├─div +│ │ │ ├─#text "World!" + `, }, // - - - - - - Comments - - - - - - // { - desc: "#comment inside paragraph", - input: `

beforeafter

`, - expected: `

beforeafter

`, + desc: "#comment inside paragraph", + input: `

beforeafter

`, + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─#text "before" +│ │ │ ├─#comment +│ │ │ ├─#text "after" + `, }, { - desc: "#comment inside paragraph (with spaces)", - input: `

before after

`, - expected: `

before after

`, + desc: "#comment inside paragraph (with spaces)", + input: `

before after

`, + expectedBefore: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─#text "before " +│ │ │ ├─#comment +│ │ │ ├─#text " after" + `, + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─p +│ │ │ ├─#text "before " +│ │ │ ├─#comment +│ │ │ ├─#text "after" + `, }, { - desc: "#comment inside div", - input: `
beforeafter
`, - expected: `
beforeafter
`, + desc: "#comment inside div", + input: `
beforeafter
`, + expectedBefore: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─div +│ │ │ ├─#text "before" +│ │ │ ├─#comment +│ │ │ ├─#text "after" + `, + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─div +│ │ │ ├─#text "before" +│ │ │ ├─#comment +│ │ │ ├─#text "after" + `, }, { - desc: "#comment inside div (with spaces)", - input: `
before after
`, - expected: `
before after
`, + desc: "#comment inside div (with spaces)", + input: `
before after
`, + expectedBefore: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─div +│ │ │ ├─#text "before " +│ │ │ ├─#comment +│ │ │ ├─#text " after" + `, + expectedAfter: ` +#document +├─html +│ ├─head +│ ├─body +│ │ ├─div +│ │ │ ├─#text "before " +│ │ │ ├─#comment +│ │ │ ├─#text "after" + `, }, } @@ -270,20 +577,16 @@ func TestCollapse_Table(t *testing.T) { t.Run(run.desc, func(t *testing.T) { doc, err := html.Parse(strings.NewReader(run.input)) if err != nil { - t.Error(err) + t.Fatal(err) } - Collapse(doc, nil) - - var buf bytes.Buffer - err = html.Render(&buf, getBody(doc)) - if err != nil { - t.Error(err) + if run.expectedBefore != "" { + tester.ExpectRepresentation(t, doc, "before", run.expectedBefore) } - if buf.String() != run.expected { - t.Errorf("expected %q but got %q", run.expected, buf.String()) - } + Collapse(doc, nil) + + tester.ExpectRepresentation(t, doc, "after", run.expectedAfter) }) } } diff --git a/plugin/table/1_select.go b/plugin/table/1_select.go new file mode 100644 index 0000000..944b61b --- /dev/null +++ b/plugin/table/1_select.go @@ -0,0 +1,50 @@ +package table + +import ( + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +func selectHeaderRowNode(node *html.Node) *html.Node { + thead := dom.FindFirstNode(node, func(n *html.Node) bool { + return dom.NodeName(n) == "thead" + }) + if thead != nil { + firstTr := dom.FindFirstNode(thead, func(n *html.Node) bool { + return dom.NodeName(n) == "tr" + }) + if firstTr != nil { + // YEAH we found the "tr" inside the "thead" + return firstTr + } + } + + firstTh := dom.FindFirstNode(node, func(n *html.Node) bool { + return dom.NodeName(n) == "th" + }) + if firstTh != nil { + // YEAH we found the "th" + return firstTh.Parent + } + + return nil +} +func selectNormalRowNodes(tableNode *html.Node, selectedHeaderRowNode *html.Node) []*html.Node { + var collected []*html.Node + + var finder func(node *html.Node) + finder = func(node *html.Node) { + name := dom.NodeName(node) + if name == "tr" && 
node != selectedHeaderRowNode { + // We want to make sure to not select the header row a *second* time. + collected = append(collected, node) + } + + for child := node.FirstChild; child != nil; child = child.NextSibling { + finder(child) + } + } + finder(tableNode) + + return collected +} diff --git a/plugin/table/1_select_test.go b/plugin/table/1_select_test.go new file mode 100644 index 0000000..837073f --- /dev/null +++ b/plugin/table/1_select_test.go @@ -0,0 +1,218 @@ +package table + +import ( + "testing" + + "github.com/JohannesKaufmann/html-to-markdown/v2/collapse" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/tester" + "golang.org/x/net/html" +) + +func TestSelectRowNodes(t *testing.T) { + runs := []struct { + desc string + input string + + expected string + }{ + { + desc: "invalid table", + input: ` + + + there is no data cell tag + +
+ `, + + // Note: "golang.org/x/net/html" automatically cleans up the "table" + expected: ` +├─body +│ ├─#text "there is no data cell tag" +│ ├─table +│ │ ├─tbody +│ │ │ ├─tr (__test_normal_row__="true") + `, + }, + { + desc: "completely empty table", + input: `
`, + + expected: ` +├─body +│ ├─table + `, + }, + { + desc: "completely empty tbody", + input: `
`, + + expected: ` +├─body +│ ├─table +│ │ ├─tbody + `, + }, + { + desc: "basic table", + input: ` + + + + + + + + + +
A1A2
B1B2
+ `, + // Note: "golang.org/x/net/html" automatically adds the "tbody" + expected: ` +├─body +│ ├─table +│ │ ├─tbody +│ │ │ ├─tr (__test_normal_row__="true") +│ │ │ │ ├─td +│ │ │ │ │ ├─#text "A1" +│ │ │ │ ├─td +│ │ │ │ │ ├─#text "A2" +│ │ │ ├─tr (__test_normal_row__="true") +│ │ │ │ ├─td +│ │ │ │ │ ├─#text "B1" +│ │ │ │ ├─td +│ │ │ │ │ ├─#text "B2" + `, + }, + { + desc: "basic table with th", + input: ` + + + + + + + +
Heading 1 + Heading 2 +
A1A2
+ `, + expected: ` +├─body +│ ├─table +│ │ ├─tbody +│ │ │ ├─tr (__test_header_row__="true") +│ │ │ │ ├─th +│ │ │ │ │ ├─#text "Heading 1" +│ │ │ │ ├─th +│ │ │ │ │ ├─#text "Heading 2" +│ │ │ ├─tr (__test_normal_row__="true") +│ │ │ │ ├─td +│ │ │ │ │ ├─#text "A1" +│ │ │ │ ├─td +│ │ │ │ │ ├─#text "A2" + `, + }, + { + desc: "with caption, thead, tbody, tfoot", + input: ` + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ A description about the table +
NameCityAge
Max MustermannBerlin20
Max MüllerMünchen30
Average age25
+ `, + expected: ` +├─body +│ ├─table +│ │ ├─caption +│ │ │ ├─#text "A description about the table" +│ │ ├─thead +│ │ │ ├─tr (__test_header_row__="true") +│ │ │ │ ├─th (scope="col") +│ │ │ │ │ ├─#text "Name" +│ │ │ │ ├─th (scope="col") +│ │ │ │ │ ├─#text "City" +│ │ │ │ ├─th (scope="col") +│ │ │ │ │ ├─#text "Age" +│ │ ├─tbody +│ │ │ ├─tr (__test_normal_row__="true") +│ │ │ │ ├─th (scope="row") +│ │ │ │ │ ├─#text "Max Mustermann" +│ │ │ │ ├─td +│ │ │ │ │ ├─#text "Berlin" +│ │ │ │ ├─td +│ │ │ │ │ ├─#text "20" +│ │ │ ├─tr (__test_normal_row__="true") +│ │ │ │ ├─th (scope="row") +│ │ │ │ │ ├─#text "Max Müller" +│ │ │ │ ├─td +│ │ │ │ │ ├─#text "München" +│ │ │ │ ├─td +│ │ │ │ │ ├─#text "30" +│ │ ├─tfoot +│ │ │ ├─tr (__test_normal_row__="true") +│ │ │ │ ├─th (scope="row" colspan="2") +│ │ │ │ │ ├─#text "Average age" +│ │ │ │ ├─td +│ │ │ │ │ ├─#text "25" + `, + }, + } + for _, run := range runs { + t.Run(run.desc, func(t *testing.T) { + doc := tester.Parse(t, run.input, "") + + // NOTE FOR FUTURE: I discovered that "golang.org/x/net/html" automatically adds the "tbody". + // => So we probably don't need to do that much work beforehand. + collapse.Collapse(doc, nil) + + { + // We can then see if we correctly *identified* all the necessary table components. + // For that we add an attribute (just for the test). 
+ + headerRow := selectHeaderRowNode(doc) + if headerRow != nil { + headerRow.Attr = append(headerRow.Attr, html.Attribute{ + Key: "__test_header_row__", + Val: "true", + }) + } + for _, n := range selectNormalRowNodes(doc, headerRow) { + n.Attr = append(n.Attr, html.Attribute{ + Key: "__test_normal_row__", + Val: "true", + }) + } + } + + tester.ExpectRepresentation(t, doc, "output", run.expected) + }) + } +} diff --git a/plugin/table/2_collect.go b/plugin/table/2_collect.go new file mode 100644 index 0000000..b1c114c --- /dev/null +++ b/plugin/table/2_collect.go @@ -0,0 +1,247 @@ +package table + +import ( + "bytes" + + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/marker" + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +type tableContent struct { + Alignments []string + Rows [][][]byte + Caption []byte +} + +func containsNewline(b []byte) bool { + return bytes.Contains(b, []byte("\n")) +} + +func hasProblematicChildNode(node *html.Node) bool { + problematicNode := dom.FindFirstNode(node, func(n *html.Node) bool { + name := dom.NodeName(n) + + if dom.NameIsHeading(name) { + return true + } + switch name { + case "table": + // This will be caught with the newline check anyway. + // But we can safe some effort by aborting early... 
+ return true + case "br", "hr", "ul", "ol", "blockquote": + return true + } + + return false + }) + + return problematicNode != nil +} + +func hasProblematicParentNode(node *html.Node) bool { + p := node.Parent + + for p != nil { + name := dom.NodeName(p) + if name == "a" { + return true + } + if name == "strong" || name == "b" { + return true + } + if name == "em" || name == "i" { + return true + } + if name == "del" || name == "s" || name == "strike" { + return true + } + + p = p.Parent + } + + return false +} +func (p *tablePlugin) collectTableContent(ctx converter.Context, node *html.Node) *tableContent { + if role := dom.GetAttributeOr(node, "role", ""); role == "presentation" { + // In HTML-Emails many tables are used. Oftentimes these tables are nested + // which is not possible with markdown. But these tables are mostly used + // for *layout purposes* rather than displaying actual tabular data. + if !p.convertPresentationTables { + // So lets skip those with role="presentation" and focus on real tables... + return nil + } + } + if hasProblematicChildNode(node) { + // There are certain nodes (e.g.
) that cannot be in a table. + // If we found one, we unfortunately cannot convert the table. + // + // Note: It is okay for a block node (e.g.
) to be in a table. + // However once it causes multiple lines, it does not work anymore. + // For that we have the `containsNewline` check below. + return nil + } + + if hasProblematicParentNode(node) { + // There are certain parent nodes (e.g. ) that cannot contain a table. + // We would break the rendering of the link, so we unfortunately cannot convert the table. + return nil + } + + headerRowNode := selectHeaderRowNode(node) + normalRowNodes := selectNormalRowNodes(node, headerRowNode) + + rows := p.collectRows(ctx, headerRowNode, normalRowNodes) + if len(rows) == 0 { + return nil + } + + for _, cells := range rows { + for _, cell := range cells { + if containsNewline(cell) { + // Having newlines inside the content would break the table. + // So unfortunately we cannot convert the table. + return nil + } + } + } + + return &tableContent{ + Alignments: collectAlignments(headerRowNode, normalRowNodes), + Rows: rows, + Caption: collectCaption(ctx, node), + } +} + +// Sometimes a cell wants to *span* over multiple columns or/and rows. +// What should be displayed in those other cells? +// Render exactly the same content OR an empty string? +func (p *tablePlugin) getContentForMergedCell(originalContent []byte) []byte { + if p.spanCellBehavior == SpanBehaviorMirror { + return originalContent + } + + return []byte("") +} + +func getFirstNode(node *html.Node, nodes ...*html.Node) *html.Node { + if node != nil { + return node + } + if len(nodes) >= 1 { + return nodes[0] + } + return nil +} + +func collectAlignments(headerRowNode *html.Node, rowNodes []*html.Node) []string { + firstRow := getFirstNode(headerRowNode, rowNodes...) 
+ if firstRow == nil { + return nil + } + + cellNodes := dom.FindAllNodes(firstRow, func(node *html.Node) bool { + name := dom.NodeName(node) + return name == "th" || name == "td" + }) + + var alignments []string + for _, cellNode := range cellNodes { + align := dom.GetAttributeOr(cellNode, "align", "") + + alignments = append(alignments, align) + } + + return alignments +} +func (p *tablePlugin) collectCellsInRow(ctx converter.Context, rowIndex int, rowNode *html.Node) ([][]byte, []modification) { + cellNodes := dom.FindAllNodes(rowNode, func(node *html.Node) bool { + name := dom.NodeName(node) + return name == "th" || name == "td" + }) + + cellContents := make([][]byte, 0, len(cellNodes)) + modifications := make([]modification, 0) + + for index, cellNode := range cellNodes { + var buf bytes.Buffer + ctx.RenderNodes(ctx, &buf, cellNode) + + content := buf.Bytes() + content = bytes.TrimSpace(content) + + // The character "|" inside the content would mistakenly be recognized as part of the table. So we have to escape it. + content = bytes.Replace(content, []byte{byte(marker.MarkerEscaping), '|'}, []byte(`\|`), -1) + content = ctx.UnEscapeContent(content) + + cellContents = append(cellContents, content) + + // - - col / row span - - // + rowSpan := getNumberAttributeOr(cellNode, "rowspan", 1) + colSpan := getNumberAttributeOr(cellNode, "colspan", 1) + + mods := calculateModifications(rowIndex, index, rowSpan, colSpan, p.getContentForMergedCell(content)) + + modifications = append(modifications, mods...) + } + + return cellContents, modifications +} +func (p *tablePlugin) collectRows(ctx converter.Context, headerRowNode *html.Node, rowNodes []*html.Node) [][][]byte { + rowContents := make([][][]byte, 0, len(rowNodes)+1) + groupedModifications := make([][]modification, 0) + + // - - 1. 
the header row - - // + if headerRowNode != nil { + cells, mods := p.collectCellsInRow(ctx, 0, headerRowNode) + + rowContents = append(rowContents, cells) + groupedModifications = append(groupedModifications, mods) + } else { + // There needs to be *header* row so that the table is recognized. + // So it is better to have an empty header row... + rowContents = append(rowContents, [][]byte{}) + } + + // - - 2. the normal rows - - // + for index, rowNode := range rowNodes { + cells, mods := p.collectCellsInRow(ctx, index+1, rowNode) + + rowContents = append(rowContents, cells) + groupedModifications = append(groupedModifications, mods) + } + + // Sometimes a cell wants to *span* over multiple columns or/and rows. + // We collected these modifications and are now applying it, + // by shifting the cells around. + rowContents = applyGroupedModifications(rowContents, groupedModifications) + + if p.skipEmptyRows { + rowContents = removeEmptyRows(rowContents) + } + if p.promoteFirstRowToHeader { + rowContents = removeFirstRowIfEmpty(rowContents) + } + + return rowContents +} + +func collectCaption(ctx converter.Context, node *html.Node) []byte { + captionNode := dom.FindFirstNode(node, func(node *html.Node) bool { + return node.DataAtom == atom.Caption + }) + if captionNode == nil { + return nil + } + + var buf bytes.Buffer + ctx.RenderNodes(ctx, &buf, captionNode) + + content := buf.Bytes() + content = bytes.TrimSpace(content) + + return content +} diff --git a/plugin/table/3_render.go b/plugin/table/3_render.go new file mode 100644 index 0000000..d941487 --- /dev/null +++ b/plugin/table/3_render.go @@ -0,0 +1,105 @@ +package table + +import ( + "strings" + "unicode/utf8" + + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "golang.org/x/net/html" +) + +func (p *tablePlugin) renderTable(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + table := p.collectTableContent(ctx, n) + if table == nil { + // Sometime we just cannot 
render the table. + // Either because it is an empty table OR + // because there are newlines inside the content (which would break the table). + return converter.RenderTryNext + } + + // Sometimes we pad the cells with extra spaces (e.g. "| text |"). + // For that we first need to know the maximum width of every column. + counts := calculateMaxCounts(table.Rows) + + // Sometimes a row contains less cells that another row. + // We then fill it up with empty cells (e.g. "| text | |"). + table.Rows = fillUpRows(table.Rows, len(counts)) + + // - - - - - - - - - - - - - - - - - - - - - - - - - - // + + w.WriteString("\n\n") + // - - - Header - - - // + p.writeRow(w, counts, table.Rows[0]) + w.WriteString("\n") + p.writeHeaderUnderline(w, table.Alignments, counts) + w.WriteString("\n") + + // - - - Body - - - // + for _, cells := range table.Rows[1:] { + p.writeRow(w, counts, cells) + w.WriteString("\n") + } + + // - - - Caption - - - // + if table.Caption != nil { + w.WriteString("\n\n") + w.Write(table.Caption) + + } + // - - - - - - // + w.WriteString("\n\n") + + return converter.RenderSuccess +} + +func getAlignmentFor(alignments []string, index int) string { + if index > len(alignments)-1 { + return "" + } + + return alignments[index] +} +func (s *tablePlugin) writeHeaderUnderline(w converter.Writer, alignments []string, counts []int) { + for i, maxLength := range counts { + align := getAlignmentFor(alignments, i) + + isFirstCell := i == 0 + if isFirstCell { + w.WriteString("|") + } + if align == "left" || align == "center" { + w.WriteString(":") + } else { + w.WriteString("-") + } + + w.WriteString(strings.Repeat("-", maxLength)) + + if align == "right" || align == "center" { + w.WriteString(":") + } else { + w.WriteString("-") + } + w.WriteString("|") + } +} + +func (s *tablePlugin) writeRow(w converter.Writer, counts []int, cells [][]byte) { + for i, cell := range cells { + isFirstCell := i == 0 + if isFirstCell { + w.WriteString("|") + } + w.WriteString(" ") + 
w.Write(cell) + + currentCount := utf8.RuneCount(cell) + filler := counts[i] - currentCount + + if filler > 0 { + w.WriteString(strings.Repeat(" ", filler)) + } + + w.WriteString(" |") + } +} diff --git a/plugin/table/table.go b/plugin/table/table.go new file mode 100644 index 0000000..e413812 --- /dev/null +++ b/plugin/table/table.go @@ -0,0 +1,108 @@ +package table + +import ( + "github.com/JohannesKaufmann/dom" + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "golang.org/x/net/html" +) + +type option func(p *tablePlugin) + +type spanCellBehavior string + +const ( + // SpanBehaviorEmpty renders an empty cell. + SpanBehaviorEmpty spanCellBehavior = "empty" + // SpanBehaviorMirror renders the same content as the original cell. + SpanBehaviorMirror spanCellBehavior = "mirror" +) + +// WithSpanCellBehavior configures how cells affected by colspan/rowspan attributes +// should be rendered. When a cell spans multiple columns or rows, the affected cells +// can either be empty or contain the same content as the original cell. +func WithSpanCellBehavior(behavior spanCellBehavior) option { + return func(p *tablePlugin) { + p.spanCellBehavior = behavior + } +} + +// WithSkipEmptyRows configures the table plugin to omit empty rows from the output. +// An empty row is defined as a row where all cells contain no content or only whitespace. +// When set to true, empty rows will be omitted from the output. When false (default), +// all rows are preserved. +func WithSkipEmptyRows(skip bool) option { + return func(p *tablePlugin) { + p.skipEmptyRows = skip + } +} + +// WithHeaderPromotion configures whether the first row should be treated as a header +// when the table has no explicit header row (e.g. elements). When set to true, the +// first row will be converted to a header row with separator dashes. When false (default), +// all rows are treated as regular content. 
+func WithHeaderPromotion(promote bool) option { + return func(p *tablePlugin) { + p.promoteFirstRowToHeader = promote + } +} + +// WithPresentationTables configures whether tables marked with role="presentation" +// should be converted to markdown. When set to true, presentation tables will be +// converted like regular tables. When false (default), these tables are skipped +// since they typically represent layout rather than semantic content. +func WithPresentationTables(convert bool) option { + return func(p *tablePlugin) { + p.convertPresentationTables = convert + } +} + +type tablePlugin struct { + spanCellBehavior spanCellBehavior + skipEmptyRows bool + promoteFirstRowToHeader bool + convertPresentationTables bool +} + +func NewTablePlugin(opts ...option) converter.Plugin { + plugin := &tablePlugin{} + for _, opt := range opts { + opt(plugin) + } + return plugin +} + +func (s *tablePlugin) Name() string { + return "table" +} + +func (s *tablePlugin) Init(conv *converter.Converter) error { + + conv.Register.EscapedChar('|') + + conv.Register.Renderer(s.handleRender, converter.PriorityStandard) + + return nil +} + +func (s *tablePlugin) handleRender(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + name := dom.NodeName(n) + switch name { + case "table": + return s.renderTable(ctx, w, n) + + case "tr": + // Normally, when the "table" gets rendered we do NOT go into this case. + // But as a fallback we separate the rows through newlines. 
+ return s.renderFallbackRow(ctx, w, n) + + } + + return converter.RenderTryNext +} + +func (s *tablePlugin) renderFallbackRow(ctx converter.Context, w converter.Writer, n *html.Node) converter.RenderStatus { + w.WriteString("\n\n") + ctx.RenderChildNodes(ctx, w, n) + w.WriteString("\n\n") + return converter.RenderSuccess +} diff --git a/plugin/table/table_test.go b/plugin/table/table_test.go new file mode 100644 index 0000000..ef14828 --- /dev/null +++ b/plugin/table/table_test.go @@ -0,0 +1,597 @@ +package table + +import ( + "bytes" + "strings" + "testing" + + "github.com/JohannesKaufmann/html-to-markdown/v2/converter" + "github.com/JohannesKaufmann/html-to-markdown/v2/internal/tester" + "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base" + "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark" +) + +func TestGoldenFiles(t *testing.T) { + goldenFileConvert := func(htmlInput []byte) ([]byte, error) { + conv := converter.NewConverter( + converter.WithPlugins( + base.NewBasePlugin(), + commonmark.NewCommonmarkPlugin(), + NewTablePlugin(), + ), + ) + + return conv.ConvertReader(bytes.NewReader(htmlInput)) + } + + tester.GoldenFiles(t, goldenFileConvert, goldenFileConvert) +} + +func TestOptionFunc_ColRowSpan(t *testing.T) { + testCases := []struct { + desc string + input string + options []option + expected string + }{ + // - - - - - - - - - - default - - - - - - - - - - // + { + desc: "default", + options: []option{ + WithSpanCellBehavior(SpanBehaviorEmpty), + }, + input: ` + + + + + +
AB
+ `, + expected: ` +| | | | | +|---|---|---|---| +| A | B | | | + `, + }, + + // - - - - - - - - - - colspan - - - - - - - - - - // + { + desc: "colspan=3", + options: []option{ + WithSpanCellBehavior(SpanBehaviorMirror), + }, + input: ` + + + + + +
AB
+ `, + expected: ` +| | | | | +|---|---|---|---| +| A | B | B | B | + `, + }, + // - - - - - - - - - - rowspan - - - - - - - - - - // + { + desc: "rowspan=3", + options: []option{ + WithSpanCellBehavior(SpanBehaviorMirror), + }, + input: ` + + + + + +
AB
+ `, + expected: ` +| | | +|---|---| +| A | B | +| | B | +| | B | + `, + }, + + // - - - - - - - - - - colspan & rowspan - - - - - - - - - - // + { + desc: "cell with colspan and rowspan", + options: []option{ + WithSpanCellBehavior(SpanBehaviorMirror), + }, + input: ` + + + + + + +
ABC
+ `, + expected: ` +| | | | | | +|---|---|---|---|---| +| A | B | B | B | C | +| | B | B | B | | +| | B | B | B | | + `, + }, + { + desc: "shifting content", + options: []option{ + WithSpanCellBehavior(SpanBehaviorMirror), + }, + input: ` + + + + + + + + + + + +
ABC
123
+ `, + expected: ` +| | | | | | | +|---|---|---|---|---|---| +| A | B | B | B | C | | +| 1 | B | B | B | 2 | 3 | +| | B | B | B | | | + `, + }, + { + desc: "rowspans overlap with colspans", + options: []option{ + WithSpanCellBehavior(SpanBehaviorMirror), + }, + input: ` + + + + + + + + + + + + + +
ABC
DE
F
+ `, + expected: ` +| | | | | +|---|---|---|---| +| A | B | B | C | +| A | D | D | E | +| A | D | D | F | + `, + }, + } + for _, tC := range testCases { + t.Run(tC.desc, func(t *testing.T) { + conv := converter.NewConverter( + converter.WithPlugins( + base.NewBasePlugin(), + commonmark.NewCommonmarkPlugin(), + NewTablePlugin(tC.options...), + ), + ) + + output, err := conv.ConvertString(tC.input) + if err != nil { + t.Error(err) + } + + actual := strings.TrimSpace(output) + expected := strings.TrimSpace(tC.expected) + + if actual != expected { + t.Errorf("expected\n%s\nbut got\n%s\n", expected, actual) + } + }) + } +} + +func TestOptionFunc_EmptyRows(t *testing.T) { + testCases := []struct { + desc string + input string + options []option + expected string + }{ + // - - - - - - - - - - default - - - - - - - - - - // + { + desc: "by default keep empty rows", + options: []option{}, + input: ` + + + + + + + + + + + + + +
B1
A3
+ `, + expected: ` +| | | +|----|----| +| | B1 | +| | | +| A3 | | + `, + }, + { + desc: "some rows are empty", + options: []option{ + WithSkipEmptyRows(true), + }, + input: ` + + + + + + + + + + + + + + + + + +
B1
A3
+ `, + expected: ` +| | | +|----|----| +| | B1 | +| A3 | | + `, + }, + { + desc: "all rows are empty", + options: []option{ + WithSkipEmptyRows(true), + }, + input: ` +

Before

+ + + + + + + + + + + + + + + +
A description
+ +

After

+ `, + expected: ` +Before + +A description + +After + `, + }, + { + desc: "element that is not rendered", + options: []option{ + WithSkipEmptyRows(true), + }, + input: ` +

Before

+ + + + + +
+ +
+ +

After

+ `, + expected: ` +Before + +After + `, + }, + } + for _, tC := range testCases { + t.Run(tC.desc, func(t *testing.T) { + conv := converter.NewConverter( + converter.WithPlugins( + base.NewBasePlugin(), + commonmark.NewCommonmarkPlugin(), + NewTablePlugin(tC.options...), + ), + ) + + output, err := conv.ConvertString(tC.input) + if err != nil { + t.Error(err) + } + + actual := strings.TrimSpace(output) + expected := strings.TrimSpace(tC.expected) + + if actual != expected { + t.Errorf("expected\n%s\nbut got\n%s\n", expected, actual) + } + }) + } +} + +func TestOptionFunc_PromoteHeader(t *testing.T) { + testCases := []struct { + desc string + input string + options []option + expected string + }{ + // - - - - - - - - - - default - - - - - - - - - - // + { + desc: "default", + options: []option{}, + input: ` + + + + + + + + + +
A1B1
A2B2
+ `, + expected: ` +| | | +|----|----| +| A1 | B1 | +| A2 | B2 | + `, + }, + { + desc: "not needed", + options: []option{ + WithHeaderPromotion(true), + }, + input: ` + + + + + + + + + + + + + +
HeadingHeading
A1B1
A2B2
+ `, + expected: ` +| Heading | Heading | +|---------|---------| +| A1 | B1 | +| A2 | B2 | + `, + }, + + { + desc: "promote first row", + options: []option{ + WithHeaderPromotion(true), + }, + input: ` + + + + + + + + + +
A1B1
A2B2
+ `, + expected: ` +| A1 | B1 | +|----|----| +| A2 | B2 | + `, + }, + { + desc: "promote first row (but it is empty)", + options: []option{ + WithHeaderPromotion(true), + }, + input: ` + + + + + + + + + + + + + +
A1B1
A2B2
+ `, + expected: ` +| | | +|----|----| +| A1 | B1 | +| A2 | B2 | + `, + }, + { + desc: "deleted empty rows & promoted first row", + options: []option{ + WithHeaderPromotion(true), + WithSkipEmptyRows(true), + }, + input: ` + + + + + + + + + + + + + +
A1B1
A2B2
+ `, + expected: ` +| A1 | B1 | +|----|----| +| A2 | B2 | + `, + }, + } + for _, tC := range testCases { + t.Run(tC.desc, func(t *testing.T) { + conv := converter.NewConverter( + converter.WithPlugins( + base.NewBasePlugin(), + commonmark.NewCommonmarkPlugin(), + NewTablePlugin(tC.options...), + ), + ) + + output, err := conv.ConvertString(tC.input) + if err != nil { + t.Error(err) + } + + actual := strings.TrimSpace(output) + expected := strings.TrimSpace(tC.expected) + + if actual != expected { + t.Errorf("expected\n%s\nbut got\n%s\n", expected, actual) + } + }) + } +} + +func TestOptionFunc_PresentationTable(t *testing.T) { + testCases := []struct { + desc string + input string + options []option + expected string + }{ + { + desc: "default", + options: []option{}, + input: ` + + + + + + + + + +
A1A2
B1B2
+ `, + expected: ` +A1 A2 + +B1 B2 + `, + }, + { + desc: "keep the presentation table", + options: []option{ + WithPresentationTables(true), + }, + input: ` + + + + + + + + + +
A1A2
B1B2
+ `, + expected: ` +| | | +|----|----| +| A1 | A2 | +| B1 | B2 | + `, + }, + } + for _, tC := range testCases { + t.Run(tC.desc, func(t *testing.T) { + conv := converter.NewConverter( + converter.WithPlugins( + base.NewBasePlugin(), + commonmark.NewCommonmarkPlugin(), + NewTablePlugin(tC.options...), + ), + ) + + output, err := conv.ConvertString(tC.input) + if err != nil { + t.Error(err) + } + + actual := strings.TrimSpace(output) + expected := strings.TrimSpace(tC.expected) + + if actual != expected { + t.Errorf("expected\n%s\nbut got\n%s\n", expected, actual) + } + }) + } +} diff --git a/plugin/table/testdata/.gitattributes b/plugin/table/testdata/.gitattributes new file mode 100644 index 0000000..a8d2daa --- /dev/null +++ b/plugin/table/testdata/.gitattributes @@ -0,0 +1,4 @@ + +# Leave the files untouched. Otherwise they might be +# changed when cloning the repo on Windows... +* -text diff --git a/plugin/table/testdata/GoldenFiles/basics.in.html b/plugin/table/testdata/GoldenFiles/basics.in.html new file mode 100644 index 0000000..c381ca5 --- /dev/null +++ b/plugin/table/testdata/GoldenFiles/basics.in.html @@ -0,0 +1,234 @@ + + A caption outside a table + + +
+ + +
+ + + +
+ + + + + +
+ The caption text of the empty table +
+ + + + + + +
+ +
+ + + + + + + + + + + + + +
+ + + + + + + + + + + + + + +
B1
A3
+ +
+ + + + + + + + + + + + + + + +
A1A2
B1B2
C1C2
+ +
+ + + + + + + + + + +
NameCityAge
+ +
+ + + + + + + + + + + + + + + + + + +
CompanyContactCountry
Company AMax MustermannDE
Company BJohn DoeUS
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ A description about the + table +
NameCityAge
Max MustermannBerlin20space for the note
Max MüllerMünchen30
Peter MustermannMünchen
Average age25
+ +
+ + + + + + + + + + + + + + + + + +
LeftCenterRight
ABC
+ + + + + + + + + + + + +
LeftCenterRight
ABC
+ + + + + + + + + + + + +
+ +
+ +

A | B

+ + + + + + + + + + + + + + + + + + +
A (B) CA **B** C
A (B)A *B*
A | B
+ +
+ + + + + + + + + + + +
A1A2
B1B2
diff --git a/plugin/table/testdata/GoldenFiles/basics.out.md b/plugin/table/testdata/GoldenFiles/basics.out.md new file mode 100644 index 0000000..3db6972 --- /dev/null +++ b/plugin/table/testdata/GoldenFiles/basics.out.md @@ -0,0 +1,80 @@ +A caption outside a table + +* * * + +The caption text of the empty table + +* * * + +| | | +|---|---| +| | | +| | | + +| | | +|----|----| +| | B1 | +| | | +| A3 | | + +* * * + +| | | +|----|----| +| A1 | A2 | +| B1 | B2 | +| C1 | C2 | + +* * * + +| Name | City | Age | +|------|------|-----| + +* * * + +| Company | Contact | Country | +|-----------|----------------|---------| +| Company A | Max Mustermann | DE | +| Company B | John Doe | US | + +* * * + +| Name | City | Age | | +|------------------|---------|-----|--------------------| +| Max Mustermann | Berlin | 20 | space for the note | +| Max Müller | München | 30 | | +| Peter Mustermann | München | | | +| Average age | | 25 | | + +A description about the `table` + +* * * + +| Left | Center | Right | +|:-----|:------:|------:| +| A | B | C | + +| | | | +|:-----|:------:|------:| +| Left | Center | Right | +| A | B | C | + +| | | | +|:--|:-:|--:| +| | | | +| | | | + +* * * + +A | B + +| A (B) C | A \*\*B\** C | +|---------|--------------| +| A (B) | A \*B* | +| A \| B | | + +* * * + +A1 A2 + +B1 B2 \ No newline at end of file diff --git a/plugin/table/testdata/GoldenFiles/col_row_span.in.html b/plugin/table/testdata/GoldenFiles/col_row_span.in.html new file mode 100644 index 0000000..578b0f3 --- /dev/null +++ b/plugin/table/testdata/GoldenFiles/col_row_span.in.html @@ -0,0 +1,62 @@ + + + + + + + +
A1B1C1
+ +
+ + + + + + + + + + + + + +
wide cellB1
A2B2C2
+ + + + + + + + + + + + + + + + + + +
tall cellB1C1
A2B2
A3B3C3
+ + + + + + + + + + + + + + + + + + +
big cellB1
A2
A3B3C3
diff --git a/plugin/table/testdata/GoldenFiles/col_row_span.out.md b/plugin/table/testdata/GoldenFiles/col_row_span.out.md new file mode 100644 index 0000000..df42107 --- /dev/null +++ b/plugin/table/testdata/GoldenFiles/col_row_span.out.md @@ -0,0 +1,22 @@ +| | | | +|----|----|----| +| A1 | B1 | C1 | + +* * * + +| | | | +|-----------|----|----| +| wide cell | | B1 | +| A2 | B2 | C2 | + +| | | | +|-----------|----|----| +| tall cell | B1 | C1 | +| | A2 | B2 | +| A3 | B3 | C3 | + +| | | | +|----------|----|----| +| big cell | | B1 | +| | | A2 | +| A3 | B3 | C3 | \ No newline at end of file diff --git a/plugin/table/testdata/GoldenFiles/contents.in.html b/plugin/table/testdata/GoldenFiles/contents.in.html new file mode 100644 index 0000000..8786cf7 --- /dev/null +++ b/plugin/table/testdata/GoldenFiles/contents.in.html @@ -0,0 +1,140 @@ + + + + + + + + + + +
+ A1 + + B1 +
+ A2 + + B2 +
+ +
+ + + + + + +

Some normal content

+ + + + + +
Some normal content
+ + + + + +
+
+

Some normal content

+
+
+ +
+ + + + + + +
The content
with break
+ + + + + +

Heading

+ + + + + + +

not the empty heading
+ + + + + +

+ + + + + +
+
Code block
+
+ + + + + +
+
Blockquote
+
+ + + + + +
+
    +
  • Unordered List
  • +
+
+ + + + + +
+
    +
  1. Ordered List
  2. +
+
+ +
+ + + + + + +
+ + + + +
Nested Table
+
+ +
+ + + + + + + + +
Other cell + + + + +
Nested Table
+
Another cell
diff --git a/plugin/table/testdata/GoldenFiles/contents.out.md b/plugin/table/testdata/GoldenFiles/contents.out.md new file mode 100644 index 0000000..fb7f9e4 --- /dev/null +++ b/plugin/table/testdata/GoldenFiles/contents.out.md @@ -0,0 +1,57 @@ +| | | +|--------|---------| +| **A1** | *B1* | +| `A2` | [B2](/) | + +* * * + +| | +|---------------------| +| Some normal content | + +| | +|---------------------| +| Some normal content | + +| | +|---------------------| +| Some normal content | + +* * * + +The content +with break + +# Heading + +not the empty heading + +* * * + +``` +Code block +``` + +> Blockquote + +- Unordered List + + + +1. Ordered List + +* * * + +| | +|--------------| +| Nested Table | + +* * * + +Other cell + +| | +|--------------| +| Nested Table | + +Another cell \ No newline at end of file diff --git a/plugin/table/testdata/GoldenFiles/email.in.html b/plugin/table/testdata/GoldenFiles/email.in.html new file mode 100644 index 0000000..a3a01bc --- /dev/null +++ b/plugin/table/testdata/GoldenFiles/email.in.html @@ -0,0 +1,248 @@ + + + +
+ +
+ + + + + + +
+ +
+ + + + + + + + + + + + +
+ + + + + + +
+ +
+
+

+ +
+
+ normal body content +
+
+
+ +
+
+ +
+ + + + + + +
+ +
+ + + + + + +
+ + + + + + + + + +
A1A2
B1B2
+
+
+ +
+
+ +
+ + diff --git a/plugin/table/testdata/GoldenFiles/email.out.md b/plugin/table/testdata/GoldenFiles/email.out.md new file mode 100644 index 0000000..c8c5718 --- /dev/null +++ b/plugin/table/testdata/GoldenFiles/email.out.md @@ -0,0 +1,7 @@ +![](/assets/picture.png) + +normal body content + +| A1 | A2 | +|----|----| +| B1 | B2 | \ No newline at end of file diff --git a/plugin/table/testdata/GoldenFiles/parents.in.html b/plugin/table/testdata/GoldenFiles/parents.in.html new file mode 100644 index 0000000..2fb0f77 --- /dev/null +++ b/plugin/table/testdata/GoldenFiles/parents.in.html @@ -0,0 +1,110 @@ + + +
+ The blockquote content: + + + + + + +
A1A2
+
+ +
+ +
    +
  1. The list item content
  2. +
  3. + + + + + +
    A1A2
    +
  4. +
+ +
+ + + + + + +
A1A2
+
+
+ +
+ + + +
+ link content before + + + + + +
A1A2
+ link content after +
+ +
+ + +
+
+ + + + + +
A1A2
+
+
+
+ +
+ + + bold content before + + + + + +
A1A2
+ bold content after +
+ +
+ + + italic content before +
+ blockquote content before + + + + + +
A1A2
+ blockquote content after +
+ italic content after +
+ +
+ + diff --git a/plugin/table/testdata/GoldenFiles/parents.out.md b/plugin/table/testdata/GoldenFiles/parents.out.md new file mode 100644 index 0000000..79e7f22 --- /dev/null +++ b/plugin/table/testdata/GoldenFiles/parents.out.md @@ -0,0 +1,58 @@ +> The blockquote content: +> +> | | | +> |----|----| +> | A1 | A2 | + +* * * + +10. The list item content +11. | | | + |----|----| + | A1 | A2 | + +| | | +|----|----| +| A1 | A2 | + +* * * + +[link content before +\ +A1 A2 +\ +link content after](/link) + +* * * + +[" +\ +A1 A2 +\ +"](/link) + +* * * + +**bold content before** + +**A1 A2** + +**bold content after** + +* * * + +*italic content before " blockquote content before* + +*A1 A2* + +*blockquote content after " italic content after* + +* * * + +button content before + +| | | +|----|----| +| A1 | A2 | + +button content after \ No newline at end of file diff --git a/plugin/table/utils.go b/plugin/table/utils.go new file mode 100644 index 0000000..dc1eacb --- /dev/null +++ b/plugin/table/utils.go @@ -0,0 +1,187 @@ +package table + +import ( + "slices" + "strconv" + "unicode/utf8" + + "github.com/JohannesKaufmann/dom" + "golang.org/x/net/html" +) + +// The content should be at least 1 character wide. +// This also ensures that the table is correctly *recognized* as a markdown table. 
+const defaultCellWidth = 1 + +func calculateMaxCounts(rows [][][]byte) []int { + maxCounts := make([]int, 0) + + for _, cells := range rows { + for index, cell := range cells { + count := utf8.RuneCount(cell) + + if index >= len(maxCounts) { + maxCounts = append(maxCounts, defaultCellWidth) + } + currentMax := maxCounts[index] + if count > currentMax { + maxCounts[index] = count + } + } + } + return maxCounts +} + +func fillUpRows(rows [][][]byte, maxColumnCount int) [][][]byte { + + for i, cells := range rows { + missingCells := maxColumnCount - len(cells) + for range missingCells { + rows[i] = append(rows[i], []byte("")) + } + } + + return rows +} + +func getNumberAttributeOr(node *html.Node, key string, fallback int) int { + val, ok := dom.GetAttribute(node, key) + if !ok { + return fallback + } + num, err := strconv.Atoi(val) + if err != nil { + return fallback + } + if num < 1 { + return fallback + } + + return num +} + +type modification struct { + y int + x int + data []byte +} + +func calculateModifications(currentRowIndex, currentColIndex, rowSpan, colSpan int, data []byte) []modification { + + mods := make([]modification, 0) + + if colSpan <= 1 && rowSpan <= 1 { + // No modification is needed + return mods + } + + // Calculate modifications for colspan + for dx := 1; dx < colSpan; dx++ { + // Add modifications for the same row + mods = append(mods, modification{ + y: currentRowIndex, + x: currentColIndex + dx, + data: data, + }) + } + + // Calculate modifications for subsequent rows + if rowSpan > 1 { + for dy := 1; dy < rowSpan; dy++ { + for dx := 0; dx < colSpan; dx++ { + mods = append(mods, modification{ + y: currentRowIndex + dy, + x: currentColIndex + dx, + data: data, + }) + } + } + } + + return mods +} + +func applyGroupedModifications(contents [][][]byte, groupedMods [][]modification) [][][]byte { + // By applying the modifications in reverse we correctly + // handle overlapping modifications. 
+ slices.Reverse(groupedMods) + + for _, mods := range groupedMods { + contents = applyModifications(contents, mods) + } + + return contents +} + +func applyModifications(contents [][][]byte, mods []modification) [][][]byte { + for _, mod := range mods { + // Grow on the y axis + contents = growSlice(contents, mod.y, nil) + + // Grow on the x axis + // (Note: we only grow x-1 since `Insert` takes care of the rest) + contents[mod.y] = growSlice(contents[mod.y], mod.x-1, nil) + + // Now we can do our change: + contents[mod.y] = slices.Insert(contents[mod.y], mod.x, mod.data) + } + + return contents +} + +// growSlice ensures the slice is long enough to access the given index. +func growSlice[T any](contents []T, index int, placeholderVal T) []T { + // Calculate the required growth + currentLen := len(contents) + if index < currentLen { + return contents + } + + growBy := index - currentLen + 1 + + // Grow the slice by appending values + for range growBy { + contents = append(contents, placeholderVal) + } + + return contents +} + +func isEmptyRow(cells [][]byte) bool { + for _, cell := range cells { + if len(cell) > 0 { + return false + } + } + return true +} +func removeEmptyRows(rows [][][]byte) [][][]byte { + index := 0 + filteredRows := slices.DeleteFunc(rows, func(cells [][]byte) bool { + if index == 0 { + index++ + return false // Always keep the first row (the header row) + } else { + index++ + } + + return isEmptyRow(cells) + }) + + if len(filteredRows) == 1 && isEmptyRow(filteredRows[0]) { + // If all the rows are empty (including the header row) + // then the table is completely empty... + return nil + } + + return filteredRows +} + +func removeFirstRowIfEmpty(rows [][][]byte) [][][]byte { + if len(rows) > 0 && isEmptyRow(rows[0]) { + // The first row (the header row) is empty. So let's remove it... 
+ return slices.Delete(rows, 0, 1) + } + + return rows +} diff --git a/plugin/table/utils_test.go b/plugin/table/utils_test.go new file mode 100644 index 0000000..83a0e00 --- /dev/null +++ b/plugin/table/utils_test.go @@ -0,0 +1,268 @@ +package table + +import ( + "reflect" + "testing" +) + +func TestCalculateMaxCounts(t *testing.T) { + a := [][][]byte{ + { + []byte("Company A"), // 9 + []byte("Max Müller"), // 10 <-- + []byte("Berlin"), // 6 <-- + }, + { + []byte("Company Example"), // 15 <-- + []byte("John Doe"), // 8 + []byte("Bonn"), // 4 + }, + { + []byte("A"), + }, + } + + output := calculateMaxCounts(a) + expected := []int{15, 10, 6} + + if !reflect.DeepEqual(output, expected) { + t.Errorf("expected %+v but got %v", expected, output) + } +} +func TestFillUpRows(t *testing.T) { + input := [][][]byte{ + { + []byte("Company A"), + []byte("Max Müller"), + []byte("Berlin"), + }, + { + []byte("Company Example"), + []byte("John Doe"), + []byte("Bonn"), + }, + { + []byte("A"), + // <-- + // <-- + }, + } + + counts := calculateMaxCounts(input) + t.Log("counts:", counts) + + // - - - - - - - - - - - - - - - - - - - - // + maxColumnCount := len(counts) + + output := fillUpRows(input, maxColumnCount) + expected := [][][]byte{ + { + []byte("Company A"), + []byte("Max Müller"), + []byte("Berlin"), + }, + { + []byte("Company Example"), + []byte("John Doe"), + []byte("Bonn"), + }, + { + []byte("A"), + []byte(""), + []byte(""), + }, + } + + if !reflect.DeepEqual(output, expected) { + t.Errorf("expected %+v but got %v", expected, output) + } +} + +func TestCalculateModifications(t *testing.T) { + testCases := []struct { + desc string + + currentRowIndex int + currentColIndex int + colSpan int + rowSpan int + + expected []modification + }{ + { + desc: "no modifications needed #1", + + currentRowIndex: 0, + currentColIndex: 0, + colSpan: 1, + rowSpan: 1, + + expected: []modification{}, + }, + { + desc: "no modifications needed #2", + + currentRowIndex: 10, + currentColIndex: 5, 
+ colSpan: 1, + rowSpan: 1, + + expected: []modification{}, + }, + + { + desc: "colspan=2", + + currentRowIndex: 0, + currentColIndex: 0, + colSpan: 2, + rowSpan: 1, + + expected: []modification{{y: 0, x: 1}}, + }, + { + desc: "rowspan=2", + + currentRowIndex: 0, + currentColIndex: 0, + colSpan: 1, + rowSpan: 2, + + expected: []modification{{y: 1, x: 0}}, + }, + { + desc: "colspan=2 and rowspan=2", + + currentRowIndex: 0, + currentColIndex: 0, + colSpan: 2, + rowSpan: 2, + + expected: []modification{ + /* the actual cell */ {y: 0, x: 1}, + {y: 1, x: 0}, {y: 1, x: 1}, + }, + }, + } + for _, tC := range testCases { + t.Run(tC.desc, func(t *testing.T) { + actual := calculateModifications(tC.currentRowIndex, tC.currentColIndex, tC.rowSpan, tC.colSpan, nil) + if len(actual) != len(tC.expected) { + t.Errorf("expected length %d but got %d", len(tC.expected), len(actual)) + } + + if !reflect.DeepEqual(actual, tC.expected) { + t.Errorf("expected %+v but got %+v", tC.expected, actual) + } + }) + } +} + +func TestApplyModifications(t *testing.T) { + testCases := []struct { + desc string + + contents [][][]byte + modifications []modification + + expected [][][]byte + }{ + { + desc: "add in same row", + + contents: [][][]byte{ + { + []byte("A"), + }, + }, + modifications: []modification{ + { + y: 0, + x: 0, + data: []byte("the modification"), + }, + }, + + expected: [][][]byte{ + { + []byte("the modification"), + []byte("A"), + }, + }, + }, + { + desc: "add in row below", + + contents: [][][]byte{ + { + []byte("A"), + }, + { + []byte("B"), + }, + }, + modifications: []modification{ + { + y: 1, + x: 0, + data: []byte("the modification"), + }, + }, + + expected: [][][]byte{ + { + []byte("A"), + }, + { + []byte("the modification"), + []byte("B"), + }, + }, + }, + + { + desc: "grow two rows below", + + contents: [][][]byte{ + { + []byte("A"), + []byte("B"), + }, + }, + modifications: []modification{ + { + y: 1, + x: 0, + data: []byte("A #2"), + }, + { + y: 2, + x: 0, + data: 
[]byte("A #3"), + }, + }, + + expected: [][][]byte{ + { + []byte("A"), + []byte("B"), + }, + { + []byte("A #2"), + }, + { + []byte("A #3"), + }, + }, + }, + } + for _, tC := range testCases { + t.Run(tC.desc, func(t *testing.T) { + output := applyModifications(tC.contents, tC.modifications) + + if !reflect.DeepEqual(output, tC.expected) { + t.Errorf("expected %+v but got %+v", tC.expected, output) + } + }) + } +}