Skip to content

Commit

Permalink
plugin: added table plugin (#144)
Browse files Browse the repository at this point in the history
* table: first basic version of table plugin

* table: abort table if problematic node (e.g. hr) is discovered

* table: render caption below table

* table: fill up missing cells

* table: combine header row with rows array

* table: replicate content to spanned cell

* table: change header underline format

* table: grow table for colspan & rowspan

* table: fix order of applying modifications

* table: add align to the underline

* table: loop through cells directly

* table: abort early for nested table & small cleanup

* table: add more tests

* table: make empty cell wider

* table: add option WithSkipEmptyRows

* table: add option WithHeaderPromotion

* table: escape "|" character inside table

* table: improve naming & docs for WithSpanCellBehavior

* table: skip render for role="presentation" table

* table: skip render when parent is problematic

* table: fallback render newlines between rows

* collapse: update testcases to use RenderRepresentation
  • Loading branch information
JohannesKaufmann authored Feb 25, 2025
1 parent 7e02068 commit bd15218
Show file tree
Hide file tree
Showing 20 changed files with 3,235 additions and 130 deletions.
563 changes: 433 additions & 130 deletions collapse/collapse_test.go

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions plugin/table/1_select.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package table

import (
"github.com/JohannesKaufmann/dom"
"golang.org/x/net/html"
)

// selectHeaderRowNode picks the node that should be treated as the header row
// of a table.
//
// The lookup happens in two stages, both using dom.FindFirstNode:
//  1. If a "thead" is found starting from node and a "tr" is found starting
//     from that "thead", that "tr" is returned.
//  2. Otherwise (no "thead", or a "thead" without any "tr") the parent of the
//     first "th" found starting from node is returned.
//
// It returns nil when neither stage yields a result.
func selectHeaderRowNode(node *html.Node) *html.Node {
	// named builds a predicate that matches nodes by their tag name.
	named := func(tag string) func(*html.Node) bool {
		return func(candidate *html.Node) bool {
			return dom.NodeName(candidate) == tag
		}
	}

	// Stage 1: prefer an explicit header section.
	if head := dom.FindFirstNode(node, named("thead")); head != nil {
		if row := dom.FindFirstNode(head, named("tr")); row != nil {
			return row
		}
	}

	// Stage 2: fall back to whatever contains the first header cell.
	if cell := dom.FindFirstNode(node, named("th")); cell != nil {
		return cell.Parent
	}

	return nil
}
// selectNormalRowNodes gathers every "tr" node in the tree rooted at
// tableNode (the root itself included), in document order.
//
// The node passed as selectedHeaderRowNode is left out so that the header row
// is not reported a second time as a normal row. Pass nil when no header row
// was selected. The result is nil when no row matches.
func selectNormalRowNodes(tableNode *html.Node, selectedHeaderRowNode *html.Node) []*html.Node {
	var rows []*html.Node

	// Depth-first, pre-order walk driven by an explicit stack.
	pending := []*html.Node{tableNode}
	for len(pending) > 0 {
		top := len(pending) - 1
		current := pending[top]
		pending = pending[:top]

		isRow := dom.NodeName(current) == "tr"
		if isRow && current != selectedHeaderRowNode {
			rows = append(rows, current)
		}

		// Push the children back-to-front so that the first child is
		// popped (and therefore visited) first.
		for child := current.LastChild; child != nil; child = child.PrevSibling {
			pending = append(pending, child)
		}
	}

	return rows
}
218 changes: 218 additions & 0 deletions plugin/table/1_select_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
package table

import (
"testing"

"github.com/JohannesKaufmann/html-to-markdown/v2/collapse"
"github.com/JohannesKaufmann/html-to-markdown/v2/internal/tester"
"golang.org/x/net/html"
)

// TestSelectRowNodes checks which rows selectHeaderRowNode and
// selectNormalRowNodes pick out of a parsed HTML document.
//
// Each case parses `input`, runs collapse.Collapse on the document, tags the
// selected rows with test-only attributes (`__test_header_row__` and
// `__test_normal_row__`) and then compares the tree representation of the
// document against `expected`.
func TestSelectRowNodes(t *testing.T) {
runs := []struct {
// desc is used as the sub-test name.
desc string
// input is the HTML snippet that gets parsed.
input string

// expected is the tree representation of the document after tagging.
expected string
}{
{
desc: "invalid table",
input: `
<table>
<tbody>
<tr>there is no data cell tag</tr>
</tbody>
</table>
`,

// Note: "golang.org/x/net/html" automatically cleans up the "table"
// (in the expected tree the stray text ends up in front of the table).
expected: `
├─body
│ ├─#text "there is no data cell tag"
│ ├─table
│ │ ├─tbody
│ │ │ ├─tr (__test_normal_row__="true")
`,
},
{
desc: "completely empty table",
input: `<table></table>`,

// No rows at all, so no node is tagged.
expected: `
├─body
│ ├─table
`,
},
{
desc: "completely empty tbody",
input: `<table><tbody></tbody></table>`,

// A "tbody" without rows: again nothing is tagged.
expected: `
├─body
│ ├─table
│ │ ├─tbody
`,
},
{
desc: "basic table",
input: `
<table>
<tr>
<td>A1</td>
<td>A2</td>
</tr>
<tr>
<td>B1</td>
<td>B2</td>
</tr>
</table>
`,
// Note: "golang.org/x/net/html" automatically adds the "tbody"
expected: `
├─body
│ ├─table
│ │ ├─tbody
│ │ │ ├─tr (__test_normal_row__="true")
│ │ │ │ ├─td
│ │ │ │ │ ├─#text "A1"
│ │ │ │ ├─td
│ │ │ │ │ ├─#text "A2"
│ │ │ ├─tr (__test_normal_row__="true")
│ │ │ │ ├─td
│ │ │ │ │ ├─#text "B1"
│ │ │ │ ├─td
│ │ │ │ │ ├─#text "B2"
`,
},
{
// There is no "thead" here, so the row containing the first "th"
// is expected to be the header row.
// NOTE(review): the header cells in the input are closed with "</td>"
// instead of "</th>" — presumably the parser tolerates this; confirm
// whether the mismatch is intentional.
desc: "basic table with th",
input: `
<table>
<tr>
<th>Heading 1</td>
<th>Heading 2</td>
</tr>
<tr>
<td>A1</td>
<td>A2</td>
</tr>
</table>
`,
expected: `
├─body
│ ├─table
│ │ ├─tbody
│ │ │ ├─tr (__test_header_row__="true")
│ │ │ │ ├─th
│ │ │ │ │ ├─#text "Heading 1"
│ │ │ │ ├─th
│ │ │ │ │ ├─#text "Heading 2"
│ │ │ ├─tr (__test_normal_row__="true")
│ │ │ │ ├─td
│ │ │ │ │ ├─#text "A1"
│ │ │ │ ├─td
│ │ │ │ │ ├─#text "A2"
`,
},
{
desc: "with caption, thead, tbody, tfoot",
input: `
<table>
<caption>
A description about the table
</caption>
<thead>
<tr>
<th scope="col">Name</th>
<th scope="col">City</th>
<th scope="col">Age</th>
</tr>
</thead>
<tbody>
<tr>
<th scope="row">Max Mustermann</th>
<td>Berlin</td>
<td>20</td>
</tr>
<tr>
<th scope="row">Max Müller</th>
<td>München</td>
<td>30</td>
</tr>
</tbody>
<tfoot>
<tr>
<th scope="row" colspan="2">Average age</th>
<td>25</td>
</tr>
</tfoot>
</table>
`,
// Only the "tr" inside the "thead" is the header row. The rows in
// "tbody" and "tfoot" are normal rows, even though they contain "th" cells.
expected: `
├─body
│ ├─table
│ │ ├─caption
│ │ │ ├─#text "A description about the table"
│ │ ├─thead
│ │ │ ├─tr (__test_header_row__="true")
│ │ │ │ ├─th (scope="col")
│ │ │ │ │ ├─#text "Name"
│ │ │ │ ├─th (scope="col")
│ │ │ │ │ ├─#text "City"
│ │ │ │ ├─th (scope="col")
│ │ │ │ │ ├─#text "Age"
│ │ ├─tbody
│ │ │ ├─tr (__test_normal_row__="true")
│ │ │ │ ├─th (scope="row")
│ │ │ │ │ ├─#text "Max Mustermann"
│ │ │ │ ├─td
│ │ │ │ │ ├─#text "Berlin"
│ │ │ │ ├─td
│ │ │ │ │ ├─#text "20"
│ │ │ ├─tr (__test_normal_row__="true")
│ │ │ │ ├─th (scope="row")
│ │ │ │ │ ├─#text "Max Müller"
│ │ │ │ ├─td
│ │ │ │ │ ├─#text "München"
│ │ │ │ ├─td
│ │ │ │ │ ├─#text "30"
│ │ ├─tfoot
│ │ │ ├─tr (__test_normal_row__="true")
│ │ │ │ ├─th (scope="row" colspan="2")
│ │ │ │ │ ├─#text "Average age"
│ │ │ │ ├─td
│ │ │ │ │ ├─#text "25"
`,
},
}
for _, run := range runs {
t.Run(run.desc, func(t *testing.T) {
doc := tester.Parse(t, run.input, "")

// NOTE FOR FUTURE: I discovered that "golang.org/x/net/html" automatically adds the "tbody".
// => So we probably don't need to do that much work beforehand.
collapse.Collapse(doc, nil)

{
// We can then see if we correctly *identified* all the necessary table components.
// For that we add an attribute (just for the test).

// The whole document is passed in, not just the "table" node.
headerRow := selectHeaderRowNode(doc)
if headerRow != nil {
headerRow.Attr = append(headerRow.Attr, html.Attribute{
Key: "__test_header_row__",
Val: "true",
})
}
// headerRow may be nil here; then every "tr" counts as a normal row.
for _, n := range selectNormalRowNodes(doc, headerRow) {
n.Attr = append(n.Attr, html.Attribute{
Key: "__test_normal_row__",
Val: "true",
})
}
}

// The added attributes show up in the representation, which is how
// the selection is verified.
tester.ExpectRepresentation(t, doc, "output", run.expected)
})
}
}
Loading

0 comments on commit bd15218

Please sign in to comment.