Skip to content

Commit

Permalink
Add ReadHTML to support loading HTML tables (go-gota#107)
Browse files Browse the repository at this point in the history
  • Loading branch information
typeless authored May 1, 2020
1 parent 58d5485 commit 648ad10
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 1 deletion.
127 changes: 127 additions & 0 deletions dataframe/dataframe.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import (
"unicode/utf8"

"github.com/go-gota/gota/series"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)

// DataFrame is a data structure designed for operating on table like data (Such
Expand Down Expand Up @@ -1246,6 +1248,131 @@ func (df DataFrame) WriteJSON(w io.Writer) error {
return json.NewEncoder(w).Encode(df.Maps())
}

// remainder is internal state for implementing ReadHTML: it records a table
// cell whose rowspan reaches into rows that have not been emitted yet, so that
// readRows can repeat the cell's text in those later rows.
type remainder struct {
// index is the column position the cell occupied in the row that declared it.
index int
// text is the cell text to repeat in each following row.
text string
// nrows is the number of following rows that still need a copy of the cell.
nrows int
}

// Upper bounds applied to the colspan and rowspan attributes. They mirror the
// limits of the HTML table model and keep a hostile document from forcing
// readRows to allocate an unbounded number of cells.
const (
	htmlMaxColspan = 1000
	htmlMaxRowspan = 65534
)

// htmlSpanAttr parses the value of a colspan/rowspan attribute. Values that
// are not integers or are smaller than 1 yield 1 (a cell always occupies at
// least one slot); values above max are clamped to max.
func htmlSpanAttr(val string, max int) int {
	k, err := strconv.Atoi(strings.TrimSpace(val))
	if err != nil || k < 1 {
		return 1
	}
	if k > max {
		return max
	}
	return k
}

// htmlCellText returns the concatenation of every text node found below n, in
// document order, with surrounding white space trimmed. Text wrapped in inline
// markup (for example <td><b>1</b></td>) is therefore preserved, and a cell
// without any text yields the empty string.
func htmlCellText(n *html.Node) string {
	var b strings.Builder
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			if c.Type == html.TextNode {
				b.WriteString(c.Data)
				continue
			}
			walk(c)
		}
	}
	walk(n)
	return strings.TrimSpace(b.String())
}

// htmlTableRows collects the <tr> elements of a <table> node. Rows are taken
// from the <thead>, <tbody> and <tfoot> row groups (and from <tr> elements
// that are direct children of the table, should the tree contain any) and are
// returned header rows first, then body rows, then footer rows.
func htmlTableRows(table *html.Node) []*html.Node {
	var head, body, foot []*html.Node
	for g := table.FirstChild; g != nil; g = g.NextSibling {
		if g.Type != html.ElementNode {
			continue
		}
		var dst *[]*html.Node
		switch g.DataAtom {
		case atom.Thead:
			dst = &head
		case atom.Tbody:
			dst = &body
		case atom.Tfoot:
			dst = &foot
		case atom.Tr:
			body = append(body, g)
			continue
		default:
			continue
		}
		for tr := g.FirstChild; tr != nil; tr = tr.NextSibling {
			if tr.Type == html.ElementNode && tr.DataAtom == atom.Tr {
				*dst = append(*dst, tr)
			}
		}
	}

	trs := make([]*html.Node, 0, len(head)+len(body)+len(foot))
	trs = append(trs, head...)
	trs = append(trs, body...)
	trs = append(trs, foot...)
	return trs
}

// readRows converts a list of <tr> nodes into string records, one per row.
//
// Both <td> and <th> children are treated as cells. A cell with colspan=n is
// repeated n times in its row, and a cell with rowspan=n is repeated at the
// same column in the n-1 rows that follow. If a rowspan reaches past the last
// <tr>, extra rows holding only the spanning cells are appended.
func readRows(trs []*html.Node) [][]string {
	// rems holds the spanning cells inherited from earlier rows, ordered by
	// column index; xrems accumulates the ones to hand to the next row.
	var rems []remainder
	rows := [][]string{}
	for _, tr := range trs {
		var xrems []remainder
		row := []string{}
		index := 0
		for td := tr.FirstChild; td != nil; td = td.NextSibling {
			if td.Type != html.ElementNode || (td.DataAtom != atom.Td && td.DataAtom != atom.Th) {
				continue
			}

			// Emit inherited cells that belong at or before the current
			// column before placing this row's own cell.
			for len(rems) > 0 && rems[0].index <= index {
				v := rems[0]
				rems = rems[1:]
				row = append(row, v.text)
				if v.nrows > 1 {
					xrems = append(xrems, remainder{index: v.index, text: v.text, nrows: v.nrows - 1})
				}
				index++
			}

			rowspan, colspan := 1, 1
			for _, attr := range td.Attr {
				switch attr.Key {
				case "rowspan":
					rowspan = htmlSpanAttr(attr.Val, htmlMaxRowspan)
				case "colspan":
					colspan = htmlSpanAttr(attr.Val, htmlMaxColspan)
				}
			}

			// The text is computed per cell so that an empty cell does not
			// pick up the text of the cell before it.
			text := htmlCellText(td)
			for k := 0; k < colspan; k++ {
				row = append(row, text)
				if rowspan > 1 {
					xrems = append(xrems, remainder{index: index, text: text, nrows: rowspan - 1})
				}
				index++
			}
		}

		// Inherited cells located after the last cell of this row.
		for _, v := range rems {
			row = append(row, v.text)
			if v.nrows > 1 {
				xrems = append(xrems, remainder{index: v.index, text: v.text, nrows: v.nrows - 1})
			}
		}
		rows = append(rows, row)
		rems = xrems
	}

	// Rowspans that extend beyond the last <tr> still produce rows.
	for len(rems) > 0 {
		var xrems []remainder
		row := []string{}
		for _, v := range rems {
			row = append(row, v.text)
			if v.nrows > 1 {
				xrems = append(xrems, remainder{index: v.index, text: v.text, nrows: v.nrows - 1})
			}
		}
		rows = append(rows, row)
		rems = xrems
	}
	return rows
}

// ReadHTML parses the HTML document read from r and returns one DataFrame for
// every <table> element that can be loaded, in document order.
//
// The rows of each table (header, body and footer groups) are converted to
// string records and passed to LoadRecords together with options. Tables for
// which LoadRecords reports an error are skipped. Tables nested inside another
// table are not returned separately; their text becomes part of the enclosing
// cell.
//
// If the document itself cannot be parsed, the result is a single DataFrame
// whose Err field holds the parse error.
func ReadHTML(r io.Reader, options ...LoadOption) []DataFrame {
	doc, err := html.Parse(r)
	if err != nil {
		return []DataFrame{{Err: err}}
	}

	var dfs []DataFrame
	var visit func(*html.Node)
	visit = func(n *html.Node) {
		if n.Type == html.ElementNode && n.DataAtom == atom.Table {
			df := LoadRecords(readRows(htmlTableRows(n)), options...)
			if df.Err == nil {
				dfs = append(dfs, df)
			}
			return
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			visit(c)
		}
	}

	visit(doc)
	return dfs
}

// Getters/Setters for DataFrame fields
// ====================================

Expand Down
73 changes: 73 additions & 0 deletions dataframe/dataframe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1430,6 +1430,79 @@ func TestReadJSON(t *testing.T) {
}
}

// TestReadHTML checks that ReadHTML returns the expected DataFrames for an
// empty document, a plain table and a table using rowspan.
func TestReadHTML(t *testing.T) {
	table := []struct {
		htmlStr string
		expDf   []DataFrame
	}{
		{
			"",
			[]DataFrame{},
		},
		{
			`<html>
<body>
<table>
<tr><td>COL.1</td></tr>
<tr><td>100</td></tr>
</table>
</body>
</html>`,
			[]DataFrame{
				LoadRecords(
					[][]string{
						{"COL.1"},
						{"100"},
					}),
			},
		},
		{
			`<html>
<body>
<table>
<tr><td rowspan='2'>COL.1</td><td rowspan='2'>COL.2</td><td>COL.3</td></tr>
<tr><td>100</td></tr>
</table>
</body>
</html>`,
			[]DataFrame{
				LoadRecords(
					[][]string{
						{"COL.1", "COL.2", "COL.3"},
						{"COL.1", "COL.2", "100"},
					}),
			},
		},
	}

	for i, tc := range table {
		cs := ReadHTML(strings.NewReader(tc.htmlStr))
		// Compare the number of frames before indexing tc.expDf: a surplus
		// would otherwise panic and a shortfall would go unreported.
		if len(cs) != len(tc.expDf) {
			t.Errorf("Test: %d\n got len(%d), want len(%d)", i, len(cs), len(tc.expDf))
			continue
		}
		for j, c := range cs {
			if c.Err != nil {
				t.Errorf("Test: %d\nError:%v", i, c.Err)
			}
			// Check that the types are the same between both DataFrames
			if !reflect.DeepEqual(tc.expDf[j].Types(), c.Types()) {
				t.Errorf("Test: %d\nDifferent types:\nA:%v\nB:%v", i, tc.expDf[j].Types(), c.Types())
			}
			// Check that the colnames are the same between both DataFrames
			if !reflect.DeepEqual(tc.expDf[j].Names(), c.Names()) {
				t.Errorf("Test: %d\nDifferent colnames:\nA:%v\nB:%v", i, tc.expDf[j].Names(), c.Names())
			}
			// Check that the values are the same between both DataFrames
			if !reflect.DeepEqual(tc.expDf[j].Records(), c.Records()) {
				t.Errorf("Test: %d\nDifferent values:\nA:%v\nB:%v", i, tc.expDf[j].Records(), c.Records())
			}
		}
	}
}

func TestDataFrame_SetNames(t *testing.T) {
a := New(
series.New([]string{"a", "b", "c"}, series.String, "COL.1"),
Expand Down
5 changes: 4 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,7 @@ module github.com/go-gota/gota

go 1.12

require gonum.org/v1/gonum v0.0.0-20190926113837-94b2bbd8ac13
require (
golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa
gonum.org/v1/gonum v0.0.0-20190926113837-94b2bbd8ac13
)
5 changes: 5 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@ github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3
github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k=
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2 h1:y102fOLFqhV41b+4GPiJoa0k/x+pJcEi2/HB1Y5T6fU=
golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa h1:F+8P+gmewFQYRk6JoLQLwjBCTu3mcIURZfNkVweuRKA=
golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo=
Expand Down

0 comments on commit 648ad10

Please sign in to comment.