Skip to content

Commit

Permalink
Now isolates the root list node
Browse files Browse the repository at this point in the history
  • Loading branch information
kevin-pierce committed Sep 6, 2021
1 parent dc801a4 commit 1899ad2
Showing 1 changed file with 20 additions and 10 deletions.
30 changes: 20 additions & 10 deletions scraper/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,32 @@ func GetHTML() (*html.Node, error) {
}

func ParseHTML(n *html.Node) {
fmt.Println(n.Type)
fmt.Println(n)
var rootNode *html.Node
rootNode = nil

if n.Type == html.ElementNode && n.Data == "a" {
for _, element := range n.Attr {
if element.Key == "href" {
fmt.Printf("LINK: %s\n", element.Val)
}
var recFindRoot func(*html.Node)

recFindRoot = func(n *html.Node) {
if n.Type == html.ElementNode && n.Data == "ol" {
rootNode = n
}
}

for c := n.FirstChild; c != nil; c = c.NextSibling {
ParseHTML(c)
for c := n.FirstChild; c != nil; c = c.NextSibling {
if rootNode != nil {
break
} else {
recFindRoot(c)
}
}
}
recFindRoot(n)
fmt.Println(rootNode)
}

// func GetLinks(n *html.Node) {

// }

// root, err := html.Parse(r)
// if err != nil {
// return nil, err
Expand Down

0 comments on commit 1899ad2

Please sign in to comment.