Skip to content

Commit

Permalink
Switched to goquery for parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
kevin-pierce committed Sep 7, 2021
1 parent 1899ad2 commit c7bdc73
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 64 deletions.
5 changes: 4 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,7 @@ module billboard-scraper

go 1.16

require golang.org/x/net v0.0.0-20210903162142-ad29c8ab022f // indirect
require (
github.com/PuerkitoBio/goquery v1.7.1 // indirect
golang.org/x/net v0.0.0-20210903162142-ad29c8ab022f // indirect
)
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
github.com/PuerkitoBio/goquery v1.7.1 h1:oE+T06D+1T7LNrn91B4aERsRIeCLJ/oPSa6xB9FPnz4=
github.com/PuerkitoBio/goquery v1.7.1/go.mod h1:XY0pP4kfraEmmV1O7Uf6XyjoslwsneBbgeDjLYuN8xY=
github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE=
github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20210903162142-ad29c8ab022f h1:w6wWR0H+nyVpbSAQbzVEIACVyr/h8l/BEkY6Sokc7Eg=
golang.org/x/net v0.0.0-20210903162142-ad29c8ab022f/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
Expand Down
77 changes: 14 additions & 63 deletions scraper/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"log"
"net/http"

"golang.org/x/net/html"
"github.com/PuerkitoBio/goquery"
)

type LinkTag struct {
Expand All @@ -14,89 +14,40 @@ type LinkTag struct {
}

// GetSongList fetches the chart page through GetHTML and returns the links
// slice built from it. If the fetch fails, the process is terminated via
// log.Fatalln.
func GetSongList() []string {
htmlBody, err := GetHTML()
htmlDoc, err := GetHTML()
// NOTE(review): htmlDoc is printed before err is checked below, so on a
// failed fetch this prints the nil document first.
fmt.Println(htmlDoc)

//fmt.Print(htmlBody)
if err != nil {
log.Fatalln(err)
}
var links []string

ParseHTML(htmlBody)
//links := ParseHTML(htmlBody)
// NOTE(review): the error returned by GetLinks is assigned to err but never
// checked before links is returned.
links, err := GetLinks(htmlDoc)
return links
}

// GetHTML issues an HTTP GET for the Billboard Hot 100 chart URL and parses
// the response body into a document. It returns nil and the error when either
// the request or the parse fails.
//
// NOTE(review): resp.StatusCode is not inspected, so the body of a non-200
// response is parsed like any other.
func GetHTML() (*html.Node, error) {
func GetHTML() (*goquery.Document, error) {
resp, err := http.Get("https://www.billboard.com/charts/hot-100")

if err != nil {
return nil, err
}
// Close the body once parsing below has finished reading from it.
defer resp.Body.Close()

doc, err := html.Parse(resp.Body)
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
return nil, err
}

return doc, nil
}

// ParseHTML walks the node tree rooted at n depth-first, stops at the first
// "ol" element it encounters, and prints that node (nil when none is found).
func ParseHTML(n *html.Node) {
var rootNode *html.Node
rootNode = nil

var recFindRoot func(*html.Node)
// GetLinks selects every element matching ".chart-list__element .display--flex",
// prints the text of its ".chart-element__information" descendants, and
// returns songList together with a nil error.
//
// NOTE(review): songList is declared but never appended to, so the returned
// slice is always nil; presumably songTitle was meant to be collected here.
// Confirm the intended behavior.
func GetLinks(doc *goquery.Document) ([]string, error) {
var songList []string

recFindRoot = func(n *html.Node) {
if n.Type == html.ElementNode && n.Data == "ol" {
rootNode = n
}
doc.Find(".chart-list__element .display--flex").Each(func(i int, s *goquery.Selection) {
songTitle := s.Find(".chart-element__information").Text()
fmt.Println(songTitle)
//fmt.Println(s)
})

for c := n.FirstChild; c != nil; c = c.NextSibling {
// Once a match has been recorded, stop descending into further siblings.
if rootNode != nil {
break
} else {
recFindRoot(c)
}
}
}
recFindRoot(n)
fmt.Println(rootNode)
return songList, nil
}

// func GetLinks(n *html.Node) {

// }

// root, err := html.Parse(r)
// if err != nil {
// return nil, err
// }

// var links []Link
// var rec func(*html.Node)
// rec = func(n *html.Node) {
// if n.Type == html.ElementNode && n.Data == "a" {
// for _, attr := range n.Attr {
// if attr.Key == "href" {
// var text string
// if n.FirstChild != nil {
// text = grabText(n.FirstChild)
// }
// links = append(links, Link{attr.Val, text})
// }
// }
// }
// if n.FirstChild != nil {
// rec(n.FirstChild)
// }
// if n.NextSibling != nil {
// rec(n.NextSibling)
// }
// }
// rec(root)

// return links, nil
// }

0 comments on commit c7bdc73

Please sign in to comment.