Commit 79fe516

Merge pull request #1 from campoy/master
Improving the code and fixing some bugs
2 parents 0b5202d + ce9ec22 commit 79fe516

File tree

5 files changed: +116 -157 lines


main.go

+116 -32
@@ -1,51 +1,135 @@
+// iterscraper scrapes information from a website where URLs contain an incrementing integer.
+// Information is retrieved from HTML5 elements, and outputted as a CSV.
 package main
 
 import (
+    "encoding/csv"
     "flag"
+    "fmt"
+    "log"
+    "net/http"
+    "os"
+    "strconv"
+    "strings"
     "sync"
-)
 
-var (
-    urlBase       string
-    idLow, idHigh int
-    concurrency   int
-    outfile       string
-    nameQuery     string
-    addressQuery  string
-    phoneQuery    string
-    emailQuery    string
+    "github.com/PuerkitoBio/goquery"
 )
 
 func main() {
-    // Get flags
-    flag.StringVar(&urlBase, "url", "http://example.com/v/%d", `The URL you wish to scrape, containing "%d" where the id should be substituted`)
-    flag.IntVar(&idLow, "from", 0, "The first ID that should be searched in the URL - inclusive.")
-    flag.IntVar(&idHigh, "to", 1, "The last ID that should be searched in the URL - exclusive")
-    flag.IntVar(&concurrency, "concurrency", 1, "How many scrapers to run in parallel. (More scrapers are faster, but more prone to rate limiting or bandwith issues)")
-    flag.StringVar(&outfile, "output", "output.csv", "Filename to export the CSV results")
-    flag.StringVar(&nameQuery, "nameQuery", ".name", "JQuery-style query for the name element")
-    flag.StringVar(&addressQuery, "addressQuery", ".address", "JQuery-style query for the address element")
-    flag.StringVar(&phoneQuery, "phoneQuery", ".phone", "JQuery-style query for the phone element")
-    flag.StringVar(&emailQuery, "emailQuery", ".email", "JQuery-style query for the email element")
-
+    var (
+        urlTemplate = flag.String("url", "http://example.com/v/%d", "The URL you wish to scrape, containing \"%d\" where the id should be substituted")
+        idLow       = flag.Int("from", 0, "The first ID that should be searched in the URL - inclusive.")
+        idHigh      = flag.Int("to", 1, "The last ID that should be searched in the URL - exclusive")
+        concurrency = flag.Int("concurrency", 1, "How many scrapers to run in parallel. (More scrapers are faster, but more prone to rate limiting or bandwith issues)")
+        outfile     = flag.String("output", "output.csv", "Filename to export the CSV results")
+        name        = flag.String("nameQuery", ".name", "JQuery-style query for the name element")
+        address     = flag.String("addressQuery", ".address", "JQuery-style query for the address element")
+        phone       = flag.String("phoneQuery", ".phone", "JQuery-style query for the phone element")
+        email       = flag.String("emailQuery", ".email", "JQuery-style query for the email element")
+    )
     flag.Parse()
 
-    // Use waitgroup so we can keep track of tasks
+    columns := []string{*name, *address, *phone, *email}
+    headers := []string{"name", "address", "phone", "email"}
+    // url and id are added as the first two rows.
+    headers = append([]string{"url", "id"}, headers...)
+
+    // create all tasks and send them to the channel.
+    type task struct {
+        url string
+        id  int
+    }
+    tasks := make(chan task)
+    go func() {
+        for i := *idLow; i < *idHigh; i++ {
+            tasks <- task{url: fmt.Sprintf(*urlTemplate, i), id: i}
+        }
+        close(tasks)
+    }()
+
+    // create workers and schedule closing results when all work is done.
+    results := make(chan []string)
     var wg sync.WaitGroup
-    wg.Add(idHigh - idLow)
+    wg.Add(*concurrency)
+    go func() {
+        wg.Wait()
+        close(results)
+    }()
+
+    for i := 0; i < *concurrency; i++ {
+        go func() {
+            defer wg.Done()
+            for t := range tasks {
+                r, err := fetch(t.url, t.id, columns)
+                if err != nil {
+                    log.Printf("could not fetch %v: %v", t.url, err)
+                    continue
+                }
+                results <- r
+            }
+        }()
+    }
+
+    if err := dumpCSV(*outfile, headers, results); err != nil {
+        log.Printf("could not write to %s: %v", *outfile, err)
+    }
+}
 
-    // channel for emitting sites to fetch
-    taskChan := make(chan site)
-    // Channel of data to write to disk
-    dataChan := make(chan site)
+func fetch(url string, id int, queries []string) ([]string, error) {
+    res, err := http.Get(url)
+    if err != nil {
+        return nil, fmt.Errorf("could not get %s: %v", url, err)
+    }
+    defer res.Body.Close()
 
-    go emitTasks(taskChan)
+    if res.StatusCode != http.StatusOK {
+        if res.StatusCode == http.StatusTooManyRequests {
+            return nil, fmt.Errorf("you are being rate limited")
+        }
 
-    for i := 0; i < concurrency; i++ {
-        go scrape(taskChan, dataChan)
+        return nil, fmt.Errorf("bad response from server: %s", res.Status)
     }
 
-    go writeSites(dataChan, &wg)
+    // parse body with goquery.
+    doc, err := goquery.NewDocumentFromReader(res.Body)
+    if err != nil {
+        return nil, fmt.Errorf("could not parse page: %v", err)
+    }
 
-    wg.Wait()
+    // extract info we want.
+    r := []string{url, strconv.Itoa(id)}
+    for _, q := range queries {
+        r = append(r, strings.TrimSpace(doc.Find(q).Text()))
+    }
+    return r, nil
+}
+
+func dumpCSV(path string, headers []string, records <-chan []string) error {
+    f, err := os.Create(path)
+    if err != nil {
+        return fmt.Errorf("unable to create file %s: %v", path, err)
+    }
+    defer f.Close()
+
+    w := csv.NewWriter(f)
+    defer w.Flush()
+
+    // write headers to file.
+    if err := w.Write(headers); err != nil {
+        log.Fatalf("error writing record to csv: %v", err)
+    }
+
+    // write all records.
+    for r := range records {
+        if err := w.Write(r); err != nil {
+            log.Fatalf("could not write record to csv: %v", err)
+        }
+    }
+
+    // check for extra errors.
+    if err := w.Error(); err != nil {
+        return fmt.Errorf("writer failed: %v", err)
+    }
+    return nil
 }
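The refactor replaces the package-level flag variables and the helpers from the deleted files with three functions that can be exercised on their own: main wires up the pipeline, fetch scrapes a single URL, and dumpCSV drains the results channel. As an illustration of the new seams, here is a minimal, hypothetical test sketch (not part of this commit) that drives fetch against a local httptest server; the handler markup and CSS queries are invented for the example.

package main

import (
    "net/http"
    "net/http/httptest"
    "testing"
)

// TestFetch is a hypothetical sketch, not part of this commit: it serves a
// tiny HTML page locally and checks that fetch returns the URL, the id, and
// the text of each queried element.
func TestFetch(t *testing.T) {
    srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        w.Write([]byte(`<html><body>
            <div class="name"> Jane Doe </div>
            <div class="email">jane@example.com</div>
        </body></html>`))
    }))
    defer srv.Close()

    got, err := fetch(srv.URL, 42, []string{".name", ".email"})
    if err != nil {
        t.Fatalf("fetch failed: %v", err)
    }

    // fetch prepends the URL and the id, then appends one column per query,
    // trimming surrounding whitespace from each element's text.
    want := []string{srv.URL, "42", "Jane Doe", "jane@example.com"}
    for i := range want {
        if got[i] != want[i] {
            t.Errorf("column %d: got %q, want %q", i, got[i], want[i])
        }
    }
}

The sketch relies only on the standard library's net/http/httptest; nothing here is implied by the commit beyond the fetch signature shown in the diff above.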

scraper.go: -9 lines. This file was deleted.
site.go: -59 lines. This file was deleted.
tasking.go: -10 lines. This file was deleted.
writer.go: -47 lines. This file was deleted.
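With scraper.go, site.go, tasking.go and writer.go removed, the whole pipeline now lives in main.go: workers send rows into a results channel, and dumpCSV keeps writing until that channel is closed, which is why the commit closes results only after wg.Wait() returns. A minimal, hypothetical sketch of that contract follows; the file name and record values are invented for illustration and are not part of this commit.

package main

import "log"

// demoDumpCSV is an illustration only, not part of this commit. It shows the
// contract dumpCSV relies on: the function returns once the records channel
// is closed, so the sender must close it when no more rows are coming.
func demoDumpCSV() {
    records := make(chan []string)
    go func() {
        records <- []string{"http://example.com/v/1", "1", "Jane Doe", "1 Main St", "555-0100", "jane@example.com"}
        close(records) // without this close, dumpCSV would block forever.
    }()

    headers := []string{"url", "id", "name", "address", "phone", "email"}
    if err := dumpCSV("demo_output.csv", headers, records); err != nil {
        log.Printf("could not write demo CSV: %v", err)
    }
}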
