+// iterscraper scrapes information from a website where URLs contain an incrementing integer.
+// Information is retrieved from HTML5 elements and output as a CSV.
 package main
 
 import (
+	"encoding/csv"
 	"flag"
+	"fmt"
+	"log"
+	"net/http"
+	"os"
+	"strconv"
+	"strings"
 	"sync"
-)
 
-var (
-	urlBase       string
-	idLow, idHigh int
-	concurrency   int
-	outfile       string
-	nameQuery     string
-	addressQuery  string
-	phoneQuery    string
-	emailQuery    string
+	"github.com/PuerkitoBio/goquery"
 )
 
 func main() {
-	// Get flags
-	flag.StringVar(&urlBase, "url", "http://example.com/v/%d", `The URL you wish to scrape, containing "%d" where the id should be substituted`)
-	flag.IntVar(&idLow, "from", 0, "The first ID that should be searched in the URL - inclusive.")
-	flag.IntVar(&idHigh, "to", 1, "The last ID that should be searched in the URL - exclusive")
-	flag.IntVar(&concurrency, "concurrency", 1, "How many scrapers to run in parallel. (More scrapers are faster, but more prone to rate limiting or bandwith issues)")
-	flag.StringVar(&outfile, "output", "output.csv", "Filename to export the CSV results")
-	flag.StringVar(&nameQuery, "nameQuery", ".name", "JQuery-style query for the name element")
-	flag.StringVar(&addressQuery, "addressQuery", ".address", "JQuery-style query for the address element")
-	flag.StringVar(&phoneQuery, "phoneQuery", ".phone", "JQuery-style query for the phone element")
-	flag.StringVar(&emailQuery, "emailQuery", ".email", "JQuery-style query for the email element")
-
+	var (
+		urlTemplate = flag.String("url", "http://example.com/v/%d", "The URL you wish to scrape, containing \"%d\" where the id should be substituted")
+		idLow       = flag.Int("from", 0, "The first ID that should be searched in the URL - inclusive.")
+		idHigh      = flag.Int("to", 1, "The last ID that should be searched in the URL - exclusive.")
+		concurrency = flag.Int("concurrency", 1, "How many scrapers to run in parallel. (More scrapers are faster, but more prone to rate limiting or bandwidth issues.)")
+		outfile     = flag.String("output", "output.csv", "Filename to export the CSV results")
+		name        = flag.String("nameQuery", ".name", "JQuery-style query for the name element")
+		address     = flag.String("addressQuery", ".address", "JQuery-style query for the address element")
+		phone       = flag.String("phoneQuery", ".phone", "JQuery-style query for the phone element")
+		email       = flag.String("emailQuery", ".email", "JQuery-style query for the email element")
+	)
 	flag.Parse()
 
-	// Use waitgroup so we can keep track of tasks
+	columns := []string{*name, *address, *phone, *email}
+	headers := []string{"name", "address", "phone", "email"}
+	// url and id are added as the first two columns.
+	headers = append([]string{"url", "id"}, headers...)
+
+	// create all tasks and send them to the channel.
+	type task struct {
+		url string
+		id  int
+	}
+	tasks := make(chan task)
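+	// tasks is unbuffered: each send below blocks until a worker is ready, so pending work never piles up in memory.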
+	go func() {
+		for i := *idLow; i < *idHigh; i++ {
+			tasks <- task{url: fmt.Sprintf(*urlTemplate, i), id: i}
+		}
+		close(tasks)
+	}()
+
+	// create workers and schedule closing results when all work is done.
+	results := make(chan []string)
 	var wg sync.WaitGroup
-	wg.Add(idHigh - idLow)
+	wg.Add(*concurrency)
+	go func() {
+		wg.Wait()
+		close(results)
+	}()
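+	// once wg.Wait returns (all workers done), results is closed, which ends the range loop inside dumpCSV below.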
+
+	for i := 0; i < *concurrency; i++ {
+		go func() {
+			defer wg.Done()
+			for t := range tasks {
+				r, err := fetch(t.url, t.id, columns)
+				if err != nil {
+					log.Printf("could not fetch %v: %v", t.url, err)
+					continue
+				}
+				results <- r
+			}
+		}()
+	}
+
+	if err := dumpCSV(*outfile, headers, results); err != nil {
+		log.Printf("could not write to %s: %v", *outfile, err)
+	}
+}
 
-	// channel for emitting sites to fetch
-	taskChan := make(chan site)
-	// Channel of data to write to disk
-	dataChan := make(chan site)
+func fetch(url string, id int, queries []string) ([]string, error) {
+	res, err := http.Get(url)
+	if err != nil {
+		return nil, fmt.Errorf("could not get %s: %v", url, err)
+	}
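+	// the body must always be closed so the underlying connection can be reused.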
+	defer res.Body.Close()
 
-	go emitTasks(taskChan)
+	if res.StatusCode != http.StatusOK {
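+		// HTTP 429 (Too Many Requests) means the server is throttling us; report it distinctly.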
+		if res.StatusCode == http.StatusTooManyRequests {
+			return nil, fmt.Errorf("you are being rate limited")
+		}
 
-	for i := 0; i < concurrency; i++ {
-		go scrape(taskChan, dataChan)
+		return nil, fmt.Errorf("bad response from server: %s", res.Status)
 	}
 
-	go writeSites(dataChan, &wg)
+	// parse body with goquery.
+	doc, err := goquery.NewDocumentFromReader(res.Body)
+	if err != nil {
+		return nil, fmt.Errorf("could not parse page: %v", err)
+	}
 
-	wg.Wait()
+	// extract info we want.
+	r := []string{url, strconv.Itoa(id)}
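+	// Find takes a CSS selector; Text concatenates the text of every matched element.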
+	for _, q := range queries {
+		r = append(r, strings.TrimSpace(doc.Find(q).Text()))
+	}
+	return r, nil
+}
+
+func dumpCSV(path string, headers []string, records <-chan []string) error {
+	f, err := os.Create(path)
+	if err != nil {
+		return fmt.Errorf("unable to create file %s: %v", path, err)
+	}
+	defer f.Close()
+
+	w := csv.NewWriter(f)
+	defer w.Flush()
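+	// Flush is deferred so buffered rows still reach the file even on an early return.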
+
+	// write headers to file.
+	if err := w.Write(headers); err != nil {
+		return fmt.Errorf("could not write headers to csv: %v", err)
+	}
+
+	// write all records.
+	for r := range records {
+		if err := w.Write(r); err != nil {
+			return fmt.Errorf("could not write record to csv: %v", err)
+		}
+	}
+
+	// flush buffered data and surface any error from the writes or the flush itself.
+	w.Flush()
+	if err := w.Error(); err != nil {
+		return fmt.Errorf("writer failed: %v", err)
+	}
+	return nil
 }
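
Example invocation after this change (the binary name, URL, and selectors are illustrative, not part of the commit):

    go build -o iterscraper .
    ./iterscraper -url "http://example.com/v/%d" -from 1 -to 100 -concurrency 5 \
        -output out.csv -nameQuery ".name" -emailQuery ".email"

Each worker holds at most one request in flight, so -concurrency caps the load placed on the target server.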