-
Notifications
You must be signed in to change notification settings - Fork 23
Improving the code and fixing some bugs #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,51 +1,135 @@ | ||
// iterscraper scrapes information from a website where URLs contain an incrementing integer.
// Information is retrieved from HTML5 elements, and outputted as a CSV.
package main

import (
	"encoding/csv"
	"flag"
	"fmt"
	"log"
	"net/http"
	"os"
	"strconv"
	"strings"
	"sync"

	"github.com/PuerkitoBio/goquery"
)
func main() { | ||
// Get flags | ||
flag.StringVar(&urlBase, "url", "http://example.com/v/%d", `The URL you wish to scrape, containing "%d" where the id should be substituted`) | ||
flag.IntVar(&idLow, "from", 0, "The first ID that should be searched in the URL - inclusive.") | ||
flag.IntVar(&idHigh, "to", 1, "The last ID that should be searched in the URL - exclusive") | ||
flag.IntVar(&concurrency, "concurrency", 1, "How many scrapers to run in parallel. (More scrapers are faster, but more prone to rate limiting or bandwith issues)") | ||
flag.StringVar(&outfile, "output", "output.csv", "Filename to export the CSV results") | ||
flag.StringVar(&nameQuery, "nameQuery", ".name", "JQuery-style query for the name element") | ||
flag.StringVar(&addressQuery, "addressQuery", ".address", "JQuery-style query for the address element") | ||
flag.StringVar(&phoneQuery, "phoneQuery", ".phone", "JQuery-style query for the phone element") | ||
flag.StringVar(&emailQuery, "emailQuery", ".email", "JQuery-style query for the email element") | ||
|
||
var ( | ||
urlTemplate = flag.String("url", "http://example.com/v/%d", "The URL you wish to scrape, containing \"%d\" where the id should be substituted") | ||
idLow = flag.Int("from", 0, "The first ID that should be searched in the URL - inclusive.") | ||
idHigh = flag.Int("to", 1, "The last ID that should be searched in the URL - exclusive") | ||
concurrency = flag.Int("concurrency", 1, "How many scrapers to run in parallel. (More scrapers are faster, but more prone to rate limiting or bandwith issues)") | ||
outfile = flag.String("output", "output.csv", "Filename to export the CSV results") | ||
name = flag.String("nameQuery", ".name", "JQuery-style query for the name element") | ||
address = flag.String("addressQuery", ".address", "JQuery-style query for the address element") | ||
phone = flag.String("phoneQuery", ".phone", "JQuery-style query for the phone element") | ||
email = flag.String("emailQuery", ".email", "JQuery-style query for the email element") | ||
) | ||
flag.Parse() | ||
|
||
// Use waitgroup so we can keep track of tasks | ||
columns := []string{*name, *address, *phone, *email} | ||
headers := []string{"name", "address", "phone", "email"} | ||
// url and id are added as the first two rows. | ||
headers = append([]string{"url", "id"}, headers...) | ||
|
||
// create all tasks and send them to the channel. | ||
type task struct { | ||
url string | ||
id int | ||
} | ||
tasks := make(chan task) | ||
go func() { | ||
for i := *idLow; i < *idHigh; i++ { | ||
tasks <- task{url: fmt.Sprintf(*urlTemplate, i), id: i} | ||
} | ||
close(tasks) | ||
}() | ||
|
||
// create workers and schedule closing results when all work is done. | ||
results := make(chan []string) | ||
var wg sync.WaitGroup | ||
wg.Add(idHigh - idLow) | ||
wg.Add(*concurrency) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So the waitgroup basically counts open channels rather than open tasks? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, basically you close the this allows you to have simpler error handling |
||
go func() { | ||
wg.Wait() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a really cool technique. I hadn't seen it before. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
close(results) | ||
}() | ||
|
||
for i := 0; i < *concurrency; i++ { | ||
go func() { | ||
defer wg.Done() | ||
for t := range tasks { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So this loop breaks when the channel closes? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, it is a better way to clean up Go routines If you were to make your tool a package you would be leaking routines every time |
||
r, err := fetch(t.url, t.id, columns) | ||
if err != nil { | ||
log.Printf("could not fetch %v: %v", t.url, err) | ||
continue | ||
} | ||
results <- r | ||
} | ||
}() | ||
} | ||
|
||
if err := dumpCSV(*outfile, headers, results); err != nil { | ||
log.Printf("could not write to %s: %v", *outfile, err) | ||
} | ||
} | ||
|
||
// channel for emitting sites to fetch | ||
taskChan := make(chan site) | ||
// Channel of data to write to disk | ||
dataChan := make(chan site) | ||
func fetch(url string, id int, queries []string) ([]string, error) { | ||
res, err := http.Get(url) | ||
if err != nil { | ||
return nil, fmt.Errorf("could not get %s: %v", url, err) | ||
} | ||
defer res.Body.Close() | ||
|
||
go emitTasks(taskChan) | ||
if res.StatusCode != http.StatusOK { | ||
if res.StatusCode == http.StatusTooManyRequests { | ||
return nil, fmt.Errorf("you are being rate limited") | ||
} | ||
|
||
for i := 0; i < concurrency; i++ { | ||
go scrape(taskChan, dataChan) | ||
return nil, fmt.Errorf("bad response from server: %s", res.Status) | ||
} | ||
|
||
go writeSites(dataChan, &wg) | ||
// parse body with goquery. | ||
doc, err := goquery.NewDocumentFromReader(res.Body) | ||
if err != nil { | ||
return nil, fmt.Errorf("could not parse page: %v", err) | ||
} | ||
|
||
wg.Wait() | ||
// extract info we want. | ||
r := []string{url, strconv.Itoa(id)} | ||
for _, q := range queries { | ||
r = append(r, strings.TrimSpace(doc.Find(q).Text())) | ||
} | ||
return r, nil | ||
} | ||
|
||
func dumpCSV(path string, headers []string, records <-chan []string) error { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the compiler will complain if you try to send data into the channel or close it |
||
f, err := os.Create(path) | ||
if err != nil { | ||
return fmt.Errorf("unable to create file %s: %v", path, err) | ||
} | ||
defer f.Close() | ||
|
||
w := csv.NewWriter(f) | ||
defer w.Flush() | ||
|
||
// write headers to file. | ||
if err := w.Write(headers); err != nil { | ||
log.Fatalf("error writing record to csv: %v", err) | ||
} | ||
|
||
// write all records. | ||
for r := range records { | ||
if err := w.Write(r); err != nil { | ||
log.Fatalf("could not write record to csv: %v", err) | ||
} | ||
} | ||
|
||
// check for extra errors. | ||
if err := w.Error(); err != nil { | ||
return fmt.Errorf("writer failed: %v", err) | ||
} | ||
return nil | ||
} |
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
s/rows/columns
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(fixed on master)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would call it columns, because those are the column names ... but it's really up to you