Improving the code and fixing some bugs #1

Merged · 3 commits · Jul 20, 2016

148 changes: 116 additions & 32 deletions main.go
@@ -1,51 +1,135 @@
// iterscraper scrapes information from a website where URLs contain an incrementing integer.
// Information is retrieved from HTML5 elements, and outputted as a CSV.
package main

import (
	"encoding/csv"
	"flag"
	"fmt"
	"log"
	"net/http"
	"os"
	"strconv"
	"strings"
	"sync"

	"github.com/PuerkitoBio/goquery"
)

-var (
-	urlBase       string
-	idLow, idHigh int
-	concurrency   int
-	outfile       string
-	nameQuery     string
-	addressQuery  string
-	phoneQuery    string
-	emailQuery    string
-)

func main() {
-	// Get flags
-	flag.StringVar(&urlBase, "url", "http://example.com/v/%d", `The URL you wish to scrape, containing "%d" where the id should be substituted`)
-	flag.IntVar(&idLow, "from", 0, "The first ID that should be searched in the URL - inclusive.")
-	flag.IntVar(&idHigh, "to", 1, "The last ID that should be searched in the URL - exclusive")
-	flag.IntVar(&concurrency, "concurrency", 1, "How many scrapers to run in parallel. (More scrapers are faster, but more prone to rate limiting or bandwidth issues)")
-	flag.StringVar(&outfile, "output", "output.csv", "Filename to export the CSV results")
-	flag.StringVar(&nameQuery, "nameQuery", ".name", "JQuery-style query for the name element")
-	flag.StringVar(&addressQuery, "addressQuery", ".address", "JQuery-style query for the address element")
-	flag.StringVar(&phoneQuery, "phoneQuery", ".phone", "JQuery-style query for the phone element")
-	flag.StringVar(&emailQuery, "emailQuery", ".email", "JQuery-style query for the email element")

	var (
		urlTemplate = flag.String("url", "http://example.com/v/%d", "The URL you wish to scrape, containing \"%d\" where the id should be substituted")
		idLow       = flag.Int("from", 0, "The first ID that should be searched in the URL - inclusive.")
		idHigh      = flag.Int("to", 1, "The last ID that should be searched in the URL - exclusive")
		concurrency = flag.Int("concurrency", 1, "How many scrapers to run in parallel. (More scrapers are faster, but more prone to rate limiting or bandwidth issues)")
		outfile     = flag.String("output", "output.csv", "Filename to export the CSV results")
		name        = flag.String("nameQuery", ".name", "JQuery-style query for the name element")
		address     = flag.String("addressQuery", ".address", "JQuery-style query for the address element")
		phone       = flag.String("phoneQuery", ".phone", "JQuery-style query for the phone element")
		email       = flag.String("emailQuery", ".email", "JQuery-style query for the email element")
	)
	flag.Parse()

-	// Use waitgroup so we can keep track of tasks
	columns := []string{*name, *address, *phone, *email}
	headers := []string{"name", "address", "phone", "email"}
	// url and id are added as the first two rows.

[Comment] @philipithomas (Owner), Jul 20, 2016: s/rows/columns
[Comment] @philipithomas (Owner): (fixed on master)
[Reply] Contributor (Author): I would call it columns, because those are the column names ... but it's really up to you

	headers = append([]string{"url", "id"}, headers...)

	// create all tasks and send them to the channel.
	type task struct {
		url string
		id  int
	}
	tasks := make(chan task)
	go func() {
		for i := *idLow; i < *idHigh; i++ {
			tasks <- task{url: fmt.Sprintf(*urlTemplate, i), id: i}
		}
		close(tasks)
	}()

	// create workers and schedule closing results when all work is done.
	results := make(chan []string)
	var wg sync.WaitGroup
-	wg.Add(idHigh - idLow)
	wg.Add(*concurrency)

[Comment] @philipithomas (Owner): So the waitgroup basically counts open channels rather than open tasks?
[Reply] Contributor (Author): Yes, basically you close the results channel when all workers are done, rather than when all tasks are done. This allows you to have simpler error handling.
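
A minimal standalone sketch of the pattern under discussion, with invented names (not part of this diff): the WaitGroup counts workers rather than tasks, and a separate goroutine closes results once every worker has returned, which in turn ends the consumer's range loop.

package main

import (
	"fmt"
	"sync"
)

func main() {
	tasks := make(chan int)
	results := make(chan int)

	// producer: send all tasks, then close the channel.
	go func() {
		for i := 0; i < 10; i++ {
			tasks <- i
		}
		close(tasks) // ends every worker's range loop below
	}()

	const workers = 3
	var wg sync.WaitGroup
	wg.Add(workers) // one count per worker, not per task
	for i := 0; i < workers; i++ {
		go func() {
			defer wg.Done()
			for t := range tasks {
				results <- t * t
			}
		}()
	}

	// closer: once all workers have returned, nothing can send on
	// results anymore, so it is safe to close it.
	go func() {
		wg.Wait()
		close(results)
	}()

	for r := range results {
		fmt.Println(r)
	}
}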

	go func() {
		wg.Wait()

[Comment] @philipithomas (Owner): This is a really cool technique. I hadn't seen it before.
[Reply] Contributor (Author): I know, right!

		close(results)
	}()

	for i := 0; i < *concurrency; i++ {
		go func() {
			defer wg.Done()
			for t := range tasks {

[Comment] @philipithomas (Owner), Jul 20, 2016: So this loop breaks when the channel closes?
[Reply] Contributor (Author): Yes, it is a better way to clean up goroutines. If you were to make your tool a package, you would be leaking routines every time.
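
To see the cleanup the author describes, a small hypothetical example (not code from this PR): the range loop returns as soon as the channel is closed, so the worker goroutine exits instead of leaking.

package main

import "fmt"

func main() {
	tasks := make(chan int)
	done := make(chan struct{})

	go func() {
		// range ends when tasks is closed, so this goroutine
		// returns instead of blocking on a receive forever.
		for t := range tasks {
			fmt.Println("got", t)
		}
		close(done)
	}()

	tasks <- 1
	tasks <- 2
	close(tasks) // without this close, the goroutine above would leak
	<-done       // wait for the worker to drain and exit
}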

				r, err := fetch(t.url, t.id, columns)
				if err != nil {
					log.Printf("could not fetch %v: %v", t.url, err)
					continue
				}
				results <- r
			}
		}()
	}

	if err := dumpCSV(*outfile, headers, results); err != nil {
		log.Printf("could not write to %s: %v", *outfile, err)
	}
}

-	// channel for emitting sites to fetch
-	taskChan := make(chan site)
-	// Channel of data to write to disk
-	dataChan := make(chan site)
-	go emitTasks(taskChan)
-	for i := 0; i < concurrency; i++ {
-		go scrape(taskChan, dataChan)
-	}
-	go writeSites(dataChan, &wg)
-	wg.Wait()

func fetch(url string, id int, queries []string) ([]string, error) {
	res, err := http.Get(url)
	if err != nil {
		return nil, fmt.Errorf("could not get %s: %v", url, err)
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		if res.StatusCode == http.StatusTooManyRequests {
			return nil, fmt.Errorf("you are being rate limited")
		}
		return nil, fmt.Errorf("bad response from server: %s", res.Status)
	}

	// parse body with goquery.
	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return nil, fmt.Errorf("could not parse page: %v", err)
	}

	// extract info we want.
	r := []string{url, strconv.Itoa(id)}
	for _, q := range queries {
		r = append(r, strings.TrimSpace(doc.Find(q).Text()))
	}
	return r, nil
}

func dumpCSV(path string, headers []string, records <-chan []string) error {

[Comment] @philipithomas (Owner): Does the <-chan syntax mean that we only receive on the channel? If so, what's the advantage of that? (Does the compiler know to keep it on the stack rather than the heap?)
[Reply] Contributor (Author): The compiler will complain if you try to send data into the channel or close it. It's just a bit more verification, nothing fancy. Whether the channel is allocated on the heap or on the stack is not up to you 😄
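
A tiny sketch of what the compiler enforces for directional channels (invented names, not from this PR):

package main

import "fmt"

// consume can only receive: sending on ch or closing it here
// would be a compile-time error.
func consume(ch <-chan int) {
	for v := range ch {
		fmt.Println(v)
	}
}

// produce can only send; closing a send-only channel is allowed.
func produce(ch chan<- int) {
	for i := 0; i < 3; i++ {
		ch <- i
	}
	close(ch)
}

func main() {
	ch := make(chan int) // bidirectional; converts implicitly to both
	go produce(ch)
	consume(ch)
}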

	f, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("unable to create file %s: %v", path, err)
	}
	defer f.Close()

	w := csv.NewWriter(f)
	defer w.Flush()

	// write headers to file.
	if err := w.Write(headers); err != nil {
		log.Fatalf("error writing record to csv: %v", err)
	}

	// write all records.
	for r := range records {
		if err := w.Write(r); err != nil {
			log.Fatalf("could not write record to csv: %v", err)
		}
	}

	// check for extra errors.
	if err := w.Error(); err != nil {
		return fmt.Errorf("writer failed: %v", err)
	}
	return nil
}
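
For reference, a hypothetical invocation of the resulting tool, using the flags defined above (binary name assumed from the package comment):

	iterscraper -url "http://example.com/v/%d" -from 1 -to 100 -concurrency 5 -output out.csv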
9 changes: 0 additions & 9 deletions scraper.go (this file was deleted)
59 changes: 0 additions & 59 deletions site.go (this file was deleted)
10 changes: 0 additions & 10 deletions tasking.go (this file was deleted)
47 changes: 0 additions & 47 deletions writer.go (this file was deleted)