Skip to content

Commit

Permalink
feat: add metadata fetching util
Browse files Browse the repository at this point in the history
  • Loading branch information
VladCroitoru committed Nov 14, 2021
1 parent 7e13981 commit ce837e3
Show file tree
Hide file tree
Showing 2 changed files with 223 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gh_tokens
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
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
222 changes: 222 additions & 0 deletions fetch_metadata.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
package main

import (
"bufio"
"fmt"
"io/ioutil"
"log"
"net/http"
"os"
"strconv"
"strings"
"sync"
)

// buildRequest creates a GET request for url whose Authorization header
// carries token in the "token <value>" form.
//
// The error returned by http.NewRequest is discarded, so a url that cannot be
// parsed leaves the request nil and the header assignment below panics.
func buildRequest(url, token string) *http.Request {
	authorization := fmt.Sprintf("token %s", token)

	request, _ := http.NewRequest(http.MethodGet, url, nil)
	request.Header.Set("Authorization", authorization)
	return request
}

// getRateLimit asks the GitHub API how many requests token may still make.
//
// It sends a GET to https://api.github.com/rate_limit authenticated with
// token and returns the integer value of the X-RateLimit-Remaining response
// header. It returns 0, after printing the reason, when the request fails,
// when the API answers with a status other than 200, or when the header is
// missing or not a number.
//
// Only the last four characters of token are printed, so the credential does
// not end up in terminal scrollback or captured logs.
func getRateLimit(token string) int {
	// Mask the secret but keep enough of its tail to tell tokens apart.
	masked := "****"
	if len(token) > 4 {
		masked += token[len(token)-4:]
	}

	client := &http.Client{}
	fmt.Printf("checking rate limit for token: %s\n", masked)
	resp, err := client.Do(buildRequest("https://api.github.com/rate_limit", token))
	if err != nil {
		fmt.Println(err)
		return 0
	}
	defer resp.Body.Close()

	// A rejected token must not be reported as having quota left.
	if resp.StatusCode != http.StatusOK {
		fmt.Printf("rate limit check for token: %s failed - %s\n", masked, resp.Status)
		return 0
	}

	remaining, err := strconv.Atoi(resp.Header.Get("X-RateLimit-Remaining"))
	if err != nil {
		fmt.Printf("unexpected X-RateLimit-Remaining header for token: %s - %v\n", masked, err)
		return 0
	}
	fmt.Printf("remaining api calls for token: %s is - %d\n", masked, remaining)
	return remaining
}

// readLines returns at most the first n lines of the file at path, with line
// terminators removed.
//
// A non-positive n yields no lines; the file is still opened, so a missing or
// unreadable path is reported either way. The returned error is the open
// error, or whatever the scanner reported while reading.
func readLines(path string, n int) ([]string, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	var lines []string
	scanner := bufio.NewScanner(file)
	// Test the bound before scanning so that n <= 0 never collects a line.
	for len(lines) < n && scanner.Scan() {
		lines = append(lines, scanner.Text())
	}
	return lines, scanner.Err()
}

// writeFile stores text as ./<pathRoot>/<repo>/repodata.json, creating the
// directory tree when it is missing.
//
// An existing repodata.json is truncated before writing, so a shorter payload
// cannot leave bytes of an older, longer one behind. Failing to create the
// directory or to open the file ends the program through log.Fatalln; a
// failed write is only printed.
func writeFile(pathRoot, repo string, text string) {
	path := fmt.Sprintf("./%s/%s", pathRoot, repo)
	if err := os.MkdirAll(path, os.ModePerm); err != nil {
		log.Fatalln(err)
	}

	// O_TRUNC drops any previous contents; 0644 because this is a data file
	// and has no reason to be executable.
	file, err := os.OpenFile(path+"/repodata.json", os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
	if err != nil {
		log.Fatalln(err)
	}
	defer file.Close()

	if _, err := file.WriteString(text); err != nil {
		fmt.Println(err)
	}
}

// writeVisited records repo as processed by appending it, followed by a
// newline, to the file named outputPath + "_visited". The file is created on
// first use. Any failure to open or write it ends the program through
// log.Fatalln.
func writeVisited(outputPath, repo string) {
	const flags = os.O_APPEND | os.O_CREATE | os.O_WRONLY

	visited, openErr := os.OpenFile(outputPath+"_visited", flags, 0755)
	if openErr != nil {
		log.Fatalln(openErr)
	}
	defer visited.Close()

	entry := []byte(repo + "\n")
	if _, writeErr := visited.Write(entry); writeErr != nil {
		log.Fatalln(writeErr)
	}
}

// fetchMetadata downloads the GitHub repository metadata for every entry in
// repos, authenticating each request with token. Repositories are processed
// one after another with a single shared HTTP client; a failure on one
// repository is reported and does not stop the rest.
func fetchMetadata(repos []string, token, outputPath string) {
	client := &http.Client{}
	for _, repo := range repos {
		fetchRepoMetadata(client, repo, token, outputPath)
	}
}

// fetchRepoMetadata requests https://api.github.com/repos/<repo> and, on a
// 200 response, saves the body with writeFile and records the repository
// with writeVisited.
//
// Handling one repository per call lets the deferred Close run as soon as
// that repository is done instead of piling up until the whole batch ends.
func fetchRepoMetadata(client *http.Client, repo, token, outputPath string) {
	url := fmt.Sprintf("https://api.github.com/repos/%s", repo)
	log.Printf("%s getting repo data", repo)
	resp, err := client.Do(buildRequest(url, token))
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Printf("ERROR, could not retrieve %s data - %s", repo, resp.Status)
		return
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		// A partially read body must be neither saved nor marked as visited,
		// so that the repository is retried on the next run.
		log.Printf("ERROR, could not read %s data - %v", repo, err)
		return
	}
	writeFile(outputPath, repo, string(body))
	writeVisited(outputPath, repo)
}

// getVisited returns the lines of the file at path, one entry per line.
//
// A file that does not exist yet is treated as "nothing visited" and yields
// an empty result, which is the situation on a first run before any entry
// has been recorded. Any other failure to open or read the file panics.
func getVisited(path string) []string {
	file, err := os.Open(path)
	if err != nil {
		if os.IsNotExist(err) {
			return nil
		}
		panic(err)
	}
	defer file.Close()

	var lines []string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}
	// A read error would otherwise silently shorten the visited list.
	if err := scanner.Err(); err != nil {
		panic(err)
	}
	return lines
}

// readRepos loads repository names from the file at path. Empty lines are
// skipped; for every other line only the text before the first space is
// kept. The program exits through log.Fatalln when the file cannot be
// opened. Scanner errors are not inspected.
func readRepos(path string) []string {
	source, openErr := os.Open(path)
	if openErr != nil {
		log.Fatalln(openErr)
	}
	defer source.Close()

	var names []string
	for reader := bufio.NewScanner(source); reader.Scan(); {
		entry := reader.Text()
		if entry == "" {
			continue
		}
		names = append(names, strings.SplitN(entry, " ", 2)[0])
	}
	return names
}

// readTokens loads API tokens from the file at path, one per line. Empty
// lines are ignored and anything from the first space onwards is dropped
// from each remaining line. The program exits through log.Fatalln when the
// file cannot be opened. Scanner errors are not inspected.
func readTokens(path string) []string {
	handle, err := os.Open(path)
	if err != nil {
		log.Fatalln(err)
	}
	defer handle.Close()

	var tokens []string
	scan := bufio.NewScanner(handle)
	for scan.Scan() {
		raw := scan.Text()
		if len(raw) == 0 {
			continue
		}
		if cut := strings.IndexByte(raw, ' '); cut >= 0 {
			raw = raw[:cut]
		}
		tokens = append(tokens, raw)
	}
	return tokens
}

// filter returns the items of list that do not occur in filter, in their
// original order. Items repeated in list are kept as often as they appear.
// The result is nil when nothing remains.
//
// The exclusion list is loaded into a set once, so the cost grows with
// len(list) + len(filter) rather than with their product.
func filter(list []string, filter []string) []string {
	excluded := make(map[string]struct{}, len(filter))
	for _, item := range filter {
		excluded[item] = struct{}{}
	}

	var filtered []string
	for _, item := range list {
		if _, skip := excluded[item]; !skip {
			filtered = append(filtered, item)
		}
	}
	return filtered
}

// contains reports whether item is equal to at least one element of list.
func contains(list []string, item string) bool {
	found := false
	for idx := 0; idx < len(list) && !found; idx++ {
		found = list[idx] == item
	}
	return found
}

// slice copies up to n consecutive items of list, beginning at index offset,
// into a new slice. The window is clipped to the bounds of list, and the
// result is nil when the clipped window is empty.
func slice(list []string, n int, offset int) []string {
	first, limit := offset, offset+n
	if first < 0 {
		first = 0
	}
	if limit > len(list) {
		limit = len(list)
	}

	var window []string
	for idx := first; idx < limit; idx++ {
		window = append(window, list[idx])
	}
	return window
}

// main fetches GitHub metadata for every repository that has not been
// visited yet, spreading the work across the tokens listed in .gh_tokens.
//
// Usage: <program> <repos-file> <output-path>
//
// The repositories still to visit are those read from <repos-file> minus the
// entries already recorded in <output-path>_visited. For each token with
// remaining API quota, the next batch of at most that many repositories is
// handed to its own goroutine; main returns once all goroutines are done.
func main() {
	fmt.Println("starting")
	if len(os.Args) < 3 {
		fmt.Fprintf(os.Stderr, "usage: %s <repos-file> <output-path>\n", os.Args[0])
		os.Exit(1)
	}
	reposFile := os.Args[1]
	outputPath := os.Args[2]

	tokens := readTokens(".gh_tokens")
	visited := getVisited(outputPath + "_visited")
	fmt.Printf("visited %d\n", len(visited))
	repos := readRepos(reposFile)
	fmt.Printf("repos %d\n", len(repos))
	toVisit := filter(repos, visited)
	fmt.Printf("toVisit %d\n", len(toVisit))
	fmt.Println(toVisit)

	offset := 0
	var wg sync.WaitGroup
	fmt.Println("iterating tokens")
	for _, token := range tokens {
		remaining := getRateLimit(token)
		if remaining <= 0 {
			continue
		}

		batch := slice(toVisit, remaining, offset)
		if len(batch) == 0 {
			// Every repository is already assigned; nothing left to start.
			continue
		}
		offset += len(batch)

		fmt.Println("launching goroutine")
		wg.Add(1)
		// Pass the batch and token as arguments: before Go 1.22 the loop
		// variable is shared by all iterations, so a closure capturing it
		// could run with a later token than the one its batch was sized for.
		go func(batch []string, token string) {
			// Register Done first so Wait is released even if the fetch panics.
			defer wg.Done()
			fetchMetadata(batch, token, outputPath)
		}(batch, token)
	}
	wg.Wait()
}

0 comments on commit ce837e3

Please sign in to comment.