Skip to content

Commit

Permalink
feat: moved goroutine setup before downloading a file so on one thread…
Browse files Browse the repository at this point in the history
… it won't start downloading one file after another. This should help solve the transfer limit issue.
  • Loading branch information
kris-dev-hub committed Jan 3, 2024
1 parent 020c697 commit bd55728
Showing 1 changed file with 7 additions and 5 deletions.
12 changes: 7 additions & 5 deletions cmd/importer/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,13 @@ func importSegment(segment commoncrawl.WatSegment, dataDir commoncrawl.DataDir,
if err != nil {
panic(fmt.Sprintf("Failed to create file: %v", err))
}

wg.Add(1)
// Before starting the goroutine, we insert an empty struct into the guard channel.
// If the channel is already full (meaning we have 'maxGoroutines' goroutines running),
// this will block until one of the running goroutines finishes and reads from the channel.
guard <- struct{}{}

if !fileutils.FileExists(recordWatFile) {
err := fileutils.DownloadFile("https://data.commoncrawl.org/"+watFile.Path, recordWatFile, 2)
if err != nil {
Expand All @@ -255,11 +262,6 @@ func importSegment(segment commoncrawl.WatSegment, dataDir commoncrawl.DataDir,

fmt.Println("Importing file: ", recordWatFile)

wg.Add(1)
// Before starting the goroutine, we insert an empty struct into the guard channel.
// If the channel is already full (meaning we have 'maxGoroutines' goroutines running),
// this will block until one of the running goroutines finishes and reads from the channel.
guard <- struct{}{}
go func(recordFile string, linkFile string, pageFile string) {
defer wg.Done() // Signal the WaitGroup that the goroutine is done after it finishes
defer func() { <-guard }() // Release the guard when the goroutine is done
Expand Down

0 comments on commit bd55728

Please sign in to comment.