Skip to content

Commit

Permalink
Merge pull request #199 from gleanerio/dev_dv_198_headless_never
Browse files Browse the repository at this point in the history
#198: "headless never" is now expressed as a headless wait < 0
  • Loading branch information
fils authored Apr 24, 2023
2 parents 860b5d3 + 2fd50b2 commit 15afe79
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 25 deletions.
2 changes: 0 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ require (
github.com/gocarina/gocsv v0.0.0-20211020200912-82fc2684cc48
github.com/gorilla/mux v1.8.0
github.com/gosuri/uiprogress v0.0.1
github.com/k4s/webrowser v0.0.0-20160107091637-934d526d0f27
github.com/knakk/rdf v0.0.0-20190304171630-8521bf4c5042
github.com/mafredri/cdp v0.32.0
github.com/minio/minio-go/v7 v7.0.52
Expand Down Expand Up @@ -57,7 +56,6 @@ require (
github.com/jmespath/go-jmespath v0.4.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/k4s/phantomgo v0.0.0-20161104020322-11963773aa04 // indirect
github.com/klauspost/compress v1.16.0 // indirect
github.com/klauspost/cpuid/v2 v2.2.4 // indirect
github.com/magiconair/properties v1.8.6 // indirect
Expand Down
4 changes: 0 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -386,10 +386,6 @@ github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/X
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw=
github.com/k4s/phantomgo v0.0.0-20161104020322-11963773aa04 h1:tA/Xc0VnJtHIdxAML0WraKG+ErOYVgJ6oDcuxOloZOM=
github.com/k4s/phantomgo v0.0.0-20161104020322-11963773aa04/go.mod h1:YWxksSger0gUVO0tKEY/mVkyBTPoKAf4KX/S8Vt7ndc=
github.com/k4s/webrowser v0.0.0-20160107091637-934d526d0f27 h1:YBHxM4fmxQghvs3Ty/rQIPnY+tdCFheIOMj/h0Zw0A8=
github.com/k4s/webrowser v0.0.0-20160107091637-934d526d0f27/go.mod h1:kd1f/k6xHQrfwfszgeiZklsPzBNJJj/el6cjp86YowQ=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/godepgraph v0.0.0-20190626013829-57a7e4a651a9/go.mod h1:Gb5YEgxqiSSVrXKWQxDcKoCM94NO5QAwOwTaVmIUAMI=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
Expand Down
33 changes: 15 additions & 18 deletions internal/summoner/acquire/acquire.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,17 @@ func ResRetrieve(v1 *viper.Viper, mc *minio.Client, m map[string][]string, runSt
wg.Wait()
}

func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, error) {
func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, error) {
bucketName, err := configTypes.GetBucketName(v1)
if err != nil {
return bucketName, 0, 0, err
return bucketName, 0, 0, 0, err
}

var mcfg configTypes.Summoner
mcfg, err = configTypes.ReadSummmonerConfig(v1.Sub("summoner"))

if err != nil {
return bucketName, 0, 0, err
return bucketName, 0, 0, 0, err
}
// Set default thread counts and global delay
tc := mcfg.Threads
Expand All @@ -74,25 +74,24 @@ func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, error) {
// look for a domain specific override crawl delay
sources, err := configTypes.GetSources(v1)
source, err := configTypes.GetSourceByName(sources, sourceName)

hw := source.HeadlessWait
if err != nil {
return bucketName, tc, delay, err
return bucketName, tc, delay, hw, err
}

if source.Delay != 0 && source.Delay > delay {
delay = source.Delay
tc = 1
log.Info("Crawl delay set to ", delay, " for ", sourceName)
}

log.Info("Thread count ", tc, " delay ", delay)
return bucketName, tc, delay, nil
return bucketName, tc, delay, hw, nil
}

func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName string,
wg *sync.WaitGroup, repologger *log.Logger, repoStats *common.RepoStats) {

bucketName, tc, delay, err := getConfig(v1, sourceName)
bucketName, tc, delay, headlessWait, err := getConfig(v1, sourceName)
if err != nil {
// trying to read a source, so let's not kill everything with a panic/fatal
log.Error("Error reading config file ", err)
Expand Down Expand Up @@ -164,16 +163,14 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri
// even if no JSON-LD packages are found, record the event of checking this URL
if len(jsonlds) < 1 {
// TODO is it here where I then try headless, and scope the following for into an else?
log.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Info("Direct access failed, trying headless for ", urlloc)
repologger.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Error() // this needs to go into the issues file
err := PageRenderAndUpload(v1, mc, 60*time.Second, urlloc, sourceName, repologger, repoStats) // TODO make delay configurable

if err != nil {
log.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error("PageRenderAndUpload ", urlloc, "::", err)
repologger.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error(err)
}
if err != nil {
log.Error("DB Update", urlloc, "::", err)
if headlessWait >= 0 {
log.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Info("Direct access failed, trying headless for ", urlloc)
repologger.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Error() // this needs to go into the issues file
err := PageRenderAndUpload(v1, mc, 60*time.Second, urlloc, sourceName, repologger, repoStats) // TODO make delay configurable
if err != nil {
log.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error("PageRenderAndUpload ", urlloc, "::", err)
repologger.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error(err)
}
}

} else {
Expand Down
2 changes: 1 addition & 1 deletion internal/summoner/acquire/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ func RetrieveAPIData(apiSources []configTypes.Sources, mc *minio.Client, runStat

func getAPISource(v1 *viper.Viper, mc *minio.Client, source configTypes.Sources, wg *sync.WaitGroup, repologger *log.Logger, repoStats *common.RepoStats) {

bucketName, tc, delay, err := getConfig(v1, source.Name)
bucketName, tc, delay, _, err := getConfig(v1, source.Name) // _ is headless wait
if err != nil {
// trying to read a source, so let's not kill everything with a panic/fatal
log.Error("Error reading config file ", err)
Expand Down
4 changes: 4 additions & 0 deletions internal/summoner/acquire/headlessNG.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ func PageRender(v1 *viper.Viper, timeout time.Duration, url, k string, repologge
sources, err := configTypes.GetSources(v1)
source, err := configTypes.GetSourceByName(sources, k)
headlessWait := source.HeadlessWait
if headlessWait < 0 {
log.Info("Headless wait on a headless configured to less that zero. Setting to 0")
headlessWait = 0 // if someone screws up the config, be good
}

if timeout*time.Duration(retries) < time.Duration(headlessWait)*time.Second {
timeout = time.Duration(headlessWait) * time.Second
Expand Down

0 comments on commit 15afe79

Please sign in to comment.