diff --git a/go.mod b/go.mod index def2ed8f..11e572db 100644 --- a/go.mod +++ b/go.mod @@ -14,7 +14,6 @@ require ( github.com/gocarina/gocsv v0.0.0-20211020200912-82fc2684cc48 github.com/gorilla/mux v1.8.0 github.com/gosuri/uiprogress v0.0.1 - github.com/k4s/webrowser v0.0.0-20160107091637-934d526d0f27 github.com/knakk/rdf v0.0.0-20190304171630-8521bf4c5042 github.com/mafredri/cdp v0.32.0 github.com/minio/minio-go/v7 v7.0.52 @@ -57,7 +56,6 @@ require ( github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/k4s/phantomgo v0.0.0-20161104020322-11963773aa04 // indirect github.com/klauspost/compress v1.16.0 // indirect github.com/klauspost/cpuid/v2 v2.2.4 // indirect github.com/magiconair/properties v1.8.6 // indirect diff --git a/go.sum b/go.sum index 3fee5249..8d5454f4 100644 --- a/go.sum +++ b/go.sum @@ -386,10 +386,6 @@ github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/X github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw= -github.com/k4s/phantomgo v0.0.0-20161104020322-11963773aa04 h1:tA/Xc0VnJtHIdxAML0WraKG+ErOYVgJ6oDcuxOloZOM= -github.com/k4s/phantomgo v0.0.0-20161104020322-11963773aa04/go.mod h1:YWxksSger0gUVO0tKEY/mVkyBTPoKAf4KX/S8Vt7ndc= -github.com/k4s/webrowser v0.0.0-20160107091637-934d526d0f27 h1:YBHxM4fmxQghvs3Ty/rQIPnY+tdCFheIOMj/h0Zw0A8= -github.com/k4s/webrowser v0.0.0-20160107091637-934d526d0f27/go.mod h1:kd1f/k6xHQrfwfszgeiZklsPzBNJJj/el6cjp86YowQ= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/godepgraph v0.0.0-20190626013829-57a7e4a651a9/go.mod h1:Gb5YEgxqiSSVrXKWQxDcKoCM94NO5QAwOwTaVmIUAMI= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= diff --git a/internal/summoner/acquire/acquire.go b/internal/summoner/acquire/acquire.go index 24901c75..bf10150d 100644 --- a/internal/summoner/acquire/acquire.go +++ b/internal/summoner/acquire/acquire.go @@ -51,17 +51,17 @@ func ResRetrieve(v1 *viper.Viper, mc *minio.Client, m map[string][]string, runSt wg.Wait() } -func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, error) { +func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, error) { bucketName, err := configTypes.GetBucketName(v1) if err != nil { - return bucketName, 0, 0, err + return bucketName, 0, 0, 0, err } var mcfg configTypes.Summoner mcfg, err = configTypes.ReadSummmonerConfig(v1.Sub("summoner")) if err != nil { - return bucketName, 0, 0, err + return bucketName, 0, 0, 0, err } // Set default thread counts and global delay tc := mcfg.Threads @@ -74,9 +74,9 @@ func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, error) { // look for a domain specific override crawl delay sources, err := configTypes.GetSources(v1) source, err := configTypes.GetSourceByName(sources, sourceName) - + hw := source.HeadlessWait if err != nil { - return bucketName, tc, delay, err + return bucketName, tc, delay, hw, err } if source.Delay != 0 && source.Delay > delay { @@ -84,15 +84,14 @@ func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, error) { tc = 1 log.Info("Crawl delay set to ", delay, " for ", sourceName) } - log.Info("Thread count ", tc, " delay ", delay) - return bucketName, tc, delay, nil + return bucketName, tc, delay, hw, nil } func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName string, wg *sync.WaitGroup, repologger *log.Logger, repoStats *common.RepoStats) { - bucketName, tc, delay, err := getConfig(v1, sourceName) + bucketName, tc, delay, headlessWait, err := getConfig(v1, sourceName) if err != nil { // trying to read a source, so let's not kill everything with a panic/fatal log.Error("Error reading config file ", err) @@ -164,16 +163,14 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri // even is no JSON-LD packages found, record the event of checking this URL if len(jsonlds) < 1 { // TODO is her where I then try headless, and scope the following for into an else? - log.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Info("Direct access failed, trying headless for ", urlloc) - repologger.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Error() // this needs to go into the issues file - err := PageRenderAndUpload(v1, mc, 60*time.Second, urlloc, sourceName, repologger, repoStats) // TODO make delay configurable - - if err != nil { - log.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error("PageRenderAndUpload ", urlloc, "::", err) - repologger.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error(err) - } - if err != nil { - log.Error("DB Update", urlloc, "::", err) + if headlessWait >= 0 { + log.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Info("Direct access failed, trying headless for ", urlloc) + repologger.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Error() // this needs to go into the issues file + err := PageRenderAndUpload(v1, mc, 60*time.Second, urlloc, sourceName, repologger, repoStats) // TODO make delay configurable + if err != nil { + log.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error("PageRenderAndUpload ", urlloc, "::", err) + repologger.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error(err) + } } } else { diff --git a/internal/summoner/acquire/api.go b/internal/summoner/acquire/api.go index fd831d8c..b7f9409f 100644 --- a/internal/summoner/acquire/api.go +++ b/internal/summoner/acquire/api.go @@ -57,7 +57,7 @@ func RetrieveAPIData(apiSources []configTypes.Sources, mc *minio.Client, runStat func getAPISource(v1 *viper.Viper, mc *minio.Client, source configTypes.Sources, wg *sync.WaitGroup, repologger *log.Logger, repoStats *common.RepoStats) { - bucketName, tc, delay, err := getConfig(v1, source.Name) + bucketName, tc, delay, _, err := getConfig(v1, source.Name) // _ is headless wait if err != nil { // trying to read a source, so let's not kill everything with a panic/fatal log.Error("Error reading config file ", err) diff --git a/internal/summoner/acquire/headlessNG.go b/internal/summoner/acquire/headlessNG.go index 122d8e66..65742f5a 100644 --- a/internal/summoner/acquire/headlessNG.go +++ b/internal/summoner/acquire/headlessNG.go @@ -183,6 +183,10 @@ func PageRender(v1 *viper.Viper, timeout time.Duration, url, k string, repologge sources, err := configTypes.GetSources(v1) source, err := configTypes.GetSourceByName(sources, k) headlessWait := source.HeadlessWait + if headlessWait < 0 { + log.Info("Headless wait on a headless configured to less that zero. Setting to 0") + headlessWait = 0 // if someone screws up the config, be good + } if timeout*time.Duration(retries) < time.Duration(headlessWait)*time.Second { timeout = time.Duration(headlessWait) * time.Second