
Merge pull request #133 from gleanerio/mm-dev--api
Consume JSON-LD metadata from a paged API
fils authored Mar 2, 2023
2 parents 14434be + b443475 commit ecbab66
Showing 17 changed files with 466 additions and 119 deletions.
Binary file removed cmd/gleaner/gleaner.db
2 changes: 1 addition & 1 deletion configs/template/README_Configure_Template.md
@@ -75,7 +75,7 @@ hack,SourceType,Active,Name,ProperName,URL,Headless,Domain,PID,Logo

Fields:
1. hack: a hack to make sure the fields are properly read.
2. SourceType : [sitemap, sitegraph, googledrive] type of source
2. SourceType : [sitemap, sitegraph, googledrive, api] type of source
3. Active: [TRUE,FALSE] is source active.
4. Name: short name of source. It should be one word (no space) and be lower case.
5. ProperName: Long name of source that will be added to organization record for provenance
Expand Down
15 changes: 14 additions & 1 deletion docs/GleanerConfig.md
@@ -76,7 +76,7 @@ The miller and summon sections are true and we will leave them that way. It mea
Now look at the "miller:" section, where we get to pick what milling to do. Currently it is set with only graph set to true. Let's leave it that way for now. This means Gleaner will only attempt to make a graph, and not also run validation or generate prov reports for the process.
The final section we need to look at is the "sources:" section.
Here is where the fun is. While there are two types, sitegraph and sitemaps we will normally use sitemap type.
Here is where the fun is. While there are multiple types, sitegraph, sitemap, googledrive and api, we will normally use the sitemap type.
A standard [sitemap](./SourceSitemap.md) is below:
@@ -122,6 +122,19 @@ sources:
credentialsfile: configs/credentials/gleaner-331805-030e15e1d9c4.json
other: {}
```
An [API endpoint](./SourceAPI.md) is below:
```yaml
sources:
- sourcetype: api
name: example
url: http://test-metadata-api.com?query=something&page=%d
properName: Example JSON-LD API Source
domain: http://test-metadata-api.com
active: true
apipagelimit: 200
```
These are the sources we wish to pull and process.
Each source has a type and 8 entries, though at this time we no longer use the "logo" value.
It was used in the past to provide a page showing all the sources and
16 changes: 16 additions & 0 deletions docs/SourceAPI.md
@@ -0,0 +1,16 @@
## Using a paged API endpoint as a Gleaner source

Sometimes, instead of crawling webpages using a list in a sitemap, we have the opportunity to query an API that lets us directly ingest JSON-LD. To do so, we can specify `sourcetype: api` in our Gleaner config yaml, and Gleaner will iterate through a paged API, using the given `url` as a template. For example, let's say that you want to use the API endpoint at `http://test-api.com`, and that you can page through it by using a url like `http://test-api.com/page/4`. You would put this in your config:

```yaml
url: http://test-api.com/page/%d
```
Notice the `%d` where the page number goes. Gleaner will then increment that number (starting from 0) until it gets an error back from the API.
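
As a quick illustration of the template mechanics (this is plain `fmt.Sprintf` substitution; the URL below is the hypothetical one from above, not a real endpoint):

```go
package main

import "fmt"

func main() {
	// Hypothetical page template from the example config above.
	urlTemplate := "http://test-api.com/page/%d"
	// Expansion: substitute an incrementing page counter into %d.
	for page := 0; page < 3; page++ {
		fmt.Println(fmt.Sprintf(urlTemplate, page))
	}
	// Prints:
	// http://test-api.com/page/0
	// http://test-api.com/page/1
	// http://test-api.com/page/2
}
```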

Optionally, you can set a limit on the number of pages to iterate through, using `apipagelimit`. This means that Gleaner will page through the API until it gets an error back *or* until it reaches the limit you set. That looks like the example below:

```yaml
url: http://test-api.com/page/%d
apipagelimit: 200
```
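
Putting both settings together, the overall behavior can be sketched as the loop below. This is an illustrative sketch, not Gleaner's actual implementation: the `pageThrough` helper, treating a non-2xx status as "an error back from the API", and using `0` to mean "no limit" are all assumptions here.

```go
package main

import (
	"fmt"
	"io"
	"net/http"
)

// pageThrough is a hypothetical sketch of paging a JSON-LD API:
// it expands %d in the template with an incrementing page number,
// stopping at the first error response or after pageLimit pages.
func pageThrough(urlTemplate string, pageLimit int) ([]string, error) {
	var docs []string
	for page := 0; pageLimit == 0 || page < pageLimit; page++ {
		resp, err := http.Get(fmt.Sprintf(urlTemplate, page))
		if err != nil {
			return docs, err // transport error: stop paging
		}
		if resp.StatusCode >= 400 {
			resp.Body.Close() // the API signaled the end of its pages
			break
		}
		body, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			return docs, err
		}
		docs = append(docs, string(body)) // one JSON-LD payload per page
	}
	return docs, nil
}

func main() {
	docs, err := pageThrough("http://test-api.com/page/%d", 200)
	fmt.Println(len(docs), "pages fetched, err:", err)
}
```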
2 changes: 1 addition & 1 deletion internal/config/minio.go
@@ -43,7 +43,7 @@ func ReadMinioConfig(minioSubtress *viper.Viper) (Minio, error) {
// config already read. substree passed
err := minioSubtress.Unmarshal(&minioCfg)
if err != nil {
log.Fatal("error when parsing minio config: %v", err)
log.Fatal("error when parsing minio config: ", err)
}
return minioCfg, err
}
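
The one-line fix above is subtle enough to be worth spelling out: `Fatal` (in logrus, and in the standard `log` package) passes its arguments through `fmt.Sprint`, so the removed line printed a literal `%v` rather than the error. A minimal sketch of the difference, assuming logrus imported as `log`:

```go
package main

import (
	"errors"

	log "github.com/sirupsen/logrus"
)

func main() {
	err := errors.New("connection refused")

	// Buggy (the removed line): Fatal is Sprint-style, not Printf-style,
	// so this would log the literal text "... config: %vconnection refused":
	//   log.Fatal("error when parsing minio config: %v", err)

	// Correct, Printf-style alternative:
	//   log.Fatalf("error when parsing minio config: %v", err)

	// Correct, Sprint-style concatenation (what the patch uses).
	// Note Fatal exits the process after logging.
	log.Fatal("error when parsing minio config: ", err)
}
```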
5 changes: 3 additions & 2 deletions internal/config/sources.go
@@ -15,7 +15,7 @@ import (

// as read from csv
type Sources struct {
// Valid values for SourceType: sitemap, sitegraph, csv, googledrive, and robots
// Valid values for SourceType: sitemap, sitegraph, csv, googledrive, api, and robots
SourceType string `default:"sitemap"`
Name string
Logo string
@@ -31,6 +31,7 @@
// Active bool
HeadlessWait int // if loading is slow, wait
Delay int64 // A domain-specific crawl delay value
ApiPageLimit int
}

// add needed for file
@@ -124,7 +125,7 @@ func GetSources(g1 *viper.Viper) ([]Sources, error) {
// config already read. substree passed
err := g1.UnmarshalKey(subtreeKey, &cfg)
if err != nil {
log.Fatal("error when parsing %v config: %v", subtreeKey, err)
log.Fatal("error when parsing ", subtreeKey, " config: ", err)
//No sources, so nothing to run
}
for i, s := range cfg {
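
Since viper and mapstructure match field names case-insensitively, the lowercase `apipagelimit` key from the YAML sources lands in the new `ApiPageLimit` field without any struct tags. A self-contained sketch of that round trip (struct trimmed to a few fields; the config values are the hypothetical ones from the docs):

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/spf13/viper"
)

// Trimmed-down stand-in for the Sources struct above.
type Sources struct {
	SourceType   string
	Name         string
	URL          string
	ApiPageLimit int
}

func main() {
	yamlCfg := []byte(`
sources:
  - sourcetype: api
    name: example
    url: http://test-metadata-api.com?query=something&page=%d
    apipagelimit: 200
`)
	v := viper.New()
	v.SetConfigType("yaml")
	if err := v.ReadConfig(bytes.NewBuffer(yamlCfg)); err != nil {
		panic(err)
	}
	var cfg []Sources
	// Case-insensitive match: "apipagelimit" -> ApiPageLimit.
	if err := v.UnmarshalKey("sources", &cfg); err != nil {
		panic(err)
	}
	fmt.Println(cfg[0].Name, cfg[0].ApiPageLimit) // example 200
}
```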
22 changes: 11 additions & 11 deletions internal/objects/sourcesAndGraphs.go
@@ -7,17 +7,17 @@ import (
"github.com/spf13/viper"
)

//type Sources struct {
// Name string
// Logo string
// URL string
// Headless bool
// PID string
// ProperName string
// Domain string
// // SitemapFormat string
// // Active bool
//}
// type Sources struct {
// Name string
// Logo string
// URL string
// Headless bool
// PID string
// ProperName string
// Domain string
// // SitemapFormat string
// // Active bool
// }
type Sources = configTypes.Sources

// Return all sources and sitegraph domains
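
A side note on `type Sources = configTypes.Sources` above: the `=` makes it a type alias rather than a new named type, so values move between the two packages with no conversion, and the commented-out struct can go away without breaking callers. A tiny standalone example of the distinction:

```go
package main

import "fmt"

type base struct{ Name string }

type aliased = base // type alias: aliased and base are the same type
type defined base   // type definition: a new, distinct type

func main() {
	var a aliased = base{Name: "x"} // no conversion needed
	d := defined(base{Name: "y"})   // a definition requires an explicit conversion
	fmt.Println(a.Name, d.Name)
}
```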
152 changes: 86 additions & 66 deletions internal/summoner/acquire/acquire.go
@@ -21,6 +21,7 @@ import (
)

const EarthCubeAgent = "EarthCube_DataBot/1.0"
const JSONContentType = "application/ld+json"

// ResRetrieve is a function to pull down the data graphs at resources
func ResRetrieve(v1 *viper.Viper, mc *minio.Client, m map[string][]string, db *bolt.DB, runStats *common.RunStats) {
@@ -157,7 +158,8 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri
}
defer resp.Body.Close()

doc, err := goquery.NewDocumentFromResponse(resp)
jsonlds, err := FindJSONInResponse(v1, urlloc, repologger, resp)

if err != nil {
log.Error("#", i, " error on ", urlloc, err) // print an message containing the index (won't keep order)
repoStats.Inc(common.Issues)
@@ -166,34 +168,6 @@
return
}

var jsonlds []string
var contentTypeHeader = resp.Header["Content-Type"]

// if
// The URL is sending back JSON-LD correctly as application/ld+json
// this should not be here IMHO, but need to support people not setting proper header value
// The URL is sending back JSON-LD but incorrectly sending as application/json
if contains(contentTypeHeader, "application/ld+json") || contains(contentTypeHeader, "application/json") || fileExtensionIsJson(urlloc) {
repologger.WithFields(log.Fields{"url": urlloc, "contentType": "json or ld_json"}).Debug()
log.WithFields(log.Fields{"url": urlloc, "contentType": "json or ld_json"}).Debug(urlloc, " as ", contentTypeHeader)

jsonlds, err = addToJsonListIfValid(v1, jsonlds, doc.Text())
if err != nil {
log.WithFields(log.Fields{"url": urlloc, "contentType": "json or ld_json"}).Error("Error processing json response from ", urlloc, err)
repologger.WithFields(log.Fields{"url": urlloc, "contentType": "json or ld_json"}).Error(err)
}
// look in the HTML page for <script type=application/ld+json>
} else {
doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) {
jsonlds, err = addToJsonListIfValid(v1, jsonlds, s.Text())
repologger.WithFields(log.Fields{"url": urlloc, "contentType": "script[type='application/ld+json']"}).Info()
if err != nil {
log.WithFields(log.Fields{"url": urlloc, "contentType": "script[type='application/ld+json']"}).Error("Error processing script tag in ", urlloc, err)
repologger.WithFields(log.Fields{"url": urlloc, "contentType": "script[type='application/ld+json']"}).Error(err)
}
})
}

// For incremental indexing I want to know every URL I visit regardless
// if there is a valid JSON-LD document or not. For "full" indexing we
// visit ALL URLs. However, many will not have JSON-LD, so let's also record
@@ -225,43 +199,7 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri
repoStats.Inc(common.Summoned)
}

for i, jsonld := range jsonlds {
if jsonld != "" { // traps out the root domain... should do this different
log.WithFields(log.Fields{"url": urlloc, "issue": "Uploading"}).Trace("#", i, "Uploading ")
repologger.WithFields(log.Fields{"url": urlloc, "issue": "Uploading"}).Trace()
sha, err := Upload(v1, mc, bucketName, sourceName, urlloc, jsonld)
if err != nil {
log.WithFields(log.Fields{"url": urlloc, "sha": sha, "issue": "Error uploading jsonld to object store"}).Error("Error uploading jsonld to object store: ", urlloc, err)
repologger.WithFields(log.Fields{"url": urlloc, "sha": sha, "issue": "Error uploading jsonld to object store"}).Error(err)
repoStats.Inc(common.StoreError)
} else {
repologger.WithFields(log.Fields{"url": urlloc, "sha": sha, "issue": "Uploaded to object store"}).Trace(err)
log.WithFields(log.Fields{"url": urlloc, "sha": sha, "issue": "Uploaded to object store"}).Info("Successfully put ", sha, " in summoned bucket for ", urlloc)
repoStats.Inc(common.Stored)
}
// TODO Is here where to add an entry to the KV store
db.Update(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte(sourceName))
err := b.Put([]byte(urlloc), []byte(sha))
if err != nil {
log.Error("Error writing to bolt ", err)
}
return nil
})
} else {
log.WithFields(log.Fields{"url": urlloc, "issue": "Empty JSON-LD document found "}).Info("Empty JSON-LD document found. Continuing.")
repologger.WithFields(log.Fields{"url": urlloc, "issue": "Empty JSON-LD document found "}).Error(err)
// TODO Is here where to add an entry to the KV store
db.Update(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte(sourceName))
err := b.Put([]byte(urlloc), []byte(fmt.Sprintf("NULL: %s", urlloc))) // no JSON-LD found at this URL
if err != nil {
log.Error("Error writing to bolt ", err)
}
return nil
})
}
}
UploadWrapper(v1, mc, bucketName, sourceName, urlloc, db, repologger, repoStats, jsonlds)

bar.Add(1) // bar.Incr()
log.Trace("#", i, "thread for", urlloc) // print an message containing the index (won't keep order)
Expand All @@ -275,6 +213,88 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri
}
}

func FindJSONInResponse(v1 *viper.Viper, urlloc string, repologger *log.Logger, response *http.Response) ([]string, error) {
doc, err := goquery.NewDocumentFromResponse(response)
if err != nil {
return nil, err
}

contentTypeHeader := response.Header["Content-Type"]
var jsonlds []string

// if the URL is sending back JSON-LD correctly as application/ld+json
// this should not be here IMHO, but need to support people not setting proper header value
// The URL is sending back JSON-LD but incorrectly sending as application/json
if contains(contentTypeHeader, JSONContentType) || contains(contentTypeHeader, "application/json") || fileExtensionIsJson(urlloc) {
logFields := log.Fields{"url": urlloc, "contentType": "json or ld_json"}
repologger.WithFields(logFields).Debug()
log.WithFields(logFields).Debug(urlloc, " as ", contentTypeHeader)

jsonlds, err = addToJsonListIfValid(v1, jsonlds, doc.Text())
if err != nil {
log.WithFields(logFields).Error("Error processing json response from ", urlloc, err)
repologger.WithFields(logFields).Error(err)
}
// look in the HTML response for <script type=application/ld+json>
} else {
doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) {
jsonlds, err = addToJsonListIfValid(v1, jsonlds, s.Text())
logFields := log.Fields{"url": urlloc, "contentType": "script[type='application/ld+json']"}
repologger.WithFields(logFields).Info()
if err != nil {
log.WithFields(logFields).Error("Error processing script tag in ", urlloc, err)
repologger.WithFields(logFields).Error(err)
}
})
}

return jsonlds, nil
}
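
For readers outside the codebase, the two branches of `FindJSONInResponse` boil down to: if the Content-Type (or a `.json` file extension) says the body already is JSON, take the whole document text; otherwise mine the HTML for `<script type="application/ld+json">` blocks. A standalone sketch of that logic using only goquery, skipping the validation that `addToJsonListIfValid` performs (it uses the non-deprecated `NewDocumentFromReader`, and the URL in `main` is hypothetical):

```go
package main

import (
	"fmt"
	"net/http"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func extractJSONLD(resp *http.Response) ([]string, error) {
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, err
	}
	ct := resp.Header.Get("Content-Type")
	if strings.Contains(ct, "application/ld+json") || strings.Contains(ct, "application/json") {
		// The whole response body is (claimed to be) JSON-LD.
		return []string{doc.Text()}, nil
	}
	// Otherwise look for JSON-LD embedded in the HTML.
	var jsonlds []string
	doc.Find("script[type='application/ld+json']").Each(func(_ int, s *goquery.Selection) {
		jsonlds = append(jsonlds, s.Text())
	})
	return jsonlds, nil
}

func main() {
	resp, err := http.Get("https://example.org/dataset/1") // hypothetical page
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	docs, err := extractJSONLD(resp)
	fmt.Println(len(docs), "JSON-LD documents found, err:", err)
}
```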

func UploadWrapper(v1 *viper.Viper, mc *minio.Client, bucketName string, sourceName string, urlloc string, db *bolt.DB, repologger *log.Logger, repoStats *common.RepoStats, jsonlds []string) {
for i, jsonld := range jsonlds {
if jsonld != "" { // traps out the root domain... should do this different
logFields := log.Fields{"url": urlloc, "issue": "Uploading"}
log.WithFields(logFields).Trace("#", i, "Uploading ")
repologger.WithFields(logFields).Trace()
sha, err := Upload(v1, mc, bucketName, sourceName, urlloc, jsonld)
if err != nil {
logFields = log.Fields{"url": urlloc, "sha": sha, "issue": "Error uploading jsonld to object store"}
log.WithFields(logFields).Error("Error uploading jsonld to object store: ", urlloc, err)
repologger.WithFields(logFields).Error(err)
repoStats.Inc(common.StoreError)
} else {
logFields = log.Fields{"url": urlloc, "sha": sha, "issue": "Uploaded to object store"}
repologger.WithFields(logFields).Trace(err)
log.WithFields(logFields).Info("Successfully put ", sha, " in summoned bucket for ", urlloc)
repoStats.Inc(common.Stored)
}
// TODO Is here where to add an entry to the KV store
db.Update(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte(sourceName))
err := b.Put([]byte(urlloc), []byte(sha))
if err != nil {
log.Error("Error writing to bolt ", err)
}
return nil
})
} else {
logFields := log.Fields{"url": urlloc, "issue": "Empty JSON-LD document found "}
log.WithFields(logFields).Info("Empty JSON-LD document found. Continuing.")
repologger.WithFields(logFields).Error("Empty JSON-LD document found. Continuing.")
// TODO Is here where to add an entry to the KV store
db.Update(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte(sourceName))
err := b.Put([]byte(urlloc), []byte(fmt.Sprintf("NULL: %s", urlloc))) // no JSON-LD found at this URL
if err != nil {
log.Error("Error writing to bolt ", err)
}
return nil
})
}
}
}
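
Note what `UploadWrapper` leaves behind in Bolt: every visited URL gets an entry, either `url → sha` on success or `url → "NULL: <url>"` when the page had no JSON-LD, which is what the incremental-indexing comment in `getDomain` relies on. A hypothetical read-back sketch (the `alreadySummoned` helper is invented here, and the `go.etcd.io/bbolt` import path is an assumption; the bucket layout matches the writes above):

```go
package main

import (
	"fmt"

	bolt "go.etcd.io/bbolt"
)

// alreadySummoned reports whether a URL was visited on a previous run,
// whether or not JSON-LD was found there (NULL entries count as visited).
func alreadySummoned(db *bolt.DB, sourceName, url string) (bool, error) {
	var val []byte
	err := db.View(func(tx *bolt.Tx) error {
		b := tx.Bucket([]byte(sourceName))
		if b == nil {
			return nil // this source has never been crawled
		}
		val = b.Get([]byte(url))
		return nil
	})
	return val != nil, err
}

func main() {
	db, err := bolt.Open("gleaner.db", 0600, nil)
	if err != nil {
		panic(err)
	}
	defer db.Close()
	seen, err := alreadySummoned(db, "example", "https://example.org/dataset/1")
	fmt.Println("seen before:", seen, "err:", err)
}
```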

func contains(arr []string, str string) bool {
for _, a := range arr {


