Skip to content

Commit

Permalink
Merge pull request clearlydefined#532 from lamarrr/conda-support
Browse files Browse the repository at this point in the history
conda crawler implementation
  • Loading branch information
qtomlinson authored May 2, 2024
2 parents 63526a8 + 9f29866 commit 5cfbdd5
Show file tree
Hide file tree
Showing 16 changed files with 800 additions and 4 deletions.
5 changes: 5 additions & 0 deletions config/cdConfig.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ module.exports = {
fetched: { defaultTtlSeconds: fetchedCacheTtlSeconds }
},
cocoapods: { githubToken },
conda: {
cdFileLocation: cd_file.location
},
cratesio: {},
debian: { cdFileLocation: cd_file.location },
git: {},
Expand All @@ -50,6 +53,8 @@ module.exports = {
process: {
cdsource: {},
component: {},
conda: { githubToken },
condasrc: {},
crate: { githubToken },
deb: {},
debsrc: {},
Expand Down
12 changes: 12 additions & 0 deletions config/map.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@ const npm = {
fossology
}

// Map entry for conda packages: tagged with `_type: 'conda'` and listing, by
// shorthand property reference, the `source`, `clearlydefined`, `licensee`,
// `reuse`, `scancode` and `fossology` entries declared elsewhere in this file.
const conda = {
_type: 'conda',
source,
clearlydefined,
licensee,
reuse,
scancode,
fossology
}

const crate = {
_type: 'crate',
source,
Expand Down Expand Up @@ -127,6 +137,7 @@ const gem = {
const _package = {
_type: 'package',
npm,
conda,
crate,
deb,
go,
Expand Down Expand Up @@ -156,6 +167,7 @@ const entities = {
licensee,
reuse,
npm,
conda,
crate,
deb,
go,
Expand Down
202 changes: 202 additions & 0 deletions providers/fetch/condaFetch.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
// Copyright (c) Microsoft Corporation and others. Licensed under the MIT license.
// SPDX-License-Identifier: MIT

const AbstractFetch = require('./abstractFetch')
const { clone } = require('lodash')
const fs = require('fs')
const memCache = require('memory-cache')
const nodeRequest = require('request')
const FetchResult = require('../../lib/fetchResult')

/**
 * Fetcher for conda binary packages (`conda`) and their source archives (`condasrc`).
 *
 * Channel index files (`channeldata.json`, `<architecture>/repodata.json`) are downloaded
 * into `options.cdFileLocation` and re-used until their in-memory cache marker expires.
 */
class CondaFetch extends AbstractFetch {
  /**
   * @param {object} options - fetcher options; `cdFileLocation` is the folder that
   *   receives the downloaded channel index files.
   */
  constructor(options) {
    super(options)
    this.packageMapFolder = this.options.cdFileLocation
    // Supported providers and the base URL of the channel each one maps to.
    this.channels = {
      'anaconda-main': 'https://repo.anaconda.com/pkgs/main',
      'anaconda-r': 'https://repo.anaconda.com/pkgs/r',
      'conda-forge': 'https://conda.anaconda.org/conda-forge'
    }
    this.headers = {
      'User-Agent': 'clearlydefined.io crawler (clearlydefined@outlook.com)'
    }
    this.CACHE_DURATION = 8 * 60 * 60 * 1000 // 8 hours
  }

  /**
   * @param {object} request - crawler request
   * @returns {boolean|*} true when the request's provider is one of the known channels;
   *   the falsy spec itself when no spec could be derived from the request.
   */
  canHandle(request) {
    const spec = this.toSpec(request)
    // Own-property check so that names inherited from Object.prototype
    // (e.g. a provider called "constructor") are not mistaken for channels.
    return spec && Object.prototype.hasOwnProperty.call(this.channels, spec.provider)
  }

  /**
   * Fetch the package described by the request.
   *
   * Coordinate layout:
   *   {type: conda|condasrc}/{provider: anaconda-main|anaconda-r|conda-forge}/{architecture|-}/{package name}/[{version | }]-[{build version | }]/
   * e.g.
   *   conda/conda-forge/linux-aarch64/numpy/1.13.0-py36/
   *   conda/conda-forge/-/numpy/-py36/
   *   conda/conda-forge/-/numpy/1.13.0-py36/
   *   conda/conda-forge/linux-aarch64/numpy/-py36/
   *   conda/conda-forge/-/numpy/
   *   conda/conda-forge/-/numpy/-
   *
   * @param {object} request - crawler request
   * @returns {Promise<object>} the request with `fetchResult` attached, or the result of
   *   `request.markSkip(...)` when the package cannot be resolved.
   */
  async handle(request) {
    const spec = this.toSpec(request)
    if (spec.type !== 'conda' && spec.type !== 'condasrc') {
      return request.markSkip('spec type must either be conda or condasrc')
    }
    const channelData = await this.getChannelData(this.channels[spec.provider], spec.provider)
    if (!channelData) {
      return request.markSkip('failed to fetch and parse channelData.json')
    }
    const architecture = spec.namespace
    // The revision is "<version>-<build>"; either half may be empty.
    const [version, buildVersion] = (spec.revision || '').split('-')
    if (channelData.packages[spec.name] === undefined) {
      return request.markSkip(`Missing package ${spec.name} in channel: ${spec.provider}`)
    }
    const packageChannelData = channelData.packages[spec.name]
    if (spec.type === 'condasrc') {
      return this._downloadCondaSourcePackage(spec, request, version, packageChannelData)
    }
    return this._downloadCondaPackage(spec, request, version, buildVersion, architecture, packageChannelData)
  }

  /**
   * Download and unpack the source archive referenced by the channel data entry.
   *
   * @param {object} spec - entity spec; its `revision` is overwritten with the channel data version
   * @param {object} request - crawler request
   * @param {string} [version] - requested version; when given it must equal the channel data version
   * @param {object} packageChannelData - the package's entry in channeldata.json
   * @returns {Promise<object>} the request, or the result of `request.markSkip(...)`
   */
  async _downloadCondaSourcePackage(spec, request, version, packageChannelData) {
    if (version && packageChannelData.version !== version) {
      return request.markSkip(`Missing source file version ${version} for package ${spec.name}`)
    }
    if (!packageChannelData.source_url) {
      return request.markSkip(`Missing archive source file in repodata for package ${spec.name}`)
    }
    const downloadUrl = new URL(`${packageChannelData.source_url}`).href
    spec.revision = packageChannelData.version
    request.url = spec.toUrl()
    super.handle(request)
    const file = this.createTempFile(request)
    const dir = this.createTempDir(request)
    await this._downloadPackage(downloadUrl, file.name)
    await this.decompress(file.name, dir.name)
    const hashes = await this.computeHashes(file.name)
    const fetchResult = new FetchResult(request.url)
    fetchResult.document = {
      location: dir.name,
      registryData: { channelData: packageChannelData, downloadUrl },
      // NOTE(review): passes the timestamp to Date as-is, i.e. assumes epoch milliseconds — confirm.
      releaseDate: new Date(packageChannelData.timestamp || 0).toISOString(),
      declaredLicenses: packageChannelData.license,
      hashes
    }
    fetchResult.casedSpec = clone(spec)
    request.fetchResult = fetchResult.adoptCleanup(dir, request)
    return request
  }

  /**
   * Collect the repodata entries that match a package name and, when given, an exact
   * version and a build-string prefix.
   *
   * @param {string} name - package name
   * @param {string} [version] - exact version to match; any version when empty
   * @param {string} [buildVersion] - required prefix of the build string; any build when empty
   * @param {object} repoData - parsed repodata.json
   * @returns {{packageFile: string, packageData: object}[]} matches from the `packages` and
   *   `packages.conda` sections, ordered by descending timestamp (missing timestamps count as 0)
   */
  _matchPackage(name, version, buildVersion, repoData) {
    const isMatch = ([, packageData]) =>
      packageData.name === name &&
      (!version || version === packageData.version) &&
      (!buildVersion || packageData.build.startsWith(buildVersion))
    const packageRepoEntries = []
    for (const section of ['packages', 'packages.conda']) {
      if (!repoData[section]) continue
      for (const [packageFile, packageData] of Object.entries(repoData[section]).filter(isMatch)) {
        packageRepoEntries.push({ packageFile, packageData })
      }
    }
    packageRepoEntries.sort((a, b) => (b.packageData.timestamp || 0) - (a.packageData.timestamp || 0))
    return packageRepoEntries
  }

  /**
   * Download and unpack the best matching binary package for the requested coordinates.
   *
   * @param {object} spec - entity spec; `namespace` and `revision` are overwritten with the resolved values
   * @param {object} request - crawler request
   * @param {string} [version] - requested version
   * @param {string} [buildVersion] - requested build-string prefix
   * @param {string} [architecture] - requested subdir; empty or '-' means "pick one"
   * @param {object} packageChannelData - the package's entry in channeldata.json
   * @returns {Promise<object>} the request, or the result of `request.markSkip(...)`
   */
  async _downloadCondaPackage(spec, request, version, buildVersion, architecture, packageChannelData) {
    const subdirs = packageChannelData.subdirs || []
    let resolvedArchitecture = architecture
    // Parenthesised on purpose: a default is only chosen when there is something to choose from.
    if ((!resolvedArchitecture || resolvedArchitecture === '-') && subdirs.length > 0) {
      // prefer no-arch if available
      resolvedArchitecture = subdirs.includes('noarch') ? 'noarch' : subdirs[0]
      this.logger.info(
        `No binary architecture specified for ${spec.name}, using architecture: ${resolvedArchitecture}`
      )
    }
    if (!subdirs.includes(resolvedArchitecture)) {
      return request.markSkip(`Missing architecture ${resolvedArchitecture} for package ${spec.name} in channel`)
    }
    const repoData = await this.getRepoData(this.channels[spec.provider], spec.provider, resolvedArchitecture)
    if (!repoData) {
      return request.markSkip(
        `failed to fetch and parse repodata json file for channel ${spec.provider} in architecture ${resolvedArchitecture}`
      )
    }
    const packageRepoEntries = this._matchPackage(spec.name, version, buildVersion, repoData)
    if (packageRepoEntries.length === 0) {
      return request.markSkip(
        `Missing package with matching spec (version: ${version}, buildVersion: ${buildVersion}) in ${resolvedArchitecture} repository`
      )
    }
    // Entries are sorted newest first, so the head is the most recent match.
    const packageRepoEntry = packageRepoEntries[0]
    const downloadUrl = new URL(
      `${this.channels[spec.provider]}/${resolvedArchitecture}/${packageRepoEntry.packageFile}`
    ).href
    spec.namespace = resolvedArchitecture
    spec.revision = packageRepoEntry.packageData.version + '-' + packageRepoEntry.packageData.build
    request.url = spec.toUrl()
    super.handle(request)
    const file = this.createTempFile(request)
    const dir = this.createTempDir(request)
    await this._downloadPackage(downloadUrl, file.name)
    await this.decompress(file.name, dir.name)
    const hashes = await this.computeHashes(file.name)
    const fetchResult = new FetchResult(request.url)
    fetchResult.document = {
      location: dir.name,
      registryData: { channelData: packageChannelData, repoData: packageRepoEntry, downloadUrl },
      // NOTE(review): passes the timestamp to Date as-is, i.e. assumes epoch milliseconds — confirm.
      releaseDate: new Date(packageRepoEntry.packageData.timestamp || 0).toISOString(),
      declaredLicenses: packageRepoEntry.packageData.license,
      hashes
    }
    fetchResult.casedSpec = clone(spec)
    request.fetchResult = fetchResult.adoptCleanup(dir, request)
    return request
  }

  /**
   * Stream `sourceUrl` into the file at `destination`.
   *
   * Resolves only once the response was reported with status 200 AND the file has been
   * flushed, whichever happens last. Rejects on a request error, a non-200 status or a
   * write error; the write stream is destroyed in that case.
   *
   * @param {string} sourceUrl - URL to download
   * @param {string} destination - path of the file to write
   * @returns {Promise<void>}
   */
  _downloadToFile(sourceUrl, destination) {
    return new Promise((resolve, reject) => {
      const output = fs.createWriteStream(destination)
      let responseOk = false
      let written = false
      let settled = false
      const fail = error => {
        if (settled) return
        settled = true
        output.destroy()
        reject(error)
      }
      const succeedWhenComplete = () => {
        if (settled || !responseOk || !written) return
        settled = true
        resolve()
      }
      output.on('error', fail)
      output.on('finish', () => {
        written = true
        succeedWhenComplete()
      })
      nodeRequest
        .get({ url: sourceUrl, headers: this.headers }, (error, response) => {
          if (error) return fail(error)
          if (response.statusCode !== 200) return fail(new Error(`${response.statusCode} ${response.statusMessage}`))
          responseOk = true
          succeedWhenComplete()
        })
        .pipe(output)
    })
  }

  /**
   * Download a package archive.
   *
   * @param {string} downloadUrl - URL of the archive
   * @param {string} destination - path of the file to write
   * @returns {Promise<void>} rejects on a request error or a non-200 status
   */
  async _downloadPackage(downloadUrl, destination) {
    return this._downloadToFile(downloadUrl, destination)
  }

  /**
   * Download `sourceUrl` to `fileDstLocation` unless `cacheKey` is still marked as cached.
   * The marker is only set after a successful download, so a failed one is retried next time.
   *
   * @param {string} cacheKey - key of the in-memory "already downloaded" marker
   * @param {string} sourceUrl - URL to download
   * @param {number} cacheDuration - lifetime of the marker in milliseconds
   * @param {string} fileDstLocation - path of the file to write
   * @returns {Promise<void>}
   */
  async _cachedDownload(cacheKey, sourceUrl, cacheDuration, fileDstLocation) {
    if (memCache.get(cacheKey)) return
    await this._downloadToFile(sourceUrl, fileDstLocation)
    memCache.put(cacheKey, true, cacheDuration)
    this.logger.info(`Conda: retrieved ${sourceUrl}. Stored data file at ${fileDstLocation}`)
  }

  /**
   * Ensure the JSON file behind `url` is available locally and return its parsed content.
   *
   * @param {string} cacheKey - key of the in-memory "already downloaded" marker
   * @param {string} url - URL of the JSON file
   * @param {number} cacheDuration - lifetime of the marker in milliseconds
   * @param {string} fileLocation - local path of the JSON file
   * @returns {Promise<object|null>} the parsed JSON, or null when it could not be
   *   downloaded, read or parsed
   */
  async _fetchCachedJSONFile(cacheKey, url, cacheDuration, fileLocation) {
    try {
      await this._cachedDownload(cacheKey, url, cacheDuration, fileLocation)
      return JSON.parse(fs.readFileSync(fileLocation, 'utf8'))
    } catch (error) {
      // Drop the marker so an unreadable or corrupt local file is downloaded again next time.
      memCache.del(cacheKey)
      this.logger.info(`Conda: failed to fetch and parse ${url}: ${error.message}`)
      return null
    }
  }

  /**
   * @param {string} condaChannelUrl - base URL of the channel
   * @param {string} condaChannelID - provider name, used for the cache key and file name
   * @returns {Promise<object|null>} parsed channeldata.json, or null on failure
   */
  async getChannelData(condaChannelUrl, condaChannelID) {
    return this._fetchCachedJSONFile(
      `${condaChannelID}-channelDataFile`,
      `${condaChannelUrl}/channeldata.json`,
      this.CACHE_DURATION,
      `${this.packageMapFolder}/${condaChannelID}-channelDataFile.json`
    )
  }

  /**
   * @param {string} condaChannelUrl - base URL of the channel
   * @param {string} condaChannelID - provider name, used for the cache key and file name
   * @param {string} architecture - channel subdir whose repodata.json is wanted
   * @returns {Promise<object|null>} parsed repodata.json, or null on failure
   */
  async getRepoData(condaChannelUrl, condaChannelID, architecture) {
    return this._fetchCachedJSONFile(
      `${condaChannelID}-repoDataFile-${architecture}`,
      `${condaChannelUrl}/${architecture}/repodata.json`,
      this.CACHE_DURATION,
      `${this.packageMapFolder}/${condaChannelID}-repoDataFile-${architecture}.json`
    )
  }
}

// Factory export: builds a CondaFetch instance from the given options.
module.exports = options => new CondaFetch(options)
4 changes: 3 additions & 1 deletion providers/fetch/dispatcher.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ class FetchDispatcher extends AbstractFetch {
if (!force && this.filter && !this.filter.shouldFetch(request)) return request
// get the right real fetcher for this request and dispatch
const handler = this._getHandler(request, this.fetchers)
if (!handler) throw new Error(`No fetcher found for ${request.toString()}`)
if (!handler) {
throw new Error(`No fetcher found for ${request.toString()}`)
}

await this._fetchResult(request, handler)
return request
Expand Down
3 changes: 3 additions & 0 deletions providers/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ module.exports = {
fetch: {
cdDispatch: require('./fetch/dispatcher'),
cocoapods: require('./fetch/podFetch'),
conda: require('./fetch/condaFetch'),
packagist: require('./fetch/packagistFetch'),
cratesio: require('./fetch/cratesioFetch'),
debian: require('./fetch/debianFetch'),
Expand All @@ -28,6 +29,8 @@ module.exports = {
process: {
cdsource: require('./process/sourceExtract'),
component: require('./process/component'),
conda: require('./process/condaExtract'),
condasrc: require('./process/condaSrcExtract'),
crate: require('./process/crateExtract'),
deb: require('./process/debExtract'),
debsrc: require('./process/debsrcExtract'),
Expand Down
63 changes: 63 additions & 0 deletions providers/process/condaExtract.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
const AbstractClearlyDefinedProcessor = require('./abstractClearlyDefinedProcessor')
const sourceDiscovery = require('../../lib/sourceDiscovery')
const { merge } = require('lodash')
const SourceSpec = require('../../lib/sourceSpec')

/**
 * Processor for fetched conda packages: copies the registry metadata into the
 * document, works out where the package's source lives and queues that source.
 */
class CondaExtract extends AbstractClearlyDefinedProcessor {
  /**
   * @param {object} options - processor options (`githubToken` is handed to the source finder)
   * @param {Function} sourceFinder - async (version, candidateUrls, options) => source info or falsy
   */
  constructor(options, sourceFinder) {
    super(options)
    this.sourceFinder = sourceFinder
  }

  get toolVersion() {
    return '0.0.1'
  }

  /**
   * @param {object} request - crawler request
   * @returns {boolean|*} truthy only when both the request and its spec are of type `conda`
   */
  canHandle(request) {
    const entitySpec = this.toSpec(request)
    if (request.type !== 'conda') return false
    return entitySpec && entitySpec.type === 'conda'
  }

  /**
   * @param {object} request - crawler request
   * @returns {Promise<object>} the same request
   */
  async handle(request) {
    if (this.isProcessing(request)) {
      await super.handle(request)
      const entitySpec = this.toSpec(request)
      const { releaseDate, registryData, declaredLicenses } = request.document
      const enriched = merge(this.clone(request.document), { releaseDate, registryData, declaredLicenses })
      request.document = enriched
      enriched.sourceInfo = await this._discoverSource(entitySpec, registryData)
    }
    this.addLocalToolTasks(request)
    const { sourceInfo } = request.document
    if (sourceInfo) {
      this.linkAndQueue(request, 'source', SourceSpec.fromObject(sourceInfo).toEntitySpec())
    }
    return request
  }

  /**
   * Ask the source finder to resolve one of the URLs listed in the channel data; when it
   * finds nothing, fall back to a `condasrc` spec for the same package.
   *
   * @param {object} spec - entity spec of the package being processed
   * @param {object} registryData - registry data holding `channelData` and `repoData`
   * @returns {Promise<object>} the discovered source, or the `condasrc` fallback spec
   */
  async _discoverSource(spec, registryData) {
    const channelInfo = registryData.channelData
    // Candidate URLs in the order they are offered to the finder; unset ones are dropped.
    const candidateUrls = [
      channelInfo.source_url,
      channelInfo.source_git_url,
      channelInfo.home,
      channelInfo.dev_url,
      channelInfo.doc_url,
      channelInfo.doc_source_url
    ].filter(Boolean)
    const discovered = await this.sourceFinder(registryData.repoData.packageData.version, candidateUrls, {
      githubToken: this.options.githubToken,
      logger: this.logger
    })
    if (discovered) return discovered
    const fallback = SourceSpec.fromObject(spec)
    fallback.type = 'condasrc'
    fallback.namespace = null
    // Only the part of the revision before the first '-' is kept for the source spec.
    fallback.revision = spec.revision.split('-')[0]
    return fallback
  }
}

// Factory export: builds a CondaExtract; `sourceDiscovery` is used when no (truthy) sourceFinder is passed.
module.exports = (options, sourceFinder) => new CondaExtract(options, sourceFinder || sourceDiscovery)
25 changes: 25 additions & 0 deletions providers/process/condaSrcExtract.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
const AbstractClearlyDefinedProcessor = require('./abstractClearlyDefinedProcessor')
const { merge } = require('lodash')

/**
 * Processor for fetched conda source archives (`condasrc`): copies the registry
 * metadata gathered at fetch time into the processed document.
 */
class CondaSrcExtract extends AbstractClearlyDefinedProcessor {
  get toolVersion() {
    return '0.0.1'
  }

  /**
   * @param {object} request - crawler request
   * @returns {boolean|*} truthy only for `clearlydefined` requests whose spec type is `condasrc`
   */
  canHandle(request) {
    const spec = this.toSpec(request)
    return request.type === 'clearlydefined' && spec && spec.type === 'condasrc'
  }

  /**
   * @param {object} request - crawler request
   * @returns {Promise<object>} the same request, so callers get a usable value instead of undefined
   */
  async handle(request) {
    await super.handle(request)
    const { releaseDate, registryData, declaredLicenses } = request.document
    request.document = merge(this.clone(request.document), { releaseDate, registryData, declaredLicenses })
    return request
  }
}

// Factory export: builds a CondaSrcExtract instance from the given options.
module.exports = (options) => new CondaSrcExtract(options)
2 changes: 1 addition & 1 deletion providers/process/package.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

const AbstractProcessor = require('./abstractProcessor')

const supportedTypes = ['npm', 'crate', 'maven', 'nuget', 'gem', 'go', 'pod', 'pypi', 'composer', 'deb']
const supportedTypes = ['npm', 'conda', 'crate', 'maven', 'nuget', 'gem', 'go', 'pod', 'pypi', 'composer', 'deb']

class PackageProcessor extends AbstractProcessor {
shouldFetch() {
Expand Down
2 changes: 1 addition & 1 deletion providers/process/source.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// SPDX-License-Identifier: MIT

const AbstractProcessor = require('./abstractProcessor')
const supportedTypes = ['git', 'sourcearchive', 'debsrc']
const supportedTypes = ['git', 'sourcearchive', 'debsrc', 'condasrc']

class SourceProcessor extends AbstractProcessor {
shouldFetch() {
Expand Down
Loading

0 comments on commit 5cfbdd5

Please sign in to comment.