Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

conda crawler implementation #532

Merged
merged 17 commits into from
May 2, 2024
5 changes: 5 additions & 0 deletions config/cdConfig.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ module.exports = {
fetched: { defaultTtlSeconds: fetchedCacheTtlSeconds }
},
cocoapods: { githubToken },
conda: {
cdFileLocation: cd_file.location
},
cratesio: {},
debian: { cdFileLocation: cd_file.location },
git: {},
Expand All @@ -50,6 +53,8 @@ module.exports = {
process: {
cdsource: {},
component: {},
conda: { githubToken },
condasrc: { githubToken },
lamarrr marked this conversation as resolved.
Show resolved Hide resolved
crate: { githubToken },
deb: {},
debsrc: {},
Expand Down
24 changes: 24 additions & 0 deletions config/map.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,26 @@ const npm = {
fossology
}

// Map entry for the 'conda' entity type. It groups the source, clearlydefined, licensee,
// reuse, scancode and fossology entries (bindings declared outside this excerpt) under
// the `_type` tag 'conda'.
// NOTE(review): presumably consumed the same way as the neighbouring npm/crate entries — confirm.
const conda = {
_type: 'conda',
source,
clearlydefined,
licensee,
reuse,
scancode,
fossology
}

// Map entry for the 'condasrc' entity type. It groups the source, clearlydefined, licensee,
// reuse, scancode and fossology entries (bindings declared outside this excerpt) under
// the `_type` tag 'condasrc'.
// NOTE(review): presumably consumed the same way as the neighbouring npm/crate entries — confirm.
const condasrc = {
_type: 'condasrc',
source,
clearlydefined,
licensee,
reuse,
scancode,
fossology
}

const crate = {
_type: 'crate',
source,
Expand Down Expand Up @@ -127,6 +147,8 @@ const gem = {
const _package = {
_type: 'package',
npm,
conda,
condasrc,
crate,
deb,
go,
Expand Down Expand Up @@ -156,6 +178,8 @@ const entities = {
licensee,
reuse,
npm,
conda,
condasrc,
crate,
deb,
go,
Expand Down
277 changes: 277 additions & 0 deletions providers/fetch/condaFetch.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
// Copyright (c) Microsoft Corporation and others. Licensed under the MIT license.
// SPDX-License-Identifier: MIT

const AbstractFetch = require('./abstractFetch')
const { clone } = require('lodash')
const fs = require('fs')
const memCache = require('memory-cache')
const nodeRequest = require('request')
const FetchResult = require('../../lib/fetchResult')

class CondaFetch extends AbstractFetch {
constructor(options) {
super(options)
this.packageMapFolder = this.options.cdFileLocation
this.channels = {
'anaconda-main': 'https://repo.anaconda.com/pkgs/main',
'anaconda-r': 'https://repo.anaconda.com/pkgs/r',
'conda-forge': 'https://conda.anaconda.org/conda-forge'
}
}

canHandle(request) {
const spec = this.toSpec(request)
return spec && this.channels[spec.provider]
}

// Coordinates: {type: conda|condasrc}/{provider: anaconda-main|anaconda-r|conda-forge}/{architecture | -}/{package name}/[{version | _}]-[{build version | _}]/[{tool version}]
// e.g. conda/conda-forge/linux-aarch64/numpy/1.13.0-py36/
//      conda/conda-forge/-/numpy/-py36/
//      conda/conda-forge/-/numpy/1.13.0-py36/
//      conda/conda-forge/linux-aarch64/numpy/_-py36/
//      conda/conda-forge/-/numpy/
//      conda/conda-forge/-/numpy/_-_
async handle(request) {
const spec = this.toSpec(request)
if (!this.channels[spec.provider]) {
return request.markSkip(`Unrecognized conda provider: ${spec.provider}, must be either of: ${Object.keys(this.channels)}`)
}
const channelData = await this.getChannelData(this.channels[spec.provider], spec.provider)

lamarrr marked this conversation as resolved.
Show resolved Hide resolved
if (!channelData) return this.markSkip(request)

let architecture = spec.namespace

let [version, buildVersion] = (spec.revision || '').split('-')

if (channelData.packages[spec.name] === undefined) {
return request.markSkip(`Missing package ${spec.name} in channel: ${spec.provider}`)
}

const packageChannelData = channelData.packages[spec.name]
if (spec.type !== 'conda' && spec.type !== 'condasrc') {
return request.markSkip('spec type must either be conda or condasrc')
}

// unless otherwise specified, we fetch the architecture package
if (spec.type !== 'conda' && packageChannelData.subdirs.length === 0) {
return request.markSkip('No architecture build in package channel data')
lamarrr marked this conversation as resolved.
Show resolved Hide resolved
}

if ((!architecture || architecture === '_') && spec.type === 'conda') {
lamarrr marked this conversation as resolved.
Show resolved Hide resolved
// prefer no-arch if available
architecture = packageChannelData.subdirs.includes('noarch') ? 'noarch' : packageChannelData.subdirs[0]
this.logger.info(`No binary architecture specified for ${spec.name}, using architecture: ${architecture}`)
}

lamarrr marked this conversation as resolved.
Show resolved Hide resolved
if (spec.type === 'condasrc') {
return this._downloadCondaSourcePackage(spec, request, version, packageChannelData)
} else {
return this._downloadCondaPackage(
spec,
request,
version,
buildVersion,
architecture,
packageChannelData
)
}
}

async _downloadCondaSourcePackage(spec, request, version, packageChannelData) {
if (version && version !== '_' && packageChannelData.version !== version) {
lamarrr marked this conversation as resolved.
Show resolved Hide resolved
return request.markSkip(`Missing source file version ${version} for package ${spec.name}`)
}
if (!packageChannelData.source_url) {
return request.markSkip(`Missing archive source file in repodata for package ${spec.name}`)
}
let downloadUrl = new URL(`${packageChannelData.source_url}`).href

spec.revision = packageChannelData.version
request.url = spec.toUrl()
super.handle(request)

const file = this.createTempFile(request)
const dir = this.createTempDir(request)

await this._downloadPackage(downloadUrl, file.name)
await this.decompress(file.name, dir.name)
const hashes = await this.computeHashes(file.name)

const fetchResult = new FetchResult(request.url)
fetchResult.document = {
location: dir.name,
registryData: { 'channelData': packageChannelData, downloadUrl },
releaseDate: new Date(packageChannelData.timestamp).toUTCString(),
declaredLicenses: packageChannelData.license,
lamarrr marked this conversation as resolved.
Show resolved Hide resolved
hashes
}

fetchResult.casedSpec = clone(spec)
request.fetchResult = fetchResult.adoptCleanup(dir, request)
return request
}

async _downloadCondaPackage(spec, request, version, buildVersion, architecture, packageChannelData) {
let repoData = undefined
if (!(packageChannelData.subdirs.find(x => x === architecture))) {
return request.markSkip(`Missing architecture ${architecture} for package ${spec.name} in channel`)
}
repoData = await this.getRepoData(this.channels[spec.provider], spec.provider, architecture)

if (!repoData) return this.markSkip(request)

let packageRepoEntries = []
lamarrr marked this conversation as resolved.
Show resolved Hide resolved
let packageMatches = ([, packageData]) => {
return packageData.name === spec.name && ((!version) || version === '_' || version === packageData.version)
&& ((!buildVersion) || buildVersion === '_' || packageData.build.startsWith(buildVersion))
}

if (repoData['packages']) {
packageRepoEntries = packageRepoEntries.concat(Object.entries(repoData['packages'])
.filter(packageMatches)
.map(([packageFile, packageData]) => { return { packageFile, packageData } }))
}

if (repoData['packages.conda']) {
packageRepoEntries = packageRepoEntries.concat(Object.entries(repoData['packages.conda'])
.filter(packageMatches)
.map(([packageFile, packageData]) => { return { packageFile, packageData } }))
}

packageRepoEntries.sort((a, b) => {
if (a.packageData.build < b.packageData.build) {
return 1
} else if (a.packageData.build === b.packageData.build) {
return 0
}
else {
return -1
}
})

let packageRepoEntry = packageRepoEntries[0]
if (!packageRepoEntry) {
return request.markSkip(`Missing package with matching spec (version: ${version}, buildVersion: ${buildVersion}) in ${architecture} repository`)
}

let downloadUrl = new URL(`${this.channels[spec.provider]}/${architecture}/${packageRepoEntry.packageFile}`).href

spec.namespace = architecture
spec.revision = packageRepoEntry.packageData.version + '-' + packageRepoEntry.packageData.build
request.url = spec.toUrl()
super.handle(request)

const file = this.createTempFile(request)
const dir = this.createTempDir(request)

await this._downloadPackage(downloadUrl, file.name)
await this.decompress(file.name, dir.name)
const hashes = await this.computeHashes(file.name)

const fetchResult = new FetchResult(request.url)
fetchResult.document = {
location: dir.name,
registryData: { 'channelData': packageChannelData, 'repoData': packageRepoEntry, downloadUrl },
releaseDate: new Date(packageRepoEntry.packageData.timestamp).toUTCString(),
lamarrr marked this conversation as resolved.
Show resolved Hide resolved
declaredLicenses: packageRepoEntry.packageData.license,
hashes
}

fetchResult.casedSpec = clone(spec)

request.fetchResult = fetchResult.adoptCleanup(dir, request)
return request
}

async _downloadPackage(downloadUrl, destination) {
return new Promise(
(resolve, reject) => {
const options = {
url: downloadUrl,
headers: {
'User-Agent': 'clearlydefined.io crawler (clearlydefined@outlook.com)'
}
}
nodeRequest.get(options, (error, response) => {
if (error) {
return reject(error)
}
if (response.statusCode !== 200) {
return reject(new Error(`${response.statusCode} ${response.statusMessage}`))
}
}).pipe(fs.createWriteStream(destination).on('finish', () =>
resolve()
))
}
)
}

async _cachedDownload(cacheKey, sourceUrl, cacheDuration, fileDstLocation) {
if (!memCache.get(cacheKey)) {
return new Promise(
(resolve, reject) => {
const options = {
url: sourceUrl,
headers: {
'User-Agent': 'clearlydefined.io crawler (clearlydefined@outlook.com)'
}
}
nodeRequest.get(options, (error, response) => {
if (error) {
return reject(error)
}
if (response.statusCode !== 200) {
return reject(new Error(`${response.statusCode} ${response.statusMessage}`))
}
}).pipe(fs.createWriteStream(fileDstLocation).on('finish', () => {
memCache.put(cacheKey, true, cacheDuration)
this.logger.info(
`Conda: retrieved ${sourceUrl}. Stored channel data file at ${fileDstLocation}`
)
return resolve()
}))
}
)
}
}


async getChannelData(condaChannelUrl, condaChannelID) {
// ~10MB file, needs to be cached
let channelDataFile = {
url: `${condaChannelUrl}/channeldata.json`,
cacheKey: `${condaChannelID}-channelDataFile`,
cacheDuration: 8 * 60 * 60 * 1000,// 8 hours
fileLocation: `${this.packageMapFolder}/${condaChannelID}-channelDataFile.json`
}
try {
await this._cachedDownload(channelDataFile.cacheKey, channelDataFile.url,
channelDataFile.cacheDuration, channelDataFile.fileLocation)
} catch (error) {
return null
}
let fileText = fs.readFileSync(channelDataFile.fileLocation)
return JSON.parse(fileText)
}

async getRepoData(condaChannelUrl, condaChannelID, architecture) {
// ~30MB file, needs to be cached
let repoFile = {
url: `${condaChannelUrl}/${architecture}/repodata.json`,
cacheKey: `${condaChannelID}-repoDataFile-${architecture}`,
cacheDuration: 8 * 60 * 60 * 1000,// 8 hours
fileLocation: `${this.packageMapFolder}/${condaChannelID}-repoDataFile-${architecture}.json`
}
try {
await this._cachedDownload(repoFile.cacheKey, repoFile.url,
repoFile.cacheDuration, repoFile.fileLocation)
} catch (error) {
return null
}
let fileText = fs.readFileSync(repoFile.fileLocation)
return JSON.parse(fileText)
}
}

// Factory export: called with the fetcher options, returns a new CondaFetch instance.
module.exports = options => new CondaFetch(options)
4 changes: 3 additions & 1 deletion providers/fetch/dispatcher.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ class FetchDispatcher extends AbstractFetch {
if (!force && this.filter && !this.filter.shouldFetch(request)) return request
// get the right real fetcher for this request and dispatch
const handler = this._getHandler(request, this.fetchers)
if (!handler) throw new Error(`No fetcher found for ${request.toString()}`)
if (!handler) {
throw new Error(`No fetcher found for ${request.toString()}`)
}

await this._fetchResult(request, handler)
return request
Expand Down
3 changes: 3 additions & 0 deletions providers/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ module.exports = {
fetch: {
cdDispatch: require('./fetch/dispatcher'),
cocoapods: require('./fetch/podFetch'),
conda: require('./fetch/condaFetch'),
packagist: require('./fetch/packagistFetch'),
cratesio: require('./fetch/cratesioFetch'),
debian: require('./fetch/debianFetch'),
Expand All @@ -28,6 +29,8 @@ module.exports = {
process: {
cdsource: require('./process/sourceExtract'),
component: require('./process/component'),
conda: require('./process/condaExtract'),
condasrc: require('./process/condaSrcExtract'),
crate: require('./process/crateExtract'),
deb: require('./process/debExtract'),
debsrc: require('./process/debsrcExtract'),
Expand Down
Loading