From 9decf1ae52873461e9006066b05ab0eb12c8a328 Mon Sep 17 00:00:00 2001 From: Christopher Angelo Phillips <32073428+spiffcs@users.noreply.github.com> Date: Fri, 8 Apr 2022 15:12:32 -0400 Subject: [PATCH] Add digest property to parent and nested java package metadata (#941) --- .../cyclonedxhelpers/external_references.go | 32 +++++++++++++++- .../common/spdxhelpers/external_refs.go | 1 + .../common/spdxhelpers/to_syft_model.go | 8 ++++ .../formats/spdx22json/to_format_model.go | 22 ++++++++++- .../formats/spdx22tagvalue/to_format_model.go | 21 +++++++++- syft/file/digest_cataloger.go | 23 +++++++---- syft/pkg/cataloger/java/archive_parser.go | 38 ++++++++++++++++--- .../pkg/cataloger/java/archive_parser_test.go | 6 ++- syft/pkg/java_metadata.go | 14 ++++--- 9 files changed, 141 insertions(+), 24 deletions(-) diff --git a/internal/formats/common/cyclonedxhelpers/external_references.go b/internal/formats/common/cyclonedxhelpers/external_references.go index 5cc75302166..e94a60111c6 100644 --- a/internal/formats/common/cyclonedxhelpers/external_references.go +++ b/internal/formats/common/cyclonedxhelpers/external_references.go @@ -4,12 +4,15 @@ import ( "fmt" "strings" + syftFile "github.com/anchore/syft/syft/file" + "github.com/CycloneDX/cyclonedx-go" "github.com/anchore/syft/syft/pkg" ) +//nolint:funlen, gocognit func encodeExternalReferences(p pkg.Package) *[]cyclonedx.ExternalReference { - refs := []cyclonedx.ExternalReference{} + var refs []cyclonedx.ExternalReference if hasMetadata(p) { switch metadata := p.Metadata.(type) { case pkg.ApkMetadata: @@ -46,6 +49,19 @@ func encodeExternalReferences(p pkg.Package) *[]cyclonedx.ExternalReference { Type: cyclonedx.ERTypeWebsite, }) } + case pkg.JavaMetadata: + if len(metadata.ArchiveDigests) > 0 { + for _, digest := range metadata.ArchiveDigests { + refs = append(refs, cyclonedx.ExternalReference{ + URL: "", + Type: cyclonedx.ERTypeBuildMeta, + Hashes: &[]cyclonedx.Hash{{ + Algorithm: cyclonedx.HashAlgorithm(digest.Algorithm), + Value: digest.Value, + }}, + }) + } + } case pkg.PythonPackageMetadata: if metadata.DirectURLOrigin != nil && metadata.DirectURLOrigin.URL != "" { ref := cyclonedx.ExternalReference{ @@ -79,6 +95,20 @@ func decodeExternalReferences(c *cyclonedx.Component, metadata interface{}) { meta.Homepage = refURL(c, cyclonedx.ERTypeWebsite) case *pkg.GemMetadata: meta.Homepage = refURL(c, cyclonedx.ERTypeWebsite) + case *pkg.JavaMetadata: + var digests []syftFile.Digest + if ref := findExternalRef(c, cyclonedx.ERTypeBuildMeta); ref != nil { + if ref.Hashes != nil { + for _, hash := range *ref.Hashes { + digests = append(digests, syftFile.Digest{ + Algorithm: string(hash.Algorithm), + Value: hash.Value, + }) + } + } + } + + meta.ArchiveDigests = digests case *pkg.PythonPackageMetadata: if meta.DirectURLOrigin == nil { meta.DirectURLOrigin = &pkg.PythonDirectURLOriginInfo{} diff --git a/internal/formats/common/spdxhelpers/external_refs.go b/internal/formats/common/spdxhelpers/external_refs.go index 384282cb038..d114c2ab279 100644 --- a/internal/formats/common/spdxhelpers/external_refs.go +++ b/internal/formats/common/spdxhelpers/external_refs.go @@ -22,5 +22,6 @@ func ExternalRefs(p pkg.Package) (externalRefs []ExternalRef) { ReferenceType: PurlExternalRefType, }) } + return externalRefs } diff --git a/internal/formats/common/spdxhelpers/to_syft_model.go b/internal/formats/common/spdxhelpers/to_syft_model.go index ade236089e9..a36eac3de24 100644 --- a/internal/formats/common/spdxhelpers/to_syft_model.go +++ b/internal/formats/common/spdxhelpers/to_syft_model.go @@ -309,6 +309,14 @@ func extractMetadata(p *spdx.Package2_2, info pkgInfo) (pkg.MetadataType, interf Architecture: arch, Maintainer: p.PackageOriginatorPerson, } + case pkg.JavaPkg: + var digests []file.Digest + for algorithm, value := range p.PackageChecksums { + digests = append(digests, file.Digest{Algorithm: string(algorithm), Value: value.Value}) + } + return pkg.JavaMetadataType, pkg.JavaMetadata{ + ArchiveDigests: digests, + } } return pkg.UnknownMetadataType, nil } diff --git a/internal/formats/spdx22json/to_format_model.go b/internal/formats/spdx22json/to_format_model.go index 761993b3ece..2e1a0c65cff 100644 --- a/internal/formats/spdx22json/to_format_model.go +++ b/internal/formats/spdx22json/to_format_model.go @@ -55,14 +55,32 @@ func toPackages(catalog *pkg.Catalog, relationships []artifact.Relationship) []m for _, p := range catalog.Sorted() { license := spdxhelpers.License(p) packageSpdxID := model.ElementID(p.ID()).String() - + filesAnalyzed := false + + // we generate digest for some Java packages + // see page 33 of the spdx specification for 2.2 + // spdx.github.io/spdx-spec/package-information/#710-package-checksum-field + var checksums []model.Checksum + if p.MetadataType == pkg.JavaMetadataType { + javaMetadata := p.Metadata.(pkg.JavaMetadata) + if len(javaMetadata.ArchiveDigests) > 0 { + filesAnalyzed = true + for _, digest := range javaMetadata.ArchiveDigests { + checksums = append(checksums, model.Checksum{ + Algorithm: digest.Algorithm, + ChecksumValue: digest.Value, + }) + } + } + } // note: the license concluded and declared should be the same since we are collecting license information // from the project data itself (the installed package files). packages = append(packages, model.Package{ + Checksums: checksums, Description: spdxhelpers.Description(p), DownloadLocation: spdxhelpers.DownloadLocation(p), ExternalRefs: spdxhelpers.ExternalRefs(p), - FilesAnalyzed: false, + FilesAnalyzed: filesAnalyzed, HasFiles: fileIDsForPackage(packageSpdxID, relationships), Homepage: spdxhelpers.Homepage(p), // The Declared License is what the authors of a project believe govern the package diff --git a/internal/formats/spdx22tagvalue/to_format_model.go b/internal/formats/spdx22tagvalue/to_format_model.go index a52aba13b1b..2f0d84ba726 100644 --- a/internal/formats/spdx22tagvalue/to_format_model.go +++ b/internal/formats/spdx22tagvalue/to_format_model.go @@ -103,6 +103,24 @@ func toFormatPackages(catalog *pkg.Catalog) map[spdx.ElementID]*spdx.Package2_2 // the Comments on License field (section 3.16) is preferred. license := spdxhelpers.License(p) + filesAnalyzed := false + checksums := make(map[spdx.ChecksumAlgorithm]spdx.Checksum) + + // If the pkg type is Java we have attempted to generated a digest + // FilesAnalyzed should be true in this case + if p.MetadataType == pkg.JavaMetadataType { + javaMetadata := p.Metadata.(pkg.JavaMetadata) + if len(javaMetadata.ArchiveDigests) > 0 { + filesAnalyzed = true + for _, digest := range javaMetadata.ArchiveDigests { + checksums[spdx.ChecksumAlgorithm(digest.Algorithm)] = spdx.Checksum{ + Algorithm: spdx.ChecksumAlgorithm(digest.Algorithm), + Value: digest.Value, + } + } + } + } + results[spdx.ElementID(id)] = &spdx.Package2_2{ // NOT PART OF SPEC @@ -159,7 +177,7 @@ func toFormatPackages(catalog *pkg.Catalog) map[spdx.ElementID]*spdx.Package2_2 // Intent: A package can refer to a project, product, artifact, distribution or a component that is // external to the SPDX document. - FilesAnalyzed: false, + FilesAnalyzed: filesAnalyzed, // NOT PART OF SPEC: did FilesAnalyzed tag appear? IsFilesAnalyzedTagPresent: true, @@ -180,6 +198,7 @@ func toFormatPackages(catalog *pkg.Catalog) map[spdx.ElementID]*spdx.Package2_2 // to determine if any file in the original package has been changed. If the SPDX file is to be included // in a package, this value should not be calculated. The SHA-1 algorithm will be used to provide the // checksum by default. + PackageChecksums: checksums, // note: based on the purpose above no discovered checksums should be provided, but instead, only // tool-derived checksums. diff --git a/syft/file/digest_cataloger.go b/syft/file/digest_cataloger.go index cb80f1a85a4..edc096c16b1 100644 --- a/syft/file/digest_cataloger.go +++ b/syft/file/digest_cataloger.go @@ -77,30 +77,39 @@ func (i *DigestsCataloger) catalogLocation(resolver source.FileResolver, locatio } defer internal.CloseAndLogError(contentReader, location.VirtualPath) + digests, err := DigestsFromFile(contentReader, i.hashes) + if err != nil { + return nil, internal.ErrPath{Context: "digests-cataloger", Path: location.RealPath, Err: err} + } + + return digests, nil +} + +func DigestsFromFile(closer io.ReadCloser, hashes []crypto.Hash) ([]Digest, error) { // create a set of hasher objects tied together with a single writer to feed content into - hashers := make([]hash.Hash, len(i.hashes)) - writers := make([]io.Writer, len(i.hashes)) - for idx, hashObj := range i.hashes { + hashers := make([]hash.Hash, len(hashes)) + writers := make([]io.Writer, len(hashes)) + for idx, hashObj := range hashes { hashers[idx] = hashObj.New() writers[idx] = hashers[idx] } - size, err := io.Copy(io.MultiWriter(writers...), contentReader) + size, err := io.Copy(io.MultiWriter(writers...), closer) if err != nil { - return nil, internal.ErrPath{Context: "digests-cataloger", Path: location.RealPath, Err: err} + return nil, err } if size == 0 { return make([]Digest, 0), nil } - result := make([]Digest, len(i.hashes)) + result := make([]Digest, len(hashes)) // only capture digests when there is content. It is important to do this based on SIZE and not // FILE TYPE. The reasoning is that it is possible for a tar to be crafted with a header-only // file type but a body is still allowed. for idx, hasher := range hashers { result[idx] = Digest{ - Algorithm: DigestAlgorithmName(i.hashes[idx]), + Algorithm: DigestAlgorithmName(hashes[idx]), Value: fmt.Sprintf("%+x", hasher.Sum(nil)), } } diff --git a/syft/pkg/cataloger/java/archive_parser.go b/syft/pkg/cataloger/java/archive_parser.go index b96f8f1827d..c49b591ee7d 100644 --- a/syft/pkg/cataloger/java/archive_parser.go +++ b/syft/pkg/cataloger/java/archive_parser.go @@ -1,14 +1,17 @@ package java import ( + "crypto" "fmt" "io" + "os" "path" "strings" "github.com/anchore/syft/internal/file" "github.com/anchore/syft/internal/log" "github.com/anchore/syft/syft/artifact" + syftFile "github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/pkg" "github.com/anchore/syft/syft/pkg/cataloger/common" ) @@ -34,6 +37,11 @@ var archiveFormatGlobs = []string{ // project that we can build in CI feel free to include it } +// javaArchiveHashes are all the current hash algorithms used to calculate archive digests +var javaArchiveHashes = []crypto.Hash{ + crypto.SHA1, +} + type archiveParser struct { fileManifest file.ZipFileManifest virtualPath string @@ -101,6 +109,7 @@ func (j *archiveParser) parse() ([]*pkg.Package, []artifact.Relationship, error) } // find aux packages from pom.properties/pom.xml and potentially modify the existing parentPkg + // NOTE: we cannot generate sha1 digests from packages discovered via pom.properties/pom.xml auxPkgs, err := j.discoverPkgsFromAllMavenFiles(parentPkg) if err != nil { return nil, nil, err @@ -135,6 +144,7 @@ func (j *archiveParser) parse() ([]*pkg.Package, []artifact.Relationship, error) // discoverMainPackage parses the root Java manifest used as the parent package to all discovered nested packages. func (j *archiveParser) discoverMainPackage() (*pkg.Package, error) { // search and parse java manifest files + // TODO: do we want to prefer or check for pom files over manifest here? manifestMatches := j.fileManifest.GlobMatch(manifestGlob) if len(manifestMatches) > 1 { return nil, fmt.Errorf("found multiple manifests in the jar: %+v", manifestMatches) @@ -157,6 +167,18 @@ func (j *archiveParser) discoverMainPackage() (*pkg.Package, error) { return nil, nil } + archiveCloser, err := os.Open(j.archivePath) + if err != nil { + return nil, fmt.Errorf("unable to open archive path (%s): %w", j.archivePath, err) + } + defer archiveCloser.Close() + + // grab and assign digest for the entire archive + digests, err := syftFile.DigestsFromFile(archiveCloser, javaArchiveHashes) + if err != nil { + log.Warnf("failed to create digest for file=%q: %+v", j.archivePath, err) + } + return &pkg.Package{ Name: selectName(manifest, j.fileInfo), Version: selectVersion(manifest, j.fileInfo), @@ -164,8 +186,9 @@ func (j *archiveParser) discoverMainPackage() (*pkg.Package, error) { Type: j.fileInfo.pkgType(), MetadataType: pkg.JavaMetadataType, Metadata: pkg.JavaMetadata{ - VirtualPath: j.virtualPath, - Manifest: manifest, + VirtualPath: j.virtualPath, + Manifest: manifest, + ArchiveDigests: digests, }, }, nil } @@ -181,12 +204,14 @@ func (j *archiveParser) discoverPkgsFromAllMavenFiles(parentPkg *pkg.Package) ([ var pkgs []*pkg.Package - properties, err := pomPropertiesByParentPath(j.archivePath, j.fileManifest.GlobMatch(pomPropertiesGlob), j.virtualPath) + // pom.properties + properties, err := pomPropertiesByParentPath(j.archivePath, j.virtualPath, j.fileManifest.GlobMatch(pomPropertiesGlob)) if err != nil { return nil, err } - projects, err := pomProjectByParentPath(j.archivePath, j.fileManifest.GlobMatch(pomXMLGlob), j.virtualPath) + // pom.xml + projects, err := pomProjectByParentPath(j.archivePath, j.virtualPath, j.fileManifest.GlobMatch(pomXMLGlob)) if err != nil { return nil, err } @@ -273,7 +298,7 @@ func discoverPkgsFromOpener(virtualPath, pathWithinArchive string, archiveOpener return nestedPkgs, nestedRelationships, nil } -func pomPropertiesByParentPath(archivePath string, extractPaths []string, virtualPath string) (map[string]pkg.PomProperties, error) { +func pomPropertiesByParentPath(archivePath, virtualPath string, extractPaths []string) (map[string]pkg.PomProperties, error) { contentsOfMavenPropertiesFiles, err := file.ContentsFromZip(archivePath, extractPaths...) if err != nil { return nil, fmt.Errorf("unable to extract maven files: %w", err) @@ -298,10 +323,11 @@ func pomPropertiesByParentPath(archivePath string, extractPaths []string, virtua propertiesByParentPath[path.Dir(filePath)] = *pomProperties } + return propertiesByParentPath, nil } -func pomProjectByParentPath(archivePath string, extractPaths []string, virtualPath string) (map[string]pkg.PomProject, error) { +func pomProjectByParentPath(archivePath, virtualPath string, extractPaths []string) (map[string]pkg.PomProject, error) { contentsOfMavenProjectFiles, err := file.ContentsFromZip(archivePath, extractPaths...) if err != nil { return nil, fmt.Errorf("unable to extract maven files: %w", err) diff --git a/syft/pkg/cataloger/java/archive_parser_test.go b/syft/pkg/cataloger/java/archive_parser_test.go index 748bc10cce1..7af900c99c3 100644 --- a/syft/pkg/cataloger/java/archive_parser_test.go +++ b/syft/pkg/cataloger/java/archive_parser_test.go @@ -283,6 +283,11 @@ func TestParseJar(t *testing.T) { metadata := a.Metadata.(pkg.JavaMetadata) metadata.Parent = nil + // redact Digest which is computed differently between CI and local + if len(metadata.ArchiveDigests) > 0 { + metadata.ArchiveDigests = nil + } + // ignore select fields (only works for the main section) for _, field := range test.ignoreExtras { if metadata.Manifest != nil && metadata.Manifest.Main != nil { @@ -567,7 +572,6 @@ func TestParseNestedJar(t *testing.T) { } } } - } }) } diff --git a/syft/pkg/java_metadata.go b/syft/pkg/java_metadata.go index baa169a359e..4e606a03d3f 100644 --- a/syft/pkg/java_metadata.go +++ b/syft/pkg/java_metadata.go @@ -3,6 +3,7 @@ package pkg import ( "strings" + "github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/linux" "github.com/anchore/syft/internal" @@ -20,12 +21,13 @@ var jenkinsPluginPomPropertiesGroupIDs = []string{ // JavaMetadata encapsulates all Java ecosystem metadata for a package as well as an (optional) parent relationship. type JavaMetadata struct { - VirtualPath string `json:"virtualPath" cyclonedx:"virtualPath"` // we need to include the virtual path in cyclonedx documents to prevent deduplication of jars within jars - Manifest *JavaManifest `mapstructure:"Manifest" json:"manifest,omitempty"` - PomProperties *PomProperties `mapstructure:"PomProperties" json:"pomProperties,omitempty" cyclonedx:"-"` - PomProject *PomProject `mapstructure:"PomProject" json:"pomProject,omitempty"` - PURL string `hash:"ignore" json:"-"` // pURLs and CPEs are ignored for package IDs - Parent *Package `hash:"ignore" json:"-"` // note: the parent cannot be included in the minimal definition of uniqueness since this field is not reproducible in an encode-decode cycle (is lossy). + VirtualPath string `json:"virtualPath" cyclonedx:"virtualPath"` // we need to include the virtual path in cyclonedx documents to prevent deduplication of jars within jars + Manifest *JavaManifest `mapstructure:"Manifest" json:"manifest,omitempty"` + PomProperties *PomProperties `mapstructure:"PomProperties" json:"pomProperties,omitempty" cyclonedx:"-"` + PomProject *PomProject `mapstructure:"PomProject" json:"pomProject,omitempty"` + ArchiveDigests []file.Digest `hash:"ignore" json:"digest,omitempty"` + PURL string `hash:"ignore" json:"-"` // pURLs and CPEs are ignored for package IDs + Parent *Package `hash:"ignore" json:"-"` // note: the parent cannot be included in the minimal definition of uniqueness since this field is not reproducible in an encode-decode cycle (is lossy). } // PomProperties represents the fields of interest extracted from a Java archive's pom.properties file.