Skip to content

Commit

Permalink
static analysis: collect basic information about archive file (ossf#993)
Browse files Browse the repository at this point in the history
* static analysis: collect basic information about archive file

Signed-off-by: Max Fisher <maxfisher@google.com>

* make AnalyzePackageFiles return `[]SingleResult` and fix bug picked up by test

Signed-off-by: Max Fisher <maxfisher@google.com>

* fix compile error, do archive analysis before extraction and add timing

Signed-off-by: Max Fisher <maxfisher@google.com>

---------

Signed-off-by: Max Fisher <maxfisher@google.com>
  • Loading branch information
maxfisher-g authored Jan 16, 2024
1 parent 5c77643 commit ae8550a
Show file tree
Hide file tree
Showing 6 changed files with 126 additions and 51 deletions.
6 changes: 3 additions & 3 deletions internal/staticanalysis/analyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ If staticanalysis.Parsing is not in the list of analysisTasks, jsParserConfig ma
If an error occurs while traversing the extracted package directory tree, or an invalid
task is requested, a nil result is returned along with the corresponding error object.
*/
func AnalyzePackageFiles(ctx context.Context, extractDir string, jsParserConfig parsing.ParserConfig, analysisTasks []Task) (*Result, error) {
func AnalyzePackageFiles(ctx context.Context, extractDir string, jsParserConfig parsing.ParserConfig, analysisTasks []Task) ([]SingleResult, error) {
runTask := map[Task]bool{}

for _, task := range analysisTasks {
Expand Down Expand Up @@ -89,7 +89,7 @@ func AnalyzePackageFiles(ctx context.Context, extractDir string, jsParserConfig

if runTask[Basic] {
slog.InfoContext(ctx, "run basic analysis")
basicData, err := basicdata.Analyze(ctx, paths, getPathInArchive)
basicData, err := basicdata.Analyze(ctx, paths, basicdata.FormatPaths(getPathInArchive))
if err != nil {
slog.ErrorContext(ctx, "static analysis basic data error", "error", err)
} else if len(basicData) != len(fileResults) {
Expand Down Expand Up @@ -133,5 +133,5 @@ func AnalyzePackageFiles(ctx context.Context, extractDir string, jsParserConfig
}
}

return &Result{Files: fileResults}, nil
return fileResults, nil
}
16 changes: 7 additions & 9 deletions internal/staticanalysis/analyze_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,10 @@ var helloWorldJs = testFile{
lineLengths: valuecounts.Count([]int{18}),
}

func makeDesiredResult(files ...testFile) *Result {
result := Result{
Files: []SingleResult{},
}
for _, file := range files {
result.Files = append(result.Files, SingleResult{
func makeDesiredResult(files ...testFile) []SingleResult {
result := make([]SingleResult, len(files))
for index, file := range files {
result[index] = SingleResult{
Filename: file.filename,
Basic: &basicdata.FileData{
DetectedType: file.fileType,
Expand Down Expand Up @@ -67,17 +65,17 @@ func makeDesiredResult(files ...testFile) *Result {
IPAddresses: []string{},
URLs: []string{},
},
})
}
}

return &result
return result
}

func TestAnalyzePackageFiles(t *testing.T) {
tests := []struct {
name string
files []testFile
want *Result
want []SingleResult
wantErr bool
}{
{
Expand Down
81 changes: 65 additions & 16 deletions internal/staticanalysis/basicdata/basic_data.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,32 +39,79 @@ func (bd FileData) String() string {
return strings.Join(parts, "\n")
}

// Option allows controlling the behaviour of Analyze with non-required arguments.
// Values are created with the constructor functions in this package
// (SkipLineLengths, FormatPaths); the interface method is unexported, so
// other packages cannot define their own options.
type Option interface{ set(*analyzeConfig) }

// option implements Option by wrapping a function that mutates an analyzeConfig.
type option func(*analyzeConfig)

// set applies the wrapped function to config.
func (o option) set(config *analyzeConfig) { o(config) }

// analyzeConfig stores all behaviour configuration for Analyze which is adjustable by Option.
type analyzeConfig struct {
	// withLineLengths enables line length analysis.
	withLineLengths bool
	// formatPathFunc allows providing a custom transformation for file paths
	// when logging errors. For example, removing a common path prefix.
	// It is expected to be non-nil: the default config sets an identity
	// function and FormatPaths never stores nil.
	formatPathFunc func(absPath string) string
}

// getDefaultAnalyzeConfig returns the configuration used when no Option is
// supplied: line lengths are collected and paths are logged unmodified.
func getDefaultAnalyzeConfig() analyzeConfig {
	return analyzeConfig{
		withLineLengths: true,
		formatPathFunc:  func(absPath string) string { return absPath },
	}
}

// SkipLineLengths disables collecting line length information during analysis, which is
// useful when the input files are known not to be text files (e.g. a package tarball).
func SkipLineLengths() Option {
	return option(func(config *analyzeConfig) {
		config.withLineLengths = false
	})
}

// FormatPaths uses the given function to transform absolute file paths
// before they are passed to logging.
//
// A nil formatPathFunc is ignored and the previously configured formatter
// (by default, the identity function) stays in effect. Without this guard a
// nil function would be stored and then called for every analyzed file,
// causing a panic.
func FormatPaths(formatPathFunc func(absPath string) string) Option {
	return option(func(config *analyzeConfig) {
		if formatPathFunc == nil {
			return
		}
		config.formatPathFunc = formatPathFunc
	})
}

/*
Analyze collects basic file information for the specified files. Errors are logged
rather than returned where possible, to maximise the amount of data collected.
pathInArchive should return the relative path in the package archive, given an absolute
path to a file in the package. The relative path is used for the result data.
Pass instances of Option to control which information is collected.
*/
func Analyze(ctx context.Context, paths []string, pathInArchive func(absolutePath string) string) ([]FileData, error) {
func Analyze(ctx context.Context, paths []string, options ...Option) ([]FileData, error) {
if len(paths) == 0 {
return []FileData{}, nil
}

detectedTypes, err := detectFileTypes(ctx, paths)
haveDetectedTypes := true
config := getDefaultAnalyzeConfig()
for _, o := range options {
o.set(&config)
}

var detectedTypes []string
var haveDetectedTypes bool
types, err := detectFileTypes(ctx, paths)
haveDetectedTypes = true
if err != nil {
slog.ErrorContext(ctx, "failed to run file type detection", "error", err)
haveDetectedTypes = false
}
if len(detectedTypes) != len(paths) {
if len(types) != len(paths) {
slog.ErrorContext(ctx, fmt.Sprintf("detectFileTypes() returned %d results, expecting %d", len(detectedTypes), len(paths)))
haveDetectedTypes = false
}
detectedTypes = types

var result []FileData
result := make([]FileData, len(paths))

for index, filePath := range paths {
archivePath := pathInArchive(filePath)
formattedPath := config.formatPathFunc(filePath)
detectedType := ""
if haveDetectedTypes {
detectedType = detectedTypes[index]
Expand All @@ -73,31 +120,33 @@ func Analyze(ctx context.Context, paths []string, pathInArchive func(absolutePat
var fileSize int64
if fileInfo, err := os.Stat(filePath); err != nil {
fileSize = -1 // error value
slog.ErrorContext(ctx, "Error during stat file", "path", archivePath, "error", err)
slog.ErrorContext(ctx, "Error during stat file", "file", formattedPath, "error", err)
} else {
fileSize = fileInfo.Size()
}

var sha265Sum string
if hash, err := utils.SHA256Hash(filePath); err != nil {
slog.ErrorContext(ctx, "Error hashing file", "path", archivePath, "error", err)
slog.ErrorContext(ctx, "Error hashing file", "file", formattedPath, "error", err)
} else {
sha265Sum = hash
}

var lineLengths valuecounts.ValueCounts
if ll, err := linelengths.GetLineLengths(filePath, ""); err != nil {
slog.ErrorContext(ctx, "Error counting line lengths", "path", archivePath, "error", err)
} else {
lineLengths = valuecounts.Count(ll)
if config.withLineLengths {
if ll, err := linelengths.GetLineLengths(filePath, ""); err != nil {
slog.ErrorContext(ctx, "Error counting line lengths", "file", formattedPath, "error", err)
} else {
lineLengths = valuecounts.Count(ll)
}
}

result = append(result, FileData{
result[index] = FileData{
DetectedType: detectedType,
Size: fileSize,
SHA256: sha265Sum,
LineLengths: lineLengths,
})
}
}

return result, nil
Expand Down
7 changes: 1 addition & 6 deletions internal/staticanalysis/basicdata/basic_data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"os"
"path/filepath"
"reflect"
"strings"
"testing"

"github.com/ossf/package-analysis/internal/utils"
Expand Down Expand Up @@ -72,11 +71,7 @@ func TestGetBasicData(t *testing.T) {
}
}

getArchivePath := func(absolutePath string) string {
return strings.TrimPrefix(absolutePath, testDir+string(os.PathSeparator))
}

got, err := Analyze(context.Background(), paths, getArchivePath)
got, err := Analyze(context.Background(), paths)
if (err != nil) != tt.wantErr {
t.Errorf("detectFileTypes() error = %v, wantErr %v", err, tt.wantErr)
return
Expand Down
15 changes: 13 additions & 2 deletions internal/staticanalysis/result.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,19 @@ import (
// Result (staticanalysis.Result) is the top-level internal data structure
// that stores all data produced by static analysis performed on a package artifact.
type Result struct {
ArchiveSHA256 string
Files []SingleResult
Archive ArchiveResult
Files []SingleResult
}

// ArchiveResult stores basic information about the package archive file
// itself (its detected type, size and checksum), as distinct from the
// per-file results for the archive's contents.
type ArchiveResult struct {
// DetectedType records the output of the `file` command run on the archive.
DetectedType string

// Size records the (compressed) size of the archive (as reported by the filesystem).
Size int64

// SHA256 records the SHA256 hashsum of the archive.
SHA256 string
}

/*
Expand Down
52 changes: 37 additions & 15 deletions sandboxes/staticanalysis/staticanalyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/ossf/package-analysis/internal/log"
"github.com/ossf/package-analysis/internal/pkgmanager"
"github.com/ossf/package-analysis/internal/staticanalysis"
"github.com/ossf/package-analysis/internal/staticanalysis/basicdata"
"github.com/ossf/package-analysis/internal/staticanalysis/parsing"
"github.com/ossf/package-analysis/internal/utils"
"github.com/ossf/package-analysis/internal/worker"
Expand Down Expand Up @@ -151,7 +152,8 @@ func run() (err error) {
}
defer workDirs.cleanup(ctx)

startExtractionTime := time.Now()
startDownloadTime := time.Now()

var archivePath string
if *localFile != "" {
archivePath = *localFile
Expand All @@ -162,6 +164,32 @@ func run() (err error) {
}
}

downloadTime := time.Since(startDownloadTime)

results := staticanalysis.Result{}

startArchiveAnalysisTime := time.Now()
archiveResult, err := basicdata.Analyze(ctx, []string{archivePath},
basicdata.SkipLineLengths(),
basicdata.FormatPaths(func(absPath string) string { return "/" }),
)
if err != nil {
slog.WarnContext(ctx, "failed to analyze archive file", "error", err)
} else if len(archiveResult) != 1 {
slog.WarnContext(ctx, "archive file analysis: unexpected number of results", "len", len(archiveResult))
} else {
archiveInfo := archiveResult[0]
results.Archive = staticanalysis.ArchiveResult{
DetectedType: archiveInfo.DetectedType,
Size: archiveInfo.Size,
SHA256: archiveInfo.SHA256,
}
}

archiveAnalysisTime := time.Since(startArchiveAnalysisTime)

startExtractionTime := time.Now()

if err := manager.ExtractArchive(archivePath, workDirs.extractDir); err != nil {
return fmt.Errorf("archive extraction failed: %w", err)
}
Expand All @@ -174,20 +202,13 @@ func run() (err error) {
}

startAnalysisTime := time.Now()
results, err := staticanalysis.AnalyzePackageFiles(ctx, workDirs.extractDir, jsParserConfig, analysisTasks)
analysisTime := time.Since(startAnalysisTime)
fileResults, err := staticanalysis.AnalyzePackageFiles(ctx, workDirs.extractDir, jsParserConfig, analysisTasks)
if err != nil {
return fmt.Errorf("static analysis error: %w", err)
}
results.Files = fileResults

startHashTime := time.Now()
archiveHash, err := utils.SHA256Hash(archivePath)
if err != nil {
slog.WarnContext(ctx, "failed to calculate archive checksum", "error", err)
}
results.ArchiveSHA256 = archiveHash
hashTime := time.Since(startHashTime)

analysisTime := time.Since(startAnalysisTime)
startWritingResultsTime := time.Now()

jsonResult, err := json.Marshal(results)
Expand Down Expand Up @@ -217,12 +238,13 @@ func run() (err error) {
writingResultsTime := time.Since(startWritingResultsTime)

totalTime := time.Since(startTime)
otherTime := totalTime - writingResultsTime - analysisTime - extractionTime - hashTime
otherTime := totalTime - writingResultsTime - analysisTime - extractionTime - archiveAnalysisTime - downloadTime

slog.InfoContext(ctx, "Execution times",
"download and extraction", extractionTime,
"analysis", analysisTime,
"sha256Hash calculation", hashTime,
"download", downloadTime,
"archive analysis", archiveAnalysisTime,
"archive extraction", extractionTime,
"file analysis", analysisTime,
"writing results", writingResultsTime,
"other", otherTime,
"total", totalTime)
Expand Down

0 comments on commit ae8550a

Please sign in to comment.