Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce indexed embedded CPE dictionary #1897

Merged
merged 5 commits into from
Jul 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/update-bootstrap-tools.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
workflow_dispatch:

env:
GO_VERSION: "1.19.x"
GO_VERSION: "1.20.x"
GO_STABLE_VERSION: true

jobs:
Expand Down
43 changes: 43 additions & 0 deletions .github/workflows/update-cpe-dictionary-index.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
name: PR to update CPE dictionary index
on:
schedule:
- cron: "0 1 * * 1" # every Monday at 01:00 UTC

workflow_dispatch:

env:
GO_VERSION: "1.20.x"
GO_STABLE_VERSION: true

jobs:
upgrade-cpe-dictionary-index:
runs-on: ubuntu-latest
if: github.repository == 'anchore/syft' # only run for main repo
steps:
- uses: actions/checkout@v3

- uses: actions/setup-go@v4
with:
go-version: ${{ env.GO_VERSION }}
stable: ${{ env.GO_STABLE_VERSION }}

- run: |
make generate-cpe-dictionary-index

- uses: tibdex/github-app-token@v1
id: generate-token
with:
app_id: ${{ secrets.TOKEN_APP_ID }}
private_key: ${{ secrets.TOKEN_APP_PRIVATE_KEY }}

- uses: peter-evans/create-pull-request@v5
with:
signoff: true
delete-branch: true
branch: auto/latest-cpe-dictionary-index
labels: dependencies
commit-message: "chore(deps): update CPE dictionary index"
title: "chore(deps): update CPE dictionary index"
body: |
Update CPE dictionary index based on the latest available CPE dictionary
token: ${{ steps.generate-token.outputs.token }}
2 changes: 1 addition & 1 deletion .github/workflows/update-stereoscope-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
workflow_dispatch:

env:
GO_VERSION: "1.19.x"
GO_VERSION: "1.20.x"
GO_STABLE_VERSION: true

jobs:
Expand Down
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ compare-test-rpm-package-install: $(TEMP_DIR) $(SNAPSHOT_DIR)
$(TEMP_DIR)


## Code generation targets #################################
## Code and data generation targets #################################

.PHONY: generate-json-schema
generate-json-schema: ## Generate a new json schema
Expand All @@ -309,6 +309,11 @@ generate-license-list: ## Generate an updated spdx license list
go generate ./internal/spdxlicense/...
gofmt -s -w ./internal/spdxlicense

.PHONY: generate-cpe-dictionary-index
generate-cpe-dictionary-index: ## Build the CPE index based off of the latest available CPE dictionary
$(call title,Building CPE index)
go generate ./syft/pkg/cataloger/common/cpe/dictionary


## Build-related targets #################################

Expand Down
9 changes: 8 additions & 1 deletion syft/pkg/cataloger/catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,14 @@ func runCataloger(cataloger pkg.Cataloger, resolver file.Resolver) (catalogerRes
for _, p := range packages {
// generate CPEs (note: this is excluded from package ID, so is safe to mutate)
// we might have binary classified CPE already with the package so we want to append here
p.CPEs = append(p.CPEs, cpe.Generate(p)...)

dictionaryCPE, ok := cpe.DictionaryFind(p)
if ok {
log.Debugf("used CPE dictionary to find CPE for %s package %q: %s", p.Type, p.Name, dictionaryCPE.BindToFmtString())
p.CPEs = append(p.CPEs, dictionaryCPE)
} else {
p.CPEs = append(p.CPEs, cpe.Generate(p)...)
}

// if we were not able to identify the language we have an opportunity
// to try and get this value from the PURL. Worst case we assert that
Expand Down
1,296 changes: 1,296 additions & 0 deletions syft/pkg/cataloger/common/cpe/dictionary/data/cpe-index.json

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions syft/pkg/cataloger/common/cpe/dictionary/generate_index.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
package dictionary

//go:generate go run ./index-generator/ -o data/cpe-index.json
230 changes: 230 additions & 0 deletions syft/pkg/cataloger/common/cpe/dictionary/index-generator/generate.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
package main

import (
"compress/gzip"
"encoding/json"
"encoding/xml"
"fmt"
"io"
"log"
"strings"

"github.com/facebookincubator/nvdtools/wfn"
"golang.org/x/exp/slices"

"github.com/anchore/syft/syft/pkg/cataloger/common/cpe/dictionary"
)

// generateIndexedDictionaryJSON turns a gzip-compressed CPE dictionary (XML)
// into the JSON form of the indexed dictionary.
//
// The stream read from rawGzipData is decompressed, decoded into a CpeList,
// reduced by filterCpeList, indexed by indexCPEList, and finally marshalled as
// indented JSON. Any failing stage is reported as a wrapped error and no data
// is returned.
func generateIndexedDictionaryJSON(rawGzipData io.Reader) ([]byte, error) {
	decompressed, err := gzip.NewReader(rawGzipData)
	if err != nil {
		return nil, fmt.Errorf("unable to decompress CPE dictionary: %w", err)
	}
	defer decompressed.Close()

	// The whole XML document is buffered in memory before decoding.
	xmlBytes, err := io.ReadAll(decompressed)
	if err != nil {
		return nil, fmt.Errorf("unable to read CPE dictionary: %w", err)
	}

	var parsed CpeList
	err = xml.Unmarshal(xmlBytes, &parsed)
	if err != nil {
		return nil, fmt.Errorf("unable to unmarshal CPE dictionary XML: %w", err)
	}

	// Drop entries that are not applicable, then build the lookup index.
	index := indexCPEList(filterCpeList(parsed))

	encoded, err := json.MarshalIndent(index, "", " ")
	if err != nil {
		return nil, fmt.Errorf("unable to marshal CPE dictionary to JSON: %w", err)
	}

	return encoded, nil
}

// filterCpeList removes CPE items that are not applicable to software packages.
//
// An item is dropped when:
//   - it has no references,
//   - its CPE URI or its CPE 2.3 name cannot be parsed (the failure is logged),
//   - its CPE part is "h" or "o" (hardware / operating system in the CPE
//     naming scheme), or
//   - an item with the same normalized (version- and update-less) CPE URI has
//     already been kept.
//
// The names of the items that are kept are rewritten to their normalized forms.
func filterCpeList(cpeList CpeList) CpeList {
	var processedCpeList CpeList

	seen := make(map[string]struct{})

	for _, cpeItem := range cpeList.CpeItems {
		// Skip CPE items that don't have any references.
		if len(cpeItem.References) == 0 {
			continue
		}

		// Skip CPE items where the CPE URI doesn't meet our criteria.
		parsedName, err := wfn.Parse(cpeItem.Name)
		if err != nil {
			// The parse result is unusable on failure; dereferencing it below
			// would panic, so the item is skipped instead.
			log.Printf("unable to parse CPE URI %q: %s", cpeItem.Name, err)
			continue
		}

		if slices.Contains([]string{"h", "o"}, parsedName.Part) {
			continue
		}

		normalizedName := normalizeCPE(parsedName).BindToURI()
		if _, ok := seen[normalizedName]; ok {
			continue
		}

		// Parse the 2.3 name before recording the item as seen, so that an
		// unparsable item does not shadow a later, valid duplicate.
		parsedCPE, err := wfn.Parse(cpeItem.Cpe23Item.Name)
		if err != nil {
			log.Printf("unable to parse CPE value %q: %s", cpeItem.Cpe23Item.Name, err)
			continue
		}

		seen[normalizedName] = struct{}{}
		cpeItem.Name = normalizedName
		cpeItem.Cpe23Item.Name = normalizeCPE(parsedCPE).BindToFmtString()

		processedCpeList.CpeItems = append(processedCpeList.CpeItems, cpeItem)
	}

	return processedCpeList
}

// normalizeCPE returns a copy of cpe whose Version and Update attributes are
// blanked out. The attributes passed in are left untouched.
func normalizeCPE(cpe *wfn.Attributes) *wfn.Attributes {
	normalized := new(wfn.Attributes)
	*normalized = *cpe

	normalized.Version, normalized.Update = "", ""

	return normalized
}

// URL prefixes that indexCPEList matches against the reference links of a CPE
// item to decide which ecosystem index the item is recorded in.
const (
// npm package pages.
prefixForNPMPackages = "https://www.npmjs.com/package/"
// RubyGems gem pages (HTTPS form).
prefixForRubyGems = "https://rubygems.org/gems/"
// RubyGems gem pages (plain-HTTP form).
prefixForRubyGemsHTTP = "http://rubygems.org/gems/"
// Repositories under github.com/ruby; these are indexed as RubyGems as well.
prefixForNativeRubyGems = "https://github.com/ruby/"
// PyPI project pages.
prefixForPyPIPackages = "https://pypi.org/project/"
// Repositories under github.com/jenkinsci; only repository names ending in
// "-plugin" are indexed as Jenkins plugins.
prefixForJenkinsPlugins = "https://github.com/jenkinsci/"
// crates.io crate pages.
prefixForRustCrates = "https://crates.io/crates/"
)

// indexCPEList creates an index of CPEs by ecosystem.
func indexCPEList(list CpeList) *dictionary.Indexed {
wagoodman marked this conversation as resolved.
Show resolved Hide resolved
indexed := &dictionary.Indexed{
EcosystemPackages: make(map[string]dictionary.Packages),
}

for _, cpeItem := range list.CpeItems {
cpeItemName := cpeItem.Cpe23Item.Name

for _, reference := range cpeItem.References {
ref := reference.Reference.Href

switch {
case strings.HasPrefix(ref, prefixForNPMPackages):
addEntryForNPMPackage(indexed, ref, cpeItemName)

case strings.HasPrefix(ref, prefixForRubyGems), strings.HasPrefix(ref, prefixForRubyGemsHTTP):
addEntryForRubyGem(indexed, ref, cpeItemName)

case strings.HasPrefix(ref, prefixForNativeRubyGems):
addEntryForNativeRubyGem(indexed, ref, cpeItemName)

case strings.HasPrefix(ref, prefixForPyPIPackages):
addEntryForPyPIPackage(indexed, ref, cpeItemName)

case strings.HasPrefix(ref, prefixForJenkinsPlugins):
// It _might_ be a jenkins plugin!
addEntryForJenkinsPlugin(indexed, ref, cpeItemName)

case strings.HasPrefix(ref, prefixForRustCrates):
addEntryForRustCrate(indexed, ref, cpeItemName)
}
}
}

return indexed
}

// addEntryForRustCrate records cpeItemName in the Rust crates index under the
// crate name taken from a crates.io reference URL.
func addEntryForRustCrate(indexed *dictionary.Indexed, ref string, cpeItemName string) {
	// The crate name is the first path segment after the crates.io prefix.
	crate, _, _ := strings.Cut(strings.TrimPrefix(ref, prefixForRustCrates), "/")

	crates, found := indexed.EcosystemPackages[dictionary.EcosystemRustCrates]
	if !found {
		crates = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemRustCrates] = crates
	}

	crates[crate] = cpeItemName
}

// addEntryForJenkinsPlugin records cpeItemName in the Jenkins plugins index
// when the reference URL points at a jenkinsci repository whose name ends in
// "-plugin". The index key is the repository name without that suffix; other
// repositories are ignored.
func addEntryForJenkinsPlugin(indexed *dictionary.Indexed, ref string, cpeItemName string) {
	const pluginSuffix = "-plugin"

	// The repository name is the first path segment after the jenkinsci prefix.
	repo, _, _ := strings.Cut(strings.TrimPrefix(ref, prefixForJenkinsPlugins), "/")
	if !strings.HasSuffix(repo, pluginSuffix) {
		// It's not a jenkins plugin!
		return
	}
	plugin := strings.TrimSuffix(repo, pluginSuffix)

	plugins, found := indexed.EcosystemPackages[dictionary.EcosystemJenkinsPlugins]
	if !found {
		plugins = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemJenkinsPlugins] = plugins
	}

	plugins[plugin] = cpeItemName
}

// addEntryForPyPIPackage records cpeItemName in the PyPI index under the
// project name taken from a pypi.org reference URL.
func addEntryForPyPIPackage(indexed *dictionary.Indexed, ref string, cpeItemName string) {
	// The project name is the first path segment after the PyPI prefix.
	project, _, _ := strings.Cut(strings.TrimPrefix(ref, prefixForPyPIPackages), "/")

	projects, found := indexed.EcosystemPackages[dictionary.EcosystemPyPI]
	if !found {
		projects = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemPyPI] = projects
	}

	projects[project] = cpeItemName
}

// addEntryForNativeRubyGem records cpeItemName in the RubyGems index under the
// repository name taken from a github.com/ruby reference URL.
func addEntryForNativeRubyGem(indexed *dictionary.Indexed, ref string, cpeItemName string) {
	// The repository name is the first path segment after the github.com/ruby prefix.
	gem, _, _ := strings.Cut(strings.TrimPrefix(ref, prefixForNativeRubyGems), "/")

	gems, found := indexed.EcosystemPackages[dictionary.EcosystemRubyGems]
	if !found {
		gems = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemRubyGems] = gems
	}

	gems[gem] = cpeItemName
}

// addEntryForRubyGem records cpeItemName in the RubyGems index under the gem
// name taken from a rubygems.org reference URL (HTTPS or plain HTTP).
func addEntryForRubyGem(indexed *dictionary.Indexed, ref string, cpeItemName string) {
	// Strip whichever scheme variant of the prefix is present, then keep only
	// the first remaining path segment.
	gem := strings.TrimPrefix(ref, prefixForRubyGems)
	gem = strings.TrimPrefix(gem, prefixForRubyGemsHTTP)
	gem, _, _ = strings.Cut(gem, "/")

	gems, found := indexed.EcosystemPackages[dictionary.EcosystemRubyGems]
	if !found {
		gems = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemRubyGems] = gems
	}

	gems[gem] = cpeItemName
}

// addEntryForNPMPackage records cpeItemName in the npm index under the package
// name taken from an npmjs.com reference URL.
func addEntryForNPMPackage(indexed *dictionary.Indexed, ref string, cpeItemName string) {
	// Cut away a trailing "/v/..." part and any query string first, then
	// remove the npm prefix; what remains is used as the package name.
	name, _, _ := strings.Cut(ref, "/v/")
	name, _, _ = strings.Cut(name, "?")
	name = strings.TrimPrefix(name, prefixForNPMPackages)

	packages, found := indexed.EcosystemPackages[dictionary.EcosystemNPM]
	if !found {
		packages = make(dictionary.Packages)
		indexed.EcosystemPackages[dictionary.EcosystemNPM] = packages
	}

	packages[name] = cpeItemName
}
Loading