-
Notifications
You must be signed in to change notification settings - Fork 27
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add transitive extraction for Maven pom.xml #399
Changes from 16 commits
dee5fa9
cdff15f
cf22342
3cede05
84fc579
6533f01
4a5099b
db0dd1d
c931bc8
5213257
1492083
71492ea
4606350
7ec576e
08f4c03
75871ac
a9a9d54
5a9d20c
a53dbf9
637a77f
4ae8122
c394c5d
14609fd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,202 @@ | ||
// Copyright 2025 Google LLC | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
// Package pomxmlnet extracts Maven's pom.xml format with transitive dependency resolution. | ||
package pomxmlnet | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"path/filepath" | ||
|
||
"golang.org/x/exp/maps" | ||
|
||
"deps.dev/util/maven" | ||
"deps.dev/util/resolve" | ||
"deps.dev/util/resolve/dep" | ||
mavenresolve "deps.dev/util/resolve/maven" | ||
"github.com/google/osv-scalibr/extractor" | ||
"github.com/google/osv-scalibr/extractor/filesystem" | ||
"github.com/google/osv-scalibr/extractor/filesystem/osv" | ||
"github.com/google/osv-scalibr/internal/datasource" | ||
"github.com/google/osv-scalibr/internal/mavenutil" | ||
"github.com/google/osv-scalibr/internal/resolution/client" | ||
"github.com/google/osv-scalibr/plugin" | ||
"github.com/google/osv-scalibr/purl" | ||
) | ||
|
||
// Extractor extracts Maven packages with transitive dependency resolution. | ||
// | ||
// It embeds the two clients that Extract relies on: the DependencyClient is | ||
// wrapped by an override client and handed to the resolver, and the | ||
// MavenRegistryAPIClient is used to register repositories and to fetch parent | ||
// and dependency management data. | ||
type Extractor struct { | ||
client.DependencyClient | ||
*datasource.MavenRegistryAPIClient | ||
} | ||
|
||
// Name returns the name of this extractor, "java/pomxmlnet". | ||
func (e Extractor) Name() string { return "java/pomxmlnet" } | ||
|
||
// Version returns the version of this extractor, currently 0. | ||
func (e Extractor) Version() int { return 0 } | ||
|
||
// Requirements returns the capabilities this extractor asks for: network | ||
// access and direct filesystem access. | ||
// | ||
// NOTE(review): the review thread questions whether DirectFS is needed when | ||
// all file access goes through the virtual filesystem - confirm before relying | ||
// on it. | ||
func (e Extractor) Requirements() *plugin.Capabilities { | ||
return &plugin.Capabilities{ | ||
Network: true, | ||
DirectFS: true, | ||
} | ||
} | ||
|
||
// FileRequired reports whether the file's base name is exactly "pom.xml". | ||
func (e Extractor) FileRequired(fapi filesystem.FileAPI) bool { | ||
return filepath.Base(fapi.Path()) == "pom.xml" | ||
} | ||
|
||
// Extract parses the pom.xml passed through the scan input, merges its profiles | ||
// and parent POMs, resolves the dependency graph, and returns one inventory | ||
// entry per distinct package name in the resolved graph (the root project | ||
// itself is skipped). | ||
// | ||
// An error is returned if decoding, profile merging, registry setup, parent | ||
// merging, or resolution fails. | ||
func (e Extractor) Extract(ctx context.Context, input *filesystem.ScanInput) ([]*extractor.Inventory, error) { | ||
var project maven.Project | ||
if err := datasource.NewMavenDecoder(input.Reader).Decode(&project); err != nil { | ||
return nil, fmt.Errorf("could not extract from %s: %w", input.Path, err) | ||
} | ||
// Empty JDK and ActivationOS indicates merging the default profiles. | ||
if err := project.MergeProfiles("", maven.ActivationOS{}); err != nil { | ||
return nil, fmt.Errorf("failed to merge profiles: %w", err) | ||
} | ||
// Register the repositories declared in the project with the registry client. | ||
for _, repo := range project.Repositories { | ||
if err := e.MavenRegistryAPIClient.AddRegistry(datasource.MavenRegistry{ | ||
URL: string(repo.URL), | ||
ID: string(repo.ID), | ||
ReleasesEnabled: repo.Releases.Enabled.Boolean(), | ||
SnapshotsEnabled: repo.Snapshots.Enabled.Boolean(), | ||
}); err != nil { | ||
return nil, fmt.Errorf("failed to add registry %s: %w", repo.URL, err) | ||
} | ||
} | ||
// Merging parents data by parsing local parent pom.xml or fetching from upstream. | ||
if err := mavenutil.MergeParents(ctx, input, e.MavenRegistryAPIClient, &project, project.Parent, 1, true); err != nil { | ||
return nil, fmt.Errorf("failed to merge parents: %w", err) | ||
} | ||
// Process the dependencies: | ||
// - dedupe dependencies and dependency management | ||
// - import dependency management | ||
// - fill in missing dependency version requirement | ||
project.ProcessDependencies(func(groupID, artifactID, version maven.String) (maven.DependencyManagement, error) { | ||
return mavenutil.GetDependencyManagement(ctx, e.MavenRegistryAPIClient, groupID, artifactID, version) | ||
}) | ||
|
||
// Pass the registries known to the registry client on to the dependency client. | ||
if registries := e.MavenRegistryAPIClient.GetRegistries(); len(registries) > 0 { | ||
clientRegs := make([]client.Registry, len(registries)) | ||
for i, reg := range registries { | ||
clientRegs[i] = reg | ||
} | ||
if err := e.DependencyClient.AddRegistries(clientRegs); err != nil { | ||
return nil, err | ||
} | ||
} | ||
|
||
// The override client is where the local project is registered as the root | ||
// version (see AddVersion below) before the resolver runs. | ||
overrideClient := client.NewOverrideClient(e.DependencyClient) | ||
resolver := mavenresolve.NewResolver(overrideClient) | ||
|
||
// Resolve the dependencies. | ||
root := resolve.Version{ | ||
VersionKey: resolve.VersionKey{ | ||
PackageKey: resolve.PackageKey{ | ||
System: resolve.Maven, | ||
Name: project.ProjectKey.Name(), | ||
}, | ||
VersionType: resolve.Concrete, | ||
Version: string(project.Version), | ||
}} | ||
// The requirements are the direct dependencies followed by the dependency | ||
// management entries. | ||
reqs := make([]resolve.RequirementVersion, len(project.Dependencies)+len(project.DependencyManagement.Dependencies)) | ||
for i, d := range project.Dependencies { | ||
reqs[i] = resolve.RequirementVersion{ | ||
VersionKey: resolve.VersionKey{ | ||
PackageKey: resolve.PackageKey{ | ||
System: resolve.Maven, | ||
Name: d.Name(), | ||
}, | ||
VersionType: resolve.Requirement, | ||
Version: string(d.Version), | ||
}, | ||
Type: resolve.MavenDepType(d, ""), | ||
} | ||
} | ||
for i, d := range project.DependencyManagement.Dependencies { | ||
reqs[len(project.Dependencies)+i] = resolve.RequirementVersion{ | ||
VersionKey: resolve.VersionKey{ | ||
PackageKey: resolve.PackageKey{ | ||
System: resolve.Maven, | ||
Name: d.Name(), | ||
}, | ||
VersionType: resolve.Requirement, | ||
Version: string(d.Version), | ||
}, | ||
Type: resolve.MavenDepType(d, mavenutil.OriginManagement), | ||
} | ||
} | ||
overrideClient.AddVersion(root, reqs) | ||
|
||
g, err := resolver.Resolve(ctx, root.VersionKey) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed resolving %v: %w", root, err) | ||
} | ||
// NOTE(review): per the review thread, clearing the edge types was carried | ||
// over from code that compares resolved graphs and is reportedly not needed | ||
// here - consider removing this loop. | ||
for i, e := range g.Edges { | ||
e.Type = dep.Type{} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a comment here about why you are wiping e.Type here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this was for comparing the resolved graph - we don't need this here. |
||
g.Edges[i] = e | ||
} | ||
|
||
// Keyed by package name, so each name yields a single inventory entry. | ||
details := map[string]*extractor.Inventory{} | ||
for i := 1; i < len(g.Nodes); i++ { | ||
// Ignore the first node which is the root. | ||
node := g.Nodes[i] | ||
depGroups := []string{} | ||
inventory := extractor.Inventory{ | ||
Name: node.Version.Name, | ||
Version: node.Version.Version, | ||
// TODO(#408): Add merged paths in here as well | ||
Locations: []string{input.Path}, | ||
} | ||
// We are only able to know the dependency groups of direct dependencies, | ||
// not of transitive dependencies, because the nodes in the resolved graph | ||
// do not carry scope information. | ||
for _, dep := range project.Dependencies { | ||
if dep.Name() != inventory.Name { | ||
continue | ||
} | ||
// Record the scope as a dependency group unless it is empty or "compile". | ||
if dep.Scope != "" && dep.Scope != "compile" { | ||
depGroups = append(depGroups, string(dep.Scope)) | ||
} | ||
} | ||
inventory.Metadata = osv.DepGroupMetadata{ | ||
DepGroupVals: depGroups, | ||
} | ||
details[inventory.Name] = &inventory | ||
} | ||
|
||
// maps.Values makes no ordering guarantee, so the result order is unspecified. | ||
return maps.Values(details), nil | ||
} | ||
|
||
// ToPURL converts an inventory created by this extractor into a PURL of type | ||
// Maven, copying the inventory's Name and Version as-is. | ||
// | ||
// NOTE(review): only Type, Name and Version are set; the review thread points | ||
// out that the purl spec for Maven may require more - confirm against the spec. | ||
func (e Extractor) ToPURL(i *extractor.Inventory) *purl.PackageURL { | ||
return &purl.PackageURL{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This might be missing some of the purl requirements. https://github.com/package-url/purl-spec/blob/master/PURL-TYPES.rst#maven
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently |
||
Type: purl.TypeMaven, | ||
Name: i.Name, | ||
Version: i.Version, | ||
} | ||
} | ||
|
||
// Ecosystem returns the OSV ecosystem ('Maven') of the software extracted by this extractor. | ||
func (e Extractor) Ecosystem(_ *extractor.Inventory) string { | ||
return "Maven" | ||
} | ||
|
||
// Compile-time check that Extractor implements filesystem.Extractor. | ||
var _ filesystem.Extractor = Extractor{} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this needs DirectFS, as all fs operations are done through the VirtualFS interface.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am a bit confused about when to use the VirtualFS and when to use DirectFS. There was another comment suggesting to mark DirectFS as true. Do you mind elaborating this more?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
From what I understand you need DirectFS if you access the disk directly (e.g. via os.Open, or via a C library ...etc), if you only access the disk through our virtual filesystem, you don't need DirectFS, since it doesn't matter to the extractor whether we are actually reading from disk, streaming the file over a network connection, or reading a container for example.
@erikvarga can you double check that my understanding is correct?