Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add transitive extraction for Maven pom.xml #399

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 202 additions & 0 deletions extractor/filesystem/language/java/pomxmlnet/pomxmlnet.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package pomxmlnet extracts Maven's pom.xml format with transitive dependency resolution.
package pomxmlnet

import (
"context"
"fmt"
"path/filepath"

"golang.org/x/exp/maps"

"deps.dev/util/maven"
"deps.dev/util/resolve"
"deps.dev/util/resolve/dep"
mavenresolve "deps.dev/util/resolve/maven"
"github.com/google/osv-scalibr/extractor"
"github.com/google/osv-scalibr/extractor/filesystem"
"github.com/google/osv-scalibr/extractor/filesystem/osv"
"github.com/google/osv-scalibr/internal/datasource"
"github.com/google/osv-scalibr/internal/mavenutil"
"github.com/google/osv-scalibr/internal/resolution/client"
"github.com/google/osv-scalibr/plugin"
"github.com/google/osv-scalibr/purl"
)

// Extractor extracts Maven packages with transitive dependency resolution.
//
// Both embedded clients are dereferenced by Extract.
// NOTE(review): the zero value leaves them unset; presumably callers must
// populate both before calling Extract — confirm.
type Extractor struct {
// DependencyClient is wrapped in an override client and handed to the Maven
// resolver inside Extract.
client.DependencyClient
// MavenRegistryAPIClient receives the repositories declared in the pom.xml
// and is passed to mavenutil when merging parents and fetching dependency
// management.
*datasource.MavenRegistryAPIClient
}

// Name returns the name of this extractor, "java/pomxmlnet".
func (e Extractor) Name() string {
	return "java/pomxmlnet"
}

// Version returns the version number of this extractor, currently 0.
func (e Extractor) Version() int {
	return 0
}

// Requirements returns the capabilities this extractor declares it needs:
// network access and direct filesystem access.
func (e Extractor) Requirements() *plugin.Capabilities {
return &plugin.Capabilities{
// NOTE(review): presumably required because Extract talks to Maven
// registries through the embedded clients — confirm.
Network: true,
// NOTE(review): the review thread below questions whether DirectFS is
// needed at all, since file access goes through the scan input.
DirectFS: true,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this needs DirectFS, as all fs operations are done through the VirtualFS interface.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am a bit confused about when to use the VirtualFS and when to use DirectFS. There was another comment suggesting to mark DirectFS as true. Do you mind elaborating this more?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I understand you need DirectFS if you access the disk directly (e.g. via os.Open, or via a C library ...etc), if you only access the disk through our virtual filesystem, you don't need DirectFS, since it doesn't matter to the extractor whether we are actually reading from disk, streaming the file over a network connection, or reading a container for example.

@erikvarga can you double check that my understanding is correct?

}
}

// FileRequired reports whether the file should be handed to this extractor,
// which is the case exactly when its base name is "pom.xml".
func (e Extractor) FileRequired(fapi filesystem.FileAPI) bool {
	base := filepath.Base(fapi.Path())
	return base == "pom.xml"
}

// Extract parses the pom.xml passed through the scan input, merges its
// profiles and parent POMs, resolves the dependency graph and returns one
// inventory entry per resolved package name (the root project is excluded).
//
// An error is returned if decoding, profile merging, registry registration,
// parent merging or dependency resolution fails.
func (e Extractor) Extract(ctx context.Context, input *filesystem.ScanInput) ([]*extractor.Inventory, error) {
var project maven.Project
if err := datasource.NewMavenDecoder(input.Reader).Decode(&project); err != nil {
return nil, fmt.Errorf("could not extract from %s: %w", input.Path, err)
}
// Empty JDK and ActivationOS indicates merging the default profiles.
if err := project.MergeProfiles("", maven.ActivationOS{}); err != nil {
return nil, fmt.Errorf("failed to merge profiles: %w", err)
}
// Register every repository declared in the project with the Maven registry
// client before any parent or dependency lookups happen.
for _, repo := range project.Repositories {
if err := e.MavenRegistryAPIClient.AddRegistry(datasource.MavenRegistry{
URL: string(repo.URL),
ID: string(repo.ID),
ReleasesEnabled: repo.Releases.Enabled.Boolean(),
SnapshotsEnabled: repo.Snapshots.Enabled.Boolean(),
}); err != nil {
return nil, fmt.Errorf("failed to add registry %s: %w", repo.URL, err)
}
}
// Merging parents data by parsing local parent pom.xml or fetching from upstream.
if err := mavenutil.MergeParents(ctx, input, e.MavenRegistryAPIClient, &project, project.Parent, 1, true); err != nil {
return nil, fmt.Errorf("failed to merge parents: %w", err)
}
// Process the dependencies:
// - dedupe dependencies and dependency management
// - import dependency management
// - fill in missing dependency version requirement
project.ProcessDependencies(func(groupID, artifactID, version maven.String) (maven.DependencyManagement, error) {
return mavenutil.GetDependencyManagement(ctx, e.MavenRegistryAPIClient, groupID, artifactID, version)
})

// Mirror the registries known to the Maven registry client into the
// dependency client that the resolver uses.
if registries := e.MavenRegistryAPIClient.GetRegistries(); len(registries) > 0 {
clientRegs := make([]client.Registry, len(registries))
for i, reg := range registries {
clientRegs[i] = reg
}
if err := e.DependencyClient.AddRegistries(clientRegs); err != nil {
return nil, err
}
}

// Wrap the dependency client so the root project and its requirements can be
// injected through AddVersion below.
overrideClient := client.NewOverrideClient(e.DependencyClient)
resolver := mavenresolve.NewResolver(overrideClient)

// Resolve the dependencies.
root := resolve.Version{
VersionKey: resolve.VersionKey{
PackageKey: resolve.PackageKey{
System: resolve.Maven,
Name: project.ProjectKey.Name(),
},
VersionType: resolve.Concrete,
Version: string(project.Version),
}}
// The requirement list holds the direct dependencies first, followed by the
// dependency management entries, hence the combined length.
reqs := make([]resolve.RequirementVersion, len(project.Dependencies)+len(project.DependencyManagement.Dependencies))
for i, d := range project.Dependencies {
reqs[i] = resolve.RequirementVersion{
VersionKey: resolve.VersionKey{
PackageKey: resolve.PackageKey{
System: resolve.Maven,
Name: d.Name(),
},
VersionType: resolve.Requirement,
Version: string(d.Version),
},
Type: resolve.MavenDepType(d, ""),
}
}
// Dependency management entries are offset past the direct dependencies and
// tagged with the management origin.
for i, d := range project.DependencyManagement.Dependencies {
reqs[len(project.Dependencies)+i] = resolve.RequirementVersion{
VersionKey: resolve.VersionKey{
PackageKey: resolve.PackageKey{
System: resolve.Maven,
Name: d.Name(),
},
VersionType: resolve.Requirement,
Version: string(d.Version),
},
Type: resolve.MavenDepType(d, mavenutil.OriginManagement),
}
}
overrideClient.AddVersion(root, reqs)

g, err := resolver.Resolve(ctx, root.VersionKey)
if err != nil {
return nil, fmt.Errorf("failed resolving %v: %w", root, err)
}
// Reset the dependency type on every edge of the resolved graph. The loop
// variable e shadows the receiver inside this loop.
// NOTE(review): per the author's reply below this was only needed for
// comparing resolved graphs and is likely removable here.
for i, e := range g.Edges {
e.Type = dep.Type{}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comment here about why you are wiping e.Type here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this was for comparing the resolved graph - we don't need this here.

g.Edges[i] = e
}

// Keyed by package name, so a later node with the same name replaces an
// earlier one.
details := map[string]*extractor.Inventory{}
for i := 1; i < len(g.Nodes); i++ {
// Ignore the first node which is the root.
node := g.Nodes[i]
depGroups := []string{}
inventory := extractor.Inventory{
Name: node.Version.Name,
Version: node.Version.Version,
// TODO(#408): Add merged paths in here as well
Locations: []string{input.Path},
}
// We are only able to know dependency groups of direct dependencies but
// not transitive dependencies because the nodes in the resolve graph do
// not have the scope information.
// Note: the loop variable dep shadows the imported dep package here.
for _, dep := range project.Dependencies {
if dep.Name() != inventory.Name {
continue
}
// An empty scope and "compile" are not recorded as a dependency group.
if dep.Scope != "" && dep.Scope != "compile" {
depGroups = append(depGroups, string(dep.Scope))
}
}
inventory.Metadata = osv.DepGroupMetadata{
DepGroupVals: depGroups,
}
details[inventory.Name] = &inventory
}

// The returned slice is built from map values, so its order is not stable
// between runs.
return maps.Values(details), nil
}

// ToPURL converts an inventory created by this extractor into a PURL of type
// Maven, copying the inventory's name and version across unchanged. No other
// PURL fields are set.
func (e Extractor) ToPURL(i *extractor.Inventory) *purl.PackageURL {
return &purl.PackageURL{
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might be missing some of the purl requirements.

https://github.com/package-url/purl-spec/blob/master/PURL-TYPES.rst#maven

  • Check name the format that purl expects?
  • Add repository_url parameter if neccessary
  • Maybe add type as well?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently classifier and type are not available in the node. I made #426 as a TODO for the future.

// TODO(#426): classifier and type are not set because they are not available
// from the resolved graph nodes.
Type: purl.TypeMaven,
Name: i.Name,
Version: i.Version,
}
}

// Ecosystem returns the OSV ecosystem, "Maven", of the software extracted by
// this extractor. The inventory argument is ignored.
func (e Extractor) Ecosystem(_ *extractor.Inventory) string {
	const ecosystem = "Maven"
	return ecosystem
}

// Compile-time assertion that Extractor (by value) implements filesystem.Extractor.
var _ filesystem.Extractor = Extractor{}
Loading
Loading