Skip to content

Commit

Permalink
Debug bufmod calc
Browse files Browse the repository at this point in the history
  • Loading branch information
emcfarlane committed Jan 29, 2025
1 parent 34d54b9 commit c89c7c2
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 182 deletions.
121 changes: 32 additions & 89 deletions private/bufpkg/bufmodule/added_module.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,101 +161,44 @@ func (a *addedModule) ToModule(
}
return moduleData.V1Beta1OrV1BufLockObjectData()
}
// Imagine the following scenario:
// getDeclaredDepModuleKeysB5 gets the declared dependencies for the specific Module.
//
// module-a (local)
// README.md
// a.proto
// b.proto
// module-b:c1
// README.md
// c.proto
// d.proto
// module-b:c2
// README.md
// c.proto
// d.proto
// module-c:c1
// e.proto
// f.proto
// module-c:c2
// e.proto
// f.proto
// g.proto
// This is needed to calculate the digest for the Module. A Module constructed from this
// ModuleData as the target will require all Modules referenced by its DeclaredDepModuleKeys
// to be present in the ModuleSet.
//
// Then, you have this dependency graph:
// Modules that depend on this remote Module will include this Module and its data.
// However all the dependencies of the remote Module may not be present in the parents ModuleSet.
// As the target Module will use its direct dependencies to resolve the dependencies required.
// The digest of the remote Module is however, unchanged. It is calculated based on the contents
// and its declared dependencies, not the dependencies of the parent ModuleSet.
//
// module-a -> module-b:c1, module-c:c2
// module-b:c1 -> module-c:c1
// In contrast, a local Module dependency can be thought of as a ModuleKey at the latest commit.
// It will always use the bucket and declared dependencies, which may be resolved recursively
// for dependencies on other local Modules, to calculate its digest.
// This is the difference between the ModuleData digest calculation and the Module
// digest calculation. As remote Modules are required to have all their dependencies
// declared as ModuleKeys they can calculate their digest directly from the contents
// and declared dependencies, they do not need to recursively resolve digests.
//
// Note that module-b depends on an earlier commit of module-c than module-a does.
// For example, consider the following modules at commits with their declared dependencies:
// ```
// X:C1 (X has no dependencies)
// A:C1 -> X:C1 (A depends on X)
// B:C1 -> A:C1 ~> X:C1 (B depends on A, transitive dependency on X)
// A:C2 (A removes the dependency on X)
// C:C1 -> A:C2, B:C1 (C depends on A and B, X is not a dependency)
// ```
// The ModuleSet for C:C1 will include B:C1 and A:C2, but not A:C1 or X:C1.
// This is because for C:C1 it will use the direct dependencies to resolve its dependencies.
// A is required by both B:C1 and C:C1, the lastest is chosen, so A:C2 is used.
//
// If we were to just use the dependencies in the ModuleSet to compute the digest, the following
// would happen as a calculation:
// The ModuleSet for B:C1 will include A:C1 and X:C1.
// When calculating the digest for B:C1 in the ModuleSet of C:C1, the ModuleSet
// ModuleDeps cannot be used to resolve the dependencies of B:C1. It must use
// the declared dependencies of B:C1, which are A:C1 and X:C1, not A:C2.
//
// DIGEST(module-a) = digest(
// // module-a contents
// README.md,
// a.proto,
// b.proto,
// // module-b:c1 digest
// DIGEST(
// README.md,
// c.proto,
// d.proto,
// // module-c:c2 digest
// DIGEST(
// README.md,
// e.proto,
// f.proto,
// g.proto,
// ),
// ),
// // module-c:c2 digest
// DIGEST(
// README.md,
// e.proto,
// f.proto,
// g.proto,
// ),
// )
//
// Note that to compute the digest of module-b:c1, we would actually use the digest of
// module-c:c2, as opposed to module-c:c1, since within the ModuleSet, we would resolve
// to use module-c:c2 instead of module-c:c1.
//
// We should be using module-c:c1 to compute the digest of module-b:c1:
//
// DIGEST(module-a) = digest(
// // module-a contents
// README.md,
// a.proto,
// b.proto,
// // module-b:c1 digest
// DIGEST(
// README.md,
// c.proto,
// d.proto,
// // module-c:c1 digest
// DIGEST(
// README.md,
// e.proto,
// f.proto,
// ),
// ),
// // module-c:c2 digest
// DIGEST(
// README.md,
// e.proto,
// f.proto,
// g.proto,
// ),
// )
//
// To accomplish this, we need to take the dependencies of the declared ModuleKeys (ie what
// the Module actually says is in its buf.lock). This function enables us to do that for
// digest calculations. Within the Module, we say that if we get a remote Module, use the
// declared ModuleKeys instead of whatever Module we have resolved to for a given FullName.
// This is used for digest calculations. It is not used otherwise.
getDeclaredDepModuleKeysB5 := func() ([]ModuleKey, error) {
moduleData, err := getModuleData()
if err != nil {
Expand Down
96 changes: 41 additions & 55 deletions private/bufpkg/bufmodule/bufmoduleapi/module_data_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ func (a *moduleDataProvider) getIndexedModuleDatasForRegistryAndIndexedModuleKey
indexedModuleKeys []slicesext.Indexed[bufmodule.ModuleKey],
digestType bufmodule.DigestType,
) ([]slicesext.Indexed[bufmodule.ModuleData], error) {
// This returns the graph with pruned dependencies for the given indexedModuleKeys.
// We must resolve the declared dependencies for each module key.
graph, err := a.graphProvider.GetGraphForModuleKeys(ctx, slicesext.IndexedToValues(indexedModuleKeys))
if err != nil {
return nil, err
Expand All @@ -148,63 +150,47 @@ func (a *moduleDataProvider) getIndexedModuleDatasForRegistryAndIndexedModuleKey
if err != nil {
return nil, err
}

indexedModuleDatas := make([]slicesext.Indexed[bufmodule.ModuleData], 0, len(indexedModuleKeys))
if err := graph.WalkNodes(
func(
moduleKey bufmodule.ModuleKey,
_ []bufmodule.ModuleKey,
_ []bufmodule.ModuleKey,
) error {
// TopoSort will get us both the direct and transitive dependencies for the key.
//
// The outgoing edge list is just the direct dependencies.
//
// There is definitely a better way to do this in one pass for all commits with
// memoization - this is algorithmically bad.
depModuleKeys, err := graph.TopoSort(bufmodule.ModuleKeyToRegistryCommitID(moduleKey))
if err != nil {
return err
}
depModuleKeys = depModuleKeys[:len(depModuleKeys)-1]
sort.Slice(
depModuleKeys,
func(i int, j int) bool {
return depModuleKeys[i].FullName().String() < depModuleKeys[j].FullName().String()
},
)
for _, indexedModuleKey := range indexedModuleKeys {
moduleKey := indexedModuleKey.Value
// TopoSort will get us both the direct and transitive dependencies for the moduleKey.
depModuleKeys, err := graph.TopoSort(bufmodule.ModuleKeyToRegistryCommitID(moduleKey))
if err != nil {
return nil, err
}
// Remove the final key, this module.
depModuleKeys = depModuleKeys[:len(depModuleKeys)-1]
sort.Slice(
depModuleKeys,
func(i int, j int) bool {
return depModuleKeys[i].FullName().String() < depModuleKeys[j].FullName().String()
},
)

universalProtoContent, ok := commitIDToUniversalProtoContent[moduleKey.CommitID()]
if !ok {
// We only care to get content for a subset of the graph. If we have something
// in the graph without content, we just skip it.
return nil
}
indexedModuleKey, ok := commitIDToIndexedModuleKey[moduleKey.CommitID()]
if !ok {
return syserror.Newf("could not find indexed ModuleKey for commit ID %q", uuidutil.ToDashless(moduleKey.CommitID()))
}
indexedModuleData := slicesext.Indexed[bufmodule.ModuleData]{
Value: bufmodule.NewModuleData(
ctx,
moduleKey,
func() (storage.ReadBucket, error) {
return universalProtoFilesToBucket(universalProtoContent.Files)
},
func() ([]bufmodule.ModuleKey, error) { return depModuleKeys, nil },
func() (bufmodule.ObjectData, error) {
return universalProtoFileToObjectData(universalProtoContent.V1BufYAMLFile)
},
func() (bufmodule.ObjectData, error) {
return universalProtoFileToObjectData(universalProtoContent.V1BufLockFile)
},
),
Index: indexedModuleKey.Index,
}
indexedModuleDatas = append(indexedModuleDatas, indexedModuleData)
return nil
},
); err != nil {
return nil, err
universalProtoContent, ok := commitIDToUniversalProtoContent[moduleKey.CommitID()]
if !ok {
// This should never happen.
return nil, syserror.Newf("no content returned for commit ID %s", moduleKey.CommitID())
}
indexedModuleData := slicesext.Indexed[bufmodule.ModuleData]{
Value: bufmodule.NewModuleData(
ctx,
moduleKey,
func() (storage.ReadBucket, error) {
return universalProtoFilesToBucket(universalProtoContent.Files)
},
func() ([]bufmodule.ModuleKey, error) { return depModuleKeys, nil },
func() (bufmodule.ObjectData, error) {
return universalProtoFileToObjectData(universalProtoContent.V1BufYAMLFile)
},
func() (bufmodule.ObjectData, error) {
return universalProtoFileToObjectData(universalProtoContent.V1BufLockFile)
},
),
Index: indexedModuleKey.Index,
}
indexedModuleDatas = append(indexedModuleDatas, indexedModuleData)
}
return indexedModuleDatas, nil
}
Expand Down
45 changes: 23 additions & 22 deletions private/bufpkg/bufmodule/module.go
Original file line number Diff line number Diff line change
Expand Up @@ -452,33 +452,34 @@ func newGetDigestFuncForModuleAndDigestType(module *module, digestType DigestTyp
}
return getB4Digest(module.ctx, bucket, v1BufYAMLObjectData, v1BufLockObjectData)
case DigestTypeB5:
moduleDeps, err := module.ModuleDeps()
if err != nil {
return nil, err
}
// For remote modules to have consistent B5 digests, they must not change the digests of their
// dependencies based on the local workspace. Use the pruned b5 module keys from
// ModuleData.DeclaredDepModuleKeys to calculate the digest.
// B5 digests are calculated from the Module's content and its declared dependencies.
//
// Declared dependencies are all direct and transitive dependencies of the Module.
// For local Modules, the declared dependencies are resolved to the current ModuleSet.
// For remote Modules, the declared dependencies are the ModuleKeys of
// the direct and transitive dependencies of the Module at the specific commit.
//
// This is effectively the same. The local Modules have a declared dependency at the current commit,
// and the remote Modules have a declared dependency at a specific commit. Pushing up a local Module
// as a remote Module will result in the same set of declared dependencies.
//
// Local Module dependencies are resolved to the current ModuleSet, so the declared dependencies are not used.
// There is no FullName or CommitID for local Modules, calling getDeclaredDepModuleKeysB5 will return an error.
//
// See the comment on getDeclaredDepModuleKeysB5 for more information.
if !module.isLocal {
declaredDepModuleKeys, err := module.getDeclaredDepModuleKeysB5()
if err != nil {
return nil, err
}
moduleDepFullNames := make(map[string]struct{}, len(moduleDeps))
for _, dep := range moduleDeps {
fullName := dep.FullName()
if fullName == nil {
return nil, syserror.Newf("remote module dependencies should have full names")
}
moduleDepFullNames[fullName.String()] = struct{}{}
}
prunedDepModuleKeys := make([]ModuleKey, 0, len(declaredDepModuleKeys))
for _, dep := range declaredDepModuleKeys {
if _, ok := moduleDepFullNames[dep.FullName().String()]; ok {
prunedDepModuleKeys = append(prunedDepModuleKeys, dep)
}
}
return getB5DigestForBucketAndDepModuleKeys(module.ctx, bucket, prunedDepModuleKeys)
return getB5DigestForBucketAndDepModuleKeys(module.ctx, bucket, declaredDepModuleKeys)
}
// ModuleDeps returns the declared dependent Modules resolved for the target Module in the ModuleSet.
// A local Module will recursively resolve the Digest of other dependent local Modules.
// This is done by calling Module.Digest(DigestTypeB5) on each ModuleDep.
moduleDeps, err := module.ModuleDeps()
if err != nil {
return nil, err
}
return getB5DigestForBucketAndModuleDeps(module.ctx, bucket, moduleDeps)
default:
Expand Down
25 changes: 12 additions & 13 deletions private/bufpkg/bufmodule/module_data.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package bufmodule

import (
"context"
"fmt"

Check failure on line 19 in private/bufpkg/bufmodule/module_data.go

View workflow job for this annotation

GitHub Actions / buf-binary-size

"fmt" imported and not used

Check failure on line 19 in private/bufpkg/bufmodule/module_data.go

View workflow job for this annotation

GitHub Actions / test-previous (1.22.x)

"fmt" imported and not used

Check failure on line 19 in private/bufpkg/bufmodule/module_data.go

View workflow job for this annotation

GitHub Actions / lint

"fmt" imported and not used

Check failure on line 19 in private/bufpkg/bufmodule/module_data.go

View workflow job for this annotation

GitHub Actions / test

"fmt" imported and not used

Check failure on line 19 in private/bufpkg/bufmodule/module_data.go

View workflow job for this annotation

GitHub Actions / test

"fmt" imported and not used
"sync"

"github.com/bufbuild/buf/private/pkg/storage"
Expand All @@ -41,6 +42,13 @@ type ModuleData interface {
// This bucket will only contain module files - it will be filtered via NewModuleData.
Bucket() (storage.ReadBucket, error)
// DeclaredDepModuleKeys returns the declared dependencies for this specific Module.
//
// The declared dependencies are the same as that would appear in the buf.lock file.
// These include all direct and transitive dependencies. A Module constructed
// from this ModuleData as the target will require all Modules referenced by
// its DeclaredDepModuleKeys to be present in the ModuleSet.
//
// This is used for digest calculations. It is not used otherwise.
DeclaredDepModuleKeys() ([]ModuleKey, error)

// V1Beta1OrV1BufYAMLObjectData gets the v1beta1 or v1 buf.yaml ObjectData.
Expand Down Expand Up @@ -144,19 +152,10 @@ func newModuleData(
if err != nil {
return err
}
// This isn't the Digest as computed by the Module exactly, as the Module uses
// file imports to determine what the dependencies are. However, this is checking whether
// or not the digest of the returned information matches the digest we expected, which is
// what we need for this use case (tamper-proofing). What we are looking for is "does the
// digest from the ModuleKey match the files and dependencies returned from the remote
// provider of the ModuleData?" The mismatch case is that a file import changed/was removed,
// which may result in a different computed set of dependencies, but in this case, the
// actual files would have changed, which will result in a mismatched digest anyways, and
// tamper-proofing failing.
//
// This mismatch is a bit weird, however, and also results in us effectively computing
// the digest twice for any remote module: once here, and once within Module.Digest,
// which does have a slight performance hit.
// The B5 digest is calculated based on the declared dependencies.
// Dependencies are not required to be resolved to a Module to calculate the digest.
// Each declared dependent ModuleKey is a reference to a specific commit of a module
// that includes the dependencies expected Digest.
actualDigest, err = getB5DigestForBucketAndDepModuleKeys(ctx, bucket, declaredDepModuleKeys)
if err != nil {
return err
Expand Down
6 changes: 3 additions & 3 deletions private/bufpkg/bufmodule/module_set_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package bufmodule
import (
"context"
"errors"
"fmt"

Check failure on line 20 in private/bufpkg/bufmodule/module_set_builder.go

View workflow job for this annotation

GitHub Actions / buf-binary-size

"fmt" imported and not used

Check failure on line 20 in private/bufpkg/bufmodule/module_set_builder.go

View workflow job for this annotation

GitHub Actions / test-previous (1.22.x)

"fmt" imported and not used

Check failure on line 20 in private/bufpkg/bufmodule/module_set_builder.go

View workflow job for this annotation

GitHub Actions / lint

"fmt" imported and not used

Check failure on line 20 in private/bufpkg/bufmodule/module_set_builder.go

View workflow job for this annotation

GitHub Actions / test

"fmt" imported and not used

Check failure on line 20 in private/bufpkg/bufmodule/module_set_builder.go

View workflow job for this annotation

GitHub Actions / test

"fmt" imported and not used
"log/slog"
"sync/atomic"

Expand Down Expand Up @@ -89,7 +90,6 @@ type ModuleSetBuilder interface {
//
// Remote modules are rarely targets. However, if we are reading a ModuleSet from a
// ModuleProvider for example with a buf build buf.build/foo/bar call, then this

// specific Module will be targeted, while its dependencies will not be.
//
// Returns the same ModuleSetBuilder.
Expand Down Expand Up @@ -144,9 +144,9 @@ func NewModuleSetForRemoteModule(
}
if err := graph.WalkNodes(
func(node ModuleKey, _ []ModuleKey, _ []ModuleKey) error {
if node.CommitID() != moduleKey.CommitID() {
if isTarget := node.CommitID() == moduleKey.CommitID(); !isTarget {
// Add the dependency ModuleKey with no path filters.
moduleSetBuilder.AddRemoteModule(node, false)
moduleSetBuilder.AddRemoteModule(node, isTarget)
}
return nil
},
Expand Down

0 comments on commit c89c7c2

Please sign in to comment.