diff --git a/private/bufpkg/bufmodule/added_module.go b/private/bufpkg/bufmodule/added_module.go index 2770778be7..bc1b7a0a50 100644 --- a/private/bufpkg/bufmodule/added_module.go +++ b/private/bufpkg/bufmodule/added_module.go @@ -161,101 +161,44 @@ func (a *addedModule) ToModule( } return moduleData.V1Beta1OrV1BufLockObjectData() } - // Imagine the following scenario: + // getDeclaredDepModuleKeysB5 gets the declared dependencies for the specific Module. // - // module-a (local) - // README.md - // a.proto - // b.proto - // module-b:c1 - // README.md - // c.proto - // d.proto - // module-b:c2 - // README.md - // c.proto - // d.proto - // module-c:c1 - // e.proto - // f.proto - // module-c:c2 - // e.proto - // f.proto - // g.proto + // This is needed to calculate the digest for the Module. A Module constructed from this + // ModuleData as the target will require all Modules referenced by its DeclaredDepModuleKeys + // to be present in the ModuleSet. // - // Then, you have this dependency graph: + // Modules that depend on this remote Module will include this Module and its data. + // However all the dependencies of the remote Module may not be present in the parents ModuleSet. + // As the target Module will use its direct dependencies to resolve the dependencies required. + // The digest of the remote Module is however, unchanged. It is calculated based on the contents + // and its declared dependencies, not the dependencies of the parent ModuleSet. // - // module-a -> module-b:c1, module-c:c2 - // module-b:c1 -> module-c:c1 + // In contrast, a local Module dependency can be thought of as a ModuleKey at the latest commit. + // It will always use the bucket and declared dependencies, which may be resolved recursively + // for dependencies on other local Modules, to calculate its digest. + // This is the difference between the ModuleData digest calculation and the Module + // digest calculation. As remote Modules are required to have all their dependencies + // declared as ModuleKeys they can calculate their digest directly from the contents + // and declared dependencies, they do not need to recursively resolve digests. // - // Note that module-b depends on an earlier commit of module-c than module-a does. + // For example, consider the following modules at commits with their declared dependencies: + // ``` + // X:C1 (X has no dependencies) + // A:C1 -> X:C1 (A depends on X) + // B:C1 -> A:C1 ~> X:C1 (B depends on A, transitive dependency on X) + // A:C2 (A removes the dependency on X) + // C:C1 -> A:C2, B:C1 (C depends on A and B, X is not a dependency) + // ``` + // The ModuleSet for C:C1 will include B:C1 and A:C2, but not A:C1 or X:C1. + // This is because for C:C1 it will use the direct dependencies to resolve its dependencies. + // A is required by both B:C1 and C:C1, the latest A:C2 is chosen. // - // If we were to just use the dependencies in the ModuleSet to compute the digest, the following - // would happen as a calculation: + // The ModuleSet for B:C1 will include A:C1 and X:C1. + // When calculating the digest for B:C1 in the ModuleSet of C:C1, the ModuleSet + // ModuleDeps cannot be used to resolve the dependencies of B:C1. It must use + // the declared dependencies of B:C1, which are A:C1 and X:C1, not A:C2. // - // DIGEST(module-a) = digest( - // // module-a contents - // README.md, - // a.proto, - // b.proto, - // // module-b:c1 digest - // DIGEST( - // README.md, - // c.proto, - // d.proto, - // // module-c:c2 digest - // DIGEST( - // README.md, - // e.proto, - // f.proto, - // g.proto, - // ), - // ), - // // module-c:c2 digest - // DIGEST( - // README.md, - // e.proto, - // f.proto, - // g.proto, - // ), - // ) - // - // Note that to compute the digest of module-b:c1, we would actually use the digest of - // module-c:c2, as opposed to module-c:c1, since within the ModuleSet, we would resolve - // to use module-c:c2 instead of module-c:c1. - // - // We should be using module-c:c1 to compute the digest of module-b:c1: - // - // DIGEST(module-a) = digest( - // // module-a contents - // README.md, - // a.proto, - // b.proto, - // // module-b:c1 digest - // DIGEST( - // README.md, - // c.proto, - // d.proto, - // // module-c:c1 digest - // DIGEST( - // README.md, - // e.proto, - // f.proto, - // ), - // ), - // // module-c:c2 digest - // DIGEST( - // README.md, - // e.proto, - // f.proto, - // g.proto, - // ), - // ) - // - // To accomplish this, we need to take the dependencies of the declared ModuleKeys (ie what - // the Module actually says is in its buf.lock). This function enables us to do that for - // digest calculations. Within the Module, we say that if we get a remote Module, use the - // declared ModuleKeys instead of whatever Module we have resolved to for a given FullName. + // This is used for digest calculations. It is not used otherwise. getDeclaredDepModuleKeysB5 := func() ([]ModuleKey, error) { moduleData, err := getModuleData() if err != nil { diff --git a/private/bufpkg/bufmodule/bufmoduleapi/module_data_provider.go b/private/bufpkg/bufmodule/bufmoduleapi/module_data_provider.go index 262b309727..d58607716d 100644 --- a/private/bufpkg/bufmodule/bufmoduleapi/module_data_provider.go +++ b/private/bufpkg/bufmodule/bufmoduleapi/module_data_provider.go @@ -125,6 +125,8 @@ func (a *moduleDataProvider) getIndexedModuleDatasForRegistryAndIndexedModuleKey indexedModuleKeys []slicesext.Indexed[bufmodule.ModuleKey], digestType bufmodule.DigestType, ) ([]slicesext.Indexed[bufmodule.ModuleData], error) { + // This returns the graph with pruned dependencies for the given indexedModuleKeys. + // We must resolve the declared dependencies for each ModuleKey. graph, err := a.graphProvider.GetGraphForModuleKeys(ctx, slicesext.IndexedToValues(indexedModuleKeys)) if err != nil { return nil, err @@ -149,62 +151,45 @@ func (a *moduleDataProvider) getIndexedModuleDatasForRegistryAndIndexedModuleKey return nil, err } indexedModuleDatas := make([]slicesext.Indexed[bufmodule.ModuleData], 0, len(indexedModuleKeys)) - if err := graph.WalkNodes( - func( - moduleKey bufmodule.ModuleKey, - _ []bufmodule.ModuleKey, - _ []bufmodule.ModuleKey, - ) error { - // TopoSort will get us both the direct and transitive dependencies for the key. - // - // The outgoing edge list is just the direct dependencies. - // - // There is definitely a better way to do this in one pass for all commits with - // memoization - this is algorithmically bad. - depModuleKeys, err := graph.TopoSort(bufmodule.ModuleKeyToRegistryCommitID(moduleKey)) - if err != nil { - return err - } - depModuleKeys = depModuleKeys[:len(depModuleKeys)-1] - sort.Slice( - depModuleKeys, - func(i int, j int) bool { - return depModuleKeys[i].FullName().String() < depModuleKeys[j].FullName().String() - }, - ) + for _, indexedModuleKey := range indexedModuleKeys { + moduleKey := indexedModuleKey.Value + // TopoSort will get us both the direct and transitive dependencies for the moduleKey. + depModuleKeys, err := graph.TopoSort(bufmodule.ModuleKeyToRegistryCommitID(moduleKey)) + if err != nil { + return nil, err + } + // Remove the final ModuleKey, this parent. + depModuleKeys = depModuleKeys[:len(depModuleKeys)-1] + sort.Slice( + depModuleKeys, + func(i int, j int) bool { + return depModuleKeys[i].FullName().String() < depModuleKeys[j].FullName().String() + }, + ) - universalProtoContent, ok := commitIDToUniversalProtoContent[moduleKey.CommitID()] - if !ok { - // We only care to get content for a subset of the graph. If we have something - // in the graph without content, we just skip it. - return nil - } - indexedModuleKey, ok := commitIDToIndexedModuleKey[moduleKey.CommitID()] - if !ok { - return syserror.Newf("could not find indexed ModuleKey for commit ID %q", uuidutil.ToDashless(moduleKey.CommitID())) - } - indexedModuleData := slicesext.Indexed[bufmodule.ModuleData]{ - Value: bufmodule.NewModuleData( - ctx, - moduleKey, - func() (storage.ReadBucket, error) { - return universalProtoFilesToBucket(universalProtoContent.Files) - }, - func() ([]bufmodule.ModuleKey, error) { return depModuleKeys, nil }, - func() (bufmodule.ObjectData, error) { - return universalProtoFileToObjectData(universalProtoContent.V1BufYAMLFile) - }, - func() (bufmodule.ObjectData, error) { - return universalProtoFileToObjectData(universalProtoContent.V1BufLockFile) - }, - ), - Index: indexedModuleKey.Index, - } - indexedModuleDatas = append(indexedModuleDatas, indexedModuleData) - return nil - }, - ); err != nil { - return nil, err + universalProtoContent, ok := commitIDToUniversalProtoContent[moduleKey.CommitID()] + if !ok { + // This should never happen. + return nil, syserror.Newf("no content returned for commit ID %s", moduleKey.CommitID()) + } + indexedModuleData := slicesext.Indexed[bufmodule.ModuleData]{ + Value: bufmodule.NewModuleData( + ctx, + moduleKey, + func() (storage.ReadBucket, error) { + return universalProtoFilesToBucket(universalProtoContent.Files) + }, + func() ([]bufmodule.ModuleKey, error) { return depModuleKeys, nil }, + func() (bufmodule.ObjectData, error) { + return universalProtoFileToObjectData(universalProtoContent.V1BufYAMLFile) + }, + func() (bufmodule.ObjectData, error) { + return universalProtoFileToObjectData(universalProtoContent.V1BufLockFile) + }, + ), + Index: indexedModuleKey.Index, + } + indexedModuleDatas = append(indexedModuleDatas, indexedModuleData) } return indexedModuleDatas, nil } diff --git a/private/bufpkg/bufmodule/module.go b/private/bufpkg/bufmodule/module.go index 4cfbdf0df9..87aeda7ef1 100644 --- a/private/bufpkg/bufmodule/module.go +++ b/private/bufpkg/bufmodule/module.go @@ -452,33 +452,31 @@ func newGetDigestFuncForModuleAndDigestType(module *module, digestType DigestTyp } return getB4Digest(module.ctx, bucket, v1BufYAMLObjectData, v1BufLockObjectData) case DigestTypeB5: - moduleDeps, err := module.ModuleDeps() - if err != nil { - return nil, err - } - // For remote modules to have consistent B5 digests, they must not change the digests of their - // dependencies based on the local workspace. Use the pruned b5 module keys from - // ModuleData.DeclaredDepModuleKeys to calculate the digest. + // B5 digests are calculated from the Module's content and its declared dependencies. + // + // Declared dependencies are all direct and transitive dependencies of the Module. + // For local Modules, the declared dependencies are resolved to the current ModuleSet. + // For remote Modules, the declared dependencies are the ModuleKeys of + // the direct and transitive dependencies of the Module at the specific commit. + // + // This is effectively the same. The local Modules have a declared dependency at the current commit, + // and the remote Modules have a declared dependency at a specific commit. Pushing up a local Module + // as a remote Module will result in the same Digest. + // + // See the comment on getDeclaredDepModuleKeysB5 for more information. if !module.isLocal { declaredDepModuleKeys, err := module.getDeclaredDepModuleKeysB5() if err != nil { return nil, err } - moduleDepFullNames := make(map[string]struct{}, len(moduleDeps)) - for _, dep := range moduleDeps { - fullName := dep.FullName() - if fullName == nil { - return nil, syserror.Newf("remote module dependencies should have full names") - } - moduleDepFullNames[fullName.String()] = struct{}{} - } - prunedDepModuleKeys := make([]ModuleKey, 0, len(declaredDepModuleKeys)) - for _, dep := range declaredDepModuleKeys { - if _, ok := moduleDepFullNames[dep.FullName().String()]; ok { - prunedDepModuleKeys = append(prunedDepModuleKeys, dep) - } - } - return getB5DigestForBucketAndDepModuleKeys(module.ctx, bucket, prunedDepModuleKeys) + return getB5DigestForBucketAndDepModuleKeys(module.ctx, bucket, declaredDepModuleKeys) + } + // ModuleDeps returns the declared dependent Modules resolved for the target Module in the ModuleSet. + // A local Module will recursively resolve the Digest of other dependent local Modules. + // This is done by calling Module.Digest(DigestTypeB5) on each ModuleDep. + moduleDeps, err := module.ModuleDeps() + if err != nil { + return nil, err } return getB5DigestForBucketAndModuleDeps(module.ctx, bucket, moduleDeps) default: diff --git a/private/bufpkg/bufmodule/module_data.go b/private/bufpkg/bufmodule/module_data.go index a05045b19f..7b7e06f03c 100644 --- a/private/bufpkg/bufmodule/module_data.go +++ b/private/bufpkg/bufmodule/module_data.go @@ -41,6 +41,13 @@ type ModuleData interface { // This bucket will only contain module files - it will be filtered via NewModuleData. Bucket() (storage.ReadBucket, error) // DeclaredDepModuleKeys returns the declared dependencies for this specific Module. + // + // The declared dependencies are the same as that would appear in the buf.lock file. + // These include all direct and transitive dependencies. A Module constructed + // from this ModuleData as the target will require all Modules referenced by + // its DeclaredDepModuleKeys to be present in the ModuleSet. + // + // This is used for digest calculations. It is not used otherwise. DeclaredDepModuleKeys() ([]ModuleKey, error) // V1Beta1OrV1BufYAMLObjectData gets the v1beta1 or v1 buf.yaml ObjectData. @@ -144,19 +151,10 @@ func newModuleData( if err != nil { return err } - // This isn't the Digest as computed by the Module exactly, as the Module uses - // file imports to determine what the dependencies are. However, this is checking whether - // or not the digest of the returned information matches the digest we expected, which is - // what we need for this use case (tamper-proofing). What we are looking for is "does the - // digest from the ModuleKey match the files and dependencies returned from the remote - // provider of the ModuleData?" The mismatch case is that a file import changed/was removed, - // which may result in a different computed set of dependencies, but in this case, the - // actual files would have changed, which will result in a mismatched digest anyways, and - // tamper-proofing failing. - // - // This mismatch is a bit weird, however, and also results in us effectively computing - // the digest twice for any remote module: once here, and once within Module.Digest, - // which does have a slight performance hit. + // The B5 digest is calculated based on the declared dependencies. + // Dependencies are not required to be resolved to a Module to calculate the digest. + // Each declared dependent ModuleKey is a reference to a specific commit of a module + // that includes the dependencies expected Digest. actualDigest, err = getB5DigestForBucketAndDepModuleKeys(ctx, bucket, declaredDepModuleKeys) if err != nil { return err diff --git a/private/bufpkg/bufmodule/module_set_builder.go b/private/bufpkg/bufmodule/module_set_builder.go index 4241513469..b0f77c8bf6 100644 --- a/private/bufpkg/bufmodule/module_set_builder.go +++ b/private/bufpkg/bufmodule/module_set_builder.go @@ -89,7 +89,6 @@ type ModuleSetBuilder interface { // // Remote modules are rarely targets. However, if we are reading a ModuleSet from a // ModuleProvider for example with a buf build buf.build/foo/bar call, then this - // specific Module will be targeted, while its dependencies will not be. // // Returns the same ModuleSetBuilder. @@ -144,9 +143,9 @@ func NewModuleSetForRemoteModule( } if err := graph.WalkNodes( func(node ModuleKey, _ []ModuleKey, _ []ModuleKey) error { - if node.CommitID() != moduleKey.CommitID() { + if isTarget := node.CommitID() == moduleKey.CommitID(); !isTarget { // Add the dependency ModuleKey with no path filters. - moduleSetBuilder.AddRemoteModule(node, false) + moduleSetBuilder.AddRemoteModule(node, isTarget) } return nil },