Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retry GetResource NotFound when creating #1809

Merged
merged 4 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 50 additions & 1 deletion provider/pkg/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,22 @@ import (
"encoding/json"
"fmt"
"strings"
"time"

"github.com/aws/aws-sdk-go-v2/aws/retry"
"github.com/aws/aws-sdk-go-v2/aws/transport/http"
"github.com/aws/aws-sdk-go-v2/service/cloudcontrol"
"github.com/aws/aws-sdk-go-v2/service/cloudcontrol/types"
"github.com/aws/smithy-go"
"github.com/golang/glog"
"github.com/mattbaird/jsonpatch"
"github.com/pkg/errors"
"github.com/pulumi/pulumi/sdk/v3/go/common/util/contract"
)

// CloudControlApiClient providers CRUD operations around Cloud Control API, with the mechanics of API calls abstracted away.
// For instance, it serializes and deserializes wire data and follows the protocol of long-running operations.
//
//go:generate mockgen -package client -source client.go -destination mock_client.go CloudControlApiClient
type CloudControlClient interface {
// Create creates a resource of the specified type with the desired state.
Expand Down Expand Up @@ -79,10 +84,18 @@ func (c *clientImpl) Create(ctx context.Context, typeName string, desiredState m
return nil, nil, errors.New("received nil identifier while awaiting completion")
}

// Pick the getResource method. If there was an error from awaiting completion, simply try once. If there were
// no errors however, try multiple times to tolerate occasional GetResource NotFound errors that arise due to
// eventual consistency.
getResource := c.api.GetResource
if waitErr == nil {
getResource = c.getResourceRetryNotFound
}

// Read the state - even if there was a creation error but the progress event contains a resource ID.
// Retrieve the resource state from AWS.
// Note that we do so even if creation hasn't succeeded but the identifier is assigned.
resourceState, err = c.api.GetResource(ctx, typeName, *pi.Identifier)
resourceState, err = getResource(ctx, typeName, *pi.Identifier)
if err != nil {
if waitErr != nil {
// Both wait and read fail. Provisioning failed entirely, return the wait error as more informative.
Expand Down Expand Up @@ -152,3 +165,39 @@ func (c *clientImpl) Delete(ctx context.Context, typeName, identifier string) er

return err
}

// Wraps c.api.GetResource with Not Found error retry logic to try to compensate for eventual consistency issues for
// newly created resources.
func (c *clientImpl) getResourceRetryNotFound(
ctx context.Context,
typeName, identifier string,
) (map[string]interface{}, error) {
retryBackoff := retry.NewExponentialJitterBackoff(30 * time.Second)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd reduce the max duration here a bit. We'd now wait up to ~60 seconds. That seems a bit too long to me.

Eventual consistency delays for those services are usually in the millis.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll send a quick follow-up PR, this one merged already.

maxAttempts := 5
var lastError error
for attempt := 0; attempt < maxAttempts; attempt++ {
result, err := c.api.GetResource(ctx, typeName, identifier)
var notFound *types.ResourceNotFoundException
switch {
case err == nil:
return result, nil
case !errors.As(err, &notFound):
return nil, err
}
lastError = err

delay, err := retryBackoff.BackoffDelay(attempt, nil)
contract.AssertNoErrorf(err, "BackoffDelay should not fail")

glog.V(9).Infof("CloudControl GetResource(%q, %q) failed with ResourceNotFoundException:"+
" attempt #%d, retrying in %v", typeName, identifier, attempt, delay)

select {
case <-ctx.Done():
return nil, ctx.Err()
case <-time.After(delay):
continue
}
}
return nil, lastError
}
44 changes: 44 additions & 0 deletions provider/pkg/client/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,50 @@ func TestClientCreate(t *testing.T) {
assert.Nil(t, id)
assert.Nil(t, outputs)
})

t.Run("GetResource is retried if it fails with a ResourceNotFoundException", func(t *testing.T) {
resourceID := "exampleID"
mockAPI.CreateResourceFunc = func(ctx context.Context, cfType, desiredState string) (*types.ProgressEvent, error) {
return &types.ProgressEvent{
OperationStatus: types.OperationStatusSuccess,
Identifier: &resourceID,
}, nil
}
mockAPI.WaitForResourceOpCompletionFunc = func(ctx context.Context, pi *types.ProgressEvent) (*types.ProgressEvent, error) {
return pi, nil
}
getResourceInvocation := 0
mockAPI.GetResourceFunc = func(ctx context.Context, typeName, identifier string) (map[string]interface{}, error) {
switch getResourceInvocation {
case 0:
getResourceInvocation++
return nil, &smithy.OperationError{
ServiceID: "CloudControl",
OperationName: "GetResource",
Err: &awshttp.ResponseError{
RequestID: "9068dc0f-1124-4fc7-a120-39956230b810",
ResponseError: &smithyhttp.ResponseError{
Response: &smithyhttp.Response{
Response: &http.Response{
StatusCode: 400,
},
},
Err: &types.ResourceNotFoundException{},
},
},
}
default:
return map[string]interface{}{"output1": "outvalue1"}, nil
}
}

id, outputs, err := client.Create(ctx, typeName, desiredState)

assert.NoError(t, err)
t.Logf("ID=%v", id)
t.Logf("Outputs=%v", outputs)
})

}

// Mock API implementation
Expand Down
Loading