From c0134468680f094dc509bf307af89f9579dc0f77 Mon Sep 17 00:00:00 2001
From: Kawsar Kamal
Date: Wed, 20 Dec 2023 10:55:16 -0500
Subject: [PATCH] added CDK datasync example (#955)

* added CDK datasync example

---------

Co-authored-by: Michael Kaiser
---
 python/datasync-s3/.gitignore                 |  10 ++
 python/datasync-s3/README.md                  | 132 ++++++++++++++++++
 python/datasync-s3/app.py                     |  20 +++
 python/datasync-s3/cdk.context.json           |  29 ++++
 python/datasync-s3/cdk.json                   |  51 +++++++
 .../datasync-s3/datasync_s3_to_s3/__init__.py |   0
 .../datasync_s3_to_s3_IAM_stack.py            |  72 ++++++++++
 .../datasync_s3_to_s3_stack.py                | 102 ++++++++++++++
 python/datasync-s3/requirements.txt           |   2 +
 python/datasync-s3/source.bat                 |  13 ++
 python/datasync-s3/tests/__init__.py          |   0
 python/datasync-s3/tests/unit/__init__.py     |   0
 .../tests/unit/test_datasync_s3_stack.py      |  15 ++
 13 files changed, 446 insertions(+)
 create mode 100644 python/datasync-s3/.gitignore
 create mode 100644 python/datasync-s3/README.md
 create mode 100644 python/datasync-s3/app.py
 create mode 100644 python/datasync-s3/cdk.context.json
 create mode 100644 python/datasync-s3/cdk.json
 create mode 100644 python/datasync-s3/datasync_s3_to_s3/__init__.py
 create mode 100644 python/datasync-s3/datasync_s3_to_s3/datasync_s3_to_s3_IAM_stack.py
 create mode 100644 python/datasync-s3/datasync_s3_to_s3/datasync_s3_to_s3_stack.py
 create mode 100644 python/datasync-s3/requirements.txt
 create mode 100644 python/datasync-s3/source.bat
 create mode 100644 python/datasync-s3/tests/__init__.py
 create mode 100644 python/datasync-s3/tests/unit/__init__.py
 create mode 100644 python/datasync-s3/tests/unit/test_datasync_s3_stack.py

diff --git a/python/datasync-s3/.gitignore b/python/datasync-s3/.gitignore
new file mode 100644
index 000000000..37833f8be
--- /dev/null
+++ b/python/datasync-s3/.gitignore
@@ -0,0 +1,10 @@
+*.swp
+package-lock.json
+__pycache__
+.pytest_cache
+.venv
+*.egg-info
+
+# CDK asset staging directory
+.cdk.staging
+cdk.out
diff --git a/python/datasync-s3/README.md b/python/datasync-s3/README.md
new file mode 100644
index 000000000..3642693b8
--- /dev/null
+++ b/python/datasync-s3/README.md
@@ -0,0 +1,132 @@
+
+# Welcome to your CDK Python project!
+
+This project uses the AWS CDK to move data between S3 buckets with the AWS DataSync service. It supports use cases such as backups, bucket consolidation, and data lake creation.
+
+This CDK example application creates the following resources:
+- New S3 buckets (optional)
+- One or more DataSync S3 locations
+- An IAM role and policy that allow the DataSync service to read from and write to the S3 buckets
+- DataSync task(s) to synchronize content between source and destination bucket pairs
+
+When you run `cdk deploy`, the CDK application creates two CloudFormation stacks:
+1. `cdk-datasync-s3-to-s3-iam` - creates the necessary IAM roles. This stack is implemented in [datasync_s3_to_s3_IAM_stack.py](datasync_s3_to_s3/datasync_s3_to_s3_IAM_stack.py)
+2. `cdk-datasync-s3-to-s3` - creates the S3 buckets (when requested), the DataSync locations, and the tasks. This stack is implemented in [datasync_s3_to_s3_stack.py](datasync_s3_to_s3/datasync_s3_to_s3_stack.py)
+
+
+## Steps to use
+1. Follow the steps for initializing the CDK environment below, and ensure that the virtualenv is activated. See **Configuring Virtual Env**.
+2. Ensure that you have exported AWS credentials, an IAM profile, or an EC2 instance role with permissions to create IAM and DataSync resources.
+3. 
Add the source and destination bucket names in `cdk.context.json`, and define one or more DataSync tasks to move data between source and destination pairs. See **Setting CDK Context**.
+4. From the directory where `cdk.json` is present, run the `cdk diff` command. Adjust `app.py` if needed.
+5. Run `cdk deploy --all` to create the resources. The task ARNs are shown as stack outputs upon successful deployment.
+6. Start the DataSync task using the AWS CLI: `aws datasync start-task-execution --task-arn <task-arn>`
+
+
+## Cleanup
+1. Follow steps 1 through 3 above.
+2. Run `cdk destroy --all` to delete the previously created stacks.
+
+
+## Setting CDK Context
+This CDK application operates on two input lists: one for DataSync locations and another for DataSync tasks. Each list can be populated with any number of configuration items. Below is an example that copies content from an existing source bucket to a new destination bucket. (See **How the stacks read the context** below for how these values are consumed.)
+```
+{
+    "S3_datasync_locations": [
+        {
+            "bucketName": "cdk-example-datasync-source-bucket",
+            "create": true,
+            "storageClass": "STANDARD",
+            "subDirectory": "",
+            "tags": [
+                {
+                    "key": "Project",
+                    "value": "CDK-example"
+                }
+            ]
+        },
+        {
+            "create": true,
+            "bucketName": "cdk-example-datasync-destination-bucket",
+            "storageClass": "STANDARD",
+            "subDirectory": "",
+            "tags": []
+        }
+    ],
+    "S3_datasync_tasks": [
+        {
+            "source": "cdk-example-datasync-source-bucket",
+            "destination": "cdk-example-datasync-destination-bucket"
+        }
+    ]
+}
+```
+
+Below are the configuration elements.
+
+| Key | Description | Example |
+| --- | --- | --- |
+| S3_datasync_locations | List of S3 location configurations | |
+| bucketName | Name of the S3 bucket for this location | `cdk-example-datasync-source-bucket` |
+| create | Whether the application should create the bucket (`true`) or use an existing one (`false`) | `true` |
+| storageClass | S3 storage class used by the DataSync location | `STANDARD` |
+| subDirectory | Subdirectory in the bucket that DataSync reads from or writes to; empty for the bucket root | `""` |
+| tags | Key/value pairs applied to the DataSync location | |
+| S3_datasync_tasks | List of DataSync task configurations | |
+| source | Source S3 bucket name for the DataSync task | |
+| destination | Destination S3 bucket name for the DataSync task | |
+
+## Configuring Virtual Env
+The `cdk.json` file tells the CDK Toolkit how to execute your app.
+
+This project is set up like a standard Python project. The initialization
+process also creates a virtualenv within this project, stored under the `.venv`
+directory. To create the virtualenv it assumes that there is a `python3`
+(or `python` for Windows) executable in your path with access to the `venv`
+package. If for any reason the automatic creation of the virtualenv fails,
+you can create the virtualenv manually.
+
+To manually create a virtualenv on macOS and Linux:
+
+```
+$ python3 -m venv .venv
+```
+
+After the init process completes and the virtualenv is created, you can use the following
+step to activate your virtualenv.
+
+```
+$ source .venv/bin/activate
+```
+
+If you are on a Windows platform, you would activate the virtualenv like this:
+
+```
+% .venv\Scripts\activate.bat
+```
+
+Once the virtualenv is activated, you can install the required dependencies.
+
+```
+$ pip install -r requirements.txt
+```
+
+At this point you can now synthesize the CloudFormation template for this code.
+
+```
+$ cdk synth
+```
+
+To add additional dependencies, for example other CDK libraries, just add
+them to your `requirements.txt` file and rerun the `pip install -r requirements.txt`
+command.
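+
+## How the stacks read the context
+Both stacks consume the two context lists at synthesis time through the CDK
+context API. The snippet below is an illustrative sketch that mirrors the
+lookup in [datasync_s3_to_s3_stack.py](datasync_s3_to_s3/datasync_s3_to_s3_stack.py);
+see that file for the full logic.
+
+```python
+# Inside a Stack's __init__ (sketch, not the full implementation):
+bucket_configs = self.node.try_get_context("S3_datasync_locations")
+datasync_tasks = self.node.try_get_context("S3_datasync_tasks")
+if not bucket_configs or not datasync_tasks:
+    # Both lists must be present in cdk.context.json
+    print("ERROR: Please set S3_datasync_locations and S3_datasync_tasks")
+```
+
+Keeping the configuration in `cdk.context.json` means both stacks see the same
+values, so the IAM role exports and the DataSync locations stay in sync.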
+ +## Useful commands + + * `cdk ls` list all stacks in the app + * `cdk synth` emits the synthesized CloudFormation template + * `cdk deploy` deploy this stack to your default AWS account/region + * `cdk diff` compare deployed stack with current state + * `cdk docs` open CDK documentation + +Enjoy! diff --git a/python/datasync-s3/app.py b/python/datasync-s3/app.py new file mode 100644 index 000000000..707b474c5 --- /dev/null +++ b/python/datasync-s3/app.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +import aws_cdk as cdk + +from datasync_s3_to_s3.datasync_s3_to_s3_IAM_stack import DataSyncS3toS3StackIAM +from datasync_s3_to_s3.datasync_s3_to_s3_stack import DataSyncS3toS3Stack + + +app = cdk.App() + +# Create Stack as defined in: datasync_s3_to_s3/datasync_s3_to_s3_IAM_stack.py +iam_stack = DataSyncS3toS3StackIAM(app, "cdk-datasync-s3-to-s3-iam") + +# Create Stack as defined in: datasync_s3_to_s3/datasync_s3_to_s3_stack.py +datasync_stack = DataSyncS3toS3Stack(app, "cdk-datasync-s3-to-s3") + +# Wait until the IAM stack has completed provisioning +datasync_stack.add_dependency(iam_stack) + +app.synth() diff --git a/python/datasync-s3/cdk.context.json b/python/datasync-s3/cdk.context.json new file mode 100644 index 000000000..3c67a19d3 --- /dev/null +++ b/python/datasync-s3/cdk.context.json @@ -0,0 +1,29 @@ +{ + "S3_datasync_locations": [ + { + "bucketName": "cdk-example-datasync-source-bucket", + "create": false, + "storage_lass": "STANDARD", + "subDirectory": "", + "tags": [ + { + "key": "Project", + "value": "CDK-example" + } + ] + }, + { + "create": false, + "bucketName": "cdk-example-datasync-dest-bucket", + "storageClass": "STANDARD", + "subDirectory": "", + "tags": [] + } + ], + "S3_datasync_tasks": [ + { + "source": "cdk-example-datasync-source-bucket", + "destination": "cdk-example-datasync-dest-bucket" + } + ] +} diff --git a/python/datasync-s3/cdk.json b/python/datasync-s3/cdk.json new file mode 100644 index 000000000..fd7abdbf9 --- /dev/null +++ b/python/datasync-s3/cdk.json @@ -0,0 +1,51 @@ +{ + "app": "python3 app.py", + "watch": { + "include": [ + "**" + ], + "exclude": [ + "README.md", + "cdk*.json", + "requirements*.txt", + "source.bat", + "**/__init__.py", + "python/__pycache__", + "tests" + ] + }, + "context": { + "@aws-cdk/aws-lambda:recognizeLayerVersion": true, + "@aws-cdk/core:checkSecretUsage": true, + "@aws-cdk/core:target-partitions": [ + "aws", + "aws-cn" + ], + "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, + "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, + "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, + "@aws-cdk/aws-iam:minimizePolicies": true, + "@aws-cdk/core:validateSnapshotRemovalPolicy": true, + "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, + "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, + "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, + "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, + "@aws-cdk/core:enablePartitionLiterals": true, + "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, + "@aws-cdk/aws-iam:standardizedServicePrincipals": true, + "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, + "@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, + "@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, + "@aws-cdk/aws-route53-patters:useCertificate": true, + "@aws-cdk/customresources:installLatestAwsSdkDefault": false, + "@aws-cdk/aws-rds:databaseProxyUniqueResourceName": true, + 
"@aws-cdk/aws-codedeploy:removeAlarmsFromDeploymentGroup": true, + "@aws-cdk/aws-apigateway:authorizerChangeDeploymentLogicalId": true, + "@aws-cdk/aws-ec2:launchTemplateDefaultUserData": true, + "@aws-cdk/aws-secretsmanager:useAttachedSecretResourcePolicyForSecretTargetAttachments": true, + "@aws-cdk/aws-redshift:columnId": true, + "@aws-cdk/aws-stepfunctions-tasks:enableEmrServicePolicyV2": true, + "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true, + "@aws-cdk/aws-apigateway:requestValidatorUniqueId": true + } +} diff --git a/python/datasync-s3/datasync_s3_to_s3/__init__.py b/python/datasync-s3/datasync_s3_to_s3/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/datasync-s3/datasync_s3_to_s3/datasync_s3_to_s3_IAM_stack.py b/python/datasync-s3/datasync_s3_to_s3/datasync_s3_to_s3_IAM_stack.py new file mode 100644 index 000000000..aba0cb0f2 --- /dev/null +++ b/python/datasync-s3/datasync_s3_to_s3/datasync_s3_to_s3_IAM_stack.py @@ -0,0 +1,72 @@ +import uuid +from constructs import Construct +from aws_cdk import ( + Stack, + aws_iam as iam, + CfnOutput +) + +class DataSyncS3toS3StackIAM(Stack): + + # Function to create IAM Role for Datasync + def create_datasync_roles(self, bucket_configs): + # Create a list of bucket paths ending in /* for IAM policy + suffix = "/*" + i=0 + datasync_s3_roles = [] + + for bc in bucket_configs: + # Create an IAM Role for DataSync to read and write to S3 bucket + # Create an IAM role + + role_name="CDKDataSyncS3Access-" + bc["bucketName"] + s3_role = iam.Role( + self, "CDKDataSyncS3AccessRole"+str(i), + assumed_by=iam.ServicePrincipal("datasync.amazonaws.com"), + description="CDK Datasync role for S3", + role_name=role_name + ) + + stmt1 = iam.PolicyStatement( + effect=iam.Effect.ALLOW, + actions=["s3:GetBucketLocation", "s3:ListBucket","s3:ListBucketMultipartUploads"], + resources=[bc["arn"]] + ) + + stmt2 = iam.PolicyStatement( + effect=iam.Effect.ALLOW, + actions=["s3:AbortMultipartUpload", "s3:DeleteObject","s3:GetObject","s3:ListMultipartUploadParts","s3:PutObjectTagging","s3:GetObjectTagging","s3:PutObject"], + resources=[bc["arn"]+suffix] + ) + + s3_policy = iam.ManagedPolicy(self,"CDKDataSyncS3Policy"+str(i), statements = [stmt1, stmt2], roles = [s3_role]) + + datasync_s3_roles.append(s3_role) + + # Export the name using the same format as the Role name + # This will be important by downstream Stack + CfnOutput(self, role_name, value=s3_role.role_arn, export_name=role_name) + + i = i+1 + + return datasync_s3_roles + + + # Main function + def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: + super().__init__(scope, construct_id, **kwargs) + + # Store bucket configs in an array + bucket_configs = self.node.try_get_context("S3_datasync_locations") + if bucket_configs: + # Add the arn to bucket_config, if it is not provided already + for b in bucket_configs: + if not "arn" in b: + b["arn"] = "arn:aws:s3:::" + b["bucketName"] + + self.create_datasync_roles(bucket_configs) + else: + print("ERROR: Please set a context variable for S3_datasync_locations") + + + diff --git a/python/datasync-s3/datasync_s3_to_s3/datasync_s3_to_s3_stack.py b/python/datasync-s3/datasync_s3_to_s3/datasync_s3_to_s3_stack.py new file mode 100644 index 000000000..3d42d2e86 --- /dev/null +++ b/python/datasync-s3/datasync_s3_to_s3/datasync_s3_to_s3_stack.py @@ -0,0 +1,102 @@ +import uuid +from constructs import Construct +from aws_cdk import ( + Stack, + aws_s3 as s3, + aws_datasync as datasync, + Fn, + CfnOutput +) + +class 
DataSyncS3toS3Stack(Stack):
+
+    # Create a DataSync task that copies from the source location to the destination location.
+    # The index keeps construct IDs unique when multiple tasks are configured.
+    def create_datasync_s3_task(self, s3_src_location, s3_dest_location, index):
+        task = datasync.CfnTask(
+            self,
+            'DataSyncS3toS3Task' + str(index),
+            source_location_arn=s3_src_location.attr_location_arn,
+            destination_location_arn=s3_dest_location.attr_location_arn)
+
+        CfnOutput(self, 'task_arn' + str(index), value=task.attr_task_arn)
+
+        return task
+
+    # Create an S3 bucket using CDK
+    def create_bucket(self, name):
+        bucket = s3.Bucket(self, name, bucket_name=name)
+        return bucket
+
+    # Return the bucket ARN, creating the bucket first if requested
+    def get_bucket_arn(self, config):
+        bucket_name = config["bucketName"]
+
+        if config["create"]:
+            bucket = self.create_bucket(bucket_name)
+            bucket_arn = bucket.bucket_arn
+        else:
+            bucket_arn = "arn:aws:s3:::" + bucket_name
+
+        return bucket_arn
+
+    # Create the DataSync S3 locations, keyed by bucket name
+    def create_datasync_s3_locations(self, bucket_configs):
+        s3_locations_dict = {}
+        for i, bc in enumerate(bucket_configs):
+            # Import the role ARN exported by the IAM stack
+            role_name_export = "CDKDataSyncS3Access-" + bc["bucketName"]
+
+            location = datasync.CfnLocationS3(
+                self,
+                'DataSyncS3Location' + str(i),
+                s3_bucket_arn=bc["arn"],
+                s3_config=datasync.CfnLocationS3.S3ConfigProperty(
+                    bucket_access_role_arn=Fn.import_value(role_name_export))
+            )
+
+            # Add remaining configs if present
+            if "subDirectory" in bc:
+                location.subdirectory = bc["subDirectory"]
+            if "storageClass" in bc:
+                location.s3_storage_class = bc["storageClass"]
+
+            # TODO: Add tags support
+            # if "tags" in bc and len(bc["tags"]) > 0:
+            #     location.tags = bc["tags"]
+
+            # Add this location to the result dict
+            s3_locations_dict[bc["bucketName"]] = location
+
+        return s3_locations_dict
+
+    def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
+        super().__init__(scope, construct_id, **kwargs)
+
+        # Process the bucket configurations
+        bucket_configs = self.node.try_get_context("S3_datasync_locations")
+        if not bucket_configs:
+            print("ERROR: Please set a context variable for S3_datasync_locations")
+            return
+
+        # Add the ARN to each bucket config, if it is not provided already.
+        # Creates the S3 bucket first when "create" is true.
+        for b in bucket_configs:
+            if "arn" not in b:
+                b["arn"] = self.get_bucket_arn(b)
+
+        # Create the locations
+        s3_locations_dict = self.create_datasync_s3_locations(bucket_configs)
+
+        # Process the task configurations
+        datasync_tasks = self.node.try_get_context("S3_datasync_tasks")
+        if datasync_tasks:
+            for i, task in enumerate(datasync_tasks):
+                self.create_datasync_s3_task(s3_locations_dict[task["source"]], s3_locations_dict[task["destination"]], i)
+        else:
+            print("ERROR: Please set a context variable for S3_datasync_tasks")
diff --git a/python/datasync-s3/requirements.txt b/python/datasync-s3/requirements.txt
new file mode 100644
index 000000000..f9a3319ee
--- /dev/null
+++ b/python/datasync-s3/requirements.txt
@@ -0,0 +1,2 @@
+aws-cdk-lib==2.79.1
+constructs>=10.0.0,<11.0.0
diff --git a/python/datasync-s3/source.bat b/python/datasync-s3/source.bat
new file mode 100644
index 000000000..9e1a83442
--- /dev/null
+++ b/python/datasync-s3/source.bat
@@ -0,0 +1,13 @@
+@echo off
+
+rem The sole purpose of this script is to make the command
+rem
+rem     source .venv/bin/activate
+rem
+rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows.
+rem On Windows, this command just runs this batch file (the argument is ignored).
+rem
+rem Now we don't need to document a Windows command for activating a virtualenv.
+
+echo Executing .venv\Scripts\activate.bat for you
+.venv\Scripts\activate.bat
diff --git a/python/datasync-s3/tests/__init__.py b/python/datasync-s3/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/datasync-s3/tests/unit/__init__.py b/python/datasync-s3/tests/unit/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/datasync-s3/tests/unit/test_datasync_s3_stack.py b/python/datasync-s3/tests/unit/test_datasync_s3_stack.py
new file mode 100644
index 000000000..173a7e03c
--- /dev/null
+++ b/python/datasync-s3/tests/unit/test_datasync_s3_stack.py
@@ -0,0 +1,15 @@
+import aws_cdk as core
+import aws_cdk.assertions as assertions
+
+from datasync_s3_to_s3.datasync_s3_to_s3_stack import DataSyncS3toS3Stack
+
+# example tests. To run these tests, uncomment this file along with the example
+# resource in datasync_s3_to_s3/datasync_s3_to_s3_stack.py
+# def test_sqs_queue_created():
+#     app = core.App()
+#     stack = DataSyncS3toS3Stack(app, "cdk-datasync-s3-to-s3")
+#     template = assertions.Template.from_stack(stack)
+
+#     template.has_resource_properties("AWS::SQS::Queue", {
+#         "VisibilityTimeout": 300
+#     })
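+
+# Below is a minimal example sketch of testing the DataSync stack. Assumptions:
+# the bucket names are placeholders, the buckets already exist ("create": false),
+# and the IAM role ARNs resolve via Fn.import_value at deploy time (synthesis
+# does not need the exports to exist).
+def test_datasync_task_created():
+    # Supply the two context lists inline instead of reading cdk.context.json
+    app = core.App(context={
+        "S3_datasync_locations": [
+            {"bucketName": "example-src-bucket", "create": False,
+             "storageClass": "STANDARD", "subDirectory": "", "tags": []},
+            {"bucketName": "example-dest-bucket", "create": False,
+             "storageClass": "STANDARD", "subDirectory": "", "tags": []}
+        ],
+        "S3_datasync_tasks": [
+            {"source": "example-src-bucket", "destination": "example-dest-bucket"}
+        ]
+    })
+    stack = DataSyncS3toS3Stack(app, "cdk-datasync-s3-to-s3")
+    template = assertions.Template.from_stack(stack)
+
+    # One location per bucket config, one task per task config
+    template.resource_count_is("AWS::DataSync::LocationS3", 2)
+    template.resource_count_is("AWS::DataSync::Task", 1)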