From 450dc5600b70748f56c47e3e5ce54273ba1ea396 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Kocsis?=
Date: Fri, 30 Aug 2024 16:16:32 +0200
Subject: [PATCH] Fix automatic instance termination

---
Review notes (text between the "---" line above and the diffstat below is
not part of the commit message):

  * This copy was rebuilt from a version whose line breaks and indentation
    had been collapsed. The number of lines in every hunk matches its
    header, but leading whitespace (most importantly on context lines) is
    a best guess; `git apply --ignore-whitespace` may be needed.
  * Changed relative to the exported commit, all without altering line
    counts:
      - main.tf: the CloudWatch log group is now named after the Lambda
        function ("/aws/lambda/<function_name>"). Before, the logging
        policy pointed at a group whose name did not match the function.
      - waiters.py: the error handler logs the instance ids instead of the
        waiter object (which is unbound if get_waiter() itself fails).
      - Docstrings: "resouces" typo; get_resources() yields ARNs, not ids;
        ec2_exception() receives a ClientError, not a string; stop() does
        not disable alarms; start() regained its ":param" header.
    The "index" blob ids of the touched files therefore no longer match;
    re-export with `git format-patch` after applying.
  * Left as is, to be confirmed by the author:
      - exceptions.py: "IncorrectInstanceState" is listed in both
        info_codes and warning_codes; the info branch is tested first, so
        the warning entry never takes effect.
      - waiters.py: botocore waiters presumably raise WaiterError rather
        than ClientError on timeout (TODO confirm); that would not be
        caught by the handler.
      - main.py: os.getenv() returns None for unset variables, so
        lambda_handler() raises AttributeError when AWS_REGIONS or
        EC2_SCHEDULE is missing. The Lambda resource in this patch sets
        both.
      - main.tf: the role is granted cloudwatch:DisableAlarmActions and
        cloudwatch:EnableAlarmActions, which nothing in the scheduler
        package calls.

 .gitignore                                    |   2 +
 build/infrastructure/aws/main.tf              | 157 +++++++++++++++++-
 build/infrastructure/aws/variables.tf         |  19 +++
 .../package/scheduler/__init__.py             |   1 +
 .../package/scheduler/exceptions.py           |  46 +++++
 .../scheduler/filter_resources_by_tags.py     |  54 ++++++
 .../package/scheduler/instance_handler.py     |  81 +++++++++
 .../infrastructure/package/scheduler/main.py  |  31 ++++
 .../package/scheduler/waiters.py              |  38 +++++
 9 files changed, 423 insertions(+), 6 deletions(-)
 create mode 100644 build/infrastructure/package/scheduler/__init__.py
 create mode 100644 build/infrastructure/package/scheduler/exceptions.py
 create mode 100644 build/infrastructure/package/scheduler/filter_resources_by_tags.py
 create mode 100644 build/infrastructure/package/scheduler/instance_handler.py
 create mode 100644 build/infrastructure/package/scheduler/main.py
 create mode 100644 build/infrastructure/package/scheduler/waiters.py

diff --git a/.gitignore b/.gitignore
index 4c791459..808fdd49 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,8 @@ vendor/
 .terraform/
 /app/*/*
 !/app/zend/*
+/build/infrastructure/config/
+!/build/infrastructure/config/aws.tfvars.dist
 /config/**/*.*
 !/config/**/*.dist
 /tmp/
diff --git a/build/infrastructure/aws/main.tf b/build/infrastructure/aws/main.tf
index f840a30e..3145a7f1 100644
--- a/build/infrastructure/aws/main.tf
+++ b/build/infrastructure/aws/main.tf
@@ -46,9 +46,7 @@ resource "aws_instance" "host" {
     iops = 8000
   }
 
-  tags = {
-    Name = "php-benchmark-host"
-  }
+  tags = merge(var.tags, {(var.scheduler_tag["key"]) = var.scheduler_tag["value"]})
 
   connection {
     type = "ssh"
@@ -79,9 +77,6 @@ EOF
     inline = [
       "set -e",
 
-      "# Automatic termination",
-      "#echo 'sudo halt' | at now + ${var.termination_timeout_in_min} min",
-
       "# Update permissions",
       "sudo mkdir -p ${var.remote_project_root}",
       "sudo chmod -R 775 ${var.remote_project_root}",
@@ -235,3 +230,153 @@ resource "aws_security_group" "security_group" {
       "0.0.0.0/0"]
   }
 }
+
+################################################
+#
+# AUTOMATIC TERMINATION
+#
+################################################
+
+resource "aws_iam_role" "this" {
+  name = "php-version-benchmark-termination-scheduler-lambda"
+  description = "Allows Lambda functions to stop and start ec2 and rds resources"
+  assume_role_policy = data.aws_iam_policy_document.this.json
+  tags = var.tags
+}
+
+data "aws_iam_policy_document" "this" {
+  statement {
+    actions = ["sts:AssumeRole"]
+
+    principals {
+      type = "Service"
+      identifiers = ["lambda.amazonaws.com"]
+    }
+  }
+}
+
+resource "aws_iam_role_policy" "termination_lambda" {
+  name = "php-version-benchmark-termination-lambda-policy"
+  role = aws_iam_role.this.id
+  policy = data.aws_iam_policy_document.termination_lambda.json
+}
+
+data "aws_iam_policy_document" "termination_lambda" {
+  statement {
+    actions = [
+      "tag:GetResources",
+      "ec2:StopInstances",
+      "ec2:StartInstances",
+      "autoscaling:DescribeAutoScalingInstances",
+    ]
+
+    resources = [
+      "*",
+    ]
+  }
+}
+
+resource "aws_iam_role_policy" "termination_lambda_cloudwatch_alarm" {
+  name = "php-version-benchmark-termination-cloudwatch-custom-policy-scheduler"
+  role = aws_iam_role.this.id
+  policy = data.aws_iam_policy_document.termination_lambda_cloudwatch_alarm.json
+}
+
+data "aws_iam_policy_document" "termination_lambda_cloudwatch_alarm" {
+  statement {
+    actions = [
+      "cloudwatch:DisableAlarmActions",
+      "cloudwatch:EnableAlarmActions",
+    ]
+
+    resources = [
+      "*",
+    ]
+  }
+}
+
+resource "aws_iam_role_policy" "lambda_logging" {
+  name = "php-version-benchmark-termination-lambda-logging"
+  role = aws_iam_role.this.id
+  policy = jsonencode({
+    "Version" : "2012-10-17",
+    "Statement" : [
+      {
+        "Action" : [
+          "logs:CreateLogStream",
+          "logs:PutLogEvents"
+        ],
+        "Resource" : "${aws_cloudwatch_log_group.this.arn}:*",
+        "Effect" : "Allow"
+      }
+    ]
+  })
+}
+
+# Convert *.py to .zip because AWS Lambda needs .zip
+data "archive_file" "package" {
+  type = "zip"
+  source_dir = "${var.local_project_root}/build/infrastructure/package/"
+  output_path = "${var.local_project_root}/tmp/aws-stop-start-resources.zip"
+}
+
+# Create Lambda function for stop or start aws resources
+resource "aws_lambda_function" "this" {
+  filename = data.archive_file.package.output_path
+  source_code_hash = data.archive_file.package.output_base64sha256
+  function_name = "php-version-benchmark-termination-lambda-function"
+  role = aws_iam_role.this.arn
+  handler = "scheduler.main.lambda_handler"
+  runtime = "python3.10"
+  timeout = "600"
+  kms_key_arn = ""
+
+  environment {
+    variables = {
+      AWS_REGIONS = var.region
+      SCHEDULE_ACTION = "stop"
+      TAG_KEY = var.scheduler_tag["key"]
+      TAG_VALUE = var.scheduler_tag["value"]
+      EC2_SCHEDULE = "true"
+    }
+  }
+
+  tags = var.tags
+}
+
+locals {
+  rfc_3339_now = "${replace(var.now, " ", "T")}Z"
+  termination_time = timeadd(local.rfc_3339_now, "${var.termination_timeout_in_min}m")
+  termination_hour = formatdate("h", local.termination_time)
+  termination_minute = formatdate("m", local.termination_time)
+  termination_day = formatdate("D", local.termination_time)
+  termination_month = formatdate("M", local.termination_time)
+  termination_year = formatdate("YYYY", local.termination_time)
+  cloudwatch_schedule_expression = "cron(${local.termination_minute} ${local.termination_hour} ${local.termination_day} ${local.termination_month} ? ${local.termination_year})"
+}
+
+resource "aws_cloudwatch_event_rule" "this" {
+  name = "php-version-benchmark-termination-lambda-scheduler"
+  description = "Trigger lambda scheduler"
+  schedule_expression = local.cloudwatch_schedule_expression
+  tags = var.tags
+}
+
+resource "aws_cloudwatch_event_target" "this" {
+  arn = aws_lambda_function.this.arn
+  rule = aws_cloudwatch_event_rule.this.name
+}
+
+resource "aws_lambda_permission" "this" {
+  statement_id = "AllowExecutionFromCloudWatch"
+  action = "lambda:InvokeFunction"
+  principal = "events.amazonaws.com"
+  function_name = aws_lambda_function.this.function_name
+  source_arn = aws_cloudwatch_event_rule.this.arn
+}
+
+resource "aws_cloudwatch_log_group" "this" {
+  name = "/aws/lambda/php-version-benchmark-termination-lambda-function"
+  retention_in_days = 7
+  tags = var.tags
+}
diff --git a/build/infrastructure/aws/variables.tf b/build/infrastructure/aws/variables.tf
index 2bfc09a5..4ab93c83 100644
--- a/build/infrastructure/aws/variables.tf
+++ b/build/infrastructure/aws/variables.tf
@@ -36,6 +36,25 @@ variable "result_root_dir" {
   type = string
 }
 
+variable "tags" {
+  description = "Custom tags on AWS resources"
+  type = map(string)
+
+  default = {
+    "Name" = "php-version-benchmark"
+  }
+}
+
+variable "scheduler_tag" {
+  description = "Identifies AWS resources to stop"
+  type = map(string)
+
+  default = {
+    "key" = "to_stop"
+    "value" = "true"
+  }
+}
+
 variable "termination_timeout_in_min" {
   type = number
 }
diff --git a/build/infrastructure/package/scheduler/__init__.py b/build/infrastructure/package/scheduler/__init__.py
new file mode 100644
index 00000000..6ee487dc
--- /dev/null
+++ b/build/infrastructure/package/scheduler/__init__.py
@@ -0,0 +1 @@
+"""Module containing the logic for the lambda scheduler entry-points."""
diff --git a/build/infrastructure/package/scheduler/exceptions.py b/build/infrastructure/package/scheduler/exceptions.py
new file mode 100644
index 00000000..b93dc8c1
--- /dev/null
+++ b/build/infrastructure/package/scheduler/exceptions.py
@@ -0,0 +1,46 @@
+"""Exception functions."""
+
+import logging
+
+
+def ec2_exception(resource_name: str, resource_id: str, exception) -> None:
+    """Exception raised during execution of ec2 scheduler.
+
+    Log instance, spot instance and autoscaling groups exceptions
+    on the specific aws resources.
+
+    :param str resource_name:
+        Aws resource name
+    :param str resource_id:
+        Aws resource id
+    :param botocore.exceptions.ClientError exception:
+        Client error; its response["Error"]["Code"] selects the log level
+    """
+    info_codes = ["IncorrectInstanceState"]
+    warning_codes = [
+        "UnsupportedOperation",
+        "IncorrectInstanceState",
+        "InvalidParameterCombination",
+    ]
+
+    if exception.response["Error"]["Code"] in info_codes:
+        logging.info(
+            "%s %s: %s",
+            resource_name,
+            resource_id,
+            exception,
+        )
+    elif exception.response["Error"]["Code"] in warning_codes:
+        logging.warning(
+            "%s %s: %s",
+            resource_name,
+            resource_id,
+            exception,
+        )
+    else:
+        logging.error(
+            "Unexpected error on %s %s: %s",
+            resource_name,
+            resource_id,
+            exception,
+        )
diff --git a/build/infrastructure/package/scheduler/filter_resources_by_tags.py b/build/infrastructure/package/scheduler/filter_resources_by_tags.py
new file mode 100644
index 00000000..c6999be0
--- /dev/null
+++ b/build/infrastructure/package/scheduler/filter_resources_by_tags.py
@@ -0,0 +1,54 @@
+"""Filter aws resources with tags."""
+
+from typing import Iterator
+
+import boto3
+
+
+class FilterByTags:
+    """Abstract Filter aws resources by tags in a class."""
+
+    def __init__(self, region_name=None) -> None:
+        """Initialize resourcegroupstaggingapi client."""
+        if region_name:
+            self.rgta = boto3.client(
+                "resourcegroupstaggingapi", region_name=region_name
+            )
+        else:
+            self.rgta = boto3.client("resourcegroupstaggingapi")
+
+    def get_resources(self, resource_type, aws_tags) -> Iterator[str]:
+        """Filter aws resources using resource type and defined tags.
+
+        Returns all the tagged defined resources that are located in
+        the specified Region for the AWS account.
+
+        :param str resource_type:
+            The constraints on the resources that you want returned.
+            The format of each resource type is service[:resourceType] .
+            For example, specifying a resource type of ec2 returns all
+            Amazon EC2 resources (which includes EC2 instances).
+            Specifying a resource type of ec2:instance returns only
+            EC2 instances.
+        :param list[map] aws_tags:
+            A list of TagFilters (keys and values).
+            Each TagFilter specified must contain a key with values
+            as optional. For example:
+            [
+                {
+                    'Key': 'string',
+                    'Values': [
+                        'string',
+                    ]
+                },
+            ]
+        :yield Iterator[str]:
+            The ARNs of the resources
+        """
+        paginator = self.rgta.get_paginator("get_resources")
+        page_iterator = paginator.paginate(
+            TagFilters=aws_tags, ResourceTypeFilters=[resource_type]
+        )
+        for page in page_iterator:
+            for resource_tag_map in page["ResourceTagMappingList"]:
+                yield resource_tag_map["ResourceARN"]
diff --git a/build/infrastructure/package/scheduler/instance_handler.py b/build/infrastructure/package/scheduler/instance_handler.py
new file mode 100644
index 00000000..4c731243
--- /dev/null
+++ b/build/infrastructure/package/scheduler/instance_handler.py
@@ -0,0 +1,81 @@
+"""ec2 instances scheduler."""
+
+from typing import Dict, List
+
+import boto3
+
+from botocore.exceptions import ClientError
+
+from .exceptions import ec2_exception
+from .filter_resources_by_tags import FilterByTags
+
+
+class InstanceScheduler:
+    """Abstract ec2 scheduler in a class."""
+
+    def __init__(self, region_name=None) -> None:
+        """Initialize ec2 scheduler."""
+        if region_name:
+            self.ec2 = boto3.client("ec2", region_name=region_name)
+            self.asg = boto3.client("autoscaling", region_name=region_name)
+        else:
+            self.ec2 = boto3.client("ec2")
+            self.asg = boto3.client("autoscaling")
+        self.tag_api = FilterByTags(region_name=region_name)
+
+    def stop(self, aws_tags: List[Dict]) -> None:
+        """Aws ec2 instance stop function.
+
+        Stop ec2 instances with defined tags, skipping instances that
+        belong to an autoscaling group.
+
+        :param list[map] aws_tags:
+            Aws tags to use for filter resources.
+            For example:
+            [
+                {
+                    'Key': 'string',
+                    'Values': [
+                        'string',
+                    ]
+                }
+            ]
+        """
+        for instance_arn in self.tag_api.get_resources("ec2:instance", aws_tags):
+            instance_id = instance_arn.split("/")[-1]
+            try:
+                if not self.asg.describe_auto_scaling_instances(
+                    InstanceIds=[instance_id]
+                )["AutoScalingInstances"]:
+                    self.ec2.stop_instances(InstanceIds=[instance_id])
+                    print(f"Stop instances {instance_id}")
+            except ClientError as exc:
+                ec2_exception("instance", instance_id, exc)
+
+    def start(self, aws_tags: List[Dict]) -> None:
+        """Aws ec2 instance start function.
+
+        Start ec2 instances with defined tags.
+
+        :param list[map] aws_tags:
+            Aws tags to use for filter resources.
+            For example:
+            [
+                {
+                    'Key': 'string',
+                    'Values': [
+                        'string',
+                    ]
+                }
+            ]
+        """
+        for instance_arn in self.tag_api.get_resources("ec2:instance", aws_tags):
+            instance_id = instance_arn.split("/")[-1]
+            try:
+                if not self.asg.describe_auto_scaling_instances(
+                    InstanceIds=[instance_id]
+                )["AutoScalingInstances"]:
+                    self.ec2.start_instances(InstanceIds=[instance_id])
+                    print(f"Start instances {instance_id}")
+            except ClientError as exc:
+                ec2_exception("instance", instance_id, exc)
diff --git a/build/infrastructure/package/scheduler/main.py b/build/infrastructure/package/scheduler/main.py
new file mode 100644
index 00000000..380c1986
--- /dev/null
+++ b/build/infrastructure/package/scheduler/main.py
@@ -0,0 +1,31 @@
+"""This script stop and start aws resources."""
+import os
+
+from .instance_handler import InstanceScheduler
+
+
+def lambda_handler(event, context):
+    """Main function entrypoint for lambda.
+
+    Stop AWS resources:
+    - instance ec2
+    """
+    # Retrieve variables from aws lambda ENVIRONMENT
+    schedule_action = os.getenv("SCHEDULE_ACTION")
+    aws_regions = os.getenv("AWS_REGIONS").replace(" ", "").split(",")
+    format_tags = [{"Key": os.getenv("TAG_KEY"), "Values": [os.getenv("TAG_VALUE")]}]
+
+    _strategy = {
+        InstanceScheduler: os.getenv("EC2_SCHEDULE"),
+    }
+
+    for service, to_schedule in _strategy.items():
+        if strtobool(to_schedule):
+            for aws_region in aws_regions:
+                strategy = service(aws_region)
+                getattr(strategy, schedule_action)(aws_tags=format_tags)
+
+
+def strtobool(value: str) -> bool:
+    """Convert string to boolean."""
+    return value.lower() in ("yes", "true", "t", "1")
diff --git a/build/infrastructure/package/scheduler/waiters.py b/build/infrastructure/package/scheduler/waiters.py
new file mode 100644
index 00000000..5ec37da8
--- /dev/null
+++ b/build/infrastructure/package/scheduler/waiters.py
@@ -0,0 +1,38 @@
+"""Waiters for aws ec2 instance states."""
+
+from typing import List
+
+import boto3
+
+from botocore.exceptions import ClientError
+
+from .exceptions import ec2_exception
+
+
+class AwsWaiters:
+    """Abstract aws waiter in a class."""
+
+    def __init__(self, region_name=None) -> None:
+        """Initialize aws waiter."""
+        if region_name:
+            self.ec2 = boto3.client("ec2", region_name=region_name)
+        else:
+            self.ec2 = boto3.client("ec2")
+
+    def instance_running(self, instance_ids: List[str]) -> None:
+        """Aws waiter for instance running.
+
+        Wait ec2 instances are in running state.
+
+        :param list instance_ids:
+            The instance IDs to wait.
+        """
+        if instance_ids:
+            try:
+                instance_waiter = self.ec2.get_waiter("instance_running")
+                instance_waiter.wait(
+                    InstanceIds=instance_ids,
+                    WaiterConfig={"Delay": 15, "MaxAttempts": 15},
+                )
+            except ClientError as exc:
+                ec2_exception("waiter", ", ".join(instance_ids), exc)