diff --git a/config/stacks/htsget.ts b/config/stacks/htsget.ts index 321327a4e..316b58d48 100644 --- a/config/stacks/htsget.ts +++ b/config/stacks/htsget.ts @@ -6,8 +6,12 @@ import { vpcProps, } from '../constants'; import { HtsgetStackConfigurableProps } from '../../lib/workload/stateless/stacks/htsget/stack'; +import { fileManagerBuckets, fileManagerInventoryBuckets } from './fileManager'; export const getHtsgetProps = (stage: AppStage): HtsgetStackConfigurableProps => { + const inventorySourceBuckets = fileManagerInventoryBuckets(stage); + const eventSourceBuckets = fileManagerBuckets(stage); + return { vpcProps, apiGatewayCognitoProps: { @@ -17,5 +21,6 @@ export const getHtsgetProps = (stage: AppStage): HtsgetStackConfigurableProps => apiName: 'Htsget', customDomainNamePrefix: 'htsget-file', }, + buckets: [...inventorySourceBuckets, ...eventSourceBuckets], }; }; diff --git a/lib/workload/stateless/stacks/htsget/deploy.toml b/lib/workload/stateless/stacks/htsget/deploy.toml deleted file mode 100644 index 4b99d7c4d..000000000 --- a/lib/workload/stateless/stacks/htsget/deploy.toml +++ /dev/null @@ -1,24 +0,0 @@ -# TODO this will eventually be removed for props-only configuration. - -ticket_server_cors_allow_headers = "All" -ticket_server_cors_allow_origins = "Mirror" -ticket_server_cors_allow_methods = "All" -ticket_server_cors_allow_credentials = true -ticket_server_cors_max_age = 300 - -data_server_enabled = false - -name = "orcabus-htsget-rs" -version = "0.1.0" -organization_name = "UMCCR" -organization_url = "https://umccr.org/" -contact_url = "https://umccr.org/" -documentation_url = "https://github.com/umccr/htsget-rs" - -# The role should prevent any access to other files, although it should probably -# be set here as well. 
-[[resolvers]] -regex = '^(?P.*?)/(?P.*)$' -substitution_string = '$key' -storage.backend = 'S3' - diff --git a/lib/workload/stateless/stacks/htsget/stack.ts b/lib/workload/stateless/stacks/htsget/stack.ts index 1388f09c8..31d630250 100644 --- a/lib/workload/stateless/stacks/htsget/stack.ts +++ b/lib/workload/stateless/stacks/htsget/stack.ts @@ -3,8 +3,7 @@ import { Stack, StackProps } from 'aws-cdk-lib'; import { Role } from 'aws-cdk-lib/aws-iam'; import { IVpc, Vpc, VpcLookupOptions } from 'aws-cdk-lib/aws-ec2'; import { ApiGatewayConstruct, ApiGatewayConstructProps } from '../../../components/api-gateway'; -import path from 'path'; -import { HtsgetLambdaConstruct } from 'htsget-lambda'; +import { HtsgetLambda } from 'htsget-lambda'; /** * Configurable props for the htsget stack. @@ -18,6 +17,10 @@ export type HtsgetStackConfigurableProps = { * API gateway construct props. */ apiGatewayCognitoProps: ApiGatewayConstructProps; + /** + * The buckets to configure for htsget access. + */ + buckets: string[]; }; /** @@ -43,13 +46,24 @@ export class HtsgetStack extends Stack { this.vpc = Vpc.fromLookup(this, 'MainVpc', props.vpcProps); this.apiGateway = new ApiGatewayConstruct(this, 'ApiGateway', props.apiGatewayCognitoProps); - const configPath = path.join(__dirname, 'deploy.toml'); - new HtsgetLambdaConstruct(this, 'Htsget', { - config: configPath, + new HtsgetLambda(this, 'Htsget', { + htsgetConfig: { + environment_override: { + HTSGET_LOCATIONS: props.buckets.map((bucket) => { + let regex = `^${bucket}/(?P.*)$`; + let substitution_string = '$key'; + let backend = `{ kind=S3, bucket=${bucket} }`; + + return `{ regex=${regex}, substitution_string=${substitution_string}, backend=${backend} }`; + }), + }, + }, + cargoLambdaFlags: ['--features', 'aws'], vpc: this.vpc, role: props.role, httpApi: this.apiGateway.httpApi, - gitReference: 'htsget-lambda-v0.5.2', + gitReference: 'htsget-lambda-v0.6.0', + gitForceClone: false, }); } } diff --git 
a/lib/workload/stateless/stacks/metadata-manager/app/management/commands/clean_duplicated_libraries.py b/lib/workload/stateless/stacks/metadata-manager/app/management/commands/clean_duplicated_libraries.py new file mode 100644 index 000000000..c75bf5333 --- /dev/null +++ b/lib/workload/stateless/stacks/metadata-manager/app/management/commands/clean_duplicated_libraries.py @@ -0,0 +1,34 @@ +import json + +from django.core.management import BaseCommand + +from django.db.models import Q + +from app.models import Library + + +# https://docs.djangoproject.com/en/5.0/howto/custom-management-commands/ +class Command(BaseCommand): + help = "Delete all DB data" + + def add_arguments(self, parser): + parser.add_argument( + "--dry-run", + action="store_true", + help="List all libraries that will be deleted without actually deleting them", + ) + + def handle(self, *args, **options): + all_libraries = Library.objects.all().filter( + Q(library_id__icontains="_rerun") | Q(library_id__icontains="_topup")) + + print("Libraries contain matching pattern:") + print(json.dumps([library.library_id for library in all_libraries], indent=4)) + + if not options["dry_run"]: + print("Deleting all libraries") + all_libraries.delete() + else: + print("Dry run: not deleting libraries") + + print('Completed') diff --git a/lib/workload/stateless/stacks/metadata-manager/deploy/construct/lambda-django-command/index.ts b/lib/workload/stateless/stacks/metadata-manager/deploy/construct/lambda-django-command/index.ts new file mode 100644 index 000000000..b2bc52ad8 --- /dev/null +++ b/lib/workload/stateless/stacks/metadata-manager/deploy/construct/lambda-django-command/index.ts @@ -0,0 +1,46 @@ +import path from 'path'; +import { Construct } from 'constructs'; +import { Duration } from 'aws-cdk-lib'; +import { PythonFunction } from '@aws-cdk/aws-lambda-python-alpha'; +import { ISecret } from 'aws-cdk-lib/aws-secretsmanager'; +import { + DockerImageFunction, + DockerImageFunctionProps, + DockerImageCode, 
+} from 'aws-cdk-lib/aws-lambda'; + +type LambdaProps = { + /** + * The basic common lambda properties that it should inherit from + */ + basicLambdaConfig: Partial; + /** + * The secret for the db connection where the lambda will need access to + */ + dbConnectionSecret: ISecret; +}; + +export class LambdaDjangoCommandConstruct extends Construct { + readonly lambda: PythonFunction; + + constructor(scope: Construct, id: string, lambdaProps: LambdaProps) { + super(scope, id); + + this.lambda = new DockerImageFunction(this, 'DjangoCommandLambda', { + environment: { + ...lambdaProps.basicLambdaConfig.environment, + }, + securityGroups: lambdaProps.basicLambdaConfig.securityGroups, + vpc: lambdaProps.basicLambdaConfig.vpc, + vpcSubnets: lambdaProps.basicLambdaConfig.vpcSubnets, + architecture: lambdaProps.basicLambdaConfig.architecture, + code: DockerImageCode.fromImageAsset(path.join(__dirname, '../../../'), { + file: 'deploy/construct/lambda-django-command/lambda.Dockerfile', + }), + timeout: Duration.minutes(15), + memorySize: 4096, + }); + + lambdaProps.dbConnectionSecret.grantRead(this.lambda); + } +} diff --git a/lib/workload/stateless/stacks/metadata-manager/deploy/construct/lambda-django-command/lambda.Dockerfile b/lib/workload/stateless/stacks/metadata-manager/deploy/construct/lambda-django-command/lambda.Dockerfile new file mode 100644 index 000000000..06960068d --- /dev/null +++ b/lib/workload/stateless/stacks/metadata-manager/deploy/construct/lambda-django-command/lambda.Dockerfile @@ -0,0 +1,12 @@ +FROM public.ecr.aws/lambda/python:3.12 + +WORKDIR ${LAMBDA_TASK_ROOT} + +# COPY all files +COPY . . 
+ +# Install the specified packages +RUN pip install -r deps/requirements-full.txt + +# Specify handler +CMD [ "handler.django_command.handler" ] diff --git a/lib/workload/stateless/stacks/metadata-manager/deploy/stack.ts b/lib/workload/stateless/stacks/metadata-manager/deploy/stack.ts index 46f763e3d..586fead23 100644 --- a/lib/workload/stateless/stacks/metadata-manager/deploy/stack.ts +++ b/lib/workload/stateless/stacks/metadata-manager/deploy/stack.ts @@ -12,6 +12,7 @@ import { LambdaAPIConstruct } from './construct/lambda-api'; import { ApiGatewayConstructProps } from '../../../../components/api-gateway'; import { PostgresManagerStack } from '../../../../stateful/stacks/postgres-manager/deploy/stack'; import { LambdaLoadCustomCSVConstruct } from './construct/lambda-load-custom-csv'; +import { LambdaDjangoCommandConstruct } from './construct/lambda-django-command'; export type MetadataManagerStackProps = { /** @@ -89,6 +90,11 @@ export class MetadataManagerStack extends Stack { vpc: vpc, }); + new LambdaDjangoCommandConstruct(this, 'DjangoCommandLambda', { + basicLambdaConfig: basicLambdaConfig, + dbConnectionSecret: dbSecret, + }); + const syncGsheetLambda = new LambdaSyncGsheetConstruct(this, 'SyncGsheetLambda', { basicLambdaConfig: basicLambdaConfig, dbConnectionSecret: dbSecret, diff --git a/lib/workload/stateless/stacks/metadata-manager/handler/django_command.py b/lib/workload/stateless/stacks/metadata-manager/handler/django_command.py new file mode 100644 index 000000000..0c55b975e --- /dev/null +++ b/lib/workload/stateless/stacks/metadata-manager/handler/django_command.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +"""django command lambda module + +Convenience AWS lambda handler that runs a whitelisted Django management command taken from the event +""" +import json +import logging +from django.core.management import execute_from_command_line + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + + +def handler(event, context) -> dict[str, str]: + logger.info(f"Processing event:
{json.dumps(event, indent=4)}") + + command = event.get("command", None) + args = event.get("args", []) + + whitelist_command = ["clean_duplicated_libraries"] + + if command not in whitelist_command: + raise ValueError(f"Command {command} not accepted") + + + res = execute_from_command_line(["./manage.py", command, *args]) + + return res diff --git a/lib/workload/stateless/stacks/metadata-manager/proc/service/utils.py b/lib/workload/stateless/stacks/metadata-manager/proc/service/utils.py index f65b289e2..a3cd5ffaa 100644 --- a/lib/workload/stateless/stacks/metadata-manager/proc/service/utils.py +++ b/lib/workload/stateless/stacks/metadata-manager/proc/service/utils.py @@ -23,7 +23,7 @@ def clean_model_history(minutes: int = None): call_command("clean_duplicate_history", "--auto", minutes=minutes, stdout=open(os.devnull, 'w')) -def sanitize_lab_metadata_df(df: pd.DataFrame): +def sanitize_lab_metadata_df(df: pd.DataFrame) -> pd.DataFrame: """ sanitize record by renaming columns, and clean df cells """ @@ -37,6 +37,11 @@ def sanitize_lab_metadata_df(df: pd.DataFrame): # dropping column that has empty column heading df = df.drop('', axis='columns', errors='ignore') + # We are now removing any '_rerun' or '_topup' postfix from libraries + # See https://github.com/umccr/orcabus/issues/865 + df['library_id'] = df['library_id'].str.replace(r'_rerun\d*$', '', regex=True) + df['library_id'] = df['library_id'].str.replace(r'_topup\d*$', '', regex=True) + df = df.reset_index(drop=True) return df diff --git a/lib/workload/stateless/stacks/metadata-manager/proc/tests/test_tracking_sheet_srv.py b/lib/workload/stateless/stacks/metadata-manager/proc/tests/test_tracking_sheet_srv.py index a8f69b1b1..b1f4fc556 100644 --- a/lib/workload/stateless/stacks/metadata-manager/proc/tests/test_tracking_sheet_srv.py +++ b/lib/workload/stateless/stacks/metadata-manager/proc/tests/test_tracking_sheet_srv.py @@ -1,6 +1,8 @@ import os import json import pandas as pd +from django.db.models import
Q +from django.utils.timezone import override from libumccr.aws import libeb from unittest.mock import MagicMock @@ -11,6 +13,7 @@ from proc.service.tracking_sheet_srv import sanitize_lab_metadata_df, persist_lab_metadata, \ drop_incomplete_tracking_sheet_records from .utils import check_put_event_entries_format, check_put_event_value, is_expected_event_in_output +from ..service.utils import warn_drop_duplicated_library TEST_EVENT_BUS_NAME = "TEST_BUS" @@ -173,6 +176,50 @@ def test_persist_lab_metadata(self): ctc = prj.contact_set.get(contact_id=rec.get("ProjectOwner")) self.assertEqual(ctc.contact_id, rec.get("ProjectOwner"), 'incorrect project-contact link') + def test_rerun_topup_libraries(self) -> None: + """ + python manage.py test proc.tests.test_tracking_sheet_srv.TrackingSheetSrvUnitTests.test_rerun_topup_libraries + + we don't want to treat any topup / rerun libraries as a new record + """ + + # Prepare the initial data with a topup and rerun libraries + final_records = [RECORD_1] + + # topup record + topup_record = RECORD_1.copy() + topup_record['LibraryID'] = topup_record['LibraryID'] + '_topup' + final_records.append(topup_record) + + topup_2_record = RECORD_1.copy() + topup_2_record['LibraryID'] = topup_2_record['LibraryID'] + '_topup23' + final_records.append(topup_2_record) + + # rerun record + rerun_record = RECORD_1.copy() + rerun_record['LibraryID'] = rerun_record['LibraryID'] + '_rerun' + final_records.append(rerun_record) + + # Change the latest library properties to check if latest record is final stored + test_override_cycles = "TEST_123" + rerun_2_record = RECORD_1.copy() + rerun_2_record['LibraryID'] = rerun_2_record['LibraryID'] + '_rerun2342' + rerun_2_record["OverrideCycles"] = test_override_cycles + final_records.append(rerun_2_record) + + metadata_pd = pd.json_normalize(final_records) + metadata_pd = sanitize_lab_metadata_df(metadata_pd) + metadata_pd = warn_drop_duplicated_library(metadata_pd) + + persist_lab_metadata(metadata_pd, 
SHEET_YEAR) + + original_lib = Library.objects.get(library_id=RECORD_1.get("LibraryID")) + self.assertIsNotNone(original_lib, "Original library should be created") + self.assertEqual(original_lib.override_cycles, test_override_cycles, "Latest record is expected to be stored") + + dup_libraries = Library.objects.all().filter(Q(library_id__icontains="_rerun") | Q(library_id__icontains="_topup")) + self.assertEqual(dup_libraries.count(), 0, "Topup and rerun libraries should NOT exist") + def test_new_df_in_different_year(self) -> None: """ python manage.py test proc.tests.test_tracking_sheet_srv.TrackingSheetSrvUnitTests.test_new_df_in_different_year diff --git a/package.json b/package.json index 73187ef12..e72992f41 100644 --- a/package.json +++ b/package.json @@ -28,12 +28,12 @@ "@aws-cdk/aws-pipes-alpha": "2.177.0-alpha.0", "@aws-cdk/aws-pipes-sources-alpha": "2.177.0-alpha.0", "aws-cdk-lib": "2.177.0", - "cargo-lambda-cdk": "0.0.31", + "cargo-lambda-cdk": "^0.0.31", "cdk-nag": "^2.35.3", "constructs": "^10.4.2", "core-js-pure": "^3.40.0", "dotenv": "^16.4.7", - "htsget-lambda": "^0.7.2", + "htsget-lambda": "^0.8.7", "source-map-support": "^0.5.21", "sqs-dlq-monitoring": "^1.2.20" }, diff --git a/yarn.lock b/yarn.lock index ea63107b7..3155c4f19 100644 --- a/yarn.lock +++ b/yarn.lock @@ -23,9 +23,9 @@ __metadata: linkType: hard "@aws-cdk/asset-kubectl-v20@npm:^2.1.3": - version: 2.1.3 - resolution: "@aws-cdk/asset-kubectl-v20@npm:2.1.3" - checksum: 10/1d24c3ceaefa3ded3727e3353d48509a7a87ef99d88ce778e7739f21aa3f145233c8fea21ae93a42133a31d45599057a313847ad59d5cdc983904b51b6c2e761 + version: 2.1.4 + resolution: "@aws-cdk/asset-kubectl-v20@npm:2.1.4" + checksum: 10/2fac2872da5ffd628522e5e9d8600ac6e1e295609f7e77e208f931e2fd1a0c88a638b5b0c4c2c1d01ef3c1986a7e13551fe06dc7f8a927fd234bade4775e5736 languageName: node linkType: hard @@ -635,13 +635,6 @@ __metadata: languageName: node linkType: hard -"@iarna/toml@npm:^3.0.0": - version: 3.0.0 - resolution: 
"@iarna/toml@npm:3.0.0" - checksum: 10/c52161263aaed8c548befd868b1522506e4ea55cf51267f386a6acb468cedc05ab3e80a8b8dffca52344d553cdaea6dff89473ed2655f1fca4dd307561cddcf6 - languageName: node - linkType: hard - "@isaacs/cliui@npm:^8.0.2": version: 8.0.2 resolution: "@isaacs/cliui@npm:8.0.2" @@ -1763,7 +1756,7 @@ __metadata: languageName: node linkType: hard -"cargo-lambda-cdk@npm:0.0.31, cargo-lambda-cdk@npm:^0.0.31": +"cargo-lambda-cdk@npm:^0.0.31": version: 0.0.31 resolution: "cargo-lambda-cdk@npm:0.0.31" dependencies: @@ -2677,17 +2670,16 @@ __metadata: languageName: node linkType: hard -"htsget-lambda@npm:^0.7.2": - version: 0.7.2 - resolution: "htsget-lambda@npm:0.7.2" +"htsget-lambda@npm:^0.8.7": + version: 0.8.7 + resolution: "htsget-lambda@npm:0.8.7" dependencies: - "@iarna/toml": "npm:^3.0.0" cargo-lambda-cdk: "npm:^0.0.31" peerDependencies: aws-cdk-lib: ^2.112.0 bin: htsget_app: bin/htsget-lambda.js - checksum: 10/e313d4885be65b48a717d10b47389da2d850d4159a28e1ba4ca66ec009f1989ceec32f79e8f9f0957f7d541795dd02f252bc6c7d206b5a5df814f56f89864227 + checksum: 10/4a0bab885328a64c60580d8db8dd53ad1d8ffcff3c08a5b3ab350d76aade5314be8e0efb682aee4a3f88d8d660ad84a377d46ab3507d061d8acc66952927a4c9 languageName: node linkType: hard @@ -3977,7 +3969,7 @@ __metadata: "@types/node": "npm:^22.12.0" aws-cdk: "npm:2.177.0" aws-cdk-lib: "npm:2.177.0" - cargo-lambda-cdk: "npm:0.0.31" + cargo-lambda-cdk: "npm:^0.0.31" cdk-nag: "npm:^2.35.3" constructs: "npm:^10.4.2" core-js-pure: "npm:^3.40.0" @@ -3985,7 +3977,7 @@ __metadata: eslint: "npm:^9.19.0" eslint-config-prettier: "npm:^10.0.1" globals: "npm:^15.14.0" - htsget-lambda: "npm:^0.7.2" + htsget-lambda: "npm:^0.8.7" jest: "npm:^29.7.0" jest-junit: "npm:^16.0.0" prettier: "npm:^3.4.2"