From 8388ce20a8d75c2849637941fcadc8bd0465be13 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Fri, 20 Sep 2024 12:32:20 -0400
Subject: [PATCH 01/64] Add back custom storage endpoints

---
 backend/btrixcloud/storages.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py
index 3c79648529..f2ee60d9d6 100644
--- a/backend/btrixcloud/storages.py
+++ b/backend/btrixcloud/storages.py
@@ -749,10 +749,6 @@ def get_storage_refs(
     def get_available_storages(org: Organization = Depends(org_owner_dep)):
         return storage_ops.get_available_storages(org)
 
-    # pylint: disable=unreachable, fixme
-    # todo: enable when ready to support custom storage
-    return storage_ops
-
     @router.post(
         "/customStorage", tags=["organizations"], response_model=AddedResponseName
     )

From 5fbdca1e5250fc4ad53adc58c194159976da72cf Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Fri, 20 Sep 2024 12:59:39 -0400
Subject: [PATCH 02/64] Flesh out tests for setting custom storage

---
 backend/btrixcloud/storages.py |  4 +++
 backend/test/test_org.py       | 64 +++++++++++++++++++++++++++++++---
 2 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py
index f2ee60d9d6..50db7c2710 100644
--- a/backend/btrixcloud/storages.py
+++ b/backend/btrixcloud/storages.py
@@ -765,6 +765,10 @@ async def remove_custom_storage(
     ):
         return await storage_ops.remove_custom_storage(name, org)
 
+    # TODO: Modify this to make it easier to change just the primary or replica locations
+    # without needing to explicitly pass the secrets for what we're not changing every time
+    # - Maybe make it a PATCH
+    # - Maybe split out into two endpoints
     @router.post("/storage", tags=["organizations"], response_model=UpdatedResponse)
     async def update_storage_refs(
         storage: OrgStorageRefs,
diff --git a/backend/test/test_org.py b/backend/test/test_org.py
index ab652aa266..c8319198c5 100644
--- a/backend/test/test_org.py
+++ b/backend/test/test_org.py
@@ -13,6 +13,9 @@
 
 invite_email = "test-user@example.com"
 
+CUSTOM_PRIMARY_STORAGE_NAME = "custom-primary"
+CUSTOM_REPLICA_STORAGE_NAME = "custom-replica"
+
 
 def test_ensure_only_one_default_org(admin_auth_headers):
     r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
@@ -224,8 +227,7 @@ def test_create_org_duplicate_slug(admin_auth_headers, non_default_org_id, slug)
     assert data["detail"] == "duplicate_org_slug"
 
 
-# disable until storage customization is enabled
-def _test_change_org_storage(admin_auth_headers):
+def test_change_org_storage(admin_auth_headers):
     # change to invalid storage
     r = requests.post(
         f"{API_PREFIX}/orgs/{new_oid}/storage",
         headers=admin_auth_headers,
         json={"storage": {"name": "invalid-name"}},
     )
 
     assert r.status_code == 400
 
     assert r.status_code == 400
 
-    # change to valid storage
+    # add custom storages
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{new_oid}/customStorage",
+        headers=admin_auth_headers,
+        json={
+            "name": CUSTOM_PRIMARY_STORAGE_NAME,
+            "access_key": "ADMIN",
+            "secret_key": "PASSW0RD",
+            "bucket": CUSTOM_PRIMARY_STORAGE_NAME,
+            "endpoint_url": "http://local-minio.default:9000/",
+        },
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["added"]
+    assert data["name"] == CUSTOM_PRIMARY_STORAGE_NAME
+
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{new_oid}/customStorage",
+        headers=admin_auth_headers,
+        json={
+            "name": CUSTOM_REPLICA_STORAGE_NAME,
+            "access_key": "ADMIN",
+            "secret_key": "PASSW0RD",
+            "bucket": CUSTOM_REPLICA_STORAGE_NAME,
+            "endpoint_url": 
"http://local-minio.default:9000/", + }, + ) + assert r.status_code == 200 + data = r.json() + assert data["added"] + assert data["name"] == CUSTOM_REPLICA_STORAGE_NAME + + # set org to use custom storages moving forward r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/storage", headers=admin_auth_headers, - json={"storage": {"name": "alt-storage", "custom": False}}, + json={ + "storage": {"name": CUSTOM_PRIMARY_STORAGE_NAME, "custom": True}, + "storageReplicas": [{"name": CUSTOM_REPLICA_STORAGE_NAME, "custom": True}], + }, ) assert r.status_code == 200 assert r.json()["updated"] + # check org was updated as expected + r = requests.get( + f"{API_PREFIX}/orgs/{new_oid}/storage", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + storage = data["storage"] + assert storage["name"] == CUSTOM_PRIMARY_STORAGE_NAME + assert storage["custom"] + + replicas = data["storageReplicas"] + assert len(replicas) == 1 + replica = replicas[0] + assert replica["name"] == CUSTOM_REPLICA_STORAGE_NAME + assert replica["custom"] + def test_remove_user_from_org(admin_auth_headers, default_org_id): # Add new user to org From fb181343117dc6dee7e91119f98991c1191c8215 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Fri, 20 Sep 2024 14:12:12 -0400 Subject: [PATCH 03/64] Fix test issue with bucket not existing for now --- backend/btrixcloud/storages.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 50db7c2710..8c50108fea 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -151,7 +151,7 @@ async def add_custom_storage( self, storagein: S3StorageIn, org: Organization ) -> dict: """Add new custom storage""" - name = "!" + slug_from_name(storagein.name) + name = slug_from_name(storagein.name) if name in org.customStorages: raise HTTPException(status_code=400, detail="storage_already_exists") @@ -292,6 +292,12 @@ async def verify_storage_upload(self, storage: S3Storage, filename: str) -> None key += filename data = b"" + # create bucket if it doesn't yet exist + # TODO: Remove this, useful for dev but in real cases we want to + # fail if bucket doesn't exist/has invalid credentials + resp = await client.create_bucket(Bucket=bucket) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 + resp = await client.put_object(Bucket=bucket, Key=key, Body=data) assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 @@ -769,6 +775,8 @@ async def remove_custom_storage( # without needing to explicitly pass the secrets for what we're not changing every time # - Maybe make it a PATCH # - Maybe split out into two endpoints + # - Add endpoint to reset to default so we don't have to pass secrets in POST request + # to remove custom storage? 
@router.post("/storage", tags=["organizations"], response_model=UpdatedResponse) async def update_storage_refs( storage: OrgStorageRefs, From 03022751787f9f831f9fbf35f578709ea3e4ee62 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 23 Sep 2024 12:04:13 -0400 Subject: [PATCH 04/64] Add additional tests --- backend/test/test_org.py | 53 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/backend/test/test_org.py b/backend/test/test_org.py index c8319198c5..cb8c50718f 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -227,8 +227,8 @@ def test_create_org_duplicate_slug(admin_auth_headers, non_default_org_id, slug) assert data["detail"] == "duplicate_org_slug" -def test_change_org_storage(admin_auth_headers): - # change to invalid storage +def test_change_storage_invalid(admin_auth_headers): + # try to change to invalid storage r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/storage", headers=admin_auth_headers, @@ -237,7 +237,6 @@ def test_change_org_storage(admin_auth_headers): assert r.status_code == 400 - # change to invalid storage r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/storage", headers=admin_auth_headers, @@ -246,6 +245,8 @@ def test_change_org_storage(admin_auth_headers): assert r.status_code == 400 + +def test_add_custom_storage(admin_auth_headers): # add custom storages r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/customStorage", @@ -311,6 +312,52 @@ def test_change_org_storage(admin_auth_headers): assert replica["custom"] +def test_remove_custom_storage(admin_auth_headers): + # Try to remove in-use storages, verify we get expected 400 response + r = requests.delete( + f"{API_PREFIX}/orgs/{new_oid}/customStorage/{CUSTOM_PRIMARY_STORAGE_NAME}", + headers=admin_auth_headers, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "storage_in_use" + + r = requests.delete( + f"{API_PREFIX}/orgs/{new_oid}/customStorage/{CUSTOM_REPLICA_STORAGE_NAME}", + headers=admin_auth_headers, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "storage_in_use" + + # Unset replica storage from org + r = requests.post( + f"{API_PREFIX}/orgs/{new_oid}/storage", + headers=admin_auth_headers, + json={"storage": {"name": CUSTOM_PRIMARY_STORAGE_NAME, "custom": True}}, + ) + + # Delete no longer used replica storage location + r = requests.delete( + f"{API_PREFIX}/orgs/{new_oid}/customStorage/{CUSTOM_REPLICA_STORAGE_NAME}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["deleted"] + + # Check org + r = requests.get( + f"{API_PREFIX}/orgs/{new_oid}/storage", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + storage = data["storage"] + assert storage["name"] == CUSTOM_PRIMARY_STORAGE_NAME + assert storage["custom"] + + assert data["storageReplicas"] == [] + + def test_remove_user_from_org(admin_auth_headers, default_org_id): # Add new user to org r = requests.post( From 19850baf21c0153118d89c1eb5cf372a3873aad2 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 24 Sep 2024 14:36:29 -0400 Subject: [PATCH 05/64] Fix custom storage so it works as expected - Set access_endpoint_url to the endpoint url with bucket so that we can generate a presigned URL as expected - Make adding bucket in verify_storage_upload a backup routine after first exception is raised --- backend/btrixcloud/storages.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/backend/btrixcloud/storages.py 
b/backend/btrixcloud/storages.py index 8c50108fea..5a70af08f8 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -168,7 +168,8 @@ async def add_custom_storage( region=storagein.region, endpoint_url=endpoint_url, endpoint_no_bucket_url=endpoint_no_bucket_url, - access_endpoint_url=storagein.access_endpoint_url or storagein.endpoint_url, + access_endpoint_url=storagein.access_endpoint_url or endpoint_url, + use_access_for_presign=True, ) try: @@ -292,14 +293,16 @@ async def verify_storage_upload(self, storage: S3Storage, filename: str) -> None key += filename data = b"" - # create bucket if it doesn't yet exist - # TODO: Remove this, useful for dev but in real cases we want to - # fail if bucket doesn't exist/has invalid credentials - resp = await client.create_bucket(Bucket=bucket) - assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 - - resp = await client.put_object(Bucket=bucket, Key=key, Body=data) - assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 + try: + resp = await client.put_object(Bucket=bucket, Key=key, Body=data) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 + except: + # create bucket if it doesn't yet exist and then try again + resp = await client.create_bucket(Bucket=bucket) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 + + resp = await client.put_object(Bucket=bucket, Key=key, Body=data) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 def resolve_internal_access_path(self, path): """Resolve relative path for internal access to minio bucket""" From c78b5b8cd7ff0c7e414744dc096f1d1e8ecb2ac5 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 24 Sep 2024 15:09:09 -0400 Subject: [PATCH 06/64] Actually unset custom replica storage before deleting --- backend/test/test_org.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backend/test/test_org.py b/backend/test/test_org.py index cb8c50718f..db6e81d560 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -332,7 +332,10 @@ def test_remove_custom_storage(admin_auth_headers): r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/storage", headers=admin_auth_headers, - json={"storage": {"name": CUSTOM_PRIMARY_STORAGE_NAME, "custom": True}}, + json={ + "storage": {"name": CUSTOM_PRIMARY_STORAGE_NAME, "custom": True}, + "storageReplicas": [], + }, ) # Delete no longer used replica storage location From d5fd8e16506e6397753ca4575ddc99480ac0fa8e Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 24 Sep 2024 16:20:07 -0400 Subject: [PATCH 07/64] Add TODO where custom storage deletion is failing --- backend/btrixcloud/crawlmanager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index d91e6de76d..7bfac756cc 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -248,6 +248,9 @@ async def remove_org_storage(self, storage: StorageRef, oid: str) -> bool: storage_secret = storage.get_storage_secret_name(oid) storage_label = f"btrix.storage={storage_secret}" + # TODO: This is causing deletion of custom storage to fail + # even when it hasn't been used - why has this label been applied + # when no crawls or profiles have been created yet? 
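        # While debugging, one way to see which objects actually carry the
        # label (assumes kubectl access to the cluster; the secret name is
        # the value from get_storage_secret_name above):
        #
        #   kubectl get crawljobs,profilejobs -A \
        #       -l "btrix.storage=<storage-secret-name>" -o name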
if await self.has_custom_jobs_with_label("crawljobs", storage_label): raise HTTPException(status_code=400, detail="storage_in_use") From 1dcd39bc40d70783b12d373c3792f47af48d15d7 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 24 Sep 2024 18:00:04 -0400 Subject: [PATCH 08/64] Fix check for whether storage label is in use --- backend/btrixcloud/crawlmanager.py | 3 --- backend/btrixcloud/k8sapi.py | 5 ++++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 7bfac756cc..d91e6de76d 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -248,9 +248,6 @@ async def remove_org_storage(self, storage: StorageRef, oid: str) -> bool: storage_secret = storage.get_storage_secret_name(oid) storage_label = f"btrix.storage={storage_secret}" - # TODO: This is causing deletion of custom storage to fail - # even when it hasn't been used - why has this label been applied - # when no crawls or profiles have been created yet? if await self.has_custom_jobs_with_label("crawljobs", storage_label): raise HTTPException(status_code=400, detail="storage_in_use") diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index 238155d212..df1893f44e 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -316,7 +316,7 @@ async def has_custom_jobs_with_label(self, plural, label) -> bool: """return true/false if any crawljobs or profilejobs match given label""" try: - await self.custom_api.list_namespaced_custom_object( + resp = await self.custom_api.list_namespaced_custom_object( group="btrix.cloud", version="v1", namespace=self.namespace, @@ -324,6 +324,9 @@ async def has_custom_jobs_with_label(self, plural, label) -> bool: label_selector=label, limit=1, ) + matching_items = resp.get("items", []) + if not matching_items: + return False return True # pylint: disable=broad-exception-caught except Exception: From 1a1cb30da4742f55a623d8d98ccac46471e6b5a6 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 24 Sep 2024 18:08:29 -0400 Subject: [PATCH 09/64] Remove todo on endpoint that's fine --- backend/btrixcloud/storages.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 5a70af08f8..f5a9abb99f 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -774,12 +774,6 @@ async def remove_custom_storage( ): return await storage_ops.remove_custom_storage(name, org) - # TODO: Modify this to make it easier to change just the primary or replica locations - # without needing to explicitly pass the secrets for what we're not changing every time - # - Maybe make it a PATCH - # - Maybe split out into two endpoints - # - Add endpoint to reset to default so we don't have to pass secrets in POST request - # to remove custom storage? 
@router.post("/storage", tags=["organizations"], response_model=UpdatedResponse) async def update_storage_refs( storage: OrgStorageRefs, From 7558ac3acd6d3cc15d12be97efcee2f388bc854a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 24 Sep 2024 18:13:20 -0400 Subject: [PATCH 10/64] Add todos re: tasks necessary to change storage --- backend/btrixcloud/storages.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index f5a9abb99f..562ce0c9ec 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -241,6 +241,21 @@ async def update_storage_refs( org.storage = storage_refs.storage org.storageReplicas = storage_refs.storageReplicas + # TODO: Account for replication if there's stored content, + # we'll need to move it to the new bucket and update paths in the + # database accordingly (if any updates are needed, at minimum should + # probably re-generate presigned URLs?) + # If a replica location is added, we should replicate everything in + # primary storage into it + # If a replica location is removed, should we wait for content to + # be deleted before removing it, or assume that happens manually as + # necessary? + + # We'll also need to make sure any running crawl, bg, or profile jobs + # that use this storage are completed first + + # We can set the org to read-only while handling these details + await self.org_ops.update_storage_refs(org) return {"updated": True} From 46e0fc93eeb8293fdabc04da3226914ce435986c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 09:15:42 -0400 Subject: [PATCH 11/64] Check that no crawls are running before updating storage --- backend/btrixcloud/orgs.py | 9 +++++++++ backend/btrixcloud/storages.py | 27 ++++++++++++++------------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index e5b1ae4751..c3cad79ed6 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -961,6 +961,15 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]: "publicCollectionsCount": public_collections_count, } + async def is_crawl_running(self, oid: UUID) -> bool: + """Return boolean indicating whether any crawls are currently running in org""" + workflows_running_count = await self.crawls_db.count_documents( + {"oid": org.id, "state": {"$in": RUNNING_STATES}} + ) + if workflows_running_count > 0: + return True + return False + async def get_all_org_slugs(self) -> dict[str, list[str]]: """Return list of all org slugs.""" slugs = await self.orgs.distinct("slug", {}) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 562ce0c9ec..9445646ba1 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -238,25 +238,26 @@ async def update_storage_refs( except: raise HTTPException(status_code=400, detail="invalid_storage_ref") + if await self.org_ops.is_crawl_running(org.id): + raise HTTPException(status_code=400, detail="crawl_running") + org.storage = storage_refs.storage org.storageReplicas = storage_refs.storageReplicas - # TODO: Account for replication if there's stored content, - # we'll need to move it to the new bucket and update paths in the - # database accordingly (if any updates are needed, at minimum should - # probably re-generate presigned URLs?) 
-        # If a replica location is added, we should replicate everything in
-        # primary storage into it
-        # If a replica location is removed, should we wait for content to
-        # be deleted before removing it, or assume that happens manually as
-        # necessary?
-
-        # We'll also need to make sure any running crawl, bg, or profile jobs
-        # that use this storage are completed first
-
-        # We can set the org to read-only while handling these details
-
-        await self.org_ops.update_storage_refs(org)
+        await self.org_ops.update_storage_refs(org)
+
+        # TODO: Handle practical consequences of changing buckets
+        # - If previous primary bucket(s) had files stored, copy or move those
+        # into new storage and make necessary updates (e.g. regenerate presigned
+        # URLs?)
+        # - If replica location is added, replicate everything in primary
+        # to new replica storage location
+        # - If replica location is removed, start jobs to delete content?
+        # (or do we want to handle that manually?)
+
+        # We can set the org to read-only while handling these details
+        # Think through timing and/or how to communicate status of jobs to
+        # user, since background jobs don't block
 
         return {"updated": True}

From 791ca6e781c976fae1bebbd23ba5acfe8b07f52c Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 25 Sep 2024 10:08:07 -0400
Subject: [PATCH 12/64] Start adding post-storage update logic

---
 backend/btrixcloud/orgs.py     | 52 ++++++++++++++++++++++++++++--
 backend/btrixcloud/storages.py | 59 +++++++++++++++++++++++++++-------
 2 files changed, 96 insertions(+), 15 deletions(-)

diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py
index c3cad79ed6..179214bb8e 100644
--- a/backend/btrixcloud/orgs.py
+++ b/backend/btrixcloud/orgs.py
@@ -478,6 +478,36 @@ async def update_storage_refs(self, org: Organization) -> bool:
         res = await self.orgs.find_one_and_update({"_id": org.id}, {"$set": set_dict})
         return res is not None
 
+    async def update_file_storage_refs(
+        self, org: Organization, previous_storage: StorageRef, new_storage: StorageRef
+    ) -> bool:
+        """Update storage refs for all crawl and profile files in given org"""
+        res = await self.crawls_db.update_many(
+            {"oid": org.id, "files.storage": previous_storage},
+            {"$set": {"files.$[].storage": new_storage}},
+        )
+        if not res:
+            return False
+
+        res = await self.profiles_db.update_many(
+            {"oid": org.id, "resource.storage": previous_storage},
+            {"$set": {"resource.storage": new_storage}},
+        )
+        if not res:
+            return False
+
+        return True
+
+    async def unset_file_presigned_urls(self, org: Organization) -> bool:
+        """Unset all presigned URLs for files in org"""
+        res = await self.crawls_db.update_many(
+            {"oid": org.id}, {"$set": {"files.$[].presignedUrl": None}}
+        )
+        if not res:
+            return False
+
+        return True
+
     async def update_subscription_data(
         self, update: SubscriptionUpdate
     ) -> Optional[Organization]:
@@ -961,6 +991,29 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]:
             "publicCollectionsCount": public_collections_count,
         }
 
-    async def is_crawl_running(self, oid: UUID) -> bool:
+    async def is_crawl_running(self, org: Organization) -> bool:
         """Return boolean indicating whether any crawls are currently running in org"""
-        workflows_running_count = await self.crawls_db.count_documents(
+        running_count = await self.crawls_db.count_documents(
             {"oid": org.id, "state": {"$in": RUNNING_STATES}}
         )
-        if workflows_running_count > 0:
+        if running_count > 0:
             return True
         return False
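    # Note on the array updates in update_file_storage_refs and
    # unset_file_presigned_urls above: MongoDB's positional "$" operator is
    # not valid inside a query filter, so the filters match on plain dotted
    # paths and the $set clauses use the all-positional "$[]" operator. A
    # rough mongo-shell equivalent of the crawl-files update (ORG_ID,
    # PREV_REF, and NEW_REF are placeholders):
    #
    #   db.crawls.updateMany(
    #     { oid: ORG_ID, "files.storage": PREV_REF },
    #     { $set: { "files.$[].storage": NEW_REF } }
    #   )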
+    async def has_files_stored(self, org: Organization) -> bool:
+        """Return boolean indicating whether any files are stored on org"""
+        crawl_count = await self.crawls_db.count_documents(
+            {"oid": org.id, "files.0": {"$exists": True}},
+        )
+        if crawl_count > 0:
+            return True
+
+        profile_count = await self.profiles_db.count_documents(
+            {"oid": org.id, "resource": {"$exists": True}},
+        )
+        if profile_count > 0:
+            return True
+
+        return False
+
     async def get_all_org_slugs(self) -> dict[str, list[str]]:
         """Return list of all org slugs."""
         slugs = await self.orgs.distinct("slug", {})
diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py
index 9445646ba1..7f056a3158 100644
--- a/backend/btrixcloud/storages.py
+++ b/backend/btrixcloud/storages.py
@@ -241,26 +241,61 @@ async def update_storage_refs(
         if await self.org_ops.is_crawl_running(org.id):
             raise HTTPException(status_code=400, detail="crawl_running")
 
+        prev_storage = org.storage
+        prev_storage_replicas = org.storageReplicas
+
         org.storage = storage_refs.storage
         org.storageReplicas = storage_refs.storageReplicas
 
         await self.org_ops.update_storage_refs(org)
 
-        # TODO: Handle practical consequences of changing buckets
-        # - If previous primary bucket(s) had files stored, copy or move those
-        # into new storage and make necessary updates (e.g. regenerate presigned
-        # URLs?)
-        # - If replica location is added, replicate everything in primary
-        # to new replica storage location
-        # - If replica location is removed, start jobs to delete content?
-        # (or do we want to handle that manually?)
-
-        # We can set the org to read-only while handling these details
-        # Think through timing and/or how to communicate status of jobs to
-        # user, since background jobs don't block
+        asyncio.create_task(
+            self.run_post_storage_update_tasks(
+                OrgStorageRefs(storage=prev_storage, storageReplicas=prev_storage_refs),
+                storage_refs,
+                org,
+            )
+        )
 
         return {"updated": True}
 
+    async def run_post_storage_update_tasks(
+        self,
+        prev_storage_refs: StorageRef,
+        new_storage_refs: StorageRef,
+        org: Organization,
+    ):
+        """Handle tasks necessary after changing org storage"""
+        if not await self.org_ops.has_files_stored(org):
+            return
+
+        if new_storage_refs.storage != prev_storage_refs.storage:
+            await self.org_ops.update_read_only(org, True, "Updating storage")
+
+            # TODO: Copy files from previous to new primary storage
+            # (Ideally block on this, otherwise unset read-only on completion in
+            # operator?)
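            # A rough sketch of that copy step, assuming a single client can
            # reach both buckets via the existing get_s3_client helper
            # (prev_bucket and new_bucket are placeholders; cross-account
            # buckets would need a second client or an rclone job instead):
            #
            #   async with self.get_s3_client(new_storage) as client:
            #       paginator = client.get_paginator("list_objects_v2")
            #       async for page in paginator.paginate(Bucket=prev_bucket):
            #           for obj in page.get("Contents", []):
            #               await client.copy_object(
            #                   Bucket=new_bucket,
            #                   Key=obj["Key"],
            #                   CopySource={"Bucket": prev_bucket, "Key": obj["Key"]},
            #               )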
+ + await self.org_ops.update_file_storage_refs( + org, prev_storage_refs.storage, new_storage_refs.storage + ) + + await self.org_ops.unset_file_presigned_urls(org) + + await self.org_ops.update_read_only(org, False) + + if new_storage_refs.storageReplicas != prev_storage_refs.storageReplicas: + # If replica location added, kick off background jobs to replicate + # primary storage to new replica storage location (non-blocking) + + # If replica location is removed, start jobs to delete files from + # removed replica location (non-blocking) + + # If we also changed primary storage in this update, we should make + # sure all files are successfully copied before doing anything to + # the replicas + pass + def get_available_storages(self, org: Organization) -> List[StorageRef]: """return a list of available default + custom storages""" refs: List[StorageRef] = [] From b899ecf2330385b8d6d2f68821ea26ce3eb057af Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 10:59:49 -0400 Subject: [PATCH 13/64] WIP: Add background job to copy old s3 bucket to new --- backend/btrixcloud/background_jobs.py | 79 ++++++++++++++++++++++++++- backend/btrixcloud/crawlmanager.py | 41 +++++++++++++- backend/btrixcloud/main.py | 2 + backend/btrixcloud/models.py | 15 ++++- backend/btrixcloud/storages.py | 36 +++++++++--- 5 files changed, 162 insertions(+), 11 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 1e0da1e28f..e86cd6e107 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -22,6 +22,7 @@ DeleteReplicaJob, DeleteOrgJob, RecalculateOrgStatsJob, + CopyBucketJob, PaginatedBackgroundJobResponse, AnyJob, StorageRef, @@ -381,6 +382,67 @@ async def create_recalculate_org_stats_job( print(f"warning: recalculate org stats job could not be started: {exc}") return None + async def create_copy_bucket_job( + self, + org: Organization, + prev_storage_ref: StorageRef, + new_storage_ref: StorageRef, + existing_job_id: Optional[str] = None, + ) -> str: + """Start background job to copy entire s3 bucket and return job id""" + prev_storage = self.storage_ops.get_org_storage_by_ref(org, prev_storage_ref) + prev_endpoint, prev_bucket = self.strip_bucket(prev_storage.endpoint_url) + + new_storage = self.storage_ops.get_org_storage_by_ref(org, new_storage_ref) + new_endpoint, new_bucket = self.strip_bucket(new_storage.endpoint_url) + + job_type = BgJobType.COPY_BUCKET.value + + try: + job_id = await self.crawl_manager.run_copy_bucket_job( + oid=str(org.id), + job_type=job_type, + prev_storage=prev_storage_ref, + prev_endpoint=prev_endpoint, + prev_bucket=prev_bucket, + new_storage=new_storage_ref, + new_endpoint=new_endpoint, + new_bucket=new_bucket, + job_id_prefix=f"{job_type}-{object_id}", + existing_job_id=existing_job_id, + ) + if existing_job_id: + copy_job = await self.get_background_job(existing_job_id, org.id) + previous_attempt = { + "started": copy_job.started, + "finished": copy_job.finished, + } + if copy_job.previousAttempts: + copy_job.previousAttempts.append(previous_attempt) + else: + copy_job.previousAttempts = [previous_attempt] + copy_job.started = dt_now() + copy_job.finished = None + copy_job.success = None + else: + copy_job = CopyBucketJob( + id=job_id, + oid=org.id, + started=dt_now(), + prev_storage=prev_storage_ref, + new_storage=new_storage_ref, + ) + + await self.jobs.find_one_and_update( + {"_id": job_id}, {"$set": copy_job.to_dict()}, upsert=True + ) + + return job_id + # pylint: 
disable=broad-exception-caught + except Exception as exc: + print(f"warning: copy bucket job could not be started for org {org.id}") + return "" + async def job_finished( self, job_id: str, @@ -430,7 +492,11 @@ async def job_finished( async def get_background_job( self, job_id: str, oid: Optional[UUID] = None ) -> Union[ - CreateReplicaJob, DeleteReplicaJob, DeleteOrgJob, RecalculateOrgStatsJob + CreateReplicaJob, + DeleteReplicaJob, + CopyBucketJob, + DeleteOrgJob, + RecalculateOrgStatsJob, ]: """Get background job""" query: dict[str, object] = {"_id": job_id} @@ -454,6 +520,9 @@ def _get_job_by_type_from_data(self, data: dict[str, object]): if data["type"] == BgJobType.RECALCULATE_ORG_STATS: return RecalculateOrgStatsJob.from_dict(data) + if data["type"] == BgJobType.COPY_BUCKET: + return CopyBucketJob.from_dict(data) + return DeleteOrgJob.from_dict(data) async def list_background_jobs( @@ -595,6 +664,14 @@ async def retry_background_job( existing_job_id=job_id, ) + if job.type == BgJobType.COPY_BUCKET: + await self.create_copy_bucket_job( + org, + job.prev_storage, + job.new_storage, + existing_job_id=job_id, + ) + return {"success": True} async def retry_failed_background_jobs( diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index d91e6de76d..6d3f009b96 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -123,7 +123,6 @@ async def run_delete_org_job( existing_job_id: Optional[str] = None, ) -> str: """run job to delete org and all of its data""" - if existing_job_id: job_id = existing_job_id else: @@ -177,6 +176,46 @@ async def _run_bg_job_with_ops_classes( await self.create_from_yaml(data, namespace=DEFAULT_NAMESPACE) + async def run_copy_bucket_job( + self, + oid: str, + job_type: str, + prev_storage: StorageRef, + prev_endpoint: str, + prev_bucket: str, + new_storage: StorageRef, + new_endpoint: str, + new_bucket: str, + job_id_prefix: Optional[str] = None, + existing_job_id: Optional[str] = None, + ): + """run job to copy entire contents of one s3 bucket to another""" + if existing_job_id: + job_id = existing_job_id + else: + if not job_id_prefix: + job_id_prefix = job_type + + # ensure name is <=63 characters + job_id = f"{job_id_prefix[:52]}-{secrets.token_hex(5)}" + + params = { + "id": job_id, + "oid": oid, + "job_type": job_type, + "prev_secret_name": prev_storage.get_storage_secret_name(oid), + "prev_endpoint": prev_endpoint, + "prev_bucket": prev_bucket, + "new_secret_name": new_storage.get_storage_secret_name(oid), + "new_endpoint": new_endpoint, + "new_bucket": new_bucket, + "BgJobType": BgJobType, + } + + data = self.templates.env.get_template("copy_job.yaml").render(params) + + await self.create_from_yaml(data) + return job_id async def create_crawl_job( diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index f6b678cb82..d03ad73896 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -254,6 +254,8 @@ def main() -> None: org_ops.set_ops(base_crawl_ops, profiles, coll_ops, background_job_ops) + storage_ops.set_ops(background_job_ops) + user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops) background_job_ops.set_ops(base_crawl_ops, profiles) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 2209c5f79e..7fcfd55072 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -2022,6 +2022,7 @@ class BgJobType(str, Enum): DELETE_REPLICA = "delete-replica" DELETE_ORG = "delete-org" RECALCULATE_ORG_STATS 
= "recalculate-org-stats" + COPY_BUCKET = "copy-bucket" # ============================================================================ @@ -2040,7 +2041,7 @@ class BackgroundJob(BaseMongoModel): # ============================================================================ class CreateReplicaJob(BackgroundJob): - """Model for tracking create of replica jobs""" + """Model for tracking creation of replica jobs""" type: Literal[BgJobType.CREATE_REPLICA] = BgJobType.CREATE_REPLICA file_path: str @@ -2075,15 +2076,25 @@ class RecalculateOrgStatsJob(BackgroundJob): type: Literal[BgJobType.RECALCULATE_ORG_STATS] = BgJobType.RECALCULATE_ORG_STATS +# ============================================================================ +class CopyBucketJob(BackgroundJob): + """Model for tracking job to copy entire s3 bucket""" + + type: Literal[BgJobType.CREATE_REPLICA] = BgJobType.CREATE_REPLICA + prev_storage: StorageRef + new_storage: StorageRef + + # ============================================================================ # Union of all job types, for response model AnyJob = RootModel[ Union[ + BackgroundJob, CreateReplicaJob, DeleteReplicaJob, - BackgroundJob, DeleteOrgJob, + CopyBucketJob, RecalculateOrgStatsJob, ] ] diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 7f056a3158..3ec6a00499 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -22,6 +22,7 @@ import zlib import json import os +import time from datetime import datetime from zipfile import ZipInfo @@ -58,8 +59,9 @@ if TYPE_CHECKING: from .orgs import OrgOps from .crawlmanager import CrawlManager + from .background_jobs import BackgroundJobOps else: - OrgOps = CrawlManager = object + OrgOps = CrawlManager = BackgroundJobOps = object CHUNK_SIZE = 1024 * 256 @@ -90,6 +92,8 @@ def __init__(self, org_ops, crawl_manager) -> None: default_namespace = os.environ.get("DEFAULT_NAMESPACE", "default") self.frontend_origin = f"{frontend_origin}.{default_namespace}" + self.base_crawl_ops = cast(BackgroundJobOps, None) + with open(os.environ["STORAGES_JSON"], encoding="utf-8") as fh: storage_list = json.loads(fh.read()) @@ -128,6 +132,10 @@ def __init__(self, org_ops, crawl_manager) -> None: self.org_ops.set_default_primary_storage(self.default_primary) + def set_ops(self, background_job_ops: BackgroundJobOps) -> None: + """Set background job ops""" + self.background_job_ops = background_job_ops + def _create_s3_storage(self, storage: dict[str, str]) -> S3Storage: """create S3Storage object""" endpoint_url = storage["endpoint_url"] @@ -261,8 +269,8 @@ async def update_storage_refs( async def run_post_storage_update_tasks( self, - prev_storage_refs: StorageRef, - new_storage_refs: StorageRef, + prev_storage_refs: OrgStorageRefs, + new_storage_refs: OrgStorageRefs, org: Organization, ): """Handle tasks necessary after changing org storage""" @@ -272,14 +280,28 @@ async def run_post_storage_update_tasks( if new_storage_refs.storage != prev_storage_refs.storage: await self.org_ops.update_read_only(org, True, "Updating storage") - # TODO: Copy files from from previous to new primary storage - # (Ideally block on this, otherwise unset read-only on completion in - # operator?) + # Create the background job to copy files + job_id = await self.background_job_ops.create_copy_bucket_job( + org, prev_storage_refs.storage, new_storage_refs.storage + ) + + # Block until copy job is complete + # TODO: Handle in operator instead? 
+            while True:
+                bg_job = await self.background_job_ops.get_background_job(
+                    job_id, org.id
+                )
+                if bg_job.success:
+                    break
+                if bg_job.success is False:
+                    # TODO: Handle failure case
+                    pass
+
+                await asyncio.sleep(5)
 
             await self.org_ops.update_file_storage_refs(
                 org, prev_storage_refs.storage, new_storage_refs.storage
             )
-            await self.org_ops.unset_file_presigned_urls(org)
 
             await self.org_ops.update_read_only(org, False)

From 21070d6da1a51003398a749cc30025e03cebdb71 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 25 Sep 2024 11:06:58 -0400
Subject: [PATCH 14/64] WIP: Start adding logic to handle replica location
 updates

---
 backend/btrixcloud/storages.py | 37 +++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py
index 3ec6a00499..20e68d6000 100644
--- a/backend/btrixcloud/storages.py
+++ b/backend/btrixcloud/storages.py
@@ -294,7 +294,8 @@ async def run_post_storage_update_tasks(
                 if bg_job.success:
                     break
                 if bg_job.success is False:
-                    # TODO: Handle failure case
+                    # TODO: Handle failure case, including partial failure
+                    # (i.e. some files but not all copied)
                     pass
 
                 await asyncio.sleep(5)
@@ -307,16 +308,32 @@ async def run_post_storage_update_tasks(
             await self.org_ops.update_read_only(org, False)
 
         if new_storage_refs.storageReplicas != prev_storage_refs.storageReplicas:
-            # If replica location added, kick off background jobs to replicate
-            # primary storage to new replica storage location (non-blocking)
-
-            # If replica location is removed, start jobs to delete files from
-            # removed replica location (non-blocking)
+            # TODO: If we changed primary storage in this update, make sure that
+            # all files are successfully copied to new primary before doing
+            # anything with the replicas - this may be simple or complex depending
+            # on final approach taken to handle above
+
+            # Replicate files to any new replica locations
+            for replica_storage in new_storage_refs.storageReplicas:
+                if replica_storage not in prev_storage_refs.storageReplicas:
+                    # TODO: Kick off background jobs to replicate primary
+                    # storage to new replica location
+                    print(
+                        f"Not yet implemented: Replicate files to {replica_storage.name}",
+                        flush=True,
+                    )
 
-            # If we also changed primary storage in this update, we should make
-            # sure all files are successfully copied before doing anything to
-            # the replicas
-            pass
+            # Delete files from previous replica locations that are no longer
+            # being used
+            for replica_storage in prev_storage_refs.storageReplicas:
+                if replica_storage not in new_storage_refs.storageReplicas:
+                    # TODO: Kick off background jobs to delete replicas
+                    # (may be easier to just delete all files from bucket
+                    # in one rclone command)
+                    print(
+                        f"Not yet implemented: Delete files from {replica_storage.name}",
+                        flush=True,
+                    )

From 7b6a9179e6f6486e6eeb17ccf4f97c5acedb582c Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 25 Sep 2024 11:08:22 -0400
Subject: [PATCH 15/64] Add additional note

---
 backend/btrixcloud/storages.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py
index 20e68d6000..22b84dab0a 100644
--- a/backend/btrixcloud/storages.py
+++ b/backend/btrixcloud/storages.py
@@ -273,7 +273,11 @@ async def run_post_storage_update_tasks(
         new_storage_refs: OrgStorageRefs,
         org: Organization,
     ):
-        """Handle 
tasks necessary after changing org storage""" + """Handle tasks necessary after changing org storage + + This is a good candidate for a background job with access to ops + classes, which may kick off other background jobs as needed. + """ if not await self.org_ops.has_files_stored(org): return From 3b3116f5405b4c9e63a9f3f1f0f13241a0abf728 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 11:55:05 -0400 Subject: [PATCH 16/64] Fix argument --- backend/btrixcloud/storages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 22b84dab0a..4ddc935e92 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -246,7 +246,7 @@ async def update_storage_refs( except: raise HTTPException(status_code=400, detail="invalid_storage_ref") - if await self.org_ops.is_crawl_running(org.id): + if await self.org_ops.is_crawl_running(org): raise HTTPException(status_code=400, detail="crawl_running") prev_storage = org.storage From 90765551ff591d2e50599b717320d99332411971 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 12:16:23 -0400 Subject: [PATCH 17/64] Fix another argument --- backend/btrixcloud/storages.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 4ddc935e92..89c3991865 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -259,7 +259,9 @@ async def update_storage_refs( asyncio.create_task( self.run_post_storage_update_tasks( - OrgStorageRefs(storage=prev_storage, storageReplicas=prev_storage_refs), + OrgStorageRefs( + storage=prev_storage, storageReplicas=prev_storage_replicas + ), storage_refs, org, ) From 853b84084a464b2d5aabf30649571d7b952e43ce Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 12:20:07 -0400 Subject: [PATCH 18/64] Fixups --- backend/btrixcloud/background_jobs.py | 2 +- backend/btrixcloud/storages.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index e86cd6e107..064f9555de 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -408,7 +408,7 @@ async def create_copy_bucket_job( new_storage=new_storage_ref, new_endpoint=new_endpoint, new_bucket=new_bucket, - job_id_prefix=f"{job_type}-{object_id}", + job_id_prefix=job_type, existing_job_id=existing_job_id, ) if existing_job_id: diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 89c3991865..d970895af7 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -67,7 +67,7 @@ # ============================================================================ -# pylint: disable=broad-except,raise-missing-from +# pylint: disable=broad-except,raise-missing-from,too-many-public-methods class StorageOps: """All storage handling, download/upload operations""" @@ -92,7 +92,7 @@ def __init__(self, org_ops, crawl_manager) -> None: default_namespace = os.environ.get("DEFAULT_NAMESPACE", "default") self.frontend_origin = f"{frontend_origin}.{default_namespace}" - self.base_crawl_ops = cast(BackgroundJobOps, None) + self.background_job_ops = cast(BackgroundJobOps, None) with open(os.environ["STORAGES_JSON"], encoding="utf-8") as fh: storage_list = json.loads(fh.read()) From 3a4b43c32c33f7b50fe0282b85e39101e7f6b061 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 
2024 13:11:37 -0400 Subject: [PATCH 19/64] Fix linting --- backend/btrixcloud/background_jobs.py | 6 ++++-- backend/btrixcloud/basecrawls.py | 2 +- backend/btrixcloud/colls.py | 1 + backend/btrixcloud/crawlconfigs.py | 2 +- backend/btrixcloud/crawlmanager.py | 1 + backend/btrixcloud/crawls.py | 4 ++-- backend/btrixcloud/db.py | 4 ++-- backend/btrixcloud/emailsender.py | 2 +- backend/btrixcloud/invites.py | 1 + backend/btrixcloud/k8sapi.py | 2 +- .../btrixcloud/migrations/migration_0032_dupe_org_names.py | 2 +- backend/btrixcloud/operator/baseoperator.py | 2 +- backend/btrixcloud/operator/crawls.py | 2 +- backend/btrixcloud/operator/cronjobs.py | 2 +- backend/btrixcloud/orgs.py | 2 +- backend/btrixcloud/pages.py | 4 ++-- backend/btrixcloud/profiles.py | 4 ++-- backend/btrixcloud/storages.py | 2 +- backend/btrixcloud/subs.py | 2 ++ backend/btrixcloud/uploads.py | 2 +- backend/btrixcloud/users.py | 2 +- backend/btrixcloud/webhooks.py | 4 ++-- 22 files changed, 31 insertions(+), 24 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 064f9555de..b9e4cc0659 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -440,7 +440,9 @@ async def create_copy_bucket_job( return job_id # pylint: disable=broad-exception-caught except Exception as exc: - print(f"warning: copy bucket job could not be started for org {org.id}") + print( + f"warning: copy bucket job could not be started for org {org.id}: {exc}" + ) return "" async def job_finished( @@ -707,7 +709,7 @@ async def retry_all_failed_background_jobs( # ============================================================================ -# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme +# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, too-many-positional-arguments def init_background_jobs_api( app, mdb, email, user_manager, org_ops, crawl_manager, storage_ops, user_dep ): diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index d913d3362b..fea5feeecc 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -54,7 +54,7 @@ # ============================================================================ -# pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines +# pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines, too-many-positional-arguments class BaseCrawlOps: """operations that apply to all crawls""" diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index d0fdd43a7c..a9b863da23 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -43,6 +43,7 @@ # ============================================================================ +# pylint: disable=too-many-positional-arguments class CollectionOps: """ops for working with named collections of crawls""" diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 8d231733c7..28f24260e1 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -73,7 +73,7 @@ class CrawlConfigOps: """Crawl Config Operations""" - # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods + # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods, too-many-positional-arguments user_manager: UserManager org_ops: OrgOps diff --git a/backend/btrixcloud/crawlmanager.py 
b/backend/btrixcloud/crawlmanager.py index 6d3f009b96..f7e1a8a180 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -21,6 +21,7 @@ # ============================================================================ +# pylint: disable=too-many-positional-arguments class CrawlManager(K8sAPI): """abstract crawl manager""" diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 5a0994fe70..3deec4ea95 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -71,7 +71,7 @@ # ============================================================================ -# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods +# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods, too-many-positional-arguments class CrawlOps(BaseCrawlOps): """Crawl Ops""" @@ -1092,7 +1092,7 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id: str): # ============================================================================ -# pylint: disable=too-many-arguments, too-many-locals, too-many-statements +# pylint: disable=too-many-arguments, too-many-locals, too-many-statements, too-many-positional-arguments def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args): """API for crawl management, including crawl done callback""" # pylint: disable=invalid-name, duplicate-code diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index 91b2bc7057..d928c6da6d 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -72,7 +72,7 @@ async def ping_db(mdb): # ============================================================================ async def update_and_prepare_db( - # pylint: disable=R0913 + # pylint: disable=R0913, too-many-positional-arguments mdb, user_manager, org_ops, @@ -191,7 +191,7 @@ async def drop_indexes(mdb): # ============================================================================ -# pylint: disable=too-many-arguments +# pylint: disable=too-many-arguments, too-many-positional-arguments async def create_indexes( org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops, user_manager, page_ops ): diff --git a/backend/btrixcloud/emailsender.py b/backend/btrixcloud/emailsender.py index 06f5311d4c..7c28cf4881 100644 --- a/backend/btrixcloud/emailsender.py +++ b/backend/btrixcloud/emailsender.py @@ -17,7 +17,7 @@ from .utils import is_bool, get_origin -# pylint: disable=too-few-public-methods, too-many-instance-attributes +# pylint: disable=too-few-public-methods, too-many-instance-attributes, too-many-positional-arguments class EmailSender: """SMTP Email Sender""" diff --git a/backend/btrixcloud/invites.py b/backend/btrixcloud/invites.py index 7a733bc7bb..ffe15b55ce 100644 --- a/backend/btrixcloud/invites.py +++ b/backend/btrixcloud/invites.py @@ -27,6 +27,7 @@ # ============================================================================ +# pylint: disable=too-many-positional-arguments class InviteOps: """invite users (optionally to an org), send emails and delete invites""" diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index df1893f44e..b9ae450060 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -22,7 +22,7 @@ # ============================================================================ -# pylint: disable=too-many-instance-attributes +# pylint: disable=too-many-instance-attributes, too-many-positional-arguments class K8sAPI: """K8S API accessors""" diff --git 
a/backend/btrixcloud/migrations/migration_0032_dupe_org_names.py b/backend/btrixcloud/migrations/migration_0032_dupe_org_names.py index d2c06781e0..c1f417a4d5 100644 --- a/backend/btrixcloud/migrations/migration_0032_dupe_org_names.py +++ b/backend/btrixcloud/migrations/migration_0032_dupe_org_names.py @@ -54,7 +54,7 @@ async def migrate_up(self): orgs_db, org_name_set, org_slug_set, name, org_dict.get("_id") ) - # pylint: disable=too-many-arguments + # pylint: disable=too-many-arguments, too-many-positional-arguments async def update_org_name_and_slug( self, orgs_db, diff --git a/backend/btrixcloud/operator/baseoperator.py b/backend/btrixcloud/operator/baseoperator.py index ab9fe43418..c1604b5486 100644 --- a/backend/btrixcloud/operator/baseoperator.py +++ b/backend/btrixcloud/operator/baseoperator.py @@ -136,7 +136,7 @@ async def async_init(self) -> None: print("Auto-Resize Enabled", self.enable_auto_resize) -# pylint: disable=too-many-instance-attributes, too-many-arguments +# pylint: disable=too-many-instance-attributes, too-many-arguments, too-many-positional-arguments # ============================================================================ class BaseOperator: """BaseOperator""" diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index a39064a1b2..52ca3dba24 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -78,7 +78,7 @@ # pylint: disable=too-many-public-methods, too-many-locals, too-many-branches, too-many-statements -# pylint: disable=invalid-name, too-many-lines, too-many-return-statements +# pylint: disable=invalid-name, too-many-lines, too-many-return-statements, too-many-positional-arguments # ============================================================================ class CrawlOperator(BaseOperator): """CrawlOperator Handler""" diff --git a/backend/btrixcloud/operator/cronjobs.py b/backend/btrixcloud/operator/cronjobs.py index 9d0367aec3..d25d49675c 100644 --- a/backend/btrixcloud/operator/cronjobs.py +++ b/backend/btrixcloud/operator/cronjobs.py @@ -49,7 +49,7 @@ def get_finished_response( status=status, ) - # pylint: disable=too-many-arguments + # pylint: disable=too-many-arguments, too-many-positional-arguments async def make_new_crawljob( self, cid: UUID, diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 179214bb8e..269c9dec23 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -109,7 +109,7 @@ # ============================================================================ -# pylint: disable=too-many-public-methods, too-many-instance-attributes, too-many-locals +# pylint: disable=too-many-public-methods, too-many-instance-attributes, too-many-locals, too-many-positional-arguments class OrgOps: """Organization API operations""" diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index a980567c49..fc7ca065c8 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -42,7 +42,7 @@ # ============================================================================ -# pylint: disable=too-many-instance-attributes, too-many-arguments,too-many-public-methods +# pylint: disable=too-many-instance-attributes, too-many-arguments,too-many-public-methods, too-many-positional-arguments class PageOps: """crawl pages""" @@ -632,7 +632,7 @@ async def get_qa_run_aggregate_counts( # ============================================================================ -# pylint: disable=too-many-arguments, too-many-locals, 
invalid-name, fixme +# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, too-many-positional-arguments def init_pages_api(app, mdb, crawl_ops, org_ops, storage_ops, user_dep): """init pages API""" # pylint: disable=invalid-name diff --git a/backend/btrixcloud/profiles.py b/backend/btrixcloud/profiles.py index ab72422472..239f6f8d70 100644 --- a/backend/btrixcloud/profiles.py +++ b/backend/btrixcloud/profiles.py @@ -49,7 +49,7 @@ # ============================================================================ -# pylint: disable=too-many-instance-attributes, too-many-arguments +# pylint: disable=too-many-instance-attributes, too-many-arguments, too-many-positional-arguments class ProfileOps: """Profile management""" @@ -495,7 +495,7 @@ async def calculate_org_profile_file_storage(self, oid: UUID) -> int: # ============================================================================ -# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments +# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments, too-many-positional-arguments def init_profiles_api( mdb, org_ops: OrgOps, diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index d970895af7..ba4e2616e7 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -67,7 +67,7 @@ # ============================================================================ -# pylint: disable=broad-except,raise-missing-from,too-many-public-methods +# pylint: disable=broad-except,raise-missing-from,too-many-public-methods, too-many-positional-arguments class StorageOps: """All storage handling, download/upload operations""" diff --git a/backend/btrixcloud/subs.py b/backend/btrixcloud/subs.py index fd4a359686..2b4abe0a3f 100644 --- a/backend/btrixcloud/subs.py +++ b/backend/btrixcloud/subs.py @@ -56,6 +56,8 @@ class SubOps: """API for managing subscriptions. 
Only enabled if billing is enabled""" + # pylint: disable=too-many-positional-arguments + org_ops: OrgOps user_manager: UserManager diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index ded0630719..489fc19a5d 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -273,7 +273,7 @@ def read(self, size: Optional[int] = CHUNK_SIZE) -> bytes: # ============================================================================ -# pylint: disable=too-many-arguments, too-many-locals, invalid-name +# pylint: disable=too-many-arguments, too-many-locals, invalid-name, too-many-positional-arguments def init_uploads_api(app, user_dep, *args): """uploads api""" diff --git a/backend/btrixcloud/users.py b/backend/btrixcloud/users.py index ce480923cf..04de196f1e 100644 --- a/backend/btrixcloud/users.py +++ b/backend/btrixcloud/users.py @@ -318,7 +318,7 @@ async def request_verify( user.email, token, dict(request.headers) if request else None ) - # pylint: disable=too-many-arguments + # pylint: disable=too-many-arguments, too-many-positional-arguments async def create_user( self, name: str, diff --git a/backend/btrixcloud/webhooks.py b/backend/btrixcloud/webhooks.py index 251cc25106..4293f23891 100644 --- a/backend/btrixcloud/webhooks.py +++ b/backend/btrixcloud/webhooks.py @@ -40,7 +40,7 @@ class EventWebhookOps: """Event webhook notification management""" - # pylint: disable=invalid-name, too-many-arguments, too-many-locals + # pylint: disable=invalid-name, too-many-arguments, too-many-locals, too-many-positional-arguments org_ops: OrgOps crawl_ops: CrawlOps @@ -556,7 +556,7 @@ async def create_collection_deleted_notification( ) -# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme +# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, too-many-positional-arguments def init_event_webhooks_api(mdb, org_ops, app): """init event webhooks system""" # pylint: disable=invalid-name From 048a563f4c81e5e9375942d12c3292605ccf9785 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 13:17:06 -0400 Subject: [PATCH 20/64] More linting fixes --- backend/btrixcloud/background_jobs.py | 2 +- backend/btrixcloud/crawlconfigs.py | 2 +- backend/btrixcloud/storages.py | 2 +- backend/btrixcloud/subs.py | 2 +- backend/btrixcloud/uploads.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index b9e4cc0659..8a0fd6572f 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -52,7 +52,7 @@ class BackgroundJobOps: base_crawl_ops: BaseCrawlOps profile_ops: ProfileOps - # pylint: disable=too-many-locals, too-many-arguments, invalid-name + # pylint: disable=too-many-locals, too-many-arguments, too-many-positional-arguments, invalid-name def __init__(self, mdb, email, user_manager, org_ops, crawl_manager, storage_ops): self.jobs = mdb["jobs"] diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 28f24260e1..bc4ad3977a 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -1081,7 +1081,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID): # ============================================================================ -# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments +# pylint: 
disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments,too-many-positional-arguments def init_crawl_config_api( app, dbclient, diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index ba4e2616e7..2f1706ac3f 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -392,7 +392,7 @@ async def verify_storage_upload(self, storage: S3Storage, filename: str) -> None try: resp = await client.put_object(Bucket=bucket, Key=key, Body=data) assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 - except: + except Exception: # create bucket if it doesn't yet exist and then try again resp = await client.create_bucket(Bucket=bucket) assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 diff --git a/backend/btrixcloud/subs.py b/backend/btrixcloud/subs.py index 2b4abe0a3f..95c17dd158 100644 --- a/backend/btrixcloud/subs.py +++ b/backend/btrixcloud/subs.py @@ -334,7 +334,7 @@ async def get_billing_portal_url( return SubscriptionPortalUrlResponse() -# pylint: disable=invalid-name,too-many-arguments +# pylint: disable=invalid-name,too-many-arguments,too-many-positional-arguments def init_subs_api( app, mdb, diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index 489fc19a5d..05fcfa0214 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -53,7 +53,7 @@ async def get_upload( return UploadedCrawl.from_dict(res) # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods, too-many-function-args - # pylint: disable=too-many-arguments, too-many-locals, duplicate-code, invalid-name + # pylint: disable=too-many-positional-arguments, too-many-locals, duplicate-code, invalid-name async def upload_stream( self, stream, From e3c41b99d444b7dafee60d6ade9a0bef9d1f2e2a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 10:51:46 -0400 Subject: [PATCH 21/64] Refactor, separate storage and replica updates --- backend/btrixcloud/background_jobs.py | 15 ++- backend/btrixcloud/models.py | 13 +- backend/btrixcloud/orgs.py | 18 ++- backend/btrixcloud/storages.py | 168 ++++++++++++++------------ backend/test/test_org.py | 25 ++-- 5 files changed, 139 insertions(+), 100 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 8a0fd6572f..8bf3ea9801 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -408,7 +408,7 @@ async def create_copy_bucket_job( new_storage=new_storage_ref, new_endpoint=new_endpoint, new_bucket=new_bucket, - job_id_prefix=job_type, + job_id_prefix=f"{job_type}-{org.id}", existing_job_id=existing_job_id, ) if existing_job_id: @@ -470,6 +470,9 @@ async def job_finished( await self.handle_delete_replica_job_finished( cast(DeleteReplicaJob, job) ) + if job_type == BgJobType.COPY_BUCKET: + org = await self.orgs_ops.get_org_by_id(oid) + await self.org_ops.update_read_only(org, False) else: print( f"Background job {job.id} failed, sending email to superuser", @@ -513,16 +516,16 @@ async def get_background_job( def _get_job_by_type_from_data(self, data: dict[str, object]): """convert dict to proper background job type""" - if data["type"] == BgJobType.CREATE_REPLICA: + if data["type"] == BgJobType.CREATE_REPLICA.value: return CreateReplicaJob.from_dict(data) - if data["type"] == BgJobType.DELETE_REPLICA: + if data["type"] == BgJobType.DELETE_REPLICA.value: return DeleteReplicaJob.from_dict(data) - if data["type"] == BgJobType.RECALCULATE_ORG_STATS:
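A note on the `.value` comparisons introduced in this hunk: job documents come back from MongoDB with their type as a plain string, and if BgJobType is a plain Enum rather than a str subclass, a string never compares equal to an enum member, only to the member's .value. A minimal sketch of the distinction, with the member value assumed for illustration:

from enum import Enum

class BgJobType(Enum):
    # value assumed for illustration; the real members live in models.py
    COPY_BUCKET = "copy-bucket"

data = {"type": "copy-bucket"}  # as deserialized from MongoDB
print(data["type"] == BgJobType.COPY_BUCKET)        # False: a str never equals a plain Enum member
print(data["type"] == BgJobType.COPY_BUCKET.value)  # True: compare against the raw value instead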
+ if data["type"] == BgJobType.RECALCULATE_ORG_STATS.value: return RecalculateOrgStatsJob.from_dict(data) - if data["type"] == BgJobType.COPY_BUCKET: + if data["type"] == BgJobType.COPY_BUCKET.value: return CopyBucketJob.from_dict(data) return DeleteOrgJob.from_dict(data) @@ -536,7 +539,7 @@ async def list_background_jobs( job_type: Optional[str] = None, sort_by: Optional[str] = None, sort_direction: Optional[int] = -1, - ) -> Tuple[List[BackgroundJob], int]: + ) -> Tuple[List[Union[CreateReplicaJob, DeleteReplicaJob, CopyBucketJob]], int]: """List all background jobs""" # pylint: disable=duplicate-code # Zero-index page for query diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 7fcfd55072..6404bd3fd4 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1144,12 +1144,17 @@ class RenameOrg(BaseModel): # ============================================================================ -class OrgStorageRefs(BaseModel): - """Input model for setting primary storage + optional replicas""" +class OrgStorageRef(BaseModel): + """Input model for setting primary storage""" storage: StorageRef - storageReplicas: List[StorageRef] = [] + +# ============================================================================ +class OrgStorageReplicaRefs(BaseModel): + """Input model for setting replica storages""" + + storageReplicas: List[StorageRef] # ============================================================================ @@ -2080,7 +2085,7 @@ class RecalculateOrgStatsJob(BackgroundJob): class CopyBucketJob(BackgroundJob): """Model for tracking job to copy entire s3 bucket""" - type: Literal[BgJobType.CREATE_REPLICA] = BgJobType.CREATE_REPLICA + type: Literal[BgJobType.COPY_BUCKET] = BgJobType.COPY_BUCKET prev_storage: StorageRef new_storage: StorageRef diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 269c9dec23..df2daf0564 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -471,9 +471,17 @@ async def update_slug_and_name(self, org: Organization) -> bool: ) return res is not None - async def update_storage_refs(self, org: Organization) -> bool: - """Update storage + replicas for given org""" - set_dict = org.dict(include={"storage": True, "storageReplicas": True}) + async def update_storage_refs( + self, org: Organization, replicas: bool = False + ) -> bool: + """Update storage or replica refs for given org""" + include = {} + if replicas: + include["storageReplicas"] = True + else: + include["storage"] = True + + set_dict = org.dict(include=include) res = await self.orgs.find_one_and_update({"_id": org.id}, {"$set": set_dict}) return res is not None @@ -1003,13 +1011,13 @@ async def is_crawl_running(self, org: Organization) -> bool: async def has_files_stored(self, org: Organization) -> bool: """Return boolean indicating whether any files are stored on org""" crawl_count = await self.crawls_db.count_documents( - {"_id": org.id, "files.1": {"$exists": True}}, + {"oid": org.id, "files": {"$exists": True, "$ne": []}}, ) if crawl_count > 0: return True profile_count = await self.profiles_db.count_documents( - {"_id": org.id, "resource": {"$exists": True}}, + {"oid": org.id, "resource": {"$exists": True}}, ) if profile_count > 0: return True diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 2f1706ac3f..b9582fdb96 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -46,7 +46,8 @@ StorageRef, S3Storage, S3StorageIn, - OrgStorageRefs, + 
OrgStorageRef, + OrgStorageReplicaRefs, DeletedResponse, UpdatedResponse, AddedResponseName, @@ -230,116 +231,117 @@ async def remove_custom_storage( return {"deleted": True} - async def update_storage_refs( + async def update_storage_ref( self, storage_refs: OrgStorageRefs, org: Organization, ) -> dict[str, bool]: """update storage for org""" + storage_ref = storage_refs.storage try: - self.get_org_storage_by_ref(org, storage_refs.storage) - - for replica in storage_refs.storageReplicas: - self.get_org_storage_by_ref(org, replica) - + self.get_org_storage_by_ref(org, storage_ref) except: raise HTTPException(status_code=400, detail="invalid_storage_ref") if await self.org_ops.is_crawl_running(org): raise HTTPException(status_code=400, detail="crawl_running") - prev_storage = org.storage - prev_storage_replicas = org.storageReplicas + if org.storage == storage_ref: + raise HTTPException(status_code=400, detail="identical_storage_ref") + + # TODO: Check that no CreateReplicaJobs are running - org.storage = storage_refs.storage - org.storageReplicas = storage_refs.storageReplicas + prev_storage = org.storage + org.storage = storage_ref await self.org_ops.update_storage_refs(org) - asyncio.create_task( - self.run_post_storage_update_tasks( - OrgStorageRefs( - storage=prev_storage, storageReplicas=prev_storage_replicas - ), - storage_refs, - org, - ) + # TODO: Consider running into asyncio task + await self.run_post_storage_update_tasks( + prev_storage_ref, + storage_ref, + org, ) return {"updated": True} async def run_post_storage_update_tasks( self, - prev_storage_refs: OrgStorageRefs, - new_storage_refs: OrgStorageRefs, + prev_storage_ref: StorageRef, + new_storage_ref: StorageRef, org: Organization, ): - """Handle tasks necessary after changing org storage - - This is a good candidate for a background job with access to ops - classes, which may kick off other background jobs as needed. - """ + """Handle tasks necessary after changing org storage""" if not await self.org_ops.has_files_stored(org): + print(f"No files stored", flush=True) return - if new_storage_refs.storage != prev_storage_refs.storage: + if new_storage_ref != prev_storage_ref: await self.org_ops.update_read_only(org, True, "Updating storage") # Create the background job to copy files - job_id = await self.background_job_ops.create_copy_bucket_job( - org, prev_storage_refs.storage, new_storage_refs.storage + await self.background_job_ops.create_copy_bucket_job( + org, prev_storage_ref, new_storage_ref ) - # Block until copy job is complete - # TODO: Handle in operator instead? - while True: - bg_job = await self.background_job_ops.get_background_job( - job_id, org.id - ) - if bg_job.success: - break - if bg_job.success is False: - # TODO: Handle failure case, including partial failure - # (i.e. 
some files but not all copied) - pass - - time.sleep(5) - await self.org_ops.update_file_storage_refs( - org, prev_storage_refs.storage, new_storage_refs.storage + org, prev_storage_ref, new_storage_ref ) await self.org_ops.unset_file_presigned_urls(org) - await self.org_ops.update_read_only(org, False) + async def update_storage_replica_refs( + self, + storage_refs: OrgStorageReplicaRefs, + org: Organization, + ) -> dict[str, bool]: + """update storage for org""" - if new_storage_refs.storageReplicas != prev_storage_refs.storageReplicas: - # TODO: If we changed primary storage in this update, make sure that - # are files are successfully copied to new primary before doing - # anything with the replicas - this may be simple or complex depending - # on final approach taken to handle above + replicas = storage_refs.storageReplicas - # Replicate files to any new replica locations - for replica_storage in new_storage_refs.storageReplicas: - if replica_storage not in prev_storage_refs.storageReplicas: - # TODO: Kick off background jobs to replicate primary - # storage to new replica location - print( - "Not yet implemented: Replicate files to {replica_storage.name}", - flush=True, - ) + try: + for replica in replicas: + self.get_org_storage_by_ref(org, replica) + except: + raise HTTPException(status_code=400, detail="invalid_storage_ref") - # Delete files from previous replica locations that are no longer - # being used - for replica_storage in prev_storage_refs.storageReplicas: - if replica_storage not in new_storage_refs.storageReplicas: - # TODO: Kick off background jobs to delete replicas - # (may be easier to just delete all files from bucket - # in one rclone command) - print( - "Not yet implemented: Delete files from {replica_storage.name}", - flush=True, - ) + if await self.org_ops.is_crawl_running(org): + raise HTTPException(status_code=400, detail="crawl_running") + + if org.storageReplicas == replicas: + raise HTTPException(status_code=400, detail="identical_storage_ref") + + # TODO: Check that no CreateReplicaJobs are running + + prev_storage_replicas = org.storageReplicas + org.storageReplicas = replicas + + await self.org_ops.update_storage_refs(org, replicas=True) + + # Replicate files to any new replica locations + for replica_storage in replicas: + if replica_storage not in prev_storage_replicas: + # TODO: Kick off background jobs to replicate primary + # storage to new replica location + print( + "Not yet implemented: Replicate files to {replica_storage.name}", + flush=True, + ) + + # Delete files from previous replica locations that are no longer + # being used + for replica_storage in prev_storage_replicas: + if replica_storage not in replicas: + # TODO: Kick off background jobs to delete replicas + # (may be easier to just delete all files from bucket + # in one rclone command - if so, will need to handle + # updating files in db as well) + print( + "Not yet implemented: Delete files from {replica_storage.name}", + flush=True, + ) + + return {"updated": True} def get_available_storages(self, org: Organization) -> List[StorageRef]: """return a list of available default + custom storages""" @@ -850,12 +852,15 @@ def get_storage_refs( """get storage refs for an org""" return OrgStorageRefs(storage=org.storage, storageReplicas=org.storageReplicas) - @router.get("/allStorages", tags=["organizations"], response_model=List[StorageRef]) + @router.get( + "/all-storages", tags=["organizations"], response_model=List[StorageRef] + ) def get_available_storages(org: Organization = 
Depends(org_owner_dep)): return storage_ops.get_available_storages(org) + # TODO: Normalize casing with other endpoints @router.post( - "/customStorage", tags=["organizations"], response_model=AddedResponseName + "/custom-storage", tags=["organizations"], response_model=AddedResponseName ) async def add_custom_storage( storage: S3StorageIn, org: Organization = Depends(org_owner_dep) @@ -863,7 +868,7 @@ async def add_custom_storage( return await storage_ops.add_custom_storage(storage, org) @router.delete( - "/customStorage/{name}", tags=["organizations"], response_model=DeletedResponse + "/custom-storage/{name}", tags=["organizations"], response_model=DeletedResponse ) async def remove_custom_storage( name: str, org: Organization = Depends(org_owner_dep) @@ -871,10 +876,21 @@ async def remove_custom_storage( return await storage_ops.remove_custom_storage(name, org) @router.post("/storage", tags=["organizations"], response_model=UpdatedResponse) - async def update_storage_refs( + async def update_storage_ref( + storage: OrgStorageRefs, + org: Organization = Depends(org_owner_dep), + ): + return await storage_ops.update_storage_ref(storage, org) + + return storage_ops + + @router.post( + "/storage-replicas", tags=["organizations"], response_model=UpdatedResponse + ) + async def update_storage_replica_refs( storage: OrgStorageRefs, org: Organization = Depends(org_owner_dep), ): - return await storage_ops.update_storage_refs(storage, org) + return await storage_ops.update_storage_replica_refs(storage, org) return storage_ops diff --git a/backend/test/test_org.py b/backend/test/test_org.py index db6e81d560..f447010c65 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -249,7 +249,7 @@ def test_change_storage_invalid(admin_auth_headers): def test_add_custom_storage(admin_auth_headers): # add custom storages r = requests.post( - f"{API_PREFIX}/orgs/{new_oid}/customStorage", + f"{API_PREFIX}/orgs/{new_oid}/custom-storage", headers=admin_auth_headers, json={ "name": CUSTOM_PRIMARY_STORAGE_NAME, @@ -265,7 +265,7 @@ def test_add_custom_storage(admin_auth_headers): assert data["name"] == CUSTOM_PRIMARY_STORAGE_NAME r = requests.post( - f"{API_PREFIX}/orgs/{new_oid}/customStorage", + f"{API_PREFIX}/orgs/{new_oid}/custom-storage", headers=admin_auth_headers, json={ "name": CUSTOM_REPLICA_STORAGE_NAME, @@ -280,19 +280,27 @@ def test_add_custom_storage(admin_auth_headers): assert data["added"] assert data["name"] == CUSTOM_REPLICA_STORAGE_NAME - # set org to use custom storages moving forward + # set org to use custom storage moving forward r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/storage", headers=admin_auth_headers, json={ "storage": {"name": CUSTOM_PRIMARY_STORAGE_NAME, "custom": True}, - "storageReplicas": [{"name": CUSTOM_REPLICA_STORAGE_NAME, "custom": True}], }, ) assert r.status_code == 200 assert r.json()["updated"] + # set org to use custom storage replica moving forward + r = requests.post( + f"{API_PREFIX}/orgs/{new_oid}/storage-replicas", + headers=admin_auth_headers, + json={ + "storageReplicas": [{"name": CUSTOM_REPLICA_STORAGE_NAME, "custom": True}], + }, + ) + # check org was updated as expected r = requests.get( f"{API_PREFIX}/orgs/{new_oid}/storage", @@ -315,14 +323,14 @@ def test_add_custom_storage(admin_auth_headers): def test_remove_custom_storage(admin_auth_headers): # Try to remove in-use storages, verify we get expected 400 response r = requests.delete( - f"{API_PREFIX}/orgs/{new_oid}/customStorage/{CUSTOM_PRIMARY_STORAGE_NAME}", + 
f"{API_PREFIX}/orgs/{new_oid}/custom-storage/{CUSTOM_PRIMARY_STORAGE_NAME}", headers=admin_auth_headers, ) assert r.status_code == 400 assert r.json()["detail"] == "storage_in_use" r = requests.delete( - f"{API_PREFIX}/orgs/{new_oid}/customStorage/{CUSTOM_REPLICA_STORAGE_NAME}", + f"{API_PREFIX}/orgs/{new_oid}/custom-storage/{CUSTOM_REPLICA_STORAGE_NAME}", headers=admin_auth_headers, ) assert r.status_code == 400 @@ -330,17 +338,16 @@ def test_remove_custom_storage(admin_auth_headers): # Unset replica storage from org r = requests.post( - f"{API_PREFIX}/orgs/{new_oid}/storage", + f"{API_PREFIX}/orgs/{new_oid}/storage-replicas", headers=admin_auth_headers, json={ - "storage": {"name": CUSTOM_PRIMARY_STORAGE_NAME, "custom": True}, "storageReplicas": [], }, ) # Delete no longer used replica storage location r = requests.delete( - f"{API_PREFIX}/orgs/{new_oid}/customStorage/{CUSTOM_REPLICA_STORAGE_NAME}", + f"{API_PREFIX}/orgs/{new_oid}/custom-storage/{CUSTOM_REPLICA_STORAGE_NAME}", headers=admin_auth_headers, ) assert r.status_code == 200 From 34310dea16fc0bbf8b9e2e9b5a2062f43c9ffc96 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 10:59:25 -0400 Subject: [PATCH 22/64] More refactoring --- backend/btrixcloud/storages.py | 58 +++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index b9582fdb96..a47d1e5d81 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -245,7 +245,10 @@ async def update_storage_ref( raise HTTPException(status_code=400, detail="invalid_storage_ref") if await self.org_ops.is_crawl_running(org): - raise HTTPException(status_code=400, detail="crawl_running") + raise HTTPException(status_code=403, detail="crawl_running") + + if org.readOnly: + raise HTTPException(status_code=403, detail="org_set_to_read_only") if org.storage == storage_ref: raise HTTPException(status_code=400, detail="identical_storage_ref") @@ -257,7 +260,7 @@ async def update_storage_ref( await self.org_ops.update_storage_refs(org) - # TODO: Consider running into asyncio task + # TODO: Consider running into asyncio task or background job await self.run_post_storage_update_tasks( prev_storage_ref, storage_ref, @@ -274,21 +277,20 @@ async def run_post_storage_update_tasks( ): """Handle tasks necessary after changing org storage""" if not await self.org_ops.has_files_stored(org): - print(f"No files stored", flush=True) + print(f"No files stored, no updates to do", flush=True) return - if new_storage_ref != prev_storage_ref: - await self.org_ops.update_read_only(org, True, "Updating storage") + await self.org_ops.update_read_only(org, True, "Updating storage") - # Create the background job to copy files - await self.background_job_ops.create_copy_bucket_job( - org, prev_storage_ref, new_storage_ref - ) + # Create the background job to copy files + await self.background_job_ops.create_copy_bucket_job( + org, prev_storage_ref, new_storage_ref + ) - await self.org_ops.update_file_storage_refs( - org, prev_storage_ref, new_storage_ref - ) - await self.org_ops.unset_file_presigned_urls(org) + await self.org_ops.update_file_storage_refs( + org, prev_storage_ref, new_storage_ref + ) + await self.org_ops.unset_file_presigned_urls(org) async def update_storage_replica_refs( self, @@ -306,7 +308,10 @@ async def update_storage_replica_refs( raise HTTPException(status_code=400, detail="invalid_storage_ref") if await self.org_ops.is_crawl_running(org): - raise 
HTTPException(status_code=400, detail="crawl_running") + raise HTTPException(status_code=403, detail="crawl_running") + + if org.readOnly: + raise HTTPException(status_code=403, detail="org_set_to_read_only") if org.storageReplicas == replicas: raise HTTPException(status_code=400, detail="identical_storage_ref") @@ -318,6 +323,26 @@ async def update_storage_replica_refs( await self.org_ops.update_storage_refs(org, replicas=True) + # TODO: Consider running in asyncio task or background job + await self.run_post_storage_replica_update_tasks( + prev_storage_replicas, replicas, org + ) + + return {"updated": True} + + async def run_post_storage_replica_update_tasks( + self, + prev_storage_refs: List[StorageRef], + new_storage_refs: List[StorageRef], + org: Organization, + ): + """Handle tasks necessary after updating org replica storages""" + if not await self.org_ops.has_files_stored(org): + print(f"No files stored, no updates to do", flush=True) + return + + await self.org_ops.update_read_only(org, True, "Updating storage replicas") + # Replicate files to any new replica locations for replica_storage in replicas: if replica_storage not in prev_storage_replicas: @@ -341,7 +366,10 @@ async def update_storage_replica_refs( flush=True, ) - return {"updated": True} + # TODO: Unset read only once all tasks are complete + # May need to handle this in the operators or a background job + # depending on how post-update tasks are run + await self.org_ops.update_read_only(org, False) def get_available_storages(self, org: Organization) -> List[StorageRef]: """return a list of available default + custom storages""" From 80cc5d410db8b9cc173b7dd5088e8ea06971eb97 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 11:00:01 -0400 Subject: [PATCH 23/64] Make post-update task methods private --- backend/btrixcloud/storages.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index a47d1e5d81..b3b1c91176 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -261,7 +261,7 @@ async def update_storage_ref( await self.org_ops.update_storage_refs(org) # TODO: Consider running into asyncio task or background job - await self.run_post_storage_update_tasks( + await self._run_post_storage_update_tasks( prev_storage_ref, storage_ref, org, @@ -269,7 +269,7 @@ async def update_storage_ref( return {"updated": True} - async def run_post_storage_update_tasks( + async def _run_post_storage_update_tasks( self, prev_storage_ref: StorageRef, new_storage_ref: StorageRef, @@ -324,13 +324,13 @@ async def update_storage_replica_refs( await self.org_ops.update_storage_refs(org, replicas=True) # TODO: Consider running in asyncio task or background job - await self.run_post_storage_replica_update_tasks( + await self._run_post_storage_replica_update_tasks( prev_storage_replicas, replicas, org ) return {"updated": True} - async def run_post_storage_replica_update_tasks( + async def _run_post_storage_replica_update_tasks( self, prev_storage_refs: List[StorageRef], new_storage_refs: List[StorageRef], From 41120448c7c64d1b2fbc240dfabc3725f1c63a10 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 11:04:58 -0400 Subject: [PATCH 24/64] Check if any bg jobs running before changing storage --- backend/btrixcloud/storages.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 
b3b1c91176..fb89c295ee 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -244,16 +244,20 @@ async def update_storage_ref( except: raise HTTPException(status_code=400, detail="invalid_storage_ref") + if org.storage == storage_ref: + raise HTTPException(status_code=400, detail="identical_storage_ref") + if await self.org_ops.is_crawl_running(org): raise HTTPException(status_code=403, detail="crawl_running") if org.readOnly: raise HTTPException(status_code=403, detail="org_set_to_read_only") - if org.storage == storage_ref: - raise HTTPException(status_code=400, detail="identical_storage_ref") - - # TODO: Check that no CreateReplicaJobs are running + _, jobs_running_count = await self.background_job_ops.list_background_jobs( + org=org, success=None + ) + if jobs_running_count > 0: + raise HTTPException(status_code=403, detail="background_jobs_running") prev_storage = org.storage org.storage = storage_ref @@ -307,16 +311,20 @@ async def update_storage_replica_refs( except: raise HTTPException(status_code=400, detail="invalid_storage_ref") + if org.storageReplicas == replicas: + raise HTTPException(status_code=400, detail="identical_storage_ref") + if await self.org_ops.is_crawl_running(org): raise HTTPException(status_code=403, detail="crawl_running") if org.readOnly: raise HTTPException(status_code=403, detail="org_set_to_read_only") - if org.storageReplicas == replicas: - raise HTTPException(status_code=400, detail="identical_storage_ref") - - # TODO: Check that no CreateReplicaJobs are running + _, jobs_running_count = await self.background_job_ops.list_background_jobs( + org=org, success=None + ) + if jobs_running_count > 0: + raise HTTPException(status_code=403, detail="background_jobs_running") prev_storage_replicas = org.storageReplicas org.storageReplicas = replicas From 287fae9ea047be77d237655ea10fe36564fe2c50 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 11:05:40 -0400 Subject: [PATCH 25/64] Check bg job finished as well --- backend/btrixcloud/storages.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index fb89c295ee..35745feb8c 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -254,7 +254,7 @@ async def update_storage_ref( raise HTTPException(status_code=403, detail="org_set_to_read_only") _, jobs_running_count = await self.background_job_ops.list_background_jobs( - org=org, success=None + org=org, success=None, finished=None ) if jobs_running_count > 0: raise HTTPException(status_code=403, detail="background_jobs_running") @@ -321,7 +321,7 @@ async def update_storage_replica_refs( raise HTTPException(status_code=403, detail="org_set_to_read_only") _, jobs_running_count = await self.background_job_ops.list_background_jobs( - org=org, success=None + org=org, success=None, finished=None ) if jobs_running_count > 0: raise HTTPException(status_code=403, detail="background_jobs_running") From 5096005acc6782d82c7f5f5325d32a0af5c14f41 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 11:13:34 -0400 Subject: [PATCH 26/64] Fixups --- backend/btrixcloud/background_jobs.py | 3 +-- backend/btrixcloud/models.py | 9 +++++++++ backend/btrixcloud/storages.py | 28 +++++++++++++-------------- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 8bf3ea9801..5e1e33917f 100644 --- a/backend/btrixcloud/background_jobs.py 
+++ b/backend/btrixcloud/background_jobs.py @@ -16,7 +16,6 @@ from .models import ( BaseFile, Organization, - BackgroundJob, BgJobType, CreateReplicaJob, DeleteReplicaJob, @@ -471,7 +470,7 @@ async def job_finished( cast(DeleteReplicaJob, job) ) if job_type == BgJobType.COPY_BUCKET: - org = await self.orgs_ops.get_org_by_id(oid) + org = await self.org_ops.get_org_by_id(oid) await self.org_ops.update_read_only(org, False) else: print( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 6404bd3fd4..98d085af4d 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1143,6 +1143,15 @@ class RenameOrg(BaseModel): slug: Optional[str] = None +# ============================================================================ +class OrgStorageRefs(BaseModel): + """Model for org storage references""" + + storage: StorageRef + + storageReplicas: List[StorageRef] = [] + + # ============================================================================ class OrgStorageRef(BaseModel): """Input model for setting primary storage""" diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 35745feb8c..e9ede970ff 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -22,7 +22,6 @@ import zlib import json import os -import time from datetime import datetime from zipfile import ZipInfo @@ -46,6 +45,7 @@ StorageRef, S3Storage, S3StorageIn, + OrgStorageRefs, OrgStorageRef, OrgStorageReplicaRefs, DeletedResponse, @@ -233,7 +233,7 @@ async def remove_custom_storage( async def update_storage_ref( self, - storage_refs: OrgStorageRefs, + storage_refs: OrgStorageRef, org: Organization, ) -> dict[str, bool]: """update storage for org""" @@ -259,7 +259,7 @@ async def update_storage_ref( if jobs_running_count > 0: raise HTTPException(status_code=403, detail="background_jobs_running") - prev_storage = org.storage + prev_storage_ref = org.storage org.storage = storage_ref await self.org_ops.update_storage_refs(org) @@ -281,7 +281,7 @@ async def _run_post_storage_update_tasks( ): """Handle tasks necessary after changing org storage""" if not await self.org_ops.has_files_stored(org): - print(f"No files stored, no updates to do", flush=True) + print("No files stored, no updates to do", flush=True) return await self.org_ops.update_read_only(org, True, "Updating storage") @@ -340,20 +340,20 @@ async def update_storage_replica_refs( async def _run_post_storage_replica_update_tasks( self, - prev_storage_refs: List[StorageRef], - new_storage_refs: List[StorageRef], + prev_replica_refs: List[StorageRef], + new_replica_refs: List[StorageRef], org: Organization, ): """Handle tasks necessary after updating org replica storages""" if not await self.org_ops.has_files_stored(org): - print(f"No files stored, no updates to do", flush=True) + print("No files stored, no updates to do", flush=True) return await self.org_ops.update_read_only(org, True, "Updating storage replicas") # Replicate files to any new replica locations - for replica_storage in replicas: - if replica_storage not in prev_storage_replicas: + for replica_storage in new_replica_refs: + if replica_storage not in prev_replica_refs: # TODO: Kick off background jobs to replicate primary # storage to new replica location print( @@ -363,8 +363,8 @@ async def _run_post_storage_replica_update_tasks( # Delete files from previous replica locations that are no longer # being used - for replica_storage in prev_storage_replicas: - if replica_storage not in replicas: + for 
replica_storage in prev_replica_refs: + if replica_storage not in new_replica_refs: # TODO: Kick off background jobs to delete replicas # (may be easier to just delete all files from bucket # in one rclone command - if so, will need to handle @@ -913,18 +913,16 @@ async def remove_custom_storage( @router.post("/storage", tags=["organizations"], response_model=UpdatedResponse) async def update_storage_ref( - storage: OrgStorageRefs, + storage: OrgStorageRef, org: Organization = Depends(org_owner_dep), ): return await storage_ops.update_storage_ref(storage, org) - return storage_ops - @router.post( "/storage-replicas", tags=["organizations"], response_model=UpdatedResponse ) async def update_storage_replica_refs( - storage: OrgStorageRefs, + storage: OrgStorageReplicaRefs, org: Organization = Depends(org_owner_dep), ): return await storage_ops.update_storage_replica_refs(storage, org) From 7d7fd3638fcdf5fd40ee0598ee08a6d50f469572 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 11:29:57 -0400 Subject: [PATCH 27/64] Storage update improvements --- backend/btrixcloud/orgs.py | 44 ++++++++++++++++++++++++++++++++-- backend/btrixcloud/storages.py | 34 ++++++++++---------------- 2 files changed, 55 insertions(+), 23 deletions(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index df2daf0564..eaebeae323 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -491,14 +491,14 @@ async def update_file_storage_refs( ) -> bool: """Update storage refs for all crawl and profile files in given org""" res = await self.crawls_db.update_many( - {"_id": org.id, "files.$.storage": previous_storage}, + {"oid": org.id, "files.$.storage": previous_storage}, {"$set": {"files.$.storage": new_storage}}, ) if not res: return False res = await self.profiles_db.update_many( - {"_id": org.id, "resource.storage": previous_storage}, + {"oid": org.id, "resource.storage": previous_storage}, {"$set": {"resource.storage": new_storage}}, ) if not res: @@ -506,6 +506,46 @@ async def update_file_storage_refs( return True + async def add_file_replica_storage_refs( + self, org: Organization, new_storage: StorageRef + ) -> bool: + """Add replica storage ref for all files in given org""" + res = await self.crawls_db.update_many( + {"oid": org.id}, + {"$push": {"files.$.replicas": new_storage}}, + ) + if not res: + return False + + res = await self.profiles_db.update_many( + {"oid": org.id}, + {"$push": {"resource.replicas": new_storage}}, + ) + if not res: + return False + + return True + + async def remove_file_replica_storage_refs( + self, org: Organization, new_storage: StorageRef + ) -> bool: + """Remove replica storage ref from all files in given org""" + res = await self.crawls_db.update_many( + {"oid": org.id}, + {"$pull": {"files.$.replicas": new_storage}}, + ) + if not res: + return False + + res = await self.profiles_db.update_many( + {"oid": org.id}, + {"$pull": {"resource.replicas": new_storage}}, + ) + if not res: + return False + + return True + async def unset_file_presigned_urls(self, org: Organization) -> bool: """Unset all presigned URLs for files in org""" res = await self.crawls_db.update_many( diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index e9ede970ff..aa5c22303b 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -264,7 +264,7 @@ async def update_storage_ref( await self.org_ops.update_storage_refs(org) - # TODO: Consider running into asyncio task or background job + # TODO: Run in 
asyncio task or background job? await self._run_post_storage_update_tasks( prev_storage_ref, storage_ref, @@ -286,7 +286,6 @@ async def _run_post_storage_update_tasks( await self.org_ops.update_read_only(org, True, "Updating storage") - # Create the background job to copy files await self.background_job_ops.create_copy_bucket_job( org, prev_storage_ref, new_storage_ref ) @@ -294,6 +293,7 @@ async def _run_post_storage_update_tasks( await self.org_ops.update_file_storage_refs( org, prev_storage_ref, new_storage_ref ) + await self.org_ops.unset_file_presigned_urls(org) async def update_storage_replica_refs( @@ -331,7 +331,7 @@ async def update_storage_replica_refs( await self.org_ops.update_storage_refs(org, replicas=True) - # TODO: Consider running in asyncio task or background job + # TODO: Run in asyncio task or background job? await self._run_post_storage_replica_update_tasks( prev_storage_replicas, replicas, org ) @@ -349,35 +349,27 @@ async def _run_post_storage_replica_update_tasks( print("No files stored, no updates to do", flush=True) return - await self.org_ops.update_read_only(org, True, "Updating storage replicas") + # TODO: Determine if we need to set read-only for replica operations + # (likely not?) + # await self.org_ops.update_read_only(org, True, "Updating storage replicas") # Replicate files to any new replica locations for replica_storage in new_replica_refs: if replica_storage not in prev_replica_refs: - # TODO: Kick off background jobs to replicate primary - # storage to new replica location - print( - "Not yet implemented: Replicate files to {replica_storage.name}", - flush=True, + await self.background_job_ops.create_copy_bucket_job( + org, org.storage, replica_storage ) + await self.org_ops.add_file_replica_storage_refs(org, replica_storage) # Delete files from previous replica locations that are no longer # being used for replica_storage in prev_replica_refs: if replica_storage not in new_replica_refs: - # TODO: Kick off background jobs to delete replicas - # (may be easier to just delete all files from bucket - # in one rclone command - if so, will need to handle - # updating files in db as well) - print( - "Not yet implemented: Delete files from {replica_storage.name}", - flush=True, - ) + # TODO: Background job to delete files with rclone delete? 
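A note on the MongoDB array updates in the orgs.py hunks above: the positional `$` operator targets only the first array element matched by the query filter (and requires that array field to appear in the filter), while the all-positional `$[]` operator targets every element. A sketch of the `$[]` pattern for clearing presigned URLs on all file entries, assuming a Motor database handle like the backend's mdb; the helper and collection names are invented for illustration:

from motor.motor_asyncio import AsyncIOMotorDatabase

async def unset_presigned_urls(mdb: AsyncIOMotorDatabase, oid) -> None:
    # $[] touches every element of the files array on every matched document;
    # plain $ would touch only the first element matched by the query filter
    await mdb["crawls"].update_many(
        {"oid": oid},
        {"$set": {"files.$[].presignedUrl": None}},
    )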
- # TODO: Unset read only once all tasks are complete - # May need to handle this in the operators or a background job - # depending on how post-update tasks are run - await self.org_ops.update_read_only(org, False) + await self.org_ops.remove_file_replica_storage_refs( + org, replica_storage + ) def get_available_storages(self, org: Organization) -> List[StorageRef]: """return a list of available default + custom storages""" From cb461f6258e7fef61619aabbbec79f9b50d16f0e Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 11:57:53 -0400 Subject: [PATCH 28/64] Fixup --- backend/btrixcloud/orgs.py | 2 +- backend/btrixcloud/storages.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index eaebeae323..84c249d6fa 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -549,7 +549,7 @@ async def remove_file_replica_storage_refs( async def unset_file_presigned_urls(self, org: Organization) -> bool: """Unset all presigned URLs for files in org""" res = await self.crawls_db.update_many( - {"_id": org.id}, {"$set": {"files.$.presignedUrl": None}} + {"oid": org.id}, {"$set": {"files.$.presignedUrl": None}} ) if not res: return False diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index aa5c22303b..89c1227e58 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -361,12 +361,10 @@ async def _run_post_storage_replica_update_tasks( ) await self.org_ops.add_file_replica_storage_refs(org, replica_storage) - # Delete files from previous replica locations that are no longer - # being used + # Delete no-longer-used replica storage refs from files + # TODO: Determine if we want to delete files from the buckets as well for replica_storage in prev_replica_refs: if replica_storage not in new_replica_refs: - # TODO: Background job to delete files with rclone delete? - await self.org_ops.remove_file_replica_storage_refs( org, replica_storage ) From 6e3f3efb6441891ac9fa806351c573e144b40848 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 12:03:06 -0400 Subject: [PATCH 29/64] Remove TODO --- backend/btrixcloud/storages.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 89c1227e58..094e2c327e 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -884,7 +884,6 @@ def get_storage_refs( def get_available_storages(org: Organization = Depends(org_owner_dep)): return storage_ops.get_available_storages(org) - # TODO: Normalize casing with other endpoints @router.post( "/custom-storage", tags=["organizations"], response_model=AddedResponseName ) From cf4024530908e9b53ef5c2dc6910fa41912ac623 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 12:03:41 -0400 Subject: [PATCH 30/64] Remove another todo --- backend/btrixcloud/storages.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 094e2c327e..2846ea6079 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -349,10 +349,6 @@ async def _run_post_storage_replica_update_tasks( print("No files stored, no updates to do", flush=True) return - # TODO: Determine if we need to set read-only for replica operations - # (likely not?) 
- # await self.org_ops.update_read_only(org, True, "Updating storage replicas") - # Replicate files to any new replica locations for replica_storage in new_replica_refs: if replica_storage not in prev_replica_refs: From 3de96a893925116fbece2fd4378f58ff4455f54a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 12:30:40 -0400 Subject: [PATCH 31/64] More fixups --- backend/btrixcloud/background_jobs.py | 9 +++ backend/btrixcloud/orgs.py | 19 +++--- backend/btrixcloud/storages.py | 5 +- chart/app-templates/copy_job.yaml | 96 +++++++++++++++++++++++++++ 4 files changed, 118 insertions(+), 11 deletions(-) create mode 100644 chart/app-templates/copy_job.yaml diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 5e1e33917f..b61571988d 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -535,6 +535,7 @@ async def list_background_jobs( page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, success: Optional[bool] = None, + running: Optional[bool] = None, job_type: Optional[str] = None, sort_by: Optional[str] = None, sort_direction: Optional[int] = -1, @@ -550,6 +551,12 @@ async def list_background_jobs( if success in (True, False): query["success"] = success + if running: + query["success"] = None + + if running is False: + query["success"] = {"$in": [True, False]} + if job_type: query["type"] = job_type @@ -777,6 +784,7 @@ async def list_background_jobs( pageSize: int = DEFAULT_PAGE_SIZE, page: int = 1, success: Optional[bool] = None, + running: Optional[bool] = None, jobType: Optional[str] = None, sortBy: Optional[str] = None, sortDirection: Optional[int] = -1, @@ -787,6 +795,7 @@ async def list_background_jobs( page_size=pageSize, page=page, success=success, + running=running, job_type=jobType, sort_by=sortBy, sort_direction=sortDirection, diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 84c249d6fa..b25cba95fd 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -491,15 +491,15 @@ async def update_file_storage_refs( ) -> bool: """Update storage refs for all crawl and profile files in given org""" res = await self.crawls_db.update_many( - {"oid": org.id, "files.$.storage": previous_storage}, - {"$set": {"files.$.storage": new_storage}}, + {"oid": org.id, "files.$.storage": dict(previous_storage)}, + {"$set": {"files.$.storage": dict(new_storage)}}, ) if not res: return False res = await self.profiles_db.update_many( - {"oid": org.id, "resource.storage": previous_storage}, - {"$set": {"resource.storage": new_storage}}, + {"oid": org.id, "resource.storage": dict(previous_storage)}, + {"$set": {"resource.storage": dict(new_storage)}}, ) if not res: return False @@ -512,14 +512,14 @@ async def add_file_replica_storage_refs( """Add replica storage ref for all files in given org""" res = await self.crawls_db.update_many( {"oid": org.id}, - {"$push": {"files.$.replicas": new_storage}}, + {"$push": {"files.$.replicas": dict(new_storage)}}, ) if not res: return False res = await self.profiles_db.update_many( {"oid": org.id}, - {"$push": {"resource.replicas": new_storage}}, + {"$push": {"resource.replicas": dict(new_storage)}}, ) if not res: return False @@ -532,14 +532,14 @@ async def remove_file_replica_storage_refs( """Remove replica storage ref from all files in given org""" res = await self.crawls_db.update_many( {"oid": org.id}, - {"$pull": {"files.$.replicas": new_storage}}, + {"$pull": {"files.$.replicas": dict(new_storage)}}, ) if not res: return 
False res = await self.profiles_db.update_many( {"oid": org.id}, - {"$pull": {"resource.replicas": new_storage}}, + {"$pull": {"resource.replicas": dict(new_storage)}}, ) if not res: return False @@ -549,7 +549,8 @@ async def remove_file_replica_storage_refs( async def unset_file_presigned_urls(self, org: Organization) -> bool: """Unset all presigned URLs for files in org""" res = await self.crawls_db.update_many( - {"oid": org.id}, {"$set": {"files.$.presignedUrl": None}} + {"oid": org.id, "files.$.presignedUrl": {"$ne": None}}, + {"$set": {"files.$.presignedUrl": None}}, ) if not res: return False diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 2846ea6079..16c14d18fd 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -197,6 +197,7 @@ async def add_custom_storage( "STORE_ENDPOINT_NO_BUCKET_URL": storage.endpoint_no_bucket_url, "STORE_ACCESS_KEY": storage.access_key, "STORE_SECRET_KEY": storage.secret_key, + "STORE_REGION": storage.region, } await self.crawl_manager.add_org_storage( @@ -254,7 +255,7 @@ async def update_storage_ref( raise HTTPException(status_code=403, detail="org_set_to_read_only") _, jobs_running_count = await self.background_job_ops.list_background_jobs( - org=org, success=None, finished=None + org=org, running=True ) if jobs_running_count > 0: raise HTTPException(status_code=403, detail="background_jobs_running") @@ -321,7 +322,7 @@ async def update_storage_replica_refs( raise HTTPException(status_code=403, detail="org_set_to_read_only") _, jobs_running_count = await self.background_job_ops.list_background_jobs( - org=org, success=None, finished=None + org=org, running=True ) if jobs_running_count > 0: raise HTTPException(status_code=403, detail="background_jobs_running") diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml new file mode 100644 index 0000000000..f59b0da6ad --- /dev/null +++ b/chart/app-templates/copy_job.yaml @@ -0,0 +1,96 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: "{{ id }}" + labels: + role: "background-job" + job_type: {{ job_type }} + btrix.org: {{ oid }} + +spec: + ttlSecondsAfterFinished: 0 + backoffLimit: 3 + template: + spec: + restartPolicy: Never + priorityClassName: bg-job + podFailurePolicy: + rules: + - action: FailJob + onExitCodes: + containerName: rclone + operator: NotIn + values: [0] + containers: + - name: rclone + image: rclone/rclone:latest + env: + + - name: RCLONE_CONFIG_PREV_TYPE + value: "s3" + + - name: RCLONE_CONFIG_PREV_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: "{{ prev_secret_name }}" + key: STORE_ACCESS_KEY + + - name: RCLONE_CONFIG_PREV_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: "{{ prev_secret_name }}" + key: STORE_SECRET_KEY + + - name: RCLONE_CONFIG_PREV_REGION + valueFrom: + secretKeyRef: + name: "{{ prev_secret_name }}" + key: STORE_REGION + + - name: RCLONE_CONFIG_PREV_PROVIDER + valueFrom: + secretKeyRef: + name: "{{ prev_secret_name }}" + key: STORE_S3_PROVIDER + + - name: RCLONE_CONFIG_PREV_ENDPOINT + value: "{{ prev_endpoint }}" + + - name: RCLONE_CONFIG_NEW_TYPE + value: "s3" + + - name: RCLONE_CONFIG_NEW_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: "{{ new_secret_name }}" + key: STORE_ACCESS_KEY + + - name: RCLONE_CONFIG_NEW_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: "{{ new_secret_name }}" + key: STORE_SECRET_KEY + + - name: RCLONE_CONFIG_NEW_REGION + valueFrom: + secretKeyRef: + name: "{{ new_secret_name }}" + key: STORE_REGION + + - name: 
RCLONE_CONFIG_NEW_PROVIDER + valueFrom: + secretKeyRef: + name: "{{ new_secret_name }}" + key: STORE_S3_PROVIDER + + - name: RCLONE_CONFIG_NEW_ENDPOINT + value: "{{ new_endpoint }}" + + command: ["rclone", "-vv", "copy", "--checksum", "prev:{{ prev_bucket }}", "new:{{ new_bucket }}"] + resources: + limits: + memory: "200Mi" + + requests: + memory: "200Mi" + cpu: "50m" From db65dd8c5f52cf5c1dd746d78ef5942d6b68f9d1 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 12:48:29 -0400 Subject: [PATCH 32/64] Add provider to s3storage for rclone --- backend/btrixcloud/models.py | 2 ++ backend/btrixcloud/storages.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 98d085af4d..884b5f95b2 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1180,6 +1180,7 @@ class S3StorageIn(BaseModel): bucket: str access_endpoint_url: Optional[str] = None region: str = "" + provider: str = "Other" # ============================================================================ @@ -1194,6 +1195,7 @@ class S3Storage(BaseModel): secret_key: str access_endpoint_url: str region: str = "" + provider: str = "Other" # ============================================================================ diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 16c14d18fd..fccfe0c321 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -154,6 +154,7 @@ def _create_s3_storage(self, storage: dict[str, str]) -> S3Storage: endpoint_url=endpoint_url, endpoint_no_bucket_url=endpoint_no_bucket_url, access_endpoint_url=access_endpoint_url, + provider=storage.get("provider", "Other"), ) async def add_custom_storage( @@ -178,7 +179,7 @@ async def add_custom_storage( endpoint_url=endpoint_url, endpoint_no_bucket_url=endpoint_no_bucket_url, access_endpoint_url=storagein.access_endpoint_url or endpoint_url, - use_access_for_presign=True, + provider=storagein.provider, ) try: @@ -198,6 +199,7 @@ async def add_custom_storage( "STORE_ACCESS_KEY": storage.access_key, "STORE_SECRET_KEY": storage.secret_key, "STORE_REGION": storage.region, + "STORE_PROVIDER": storage.provider, } await self.crawl_manager.add_org_storage( From 89585fa9a8224436f82ba9228882e764400f7484 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 12:50:55 -0400 Subject: [PATCH 33/64] Fix typo --- backend/btrixcloud/storages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index fccfe0c321..6042ed42c1 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -199,7 +199,7 @@ async def add_custom_storage( "STORE_ACCESS_KEY": storage.access_key, "STORE_SECRET_KEY": storage.secret_key, "STORE_REGION": storage.region, - "STORE_PROVIDER": storage.provider, + "STORE_S3_PROVIDER": storage.provider, } await self.crawl_manager.add_org_storage( From 8abe1ef617e12ec2e845c3cf220db9d1a74f8c5a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 30 Sep 2024 17:49:10 -0400 Subject: [PATCH 34/64] Make API endpoints that change storage superuser-only for now --- backend/btrixcloud/main.py | 2 +- backend/btrixcloud/storages.py | 24 +++++++++++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index d03ad73896..fcffd9cb3a 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -190,7 +190,7 @@ def 
main() -> None: crawl_manager = CrawlManager() - storage_ops = init_storages_api(org_ops, crawl_manager) + storage_ops = init_storages_api(org_ops, crawl_manager, current_active_user) background_job_ops = init_background_jobs_api( app, diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 6042ed42c1..6c71adf794 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -859,7 +859,7 @@ def _parse_json(line) -> dict: # ============================================================================ -def init_storages_api(org_ops, crawl_manager): +def init_storages_api(org_ops, crawl_manager, user_dep: Callable): """API for updating storage for an org""" storage_ops = StorageOps(org_ops, crawl_manager) @@ -887,23 +887,37 @@ def get_available_storages(org: Organization = Depends(org_owner_dep)): "/custom-storage", tags=["organizations"], response_model=AddedResponseName ) async def add_custom_storage( - storage: S3StorageIn, org: Organization = Depends(org_owner_dep) + storage: S3StorageIn, + org: Organization = Depends(org_owner_dep), + user: User = Depends(user_dep), ): + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + return await storage_ops.add_custom_storage(storage, org) @router.delete( "/custom-storage/{name}", tags=["organizations"], response_model=DeletedResponse ) async def remove_custom_storage( - name: str, org: Organization = Depends(org_owner_dep) + name: str, + org: Organization = Depends(org_owner_dep), + user: User = Depends(user_dep), ): + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + return await storage_ops.remove_custom_storage(name, org) @router.post("/storage", tags=["organizations"], response_model=UpdatedResponse) async def update_storage_ref( storage: OrgStorageRef, org: Organization = Depends(org_owner_dep), + user: User = Depends(user_dep), ): + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + return await storage_ops.update_storage_ref(storage, org) @router.post( @@ -912,7 +926,11 @@ async def update_storage_ref( async def update_storage_replica_refs( storage: OrgStorageReplicaRefs, org: Organization = Depends(org_owner_dep), + user: User = Depends(user_dep), ): + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + return await storage_ops.update_storage_replica_refs(storage, org) return storage_ops From c146ab3ed4fc65a18d2abfa7a71716e9af13a659 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 30 Sep 2024 18:03:02 -0400 Subject: [PATCH 35/64] Add typing for init_storages_api, import Callable --- backend/btrixcloud/storages.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 6c71adf794..3f7ef878cb 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -12,6 +12,7 @@ TYPE_CHECKING, Any, cast, + Callable, ) from urllib.parse import urlsplit from contextlib import asynccontextmanager @@ -859,7 +860,7 @@ def _parse_json(line) -> dict: # ============================================================================ -def init_storages_api(org_ops, crawl_manager, user_dep: Callable): +def init_storages_api(org_ops: OrgOps, crawl_manager: CrawlManager, user_dep: Callable): """API for updating storage for an org""" storage_ops = StorageOps(org_ops, crawl_manager) From abc238e6f6f390237ca3a52e026cc2ca327bf993 Mon Sep 17 00:00:00 2001 From: Tessa 
Walsh Date: Mon, 30 Sep 2024 19:01:55 -0400 Subject: [PATCH 36/64] Add missing User import --- backend/btrixcloud/storages.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 3f7ef878cb..4591c97677 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -52,6 +52,7 @@ DeletedResponse, UpdatedResponse, AddedResponseName, + User, ) from .utils import slug_from_name From 29a54b7c86038699ca155781aa09097e3f0cc66b Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 1 Oct 2024 11:24:24 -0400 Subject: [PATCH 37/64] Fix StorageOps in operator main --- backend/btrixcloud/main_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/main_op.py b/backend/btrixcloud/main_op.py index a6f6654be3..030b40a777 100644 --- a/backend/btrixcloud/main_op.py +++ b/backend/btrixcloud/main_op.py @@ -9,7 +9,6 @@ from .ops import init_ops from .utils import register_exit_handler - app_root = FastAPI() @@ -26,6 +25,7 @@ def main(): ) sys.exit(1) + ( org_ops, crawl_config_ops, From f348a16dc97e11504dbac61615f5ad5626edc54d Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 1 Oct 2024 16:09:15 -0400 Subject: [PATCH 38/64] Always use oid prefix in s3 storage Previously, files in a default bucket were prefixed with the oid, but files in custom storages were not. This commit removes that distinction to aid in copying files between buckets, removing the need for unnecessary filepath manipulation. The CopyBucketJob now copies only an organization's directory rather than the entire bucket, to prevent accidentally copying another organization's data. --- backend/btrixcloud/models.py | 7 ++----- chart/app-templates/copy_job.yaml | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 884b5f95b2..769be0c84d 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -681,11 +681,8 @@ def get_storage_secret_name(self, oid: str) -> str: return f"storage-cs-{self.name}-{oid[:12]}" def get_storage_extra_path(self, oid: str) -> str: - """return extra path added to the endpoint - using oid for default storages, no extra path for custom""" - if not self.custom: - return oid + "/" - return "" + """return extra oid path added to the endpoint""" + return oid + "/" # ============================================================================ diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index f59b0da6ad..971a10e675 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -86,7 +86,7 @@ spec: - name: RCLONE_CONFIG_NEW_ENDPOINT value: "{{ new_endpoint }}" - command: ["rclone", "-vv", "copy", "--checksum", "prev:{{ prev_bucket }}", "new:{{ new_bucket }}"] + command: ["rclone", "-vv", "copy", "--checksum", "prev:{{ prev_bucket }}/{{ oid }}", "new:{{ new_bucket }}/{{ oid }}"] resources: limits: memory: "200Mi" From e67868b5d04796948876cf66254cbba88b798c9a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 10 Oct 2024 15:20:33 -0400 Subject: [PATCH 39/64] Post-rebase fixups and remove create bucket fallback Creating a missing bucket during the verification stage for adding custom storages was useful for testing, but it is an anti-pattern for production, so we remove it here.
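With the fallback removed, the target bucket must already exist (and the credentials must allow writes) before a custom storage can be added; the CI patches below provision extra test buckets for exactly this reason. A minimal sketch of an explicit pre-flight existence check in the style of verify_storage_upload, with the helper name and error detail invented for illustration:

from fastapi import HTTPException

async def ensure_bucket_exists(client, bucket: str) -> None:
    # HEAD Bucket is a cheap existence/permission probe in the S3 API
    try:
        await client.head_bucket(Bucket=bucket)
    except Exception as exc:
        raise HTTPException(status_code=400, detail="bucket_not_found") from exc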
--- backend/btrixcloud/background_jobs.py | 10 +++++++--- backend/btrixcloud/crawlmanager.py | 11 +++++++++-- backend/btrixcloud/storages.py | 18 +++++------------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index b61571988d..771ceca796 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -295,14 +295,18 @@ async def create_delete_org_job( self, org: Organization, existing_job_id: Optional[str] = None, - ) -> Optional[str]: + ) -> str: """Create background job to delete org and its data""" + job_type = BgJobType.DELETE_ORG.value + try: job_id = await self.crawl_manager.run_delete_org_job( oid=str(org.id), + job_type=job_type, backend_image=os.environ.get("BACKEND_IMAGE", ""), pull_policy=os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""), + job_id_prefix=f"{job_type}-{org.id}", existing_job_id=existing_job_id, ) if existing_job_id: @@ -334,7 +338,7 @@ async def create_delete_org_job( except Exception as exc: # pylint: disable=raise-missing-from print(f"warning: delete org job could not be started: {exc}") - return None + return "" async def create_recalculate_org_stats_job( self, @@ -745,7 +749,7 @@ async def get_background_job( """Retrieve information for background job""" return await ops.get_background_job(job_id, org.id) - @app.get("/orgs/all/jobs/{job_id}", response_model=SuccessResponse, tags=["jobs"]) + @app.get("/orgs/all/jobs/{job_id}", response_model=AnyJob, tags=["jobs"]) async def get_background_job_all_orgs(job_id: str, user: User = Depends(user_dep)): """Get background job from any org""" if not user.is_superuser: diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index f7e1a8a180..b6a2cce56a 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -119,15 +119,21 @@ async def run_replica_job( async def run_delete_org_job( self, oid: str, + job_type: str, backend_image: str, pull_policy: str, + job_id_prefix: Optional[str] = None, existing_job_id: Optional[str] = None, ) -> str: """run job to delete org and all of its data""" if existing_job_id: job_id = existing_job_id else: - job_id = f"delete-org-{oid}-{secrets.token_hex(5)}" + if not job_id_prefix: + job_id_prefix = job_type + + # ensure name is <=63 characters + job_id = f"{job_id_prefix[:52]}-{secrets.token_hex(5)}" return await self._run_bg_job_with_ops_classes( oid, backend_image, pull_policy, job_id, job_type=BgJobType.DELETE_ORG.value @@ -177,6 +183,8 @@ async def _run_bg_job_with_ops_classes( await self.create_from_yaml(data, namespace=DEFAULT_NAMESPACE) + return job_id + async def run_copy_bucket_job( self, oid: str, @@ -210,7 +218,6 @@ async def run_copy_bucket_job( "new_secret_name": new_storage.get_storage_secret_name(oid), "new_endpoint": new_endpoint, "new_bucket": new_bucket, - "BgJobType": BgJobType, } data = self.templates.env.get_template("copy_job.yaml").render(params) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 4591c97677..c45277373f 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -269,7 +269,7 @@ async def update_storage_ref( await self.org_ops.update_storage_refs(org) - # TODO: Run in asyncio task or background job? 
+ # Run in background jobs (1 to copy files, 1 for db updates) await self._run_post_storage_update_tasks( prev_storage_ref, storage_ref, @@ -336,7 +336,7 @@ async def update_storage_replica_refs( await self.org_ops.update_storage_refs(org, replicas=True) - # TODO: Run in asyncio task or background job? + # Run in background job? or just kick off background jobs? await self._run_post_storage_replica_update_tasks( prev_storage_replicas, replicas, org ) @@ -363,7 +363,7 @@ async def _run_post_storage_replica_update_tasks( await self.org_ops.add_file_replica_storage_refs(org, replica_storage) # Delete no-longer-used replica storage refs from files - # TODO: Determine if we want to delete files from the buckets as well + # Determine if we want to delete files from the buckets as well for replica_storage in prev_replica_refs: if replica_storage not in new_replica_refs: await self.org_ops.remove_file_replica_storage_refs( @@ -418,16 +418,8 @@ async def verify_storage_upload(self, storage: S3Storage, filename: str) -> None key += filename data = b"" - try: - resp = await client.put_object(Bucket=bucket, Key=key, Body=data) - assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 - except Exception: - # create bucket if it doesn't yet exist and then try again - resp = await client.create_bucket(Bucket=bucket) - assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 - - resp = await client.put_object(Bucket=bucket, Key=key, Body=data) - assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 + resp = await client.put_object(Bucket=bucket, Key=key, Body=data) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 def resolve_internal_access_path(self, path): """Resolve relative path for internal access to minio bucket""" From e2dfa93f2682e8f42e03cf1760cbc5a2b92ed5f7 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 15 Oct 2024 14:36:58 -0400 Subject: [PATCH 40/64] Create extra test buckets in CI --- .github/workflows/k3d-ci.yaml | 5 +++++ .github/workflows/microk8s-ci.yaml | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/k3d-ci.yaml b/.github/workflows/k3d-ci.yaml index a2eff4e79f..f241b43733 100644 --- a/.github/workflows/k3d-ci.yaml +++ b/.github/workflows/k3d-ci.yaml @@ -91,6 +91,11 @@ jobs: - name: Wait for all pods to be ready run: kubectl wait --for=condition=ready pod --all --timeout=240s + - name: Create Extra Test Buckets + run: | + kubectl exec -i deployment/local-minio -c minio -- mkdir /data/custom-primary && + kubectl exec -i deployment/local-minio -c minio -- mkdir /data/custom-replica + - name: Run Tests timeout-minutes: 30 run: pytest -vv ./backend/test/test_*.py diff --git a/.github/workflows/microk8s-ci.yaml b/.github/workflows/microk8s-ci.yaml index eb1e139c10..488c4fbd3f 100644 --- a/.github/workflows/microk8s-ci.yaml +++ b/.github/workflows/microk8s-ci.yaml @@ -66,6 +66,11 @@ jobs: - name: Wait for all pods to be ready run: sudo microk8s kubectl wait --for=condition=ready pod --all --timeout=240s + - name: Create Extra Test Buckets + run: | + kubectl exec -i deployment/local-minio -c minio -- mkdir /data/custom-primary && + kubectl exec -i deployment/local-minio -c minio -- mkdir /data/custom-replica + - name: Run Tests run: pytest -vv ./backend/test/test_*.py From e054158828e86fb2cd085748ba2a86c4c2883ade Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 15 Oct 2024 14:40:16 -0400 Subject: [PATCH 41/64] Add test for non-verified custom storage --- backend/test/test_org.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff 
--git a/backend/test/test_org.py b/backend/test/test_org.py index f447010c65..b9fe8eb2ab 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -246,6 +246,24 @@ def test_change_storage_invalid(admin_auth_headers): assert r.status_code == 400 +def test_add_custom_storage_doesnt_verify(admin_auth_headers): + # verify that custom storage that can't be verified with + # a test file isn't added + r = requests.post( + f"{API_PREFIX}/orgs/{new_oid}/custom-storage", + headers=admin_auth_headers, + json={ + "name": "custom-bucket-doesnt-exist", + "access_key": "ADMIN", + "secret_key": "PASSW0RD", + "bucket": "custom-bucket-doesnt-exist", + "endpoint_url": "http://local-minio.default:9000/", + }, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "Could not verify custom storage. Check credentials are valid?" + + def test_add_custom_storage(admin_auth_headers): # add custom storages r = requests.post( From f0dc3344dcf4ff0b807c4bb91af58c69514bcbf3 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 15 Oct 2024 16:18:42 -0400 Subject: [PATCH 42/64] Refactor to move updates to FastAPI background tasks --- backend/btrixcloud/models.py | 7 ++++ backend/btrixcloud/orgs.py | 63 ++++++++--------------------- backend/btrixcloud/storages.py | 74 +++++++++++++++++++++------------- 3 files changed, 70 insertions(+), 74 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 769be0c84d..a11a469ed8 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -2258,6 +2258,13 @@ class UpdatedResponse(BaseModel): updated: bool +# ============================================================================ +class UpdatedResponseId(UpdatedResponse): + """Response for API endpoints that return updated + id""" + + id: Optional[str] = None + + # ============================================================================ class SuccessResponse(BaseModel): """Response for API endpoints that return success""" diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index b25cba95fd..1fd617b1b1 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -488,74 +488,43 @@ async def update_storage_refs( async def update_file_storage_refs( self, org: Organization, previous_storage: StorageRef, new_storage: StorageRef - ) -> bool: + ) -> None: """Update storage refs for all crawl and profile files in given org""" - res = await self.crawls_db.update_many( + await self.crawls_db.update_many( {"oid": org.id, "files.$.storage": dict(previous_storage)}, {"$set": {"files.$.storage": dict(new_storage)}}, ) - if not res: - return False - res = await self.profiles_db.update_many( + await self.profiles_db.update_many( {"oid": org.id, "resource.storage": dict(previous_storage)}, {"$set": {"resource.storage": dict(new_storage)}}, ) - if not res: - return False - return True - - async def add_file_replica_storage_refs( - self, org: Organization, new_storage: StorageRef - ) -> bool: + async def add_or_remove_file_replica_storage_refs( + self, org: Organization, storage_ref: StorageRef, remove=False + ) -> None: """Add replica storage ref for all files in given org""" - res = await self.crawls_db.update_many( - {"oid": org.id}, - {"$push": {"files.$.replicas": dict(new_storage)}}, - ) - if not res: - return False - - res = await self.profiles_db.update_many( - {"oid": org.id}, - {"$push": {"resource.replicas": dict(new_storage)}}, - ) - if not res: - return False - - return True + if remove: + verb = "$pull" + else: + verb = 
"$push" - async def remove_file_replica_storage_refs( - self, org: Organization, new_storage: StorageRef - ) -> bool: - """Remove replica storage ref from all files in given org""" - res = await self.crawls_db.update_many( + await self.crawls_db.update_many( {"oid": org.id}, - {"$pull": {"files.$.replicas": dict(new_storage)}}, + {verb: {"files.$.replicas": dict(storage_ref)}}, ) - if not res: - return False - res = await self.profiles_db.update_many( + await self.profiles_db.update_many( {"oid": org.id}, - {"$pull": {"resource.replicas": dict(new_storage)}}, + {verb: {"resource.replicas": dict(storage_ref)}}, ) - if not res: - return False - - return True - async def unset_file_presigned_urls(self, org: Organization) -> bool: + async def unset_file_presigned_urls(self, org: Organization) -> None: """Unset all presigned URLs for files in org""" - res = await self.crawls_db.update_many( + await self.crawls_db.update_many( {"oid": org.id, "files.$.presignedUrl": {"$ne": None}}, {"$set": {"files.$.presignedUrl": None}}, ) - if not res: - return False - - return True async def update_subscription_data( self, update: SubscriptionUpdate diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index c45277373f..30240e50aa 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -13,6 +13,7 @@ Any, cast, Callable, + Union, ) from urllib.parse import urlsplit from contextlib import asynccontextmanager @@ -27,7 +28,7 @@ from datetime import datetime from zipfile import ZipInfo -from fastapi import Depends, HTTPException +from fastapi import Depends, HTTPException, BackgroundTasks from stream_zip import stream_zip, NO_COMPRESSION_64, Method from remotezip import RemoteZip from aiobotocore.config import AioConfig @@ -51,6 +52,7 @@ OrgStorageReplicaRefs, DeletedResponse, UpdatedResponse, + UpdatedResponseId, AddedResponseName, User, ) @@ -240,7 +242,8 @@ async def update_storage_ref( self, storage_refs: OrgStorageRef, org: Organization, - ) -> dict[str, bool]: + background_tasks: BackgroundTasks, + ) -> dict[str, Union[bool, Optional[str]]]: """update storage for org""" storage_ref = storage_refs.storage @@ -269,32 +272,35 @@ async def update_storage_ref( await self.org_ops.update_storage_refs(org) - # Run in background jobs (1 to copy files, 1 for db updates) - await self._run_post_storage_update_tasks( + if not await self.org_ops.has_files_stored(org): + print("No files stored, no updates to do", flush=True) + return {"updated": True, "id": None} + + await self.org_ops.update_read_only(org, True, "Updating storage") + + job_id = await self.background_job_ops.create_copy_bucket_job( + org, prev_storage_ref, storage_ref + ) + + # This runs only two update_many mongo commands, so should be safe to run + # in a FastAPI background task rather than requiring a full Browsertrix + # Background job + background_tasks.add_task( + self._run_post_storage_update_tasks, + org, prev_storage_ref, storage_ref, - org, ) - return {"updated": True} + return {"updated": True, "id": job_id} async def _run_post_storage_update_tasks( self, + org: Organization, prev_storage_ref: StorageRef, new_storage_ref: StorageRef, - org: Organization, ): """Handle tasks necessary after changing org storage""" - if not await self.org_ops.has_files_stored(org): - print("No files stored, no updates to do", flush=True) - return - - await self.org_ops.update_read_only(org, True, "Updating storage") - - await self.background_job_ops.create_copy_bucket_job( - org, prev_storage_ref, new_storage_ref 
- ) - await self.org_ops.update_file_storage_refs( org, prev_storage_ref, new_storage_ref ) @@ -305,6 +311,7 @@ async def update_storage_replica_refs( self, storage_refs: OrgStorageReplicaRefs, org: Organization, + background_tasks: BackgroundTasks, ) -> dict[str, bool]: """update storage for org""" @@ -336,18 +343,25 @@ async def update_storage_replica_refs( await self.org_ops.update_storage_refs(org, replicas=True) - # Run in background job? or just kick off background jobs? - await self._run_post_storage_replica_update_tasks( - prev_storage_replicas, replicas, org + # This only kicks off background jobs and runs a few update_many mongo + # commands, so it might be fine to keep as a FastAPI background job. + # Consider moving to a Browsertrix background job, but that may make + # retrying difficult as the job which would be retried also kicks off + # other background jobs which may have been successful already + background_tasks.add_task( + self._run_post_storage_replica_update_tasks, + org, + prev_storage_replicas, + replicas, ) return {"updated": True} async def _run_post_storage_replica_update_tasks( self, + org: Organization, prev_replica_refs: List[StorageRef], new_replica_refs: List[StorageRef], - org: Organization, ): """Handle tasks necessary after updating org replica storages""" if not await self.org_ops.has_files_stored(org): @@ -360,14 +374,16 @@ async def _run_post_storage_replica_update_tasks( await self.background_job_ops.create_copy_bucket_job( org, org.storage, replica_storage ) - await self.org_ops.add_file_replica_storage_refs(org, replica_storage) + await self.org_ops.add_or_remove_file_replica_storage_refs( + org, replica_storage + ) # Delete no-longer-used replica storage refs from files # Determine if we want to delete files from the buckets as well for replica_storage in prev_replica_refs: if replica_storage not in new_replica_refs: - await self.org_ops.remove_file_replica_storage_refs( - org, replica_storage + await self.org_ops.add_or_remove_file_replica_storage_refs( + org, replica_storage, remove=True ) def get_available_storages(self, org: Organization) -> List[StorageRef]: @@ -903,28 +919,32 @@ async def remove_custom_storage( return await storage_ops.remove_custom_storage(name, org) - @router.post("/storage", tags=["organizations"], response_model=UpdatedResponse) + @router.post("/storage", tags=["organizations"], response_model=UpdatedResponseId) async def update_storage_ref( storage: OrgStorageRef, + background_tasks: BackgroundTasks, org: Organization = Depends(org_owner_dep), user: User = Depends(user_dep), ): if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - return await storage_ops.update_storage_ref(storage, org) + return await storage_ops.update_storage_ref(storage, org, background_tasks) @router.post( "/storage-replicas", tags=["organizations"], response_model=UpdatedResponse ) async def update_storage_replica_refs( storage: OrgStorageReplicaRefs, + background_tasks: BackgroundTasks, org: Organization = Depends(org_owner_dep), user: User = Depends(user_dep), ): if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - return await storage_ops.update_storage_replica_refs(storage, org) + return await storage_ops.update_storage_replica_refs( + storage, org, background_tasks + ) return storage_ops From 0d3aa35249725ad9c17a75b98c9406f9a9610a5a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 15 Oct 2024 18:07:26 -0400 Subject: [PATCH 43/64] Include default replicas in /storage 
response if no org replicas --- backend/btrixcloud/storages.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 30240e50aa..a04d603aef 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -885,7 +885,11 @@ def get_storage_refs( org: Organization = Depends(org_owner_dep), ): """get storage refs for an org""" - return OrgStorageRefs(storage=org.storage, storageReplicas=org.storageReplicas) + if org.storageReplicas: + replica_refs = org.storageReplicas + else: + replica_refs = storage_ops.default_replicas + return OrgStorageRefs(storage=org.storage, storageReplicas=replica_refs) @router.get( "/all-storages", tags=["organizations"], response_model=List[StorageRef] From 7a8ec5847e36515c8b9bdafb62c7375be0988943 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 16 Oct 2024 15:10:44 -0400 Subject: [PATCH 44/64] Fix unsetting of presigned URLs --- backend/btrixcloud/orgs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 1fd617b1b1..6392cf6687 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -522,8 +522,8 @@ async def add_or_remove_file_replica_storage_refs( async def unset_file_presigned_urls(self, org: Organization) -> None: """Unset all presigned URLs for files in org""" await self.crawls_db.update_many( - {"oid": org.id, "files.$.presignedUrl": {"$ne": None}}, - {"$set": {"files.$.presignedUrl": None}}, + {"oid": org.id}, + {"$set": {"files.$[].presignedUrl": None}}, ) async def update_subscription_data( From 6277331a89c4ecf89076a939ea378d0b1234c687 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 16 Oct 2024 15:46:23 -0400 Subject: [PATCH 45/64] Add --progress flag to rclone copy command --- chart/app-templates/copy_job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index 971a10e675..f727a6deea 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -86,7 +86,7 @@ spec: - name: RCLONE_CONFIG_NEW_ENDPOINT value: "{{ new_endpoint }}" - command: ["rclone", "-vv", "copy", "--checksum", "prev:{{ prev_bucket }}/{{ oid }}", "new:{{ new_bucket }}/{{ oid }}"] + command: ["rclone", "-vv", "--progress", "copy", "--checksum", "prev:{{ prev_bucket }}/{{ oid }}", "new:{{ new_bucket }}/{{ oid }}"] resources: limits: memory: "200Mi" From 1eb533c338991b143e689759a718e5a896a130cc Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 16 Oct 2024 20:14:55 -0400 Subject: [PATCH 46/64] Increase ttl seconds after finished for testing on dev --- chart/app-templates/copy_job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index f727a6deea..bdb09b4db7 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -8,7 +8,7 @@ metadata: btrix.org: {{ oid }} spec: - ttlSecondsAfterFinished: 0 + ttlSecondsAfterFinished: 300 backoffLimit: 3 template: spec: From be10a9aece7653024f450433e6d7a7d74b830ff6 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 16 Oct 2024 20:37:26 -0400 Subject: [PATCH 47/64] Ensure there are no double slashes between bucket name and oid --- backend/btrixcloud/background_jobs.py | 4 ++++ chart/app-templates/copy_job.yaml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git 
a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 771ceca796..7ab882777a 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -399,6 +399,10 @@ async def create_copy_bucket_job( new_storage = self.storage_ops.get_org_storage_by_ref(org, new_storage_ref) new_endpoint, new_bucket = self.strip_bucket(new_storage.endpoint_url) + # Ensure buckets terminate with trailing slash + prev_bucket = os.path.join(prev_bucket, "") + new_bucket = os.path.join(new_bucket, "") + job_type = BgJobType.COPY_BUCKET.value try: diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index bdb09b4db7..8c2514ab26 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -86,7 +86,7 @@ spec: - name: RCLONE_CONFIG_NEW_ENDPOINT value: "{{ new_endpoint }}" - command: ["rclone", "-vv", "--progress", "copy", "--checksum", "prev:{{ prev_bucket }}/{{ oid }}", "new:{{ new_bucket }}/{{ oid }}"] + command: ["rclone", "-vv", "--progress", "copy", "--checksum", "prev:{{ prev_bucket }}{{ oid }}", "new:{{ new_bucket }}{{ oid }}"] resources: limits: memory: "200Mi" From e028dc8bcd9e8481437f68159f109f923db0a173 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 16 Oct 2024 20:53:40 -0400 Subject: [PATCH 48/64] Increase memory limit/request for copy job to 500Mi --- chart/app-templates/copy_job.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index 8c2514ab26..3d8e5a4a03 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -89,8 +89,8 @@ spec: command: ["rclone", "-vv", "--progress", "copy", "--checksum", "prev:{{ prev_bucket }}{{ oid }}", "new:{{ new_bucket }}{{ oid }}"] resources: limits: - memory: "200Mi" + memory: "500Mi" requests: - memory: "200Mi" + memory: "500Mi" cpu: "50m" From edcd5c6617b4991b244d819fd1f9252474715ab8 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 16 Oct 2024 21:00:44 -0400 Subject: [PATCH 49/64] Reduce copy job ttlSecondsAfterFinished to 60 --- chart/app-templates/copy_job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index 3d8e5a4a03..501f7ef749 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -8,7 +8,7 @@ metadata: btrix.org: {{ oid }} spec: - ttlSecondsAfterFinished: 300 + ttlSecondsAfterFinished: 60 backoffLimit: 3 template: spec: From d6761133a02b05ec5627b924da15d4ba2d6f1235 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 16 Oct 2024 23:08:22 -0400 Subject: [PATCH 50/64] Add storage tag to API endpoints --- backend/btrixcloud/storages.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index a04d603aef..684275551e 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -880,7 +880,9 @@ def init_storages_api(org_ops: OrgOps, crawl_manager: CrawlManager, user_dep: Ca router = org_ops.router org_owner_dep = org_ops.org_owner_dep - @router.get("/storage", tags=["organizations"], response_model=OrgStorageRefs) + @router.get( + "/storage", tags=["organizations", "storage"], response_model=OrgStorageRefs + ) def get_storage_refs( org: Organization = Depends(org_owner_dep), ): @@ -892,13 +894,17 @@ def get_storage_refs( return 
OrgStorageRefs(storage=org.storage, storageReplicas=replica_refs) @router.get( - "/all-storages", tags=["organizations"], response_model=List[StorageRef] + "/all-storages", + tags=["organizations", "storage"], + response_model=List[StorageRef], ) def get_available_storages(org: Organization = Depends(org_owner_dep)): return storage_ops.get_available_storages(org) @router.post( - "/custom-storage", tags=["organizations"], response_model=AddedResponseName + "/custom-storage", + tags=["organizations", "storage"], + response_model=AddedResponseName, ) async def add_custom_storage( storage: S3StorageIn, @@ -911,7 +917,9 @@ async def add_custom_storage( return await storage_ops.add_custom_storage(storage, org) @router.delete( - "/custom-storage/{name}", tags=["organizations"], response_model=DeletedResponse + "/custom-storage/{name}", + tags=["organizations", "storage"], + response_model=DeletedResponse, ) async def remove_custom_storage( name: str, @@ -923,7 +931,9 @@ async def remove_custom_storage( return await storage_ops.remove_custom_storage(name, org) - @router.post("/storage", tags=["organizations"], response_model=UpdatedResponseId) + @router.post( + "/storage", tags=["organizations", "storage"], response_model=UpdatedResponseId + ) async def update_storage_ref( storage: OrgStorageRef, background_tasks: BackgroundTasks, @@ -936,7 +946,9 @@ async def update_storage_ref( return await storage_ops.update_storage_ref(storage, org, background_tasks) @router.post( - "/storage-replicas", tags=["organizations"], response_model=UpdatedResponse + "/storage-replicas", + tags=["organizations", "storage"], + response_model=UpdatedResponse, ) async def update_storage_replica_refs( storage: OrgStorageReplicaRefs, From ef59178c92326881f4a1aa6c1cf54ad245bc0e9d Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 09:45:45 -0400 Subject: [PATCH 51/64] Add flags to rclone to reduce memory usage, set limit to 350Mi --- chart/app-templates/copy_job.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index 501f7ef749..cae8a3a449 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -86,11 +86,11 @@ spec: - name: RCLONE_CONFIG_NEW_ENDPOINT value: "{{ new_endpoint }}" - command: ["rclone", "-vv", "--progress", "copy", "--checksum", "prev:{{ prev_bucket }}{{ oid }}", "new:{{ new_bucket }}{{ oid }}"] + command: ["rclone", "-vv", "--progress", "copy", "--checksum", "--use-mmap", "--transfers=2", "--checkers=2", "prev:{{ prev_bucket }}{{ oid }}", "new:{{ new_bucket }}{{ oid }}"] resources: limits: - memory: "500Mi" + memory: "350Mi" requests: - memory: "500Mi" + memory: "350Mi" cpu: "50m" From 07490fb55fd672ceda6d526624cb79079fba03ae Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 10:18:51 -0400 Subject: [PATCH 52/64] Fix positional operator in storage ref update --- backend/btrixcloud/orgs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 6392cf6687..3ac7d7f162 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -511,7 +511,7 @@ async def add_or_remove_file_replica_storage_refs( await self.crawls_db.update_many( {"oid": org.id}, - {verb: {"files.$.replicas": dict(storage_ref)}}, + {verb: {"files.$[].replicas": dict(storage_ref)}}, ) await self.profiles_db.update_many( From 4f1aa1d9a147c54874868e2f685947b6c790afb1 Mon Sep 17 00:00:00 2001 From: Tessa 
Walsh Date: Thu, 17 Oct 2024 10:27:03 -0400 Subject: [PATCH 53/64] One more positional operator fix --- backend/btrixcloud/orgs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 3ac7d7f162..7e29e6f44c 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -491,8 +491,8 @@ async def update_file_storage_refs( ) -> None: """Update storage refs for all crawl and profile files in given org""" await self.crawls_db.update_many( - {"oid": org.id, "files.$.storage": dict(previous_storage)}, - {"$set": {"files.$.storage": dict(new_storage)}}, + {"oid": org.id}, + {"$set": {"files.$[].storage": dict(new_storage)}}, ) await self.profiles_db.update_many( From 5d81ed26e679e12c326032246822c071df315196 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 10:34:17 -0400 Subject: [PATCH 54/64] Update docstrings and comments --- backend/btrixcloud/storages.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 684275551e..f57154deae 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -217,7 +217,7 @@ async def add_custom_storage( async def remove_custom_storage( self, name: str, org: Organization ) -> dict[str, bool]: - """remove custom storage""" + """Remove custom storage from org""" if org.storage.custom and org.storage.name == name: raise HTTPException(status_code=400, detail="storage_in_use") @@ -244,7 +244,7 @@ async def update_storage_ref( org: Organization, background_tasks: BackgroundTasks, ) -> dict[str, Union[bool, Optional[str]]]: - """update storage for org""" + """Update storage for org""" storage_ref = storage_refs.storage try: @@ -282,9 +282,6 @@ async def update_storage_ref( org, prev_storage_ref, storage_ref ) - # This runs only two update_many mongo commands, so should be safe to run - # in a FastAPI background task rather than requiring a full Browsertrix - # Background job background_tasks.add_task( self._run_post_storage_update_tasks, org, @@ -313,7 +310,7 @@ async def update_storage_replica_refs( org: Organization, background_tasks: BackgroundTasks, ) -> dict[str, bool]: - """update storage for org""" + """Update replica storage for org""" replicas = storage_refs.storageReplicas @@ -343,11 +340,6 @@ async def update_storage_replica_refs( await self.org_ops.update_storage_refs(org, replicas=True) - # This only kicks off background jobs and runs a few update_many mongo - # commands, so it might be fine to keep as a FastAPI background job. 
- # Consider moving to a Browsertrix background job, but that may make - # retrying difficult as the job which would be retried also kicks off - # other background jobs which may have been successful already background_tasks.add_task( self._run_post_storage_replica_update_tasks, org, From b0950238ab22180e6a01655239dfb0db2a825319 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 11:16:55 -0400 Subject: [PATCH 55/64] Make all-storages response valid JSON with response model --- backend/btrixcloud/models.py | 7 +++++++ backend/btrixcloud/storages.py | 7 ++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index a11a469ed8..16ad49d59f 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1163,6 +1163,13 @@ class OrgStorageReplicaRefs(BaseModel): storageReplicas: List[StorageRef] +# ============================================================================ +class OrgAllStorages(BaseModel): + """Response model for listing all available storages""" + + allStorages: List[StorageRef] + + # ============================================================================ class S3StorageIn(BaseModel): """Custom S3 Storage input model""" diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index f57154deae..9640c0fd4d 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -50,6 +50,7 @@ OrgStorageRefs, OrgStorageRef, OrgStorageReplicaRefs, + OrgAllStorages, DeletedResponse, UpdatedResponse, UpdatedResponseId, @@ -378,14 +379,14 @@ async def _run_post_storage_replica_update_tasks( org, replica_storage, remove=True ) - def get_available_storages(self, org: Organization) -> List[StorageRef]: + def get_available_storages(self, org: Organization) -> Dict[str, List[StorageRef]]: """return a list of available default + custom storages""" refs: List[StorageRef] = [] for name in self.default_storages: refs.append(StorageRef(name=name, custom=False)) for name in org.customStorages: refs.append(StorageRef(name=name, custom=True)) - return refs + return {"allStorages": refs} @asynccontextmanager async def get_s3_client( @@ -888,7 +889,7 @@ def get_storage_refs( @router.get( "/all-storages", tags=["organizations", "storage"], - response_model=List[StorageRef], + response_model=OrgAllStorages, ) def get_available_storages(org: Organization = Depends(org_owner_dep)): return storage_ops.get_available_storages(org) From 085209bc42523fcc00cca957904124e66384f1ca Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 11:38:45 -0400 Subject: [PATCH 56/64] Add admin docs for storage --- .../docs/docs/deploy/admin/org-storage.md | 74 +++++++++++++++++++ frontend/docs/mkdocs.yml | 1 + 2 files changed, 75 insertions(+) create mode 100644 frontend/docs/docs/deploy/admin/org-storage.md diff --git a/frontend/docs/docs/deploy/admin/org-storage.md b/frontend/docs/docs/deploy/admin/org-storage.md new file mode 100644 index 0000000000..b8bf11a2ad --- /dev/null +++ b/frontend/docs/docs/deploy/admin/org-storage.md @@ -0,0 +1,74 @@ +# Org Storage + +This guide covers configuring storage for an organization in Browsertrix. + +By default, all organizations will use the default storage and default replica locations (if any are configured) set in the Helm chart. + +The Browsertrix API supports adding custom storage locations per organization and configuring the organization to use custom storages for primary and/or replica storage. 
These endpoints are available to superusers only.

## Adding Custom Storage

The first step to configuring custom storage for an organization is to add the S3 buckets to the organization, e.g.:

```sh
curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <auth token>" https://app.browsertrix.com/api/orgs/<org id>/custom-storage --data '{"name": "new-custom-storage", "access_key": "<access key>", "secret_key": "<secret key>", "bucket": "new-custom-storage", "endpoint_url": "https://s3-provider.example.com/"}'
```

Verify that the custom storage has been added to the organization by checking the `/all-storages` API endpoint:

```sh
curl -H "Content-type: application/json" -H "Authorization: Bearer <auth token>" https://app.browsertrix.com/api/orgs/<org id>/all-storages
```

The storage reference for our new custom storage should be present in the returned JSON, e.g.:

```json
{
  "allStorages": [
    {"name": "default", "custom": false},
    {"name": "default-replica", "custom": false},
    {"name": "new-custom-storage", "custom": true}
  ]
}
```

The custom storage is now ready to be configured.


## Updating Org Storage

Each organization has one primary storage location. It is possible to configure the organization to use any of the storage options listed in the `/all-storages` API endpoint as primary storage, e.g.:

```sh
curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <auth token>" https://app.browsertrix.com/api/orgs/<org id>/storage --data '{"storage": {"name": "new-custom-storage", "custom": true}}'
```

If any crawls, uploads, or browser profiles have been created on the organization, modifying the primary storage will disable archiving on the organization while files are migrated from the previous to the new storage location. Archiving is re-enabled when the migration completes.

At this time, all files are copied from the prior storage location to the new storage location, and are not automatically deleted from their source location.


## Updating Org Replica Storage

Each organization can have any number of replica storage locations. These locations serve as replicas of the content in the primary storage location, and are most commonly used as backups.

It is possible to configure the organization to use any of the storage options listed in the `/all-storages` API endpoint as replica storage, e.g.:

```sh
curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <auth token>" https://app.browsertrix.com/api/orgs/<org id>/storage --data '{"storageReplicas": [{"name": "default-replica", "custom": false}, {"name": "new-custom-storage", "custom": true}]}'
```

If any crawls, uploads, or browser profiles have been created on the organization, adding a new replica location will result in a background job to replicate all of the organization's files from primary storage to the new replica location. Unlike with updating primary storage, this process will not disable archiving on the organization.

If any crawls, uploads, or browser profiles have been created on the organization, removing a previously used replica location will result in database updates to reflect that the prior replica location is no longer available. At this time, no files are automatically deleted from the prior replica location.
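After either update, the resulting configuration can be confirmed with the `GET /orgs/<org id>/storage` endpoint. A minimal sketch using Python's `requests`, mirroring the curl examples above (the URL, org id, token, and expected names are placeholders):

```python
import requests

API_PREFIX = "https://app.browsertrix.com/api"  # placeholder deployment URL
HEADERS = {"Authorization": "Bearer <auth token>"}  # placeholder superuser token

r = requests.get(f"{API_PREFIX}/orgs/<org id>/storage", headers=HEADERS)
data = r.json()

# Expected shape: one primary storage ref plus a list of replica refs, e.g.
# {"storage": {"name": "new-custom-storage", "custom": true},
#  "storageReplicas": [{"name": "default-replica", "custom": false}]}
assert data["storage"]["name"] == "new-custom-storage"
assert data["storage"]["custom"]
```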
## Removing Custom Storage

It is also possible to remove a custom storage from an organization by referencing the name of the storage to be deleted in the API endpoint, e.g.:

```sh
curl -X DELETE -H "Content-type: application/json" -H "Authorization: Bearer <auth token>" https://app.browsertrix.com/api/orgs/<org id>/custom-storage/new-custom-storage
```

The custom storage location to be deleted must not be in use on the organization, or else the endpoint will return `400`. Default storage locations shared between organizations cannot be deleted with this endpoint. diff --git a/frontend/docs/mkdocs.yml b/frontend/docs/mkdocs.yml index bb65327485..7b138f75ed 100644 --- a/frontend/docs/mkdocs.yml +++ b/frontend/docs/mkdocs.yml @@ -84,6 +84,7 @@ nav: - deploy/ansible/k3s.md - Administration: - deploy/admin/org-import-export.md + - deploy/admin/org-storage.md - Development: - develop/index.md - develop/local-dev-setup.md From 76285ef8e90a69d0b3cd38ff14b900e0e6d216f0 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 11:42:31 -0400 Subject: [PATCH 57/64] Fix API endpoint path in docs example --- frontend/docs/docs/deploy/admin/org-storage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/docs/docs/deploy/admin/org-storage.md b/frontend/docs/docs/deploy/admin/org-storage.md index b8bf11a2ad..a9282dc9af 100644 --- a/frontend/docs/docs/deploy/admin/org-storage.md +++ b/frontend/docs/docs/deploy/admin/org-storage.md @@ -55,7 +55,7 @@ Each organization can have any number of replica storage locations. These locati It is possible to configure the organization to use any of the storage options listed in the `/all-storages` API endpoint as replica storage, e.g.:

```sh
-curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <auth token>" https://app.browsertrix.com/api/orgs/<org id>/storage --data '{"storageReplicas": [{"name": "default-replica", "custom": false}, {"name": "new-custom-storage", "custom": true}]}'
+curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <auth token>" https://app.browsertrix.com/api/orgs/<org id>/storage-replicas --data '{"storageReplicas": [{"name": "default-replica", "custom": false}, {"name": "new-custom-storage", "custom": true}]}'
```

If any crawls, uploads, or browser profiles have been created on the organization, adding a new replica location will result in a background job to replicate all of the organization's files from primary storage to the new replica location. Unlike with updating primary storage, this process will not disable archiving on the organization.
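As a reminder from the endpoint definitions earlier in this series, these storage endpoints are superuser-only, so a non-superuser token is rejected before any update runs. An illustrative check in the style of the backend tests (URL, org id, and token are placeholders):

```python
import requests

API_PREFIX = "https://app.browsertrix.com/api"  # placeholder deployment URL

# a regular org admin token (placeholder) is not sufficient
r = requests.post(
    f"{API_PREFIX}/orgs/<org id>/storage-replicas",
    headers={"Authorization": "Bearer <org admin token>"},
    json={"storageReplicas": [{"name": "default-replica", "custom": False}]},
)

# the endpoint raises HTTPException(403, "Not Allowed") for non-superusers
assert r.status_code == 403
assert r.json()["detail"] == "Not Allowed"
```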
From 41d4f7962b85253b3fe10353e2e337b98311d197 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 11:43:38 -0400 Subject: [PATCH 58/64] Docs typo fix --- frontend/docs/docs/deploy/admin/org-storage.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontend/docs/docs/deploy/admin/org-storage.md b/frontend/docs/docs/deploy/admin/org-storage.md index a9282dc9af..2c3b613565 100644 --- a/frontend/docs/docs/deploy/admin/org-storage.md +++ b/frontend/docs/docs/deploy/admin/org-storage.md @@ -45,7 +45,7 @@ curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <auth token> From: Tessa Walsh Date: Thu, 17 Oct 2024 11:54:21 -0400 Subject: [PATCH 59/64] Add provider field note --- frontend/docs/docs/deploy/admin/org-storage.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/frontend/docs/docs/deploy/admin/org-storage.md b/frontend/docs/docs/deploy/admin/org-storage.md index 2c3b613565..7f0867b3a7 100644 --- a/frontend/docs/docs/deploy/admin/org-storage.md +++ b/frontend/docs/docs/deploy/admin/org-storage.md @@ -14,6 +14,8 @@ The first step to configuring custom storage for an organization is to add the curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <auth token>" https://app.browsertrix.com/api/orgs/<org id>/custom-storage --data '{"name": "new-custom-storage", "access_key": "<access key>", "secret_key": "<secret key>", "bucket": "new-custom-storage", "endpoint_url": "https://s3-provider.example.com/"}' ```
+If desired, the `provider` field can be set to any of the [values supported by rclone for S3 providers](https://rclone.org/s3/#s3-provider). By default, this field is set to "Other", which should work for nearly all S3 storage providers. This value is used solely for migrations from rclone when updating org storage and/or replica locations. +
Verify that the custom storage has been added to the organization by checking the `/all-storages` API endpoint: From 5ca80f0e1af0d92bb7de8dc7b43dc6dadc8a3fd4 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 11:54:58 -0400 Subject: [PATCH 60/64] Docs language cleanup --- frontend/docs/docs/deploy/admin/org-storage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/docs/docs/deploy/admin/org-storage.md b/frontend/docs/docs/deploy/admin/org-storage.md index 7f0867b3a7..ad5563f77d 100644 --- a/frontend/docs/docs/deploy/admin/org-storage.md +++ b/frontend/docs/docs/deploy/admin/org-storage.md @@ -14,7 +14,7 @@ The first step to configuring custom storage for an organization is to add the curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <auth token>" https://app.browsertrix.com/api/orgs/<org id>/custom-storage --data '{"name": "new-custom-storage", "access_key": "<access key>", "secret_key": "<secret key>", "bucket": "new-custom-storage", "endpoint_url": "https://s3-provider.example.com/"}' ```
-If desired, the `provider` field can be set to any of the [values supported by rclone for S3 providers](https://rclone.org/s3/#s3-provider). By default, this field is set to "Other", which should work for nearly all S3 storage providers. This value is used solely for migrations from rclone when updating org storage and/or replica locations.
+If desired, the `provider` field can be set to any of the [values supported by rclone for S3 providers](https://rclone.org/s3/#s3-provider). By default, this field is set to "Other", which should work for nearly all S3 storage providers. This value is used solely for migrating existing files with rclone when updating org storage and/or replica locations.
Verify that the custom storage has been added to the organization by checking the `/all-storages` API endpoint: From af0d966e97173ef8449185de626ed8d12193edf3 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 11:58:23 -0400 Subject: [PATCH 61/64] Check /all-storages in backend tests --- backend/test/test_org.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/backend/test/test_org.py b/backend/test/test_org.py index b9fe8eb2ab..6ef30e5a12 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -261,7 +261,10 @@ def test_add_custom_storage_doesnt_verify(admin_auth_headers): }, ) assert r.status_code == 400 - assert r.json()["detail"] == "Could not verify custom storage. Check credentials are valid?" + assert ( + r.json()["detail"] + == "Could not verify custom storage. Check credentials are valid?" + ) def test_add_custom_storage(admin_auth_headers): @@ -298,6 +301,16 @@ def test_add_custom_storage(admin_auth_headers): assert data["added"] assert data["name"] == CUSTOM_REPLICA_STORAGE_NAME + # verify custom storages are now available on org + r = requests.get( + f"{API_PREFIX}/orgs/{new_oid}/all-storages", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + all_storages = r.json()["allStorages"] + assert {"name": CUSTOM_PRIMARY_STORAGE_NAME, "custom": True} in all_storages + assert {"name": CUSTOM_REPLICA_STORAGE_NAME, "custom": True} in all_storages + # set org to use custom storage moving forward r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/storage", From b461113ca736d1b9b46b788841c196195b27a28d Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 22:45:20 -0400 Subject: [PATCH 62/64] Add API endpoint for background job progress --- backend/btrixcloud/background_jobs.py | 58 +++++++++++++++++++++++++++ backend/btrixcloud/crawlmanager.py | 16 ++++++++ backend/btrixcloud/models.py | 8 ++++ chart/app-templates/copy_job.yaml | 5 ++- 4 files changed, 85 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 7ab882777a..bf14606600 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -27,6 +27,7 @@ StorageRef, User, SuccessResponse, + JobProgress, ) from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .utils import dt_now @@ -537,6 +538,52 @@ def _get_job_by_type_from_data(self, data: dict[str, object]): return DeleteOrgJob.from_dict(data) + async def get_job_progress(self, job_id: str) -> JobProgress: + """Return progress of background job for supported types""" + job = await self.get_background_job(job_id) + + if job.type != BgJobType.COPY_BUCKET: + raise HTTPException(status_code=403, detail="job_type_not_supported") + + if job.success is False: + raise HTTPException(status_code=400, detail="job_failed") + + if job.finished: + return JobProgress(percentage=1.0) + + log_tail = await self.crawl_manager.tail_background_job(job_id) + if not log_tail: + raise HTTPException(status_code=400, detail="job_log_not_available") + + lines = log_tail.splitlines() + reversed_lines = list(reversed(lines)) + + progress = JobProgress(percentage=0.0) + + # Parse lines in reverse order until we find one with latest stats + for line in reversed_lines: + try: + if "ETA" not in line: + continue + + stats_groups = line.split(",") + for group in stats_groups: + group = group.strip() + if "%" in group: + progress.percentage = float(group.strip("%")) / 100 + if "ETA" in group: + eta_str = 
group.strip("ETA ") + # Split on white space to remove byte mark rclone sometimes + # adds to end of stats line + eta_list = eta_str.split(" ") + progress.eta = eta_list[0] + + break + except: + continue + + return progress + async def list_background_jobs( self, org: Organization, @@ -753,6 +800,17 @@ async def get_background_job( """Retrieve information for background job""" return await ops.get_background_job(job_id, org.id) + @router.get( + "/{job_id}/progress", + response_model=JobProgress, + ) + async def get_job_progress( + job_id: str, + org: Organization = Depends(org_crawl_dep), + ): + """Return progress information for background job""" + return await ops.get_job_progress(job_id) + @app.get("/orgs/all/jobs/{job_id}", response_model=AnyJob, tags=["jobs"]) async def get_background_job_all_orgs(job_id: str, user: User = Depends(user_dep)): """Get background job from any org""" diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index b6a2cce56a..c4f342bb46 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -383,6 +383,22 @@ async def delete_crawl_config_by_id(self, cid: str) -> None: """Delete all crawl configs by id""" await self._delete_crawl_configs(f"btrix.crawlconfig={cid}") + async def tail_background_job(self, job_id: str) -> str: + """Tail running background job pod""" + pods = await self.core_api.list_namespaced_pod( + namespace=self.namespace, + label_selector=f"batch.kubernetes.io/job-name={job_id}", + ) + + if not pods.items: + return "" + + pod_name = pods.items[0].metadata.name + + return await self.core_api.read_namespaced_pod_log( + pod_name, self.namespace, tail_lines=10 + ) + # ======================================================================== # Internal Methods async def _delete_crawl_configs(self, label) -> None: diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 16ad49d59f..b2da8c1461 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -2120,6 +2120,14 @@ class CopyBucketJob(BackgroundJob): ] +# ============================================================================ +class JobProgress(BaseModel): + """Model for reporting background job progress""" + + percentage: float + eta: Optional[str] = None + + # ============================================================================ ### PAGES ### diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index cae8a3a449..61a8febcde 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -3,12 +3,13 @@ kind: Job metadata: name: "{{ id }}" labels: + job-id: "{{ id }}" role: "background-job" job_type: {{ job_type }} btrix.org: {{ oid }} spec: - ttlSecondsAfterFinished: 60 + ttlSecondsAfterFinished: 30 backoffLimit: 3 template: spec: @@ -86,7 +87,7 @@ spec: - name: RCLONE_CONFIG_NEW_ENDPOINT value: "{{ new_endpoint }}" - command: ["rclone", "-vv", "--progress", "copy", "--checksum", "--use-mmap", "--transfers=2", "--checkers=2", "prev:{{ prev_bucket }}{{ oid }}", "new:{{ new_bucket }}{{ oid }}"] + command: ["rclone", "-v", "--stats-one-line", "--stats", "10s", "copy", "--checksum", "--use-mmap", "--transfers=2", "--checkers=2", "prev:{{ prev_bucket }}{{ oid }}", "new:{{ new_bucket }}{{ oid }}"] resources: limits: memory: "350Mi" From 60582c1bfe36473ea80ff2b8d9e7887225ac5cbf Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Fri, 18 Oct 2024 01:09:14 -0400 Subject: [PATCH 63/64] Fix linting --- 
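A note on the progress parsing added two patches back: the tailed log lines are rclone's one-line stats. A condensed, self-contained version of that parse against a hypothetical sample line (rclone's exact output varies by version):

```python
# Hypothetical rclone --stats-one-line output; format may vary by version.
line = "1.250 GiB / 2.500 GiB, 50%, 42.5 MiB/s, ETA 30s"

percentage, eta = 0.0, None
for group in line.split(","):
    group = group.strip()
    if group.endswith("%"):
        percentage = float(group.strip("%")) / 100
    if group.startswith("ETA"):
        # keep only the first token, in case stray bytes trail the ETA value
        eta = group.removeprefix("ETA").strip().split(" ")[0]

assert percentage == 0.5
assert eta == "30s"
```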
backend/btrixcloud/background_jobs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index bf14606600..1515f09879 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -579,6 +579,7 @@ async def get_job_progress(self, job_id: str) -> JobProgress: progress.eta = eta_list[0] break + # pylint: disable=bare-except except: continue @@ -806,6 +807,7 @@ async def get_background_job( ) async def get_job_progress( job_id: str, + # pylint: disable=unused-argument org: Organization = Depends(org_crawl_dep), ): """Return progress information for background job""" From c88966452cdce6a32d1a42f7b80ee7300bf2de80 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Dec 2024 16:56:00 -0500 Subject: [PATCH 64/64] Format post-rebase with Black --- backend/btrixcloud/main_op.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/btrixcloud/main_op.py b/backend/btrixcloud/main_op.py index 030b40a777..84107ffa7b 100644 --- a/backend/btrixcloud/main_op.py +++ b/backend/btrixcloud/main_op.py @@ -25,7 +25,6 @@ def main(): ) sys.exit(1) - ( org_ops, crawl_config_ops,