Implement custom storage for orgs #2093

Open
wants to merge 64 commits into main
Commits (64)
8388ce2
Add back custom storage endpoints
tw4l Sep 20, 2024
5fbdca1
Flush out tests for setting custom storage
tw4l Sep 20, 2024
fb18134
Fix test issue with bucket not existing for now
tw4l Sep 20, 2024
0302275
Add additional tests
tw4l Sep 23, 2024
19850ba
Fix custom storage so it works as expected
tw4l Sep 24, 2024
c78b5b8
Actually unset custom replica storage before deleting
tw4l Sep 24, 2024
d5fd8e1
Add TODO where custom storage deletion is failing
tw4l Sep 24, 2024
1dcd39b
Fix check for whether storage label is in use
tw4l Sep 24, 2024
1a1cb30
Remove todo on endpoint that's fine
tw4l Sep 24, 2024
7558ac3
Add todos re: tasks necessary to change storage
tw4l Sep 24, 2024
46e0fc9
Check that no crawls are running before updating storage
tw4l Sep 25, 2024
791ca6e
Start adding post-storage update logic
tw4l Sep 25, 2024
b899ecf
WIP: Add background job to copy old s3 bucket to new
tw4l Sep 25, 2024
21070d6
WIP: Start adding logic to handle replica location updates
tw4l Sep 25, 2024
7b6a917
Add additional note
tw4l Sep 25, 2024
3b3116f
Fix argument
tw4l Sep 25, 2024
9076555
Fix another argument
tw4l Sep 25, 2024
853b840
Fixups
tw4l Sep 25, 2024
3a4b43c
Fix linting
tw4l Sep 25, 2024
048a563
More linting fixes
tw4l Sep 25, 2024
e3c41b9
Refactor, seperate storage and replicas updates
tw4l Sep 26, 2024
34310de
More refactoring
tw4l Sep 26, 2024
80cc5d4
Make post-update task methods private
tw4l Sep 26, 2024
4112044
Check if any bg jobs running before changing storage
tw4l Sep 26, 2024
287fae9
Check bg job finished as well
tw4l Sep 26, 2024
5096005
Fixups
tw4l Sep 26, 2024
7d7fd36
Storage update improvements
tw4l Sep 26, 2024
cb461f6
Fixup
tw4l Sep 26, 2024
6e3f3ef
Remove TODO
tw4l Sep 26, 2024
cf40245
Remove another todo
tw4l Sep 26, 2024
3de96a8
More fixups
tw4l Sep 26, 2024
db65dd8
Add provider to s3storage for rclone
tw4l Sep 26, 2024
89585fa
Fix typo
tw4l Sep 26, 2024
8abe1ef
Make API endpoints that change storage superuser-only for now
tw4l Sep 30, 2024
c146ab3
Add typing for init_storages_api, import Callable
tw4l Sep 30, 2024
abc238e
Add missing User import
tw4l Sep 30, 2024
29a54b7
Fix StorageOps in operator main
tw4l Oct 1, 2024
f348a16
Always use oid prefix in s3 storage
tw4l Oct 1, 2024
e67868b
Post-rebase fixups and remove create bucket fallback
tw4l Oct 10, 2024
e2dfa93
Create extra test buckets in CI
tw4l Oct 15, 2024
e054158
Add test for non-verified custom storage
tw4l Oct 15, 2024
f0dc334
Refactor to move updates to FastAPI background tasks
tw4l Oct 15, 2024
0d3aa35
Include default replicas in /storage response if no org replicas
tw4l Oct 15, 2024
7a8ec58
Fix unsetting of presigned URLs
tw4l Oct 16, 2024
6277331
Add --progress flag to rclone copy command
tw4l Oct 16, 2024
1eb533c
Increase ttl seconds after finished for testing on dev
tw4l Oct 17, 2024
be10a9a
Ensure there are no double slashes between bucket name and oid
tw4l Oct 17, 2024
e028dc8
Increase memory limit/request for copy job to 500Mi
tw4l Oct 17, 2024
edcd5c6
Reduce copy job ttlSecondsAfterFinished to 60
tw4l Oct 17, 2024
d676113
Add storage tag to API endpoints
tw4l Oct 17, 2024
ef59178
Add flags to rclone to reduce memory usage, set limit to 350Mi
tw4l Oct 17, 2024
07490fb
Fix positional operator in storage ref update
tw4l Oct 17, 2024
4f1aa1d
One more positional operator fix
tw4l Oct 17, 2024
5d81ed2
Update docstrings and comments
tw4l Oct 17, 2024
b095023
Make all-storages response valid JSON with response model
tw4l Oct 17, 2024
085209b
Add admin docs for storage
tw4l Oct 17, 2024
76285ef
Fix API endpoint path in docs example
tw4l Oct 17, 2024
41d4f79
Docs typo fix
tw4l Oct 17, 2024
db12ef9
Add provider field note
tw4l Oct 17, 2024
5ca80f0
Docs language cleanup
tw4l Oct 17, 2024
af0d966
Check /all-storages in backend tests
tw4l Oct 17, 2024
b461113
Add API endpoint for background job progress
tw4l Oct 18, 2024
60582c1
Fix linting
tw4l Oct 18, 2024
c889664
Format post-rebase with Black
tw4l Dec 3, 2024
5 changes: 5 additions & 0 deletions .github/workflows/k3d-ci.yaml
@@ -91,6 +91,11 @@ jobs:
- name: Wait for all pods to be ready
run: kubectl wait --for=condition=ready pod --all --timeout=240s

- name: Create Extra Test Buckets
run: |
kubectl exec -i deployment/local-minio -c minio -- mkdir /data/custom-primary &&
kubectl exec -i deployment/local-minio -c minio -- mkdir /data/custom-replica

- name: Run Tests
timeout-minutes: 30
run: pytest -vv ./backend/test/test_*.py
5 changes: 5 additions & 0 deletions .github/workflows/microk8s-ci.yaml
@@ -66,6 +66,11 @@ jobs:
- name: Wait for all pods to be ready
run: sudo microk8s kubectl wait --for=condition=ready pod --all --timeout=240s

- name: Create Extra Test Buckets
run: |
kubectl exec -i deployment/local-minio -c minio -- mkdir /data/custom-primary &&
kubectl exec -i deployment/local-minio -c minio -- mkdir /data/custom-replica

- name: Run Tests
run: pytest -vv ./backend/test/test_*.py

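Both workflow additions above create the extra test buckets by making directories directly inside the local MinIO pod's `/data` volume, which the local deployment exposes as buckets; since the "remove create bucket fallback" commit, the custom-storage tests expect these buckets to already exist. A quick sanity check over the S3 API might look like the sketch below — the endpoint and credentials are placeholders for whatever the local test deployment actually uses:

```python
import boto3

# Placeholder endpoint/credentials - substitute the values from the
# local MinIO deployment used by the CI charts.
s3 = boto3.client(
    "s3",
    endpoint_url="http://localhost:30090",
    aws_access_key_id="ADMIN",
    aws_secret_access_key="PASSW0RD",
)

# Both extra buckets should be visible alongside the default ones.
buckets = {b["Name"] for b in s3.list_buckets()["Buckets"]}
assert {"custom-primary", "custom-replica"} <= buckets
```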
180 changes: 169 additions & 11 deletions backend/btrixcloud/background_jobs.py
@@ -16,17 +16,18 @@
from .models import (
BaseFile,
Organization,
BackgroundJob,
BgJobType,
CreateReplicaJob,
DeleteReplicaJob,
DeleteOrgJob,
RecalculateOrgStatsJob,
CopyBucketJob,
PaginatedBackgroundJobResponse,
AnyJob,
StorageRef,
User,
SuccessResponse,
JobProgress,
)
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .utils import dt_now
@@ -51,7 +52,7 @@ class BackgroundJobOps:
base_crawl_ops: BaseCrawlOps
profile_ops: ProfileOps

# pylint: disable=too-many-locals, too-many-arguments, invalid-name
# pylint: disable=too-many-locals, too-many-arguments, too-many-positional-arguments, invalid-name

def __init__(self, mdb, email, user_manager, org_ops, crawl_manager, storage_ops):
self.jobs = mdb["jobs"]
@@ -295,14 +296,18 @@ async def create_delete_org_job(
self,
org: Organization,
existing_job_id: Optional[str] = None,
) -> Optional[str]:
) -> str:
"""Create background job to delete org and its data"""

job_type = BgJobType.DELETE_ORG.value

try:
job_id = await self.crawl_manager.run_delete_org_job(
oid=str(org.id),
job_type=job_type,
backend_image=os.environ.get("BACKEND_IMAGE", ""),
pull_policy=os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""),
job_id_prefix=f"{job_type}-{org.id}",
existing_job_id=existing_job_id,
)
if existing_job_id:
Expand Down Expand Up @@ -334,7 +339,7 @@ async def create_delete_org_job(
except Exception as exc:
# pylint: disable=raise-missing-from
print(f"warning: delete org job could not be started: {exc}")
return None
return ""

async def create_recalculate_org_stats_job(
self,
Expand Down Expand Up @@ -381,6 +386,73 @@ async def create_recalculate_org_stats_job(
print(f"warning: recalculate org stats job could not be started: {exc}")
return None

async def create_copy_bucket_job(
self,
org: Organization,
prev_storage_ref: StorageRef,
new_storage_ref: StorageRef,
existing_job_id: Optional[str] = None,
) -> str:
"""Start background job to copy entire s3 bucket and return job id"""
prev_storage = self.storage_ops.get_org_storage_by_ref(org, prev_storage_ref)
prev_endpoint, prev_bucket = self.strip_bucket(prev_storage.endpoint_url)

new_storage = self.storage_ops.get_org_storage_by_ref(org, new_storage_ref)
new_endpoint, new_bucket = self.strip_bucket(new_storage.endpoint_url)

# Ensure buckets terminate with trailing slash
prev_bucket = os.path.join(prev_bucket, "")
new_bucket = os.path.join(new_bucket, "")

job_type = BgJobType.COPY_BUCKET.value

try:
job_id = await self.crawl_manager.run_copy_bucket_job(
oid=str(org.id),
job_type=job_type,
prev_storage=prev_storage_ref,
prev_endpoint=prev_endpoint,
prev_bucket=prev_bucket,
new_storage=new_storage_ref,
new_endpoint=new_endpoint,
new_bucket=new_bucket,
job_id_prefix=f"{job_type}-{org.id}",
existing_job_id=existing_job_id,
)
if existing_job_id:
copy_job = await self.get_background_job(existing_job_id, org.id)
previous_attempt = {
"started": copy_job.started,
"finished": copy_job.finished,
}
if copy_job.previousAttempts:
copy_job.previousAttempts.append(previous_attempt)
else:
copy_job.previousAttempts = [previous_attempt]
copy_job.started = dt_now()
copy_job.finished = None
copy_job.success = None
else:
copy_job = CopyBucketJob(
id=job_id,
oid=org.id,
started=dt_now(),
prev_storage=prev_storage_ref,
new_storage=new_storage_ref,
)

await self.jobs.find_one_and_update(
{"_id": job_id}, {"$set": copy_job.to_dict()}, upsert=True
)

return job_id
# pylint: disable=broad-exception-caught
except Exception as exc:
print(
f"warning: copy bucket job could not be started for org {org.id}: {exc}"
)
return ""

async def job_finished(
self,
job_id: str,
@@ -406,6 +478,9 @@ async def job_finished(
await self.handle_delete_replica_job_finished(
cast(DeleteReplicaJob, job)
)
if job_type == BgJobType.COPY_BUCKET:
org = await self.org_ops.get_org_by_id(oid)
await self.org_ops.update_read_only(org, False)
else:
print(
f"Background job {job.id} failed, sending email to superuser",
@@ -430,7 +505,11 @@ async def job_finished(
async def get_background_job(
self, job_id: str, oid: Optional[UUID] = None
) -> Union[
CreateReplicaJob, DeleteReplicaJob, DeleteOrgJob, RecalculateOrgStatsJob
CreateReplicaJob,
DeleteReplicaJob,
CopyBucketJob,
DeleteOrgJob,
RecalculateOrgStatsJob,
]:
"""Get background job"""
query: dict[str, object] = {"_id": job_id}
@@ -445,27 +524,78 @@ async def get_background_job(

def _get_job_by_type_from_data(self, data: dict[str, object]):
"""convert dict to propert background job type"""
if data["type"] == BgJobType.CREATE_REPLICA:
if data["type"] == BgJobType.CREATE_REPLICA.value:
return CreateReplicaJob.from_dict(data)

if data["type"] == BgJobType.DELETE_REPLICA:
if data["type"] == BgJobType.DELETE_REPLICA.value:
return DeleteReplicaJob.from_dict(data)

if data["type"] == BgJobType.RECALCULATE_ORG_STATS:
if data["type"] == BgJobType.RECALCULATE_ORG_STATS.value:
return RecalculateOrgStatsJob.from_dict(data)

if data["type"] == BgJobType.COPY_BUCKET.value:
return CopyBucketJob.from_dict(data)

return DeleteOrgJob.from_dict(data)

async def get_job_progress(self, job_id: str) -> JobProgress:
"""Return progress of background job for supported types"""
job = await self.get_background_job(job_id)

if job.type != BgJobType.COPY_BUCKET:
raise HTTPException(status_code=403, detail="job_type_not_supported")

if job.success is False:
raise HTTPException(status_code=400, detail="job_failed")

if job.finished:
return JobProgress(percentage=1.0)

log_tail = await self.crawl_manager.tail_background_job(job_id)
if not log_tail:
raise HTTPException(status_code=400, detail="job_log_not_available")

lines = log_tail.splitlines()
reversed_lines = list(reversed(lines))

progress = JobProgress(percentage=0.0)

# Parse lines in reverse order until we find one with latest stats
for line in reversed_lines:
try:
if "ETA" not in line:
continue

stats_groups = line.split(",")
for group in stats_groups:
group = group.strip()
if "%" in group:
progress.percentage = float(group.strip("%")) / 100
if "ETA" in group:
eta_str = group.strip("ETA ")
# Split on white space to remove byte mark rclone sometimes
# adds to end of stats line
eta_list = eta_str.split(" ")
progress.eta = eta_list[0]

break
# pylint: disable=bare-except
except:
continue

return progress
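The loop above walks the tail of rclone's log backwards and pulls the percentage and ETA out of the most recent stats line. As a standalone sketch of that extraction — the sample line is illustrative, not captured from a real job, and rclone's exact formatting can vary between versions:

```python
# Illustrative rclone stats line of the kind emitted with --progress.
line = "Transferred:   646.251 MiB / 1.204 GiB, 52%, 10.5 MiB/s, ETA 1m2s"

percentage, eta = 0.0, None
for group in line.split(","):
    group = group.strip()
    if "%" in group:
        percentage = float(group.strip("%")) / 100  # "52%" -> 0.52
    if "ETA" in group:
        # Strip the "ETA " prefix, then split on whitespace in case a stray
        # byte mark trails the value (as the comment in the code above notes).
        eta = group.strip("ETA ").split(" ")[0]     # -> "1m2s"

print(percentage, eta)  # 0.52 1m2s
```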

async def list_background_jobs(
self,
org: Organization,
page_size: int = DEFAULT_PAGE_SIZE,
page: int = 1,
success: Optional[bool] = None,
running: Optional[bool] = None,
job_type: Optional[str] = None,
sort_by: Optional[str] = None,
sort_direction: Optional[int] = -1,
) -> Tuple[List[BackgroundJob], int]:
) -> Tuple[List[Union[CreateReplicaJob, DeleteReplicaJob, CopyBucketJob]], int]:
"""List all background jobs"""
# pylint: disable=duplicate-code
# Zero-index page for query
@@ -477,6 +607,12 @@ async def list_background_jobs(
if success in (True, False):
query["success"] = success

if running:
query["success"] = None

if running is False:
query["success"] = {"$in": [True, False]}

if job_type:
query["type"] = job_type

Expand Down Expand Up @@ -595,6 +731,14 @@ async def retry_background_job(
existing_job_id=job_id,
)

if job.type == BgJobType.COPY_BUCKET:
await self.create_copy_bucket_job(
org,
job.prev_storage,
job.new_storage,
existing_job_id=job_id,
)

return {"success": True}

async def retry_failed_background_jobs(
Expand Down Expand Up @@ -630,7 +774,7 @@ async def retry_all_failed_background_jobs(


# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, too-many-positional-arguments
def init_background_jobs_api(
app, mdb, email, user_manager, org_ops, crawl_manager, storage_ops, user_dep
):
Expand All @@ -657,7 +801,19 @@ async def get_background_job(
"""Retrieve information for background job"""
return await ops.get_background_job(job_id, org.id)

@app.get("/orgs/all/jobs/{job_id}", response_model=SuccessResponse, tags=["jobs"])
@router.get(
"/{job_id}/progress",
response_model=JobProgress,
)
async def get_job_progress(
job_id: str,
# pylint: disable=unused-argument
org: Organization = Depends(org_crawl_dep),
):
"""Return progress information for background job"""
return await ops.get_job_progress(job_id)
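Assuming the router here is mounted under the per-org jobs prefix like the surrounding endpoints (the prefix itself is not shown in this diff), polling the new progress endpoint from a client could look roughly like this; the URL, ids, and token are placeholders:

```python
import requests

API = "https://btrix.example.com/api"          # placeholder deployment URL
OID = "c69247f6-12aa-4d06-93be-2d56ddd61baa"   # placeholder org id
JOB_ID = "copy-bucket-c69247f6-xxxxx"          # placeholder job id

resp = requests.get(
    f"{API}/orgs/{OID}/jobs/{JOB_ID}/progress",
    headers={"Authorization": "Bearer <access token>"},
)
resp.raise_for_status()

# JobProgress: percentage in [0, 1], plus an rclone-style ETA string while
# the copy is still running (1.0 and no ETA once the job has finished).
print(resp.json())  # e.g. {"percentage": 0.52, "eta": "1m2s"}
```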

@app.get("/orgs/all/jobs/{job_id}", response_model=AnyJob, tags=["jobs"])
async def get_background_job_all_orgs(job_id: str, user: User = Depends(user_dep)):
"""Get background job from any org"""
if not user.is_superuser:
Expand Down Expand Up @@ -696,6 +852,7 @@ async def list_background_jobs(
pageSize: int = DEFAULT_PAGE_SIZE,
page: int = 1,
success: Optional[bool] = None,
running: Optional[bool] = None,
jobType: Optional[str] = None,
sortBy: Optional[str] = None,
sortDirection: Optional[int] = -1,
@@ -706,6 +863,7 @@ async def list_background_jobs(
page_size=pageSize,
page=page,
success=success,
running=running,
job_type=jobType,
sort_by=sortBy,
sort_direction=sortDirection,
2 changes: 1 addition & 1 deletion backend/btrixcloud/basecrawls.py
@@ -54,7 +54,7 @@


# ============================================================================
# pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines
# pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines, too-many-positional-arguments
class BaseCrawlOps:
"""operations that apply to all crawls"""

1 change: 1 addition & 0 deletions backend/btrixcloud/colls.py
@@ -43,6 +43,7 @@


# ============================================================================
# pylint: disable=too-many-positional-arguments
class CollectionOps:
"""ops for working with named collections of crawls"""

4 changes: 2 additions & 2 deletions backend/btrixcloud/crawlconfigs.py
@@ -73,7 +73,7 @@
class CrawlConfigOps:
"""Crawl Config Operations"""

# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods
# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods, too-many-positional-arguments

user_manager: UserManager
org_ops: OrgOps
@@ -1081,7 +1081,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):


# ============================================================================
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments,too-many-positional-arguments
def init_crawl_config_api(
app,
dbclient,