feat(openadapt.privacy.providers): add private_ai as a scrubbing prov…

…ider (OpenAdaptAI#486) * ran black * add blank file for priavte ai * add class `PrivateAIScrubbingProvider` * add pvt_ai api key in config * ran `pre-commit run --all` * add scrub_text function * progress for image redaction * format * complete scrub_image * start scrub_pdf * add pdf redaction code * add more wrapper for invalid reponse from priavate ai * try to fix pytest * try to fix pytest * remove lst * remove uncessary methods * text scrubbing test all passses * pdf_redaction test works * add test_image_redaction test in pyetst * add easy ocr to poetry * pytest is fixed * remove versino files * remove un-neccasry files * add code to remove uncessary files after pytest * addressed OpenAdaptAI#486 (comment) * address comment OpenAdaptAI#486 (comment) * reduce line chars * addressed comment: OpenAdaptAI#486 (comment) * fix flake8 * use f strings * address comment: OpenAdaptAI#486 (comment) * address comment: OpenAdaptAI#486 (comment) * change to value error * remove .keys() * add constants * fix flake8 erros * use BytesIO * address comment OpenAdaptAI#486 (comment) * rna black * final commit * remove unused code * refactor typo * rename `redact_file_path` to `redacted_file_path` * use BytesIO wherever possible * fix flake8 * add documentation links * Apply suggestions from code review * Update tests/openadapt/privacy/providers/test_private_ai_scrub.py * fix poetry.lock * poetry.lock --------- Co-authored-by: Krish Patel <[email protected]> Co-authored-by: Richard Abrich <[email protected]> Co-authored-by: Richard Abrich <[email protected]>
R-ohit-B-isht · Nov 11, 2023 · b92667a · b92667a
1 parent 95963f0
commit b92667a
Show file tree

Hide file tree

Showing 13 changed files with 730 additions and 158 deletions.
diff --git a/assets/sample_llc_1.pdf b/assets/sample_llc_1.pdf
diff --git a/openadapt/privacy/providers/__init__.py b/openadapt/privacy/providers/__init__.py
@@ -9,3 +9,4 @@ class ScrubProvider:  # pylint: disable=too-few-public-methods
 
     PRESIDIO = "PRESIDIO"
     COMPREHEND = "COMPREHEND"
+    PRIVATE_AI = "PRIVATE_AI"
diff --git a/openadapt/privacy/providers/private_ai.py b/openadapt/privacy/providers/private_ai.py
@@ -0,0 +1,198 @@
+"""A Module for Private AI Scrubbing Provider."""
+
+from io import BytesIO
+from typing import List
+import base64
+
+from loguru import logger
+from PIL import Image
+import requests
+
+from openadapt import config
+from openadapt.privacy.base import Modality, ScrubbingProvider, TextScrubbingMixin
+from openadapt.privacy.providers import ScrubProvider
+
+BASE64_URL = "https://api.private-ai.com/deid/v3/process/files/base64"
+FILES_DIR = "assets/"
+HEADER_CONTENT_TYPE = "application/json"
+IMAGE_CONTENT_TYPE = "image/png"
+PDF_CONTENT_TYPE = "application/pdf"
+TEMP_IMAGEFILE_NAME = "temp_image_to_scrub.png"
+TEXT_URL = "https://api.private-ai.com/deid/v3/process/text"
+
+
+class PrivateAIScrubbingProvider(
+    ScrubProvider, ScrubbingProvider, TextScrubbingMixin
+):  # pylint: disable=abstract-method
+    """A Class for Private AI Scrubbing Provider."""
+
+    name: str = ScrubProvider.PRIVATE_AI
+    capabilities: List[Modality] = [Modality.TEXT, Modality.PIL_IMAGE, Modality.PDF]
+
+    def scrub_text(self, text: str, is_separated: bool = False) -> str:
+        """Scrub the text of all PII/PHI.
+
+        Args:
+            text (str): Text to be redacted
+            is_separated (bool): Whether the text is separated with special characters
+
+        Returns:
+            str: redacted text
+        """
+        payload = {
+            "text": [text],
+            "link_batch": False,
+            "entity_detection": {
+                "accuracy": "high",
+                "return_entity": True,
+            },
+            "processed_text": {
+                "type": "MARKER",
+                "pattern": "[UNIQUE_NUMBERED_ENTITY_TYPE]",
+            },
+        }
+
+        headers = {
+            "Content-Type": HEADER_CONTENT_TYPE,
+            "X-API-KEY": config.PRIVATE_AI_API_KEY,
+        }
+
+        response = requests.post(TEXT_URL, json=payload, headers=headers)
+        response.raise_for_status()
+        data = response.json()
+        logger.debug(f"{data=}")
+
+        # According to the PrivateAI API documentation,
+        # https://docs.private-ai.com/reference/latest/operation/process_text_v3_process_text_post/
+        # the response is a list of dicts when there is no error/issue in the request
+        # else it is a dict with a key "detail" containing the error message
+
+        if type(data) is dict and "detail" in data:
+            raise ValueError(data.get("detail"))
+
+        redacted_text = data[0].get("processed_text")
+        logger.debug(f"{redacted_text=}")
+
+        return redacted_text
+
+    def scrub_image(
+        self,
+        image: Image,
+        fill_color: int = config.SCRUB_FILL_COLOR,  # pylint: disable=no-member
+    ) -> Image:
+        """Scrub the image of all PII/PHI.
+
+        Args:
+            image (Image): A PIL.Image object to be redacted
+            fill_color (int): The color used to fill the redacted regions(BGR).
+
+        Returns:
+            Image: The redacted image with PII and PHI removed.
+        """
+        buffer = BytesIO()
+
+        image.save(buffer, format="PNG")
+        # Get the image data as bytes
+        image_data = buffer.getvalue()
+
+        file_data = base64.b64encode(image_data)
+        file_data = file_data.decode("ascii")
+
+        # Clean up by closing the BytesIO buffer
+        buffer.close()
+
+        payload = {
+            "file": {"data": file_data, "content_type": IMAGE_CONTENT_TYPE},
+            "entity_detection": {"accuracy": "high", "return_entity": True},
+            "pdf_options": {"density": 150, "max_resolution": 2000},
+            "audio_options": {"bleep_start_padding": 0, "bleep_end_padding": 0},
+        }
+
+        headers = {
+            "Content-Type": HEADER_CONTENT_TYPE,
+            "X-API-KEY": config.PRIVATE_AI_API_KEY,
+        }
+
+        response = requests.post(BASE64_URL, json=payload, headers=headers)
+        response = response.json()
+        logger.debug(f"{response=}")
+
+        # According to the PrivateAI API documentation,
+        # https://docs.private-ai.com/reference/latest/operation/process_files_base64_v3_process_files_base64_post/
+        # else it is a dict with a key "detail" containing the error message
+
+        if type(response) is dict and "detail" in response:
+            raise ValueError(response.get("detail"))
+
+        redacted_file_data = response.get("processed_file").encode("ascii")
+        redacted_file_data = base64.b64decode(redacted_file_data, validate=True)
+
+        # Use a BytesIO buffer to work with redacted_file_data
+        redacted_buffer = BytesIO(redacted_file_data)
+
+        redact_pil_image_data = Image.open(redacted_buffer)
+
+        return redact_pil_image_data
+
+    def scrub_pdf(self, path_to_pdf: str) -> str:
+        """Scrub the PDF of all PII/PHI.
+
+        Args:
+            path_to_pdf (str): Path to the PDF to be redacted
+
+        Returns:
+            str: Path to the redacted PDF
+        """
+        # Create a BytesIO buffer to read the PDF file
+        with open(path_to_pdf, "rb") as pdf_file:
+            pdf_buffer = BytesIO(pdf_file.read())
+
+        # Read PDF data from the BytesIO buffer
+        pdf_data = pdf_buffer.getvalue()
+        pdf_buffer.close()
+
+        # Encode PDF data as base64
+        pdf_base64 = base64.b64encode(pdf_data).decode("ascii")
+
+        payload = {
+            "file": {"data": pdf_base64, "content_type": PDF_CONTENT_TYPE},
+            "entity_detection": {"accuracy": "high", "return_entity": True},
+            "pdf_options": {"density": 150, "max_resolution": 2000},
+            "audio_options": {"bleep_start_padding": 0, "bleep_end_padding": 0},
+        }
+
+        headers = {
+            "Content-Type": HEADER_CONTENT_TYPE,
+            "X-API-KEY": config.PRIVATE_AI_API_KEY,
+        }
+
+        response = requests.post(BASE64_URL, json=payload, headers=headers)
+        response_data = response.json()
+
+        # According to the PrivateAI API documentation,
+        # https://docs.private-ai.com/reference/latest/operation/process_files_base64_v3_process_files_base64_post/
+        # the response is a list of dicts when there is no error/issue in the request
+        # else it is a dict with a key "detail" containing the error message
+
+        if isinstance(response_data, dict) and "details" in response_data:
+            raise ValueError(response_data.get("detail"))
+
+        logger.debug(f"{response_data.get('entities')=}")
+        logger.debug(f"{len(response_data.get('entities'))=}")
+
+        redacted_file_path = path_to_pdf.split(".")[0] + "_redacted.pdf"
+
+        # Create a BytesIO buffer to handle the redacted PDF data
+        redacted_buffer = BytesIO()
+
+        # Decode and write the redacted PDF data to the BytesIO buffer
+        processed_file = response_data.get("processed_file").encode("ascii")
+        processed_file = base64.b64decode(processed_file, validate=True)
+        redacted_buffer.write(processed_file)
+
+        # Write the redacted PDF data to a file
+        with open(redacted_file_path, "wb") as redacted_file:
+            redacted_buffer.seek(0)  # Move the buffer position to the beginning
+            redacted_file.write(redacted_buffer.read())
+
+        return redacted_file_path
Original file line number	Diff line number	Diff line change
Expand Up		@@ -9,3 +9,4 @@ class ScrubProvider: # pylint: disable=too-few-public-methods

		PRESIDIO = "PRESIDIO"
		COMPREHEND = "COMPREHEND"
		PRIVATE_AI = "PRIVATE_AI"