Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(openadapt.privacy.providers): add private_ai as a scrubbing provider #486

Merged
merged 57 commits into from
Nov 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
648b39c
ran black
Aug 29, 2023
7d05aff
add blank file for priavte ai
Aug 29, 2023
c99fcff
add class `PrivateAIScrubbingProvider`
Aug 29, 2023
c7fc0b2
add pvt_ai api key in config
Aug 29, 2023
dccc0f5
ran `pre-commit run --all`
Aug 29, 2023
686f767
add scrub_text function
Aug 29, 2023
3734880
Merge branch 'OpenAdaptAI:main' into feat/private_ai
KrishPatel13 Aug 29, 2023
a677eff
progress for image redaction
Aug 29, 2023
3a3a533
Merge branch 'main' of https://github.com/OpenAdaptAI/OpenAdapt into …
Aug 29, 2023
7d5f03e
Merge branch 'test-2' into feat/private_ai
Aug 29, 2023
5fb354d
format
Aug 29, 2023
c7442a0
complete scrub_image
Aug 30, 2023
e49c6c0
start scrub_pdf
Aug 30, 2023
7ef8a63
Merge branch 'main' into feat/private_ai
KrishPatel13 Aug 30, 2023
1718e64
add pdf redaction code
Aug 30, 2023
c36c098
add more wrapper for invalid reponse from priavate ai
Aug 30, 2023
e9b85e9
try to fix pytest
Aug 30, 2023
295cb78
try to fix pytest
Aug 30, 2023
ccc8637
remove lst
Aug 30, 2023
ea3150e
remove uncessary methods
Aug 30, 2023
fd75b30
text scrubbing test all passses
Aug 30, 2023
68a4236
pdf_redaction test works
Aug 30, 2023
49e61b3
add test_image_redaction test in pyetst
Aug 30, 2023
4c9cb5b
Merge branch 'main' into feat/private_ai
KrishPatel13 Aug 30, 2023
630e51e
add easy ocr to poetry
Aug 30, 2023
b37cf02
pytest is fixed
Aug 30, 2023
5944d04
remove versino files
Aug 30, 2023
98096c8
remove un-neccasry files
Aug 30, 2023
e83f285
add code to remove uncessary files after pytest
Aug 30, 2023
b54bc66
addressed https://github.com/OpenAdaptAI/OpenAdapt/pull/486#discussio…
Aug 31, 2023
e743b6c
address comment https://github.com/OpenAdaptAI/OpenAdapt/pull/486#dis…
Aug 31, 2023
64a984b
reduce line chars
Aug 31, 2023
74399ce
addressed comment: https://github.com/OpenAdaptAI/OpenAdapt/pull/486#…
Aug 31, 2023
361a91b
fix flake8
Aug 31, 2023
b6508a4
use f strings
Aug 31, 2023
a37ee99
address comment: https://github.com/OpenAdaptAI/OpenAdapt/pull/486#di…
Aug 31, 2023
f486cf6
address comment: https://github.com/OpenAdaptAI/OpenAdapt/pull/486#di…
Aug 31, 2023
5bed44b
change to value error
Aug 31, 2023
2cb1e03
remove .keys()
Aug 31, 2023
6bc38d5
add constants
Aug 31, 2023
dd77701
fix flake8 erros
Aug 31, 2023
e1f4c19
use BytesIO
Aug 31, 2023
7a68b77
address comment https://github.com/OpenAdaptAI/OpenAdapt/pull/486#dis…
Sep 1, 2023
48f8118
rna black
Sep 1, 2023
9293793
final commit
Sep 1, 2023
1b3794e
remove unused code
Sep 1, 2023
5670381
refactor typo
Sep 1, 2023
aae24fb
rename `redact_file_path` to `redacted_file_path`
Sep 1, 2023
2f92260
use BytesIO wherever possible
Sep 1, 2023
a9457c0
fix flake8
Sep 1, 2023
20873b2
add documentation links
Sep 1, 2023
7887b0e
Merge branch 'main' into feat/private_ai
abrichr Oct 23, 2023
cef00fd
Apply suggestions from code review
abrichr Nov 10, 2023
63c82dd
Update tests/openadapt/privacy/providers/test_private_ai_scrub.py
abrichr Nov 10, 2023
b98ebcd
fix poetry.lock
abrichr Nov 11, 2023
6734edf
Merge branch 'main' into feat/private_ai
abrichr Nov 11, 2023
73a3484
poetry.lock
abrichr Nov 11, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed assets/sample_llc_1.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions openadapt/privacy/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ class ScrubProvider: # pylint: disable=too-few-public-methods

PRESIDIO = "PRESIDIO"
COMPREHEND = "COMPREHEND"
PRIVATE_AI = "PRIVATE_AI"
198 changes: 198 additions & 0 deletions openadapt/privacy/providers/private_ai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
"""A Module for Private AI Scrubbing Provider."""

from io import BytesIO
from typing import List
import base64
import os

from loguru import logger
from PIL import Image
import requests

from openadapt import config
from openadapt.privacy.base import Modality, ScrubbingProvider, TextScrubbingMixin
from openadapt.privacy.providers import ScrubProvider

# Private AI de-identification endpoints (v3 API): one for plain text, one for
# base64-encoded files (used below for both images and PDFs).
TEXT_URL = "https://api.private-ai.com/deid/v3/process/text"
BASE64_URL = "https://api.private-ai.com/deid/v3/process/files/base64"

# MIME types: the first is sent as the HTTP ``Content-Type`` request header, the
# other two describe the file embedded in a base64 request payload.
HEADER_CONTENT_TYPE = "application/json"
IMAGE_CONTENT_TYPE = "image/png"
PDF_CONTENT_TYPE = "application/pdf"

# NOTE(review): these two are not referenced by the provider code in this
# module; presumably kept for tests or other callers -- confirm before removing.
FILES_DIR = "assets/"
TEMP_IMAGEFILE_NAME = "temp_image_to_scrub.png"


class PrivateAIScrubbingProvider(
    ScrubProvider, ScrubbingProvider, TextScrubbingMixin
):  # pylint: disable=abstract-method
    """Scrubbing provider backed by the Private AI de-identification API.

    Text is sent to ``TEXT_URL``; images and PDFs are base64-encoded and sent
    to ``BASE64_URL``. Every request is authenticated with
    ``config.PRIVATE_AI_API_KEY``.
    """

    name: str = ScrubProvider.PRIVATE_AI
    capabilities: List[Modality] = [Modality.TEXT, Modality.PIL_IMAGE, Modality.PDF]

    @staticmethod
    def _request_headers() -> dict:
        """Build the HTTP headers shared by every Private AI request."""
        return {
            "Content-Type": HEADER_CONTENT_TYPE,
            "X-API-KEY": config.PRIVATE_AI_API_KEY,
        }

    def _process_file(self, file_bytes: bytes, content_type: str) -> dict:
        """Send a file to the base64 endpoint and return the parsed response.

        Args:
            file_bytes (bytes): Raw contents of the file to redact.
            content_type (str): MIME type of the file, e.g. ``image/png``.

        Returns:
            dict: The decoded JSON response body.

        Raises:
            ValueError: If the API reports an error through a ``detail`` key,
                or if the response body is not valid JSON.
            requests.HTTPError: If the request failed with an HTTP error status
                and the body carried no ``detail`` message.
        """
        payload = {
            "file": {
                "data": base64.b64encode(file_bytes).decode("ascii"),
                "content_type": content_type,
            },
            "entity_detection": {"accuracy": "high", "return_entity": True},
            "pdf_options": {"density": 150, "max_resolution": 2000},
            "audio_options": {"bleep_start_padding": 0, "bleep_end_padding": 0},
        }

        response = requests.post(
            BASE64_URL, json=payload, headers=self._request_headers()
        )

        try:
            response_data = response.json()
        except ValueError:
            # A non-JSON body usually accompanies an HTTP failure; surface the
            # status error when there is one, otherwise the decoding error.
            response.raise_for_status()
            raise

        # According to the PrivateAI API documentation,
        # https://docs.private-ai.com/reference/latest/operation/process_files_base64_v3_process_files_base64_post/
        # a failed request returns a dict with a key "detail" containing the
        # error message.
        if isinstance(response_data, dict) and "detail" in response_data:
            raise ValueError(response_data.get("detail"))

        # Catch HTTP failures that did not carry a "detail" message.
        response.raise_for_status()
        return response_data

    @staticmethod
    def _decode_processed_file(response_data: dict) -> bytes:
        """Extract and base64-decode the redacted file from a response.

        Args:
            response_data (dict): Parsed response from the base64 endpoint.

        Returns:
            bytes: Raw bytes of the redacted file.

        Raises:
            ValueError: If the response has no ``processed_file`` entry or the
                entry is not valid base64.
        """
        processed_file = response_data.get("processed_file")
        if processed_file is None:
            raise ValueError("Private AI response does not contain 'processed_file'")
        return base64.b64decode(processed_file.encode("ascii"), validate=True)

    def scrub_text(self, text: str, is_separated: bool = False) -> str:
        """Scrub the text of all PII/PHI.

        Args:
            text (str): Text to be redacted
            is_separated (bool): Whether the text is separated with special
                characters. Accepted for interface compatibility; it is not
                used by this implementation.

        Returns:
            str: redacted text

        Raises:
            requests.HTTPError: If the request fails with an HTTP error status.
            ValueError: If the API reports an error through a ``detail`` key.
        """
        payload = {
            "text": [text],
            "link_batch": False,
            "entity_detection": {
                "accuracy": "high",
                "return_entity": True,
            },
            "processed_text": {
                "type": "MARKER",
                "pattern": "[UNIQUE_NUMBERED_ENTITY_TYPE]",
            },
        }

        response = requests.post(
            TEXT_URL, json=payload, headers=self._request_headers()
        )
        response.raise_for_status()
        data = response.json()
        logger.debug(f"{data=}")

        # According to the PrivateAI API documentation,
        # https://docs.private-ai.com/reference/latest/operation/process_text_v3_process_text_post/
        # the response is a list of dicts when there is no error/issue in the request
        # else it is a dict with a key "detail" containing the error message
        if isinstance(data, dict) and "detail" in data:
            raise ValueError(data.get("detail"))

        # One input text was sent, so the first list entry holds its result.
        redacted_text = data[0].get("processed_text")
        logger.debug(f"{redacted_text=}")

        return redacted_text

    def scrub_image(
        self,
        image: Image.Image,
        fill_color: int = config.SCRUB_FILL_COLOR,  # pylint: disable=no-member
    ) -> Image.Image:
        """Scrub the image of all PII/PHI.

        Args:
            image (Image.Image): A PIL image to be redacted
            fill_color (int): The color used to fill the redacted regions.
                Accepted for interface compatibility; this implementation
                returns the image exactly as redacted by the API and does not
                use the value.

        Returns:
            Image.Image: The redacted image with PII and PHI removed.

        Raises:
            ValueError: If the API reports an error or returns no redacted file.
            requests.HTTPError: If the request fails with an HTTP error status.
        """
        # Serialize the image to PNG bytes, matching IMAGE_CONTENT_TYPE.
        with BytesIO() as buffer:
            image.save(buffer, format="PNG")
            image_data = buffer.getvalue()

        response = self._process_file(image_data, IMAGE_CONTENT_TYPE)
        logger.debug(f"{response=}")

        redacted_file_data = self._decode_processed_file(response)

        # The buffer is deliberately left open: the returned image is opened
        # from it and is not explicitly loaded here.
        return Image.open(BytesIO(redacted_file_data))

    def scrub_pdf(self, path_to_pdf: str) -> str:
        """Scrub the PDF of all PII/PHI.

        The redacted document is written next to the input file, with
        ``_redacted`` inserted before the extension.

        Args:
            path_to_pdf (str): Path to the PDF to be redacted

        Returns:
            str: Path to the redacted PDF

        Raises:
            ValueError: If the API reports an error or returns no redacted file.
            requests.HTTPError: If the request fails with an HTTP error status.
        """
        with open(path_to_pdf, "rb") as pdf_file:
            pdf_data = pdf_file.read()

        response_data = self._process_file(pdf_data, PDF_CONTENT_TYPE)

        # Tolerate a response without entities instead of failing in len().
        entities = response_data.get("entities") or []
        logger.debug(f"{entities=}")
        logger.debug(f"{len(entities)=}")

        processed_file = self._decode_processed_file(response_data)

        # splitext strips only the final extension, so dots elsewhere in the
        # path (e.g. "./docs/a.pdf" or "dir.v2/a.pdf") are preserved.
        redacted_file_path = os.path.splitext(path_to_pdf)[0] + "_redacted.pdf"

        with open(redacted_file_path, "wb") as redacted_file:
            redacted_file.write(processed_file)

        return redacted_file_path
Loading