Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integration of Pebblo Reporter #593

Merged
merged 11 commits into from
Feb 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions deploy/docker/Dockerfile.cli
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Use Ubuntu as base image
FROM ubuntu:22.04

# Install system dependencies: the Python 3.11 toolchain, poppler for PDF
# utilities, and the rendering libraries (cairo/pango/gdk-pixbuf, image codecs)
# required by weasyprint-based PDF report generation.
RUN apt-get update && apt-get install -y \
    python3.11 python3.11-dev python3.11-venv python3-pip \
    build-essential poppler-utils \
    libgl1-mesa-glx libglib2.0-0 \
    libsm6 libxrender1 libxext6 \
    libpango-1.0-0 libpangocairo-1.0-0 libcairo2 \
    libjpeg-dev libpng-dev libgdk-pixbuf2.0-dev \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /opt/pebblo

# Copy the application code.
# NOTE(review): COPY paths below are prefixed with `pebblo/`, so the build
# context must be the directory CONTAINING the repository checkout (see the
# `context:` setting in the accompanying docker-compose.yml) — confirm.
COPY pebblo /opt/pebblo

# Create a virtual environment and install build/runtime dependencies into it
RUN python3.11 -m venv /opt/pebblo/venv
RUN /opt/pebblo/venv/bin/pip install --no-cache-dir build setuptools wheel weasyprint

# Put the venv first on PATH so subsequent python3/pip calls use it
ENV PATH="/opt/pebblo/venv/bin:$PATH"

# Build a wheel from the copied source and install it into the venv
RUN python3 -m build --wheel && pip install dist/*.whl

# Copy configuration files
COPY pebblo/deploy/docker/config.yaml /opt/pebblo/config/config.yaml

# Run the installed `pebblo` CLI with the baked-in config by default
ENTRYPOINT ["pebblo"]
CMD ["--config", "/opt/pebblo/config/config.yaml"]

7 changes: 4 additions & 3 deletions deploy/docker/config.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
daemon:
port: 8000
port: 9000
host: 0.0.0.0
logging:
level: info
reports:
format: pdf
renderer: weasyprint
cacheDir: /opt/.pebblo
cacheDir: ~/.pebblo
anonymizeSnippets: False
classifier:
mode: all
use_llm: True
storage:
type: file
type: file
56 changes: 56 additions & 0 deletions deploy/docker/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
version: "3.8"

services:
pebblo:
build:
      context: ../../.. # Build context is the parent of the repository checkout (the directory containing `pebblo/`), so the Dockerfile's `COPY pebblo/...` paths resolve
dockerfile: pebblo/deploy/docker/Dockerfile.cli
ports:
- "9000:9000"
depends_on:
- llm_server
environment:
- AWS_ACCESS_KEY_ID=
- AWS_SECRET_ACCESS_KEY=
- AWS_DEFAULT_REGION=us-east-1
- MODEL_NAME=daxa-ai/pebblo_classifier_v3
- HOSTED_VLLM_API_KEY=vllm-placeholder-key
- BACKEND=vllm
- API_BASE_URL=http://llm_server:8000/v1
logging:
driver: json-file
options:
max-size: "50m"
max-file: "6"
volumes:
- /home/ubuntu/pebblo_report:/opt/.pebblo

llm_server:
image: vllm/vllm-openai:v0.6.6
runtime: nvidia
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
environment:
- HUGGING_FACE_HUB_TOKEN=
ports:
- "8000:8000"
extra_hosts:
- "host.docker.internal:host-gateway"
volumes:
- model_cache_huggingface:/root/.cache/huggingface/
ipc: host
command: --model daxa-ai/pebblo_classifier_v3 --max-model-len=3000 --gpu_memory_utilization=0.95
logging:
driver: json-file
options:
max-size: "50m"
max-file: "6"

volumes:
model_cache_huggingface:
indexing_huggingface_model_cache:
12 changes: 12 additions & 0 deletions pebblo/app/api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ def discover(
response = discover_obj.process_request(data.model_dump())
return response

@staticmethod
def discover_direct(data: ReqDiscover):
    """Process a discover request directly, without FastAPI dependency injection."""
    handler = get_handler(handler_name="discover")
    return handler.process_request(data.model_dump())

@staticmethod
def loader_doc(
data: ReqLoaderDoc,
Expand All @@ -36,6 +42,12 @@ def loader_doc(
response = loader_doc_obj.process_request(data.model_dump())
return response

@staticmethod
def loader_doc_direct(data: ReqLoaderDoc):
    """Process a loader-doc request directly, without FastAPI dependency injection."""
    handler = get_handler(handler_name="loader")
    return handler.process_request(data.model_dump())

@staticmethod
def prompt(
data: ReqPrompt, prompt_obj=Depends(lambda: get_handler(handler_name="prompt"))
Expand Down
1 change: 1 addition & 0 deletions pebblo/app/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ reports:
anonymizeSnippets: False
classifier:
mode: all
use_llm: True
storage:
type: file
1 change: 1 addition & 0 deletions pebblo/app/config/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def validate_anonymize_snippets(cls, anonymize_snippets: bool) -> bool:
# Classifier BaseModel
class ClassifierConfig(BaseSettings):
mode: str = Field(default=ClassificationMode.ALL.value)
use_llm: bool = Field(default=False)
anonymizeSnippets: Optional[bool] = None

@field_validator("mode")
Expand Down
2 changes: 2 additions & 0 deletions pebblo/app/config/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

with redirect_stdout(StringIO()), redirect_stderr(StringIO()):
from pebblo.app.routers.routers import api_v1_router_instance, router_instance
from pebblo.tools.routers import router as tools_router_instance
from pebblo.log import get_logger, get_uvicorn_logconfig

logger = get_logger(__name__)
Expand Down Expand Up @@ -45,6 +46,7 @@ def __init__(self, config_details):
self.app.include_router(api_v1_router_instance.router)
self.app.include_router(local_ui_router_instance.router)
self.app.include_router(redirect_router_instance.router)
self.app.include_router(tools_router_instance)
# Adding cors
self.app.add_middleware(
CORSMiddleware,
Expand Down
2 changes: 2 additions & 0 deletions pebblo/app/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
import warnings

import nltk
from tqdm import tqdm

from pebblo.app.config.config import (
Expand All @@ -15,6 +16,7 @@
)
from pebblo.app.utils.version import get_pebblo_version

nltk.download("punkt_tab")
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

Expand Down
32 changes: 29 additions & 3 deletions pebblo/app/pebblo-ui/src/constants/keywordMapping.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,26 @@ export const KEYWORD_MAPPING = {
"us-ssn": "US SSN",
"us-passport-number": "US Passport number",
"us-drivers-license": "US Drivers License",
"credit-card-number": "Credit card number",

"email-address": "Email Address",
"person-name": "Person",
"organization": "Organization",
"street-address": "Street Address",
"phone-number": "Phone Number",
"date-of-birth": "Date Of Birth",

"ip-address": "Ip Address",

"credit-card-number": "Credit card number",
"us-bank-account-number": "US Bank Account Number",
"iban-code": "IBAN code",
"us-itin": "US ITIN",
"bank-routing-number": "Bank Routing Number",
"swift-code": "Swift Code",
"bban-code": "Bban Code",

"github-token": "Github Token",
"github-finegrained-token": "Github Finegrained Token",
"github-finergrained-token": "Github Finegrained Token",
"private-key": "Private Key",
"dsa-private-key": "DSA Private Key",
"encrypted-private-key": "Encrypted Private Key",
Expand All @@ -43,8 +55,22 @@ export const KEYWORD_MAPPING = {
"azure-key-id": "Azure Key ID",
"azure-client-secret": "Azure Client Secret",
"google-api-key": "Google API Key",
"api-key": "Api Key",
"harmful": "Harmful",
"medical": "Medical" ,
"financial": "Financial",
"corporate-documents": "Corporate Documents"
"corporate-documents": "Corporate Documents",

"GOVERNANCE": "Governance",
"FINANCE" : "Finance",
"HR": "Hr",
"HEALTH": "Health",
"LEGAL" : "Legal",
"CUSTOMER" : "Customer",
"IP": "IP",
"PRODUCT" : "Product",
"MARKETING": "Marketing",
"SALES": "Sales",
"SECURITY": "Security",
"STRATEGY": "Strategy"
};
2 changes: 1 addition & 1 deletion pebblo/app/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def wrapper(*args, **kwargs):
response = func(*args, **kwargs)
end_time = time.perf_counter()
logger.debug(
f"Execution time of function <{func.__name__}> is {end_time-start_time} seconds."
f"Execution time of function <{func.__name__}> is {end_time - start_time} seconds."
)
return response
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ class ExtendedCreditCardRecognizer(CreditCardRecognizer):
CONTEXT = [
"credit",
"credit_card",
"card" "debit",
"card",
"debit",
"Visa",
"Mastercard",
"Amex",
Expand Down
109 changes: 109 additions & 0 deletions pebblo/entity_classifier/custom_analyzer/llm_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import json
from typing import List, Optional

from presidio_analyzer import (
EntityRecognizer,
Pattern,
RecognizerResult,
)
from presidio_analyzer.nlp_engine import NlpArtifacts

from pebblo.entity_classifier.utils.prompt_lib import get_entity_detection_prompt
from pebblo.entity_classifier.utils.result_validation import extract_entity_info
from pebblo.log import get_logger
from pebblo.text_generation.text_generation import TextGeneration

logger = get_logger(__name__)


# Maps the snake_case entity labels emitted by the LLM detection prompt to the
# Presidio-style entity type names used by the rest of the classifier.
# "LLM_"-prefixed values distinguish LLM-detected person/organization entities
# from those produced by other recognizers.
ENTITY_LABELS_UNIQUE = {
    "iban": "IBAN_CODE",
    "ssn": "US_SSN",
    "passport_number": "US_PASSPORT",
    "driver_license_number": "US_DRIVER_LICENSE",
    "credit_card_number": "CREDIT_CARD",
    "name": "LLM_PERSON",
    "company": "LLM_ORGANIZATION",
    "street_address": "STREET_ADDRESS",
    "email": "EMAIL_ADDRESS",
    "phone_number": "PHONE_NUMBER",
    "date_of_birth": "DATE_OF_BIRTH",
    "bank_routing_number": "ROUTING_NUMBER",
    "bank_account_number": "US_BANK_NUMBER",
    "swift_bic_code": "SWIFT_CODE",
    "api_key": "API_KEY",
    "private_keys": "PRIVATE_KEY",
    "itin": "US_ITIN",
    "ip_address": "IP_ADDRESS",
    "bban": "BBAN_CODE",
}


class LLMRecognizer(EntityRecognizer):
    """
    Custom Presidio recognizer that uses an external LLM model for entity
    detection.

    ``analyze`` sends the text to the configured text-generation backend
    (``TextGeneration``) and converts the JSON entity list it returns into
    Presidio ``RecognizerResult`` objects, mapping LLM labels to entity types
    via ``ENTITY_LABELS_UNIQUE``.
    """

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: Optional[List[str]] = None,
    ):
        """
        :param patterns: Unused; accepted for signature parity with
            pattern-based recognizers.
        :param context: Optional context words forwarded to the base class.
        :param supported_language: Language code this recognizer supports.
        :param supported_entity: Entity types this recognizer reports.
            Defaults to every value in ``ENTITY_LABELS_UNIQUE``.
        """
        # Compute the default at call time: a list default in the signature
        # would be a shared mutable object (and the original `str` annotation
        # was wrong — this is a list of entity type names).
        if supported_entity is None:
            supported_entity = list(ENTITY_LABELS_UNIQUE.values())
        super().__init__(
            supported_entities=supported_entity,
            context=context,
            supported_language=supported_language,
        )
        self.text_gen_obj = TextGeneration()

    def analyze(
        self,
        text,
        entities: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
        regex_flags: Optional[int] = None,
    ):
        """
        Detect entities in ``text`` using the LLM model.

        :param text: The text to analyze.
        :param entities: Entity-type filter (unused — the LLM detects all
            types it knows about; kept for interface compatibility).
        :param nlp_artifacts: NLP pipeline artifacts (unused by this recognizer).
        :param regex_flags: Unused; present for interface compatibility.
        :return: List of ``RecognizerResult`` objects.
        """
        results = []
        for entity in self.llm_analyzer(text):
            label = entity.get("label")
            if not label:
                continue
            # May be None for labels outside ENTITY_LABELS_UNIQUE; preserved
            # as-is, matching the prior behavior.
            entity_type = ENTITY_LABELS_UNIQUE.get(label)
            start = entity.get("start")
            end = entity.get("end")
            # Fall back to 0.8 when the LLM does not report a confidence.
            score = entity.get("confidence", 0.8)
            # Skip spans of 2 characters or fewer — too short to be a
            # meaningful entity match.
            if abs(end - start) > 2:
                results.append(RecognizerResult(entity_type, start, end, score))
        return results

    def llm_analyzer(self, text):
        """
        Run the LLM entity-detection prompt on ``text``.

        :param text: Raw text to classify.
        :return: List of entity dicts (as refined by ``extract_entity_info``);
            empty list when the LLM response is not valid JSON.
        """
        messages = get_entity_detection_prompt(text)
        raw_response = self.text_gen_obj.generate(messages)
        try:
            detected = json.loads(raw_response)
        except (json.JSONDecodeError, TypeError) as err:
            # LLM output is untrusted free text; a malformed response should
            # degrade to "no entities found", not crash the analyzer pass.
            logger.warning(f"LLM entity response was not valid JSON: {err}")
            return []
        # Normalize a single-object response to a one-element list.
        if isinstance(detected, dict):
            detected = [detected]
        return extract_entity_info(detected, text)
Loading