Skip to content

Commit

Permalink
Integration of Pebblo Reporter (#593)
Browse files Browse the repository at this point in the history
The changes include

- Integration of Pebblo Classifier v3 and its prompt template for entity and topic classifier
- An option in config.yaml to switch the LLM-based classifier on or off
- The Pebblo Reporter will generate a PDF/JSON report of the given files.
- Added new classes for topic classifier (llm-based) and entity classifier
  • Loading branch information
gr8nishan authored Feb 24, 2025
1 parent e67b011 commit 09f2097
Show file tree
Hide file tree
Showing 36 changed files with 2,098 additions and 103 deletions.
36 changes: 36 additions & 0 deletions deploy/docker/Dockerfile.cli
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Use Ubuntu 22.04 LTS as the base image (python3.11 is available from its repos)
FROM ubuntu:22.04

# Install system dependencies in a single layer and drop the apt cache to keep
# the image small. DEBIAN_FRONTEND=noninteractive prevents packages such as
# tzdata from blocking a non-interactive build with configuration prompts.
RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y \
    python3.11 python3.11-dev python3.11-venv python3-pip \
    build-essential poppler-utils \
    libgl1-mesa-glx libglib2.0-0 \
    libsm6 libxrender1 libxext6 \
    libpango-1.0-0 libpangocairo-1.0-0 libcairo2 \
    libjpeg-dev libpng-dev libgdk-pixbuf2.0-dev \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /opt/pebblo

# Copy the application code (the build context is the repository's parent
# directory -- see the `context:` setting in docker-compose.yml)
COPY pebblo /opt/pebblo

# Create the virtual environment and install the build/runtime tooling in one
# layer, so a failed pip install also invalidates the (now inconsistent) venv.
RUN python3.11 -m venv /opt/pebblo/venv \
    && /opt/pebblo/venv/bin/pip install --no-cache-dir build setuptools wheel weasyprint

# Put the virtual environment first on PATH so the python3/pip calls below use it
ENV PATH="/opt/pebblo/venv/bin:$PATH"

# Build a wheel from the copied source and install it into the venv
RUN python3 -m build --wheel && pip install --no-cache-dir dist/*.whl

# Copy configuration files
COPY pebblo/deploy/docker/config.yaml /opt/pebblo/config/config.yaml

# Run the pebblo CLI with the baked-in config by default; the CMD arguments
# can be overridden at `docker run` time to point at a different config.
ENTRYPOINT ["pebblo"]
CMD ["--config", "/opt/pebblo/config/config.yaml"]

7 changes: 4 additions & 3 deletions deploy/docker/config.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
daemon:
port: 8000
port: 9000
host: 0.0.0.0
logging:
level: info
reports:
format: pdf
renderer: weasyprint
cacheDir: /opt/.pebblo
cacheDir: ~/.pebblo
anonymizeSnippets: False
classifier:
mode: all
use_llm: True
storage:
type: file
type: file
56 changes: 56 additions & 0 deletions deploy/docker/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Compose stack: the Pebblo app plus a GPU-backed vLLM server hosting the classifier model.
# NOTE(review): the top-level `version` key is obsolete and ignored by Compose v2 --
# kept here only for compatibility with older CLIs.
version: "3.8"

services:
  pebblo:
    build:
      # Build context is two levels above this file so the Dockerfile can COPY
      # the whole `pebblo/` source tree; the dockerfile path is relative to that context.
      context: ../../..
      dockerfile: pebblo/deploy/docker/Dockerfile.cli
    ports:
      - "9000:9000"
    depends_on:
      - llm_server
    environment:
      # AWS credentials are intentionally blank here; inject real values at deploy time.
      - AWS_ACCESS_KEY_ID=
      - AWS_SECRET_ACCESS_KEY=
      - AWS_DEFAULT_REGION=us-east-1
      - MODEL_NAME=daxa-ai/pebblo_classifier_v3
      # Placeholder key: the local vLLM server does not validate API keys.
      - HOSTED_VLLM_API_KEY=vllm-placeholder-key
      - BACKEND=vllm
      # The sibling service is reachable by its Compose service name.
      - API_BASE_URL=http://llm_server:8000/v1
    logging:
      driver: json-file
      options:
        max-size: "50m"
        max-file: "6"
    volumes:
      # NOTE(review): config.yaml in this directory sets cacheDir to ~/.pebblo,
      # not /opt/.pebblo -- confirm this mount matches the directory the app
      # actually writes reports to.
      - /home/ubuntu/pebblo_report:/opt/.pebblo

  llm_server:
    image: vllm/vllm-openai:v0.6.6
    # Requires the NVIDIA container runtime on the host.
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # Blank by default; only needed if the model repo is gated on Hugging Face.
      - HUGGING_FACE_HUB_TOKEN=
    ports:
      - "8000:8000"
    extra_hosts:
      - "host.docker.internal:host-gateway"
    volumes:
      # Persist downloaded model weights across container restarts.
      - model_cache_huggingface:/root/.cache/huggingface/
    # vLLM needs shared memory beyond the default /dev/shm size.
    ipc: host
    command: --model daxa-ai/pebblo_classifier_v3 --max-model-len=3000 --gpu_memory_utilization=0.95
    logging:
      driver: json-file
      options:
        max-size: "50m"
        max-file: "6"

volumes:
  model_cache_huggingface:
  # NOTE(review): declared but not mounted by any service above -- confirm it is still needed.
  indexing_huggingface_model_cache:
12 changes: 12 additions & 0 deletions pebblo/app/api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ def discover(
response = discover_obj.process_request(data.model_dump())
return response

@staticmethod
def discover_direct(data: ReqDiscover):
    """Process a discover request by resolving the handler directly (no Depends)."""
    handler = get_handler(handler_name="discover")
    return handler.process_request(data.model_dump())

@staticmethod
def loader_doc(
data: ReqLoaderDoc,
Expand All @@ -36,6 +42,12 @@ def loader_doc(
response = loader_doc_obj.process_request(data.model_dump())
return response

@staticmethod
def loader_doc_direct(data: ReqLoaderDoc):
    """Process a loader-doc request by resolving the handler directly (no Depends)."""
    handler = get_handler(handler_name="loader")
    return handler.process_request(data.model_dump())

@staticmethod
def prompt(
data: ReqPrompt, prompt_obj=Depends(lambda: get_handler(handler_name="prompt"))
Expand Down
1 change: 1 addition & 0 deletions pebblo/app/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ reports:
anonymizeSnippets: False
classifier:
mode: all
use_llm: True
storage:
type: file
1 change: 1 addition & 0 deletions pebblo/app/config/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def validate_anonymize_snippets(cls, anonymize_snippets: bool) -> bool:
# Classifier BaseModel
class ClassifierConfig(BaseSettings):
mode: str = Field(default=ClassificationMode.ALL.value)
use_llm: bool = Field(default=False)
anonymizeSnippets: Optional[bool] = None

@field_validator("mode")
Expand Down
2 changes: 2 additions & 0 deletions pebblo/app/config/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

with redirect_stdout(StringIO()), redirect_stderr(StringIO()):
from pebblo.app.routers.routers import api_v1_router_instance, router_instance
from pebblo.tools.routers import router as tools_router_instance
from pebblo.log import get_logger, get_uvicorn_logconfig

logger = get_logger(__name__)
Expand Down Expand Up @@ -45,6 +46,7 @@ def __init__(self, config_details):
self.app.include_router(api_v1_router_instance.router)
self.app.include_router(local_ui_router_instance.router)
self.app.include_router(redirect_router_instance.router)
self.app.include_router(tools_router_instance)
# Adding cors
self.app.add_middleware(
CORSMiddleware,
Expand Down
2 changes: 2 additions & 0 deletions pebblo/app/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
import warnings

import nltk
from tqdm import tqdm

from pebblo.app.config.config import (
Expand All @@ -15,6 +16,7 @@
)
from pebblo.app.utils.version import get_pebblo_version

nltk.download("punkt_tab")
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

Expand Down
32 changes: 29 additions & 3 deletions pebblo/app/pebblo-ui/src/constants/keywordMapping.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,26 @@ export const KEYWORD_MAPPING = {
"us-ssn": "US SSN",
"us-passport-number": "US Passport number",
"us-drivers-license": "US Drivers License",
"credit-card-number": "Credit card number",

"email-address": "Email Address",
"person-name": "Person",
"organization": "Organization",
"street-address": "Street Address",
"phone-number": "Phone Number",
"date-of-birth": "Date Of Birth",

"ip-address": "Ip Address",

"credit-card-number": "Credit card number",
"us-bank-account-number": "US Bank Account Number",
"iban-code": "IBAN code",
"us-itin": "US ITIN",
"bank-routing-number": "Bank Routing Number",
"swift-code": "Swift Code",
"bban-code": "Bban Code",

"github-token": "Github Token",
"github-finegrained-token": "Github Finegrained Token",
"github-finergrained-token": "Github Finegrained Token",
"private-key": "Private Key",
"dsa-private-key": "DSA Private Key",
"encrypted-private-key": "Encrypted Private Key",
Expand All @@ -43,8 +55,22 @@ export const KEYWORD_MAPPING = {
"azure-key-id": "Azure Key ID",
"azure-client-secret": "Azure Client Secret",
"google-api-key": "Google API Key",
"api-key": "Api Key",
"harmful": "Harmful",
"medical": "Medical" ,
"financial": "Financial",
"corporate-documents": "Corporate Documents"
"corporate-documents": "Corporate Documents",

"GOVERNANCE": "Governance",
"FINANCE" : "Finance",
"HR": "Hr",
"HEALTH": "Health",
"LEGAL" : "Legal",
"CUSTOMER" : "Customer",
"IP": "IP",
"PRODUCT" : "Product",
"MARKETING": "Marketing",
"SALES": "Sales",
"SECURITY": "Security",
"STRATEGY": "Strategy"
};
2 changes: 1 addition & 1 deletion pebblo/app/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def wrapper(*args, **kwargs):
response = func(*args, **kwargs)
end_time = time.perf_counter()
logger.debug(
f"Execution time of function <{func.__name__}> is {end_time-start_time} seconds."
f"Execution time of function <{func.__name__}> is {end_time - start_time} seconds."
)
return response
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ class ExtendedCreditCardRecognizer(CreditCardRecognizer):
CONTEXT = [
"credit",
"credit_card",
"card" "debit",
"card",
"debit",
"Visa",
"Mastercard",
"Amex",
Expand Down
109 changes: 109 additions & 0 deletions pebblo/entity_classifier/custom_analyzer/llm_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import json
from typing import List, Optional

from presidio_analyzer import (
EntityRecognizer,
Pattern,
RecognizerResult,
)
from presidio_analyzer.nlp_engine import NlpArtifacts

from pebblo.entity_classifier.utils.prompt_lib import get_entity_detection_prompt
from pebblo.entity_classifier.utils.result_validation import extract_entity_info
from pebblo.log import get_logger
from pebblo.text_generation.text_generation import TextGeneration

logger = get_logger(__name__)


# Maps the entity labels emitted by the LLM classifier (keys, as produced by the
# detection prompt) to the canonical entity-type identifiers used downstream
# (values, Presidio-style names). Labels missing from this map are unknown to
# the recognizer.
ENTITY_LABELS_UNIQUE = {
    "iban": "IBAN_CODE",
    "ssn": "US_SSN",
    "passport_number": "US_PASSPORT",
    "driver_license_number": "US_DRIVER_LICENSE",
    "credit_card_number": "CREDIT_CARD",
    "name": "LLM_PERSON",
    "company": "LLM_ORGANIZATION",
    "street_address": "STREET_ADDRESS",
    "email": "EMAIL_ADDRESS",
    "phone_number": "PHONE_NUMBER",
    "date_of_birth": "DATE_OF_BIRTH",
    "bank_routing_number": "ROUTING_NUMBER",
    "bank_account_number": "US_BANK_NUMBER",
    "swift_bic_code": "SWIFT_CODE",
    "api_key": "API_KEY",
    "private_keys": "PRIVATE_KEY",
    "itin": "US_ITIN",
    "ip_address": "IP_ADDRESS",
    "bban": "BBAN_CODE",
}


class LLMRecognizer(EntityRecognizer):
    """
    Custom Presidio recognizer that delegates entity detection to an external
    LLM model (via TextGeneration) instead of regex patterns.
    """

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: Optional[List[str]] = None,
    ):
        """
        :param patterns: Unused; accepted for signature compatibility with
            pattern-based recognizers.
        :param context: Optional context words forwarded to the base recognizer.
        :param supported_language: Language code this recognizer supports.
        :param supported_entity: Entity types to report. Defaults to every type
            in ENTITY_LABELS_UNIQUE; a fresh list is built per call to avoid a
            shared mutable default (the original annotated this as ``str``
            while assigning a list).
        """
        if supported_entity is None:
            supported_entity = list(ENTITY_LABELS_UNIQUE.values())
        super().__init__(
            supported_entities=supported_entity,
            context=context,
            supported_language=supported_language,
        )
        self.text_gen_obj = TextGeneration()

    def analyze(
        self,
        text,
        entities: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
        regex_flags: Optional[int] = None,
    ):
        """
        Detect entities in *text* using the LLM model.

        :param text: The text to analyze.
        :param entities: The list of entity types to detect (currently unused:
            the LLM reports whatever it finds and results are filtered through
            ENTITY_LABELS_UNIQUE).
        :param nlp_artifacts: Optional NLP pipeline artifacts (unused here).
        :param regex_flags: Unused; accepted for interface compatibility.
        :return: List of RecognizerResult objects.
        """
        detected_entities = self.llm_analyzer(text)
        results = []
        for entity in detected_entities:
            entity_det = entity.get("label")
            if not entity_det:
                continue
            entity_type = ENTITY_LABELS_UNIQUE.get(entity_det)
            if entity_type is None:
                # Label the LLM invented but we do not recognize -- without this
                # guard a RecognizerResult with entity_type=None was produced.
                logger.debug(f"Skipping unmapped entity label: {entity_det}")
                continue
            start = entity.get("start")
            end = entity.get("end")
            # Default confidence when the model does not report one is 0.8
            # (the original comment incorrectly said 1.0).
            score = entity.get("confidence", 0.8)
            # Skip very short spans (<= 2 characters) -- presumably noise from
            # the model; TODO confirm this threshold against evaluation data.
            if abs(end - start) > 2:
                results.append(RecognizerResult(entity_type, start, end, score))
        return results

    def llm_analyzer(self, text):
        """
        Run the entity-detection prompt over *text* and normalize the response.

        :param text: The text to analyze.
        :return: List of entity dicts (label/start/end/... as produced by
            extract_entity_info); empty list when the model response is not
            valid JSON, so one bad response cannot abort the whole analysis.
        """
        messages = get_entity_detection_prompt(text)
        raw = self.text_gen_obj.generate(messages)
        try:
            entities = json.loads(raw)
        except (json.JSONDecodeError, TypeError) as err:
            logger.warning(f"LLM entity response was not valid JSON: {err}")
            return []
        # A single entity object may come back as a bare dict; normalize to a list.
        if isinstance(entities, dict):
            entities = [entities]
        # Resolve character offsets / metadata for the detected entities.
        return extract_entity_info(entities, text)
Loading

0 comments on commit 09f2097

Please sign in to comment.