Skip to content

Commit

Permalink
Integration of Pebblo Reporter (#593)
Browse files Browse the repository at this point in the history
The changes include

- Integration of Pebblo Classifier v3 and its prompt template for entity and topic classifier
- An option in config.yaml to switch the LLM-based classifier on or off
- The Pebblo Reporter will generate a PDF/JSON report of the given files.
- Added new classes for topic classifier (llm-based) and entity classifier
  • Loading branch information
gr8nishan authored Feb 24, 2025
1 parent e67b011 commit 09f2097
Show file tree
Hide file tree
Showing 36 changed files with 2,098 additions and 103 deletions.
36 changes: 36 additions & 0 deletions deploy/docker/Dockerfile.cli
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Use Ubuntu 22.04 LTS as the base image (python3.11 is available from its repos)
FROM ubuntu:22.04

# Install system dependencies in a single layer and drop the apt cache to keep
# the image small. DEBIAN_FRONTEND=noninteractive prevents packages such as
# tzdata from blocking a non-interactive build with configuration prompts.
RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y \
    python3.11 python3.11-dev python3.11-venv python3-pip \
    build-essential poppler-utils \
    libgl1-mesa-glx libglib2.0-0 \
    libsm6 libxrender1 libxext6 \
    libpango-1.0-0 libpangocairo-1.0-0 libcairo2 \
    libjpeg-dev libpng-dev libgdk-pixbuf2.0-dev \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /opt/pebblo

# Copy the application code (the build context is the repository's parent
# directory -- see the `context:` setting in docker-compose.yml)
COPY pebblo /opt/pebblo

# Create the virtual environment and install the build/runtime tooling in one
# layer, so a failed pip install also invalidates the (now inconsistent) venv.
RUN python3.11 -m venv /opt/pebblo/venv \
    && /opt/pebblo/venv/bin/pip install --no-cache-dir build setuptools wheel weasyprint

# Put the virtual environment first on PATH so the python3/pip calls below use it
ENV PATH="/opt/pebblo/venv/bin:$PATH"

# Build a wheel from the copied source and install it into the venv
RUN python3 -m build --wheel && pip install --no-cache-dir dist/*.whl

# Copy configuration files
COPY pebblo/deploy/docker/config.yaml /opt/pebblo/config/config.yaml

# Run the pebblo CLI with the baked-in config by default; the CMD arguments
# can be overridden at `docker run` time to point at a different config.
ENTRYPOINT ["pebblo"]
CMD ["--config", "/opt/pebblo/config/config.yaml"]

7 changes: 4 additions & 3 deletions deploy/docker/config.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
daemon:
port: 8000
port: 9000
host: 0.0.0.0
logging:
level: info
reports:
format: pdf
renderer: weasyprint
cacheDir: /opt/.pebblo
cacheDir: ~/.pebblo
anonymizeSnippets: False
classifier:
mode: all
use_llm: True
storage:
type: file
type: file
56 changes: 56 additions & 0 deletions deploy/docker/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Compose stack: the Pebblo app plus a GPU-backed vLLM server hosting the classifier model.
# NOTE(review): the top-level `version` key is obsolete and ignored by Compose v2 --
# kept here only for compatibility with older CLIs.
version: "3.8"

services:
  pebblo:
    build:
      # Build context is two levels above this file so the Dockerfile can COPY
      # the whole `pebblo/` source tree; the dockerfile path is relative to that context.
      context: ../../..
      dockerfile: pebblo/deploy/docker/Dockerfile.cli
    ports:
      - "9000:9000"
    depends_on:
      - llm_server
    environment:
      # AWS credentials are intentionally blank here; inject real values at deploy time.
      - AWS_ACCESS_KEY_ID=
      - AWS_SECRET_ACCESS_KEY=
      - AWS_DEFAULT_REGION=us-east-1
      - MODEL_NAME=daxa-ai/pebblo_classifier_v3
      # Placeholder key: the local vLLM server does not validate API keys.
      - HOSTED_VLLM_API_KEY=vllm-placeholder-key
      - BACKEND=vllm
      # The sibling service is reachable by its Compose service name.
      - API_BASE_URL=http://llm_server:8000/v1
    logging:
      driver: json-file
      options:
        max-size: "50m"
        max-file: "6"
    volumes:
      # NOTE(review): config.yaml in this directory sets cacheDir to ~/.pebblo,
      # not /opt/.pebblo -- confirm this mount matches the directory the app
      # actually writes reports to.
      - /home/ubuntu/pebblo_report:/opt/.pebblo

  llm_server:
    image: vllm/vllm-openai:v0.6.6
    # Requires the NVIDIA container runtime on the host.
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # Blank by default; only needed if the model repo is gated on Hugging Face.
      - HUGGING_FACE_HUB_TOKEN=
    ports:
      - "8000:8000"
    extra_hosts:
      - "host.docker.internal:host-gateway"
    volumes:
      # Persist downloaded model weights across container restarts.
      - model_cache_huggingface:/root/.cache/huggingface/
    # vLLM needs shared memory beyond the default /dev/shm size.
    ipc: host
    command: --model daxa-ai/pebblo_classifier_v3 --max-model-len=3000 --gpu_memory_utilization=0.95
    logging:
      driver: json-file
      options:
        max-size: "50m"
        max-file: "6"

volumes:
  model_cache_huggingface:
  # NOTE(review): declared but not mounted by any service above -- confirm it is still needed.
  indexing_huggingface_model_cache:
12 changes: 12 additions & 0 deletions pebblo/app/api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ def discover(
response = discover_obj.process_request(data.model_dump())
return response

@staticmethod
def discover_direct(data: ReqDiscover):
    """Process a discover request by resolving the handler directly (no Depends)."""
    handler = get_handler(handler_name="discover")
    return handler.process_request(data.model_dump())

@staticmethod
def loader_doc(
data: ReqLoaderDoc,
Expand All @@ -36,6 +42,12 @@ def loader_doc(
response = loader_doc_obj.process_request(data.model_dump())
return response

@staticmethod
def loader_doc_direct(data: ReqLoaderDoc):
    """Process a loader-doc request by resolving the handler directly (no Depends)."""
    handler = get_handler(handler_name="loader")
    return handler.process_request(data.model_dump())

@staticmethod
def prompt(
data: ReqPrompt, prompt_obj=Depends(lambda: get_handler(handler_name="prompt"))
Expand Down
1 change: 1 addition & 0 deletions pebblo/app/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ reports:
anonymizeSnippets: False
classifier:
mode: all
use_llm: True
storage:
type: file
1 change: 1 addition & 0 deletions pebblo/app/config/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def validate_anonymize_snippets(cls, anonymize_snippets: bool) -> bool:
# Classifier BaseModel
class ClassifierConfig(BaseSettings):
mode: str = Field(default=ClassificationMode.ALL.value)
use_llm: bool = Field(default=False)
anonymizeSnippets: Optional[bool] = None

@field_validator("mode")
Expand Down
2 changes: 2 additions & 0 deletions pebblo/app/config/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

with redirect_stdout(StringIO()), redirect_stderr(StringIO()):
from pebblo.app.routers.routers import api_v1_router_instance, router_instance
from pebblo.tools.routers import router as tools_router_instance
from pebblo.log import get_logger, get_uvicorn_logconfig

logger = get_logger(__name__)
Expand Down Expand Up @@ -45,6 +46,7 @@ def __init__(self, config_details):
self.app.include_router(api_v1_router_instance.router)
self.app.include_router(local_ui_router_instance.router)
self.app.include_router(redirect_router_instance.router)
self.app.include_router(tools_router_instance)
# Adding cors
self.app.add_middleware(
CORSMiddleware,
Expand Down
2 changes: 2 additions & 0 deletions pebblo/app/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
import warnings

import nltk
from tqdm import tqdm

from pebblo.app.config.config import (
Expand All @@ -15,6 +16,7 @@
)
from pebblo.app.utils.version import get_pebblo_version

nltk.download("punkt_tab")
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

Expand Down
32 changes: 29 additions & 3 deletions pebblo/app/pebblo-ui/src/constants/keywordMapping.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,26 @@ export const KEYWORD_MAPPING = {
"us-ssn": "US SSN",
"us-passport-number": "US Passport number",
"us-drivers-license": "US Drivers License",
"credit-card-number": "Credit card number",

"email-address": "Email Address",
"person-name": "Person",
"organization": "Organization",
"street-address": "Street Address",
"phone-number": "Phone Number",
"date-of-birth": "Date Of Birth",

"ip-address": "Ip Address",

"credit-card-number": "Credit card number",
"us-bank-account-number": "US Bank Account Number",
"iban-code": "IBAN code",
"us-itin": "US ITIN",
"bank-routing-number": "Bank Routing Number",
"swift-code": "Swift Code",
"bban-code": "Bban Code",

"github-token": "Github Token",
"github-finegrained-token": "Github Finegrained Token",
"github-finergrained-token": "Github Finegrained Token",
"private-key": "Private Key",
"dsa-private-key": "DSA Private Key",
"encrypted-private-key": "Encrypted Private Key",
Expand All @@ -43,8 +55,22 @@ export const KEYWORD_MAPPING = {
"azure-key-id": "Azure Key ID",
"azure-client-secret": "Azure Client Secret",
"google-api-key": "Google API Key",
"api-key": "Api Key",
"harmful": "Harmful",
"medical": "Medical" ,
"financial": "Financial",
"corporate-documents": "Corporate Documents"
"corporate-documents": "Corporate Documents",

"GOVERNANCE": "Governance",
"FINANCE" : "Finance",
"HR": "Hr",
"HEALTH": "Health",
"LEGAL" : "Legal",
"CUSTOMER" : "Customer",
"IP": "IP",
"PRODUCT" : "Product",
"MARKETING": "Marketing",
"SALES": "Sales",
"SECURITY": "Security",
"STRATEGY": "Strategy"
};
2 changes: 1 addition & 1 deletion pebblo/app/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def wrapper(*args, **kwargs):
response = func(*args, **kwargs)
end_time = time.perf_counter()
logger.debug(
f"Execution time of function <{func.__name__}> is {end_time-start_time} seconds."
f"Execution time of function <{func.__name__}> is {end_time - start_time} seconds."
)
return response
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ class ExtendedCreditCardRecognizer(CreditCardRecognizer):
CONTEXT = [
"credit",
"credit_card",
"card" "debit",
"card",
"debit",
"Visa",
"Mastercard",
"Amex",
Expand Down
109 changes: 109 additions & 0 deletions pebblo/entity_classifier/custom_analyzer/llm_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import json
from typing import List, Optional

from presidio_analyzer import (
EntityRecognizer,
Pattern,
RecognizerResult,
)
from presidio_analyzer.nlp_engine import NlpArtifacts

from pebblo.entity_classifier.utils.prompt_lib import get_entity_detection_prompt
from pebblo.entity_classifier.utils.result_validation import extract_entity_info
from pebblo.log import get_logger
from pebblo.text_generation.text_generation import TextGeneration

logger = get_logger(__name__)


# Maps the entity labels emitted by the LLM classifier (keys, as produced by the
# detection prompt) to the canonical entity-type identifiers used downstream
# (values, Presidio-style names). Labels missing from this map are unknown to
# the recognizer.
ENTITY_LABELS_UNIQUE = {
    "iban": "IBAN_CODE",
    "ssn": "US_SSN",
    "passport_number": "US_PASSPORT",
    "driver_license_number": "US_DRIVER_LICENSE",
    "credit_card_number": "CREDIT_CARD",
    "name": "LLM_PERSON",
    "company": "LLM_ORGANIZATION",
    "street_address": "STREET_ADDRESS",
    "email": "EMAIL_ADDRESS",
    "phone_number": "PHONE_NUMBER",
    "date_of_birth": "DATE_OF_BIRTH",
    "bank_routing_number": "ROUTING_NUMBER",
    "bank_account_number": "US_BANK_NUMBER",
    "swift_bic_code": "SWIFT_CODE",
    "api_key": "API_KEY",
    "private_keys": "PRIVATE_KEY",
    "itin": "US_ITIN",
    "ip_address": "IP_ADDRESS",
    "bban": "BBAN_CODE",
}


class LLMRecognizer(EntityRecognizer):
    """
    Custom Presidio recognizer that delegates entity detection to an external
    LLM model (via TextGeneration) instead of regex patterns.
    """

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: Optional[List[str]] = None,
    ):
        """
        :param patterns: Unused; accepted for signature compatibility with
            pattern-based recognizers.
        :param context: Optional context words forwarded to the base recognizer.
        :param supported_language: Language code this recognizer supports.
        :param supported_entity: Entity types to report. Defaults to every type
            in ENTITY_LABELS_UNIQUE; a fresh list is built per call to avoid a
            shared mutable default (the original annotated this as ``str``
            while assigning a list).
        """
        if supported_entity is None:
            supported_entity = list(ENTITY_LABELS_UNIQUE.values())
        super().__init__(
            supported_entities=supported_entity,
            context=context,
            supported_language=supported_language,
        )
        self.text_gen_obj = TextGeneration()

    def analyze(
        self,
        text,
        entities: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
        regex_flags: Optional[int] = None,
    ):
        """
        Detect entities in *text* using the LLM model.

        :param text: The text to analyze.
        :param entities: The list of entity types to detect (currently unused:
            the LLM reports whatever it finds and results are filtered through
            ENTITY_LABELS_UNIQUE).
        :param nlp_artifacts: Optional NLP pipeline artifacts (unused here).
        :param regex_flags: Unused; accepted for interface compatibility.
        :return: List of RecognizerResult objects.
        """
        detected_entities = self.llm_analyzer(text)
        results = []
        for entity in detected_entities:
            entity_det = entity.get("label")
            if not entity_det:
                continue
            entity_type = ENTITY_LABELS_UNIQUE.get(entity_det)
            if entity_type is None:
                # Label the LLM invented but we do not recognize -- without this
                # guard a RecognizerResult with entity_type=None was produced.
                logger.debug(f"Skipping unmapped entity label: {entity_det}")
                continue
            start = entity.get("start")
            end = entity.get("end")
            # Default confidence when the model does not report one is 0.8
            # (the original comment incorrectly said 1.0).
            score = entity.get("confidence", 0.8)
            # Skip very short spans (<= 2 characters) -- presumably noise from
            # the model; TODO confirm this threshold against evaluation data.
            if abs(end - start) > 2:
                results.append(RecognizerResult(entity_type, start, end, score))
        return results

    def llm_analyzer(self, text):
        """
        Run the entity-detection prompt over *text* and normalize the response.

        :param text: The text to analyze.
        :return: List of entity dicts (label/start/end/... as produced by
            extract_entity_info); empty list when the model response is not
            valid JSON, so one bad response cannot abort the whole analysis.
        """
        messages = get_entity_detection_prompt(text)
        raw = self.text_gen_obj.generate(messages)
        try:
            entities = json.loads(raw)
        except (json.JSONDecodeError, TypeError) as err:
            logger.warning(f"LLM entity response was not valid JSON: {err}")
            return []
        # A single entity object may come back as a bare dict; normalize to a list.
        if isinstance(entities, dict):
            entities = [entities]
        # Resolve character offsets / metadata for the detected entities.
        return extract_entity_info(entities, text)
Loading

0 comments on commit 09f2097

Please sign in to comment.