-
Notifications
You must be signed in to change notification settings - Fork 48
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Integration of Pebblo Reporter (#593)
The changes include: (1) integration of Pebblo Classifier v3 and its prompt templates for the entity and topic classifiers; (2) an option in config.yaml to switch the LLM-based classifier on or off; (3) the Pebblo Reporter, which generates a PDF/JSON report for the given files; (4) new classes for the LLM-based topic classifier and the entity classifier.
- Loading branch information
Showing
36 changed files
with
2,098 additions
and
103 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# Use Ubuntu as base image
FROM ubuntu:22.04

# Install system dependencies.
# DEBIAN_FRONTEND is set for this command only, so apt/debconf never waits on
# an interactive prompt (e.g. tzdata) during an unattended image build, and the
# setting does not leak into the runtime environment of the container.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    python3.11 python3.11-dev python3.11-venv python3-pip \
    build-essential poppler-utils \
    libgl1-mesa-glx libglib2.0-0 \
    libsm6 libxrender1 libxext6 \
    libpango-1.0-0 libpangocairo-1.0-0 libcairo2 \
    libjpeg-dev libpng-dev libgdk-pixbuf2.0-dev \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /opt/pebblo

# Copy the application code (path is relative to the build context)
COPY pebblo /opt/pebblo

# Create a virtual environment and install the build tooling into it
RUN python3.11 -m venv /opt/pebblo/venv
RUN /opt/pebblo/venv/bin/pip install --no-cache-dir build setuptools wheel weasyprint

# Put the virtual environment first on PATH so that `python3`, `pip` and the
# installed `pebblo` entry point below all resolve to the venv.
ENV PATH="/opt/pebblo/venv/bin:$PATH"

# Build and install the package (no pip cache, matching the install above)
RUN python3 -m build --wheel && pip install --no-cache-dir dist/*.whl

# Copy configuration files
COPY pebblo/deploy/docker/config.yaml /opt/pebblo/config/config.yaml

# Set entrypoint; CMD supplies the default arguments and can be overridden
ENTRYPOINT ["pebblo"]
CMD ["--config", "/opt/pebblo/config/config.yaml"]
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,15 @@ | ||
daemon: | ||
port: 8000 | ||
port: 9000 | ||
host: 0.0.0.0 | ||
logging: | ||
level: info | ||
reports: | ||
format: pdf | ||
renderer: weasyprint | ||
cacheDir: /opt/.pebblo | ||
cacheDir: ~/.pebblo | ||
anonymizeSnippets: False | ||
classifier: | ||
mode: all | ||
use_llm: True | ||
storage: | ||
type: file | ||
type: file |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
version: "3.8"

services:
  pebblo:
    build:
      # Three levels above this compose file: the directory that contains
      # `pebblo/`, which both the `dockerfile` path below and the Dockerfile's
      # `COPY pebblo ...` instructions are resolved against.
      context: ../../..
      dockerfile: pebblo/deploy/docker/Dockerfile.cli
    ports:
      - "9000:9000"
    depends_on:
      - llm_server
    environment:
      # Credentials are taken from the environment (or .env file) of the host
      # running `docker compose`; they fall back to empty when unset.
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-}
      - AWS_DEFAULT_REGION=us-east-1
      - MODEL_NAME=daxa-ai/pebblo_classifier_v3
      - HOSTED_VLLM_API_KEY=vllm-placeholder-key
      - BACKEND=vllm
      # Points at the `llm_server` service defined below.
      - API_BASE_URL=http://llm_server:8000/v1
    logging:
      driver: json-file
      options:
        max-size: "50m"
        max-file: "6"
    volumes:
      - /home/ubuntu/pebblo_report:/opt/.pebblo

  llm_server:
    image: vllm/vllm-openai:v0.6.6
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # Taken from the host environment; empty when unset.
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
    ports:
      - "8000:8000"
    extra_hosts:
      - "host.docker.internal:host-gateway"
    volumes:
      - model_cache_huggingface:/root/.cache/huggingface/
    ipc: host
    # Must serve the same model that the `pebblo` service requests via MODEL_NAME.
    command: --model daxa-ai/pebblo_classifier_v3 --max-model-len=3000 --gpu_memory_utilization=0.95
    logging:
      driver: json-file
      options:
        max-size: "50m"
        max-file: "6"

volumes:
  model_cache_huggingface:
  # NOTE(review): not referenced by any service in this file - confirm whether it is still needed.
  indexing_huggingface_model_cache:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,5 +10,6 @@ reports: | |
anonymizeSnippets: False | ||
classifier: | ||
mode: all | ||
use_llm: True | ||
storage: | ||
type: file |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
109 changes: 109 additions & 0 deletions
109
pebblo/entity_classifier/custom_analyzer/llm_analyzer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
import json | ||
from typing import List, Optional | ||
|
||
from presidio_analyzer import ( | ||
EntityRecognizer, | ||
Pattern, | ||
RecognizerResult, | ||
) | ||
from presidio_analyzer.nlp_engine import NlpArtifacts | ||
|
||
from pebblo.entity_classifier.utils.prompt_lib import get_entity_detection_prompt | ||
from pebblo.entity_classifier.utils.result_validation import extract_entity_info | ||
from pebblo.log import get_logger | ||
from pebblo.text_generation.text_generation import TextGeneration | ||
|
||
logger = get_logger(__name__) | ||
|
||
|
||
# Maps the "label" value found on each LLM-detected entity to the entity type
# reported in the recognizer results. The values, in this order, are also the
# default list of entities the recognizer declares as supported.
ENTITY_LABELS_UNIQUE = dict(
    (
        ("iban", "IBAN_CODE"),
        ("ssn", "US_SSN"),
        ("passport_number", "US_PASSPORT"),
        ("driver_license_number", "US_DRIVER_LICENSE"),
        ("credit_card_number", "CREDIT_CARD"),
        ("name", "LLM_PERSON"),
        ("company", "LLM_ORGANIZATION"),
        ("street_address", "STREET_ADDRESS"),
        ("email", "EMAIL_ADDRESS"),
        ("phone_number", "PHONE_NUMBER"),
        ("date_of_birth", "DATE_OF_BIRTH"),
        ("bank_routing_number", "ROUTING_NUMBER"),
        ("bank_account_number", "US_BANK_NUMBER"),
        ("swift_bic_code", "SWIFT_CODE"),
        ("api_key", "API_KEY"),
        ("private_keys", "PRIVATE_KEY"),
        ("itin", "US_ITIN"),
        ("ip_address", "IP_ADDRESS"),
        ("bban", "BBAN_CODE"),
    )
)
|
||
|
||
class LLMRecognizer(EntityRecognizer):
    """
    Custom recognizer that uses an external LLM model for entity detection.

    The text is turned into a prompt by ``get_entity_detection_prompt``, sent
    through ``TextGeneration.generate``, parsed as JSON, post-processed by
    ``extract_entity_info`` and finally mapped to entity types through
    ``ENTITY_LABELS_UNIQUE``.
    """

    # Score assigned to an entity when the model output carries no "confidence".
    DEFAULT_SCORE = 0.8
    # Only spans strictly longer than this many characters are reported.
    MIN_SPAN_LENGTH = 2

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: Optional[List[str]] = None,
    ):
        """
        :param patterns: Accepted for signature compatibility; not used here.
        :param context: Context words forwarded to ``EntityRecognizer``.
        :param supported_language: Language code forwarded to ``EntityRecognizer``.
        :param supported_entity: Entity types this recognizer declares. Defaults
            to every value of ``ENTITY_LABELS_UNIQUE``.
        """
        if supported_entity is None:
            # Build a fresh list per instance instead of sharing one default
            # list object between all recognizers.
            supported_entity = list(ENTITY_LABELS_UNIQUE.values())
        super().__init__(
            supported_entities=supported_entity,
            context=context,
            supported_language=supported_language,
        )
        self.text_gen_obj = TextGeneration()

    def analyze(
        self,
        text,
        entities: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
        regex_flags: Optional[int] = None,
    ):
        """
        Override the analyze method to detect entities using the LLM model.

        :param text: The text to analyze.
        :param entities: The list of entity types to detect (currently not used
            to filter the output; every mapped entity is returned).
        :param nlp_artifacts: Optional NLP pipeline artifacts (unused in this recognizer).
        :param regex_flags: Unused in this recognizer.
        :return: List of RecognizerResult objects.
        """
        results = []
        for entity in self.llm_analyzer(text) or []:
            label = entity.get("label")
            if not label:
                continue

            entity_type = ENTITY_LABELS_UNIQUE.get(label)
            if entity_type is None:
                # The model produced a label with no known mapping; a result
                # without an entity type would be meaningless downstream.
                continue

            start = entity.get("start")
            end = entity.get("end")
            if start is None or end is None:
                # Without both offsets the span cannot be located in the text.
                continue

            # Very short spans are discarded.
            if abs(end - start) <= self.MIN_SPAN_LENGTH:
                continue

            score = entity.get("confidence", self.DEFAULT_SCORE)
            results.append(RecognizerResult(entity_type, start, end, score))
        return results

    def llm_analyzer(self, text):
        """
        Run the LLM on ``text`` and return the extracted entity records.

        :param text: The text to analyze.
        :return: The output of ``extract_entity_info`` for the parsed model
            response, or an empty list when the response is not usable JSON.
        """
        messages = get_entity_detection_prompt(text)
        raw_response = self.text_gen_obj.generate(messages)

        try:
            parsed = json.loads(raw_response)
        except (TypeError, ValueError) as err:
            # json.JSONDecodeError is a ValueError; TypeError covers a
            # non-string response. Malformed model output should not abort
            # the whole analysis.
            logger.warning("LLM entity response is not valid JSON: %s", err)
            return []

        # A single entity may come back as a bare object instead of a list.
        if isinstance(parsed, dict):
            parsed = [parsed]
        if not isinstance(parsed, list):
            logger.warning(
                "Unexpected LLM entity response type: %s", type(parsed).__name__
            )
            return []

        # Extract entity information from the detected entities
        return extract_entity_info(parsed, text)
Oops, something went wrong.