samples: Added readme file, metadata to Text loader sample (#565)

* Add owner and size to text metadata * Add anonymize_snippets flag * Added readme for textloader sample. Update sample env file. * Fix formatting * Set default value for Pebblo classifier url * Added double quotes for consistency
daxa-ai · Sep 25, 2024 · 65add33 · 65add33
1 parent 5f02a89
commit 65add33
Show file tree

Hide file tree

Showing 5 changed files with 75 additions and 8 deletions.
diff --git a/pebblo_safeloader/langchain/textloader_postgress/.env.sample b/pebblo_safeloader/langchain/textloader_postgress/.env.sample
@@ -1,11 +1,11 @@
 # OpenAI credentials
 OPENAI_API_KEY=<YOUR OPENAI API KEY>
 
-# Pebblo configuration
-PEBBLO_CLOUD_URL=<PEBBLO CLOUD URL>
-PEBBLO_API_KEY=<YOUR PEBBLO API KEY>
-PEBBLO_CLASSIFIER_URL="http://localhost:8000/"
-
 # Postgres configuration
 PG_CONNECTION_STRING = "postgresql://<USERNAME>:<PASSWORD>@<HOST>:<PORT>/<DATABASE-NAME>"
 
+# Pebblo configuration
+PEBBLO_CLASSIFIER_URL="http://localhost:8000/"
+# Optional (only if you are using Pebblo Cloud)
+PEBBLO_CLOUD_URL=<PEBBLO CLOUD URL>
+PEBBLO_API_KEY=<YOUR PEBBLO API KEY>
diff --git a/pebblo_safeloader/langchain/textloader_postgress/README.md b/pebblo_safeloader/langchain/textloader_postgress/README.md
@@ -0,0 +1,62 @@
+# Pebblo Text Loader
+
+This is a sample application that demonstrates how to use the `Pebblo Text Loader` together with the
+`Pebblo Safe Loader` to load text data into a `Postgres` vector database.
+
+\* This solution uses predefined text data and metadata from the utility functions to demonstrate the loading of
+in-memory text data using Pebblo Safe Loader. Real-world applications can use this solution to load text data from
+various sources.
+
+**PebbloTextLoader**: PebbloTextLoader is a loader for text data. Since PebbloSafeLoader is a wrapper around document
+loaders, this loader is used to load text data directly into Documents.
+
+**This solution uses:**
+
+- PostgreSQL 15.7
+- langchain-community from the `pebblo-0.1.19` branch of daxa-ai/langchain
+
+### Instructions
+
+1. Create Python virtual-env
+
+```console
+$ python3 -m venv .venv
+$ source .venv/bin/activate
+```
+
+2. Install dependencies
+
+```console
+$ pip3 install -r requirements.txt
+```
+
+3. Install langchain-community from the branch `pebblo-0.1.19`
+
+```console
+$ git clone https://github.com/daxa-ai/langchain.git
+$ cd langchain
+$ git fetch && git checkout pebblo-0.1.19
+$ cd libs/community
+$ pip3 install .
+```
+
+4. Copy the `.env.sample` file to `.env` and populate the necessary environment variables. The `.env` file should look
+   like this:
+
+```console
+$ cat .env
+# OpenAI credentials
+OPENAI_API_KEY=<YOUR OPENAI API KEY>
+
+# Postgres configuration
+PG_CONNECTION_STRING = "postgresql://<USERNAME>:<PASSWORD>@<HOST>:<PORT>/<DATABASE-NAME>"
+```
+
+5. Run Pebblo Safe Loader sample app
+
+```console
+$ python3 pebblo_safeload.py
+```
+
+6. Retrieve the Pebblo PDF report from `$HOME/.pebblo/pebblo-safe-loader-text-loader/pebblo_report.pdf` on the
+   system where `Pebblo Server` is running.
diff --git a/pebblo_safeloader/langchain/textloader_postgress/pebblo_safeload.py b/pebblo_safeloader/langchain/textloader_postgress/pebblo_safeload.py
@@ -48,6 +48,7 @@ def __init__(self, collection_name: str):
             description="Identity & Semantic enabled SafeLoader app using Pebblo",  # Description (Optional)
             load_semantic=True,
             api_key=PEBBLO_API_KEY,
+            anonymize_snippets=True,
         )
         self.documents = self.loader.load()
         unique_identities = set()

diff --git a/pebblo_safeloader/langchain/textloader_postgress/requirements.txt b/pebblo_safeloader/langchain/textloader_postgress/requirements.txt
@@ -2,7 +2,7 @@ python-dotenv==1.0.0
 tiktoken # OpenAI tokenizer
 
 langchain-openai>=0.1.7 # For OpenAI LLM and OpenAIEmbeddings
-langchain-community>=0.2.16,<0.3 # for PebbloSafeLoader, PebbloRetrievalQA
+#langchain-community>=0.2.16,<0.3 # for PebbloSafeLoader, PebbloRetrievalQA
 
 psycopg2-binary # For Postgres VectorStore
 langchain-postgres # For Postgres VectorStore
diff --git a/pebblo_safeloader/langchain/textloader_postgress/util.py b/pebblo_safeloader/langchain/textloader_postgress/util.py
@@ -40,8 +40,12 @@ def get_data(
     if metadatas:
         # Metadata(source: fake news web url) for each text
         _metadata_list = [
-            {"source": f"https://www.acme.org/news/{i}"}
-            for i in range(1, len(texts) + 1)
+            {
+                "source": f"https://www.acme.org/news/{i + 1}",
+                "owner": "Joe Smith",
+                "size": f"{len(texts[i])}",
+            }
+            for i in range(len(texts))
         ]
     else:
         _metadata_list = None