GraphRAG

KruxAI · Sep 12, 2024 · af144aa · af144aa
1 parent 5df3885
commit af144aa
Show file tree

Hide file tree

Showing 14 changed files with 589 additions and 14 deletions.
diff --git a/.env-Sample b/.env-Sample
@@ -23,3 +23,7 @@ RUN_CONFIG_MAX_WORKERS=16
 RUN_CONFIG_MAX_WAIT=180
 RUN_CONFIG_MAX_RETRIES=10
 RUN_CONFIG_IS_ASYNC="true"
+NEO4J_URI=bolt://localhost:7687 # use bolt://neo4j:7687 if using docker for ragbuilder
+NEO4J_USERNAME=neo4j
+NEO4J_PASSWORD=ragbuilder
+NEO4J_LOAD=true # set to false if graph is already loaded and you don't want to reload
diff --git a/README.md b/README.md
@@ -26,6 +26,7 @@ https://github.com/user-attachments/assets/8b4a5013-b1b7-40ee-820b-32c46fd99a2a
 ## Installation
 
 ### Option 1: Install using install script:
+Note: For GraphRAG, the Neo4j graph database connection details must be added to the `.env` file. To spin up a local Neo4j graph database, refer to the repo https://github.com/KruxAI/neo4j-docker
 #### Mac
 
 ``` sh
@@ -68,7 +69,7 @@ This will start the Ragbuilder Uvicorn app and open the browser. If the browser
 
 ### Option 2: Using Prebuilt Docker Image
 #### Using Docker Compose
-1. Pull the docker-compose.yml file
+1. Pull the docker-compose.yml file. The compose file starts RAGBuilder together with the Neo4j database needed for GraphRAG.
 
 ```
 curl -o docker-compose.yml  https://raw.githubusercontent.com/KruxAI/ragbuilder/main/docker-compose.yml
@@ -159,6 +160,8 @@ The environment variables are essential for authenticating and configuring vario
 
 ### Environment Variables
 
+### Environment Variables
+
 - **OPENAI_API_KEY**: The API key for OpenAI services.
 - **MISTRAL_API_KEY**: The API key for Mistral services.
 - **ENABLE_ANALYTICS**: A boolean flag to enable or disable analytics. Set to `True` or `False`.
@@ -167,6 +170,20 @@ The environment variables are essential for authenticating and configuring vario
 - **JINA_API_KEY**: The API key for Jina services.
 - **SINGLESTOREDB_URL**: The connection string for SingleStoreDB, formatted as `userid:password@host:port/dbname`.
 - **PINECONE_API_KEY**: The API key for Pinecone services.
+- **GROQ_API_KEY**: The API key for accessing Groq services.
+- **AZURE_OPENAI_API_KEY**: The API key for Azure OpenAI services.
+- **AZURE_OPENAI_ENDPOINT**: The endpoint URL for Azure OpenAI services. Example: `https://<your-resource-name>.openai.azure.com/`.
+- **OPENAI_API_VERSION**: Specifies the API version for OpenAI. Example: `2024-02-01`.
+- **GOOGLE_API_KEY**: The API key for Google Cloud services.
+- **GOOGLE_CLOUD_PROJECT**: The project ID for Google Cloud.
+- **PGVECTOR_CONNECTION_STRING**: The connection string for PGVector, formatted as `postgresql+psycopg://langchain:langchain@localhost:6024/langchain`.
+- **MILVUS_CONNECTION_STRING**: The connection string for Milvus, using a local demo DB. Example: `./milvus_demo.db`.
+- **OLLAMA_BASE_URL**: The base URL for accessing Ollama services. Example: `http://localhost:11434` (use `http://host.docker.internal:11434/` if using Docker for RagBuilder).
+- **NEO4J_URI**: The connection URI for Neo4J. Example: `bolt://localhost:7687` (use `bolt://neo4j:7687` if using Docker for RagBuilder).
+- **NEO4J_USERNAME**: The username for Neo4J. Default: `neo4j`.
+- **NEO4J_PASSWORD**: The password for Neo4J. Default: `ragbuilder`.
+- **NEO4J_LOAD**: A flag to enable loading data into Neo4J. Set to `true` or `false`. Set this to `false` if the graph database is already loaded and you want to skip the loading step.
+
 
 ### Example `.env` File
 

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,12 +1,36 @@
-version: '3.8'
+version: "3.8"
 
 services:
+  neo4j:
+    build: ./neo4j
+    ports:
+      - "7474:7474"
+      - "7687:7687"
+    environment:
+      NEO4J_AUTH: "neo4j/ragbuilder"
+      NEO4J_apoc_export_file_enabled: "true"
+      NEO4J_apoc_import_file_enabled: "true"
+      NEO4J_apoc_import_file_use__neo4j__config: "true"
+      NEO4J_dbms_security_procedures_unrestricted: "apoc.*"
+    volumes:
+      - ./data:/data
+    networks:
+      - custom-network
+
   ragbuilder:
-    image: ashwinzyx/ragbuilder:latest
+    image: ashwinzyx/ragbuilder:neo_test
     ports:
       - "55003:8005"
     volumes:
       - .:/ragbuilder
     env_file:
       - .env
+    depends_on:
+      - neo4j
     command: ["ragbuilder"]
+    networks:
+      - custom-network
+
+networks:
+  custom-network:
+    driver: bridge
diff --git a/neo4j/Dockerfile b/neo4j/Dockerfile
@@ -0,0 +1,10 @@
+FROM neo4j:5.22.0
+
+
+ENV NEO4JLABS_PLUGINS '[ "apoc" ]'
+ENV NEO4J_dbms_security_procedures_unrestricted apoc.*
+
+COPY ./apoc-5.22.0-core.jar /var/lib/neo4j/plugins
+
+
+EXPOSE 7474 7687
diff --git a/neo4j/apoc-5.22.0-core.jar b/neo4j/apoc-5.22.0-core.jar
diff --git a/requirements.txt b/requirements.txt
@@ -75,4 +75,5 @@ langchain-ollama
 langchain_postgres
 psycopg[binary,pool]
 langchain_milvus
-langsmith
+langsmith
+neo4j
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
     long_description = fh.read()
 setup(
     name='ragbuilder',
-    version='0.0.14',
+    version='0.0.15',
     author='Ashwin Aravind, Aravind Parameswaran',
     author_email='[email protected], [email protected]',
     description='RagBuilder is a toolkit designed to help you create optimal Production-ready Retrieval-Augmented Generation (RAG) pipeline for your data',

diff --git a/src/ragbuilder/analytics.py b/src/ragbuilder/analytics.py
@@ -12,7 +12,6 @@
 mp = Mixpanel(MIXPANEL_TOKEN)
 import time
 def track_event(event_str):
-    mp.track(int(time.time()),event_str)
-
-# js='{"name":"John", "age":30, "car":null}'
-# track_event(js)
+    enable_analytics = os.getenv('ENABLE_ANALYTICS', 'True').lower() == 'true'
+    if enable_analytics:
+        mp.track(int(time.time()),event_str)
diff --git a/src/ragbuilder/executor.py b/src/ragbuilder/executor.py
@@ -85,6 +85,33 @@
 from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
 from langchain.prompts import ChatPromptTemplate
 from langchain.load import dumps, loads
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.pydantic_v1 import BaseModel, Field
+from langchain_core.output_parsers import StrOutputParser
+import os
+from langchain_community.graphs import Neo4jGraph
+from langchain.text_splitter import MarkdownHeaderTextSplitter
+from langchain_openai import ChatOpenAI
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
+from langchain_community.document_loaders import *
+from dotenv import load_dotenv
+from langchain_chroma import Chroma
+from langchain_community.graphs.graph_document import (
+    Node as BaseNode,
+    Relationship as BaseRelationship,
+    GraphDocument,
+)
+from operator import itemgetter
+from langchain import hub
+from langchain_core.runnables import RunnablePassthrough, RunnableParallel
+from langchain.schema import Document
+from typing import List, Dict, Any, Optional
+from langchain.pydantic_v1 import Field, BaseModel
+from langchain.docstore.document import Document
+from langchain.prompts import ChatPromptTemplate
+from ragbuilder.graph_utils.graph_loader import load_graph 
+
 
 # import local modules
 from ragbuilder.langchain_module.retriever.retriever import *
@@ -362,6 +389,7 @@ def __init__(self, val):
 
         logger.info("Creating RAG object from generated code...(this may take a while in some cases)")
         try:
+            logger.info(f"Generated Code\n{self.router}")
             exec(self.router,globals_dict,locals_dict)
             self.rag = locals_dict['rag_pipeline']()
         except Exception as e:

diff --git a/src/ragbuilder/graph_utils/__init__.py b/src/ragbuilder/graph_utils/__init__.py
diff --git a/src/ragbuilder/graph_utils/graph_loader.py b/src/ragbuilder/graph_utils/graph_loader.py
@@ -0,0 +1,141 @@
+from langchain_core.runnables import  RunnablePassthrough
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.pydantic_v1 import BaseModel, Field
+from langchain_core.output_parsers import StrOutputParser
+import os
+from langchain_community.graphs import Neo4jGraph
+from langchain.text_splitter import MarkdownHeaderTextSplitter
+from langchain_openai import ChatOpenAI
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
+from langchain_community.document_loaders import *
+from dotenv import load_dotenv
+from langchain_chroma import Chroma
+from langchain_community.graphs.graph_document import (
+    Node as BaseNode,
+    Relationship as BaseRelationship,
+    GraphDocument,
+)
+from operator import itemgetter
+from langchain import hub
+from langchain_core.runnables import RunnablePassthrough, RunnableParallel
+from langchain.schema import Document
+from typing import List, Dict, Any, Optional
+from langchain.pydantic_v1 import Field, BaseModel
+from langchain.docstore.document import Document
+from langchain.prompts import ChatPromptTemplate
+load_dotenv()
+import os
+
+
+# Schema model for one key/value pair attached to a node or relationship.
+# NOTE(review): this model is nested inside KnowledgeGraph, which is passed to
+# llm.with_structured_output(); the docstring and Field descriptions below
+# presumably become part of the schema shown to the model - confirm before
+# rewording them.
+class Property(BaseModel):
+    """A single property consisting of key and value"""
+    key: str = Field(..., description="key")
+    value: str = Field(..., description="value")
+
+# Extraction-side node: extends the base graph Node so that `properties` is an
+# optional list of Property key/value pairs; map_to_base_node() converts that
+# list into a plain dict.
+# A `#` comment is used instead of a docstring on purpose: a docstring could
+# change the schema description generated for this model (TODO confirm).
+class Node(BaseNode):
+    properties: Optional[List[Property]] = Field(None, description="List of node properties")
+
+# Extraction-side relationship: extends the base graph Relationship so that
+# `properties` is an optional list of Property key/value pairs;
+# map_to_base_relationship() converts that list into a plain dict.
+# A `#` comment is used instead of a docstring on purpose: a docstring could
+# change the schema description generated for this model (TODO confirm).
+class Relationship(BaseRelationship):
+    properties: Optional[List[Property]] = Field(None, description="List of relationship properties")
+
+# Top-level structured-output schema: get_extraction_chain() passes this class
+# to llm.with_structured_output(), and extract_and_store_graph() reads the
+# resulting `.nodes` and `.rels` lists.
+# NOTE(review): the docstring below is presumably surfaced to the model as the
+# schema description - treat it as runtime text, not as a comment.
+class KnowledgeGraph(BaseModel):
+    """Generate a knowledge graph with entities and relationships."""
+    nodes: List[Node] = Field(..., description="List of nodes in the knowledge graph")
+    rels: List[Relationship] = Field(..., description="List of relationships in the knowledge graph")
+
+def format_property_key(s: str) -> str:
+    words = s.split()
+    if not words:
+        return s
+    first_word = words[0].lower()
+    capitalized_words = [word.capitalize() for word in words[1:]]
+    return "".join([first_word] + capitalized_words)
+
+def props_to_dict(props) -> dict:
+    """Convert properties to a dictionary."""
+    properties = {}
+    if not props:
+        return properties
+    for p in props:
+        properties[format_property_key(p.key)] = p.value
+    return properties
+
+def map_to_base_node(node: Node) -> BaseNode:
+    """Map the KnowledgeGraph Node to the base Node."""
+    properties = props_to_dict(node.properties) if node.properties else {}
+    # Add name property for better Cypher statement generation
+    properties["name"] = node.id.title()
+    return BaseNode(
+        id=node.id.title(), type=node.type.capitalize(), properties=properties
+    )
+
+def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
+    """Map the KnowledgeGraph Relationship to the base Relationship."""
+    source = map_to_base_node(rel.source)
+    target = map_to_base_node(rel.target)
+    properties = props_to_dict(rel.properties) if rel.properties else {}
+    return BaseRelationship(
+        source=source, target=target, type=rel.type, properties=properties
+    )
+
+def get_extraction_chain(llm,allowed_nodes: Optional[List[str]] = None, allowed_rels: Optional[List[str]] = None):
+    """Build the prompt-plus-LLM chain that extracts a KnowledgeGraph from text.
+
+    Args:
+        llm: Model object; it must provide ``with_structured_output``.
+        allowed_nodes: Optional node labels. When non-empty they are joined
+            with ", " and inserted into the system prompt as an
+            "Allowed Node Labels" bullet; otherwise that line is left empty.
+        allowed_rels: Optional relationship types, handled the same way as
+            ``allowed_nodes`` under an "Allowed Relationship Types" bullet.
+
+    Returns:
+        The prompt piped into ``llm.with_structured_output(KnowledgeGraph)``.
+        The only template variable left in the prompt is ``input``.
+    """
+    # The system message is an f-string, so the allowed-label lines are
+    # interpolated now, at build time. The human message is a plain string, so
+    # its {input} stays a template placeholder to be filled at invoke time.
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", f"""# Knowledge Graph Instructions for GPT-4
+## 1. Overview
+You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
+- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
+- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
+## 2. Labeling Nodes
+- **Consistency**: Ensure you use basic or elementary types for node labels.
+- For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
+- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
+{'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
+{'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
+## 3. Handling Numerical Data and Dates
+- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
+- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
+- **Property Format**: Properties must be in a key-value format.
+- **Quotation Marks**: Never use escaped single or double quotes within property values.
+- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
+## 4. Coreference Resolution
+- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
+If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
+always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
+Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
+## 5. Strict Compliance
+Adhere to the rules strictly. Non-compliance will result in termination.
+        """),
+        ("human", "Use the given format to extract information from the following input: {input}"),
+        ("human", "Tip: Make sure to answer in the correct format"),
+    ])
+    # The model's output is constrained to the KnowledgeGraph schema.
+    extraction_chain = prompt | llm.with_structured_output(KnowledgeGraph)
+    return extraction_chain
+
+def extract_and_store_graph(llm,graph,document: Document, nodes:Optional[List[str]] = None, rels:Optional[List[str]]=None) -> None:
+    # Extract graph data using OpenAI functions
+    extract_chain = get_extraction_chain(llm,nodes, rels)
+    data = extract_chain.invoke(document.page_content)
+    # Construct a graph document
+    graph_document = GraphDocument(
+        nodes=[map_to_base_node(node) for node in data.nodes],
+        relationships=[map_to_base_relationship(rel) for rel in data.rels],
+        source=document
+    )
+    # Store information into a graph
+    graph.add_graph_documents([graph_document])
+
+
+def load_graph(documents,llm):
+    NEO4J_URI = os.getenv("NEO4J_URI")
+    NEO4J_USER = os.getenv("NEO4J_USER")
+    NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
+    graph = Neo4jGraph(
+        url=NEO4J_URI,
+        username=NEO4J_USER,
+        password=NEO4J_PASSWORD)
+    from tqdm import tqdm
+    for i, d in tqdm(enumerate(documents), total=len(documents)):
+        extract_and_store_graph(llm,graph,d)
+
+