Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add additional output fields to the project data query #7

Merged
merged 6 commits into from
Feb 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@ attrs==25.1.0
autopep8==2.3.2
beautifulsoup4==4.13.3
black==25.1.0
blinker==1.9.0
certifi==2025.1.31
charset-normalizer==3.4.1
click==8.1.8
dataclasses-json==0.6.7
dill==0.3.9
distro==1.9.0
filelock==3.17.0
Flask==3.1.0
frozenlist==1.5.0
fsspec==2025.2.0
h11==0.14.0
Expand All @@ -24,16 +26,20 @@ httpx-sse==0.4.0
huggingface-hub==0.28.1
idna==3.10
isort==6.0.0
itsdangerous==2.2.0
Jinja2==3.1.5
jiter==0.8.2
jsonpatch==1.33
jsonpointer==3.0.0
langchain==0.3.18
langchain-community==0.3.17
langchain-core==0.3.34
langchain-experimental==0.3.4
langchain-neo4j==0.3.0
langchain-openai==0.3.4
langchain-text-splitters==0.3.6
langsmith==0.3.5
MarkupSafe==3.0.2
marshmallow==3.26.1
mccabe==0.7.0
multidict==6.1.0
Expand Down Expand Up @@ -72,5 +78,6 @@ typing-inspect==0.9.0
typing_extensions==4.12.2
urllib3==2.3.0
uuid==1.30
Werkzeug==3.1.3
yarl==1.18.3
zstandard==0.23.0
4 changes: 4 additions & 0 deletions src/chunking.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@ def generate_chunk_uuid(chunk_text):


def chunk_text(content, chunk_size=512, chunk_overlap=50):
    """Split a text into overlapping chunks for embedding.

    Args:
        content: Text to split; any falsy value (None, "") yields [].
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        A list of chunk strings, empty when there is nothing to split.
    """
    # Guard clause: nothing to split, so avoid constructing a splitter.
    if not content:
        return []

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(content)


Expand Down
46 changes: 30 additions & 16 deletions src/cypher_query.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from utils.openai import generate_embedding, openai_client
from neo4j_utils import get_neo4j_driver
import json
Expand All @@ -21,9 +22,12 @@ def check_if_embedding_needed(request, schema_hint):
END OF THE QUERY

By looking at the query, I want you to infer whether any semantic search is needed or not.
If needed, please provide a message that can be used to generate an embedding.
In other words, should I search for project chunks with similar meaning to the query or not.
If it's needed, please provide a message that can be used to generate an embedding.
For example, if the query asks "provide me 2 projects related to climate change impact on renewable energy", you must provide a message like "climate change impact on renewable energy" as the embedding message.
But if the query is like "provide me 2 projects", then no embedding is needed.
But if the intention is random projects or random donations, then no embedding is needed, and you should return {{"embedding_needed": "False"}}.



Now, please tell me does the query need an embedding? Respond strictly in this JSON format:
{{
Expand All @@ -38,6 +42,8 @@ def check_if_embedding_needed(request, schema_hint):
temperature=0.3,
)
result = response.choices[0].text.strip()
result = re.sub(r'("embedding_needed": )false', r"\1False", result)
result = re.sub(r'("embedding_needed": )true', r"\1True", result)
print(f"Embedding Check Result: {result}")
return eval(result) # Convert the JSON-like string to a dictionary

Expand Down Expand Up @@ -81,7 +87,8 @@ def generate_neo4j_query(request, schema_hint, embedding_message=None, embedding
# Replace the similarity function
cypher_query = response.choices[0].text.strip()
cypher_query = cypher_query.replace(
"gds.alpha.similarity.cosine", "gds.similarity.cosine")
"gds.alpha.similarity.cosine", "gds.similarity.cosine"
)
cypher_query = cypher_query.replace("gds.alpha.pageRank", "gds.pageRank")
return cypher_query

Expand All @@ -99,19 +106,21 @@ def process_user_request(request, schema_hint):
embedding_message = embedding_check["embedding_message"]
# Your embedding generation function
embedding = generate_embedding(embedding_message)
print(
f"Generated Embedding for '{embedding_message}': {embedding[:5]}...")
print(f"Generated Embedding for '{embedding_message}': {embedding[:5]}...")
else:
embedding_message = None
embedding = None

# Step 3: Generate the Cypher query
cypher_query = generate_neo4j_query(
request, schema_hint, embedding_message=embedding_message, embedding=embedding)
request, schema_hint, embedding_message=embedding_message, embedding=embedding
)
print(f"Generated Cypher Query: {cypher_query}")

if embedding:
parameters = {"queryVector": embedding, } # Pass query embedding
parameters = {
"queryVector": embedding,
} # Pass query embedding
else:
parameters = {}

Expand All @@ -130,20 +139,25 @@ def execute_cypher_query(cypher_query, parameters):

schema_hint = """
Neo4j Schema:
Node labels: Project, Chunk
Relationships: Project -> Chunk (:HAS_CHUNK)
Project properties: id, title, raised_amount, giv_power, listed
Node labels: Project, Chunk, Donation
Relationships: Project -> Chunk (:HAS_CHUNK), Project -> Donation (:HAS_DONATION)
Project properties: id, title, raised_amount, giv_power, given_power_rank, givbacks_eligible, in_active_qf_round, unique_donors, owner_wallet, ethereum_address, polygon_address, optimism_address, celo_address, base_address, arbitrum_address, gnosis_address, zkevm_address, ethereum_classic_address, stellar_address, solana_address, x, facebook, instagram, youtube, linkedin, reddit, discord, farcaster, lens, website, telegram, github, listed
Chunk properties: id, text, embedding, created_at
Donation properties: id, tx_hash, chain_id, project_title, created_at, amount, value_usd
Chunks are generated by splitting the description of a project.
"""
user_request = {
'query': "I want to hear about projects impact kids health",
'output_format': "{project_id, project_title, raised_amount, giv_power, related_chunks: [text]}"
"query": "I want to hear about projects impact kids health",
"output_format": "{project_id, project_title, raised_amount, giv_power, giv_power_rank, givbacks_eligible, in_active_qf_round, unique_donors, owner_wallet, ethereum_address, polygon_address, optimism_address, celo_address, base_address, arbitrum_address, gnosis_address, zkevm_address, ethereum_classic_address, stellar_address, solana_address, x, facebook, instagram, youtube, linkedin, reddit, discord, farcaster, lens, website, telegram, github, related_chunks: [text] (array)}",
}
# results = process_user_request(schema_hint=schema_hint, request=user_request)
print('#######################')

# print(json.dumps(results, indent=4))
# user_request = {
# "query": "5 random donations with value more than 100$",
# "output_format": "{tx_hash, chain_id, project_title(project.title), created_at, amount, value_usd}",
# }
results = process_user_request(schema_hint=schema_hint, request=user_request)
print("#######################")

print(json.dumps(results, indent=4))

# from langchain_neo4j import GraphCypherQAChain, Neo4jGraph
# from langchain_openai import ChatOpenAI
Expand Down
Loading