
Commit

Sota v0
ashwinzyx committed Aug 23, 2024
1 parent 686be75 commit 9bdfac3
Showing 13 changed files with 793 additions and 9 deletions.
14 changes: 14 additions & 0 deletions byor/__init__.py
@@ -0,0 +1,14 @@
# Minimal initialization for the package.
# You might leave this empty, or dynamically load modules here,
# but avoid running any other logic at import time.

# For example, to dynamically load modules you could:
import os
import importlib

module_dir = os.path.dirname(__file__)

for filename in os.listdir(module_dir):
    if filename.endswith('.py') and filename != '__init__.py':
        module_name = filename[:-3]  # strip the .py extension
        importlib.import_module(f'.{module_name}', package=__name__)
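
A quick way to see the effect of this eager loading (a hedged sketch, not part of the commit; it assumes the package is importable from the project root):

import sys
import byor  # triggers the loop in byor/__init__.py

# Every .py module dropped into byor/ should now have been imported,
# e.g. the myrag module added in this commit:
print('byor.myrag' in sys.modules)  # expected: True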
40 changes: 40 additions & 0 deletions byor/myrag.py
@@ -0,0 +1,40 @@
from langchain_community.llms import Ollama
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from operator import itemgetter
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
from langchain.retrievers import MergerRetriever
from langchain.retrievers.document_compressors import DocumentCompressorPipeline

def rag_pipeline():
    try:
        def format_docs(docs):
            # join retrieved documents with real newlines
            return "\n".join(doc.page_content for doc in docs)

        llm = Ollama(model='llama3.1:latest', base_url='http://localhost:11434')

        loader = WebBaseLoader('https://ashwinaravind.github.io/')
        docs = loader.load()

        embedding = OllamaEmbeddings(model='mxbai-embed-large:latest', base_url='http://localhost:11434')

        splitter = RecursiveCharacterTextSplitter(chunk_size=1600, chunk_overlap=200)
        splits = splitter.split_documents(docs)
        c = Chroma.from_documents(documents=splits, embedding=embedding, collection_name='testindex-ragbuilder')
        retrievers = []
        retriever = c.as_retriever(search_type='similarity', search_kwargs={'k': 5})
        retrievers.append(retriever)
        retriever = MergerRetriever(retrievers=retrievers)
        prompt = hub.pull("rlm/rag-prompt")
        rag_chain = (
            RunnableParallel(context=retriever, question=RunnablePassthrough())
            .assign(context=itemgetter("context") | RunnableLambda(format_docs))
            .assign(answer=prompt | llm | StrOutputParser())
            .pick(["answer", "context"]))
        return rag_chain
    except Exception as e:
        print(f"An error occurred: {e}")
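
For reference, a minimal usage sketch (not part of this commit; it assumes a local Ollama server with the llama3.1 and mxbai-embed-large models already pulled):

from byor.myrag import rag_pipeline

chain = rag_pipeline()
if chain is not None:  # rag_pipeline() returns None if construction failed
    result = chain.invoke("What topics does this site cover?")  # example question
    print(result["answer"])   # generated answer
    print(result["context"])  # newline-joined retrieved chunks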
143 changes: 135 additions & 8 deletions src/ragbuilder/executor.py
@@ -82,6 +82,8 @@
from langchain_postgres.vectorstores import PGVector
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder


# import local modules
from ragbuilder.langchain_module.retriever.retriever import *
@@ -127,9 +129,11 @@ def rag_builder_bayes_optmization(**kwargs):
logger.info(f"Initializing RAG parameter set...")
lc_templates.init(vectorDB, min_chunk_size, max_chunk_size, other_embedding, other_llm)
configs_to_run=dict()
configs_to_run= {1:{'ragname':'simple_rag'},2:{'ragname':'semantic_chunker'},3:{'ragname':'hyde'},4:{'ragname':'hybrid_rag'},4:{'ragname':'crag'}}
#TODO: Add a check to see if the templates are to be included

if kwargs['compare_templates']:
configs_to_run.update(top_n_templates)
# if kwargs['compare_templates']:
# configs_to_run.update(top_n_templates)

space = lc_templates.generate_config_space(exclude_elements=disabled_opts)
logger.info(f"Config space={space}")
@@ -144,13 +148,17 @@ def rag_builder_bayes_optmization(**kwargs):
    progress_state.set_total_runs(total_runs)

    # Run Templates first if templates have been selected
    # configs_to_run = {1: {'ragname': 'simple_rag'}}
    for key, val in configs_to_run.items():
        logger.info("SOTA Ragbuilder Initiated")
        progress_state.increment_progress()
        logger.info(f"Running: {progress_state.get_progress()['current_run']}/{progress_state.get_progress()['total_runs']}")
        logger.info(f"Template:{key}: {val['description']}:{val['retrieval_model']}")
        # logger.info(f"Template:{key}: {val['description']}:{val['retrieval_model']}")
        print(val)
        val['loader_kwargs'] = src_data
        val['run_id'] = run_id
        rag_builder = RagBuilder(val)
        rag_builder = sotaRAGBuilder(val)
        logger.info("SOTA Ragbuilder Class Initiated")
        run_config = RunConfig(timeout=RUN_CONFIG_TIMEOUT, max_workers=RUN_CONFIG_MAX_WORKERS, max_wait=RUN_CONFIG_MAX_WAIT, max_retries=RUN_CONFIG_MAX_RETRIES)
        # logger.info(f"{repr(run_config)}")
        # time.sleep(30)
@@ -245,8 +253,39 @@ def rag_builder(**kwargs):
    result = None
    configs_to_run = dict()

    if kwargs['compare_templates']:
        configs_to_run.update(top_n_templates)
    # if kwargs['compare_templates']:
    #     configs_to_run.update(top_n_templates)
    # Run Templates first if templates have been selected
    configs_to_run = {1: {'ragname': 'simple_rag'}, 2: {'ragname': 'semantic_chunker'}, 3: {'ragname': 'hyde'}, 4: {'ragname': 'hybrid_rag'}, 5: {'ragname': 'crag'}}

    # configs_to_run= {1:{'ragname':'simple_rag'},2:{'ragname':'semantic_chunker'},2:{'ragname':'hyde'}}
    for key, val in configs_to_run.items():
        logger.info("SOTA Ragbuilder Initiated")
        progress_state.increment_progress()
        logger.info(f"Running: {progress_state.get_progress()['current_run']}/{progress_state.get_progress()['total_runs']}")
        # logger.info(f"Template:{key}: {val['description']}:{val['retrieval_model']}")
        print(val)
        val['loader_kwargs'] = src_data
        val['run_id'] = run_id
        rag_builder = sotaRAGBuilder(val)
        logger.info("SOTA Ragbuilder Class Initiated")
        run_config = RunConfig(timeout=RUN_CONFIG_TIMEOUT, max_workers=RUN_CONFIG_MAX_WORKERS, max_wait=RUN_CONFIG_MAX_WAIT, max_retries=RUN_CONFIG_MAX_RETRIES)
        # logger.info(f"{repr(run_config)}")
        # time.sleep(30)
        # result=0
        logger.info(f"Evaluating RAG Config #{progress_state.get_progress()['current_run']}... (this may take a while)")
        rageval = eval.RagEvaluator(
            rag_builder,  # code for rag function
            test_ds,
            llm=get_model_obj('llm', eval_llm),
            embeddings=get_model_obj('embedding', eval_embedding),
            # TODO: Fetch Run Config settings from advanced settings on the front-end
            run_config=run_config,
            is_async=RUN_CONFIG_IS_ASYNC
        )
        result = rageval.evaluate()
        logger.info(f'progress_state={progress_state.get_progress()}')
    configs_to_run = dict()
    if kwargs['include_granular_combos']:
        logger.info(f"Initializing RAG parameter set...")
        lc_templates.init(vectorDB, min_chunk_size, max_chunk_size, other_embedding, other_llm)
@@ -277,9 +316,56 @@ def rag_builder(**kwargs):
            is_async=RUN_CONFIG_IS_ASYNC
        )
        result = rageval.evaluate()
        # logger.info(f'progress_state={progress_state.get_progress()}')
    # byor_ragbuilder()
    return result

from ragbuilder.rag_templates.sota.simple_rag import code as simple_rag
from ragbuilder.rag_templates.sota.semantic_chunker import code as semantic_chunker
from ragbuilder.rag_templates.sota.hyde import code as hyde
from ragbuilder.rag_templates.sota.hybrid_rag import code as hybrid_rag
class sotaRAGBuilder:
    def __init__(self, val):
        self.config = val
        self.run_id = val['run_id']
        self.loader_kwargs = val['loader_kwargs']
        logger.info(f"Sota Ragbuilder Invoked: {val['loader_kwargs']}")
        # output of the router is the generated code as a string
        if val['ragname'] == "simple_rag":
            logger.info("simple_rag initiated")
            self.router = rag.sota_code_mod(simple_rag, self.loader_kwargs['input_path'])
        if val['ragname'] == "semantic_chunker":
            logger.info("semantic_chunker initiated")
            self.router = rag.sota_code_mod(semantic_chunker, self.loader_kwargs['input_path'])
        if val['ragname'] == "hyde":
            logger.info("hyde initiated")
            self.router = rag.sota_code_mod(hyde, self.loader_kwargs['input_path'])
        if val['ragname'] == "hybrid_rag":
            logger.info("hybrid_rag initiated")
            self.router = rag.sota_code_mod(hybrid_rag, self.loader_kwargs['input_path'])
        locals_dict = {}
        globals_dict = globals()

        logger.info("Creating RAG object from generated code... (this may take a while in some cases)")
        try:
            # execute the generated code string
            logger.info(f"Generated Code\n{self.router}")
            logger.info("sota rag execution initiated")
            exec(self.router, globals_dict, locals_dict)

            # hook the generated rag_pipeline() into evaluation
            self.rag = locals_dict['rag_pipeline']()
        except Exception as e:
            logger.error(f"Error invoking RAG. ERROR: {e}")

    def __repr__(self):
        try:
            json_config = json.dumps(self.config)
        except Exception as e:
            logger.error(f"Error serializing RAG config as JSON: {e}")
            logger.debug(f"self.config = {self.config}")
            raw_config = str(self.config).replace("'", '"')
            return json.dumps({"msg": "Failed to serialize RAG config", "raw_config": raw_config})
        return json_config
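
As a rough illustration (hypothetical values, not taken from the commit), the dict consumed by sotaRAGBuilder carries the template name plus the loader and run metadata filled in by the caller:

# 'input_path' and 'run_id' values below are made up for the example.
val = {
    'ragname': 'simple_rag',
    'loader_kwargs': {'input_path': 'https://example.com/docs'},
    'run_id': 12345,
}
builder = sotaRAGBuilder(val)
# builder.rag now holds the pipeline object returned by the generated
# rag_pipeline(); the builder itself is what gets passed to eval.RagEvaluator.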


class RagBuilder:
@@ -338,4 +424,45 @@ def __repr__(self):
            raw_config = str(self.config).replace("'", '"')
            return json.dumps({"msg": "Failed to serialize RAG config", "raw_config": raw_config})
        return json_config


def byor_ragbuilder(test_ds, eval_llm, eval_embedding):
    current_directory = os.getcwd()
    print(f"Current Working Directory: {current_directory}")
    folder_path = current_directory + '/byor'

    # Iterate over all files in the folder
    for filename in os.listdir(folder_path):
        # Only consider .py files
        print('filename', filename)
        if filename.endswith('.py') and filename != '__init__.py':
            module_name = filename[:-3]  # Strip .py extension
            module_path = os.path.join(folder_path, filename)

            # Dynamically load the module
            spec = importlib.util.spec_from_file_location(module_name, module_path)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)

            # Check if the module has a 'rag_pipeline' function and execute it
            if hasattr(module, 'rag_pipeline'):
                print(f"found rag_pipeline() from {module_name}")
                logger.info("BYOR Ragbuilder Initiated")
                progress_state.increment_progress()
                logger.info(f"Running: {progress_state.get_progress()['current_run']}/{progress_state.get_progress()['total_runs']}")
                # logger.info(f"Template:{key}: {val['description']}:{val['retrieval_model']}")
                logger.info("BYOR Ragbuilder Class Initiated")
                run_config = RunConfig(timeout=RUN_CONFIG_TIMEOUT, max_workers=RUN_CONFIG_MAX_WORKERS, max_wait=RUN_CONFIG_MAX_WAIT, max_retries=RUN_CONFIG_MAX_RETRIES)
                logger.info(f"Evaluating RAG Config #{progress_state.get_progress()['current_run']}... (this may take a while)")
                rag_builder = module.rag_pipeline()
                rageval = eval.RagEvaluator(
                    rag_builder,  # code for rag function
                    test_ds,
                    llm=get_model_obj('llm', eval_llm),
                    embeddings=get_model_obj('embedding', eval_embedding),
                    # TODO: Fetch Run Config settings from advanced settings on the front-end
                    run_config=run_config,
                    is_async=RUN_CONFIG_IS_ASYNC
                )
                result = rageval.evaluate()
                logger.info(f'progress_state={progress_state.get_progress()}')

18 changes: 17 additions & 1 deletion src/ragbuilder/langchain_module/rag/getCode.py
@@ -119,4 +119,20 @@ def format_docs(docs):
""".format(code_text.replace('\n', '\n '))
logger.info(f"Codegen completed")
return function_code
return function_code

def is_docker():
    """Check if the code is running inside a Docker container."""
    path = '/.dockerenv'
    return os.path.exists(path)

def sota_code_mod(code, input_path):
    print("input_path", input_path)
    docs = ragbuilder_loader(input_path=input_path)
    code_string = docs['code_string'].replace("\n", '\n ')
    print("code_string", code_string)
    # Point the template at the right Ollama endpoint for the environment
    if is_docker():
        code_string_docker = code.replace("BASE_URL", "http://host.docker.internal:11434/")
    else:
        code_string_docker = code.replace("BASE_URL", "http://localhost:11434")
    # Splice the generated loader code into the template's {loader_class} placeholder
    codmod = code_string_docker.replace("{loader_class}", code_string)
    return codmod
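
To make the substitution concrete, here is a toy sketch (the template text is invented for illustration; only the BASE_URL and {loader_class} placeholders come from the function above):

# hypothetical SOTA template string, normally imported from
# ragbuilder.rag_templates.sota.*
template = (
    "def rag_pipeline():\n"
    "    llm = Ollama(model='llama3.1:latest', base_url='BASE_URL')\n"
    "    {loader_class}\n"
    "    # ... build retriever and chain from docs ...\n"
)
# sota_code_mod() swaps BASE_URL for the local or in-Docker Ollama endpoint
# and splices the generated loader code in place of {loader_class}.
generated = sota_code_mod(template, input_path='https://example.com/docs')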
Empty file.
40 changes: 40 additions & 0 deletions src/ragbuilder/rag_templates/byor/myrag.py
@@ -0,0 +1,40 @@
from langchain_community.llms import Ollama
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from operator import itemgetter
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
from langchain.retrievers import MergerRetriever
from langchain.retrievers.document_compressors import DocumentCompressorPipeline

def rag_pipeline():
    try:
        def format_docs(docs):
            # join retrieved documents with real newlines
            return "\n".join(doc.page_content for doc in docs)

        llm = Ollama(model='llama3.1:latest', base_url='http://localhost:11434')

        loader = WebBaseLoader('https://ashwinaravind.github.io/')
        docs = loader.load()

        embedding = OllamaEmbeddings(model='mxbai-embed-large:latest', base_url='http://localhost:11434')

        splitter = RecursiveCharacterTextSplitter(chunk_size=1600, chunk_overlap=200)
        splits = splitter.split_documents(docs)
        c = Chroma.from_documents(documents=splits, embedding=embedding, collection_name='testindex-ragbuilder')
        retrievers = []
        retriever = c.as_retriever(search_type='similarity', search_kwargs={'k': 5})
        retrievers.append(retriever)
        retriever = MergerRetriever(retrievers=retrievers)
        prompt = hub.pull("rlm/rag-prompt")
        rag_chain = (
            RunnableParallel(context=retriever, question=RunnablePassthrough())
            .assign(context=itemgetter("context") | RunnableLambda(format_docs))
            .assign(answer=prompt | llm | StrOutputParser())
            .pick(["answer", "context"]))
        return rag_chain
    except Exception as e:
        print(f"An error occurred: {e}")
21 changes: 21 additions & 0 deletions src/ragbuilder/rag_templates/sota/__init__.py
@@ -0,0 +1,21 @@
import os
import importlib

# Get the directory of this file
module_dir = os.path.dirname(__file__)

# Track whether any modules are found and imported
modules_found = False

# Iterate over all files in the directory
for filename in os.listdir(module_dir):
    # Only consider .py files and skip __init__.py itself
    if filename.endswith('.py') and filename != '__init__.py':
        module_name = filename[:-3]  # Strip .py extension
        # Dynamically import the module
        importlib.import_module(f'.{module_name}', package=__name__)
        modules_found = True  # Mark that at least one module was found

# If no modules were found, print a message or handle as needed
if not modules_found:
    print("No Python modules found in the 'sota' templates folder.")
