Skip to content

Commit

Permalink
Defaults handling
Browse files Browse the repository at this point in the history
  • Loading branch information
aravind10x committed Dec 28, 2024
1 parent d848fc6 commit e036c67
Show file tree
Hide file tree
Showing 8 changed files with 2,261 additions and 153 deletions.
2,057 changes: 2,057 additions & 0 deletions rag_prompts.yml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/ragbuilder/config/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class EvalDataGenerationConfig(BaseModel):
class OptimizationConfig(BaseModel):
"""Optimization settings"""
type: Optional[str] = "Optuna"
n_trials: Optional[int] = Field(default=10, description="Number of trials for optimization")
n_trials: Optional[int] = Field(default=None, description="Number of trials for optimization")
n_jobs: Optional[int] = Field(default=1, description="Number of jobs for optimization")
timeout: Optional[int] = Field(default=None, description="Timeout for optimization")
storage: Optional[str] = Field(default="sqlite:///eval.db", description="Storage URL for Optuna (e.g., 'sqlite:///optuna.db')")
Expand Down
124 changes: 64 additions & 60 deletions src/ragbuilder/config/data_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,53 +64,67 @@ class DataIngestOptionsConfig(BaseModel):
- etc.
"""
input_source: Union[str, List[str]] = Field(..., description="File path, directory path, or URL for input data")
document_loaders: Optional[List[LoaderConfig]] = Field(
default_factory=lambda: [LoaderConfig(type=ParserType.UNSTRUCTURED)],
description="Document loader configurations"
)
chunking_strategies: Optional[List[ChunkingStrategyConfig]] = Field(
default_factory=lambda: [ChunkingStrategyConfig(type=ChunkingStrategy.RECURSIVE)],
description="Chunking strategies to try"
)
chunk_size: Optional[ChunkSizeConfig] = Field(default_factory=ChunkSizeConfig, description="Chunk size configuration")
chunk_overlap: Optional[List[int]] = Field(default=[100], description="List of chunk overlap values to try")
embedding_models: Optional[List[EmbeddingConfig]] = Field(default_factory=list, description="List of embedding models")
vector_databases: Optional[List[VectorDBConfig]] = Field(
# default_factory=lambda: [VectorDBConfig(type=VectorDatabase.FAISS, vectordb_kwargs={})],
default_factory=lambda: [VectorDBConfig(type=VectorDatabase.CHROMA, vectordb_kwargs={'collection_metadata': {'hnsw:space': 'cosine'}, 'persist_directory': './chroma'})],
description="List of vector databases"
)
document_loaders: Optional[List[LoaderConfig]] = Field(default=None, description="Document loader configurations")
chunking_strategies: Optional[List[ChunkingStrategyConfig]] = Field(default=None, description="Chunking strategies to try")
chunk_size: Optional[ChunkSizeConfig] = Field(default=None, description="Chunk size configuration")
chunk_overlap: Optional[List[int]] = Field(default=None, description="List of chunk overlap values to try")
embedding_models: Optional[List[EmbeddingConfig]] = Field(default=None, description="List of embedding models")
vector_databases: Optional[List[VectorDBConfig]] = Field(default=None, description="List of vector databases")
sampling_rate: Optional[float] = Field(default=None, description="Sampling rate for documents (0.0 to 1.0). None or 1.0 means no sampling.")
optimization: Optional[OptimizationConfig] = Field(default_factory=OptimizationConfig, description="Optimization configuration")
# log_config: Optional[LogConfig] = Field(default_factory=LogConfig, description="Logging configuration")
database_logging: Optional[bool] = Field(default=True, description="Whether to log results to the DB")
database_path: Optional[str] = Field(default="eval.db", description="Path to the SQLite database file")
evaluation_config: EvaluationConfig = Field(
default_factory=lambda: EvaluationConfig(
type=EvaluatorType.SIMILARITY,
evaluator_kwargs={
"top_k": 5,
"position_weights": None,
"relevance_threshold": 0.75
}
),
description="Evaluation configuration"
)
optimization: Optional[OptimizationConfig] = Field(default=None, description="Optimization configuration")
database_logging: Optional[bool] = Field(default=None, description="Whether to log results to the DB")
database_path: Optional[str] = Field(default=None, description="Path to the SQLite database file")
evaluation_config: Optional[EvaluationConfig] = Field(default=None, description="Evaluation configuration")
graph: Optional[GraphConfig] = Field(default=None, description="Graph configuration")
metadata: Optional[ConfigMetadata] = Field(
default_factory=ConfigMetadata,
description="Metadata about the configuration"
)
metadata: Optional[ConfigMetadata] = Field(default=None, description="Metadata about the configuration")

def apply_defaults(self) -> None:
    """Fill in optimization settings that were left unset.

    Creates an ``OptimizationConfig`` when none was supplied, then takes the
    trial count from ``ConfigStore`` if it is still ``None``.
    """
    if self.optimization is None:
        self.optimization = OptimizationConfig()

    # Re-read the attribute so the fallback lands on whatever object the
    # model actually stored.
    settings = self.optimization
    if settings.n_trials is None:
        settings.n_trials = ConfigStore().get_default_n_trials()

def model_post_init(self, __context: Any) -> None:
"""Post initialization processing"""
if not self.embedding_models:
self.embedding_models = [
EmbeddingConfig(
type=EmbeddingType.HUGGINGFACE,
model_kwargs={"model_name": "mixedbread-ai/mxbai-embed-large-v1"}
)
]
self.embedding_models = [ConfigStore().get_default_embeddings()]

if self.document_loaders is None:
self.document_loaders = [LoaderConfig(type=ParserType.UNSTRUCTURED)]

if self.chunking_strategies is None:
self.chunking_strategies = [ChunkingStrategyConfig(type=ChunkingStrategy.RECURSIVE)]

if self.chunk_size is None:
self.chunk_size = ChunkSizeConfig()

if self.chunk_overlap is None:
self.chunk_overlap = [100]

if self.vector_databases is None:
self.vector_databases = [VectorDBConfig(
type=VectorDatabase.CHROMA,
vectordb_kwargs={'collection_metadata': {'hnsw:space': 'cosine'}, 'persist_directory': './chroma'}
)]

if self.database_logging is None:
self.database_logging = True

if self.database_path is None:
self.database_path = "eval.db"

if self.evaluation_config is None:
self.evaluation_config = EvaluationConfig(
type=EvaluatorType.SIMILARITY,
evaluator_kwargs={
"top_k": 5,
"position_weights": None,
"relevance_threshold": 0.75
}
)

if self.metadata is None:
self.metadata = ConfigMetadata()

@classmethod
def with_defaults(cls, input_source: str, test_dataset: Optional[str] = None) -> 'DataIngestOptionsConfig':
Expand Down Expand Up @@ -168,23 +182,13 @@ class DataIngestConfig(BaseModel):
- etc.
"""
input_source: Union[str, List[str]] = Field(..., description="File path, directory path, or URL for input data")
document_loader: LoaderConfig = Field(
default_factory=lambda: LoaderConfig(type=ParserType.UNSTRUCTURED),
description="Document loader configuration"
)
chunking_strategy: ChunkingStrategyConfig = Field(default_factory=lambda: ChunkingStrategyConfig(type=ChunkingStrategy.RECURSIVE), description="Chunking strategy")
chunk_size: int = Field(default=1000, description="Chunk size")
chunk_overlap: int = Field(default=100, description="Chunk overlap")
embedding_model: EmbeddingConfig = Field(
# default_factory=lambda: EmbeddingConfig(type=EmbeddingModel.HUGGINGFACE, model_kwargs={"model_name": "mixedbread-ai/mxbai-embed-large-v1"}), #model_kwargs={"model_name": "sentence-transformers/all-MiniLM-L6-v2"}),
default_factory=lambda: EmbeddingConfig(type=EmbeddingType.OPENAI, model_kwargs={"model_name": "text-embedding-3-large"}), #model_kwargs={"model_name": "sentence-transformers/all-MiniLM-L6-v2"}),
description="Embedding model configuration"
)
vector_database: VectorDBConfig = Field(
default_factory=lambda: VectorDBConfig(type=VectorDatabase.FAISS, vectordb_kwargs={}),
description="Vector store configuration"
)
sampling_rate: Optional[float] = Field(default=None, description="Sampling rate for documents (0.0 to 1.0). None or 1.0 means no sampling.")
document_loader: LoaderConfig
chunking_strategy: ChunkingStrategyConfig
chunk_size: int
chunk_overlap: int
embedding_model: EmbeddingConfig
vector_database: VectorDBConfig
sampling_rate: Optional[float] = None

def load_config(file_path: str) -> Union[DataIngestOptionsConfig, DataIngestConfig]:
with open(file_path, 'r') as file:
Expand Down
52 changes: 40 additions & 12 deletions src/ragbuilder/config/generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,22 +90,50 @@ class GenerationConfig(BaseConfig):
prompt_key: Optional[str] = None

class GenerationOptionsConfig(BaseConfig):
llms: List[LLMConfig] # List of LLM configurations
llms: Optional[List[LLMConfig]] = None
prompt_template_path: Optional[str] = None
eval_data_set_path: Optional[str] = None
local_prompt_template_path: Optional[str] = None
read_local_only: Optional[bool] = False
read_local_only: Optional[bool] = None
retriever: Optional[Any]=None
database_logging: Optional[bool] = Field(default=True, description="Whether to log results to the DB")
database_path: Optional[str] = Field(default="eval.db", description="Path to the SQLite database file")
optimization: Optional[OptimizationConfig] = Field(
default_factory=OptimizationConfig,
description="Optimization configuration"
)
evaluation_config: Optional[EvaluationConfig] = Field(
default_factory=lambda: EvaluationConfig(type=EvaluatorType.RAGAS),
description="Evaluation configuration"
)
database_logging: Optional[bool] = Field(default=None, description="Whether to log results to the DB")
database_path: Optional[str] = Field(default=None, description="Path to the SQLite database file")
optimization: Optional[OptimizationConfig] = Field(default=None, description="Optimization configuration")
evaluation_config: Optional[EvaluationConfig] = Field(default=None, description="Evaluation configuration")
metadata: Optional[ConfigMetadata] = Field(default=None, description="Metadata about the configuration")

def apply_defaults(self) -> None:
    """Populate every generation option that is still ``None``.

    Values that were set explicitly are left untouched. The trial count and
    the LLM come from ``ConfigStore``; the remaining options fall back to
    fixed values (``read_local_only=False``, ``database_logging=True``,
    ``database_path="eval.db"``, a RAGAS evaluation config and an empty
    ``ConfigMetadata``).
    """
    if self.optimization is None:
        self.optimization = OptimizationConfig()

    if self.optimization.n_trials is None:
        self.optimization.n_trials = ConfigStore().get_default_n_trials()

    if self.llms is None:
        self.llms = [ConfigStore().get_default_llm()]

    if self.read_local_only is None:
        self.read_local_only = False

    if self.database_logging is None:
        self.database_logging = True

    if self.database_path is None:
        self.database_path = "eval.db"

    if self.evaluation_config is None:
        # Fall back to RAGAS rather than the similarity evaluator, whose
        # top_k / relevance_threshold kwargs are the data-ingest defaults.
        self.evaluation_config = EvaluationConfig(type=EvaluatorType.RAGAS)

    if self.metadata is None:
        self.metadata = ConfigMetadata()

def model_post_init(self, __context: Any) -> None:
if not self.llms:
Expand Down
69 changes: 37 additions & 32 deletions src/ragbuilder/config/retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,29 +24,40 @@ class RerankerConfig(BaseModel):

class RetrievalOptionsConfig(BaseModel):
"""Configuration for retriever optimization options"""
retrievers: List[BaseRetrieverConfig] = Field(
default_factory=lambda: [BaseRetrieverConfig(type=RetrieverType.VECTOR_SIMILARITY)],
description="List of retrievers to try"
)
rerankers: Optional[List[RerankerConfig]] = Field(
default_factory=list,
description="List of rerankers to try"
)
top_k: List[int] = Field(
default=[3, 5, 10],
description="Final number of documents to return after all processing"
)
# log_config: Optional[LogConfig] = Field(default_factory=LogConfig, description="Logging configuration")
database_logging: Optional[bool] = Field(default=True, description="Whether to log results to the DB")
database_path: Optional[str] = Field(default="eval.db", description="Path to the SQLite database file")
optimization: Optional[OptimizationConfig] = Field(
default_factory=OptimizationConfig,
description="Optimization configuration"
)
evaluation_config: Optional[EvaluationConfig] = Field(
default_factory=lambda: EvaluationConfig(type=EvaluatorType.RAGAS),
description="Evaluation configuration"
)
retrievers: List[BaseRetrieverConfig] = Field(default=None, description="List of retrievers to try")
rerankers: Optional[List[RerankerConfig]] = Field(default=None, description="List of rerankers to try")
top_k: List[int] = Field(default=None, description="Final number of documents to return after all processing")
database_logging: Optional[bool] = Field(default=None, description="Whether to log results to the DB")
database_path: Optional[str] = Field(default=None, description="Path to the SQLite database file")
optimization: Optional[OptimizationConfig] = Field(default=None, description="Optimization configuration")
evaluation_config: Optional[EvaluationConfig] = Field(default=None, description="Evaluation configuration")
metadata: Optional[ConfigMetadata] = None

def apply_defaults(self) -> None:
    """Populate every retrieval option that is still ``None``.

    Values that were set explicitly are left untouched. The trial count
    comes from ``ConfigStore``; the remaining options fall back to fixed
    values: a vector-similarity and a BM25 retriever (``retriever_k=[20]``
    each), a BGE-base reranker, ``top_k=[3, 5, 10]``,
    ``database_logging=True``, ``database_path="eval.db"``, a RAGAS
    evaluation config and an empty ``ConfigMetadata``.
    """
    if self.optimization is None:
        self.optimization = OptimizationConfig()

    if self.optimization.n_trials is None:
        self.optimization.n_trials = ConfigStore().get_default_n_trials()

    if self.retrievers is None:
        self.retrievers = [
            BaseRetrieverConfig(type=RetrieverType.VECTOR_SIMILARITY, retriever_k=[20]),
            BaseRetrieverConfig(type="bm25", retriever_k=[20]),
        ]

    if self.rerankers is None:
        self.rerankers = [RerankerConfig(type=RerankerType.BGE_BASE)]

    if self.top_k is None:
        self.top_k = [3, 5, 10]

    # Both fields are declared with default=None, so without these
    # fallbacks an unset config would carry None for the logging switch
    # and the database path.
    if self.database_logging is None:
        self.database_logging = True

    if self.database_path is None:
        self.database_path = "eval.db"

    if self.evaluation_config is None:
        self.evaluation_config = EvaluationConfig(type=EvaluatorType.RAGAS)

    if self.metadata is None:
        self.metadata = ConfigMetadata()

@classmethod
def with_defaults(cls) -> 'RetrievalOptionsConfig':
Expand Down Expand Up @@ -85,15 +96,9 @@ def with_defaults(cls) -> 'RetrievalOptionsConfig':
)

class RetrievalConfig(BaseModel):
retrievers: List[BaseRetrieverConfig] = Field(
default_factory=lambda: [BaseRetrieverConfig(type=RetrieverType.VECTOR_SIMILARITY)],
description="List of retrievers to try"
)
rerankers: Optional[List[RerankerConfig]] = Field(
default_factory=list,
description="List of rerankers to try"
)
top_k: int = Field(default=5, description="Number of top results to consider for similarity scoring")
retrievers: List[BaseRetrieverConfig]
rerankers: Optional[List[RerankerConfig]] = None
top_k: int

def load_config(file_path: str) -> Union[RetrievalOptionsConfig, BaseRetrieverConfig]:
with open(file_path, 'r') as file:
Expand Down
Loading

0 comments on commit e036c67

Please sign in to comment.