Refactor/namings (#76)
* `module_type` -> `module_name`

* `from_datasets` -> `from_hub`

* stage progress on `prediction` -> `decision`

* stage progress on `Predictor` -> `Decision`

* finish renaming to decision

* stage progress on `retrieval` -> `embedding`
voorhs authored Dec 9, 2024
1 parent 3fcf43c commit ecad794
Showing 58 changed files with 444 additions and 456 deletions.
18 changes: 9 additions & 9 deletions autointent/_datafiles/default-multiclass-config.yaml
@@ -1,27 +1,27 @@
 # TODO: make up a better and more versatile config
-- node_type: retrieval
+- node_type: embedding
   metric: retrieval_hit_rate
   search_space:
-    - module_type: vector_db
+    - module_name: retrieval
       k: [10]
       embedder_name:
         - avsolatorio/GIST-small-Embedding-v0
         - infgrad/stella-base-en-v2
 - node_type: scoring
   metric: scoring_roc_auc
   search_space:
-    - module_type: knn
+    - module_name: knn
       k: [1, 3, 5, 10]
       weights: ["uniform", "distance", "closest"]
-    - module_type: linear
-    - module_type: dnnc
+    - module_name: linear
+    - module_name: dnnc
       cross_encoder_name:
         - BAAI/bge-reranker-base
         - cross-encoder/ms-marco-MiniLM-L-6-v2
       k: [1, 3, 5, 10]
-- node_type: prediction
-  metric: prediction_accuracy
+- node_type: decision
+  metric: decision_accuracy
   search_space:
-    - module_type: threshold
+    - module_name: threshold
       thresh: [0.5]
-    - module_type: argmax
+    - module_name: argmax
16 changes: 8 additions & 8 deletions autointent/_datafiles/default-multilabel-config.yaml
@@ -1,21 +1,21 @@
 # TODO: make up a better and more versatile config
-- node_type: retrieval
+- node_type: embedding
   metric: retrieval_hit_rate_intersecting
   search_space:
-    - module_type: vector_db
+    - module_name: retrieval
       k: [10]
       embedder_name:
         - deepvk/USER-bge-m3
 - node_type: scoring
   metric: scoring_roc_auc
   search_space:
-    - module_type: knn
+    - module_name: knn
       k: [3]
       weights: ["uniform", "distance", "closest"]
-    - module_type: linear
-- node_type: prediction
-  metric: prediction_accuracy
+    - module_name: linear
+- node_type: decision
+  metric: decision_accuracy
   search_space:
-    - module_type: threshold
+    - module_name: threshold
       thresh: [0.5]
-    - module_type: adaptive
+    - module_name: adaptive
10 changes: 5 additions & 5 deletions autointent/_datafiles/inference-config-example.yaml
@@ -1,17 +1,17 @@
-- node_type: retrieval
-  module_type: vector_db
+- node_type: embedding
+  module_name: retrieval
   module_config:
     k: 10
     model_name: infgrad/stella-base-en-v2
   load_path: .
 - node_type: scoring
-  module_type: knn
+  module_name: knn
   module_config:
     k: 10
     weights: uniform
   load_path: .
-- node_type: prediction
-  module_type: threshold
+- node_type: decision
+  module_name: threshold
   module_config:
     thresh: 0.5
   load_path: .
2 changes: 1 addition & 1 deletion autointent/_dataset/_dataset.py
@@ -98,7 +98,7 @@ def from_dict(cls, mapping: dict[str, Any]) -> "Dataset":
         return DictReader().read(mapping)
 
     @classmethod
-    def from_datasets(cls, repo_id: str) -> "Dataset":
+    def from_hub(cls, repo_id: str) -> "Dataset":
         """
         Load a dataset from a Hugging Face repository.
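For callers, only the classmethod name changes here. A minimal sketch, assuming the `Dataset` class is re-exported from the top-level `autointent` package:

from autointent import Dataset  # import path assumed for illustration

# before this commit:
# dataset = Dataset.from_datasets("AutoIntent/clinc150_subset")

# after this commit:
dataset = Dataset.from_hub("AutoIntent/clinc150_subset")  # load from a Hugging Face repository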
4 changes: 2 additions & 2 deletions autointent/_pipeline/_pipeline.py
@@ -179,7 +179,7 @@ def predict(self, utterances: list[str]) -> npt.NDArray[Any]:
             raise RuntimeError(msg)
 
         scores = self.nodes[NodeType.scoring].module.predict(utterances)  # type: ignore[union-attr]
-        return self.nodes[NodeType.prediction].module.predict(scores)  # type: ignore[union-attr]
+        return self.nodes[NodeType.decision].module.predict(scores)  # type: ignore[union-attr]
 
     def predict_with_metadata(self, utterances: list[str]) -> InferencePipelineOutput:
         """
@@ -193,7 +193,7 @@ def predict_with_metadata(self, utterances: list[str]) -> InferencePipelineOutput:
             raise RuntimeError(msg)
 
         scores, scores_metadata = self.nodes[NodeType.scoring].module.predict_with_metadata(utterances)  # type: ignore[union-attr]
-        predictions = self.nodes[NodeType.prediction].module.predict(scores)  # type: ignore[union-attr]
+        predictions = self.nodes[NodeType.decision].module.predict(scores)  # type: ignore[union-attr]
         regexp_predictions, regexp_predictions_metadata = None, None
         if NodeType.regexp in self.nodes:
             regexp_predictions, regexp_predictions_metadata = self.nodes[NodeType.regexp].module.predict_with_metadata(  # type: ignore[union-attr]
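Downstream, the decision node replaces the prediction node as the final inference stage. A hedged usage sketch, assuming `pipeline` is an already fitted pipeline instance (construction is not part of this diff):

# `pipeline` is assumed to be a fitted AutoIntent pipeline object
labels = pipeline.predict(["can you help me book a flight?"])  # scoring node, then decision node

output = pipeline.predict_with_metadata(["can you help me book a flight?"])  # also returns per-node metadata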
2 changes: 1 addition & 1 deletion autointent/configs/_inference_node.py
@@ -12,7 +12,7 @@ class InferenceNodeConfig:
 
     node_type: NodeType
     """Type of the node. Should be one of the NODE_TYPES"""
-    module_type: str
+    module_name: str
     """Type of the module. Should be one of the Module"""
     module_config: dict[str, Any]
     """Configuration of the module"""
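An inference node is now addressed by `module_name` instead of `module_type`. A sketch of the renamed config, assuming `InferenceNodeConfig` keeps a plain field-by-field constructor and is importable from `autointent.configs` (field values are borrowed from the example YAML above):

from autointent.configs import InferenceNodeConfig  # import path assumed
from autointent.custom_types import NodeType

config = InferenceNodeConfig(
    node_type=NodeType.decision,
    module_name="threshold",        # was: module_type
    module_config={"thresh": 0.5},
    load_path=".",                  # assumed to still be a config field, as in the YAML example
)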
13 changes: 2 additions & 11 deletions autointent/context/_context.py
@@ -18,7 +18,7 @@
 from ._utils import NumpyEncoder, load_data
 from .data_handler import DataHandler
 from .optimization_info import OptimizationInfo
-from .vector_index_client import VectorIndex, VectorIndexClient
+from .vector_index_client import VectorIndexClient
 
 
 class Context:
@@ -96,15 +96,6 @@ def set_dataset(self, dataset: Dataset, force_multilabel: bool = False) -> None:
             random_seed=self.seed,
         )
 
-    def get_best_index(self) -> VectorIndex:
-        """
-        Retrieve the best vector index based on optimization results.
-
-        :return: Best vector index object.
-        """
-        model_name = self.optimization_info.get_best_embedder()
-        return self.vector_index_client.get_index(model_name)
-
     def get_inference_config(self) -> dict[str, Any]:
         """
         Generate configuration settings for inference.
@@ -237,5 +228,5 @@ def has_saved_modules(self) -> bool:
 
         :return: True if there are saved modules, False otherwise.
         """
-        node_types = ["regexp", "retrieval", "scoring", "prediction"]
+        node_types = ["regexp", "embedding", "scoring", "decision"]
         return any(len(self.optimization_info.modules.get(nt)) > 0 for nt in node_types)
6 changes: 3 additions & 3 deletions autointent/context/_utils.py
@@ -54,9 +54,9 @@ def load_data(filepath: str | Path) -> Dataset:
     :return: A `Dataset` object containing the loaded data.
     """
     if filepath == "default-multiclass":
-        return Dataset.from_datasets("AutoIntent/clinc150_subset")
+        return Dataset.from_hub("AutoIntent/clinc150_subset")
     if filepath == "default-multilabel":
-        return Dataset.from_datasets("AutoIntent/clinc150_subset").to_multilabel().encode_labels()
+        return Dataset.from_hub("AutoIntent/clinc150_subset").to_multilabel().encode_labels()
     if not Path(filepath).exists():
-        return Dataset.from_datasets(str(filepath))
+        return Dataset.from_hub(str(filepath))
     return Dataset.from_json(filepath)
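Read off the diff above, `load_data` now resolves its argument in four ways (the example arguments are hypothetical):

# dispatch behavior of load_data (autointent/context/_utils.py) after this commit
load_data("default-multiclass")    # -> Dataset.from_hub("AutoIntent/clinc150_subset")
load_data("default-multilabel")    # -> the same subset, via .to_multilabel().encode_labels()
load_data("SomeOrg/some-dataset")  # a path that does not exist locally is treated as a Hub repo id
load_data("data/intents.json")     # an existing local file is read with Dataset.from_json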
4 changes: 2 additions & 2 deletions autointent/context/optimization_info/__init__.py
@@ -1,4 +1,4 @@
-from ._data_models import Artifact, PredictorArtifact, RetrieverArtifact, ScorerArtifact
+from ._data_models import Artifact, DecisionArtifact, RetrieverArtifact, ScorerArtifact
 from ._optimization_info import OptimizationInfo
 
-__all__ = ["Artifact", "OptimizationInfo", "PredictorArtifact", "RetrieverArtifact", "ScorerArtifact"]
+__all__ = ["Artifact", "DecisionArtifact", "OptimizationInfo", "RetrieverArtifact", "ScorerArtifact"]
28 changes: 14 additions & 14 deletions autointent/context/optimization_info/_data_models.py
@@ -23,9 +23,9 @@ class RegexpArtifact(Artifact):
 
 class RetrieverArtifact(Artifact):
     """
-    Artifact containing details from the retrieval node.
+    Artifact containing details from the embedding node.
 
-    Name of the embedding model chosen after retrieval optimization.
+    Name of the embedding model chosen after embedding optimization.
     """
 
     embedder_name: str
@@ -48,7 +48,7 @@ class ScorerArtifact(Artifact):
     )
 
 
-class PredictorArtifact(Artifact):
+class DecisionArtifact(Artifact):
     """
     Artifact containing outputs from the predictor node.
@@ -68,9 +68,9 @@ def validate_node_name(value: str) -> str:
     :return: Validated node type string.
     :raises ValueError: If the node type is invalid.
     """
-    if value in [NodeType.retrieval, NodeType.scoring, NodeType.prediction, NodeType.regexp]:
+    if value in [NodeType.embedding, NodeType.scoring, NodeType.decision, NodeType.regexp]:
         return value
-    msg = f"Unknown node_type: {value}. Expected one of ['regexp', 'retrieval', 'scoring', 'prediction']"
+    msg = f"Unknown node_type: {value}. Expected one of ['regexp', 'embedding', 'scoring', 'decision']"
     raise ValueError(msg)
@@ -84,9 +84,9 @@ class Artifacts(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     regexp: list[RegexpArtifact] = []
-    retrieval: list[RetrieverArtifact] = []
+    embedding: list[RetrieverArtifact] = []
     scoring: list[ScorerArtifact] = []
-    prediction: list[PredictorArtifact] = []
+    decision: list[DecisionArtifact] = []
 
     def add_artifact(self, node_type: str, artifact: Artifact) -> None:
         """
@@ -120,7 +120,7 @@ def get_best_artifact(self, node_type: str, idx: int) -> Artifact:
 class Trial(BaseModel):
     """Representation of an individual optimization trial."""
 
-    module_type: str
+    module_name: str
     """Type of the module being optimized."""
     module_params: dict[str, Any]
     """Parameters of the module for the trial."""
@@ -136,9 +136,9 @@ class Trials(BaseModel):
     """Container for managing optimization trials for pipeline nodes."""
 
     regexp: list[Trial] = []
-    retrieval: list[Trial] = []
+    embedding: list[Trial] = []
     scoring: list[Trial] = []
-    prediction: list[Trial] = []
+    decision: list[Trial] = []
 
     def get_trial(self, node_type: str, idx: int) -> Trial:
         """
@@ -174,12 +174,12 @@ class TrialsIds(BaseModel):
 
     regexp: int | None = None
     """Best trial index for the regexp node."""
-    retrieval: int | None = None
-    """Best trial index for the retrieval node."""
+    embedding: int | None = None
+    """Best trial index for the embedding node."""
     scoring: int | None = None
     """Best trial index for the scoring"""
-    prediction: int | None = None
-    """Best trial index for the prediction node."""
+    decision: int | None = None
+    """Best trial index for the decision node."""
 
     def get_best_trial_idx(self, node_type: str) -> int | None:
         """
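The validator now accepts the new node names and rejects the old ones; the behavior follows directly from the code above:

validate_node_name("decision")    # returns "decision"
validate_node_name("embedding")   # returns "embedding"
validate_node_name("prediction")  # raises ValueError: unknown node_type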
18 changes: 9 additions & 9 deletions autointent/context/optimization_info/_optimization_info.py
@@ -1,6 +1,6 @@
 """Module for managing pipeline optimization.
 
-This module handles the tracking, logging, and retrieval of optimization artifacts,
+This module handles the tracking and logging of optimization artifacts,
 trials, and modules during the pipeline's execution.
 """
@@ -25,15 +25,15 @@ class ModulesList:
     """Container for managing lists of modules for each node type."""
 
     regexp: list["Module"] = field(default_factory=list)
-    retrieval: list["Module"] = field(default_factory=list)
+    embedding: list["Module"] = field(default_factory=list)
     scoring: list["Module"] = field(default_factory=list)
-    prediction: list["Module"] = field(default_factory=list)
+    decision: list["Module"] = field(default_factory=list)
 
     def get(self, node_type: str) -> list["Module"]:
         """
         Retrieve the list of modules for a specific node type.
 
-        :param node_type: The type of node (e.g., "regexp", "retrieval").
+        :param node_type: The type of node (e.g., "regexp", "embedding").
         :return: List of modules for the specified node type.
         """
         return getattr(self, node_type)  # type: ignore[no-any-return]
@@ -69,7 +69,7 @@ def __init__(self) -> None:
     def log_module_optimization(
         self,
         node_type: str,
-        module_type: str,
+        module_name: str,
         module_params: dict[str, Any],
         metric_value: float,
         metric_name: str,
@@ -81,7 +81,7 @@ def log_module_optimization(
         Log optimization results for a module.
 
         :param node_type: Type of the node being optimized.
-        :param module_type: Type of the module.
+        :param module_name: Type of the module.
         :param module_params: Parameters of the module for the trial.
         :param metric_value: Metric value achieved by the module.
         :param metric_name: Name of the evaluation metric.
@@ -90,7 +90,7 @@ def log_module_optimization(
         :param module: The module instance, if available.
         """
         trial = Trial(
-            module_type=module_type,
+            module_name=module_name,
            metric_name=metric_name,
            metric_value=metric_value,
            module_params=module_params,
@@ -144,7 +144,7 @@ def get_best_embedder(self) -> str:
 
         :return: Name of the best embedder.
         """
-        best_retriever_artifact: RetrieverArtifact = self._get_best_artifact(node_type=NodeType.retrieval)  # type: ignore[assignment]
+        best_retriever_artifact: RetrieverArtifact = self._get_best_artifact(node_type=NodeType.embedding)  # type: ignore[assignment]
         return best_retriever_artifact.embedder_name
 
     def get_best_train_scores(self) -> NDArray[np.float64] | None:
@@ -214,7 +214,7 @@ def get_inference_nodes_config(self, asdict: bool = False) -> list[InferenceNode
             trial = self.trials.get_trial(node_type, idx)
             item = {
                 "node_type": node_type.value,
-                "module_type": trial.module_type,
+                "module_name": trial.module_name,
                 "module_config": trial.module_params,
                 "load_path": trial.module_dump_dir,
             }
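Each entry emitted by `get_inference_nodes_config` now carries a `module_name` key. Illustratively, with values borrowed from the example inference config (the dump path is hypothetical):

# shape of one generated inference-node entry after this commit
item = {
    "node_type": "decision",
    "module_name": "threshold",               # was: "module_type"
    "module_config": {"thresh": 0.5},
    "load_path": "runs/decision/threshold",   # hypothetical module dump directory
}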
4 changes: 2 additions & 2 deletions autointent/custom_types.py
@@ -46,9 +46,9 @@ class NodeType(str, Enum):
     """Enumeration of node types in the AutoIntent pipeline."""
 
     regexp = "regexp"
-    retrieval = "retrieval"
+    embedding = "embedding"
     scoring = "scoring"
-    prediction = "prediction"
+    decision = "decision"
 
 
 class Split:
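Because `NodeType` is a `str` enum, the renamed members still compare equal to their plain string values, which is what `validate_node_name` above relies on:

from autointent.custom_types import NodeType

assert NodeType.decision == "decision"              # str-enum members equal their string values
assert NodeType("embedding") is NodeType.embedding  # lookup by value still works
# NodeType.retrieval and NodeType.prediction no longer exist after this commit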