Refactor/namings (#76)
* `module_type` -> `module_name`

* `from_datasets` -> `from_hub`

* stage progress on `prediction` -> `decision`

* stage progress on `Predictor` -> `Decision`

* finish renaming to decision

* stage progress on `retrieval` -> `embedding`
voorhs authored Dec 9, 2024
1 parent 3fcf43c commit ecad794
Showing 58 changed files with 444 additions and 456 deletions.
18 changes: 9 additions & 9 deletions autointent/_datafiles/default-multiclass-config.yaml
@@ -1,27 +1,27 @@
 # TODO: make up a better and more versatile config
-- node_type: retrieval
+- node_type: embedding
   metric: retrieval_hit_rate
   search_space:
-    - module_type: vector_db
+    - module_name: retrieval
       k: [10]
       embedder_name:
         - avsolatorio/GIST-small-Embedding-v0
         - infgrad/stella-base-en-v2
 - node_type: scoring
   metric: scoring_roc_auc
   search_space:
-    - module_type: knn
+    - module_name: knn
       k: [1, 3, 5, 10]
       weights: ["uniform", "distance", "closest"]
-    - module_type: linear
-    - module_type: dnnc
+    - module_name: linear
+    - module_name: dnnc
       cross_encoder_name:
         - BAAI/bge-reranker-base
         - cross-encoder/ms-marco-MiniLM-L-6-v2
       k: [1, 3, 5, 10]
-- node_type: prediction
-  metric: prediction_accuracy
+- node_type: decision
+  metric: decision_accuracy
   search_space:
-    - module_type: threshold
+    - module_name: threshold
       thresh: [0.5]
-    - module_type: argmax
+    - module_name: argmax
16 changes: 8 additions & 8 deletions autointent/_datafiles/default-multilabel-config.yaml
@@ -1,21 +1,21 @@
 # TODO: make up a better and more versatile config
-- node_type: retrieval
+- node_type: embedding
   metric: retrieval_hit_rate_intersecting
   search_space:
-    - module_type: vector_db
+    - module_name: retrieval
       k: [10]
       embedder_name:
         - deepvk/USER-bge-m3
 - node_type: scoring
   metric: scoring_roc_auc
   search_space:
-    - module_type: knn
+    - module_name: knn
       k: [3]
       weights: ["uniform", "distance", "closest"]
-    - module_type: linear
-- node_type: prediction
-  metric: prediction_accuracy
+    - module_name: linear
+- node_type: decision
+  metric: decision_accuracy
   search_space:
-    - module_type: threshold
+    - module_name: threshold
       thresh: [0.5]
-    - module_type: adaptive
+    - module_name: adaptive
10 changes: 5 additions & 5 deletions autointent/_datafiles/inference-config-example.yaml
@@ -1,17 +1,17 @@
-- node_type: retrieval
-  module_type: vector_db
+- node_type: embedding
+  module_name: retrieval
   module_config:
     k: 10
     model_name: infgrad/stella-base-en-v2
   load_path: .
 - node_type: scoring
-  module_type: knn
+  module_name: knn
   module_config:
     k: 10
     weights: uniform
   load_path: .
-- node_type: prediction
-  module_type: threshold
+- node_type: decision
+  module_name: threshold
   module_config:
     thresh: 0.5
   load_path: .
2 changes: 1 addition & 1 deletion autointent/_dataset/_dataset.py
@@ -98,7 +98,7 @@ def from_dict(cls, mapping: dict[str, Any]) -> "Dataset":
         return DictReader().read(mapping)
 
     @classmethod
-    def from_datasets(cls, repo_id: str) -> "Dataset":
+    def from_hub(cls, repo_id: str) -> "Dataset":
         """
         Load a dataset from a Hugging Face repository.
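For callers, only the classmethod name changes here. A minimal sketch, assuming the `Dataset` class is re-exported from the top-level `autointent` package:

from autointent import Dataset  # import path assumed for illustration

# before this commit:
# dataset = Dataset.from_datasets("AutoIntent/clinc150_subset")

# after this commit:
dataset = Dataset.from_hub("AutoIntent/clinc150_subset")  # load from a Hugging Face repository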
4 changes: 2 additions & 2 deletions autointent/_pipeline/_pipeline.py
@@ -179,7 +179,7 @@ def predict(self, utterances: list[str]) -> npt.NDArray[Any]:
             raise RuntimeError(msg)
 
         scores = self.nodes[NodeType.scoring].module.predict(utterances)  # type: ignore[union-attr]
-        return self.nodes[NodeType.prediction].module.predict(scores)  # type: ignore[union-attr]
+        return self.nodes[NodeType.decision].module.predict(scores)  # type: ignore[union-attr]
 
     def predict_with_metadata(self, utterances: list[str]) -> InferencePipelineOutput:
         """
@@ -193,7 +193,7 @@ def predict_with_metadata(self, utterances: list[str]) -> InferencePipelineOutput:
             raise RuntimeError(msg)
 
         scores, scores_metadata = self.nodes[NodeType.scoring].module.predict_with_metadata(utterances)  # type: ignore[union-attr]
-        predictions = self.nodes[NodeType.prediction].module.predict(scores)  # type: ignore[union-attr]
+        predictions = self.nodes[NodeType.decision].module.predict(scores)  # type: ignore[union-attr]
         regexp_predictions, regexp_predictions_metadata = None, None
         if NodeType.regexp in self.nodes:
             regexp_predictions, regexp_predictions_metadata = self.nodes[NodeType.regexp].module.predict_with_metadata(  # type: ignore[union-attr]
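Downstream, the decision node replaces the prediction node as the final inference stage. A hedged usage sketch, assuming `pipeline` is an already fitted pipeline instance (construction is not part of this diff):

# `pipeline` is assumed to be a fitted AutoIntent pipeline object
labels = pipeline.predict(["can you help me book a flight?"])  # scoring node, then decision node

output = pipeline.predict_with_metadata(["can you help me book a flight?"])  # also returns per-node metadata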
2 changes: 1 addition & 1 deletion autointent/configs/_inference_node.py
@@ -12,7 +12,7 @@ class InferenceNodeConfig:
 
     node_type: NodeType
     """Type of the node. Should be one of the NODE_TYPES"""
-    module_type: str
+    module_name: str
     """Type of the module. Should be one of the Module"""
     module_config: dict[str, Any]
     """Configuration of the module"""
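An inference node is now addressed by `module_name` instead of `module_type`. A sketch of the renamed config, assuming `InferenceNodeConfig` keeps a plain field-by-field constructor and is importable from `autointent.configs` (field values are borrowed from the example YAML above):

from autointent.configs import InferenceNodeConfig  # import path assumed
from autointent.custom_types import NodeType

config = InferenceNodeConfig(
    node_type=NodeType.decision,
    module_name="threshold",        # was: module_type
    module_config={"thresh": 0.5},
    load_path=".",                  # assumed to still be a config field, as in the YAML example
)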
13 changes: 2 additions & 11 deletions autointent/context/_context.py
@@ -18,7 +18,7 @@
 from ._utils import NumpyEncoder, load_data
 from .data_handler import DataHandler
 from .optimization_info import OptimizationInfo
-from .vector_index_client import VectorIndex, VectorIndexClient
+from .vector_index_client import VectorIndexClient
 
 
 class Context:
@@ -96,15 +96,6 @@ def set_dataset(self, dataset: Dataset, force_multilabel: bool = False) -> None:
             random_seed=self.seed,
         )
 
-    def get_best_index(self) -> VectorIndex:
-        """
-        Retrieve the best vector index based on optimization results.
-
-        :return: Best vector index object.
-        """
-        model_name = self.optimization_info.get_best_embedder()
-        return self.vector_index_client.get_index(model_name)
-
     def get_inference_config(self) -> dict[str, Any]:
         """
         Generate configuration settings for inference.
@@ -237,5 +228,5 @@ def has_saved_modules(self) -> bool:
 
         :return: True if there are saved modules, False otherwise.
         """
-        node_types = ["regexp", "retrieval", "scoring", "prediction"]
+        node_types = ["regexp", "embedding", "scoring", "decision"]
         return any(len(self.optimization_info.modules.get(nt)) > 0 for nt in node_types)
6 changes: 3 additions & 3 deletions autointent/context/_utils.py
@@ -54,9 +54,9 @@ def load_data(filepath: str | Path) -> Dataset:
     :return: A `Dataset` object containing the loaded data.
     """
     if filepath == "default-multiclass":
-        return Dataset.from_datasets("AutoIntent/clinc150_subset")
+        return Dataset.from_hub("AutoIntent/clinc150_subset")
     if filepath == "default-multilabel":
-        return Dataset.from_datasets("AutoIntent/clinc150_subset").to_multilabel().encode_labels()
+        return Dataset.from_hub("AutoIntent/clinc150_subset").to_multilabel().encode_labels()
     if not Path(filepath).exists():
-        return Dataset.from_datasets(str(filepath))
+        return Dataset.from_hub(str(filepath))
     return Dataset.from_json(filepath)
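Read off the diff above, `load_data` now resolves its argument in four ways (the example arguments are hypothetical):

# dispatch behavior of load_data (autointent/context/_utils.py) after this commit
load_data("default-multiclass")    # -> Dataset.from_hub("AutoIntent/clinc150_subset")
load_data("default-multilabel")    # -> the same subset, via .to_multilabel().encode_labels()
load_data("SomeOrg/some-dataset")  # a path that does not exist locally is treated as a Hub repo id
load_data("data/intents.json")     # an existing local file is read with Dataset.from_json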
4 changes: 2 additions & 2 deletions autointent/context/optimization_info/__init__.py
@@ -1,4 +1,4 @@
-from ._data_models import Artifact, PredictorArtifact, RetrieverArtifact, ScorerArtifact
+from ._data_models import Artifact, DecisionArtifact, RetrieverArtifact, ScorerArtifact
 from ._optimization_info import OptimizationInfo
 
-__all__ = ["Artifact", "OptimizationInfo", "PredictorArtifact", "RetrieverArtifact", "ScorerArtifact"]
+__all__ = ["Artifact", "DecisionArtifact", "OptimizationInfo", "RetrieverArtifact", "ScorerArtifact"]
28 changes: 14 additions & 14 deletions autointent/context/optimization_info/_data_models.py
@@ -23,9 +23,9 @@ class RegexpArtifact(Artifact):
 
 class RetrieverArtifact(Artifact):
     """
-    Artifact containing details from the retrieval node.
+    Artifact containing details from the embedding node.
 
-    Name of the embedding model chosen after retrieval optimization.
+    Name of the embedding model chosen after embedding optimization.
     """
 
     embedder_name: str
@@ -48,7 +48,7 @@ class ScorerArtifact(Artifact):
     )
 
 
-class PredictorArtifact(Artifact):
+class DecisionArtifact(Artifact):
     """
     Artifact containing outputs from the predictor node.
@@ -68,9 +68,9 @@ def validate_node_name(value: str) -> str:
     :return: Validated node type string.
     :raises ValueError: If the node type is invalid.
     """
-    if value in [NodeType.retrieval, NodeType.scoring, NodeType.prediction, NodeType.regexp]:
+    if value in [NodeType.embedding, NodeType.scoring, NodeType.decision, NodeType.regexp]:
         return value
-    msg = f"Unknown node_type: {value}. Expected one of ['regexp', 'retrieval', 'scoring', 'prediction']"
+    msg = f"Unknown node_type: {value}. Expected one of ['regexp', 'embedding', 'scoring', 'decision']"
     raise ValueError(msg)
@@ -84,9 +84,9 @@ class Artifacts(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     regexp: list[RegexpArtifact] = []
-    retrieval: list[RetrieverArtifact] = []
+    embedding: list[RetrieverArtifact] = []
     scoring: list[ScorerArtifact] = []
-    prediction: list[PredictorArtifact] = []
+    decision: list[DecisionArtifact] = []
 
     def add_artifact(self, node_type: str, artifact: Artifact) -> None:
         """
@@ -120,7 +120,7 @@ def get_best_artifact(self, node_type: str, idx: int) -> Artifact:
 class Trial(BaseModel):
     """Representation of an individual optimization trial."""
 
-    module_type: str
+    module_name: str
     """Type of the module being optimized."""
     module_params: dict[str, Any]
     """Parameters of the module for the trial."""
@@ -136,9 +136,9 @@ class Trials(BaseModel):
     """Container for managing optimization trials for pipeline nodes."""
 
     regexp: list[Trial] = []
-    retrieval: list[Trial] = []
+    embedding: list[Trial] = []
     scoring: list[Trial] = []
-    prediction: list[Trial] = []
+    decision: list[Trial] = []
 
     def get_trial(self, node_type: str, idx: int) -> Trial:
         """
@@ -174,12 +174,12 @@ class TrialsIds(BaseModel):
 
     regexp: int | None = None
     """Best trial index for the regexp node."""
-    retrieval: int | None = None
-    """Best trial index for the retrieval node."""
+    embedding: int | None = None
+    """Best trial index for the embedding node."""
     scoring: int | None = None
     """Best trial index for the scoring"""
-    prediction: int | None = None
-    """Best trial index for the prediction node."""
+    decision: int | None = None
+    """Best trial index for the decision node."""
 
     def get_best_trial_idx(self, node_type: str) -> int | None:
         """
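The validator now accepts the new node names and rejects the old ones; the behavior follows directly from the code above:

validate_node_name("decision")    # returns "decision"
validate_node_name("embedding")   # returns "embedding"
validate_node_name("prediction")  # raises ValueError: unknown node_type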
18 changes: 9 additions & 9 deletions autointent/context/optimization_info/_optimization_info.py
@@ -1,6 +1,6 @@
 """Module for managing pipeline optimization.
 
-This module handles the tracking, logging, and retrieval of optimization artifacts,
+This module handles the tracking and logging of optimization artifacts,
 trials, and modules during the pipeline's execution.
 """
@@ -25,15 +25,15 @@ class ModulesList:
     """Container for managing lists of modules for each node type."""
 
     regexp: list["Module"] = field(default_factory=list)
-    retrieval: list["Module"] = field(default_factory=list)
+    embedding: list["Module"] = field(default_factory=list)
     scoring: list["Module"] = field(default_factory=list)
-    prediction: list["Module"] = field(default_factory=list)
+    decision: list["Module"] = field(default_factory=list)
 
     def get(self, node_type: str) -> list["Module"]:
         """
         Retrieve the list of modules for a specific node type.
 
-        :param node_type: The type of node (e.g., "regexp", "retrieval").
+        :param node_type: The type of node (e.g., "regexp", "embedding").
         :return: List of modules for the specified node type.
         """
         return getattr(self, node_type)  # type: ignore[no-any-return]
@@ -69,7 +69,7 @@ def __init__(self) -> None:
     def log_module_optimization(
         self,
         node_type: str,
-        module_type: str,
+        module_name: str,
         module_params: dict[str, Any],
         metric_value: float,
         metric_name: str,
@@ -81,7 +81,7 @@ def log_module_optimization(
         Log optimization results for a module.
 
         :param node_type: Type of the node being optimized.
-        :param module_type: Type of the module.
+        :param module_name: Type of the module.
         :param module_params: Parameters of the module for the trial.
         :param metric_value: Metric value achieved by the module.
         :param metric_name: Name of the evaluation metric.
@@ -90,7 +90,7 @@ def log_module_optimization(
         :param module: The module instance, if available.
         """
         trial = Trial(
-            module_type=module_type,
+            module_name=module_name,
            metric_name=metric_name,
            metric_value=metric_value,
            module_params=module_params,
@@ -144,7 +144,7 @@ def get_best_embedder(self) -> str:
 
         :return: Name of the best embedder.
         """
-        best_retriever_artifact: RetrieverArtifact = self._get_best_artifact(node_type=NodeType.retrieval)  # type: ignore[assignment]
+        best_retriever_artifact: RetrieverArtifact = self._get_best_artifact(node_type=NodeType.embedding)  # type: ignore[assignment]
         return best_retriever_artifact.embedder_name
 
     def get_best_train_scores(self) -> NDArray[np.float64] | None:
@@ -214,7 +214,7 @@ def get_inference_nodes_config(self, asdict: bool = False) -> list[InferenceNode
             trial = self.trials.get_trial(node_type, idx)
             item = {
                 "node_type": node_type.value,
-                "module_type": trial.module_type,
+                "module_name": trial.module_name,
                 "module_config": trial.module_params,
                 "load_path": trial.module_dump_dir,
             }
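Each entry emitted by `get_inference_nodes_config` now carries a `module_name` key. Illustratively, with values borrowed from the example inference config (the dump path is hypothetical):

# shape of one generated inference-node entry after this commit
item = {
    "node_type": "decision",
    "module_name": "threshold",               # was: "module_type"
    "module_config": {"thresh": 0.5},
    "load_path": "runs/decision/threshold",   # hypothetical module dump directory
}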
4 changes: 2 additions & 2 deletions autointent/custom_types.py
@@ -46,9 +46,9 @@ class NodeType(str, Enum):
     """Enumeration of node types in the AutoIntent pipeline."""
 
     regexp = "regexp"
-    retrieval = "retrieval"
+    embedding = "embedding"
     scoring = "scoring"
-    prediction = "prediction"
+    decision = "decision"
 
 
 class Split:
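Because `NodeType` is a `str` enum, the renamed members still compare equal to their plain string values, which is what `validate_node_name` above relies on:

from autointent.custom_types import NodeType

assert NodeType.decision == "decision"              # str-enum members equal their string values
assert NodeType("embedding") is NodeType.embedding  # lookup by value still works
# NodeType.retrieval and NodeType.prediction no longer exist after this commit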