diff --git a/.gitignore b/.gitignore
index 8a30d25..447413f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -396,3 +396,5 @@ FodyWeavers.xsd
# JetBrains Rider
*.sln.iml
+*.pyproj
+/keys
diff --git a/README.md b/README.md
index 2bd8f84..e95a772 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,32 @@
# Eureka ML Insights Framework
-
-
-
+
+Technical Report | Blog Post | Project Website
+
This repository contains the code for the Eureka ML Insights framework. The framework is designed to help researchers and practitioners run reproducible evaluations of generative models using a variety of benchmarks and metrics efficiently. The framework allows the user to define custom pipelines for data processing, inference, and evaluation, and provides a set of pre-defined evaluation pipelines for key benchmarks.
-![Eureka](./docs/figures/Benchmarks.png)
+| Benchmark <br> #prompts | Modality | Capability | Logs | Pipeline Config |
+|-------------------------------|---------------|----------------------|------|-----|
+| GeoMeter <br> 1086 | Image -> Text | Geometric Reasoning | [GeoMeter.zip](https://aifeval.z5.web.core.windows.net/eureka-bench-logs/GeoMeter.zip) | [geometer.py](eureka_ml_insights/configs/geometer.py) |
+| MMMU <br> 900 | Image -> Text | Multimodal QA | [MMMU.zip](https://aifeval.z5.web.core.windows.net/eureka-bench-logs/MMMU.zip) | [mmmu.py](eureka_ml_insights/configs/mmmu.py) |
+| Image Understanding <br> 10249 | Image -> Text | Object Recognition <br> Object Detection <br> Visual Prompting <br> Spatial Reasoning | [IMAGE_UNDERSTANDING.zip](https://aifeval.z5.web.core.windows.net/eureka-bench-logs/IMAGE_UNDERSTANDING.zip) | [object_recognition.py](eureka_ml_insights/configs/image_understanding/object_recognition.py) <br> [object_detection.py](eureka_ml_insights/configs/image_understanding/object_detection.py) <br> [visual_prompting.py](eureka_ml_insights/configs/image_understanding/visual_prompting.py) <br> [spatial_reasoning.py](eureka_ml_insights/configs/image_understanding/spatial_reasoning.py) |
+| Vision Language <br> 13500 | Image -> Text | Spatial Understanding <br> Navigation <br> Counting | [VISION_LANGUAGE.zip](https://aifeval.z5.web.core.windows.net/eureka-bench-logs/VISION_LANGUAGE.zip) | [spatial_map.py](eureka_ml_insights/configs/vision_language/spatial_map.py) <br> [maze.py](eureka_ml_insights/configs/vision_language/maze.py) <br> [spatial_grid.py](eureka_ml_insights/configs/vision_language/spatial_grid.py) |
+| IFEval <br> 541 | Text -> Text | Instruction Following | [IFEval.zip](https://aifeval.z5.web.core.windows.net/eureka-bench-logs/IFEval.zip) | [ifeval.py](eureka_ml_insights/configs/ifeval.py) |
+| FlenQA <br> 12000 | Text -> Text | Long Context Multi-hop QA | [FlenQA.zip](https://aifeval.z5.web.core.windows.net/eureka-bench-logs/FlenQA.zip) | [flenqa.py](eureka_ml_insights/configs/flenqa.py) |
+| Kitab <br> 34217 | Text -> Text | Information Retrieval | [Kitab.zip](https://aifeval.z5.web.core.windows.net/eureka-bench-logs/Kitab.zip) | [kitab.py](eureka_ml_insights/configs/kitab.py) |
+| Toxigen <br> 10500 | Text -> Text | Toxicity Detection <br> Safe Language Generation | [ToxiGen.zip](https://aifeval.z5.web.core.windows.net/eureka-bench-logs/ToxiGen.zip) | [toxigen.py](eureka_ml_insights/configs/toxigen.py) |
+
+For non-determinism evaluations using the above benchmarks, we provide pipelines in [nondeterminism.py](eureka_ml_insights/configs/nondeterminism.py).
## Installation
To get started, clone this repository to your local machine and navigate to the project directory.
@@ -32,21 +52,21 @@ To get started, clone this repository to your local machine and navigate to the
4. Fetch from dir dist/ the .whl
5. This file can be installed via `pip install eureka_ml_insights.whl`
-## 🔥 Quick start
+## 🚀 Quick start
To reproduce the results of a pre-defined experiment pipeline, you can run the following command:
```python main.py --exp_config exp_config_name --model_config model_config_name --exp_logdir your_log_dir```
-For example, to run the `KITAB_ONE_BOOK_CONSTRAINT_PIPELINE` experiment pipeline defined in `eureka_ml_insights/configs/kitab.py` using the OpenAI GPT4 1106 Preview model, you can run the following command:
+For example, to run the `FlenQA_Experiment_Pipeline` experiment pipeline defined in `eureka_ml_insights/configs/flenqa.py` using the OpenAI GPT4 1106 Preview model, you can run the following command:
-```python main.py --exp_config KITAB_ONE_BOOK_CONSTRAINT_PIPELINE --model_config OAI_GPT4_1106_PREVIEW_CONFIG --exp_logdir gpt4_1106_preveiw```
+```python main.py --exp_config FlenQA_Experiment_Pipeline --model_config OAI_GPT4_1106_PREVIEW_CONFIG --exp_logdir gpt4_1106_preview```
-The results of the experiment will be saved in the `logs/KITAB_ONE_BOOK_CONSTRAINT_PIPELINE/gpt4_1106_preveiw` directory.
-For other available experiment pipelines and model configurations, see the `eureka_ml_insights/configs` directory.
+The results of the experiment will be saved in a directory under `logs/FlenQA_Experiment_Pipeline/gpt4_1106_preview`. For each experiment you run with these configurations, a new directory will be created using the date and time of the experiment run.
+For other available experiment pipelines and model configurations, see the `eureka_ml_insights/configs` directory. In [model_configs.py](eureka_ml_insights/configs/model_configs.py) you can configure the model classes to use your API keys, Key Vault URLs, endpoints, and other model-specific configurations.
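+
+As a rough illustration, a model configuration entry is a `ModelConfig` that pairs a model class with the keyword arguments used to construct it. The entry name and the `init_args` keys below are placeholders rather than the actual constructor arguments of `OpenAIModelsOAI`; check the class in [models.py](eureka_ml_insights/models/models.py) for the arguments it expects.
+
+```python
+from eureka_ml_insights.configs import ModelConfig  # import path may differ; see model_configs.py
+from eureka_ml_insights.models import OpenAIModelsOAI  # import path may differ; see models/models.py
+
+# Hypothetical model config entry: the init_args keys are illustrative placeholders.
+MY_GPT4_1106_PREVIEW_CONFIG = ModelConfig(
+    OpenAIModelsOAI,
+    {"model_name": "gpt-4-1106-preview"},  # add API key / Key Vault settings as required by the class
+)
+```
+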
-## 🔧 Configuring a Custom Experiment Pipeline
+## 🗺️ Overview of Experiment Pipelines
![Components](./docs/figures/transparent_uml.png)
-You can find examples of experiment pipeline configurations in `configs`. To create a new experiment configuration, you need to define a class that inherits from `ExperimentConfig` and implements the `configure_pipeline` method. In the `configure_pipeline` method you define the Pipeline config (arrangement of Components) for your Experiment. Once your class is ready, add it to `configs/__init__.py` import list.
+Experiment pipelines define the sequence of components that are run to process data, run inference, and evaluate the model outputs. You can find examples of experiment pipeline configurations in the `configs` directory. To create a new experiment configuration, define a class that inherits from `ExperimentConfig` and implements the `configure_pipeline` method. In the `configure_pipeline` method, you define the pipeline config (the arrangement of components) for your experiment. Once your class is ready, add it to the import list in `configs/__init__.py`.
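+
+Below is a minimal, hypothetical sketch of such a configuration, chaining a `PromptProcessing` component and an `Inference` component. The class name, dataset path, and the import locations of the config dataclasses (`PromptProcessingConfig`, `InferenceConfig`, `DataSetConfig`, `PipelineConfig`) are assumptions for illustration; see an existing configuration such as [maze.py](eureka_ml_insights/configs/vision_language/maze.py) for a complete, working example.
+
+```python
+import os
+
+from eureka_ml_insights.configs.experiment_config import ExperimentConfig
+from eureka_ml_insights.core import Inference, PromptProcessing
+from eureka_ml_insights.data_utils import DataLoader, HFDataReader
+
+# Config dataclasses; verify the exact import path against configs/config.py.
+from eureka_ml_insights.configs import (
+    DataSetConfig,
+    InferenceConfig,
+    PipelineConfig,
+    PromptProcessingConfig,
+)
+
+
+class MY_BENCHMARK_PIPELINE(ExperimentConfig):
+    """Hypothetical pipeline: load a HuggingFace dataset, apply a prompt template, run inference."""
+
+    def configure_pipeline(self, model_config, resume_from=None):
+        # Read the dataset and produce a 'prompt' column via the jinja template.
+        self.data_processing_comp = PromptProcessingConfig(
+            component_type=PromptProcessing,
+            data_reader_config=DataSetConfig(
+                HFDataReader,
+                {"path": "my-org/my-dataset", "split": "val"},  # placeholder dataset
+            ),
+            prompt_template_path="prompt_templates/basic.jinja",
+            output_dir=os.path.join(self.log_dir, "data_processing_output"),
+        )
+        # Run the model on the processed prompts.
+        self.inference_comp = InferenceConfig(
+            component_type=Inference,
+            model_config=model_config,
+            data_loader_config=DataSetConfig(
+                DataLoader,
+                {"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl")},
+            ),
+            output_dir=os.path.join(self.log_dir, "inference_result"),
+            resume_from=resume_from,
+        )
+        return PipelineConfig([self.data_processing_comp, self.inference_comp], self.log_dir)
+```
+
+Once added to the import list in `configs/__init__.py`, such a pipeline can be run with `main.py` in the same way as the pre-defined pipelines shown in the Quick start section.
+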
Your Pipeline can use any of the available Components which can be found under the `core` directory:
@@ -57,13 +77,13 @@ Your Pipeline can use any of the available Components which can be found under t
- `DataJoin`: you can use this component to join two sources of data, for example to join the model outputs with the ground truth data for evaluation.
Note that:
-- You can inherit from one of the existing experiment config classes and override the necessary attributes to reduce the amount of code you need to write. You can find examples of this too in `configs/spatial_reasoning.py`.
+- You can inherit from one of the existing experiment config classes and override the necessary attributes to reduce the amount of code you need to write. You can find examples of this in [spatial_reasoning.py](eureka_ml_insights/configs/image_understanding/spatial_reasoning.py).
- Your pipeline does not need to use all of the components. You can use only the components you need. And you can use the components multiple times in the pipeline.
- Make sure the input of each component matches the output of the previous component in the pipeline. The components are run sequentially in the order they are defined in the pipeline configuration.
-- For standard scenarios you do not need to implement new components for your pipeline, but you do need to configure the existing components to use the correct utility classes for your scenario.
+- For standard scenarios, you do not need to implement new components for your pipeline, but you do need to configure the existing components to use the correct utility classes (e.g., models, data readers, metrics) for your scenario.
-### 🔧 Utility Classes Used in Components
-The components in your pipeline need to use the corrent utility classes for your scenario. In standard scenarios do not need to implement new components for your pipeline, but you do need to configure the existing components to work with the correct utility classes. If you need a functionality that is not provided by the existing utility classes, you can implement a new utility class and use it in your pipeline.
+### ⚒️ Utility Classes Used in Components
+Utility classes include Models, Metrics, DataLoaders, DataReaders, etc. The components in your pipeline need to use the correct utility classes for your scenario. For example, to evaluate an OpenAI model on a dataset that is available on HuggingFace, you need to use the [`HFDataReader`](eureka_ml_insights/data_utils/data.py) data reader and the [`OpenAIModelsOAI`](eureka_ml_insights/models/models.py) model class. In standard scenarios, you do not need to implement new components for your pipeline, but you do need to configure the existing components to work with the correct utility classes. If you need functionality that is not provided by the existing utility classes, you can implement a new utility class and use it in your pipeline.
In general, to find out what utility classes and other attributes need to be configured for a component, you can look at the component's corresponding Config dataclass in `configs/config.py`. For example, if you are configuring the `DataProcessing` component, you can look at the `DataProcessingConfig` dataclass in `configs/config.py`.
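+
+For example, the snippet below mirrors the `HFDataReader` configuration used by the Vision Language pipelines in this change: a `DataSetConfig` pairs a utility class with the keyword arguments needed to construct it. The import path for `DataSetConfig` is assumed here and should be checked against `configs/config.py`.
+
+```python
+from eureka_ml_insights.configs import DataSetConfig  # assumed import path; see configs/config.py
+from eureka_ml_insights.data_utils import HFDataReader
+
+# Read the maze task of the VISION_LANGUAGE dataset from HuggingFace.
+maze_data_reader_config = DataSetConfig(
+    HFDataReader,
+    {"path": "microsoft/VISION_LANGUAGE", "split": "val", "tasks": "maze"},
+)
+```
+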
@@ -71,27 +91,27 @@ Utility classes are also configurable by providing the name of the class and the
Our current components use the following utility classes: `DataReader`, `DataLoader`, `Model`, `Metric`, `Aggregator`. You can use the existing utility classes or implement new ones as needed to configure your components.
-### 🔧 Configuring the Data Processing Component
+### 🪛 Configuring the Data Processing Component
This component is used for general data processing tasks.
- `data_reader_config`: Configuration for the DataReader that is used to load the data into a pandas dataframe, apply any necessary processing on it (optional), and return the processed data. We currently support local and Azure Blob Storage data sources.
- Transformations: you can find the available transformations in `data_utils/transforms.py`. If you need to implement new transform classes, add them to this file.
- `output_dir`: This is the folder name where the processed data will be saved. This folder will automatically be created under the experiment log directory and the processed data will be saved in a file called `processed_data.jsonl`.
-- `transformed_data_columns` (OPTIONAL): This is the list of columns to save in transformed_data.jsonl. By default, all columns are saved.
+- `output_data_columns` (OPTIONAL): This is the list of columns to save in transformed_data.jsonl. By default, all columns are saved.
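+
+As a rough sketch of how these attributes fit together, the fragment below would live inside a `configure_pipeline` method. The file and column names are placeholders, and the import path of the config dataclasses is an assumption; the transform classes (`SequenceTransform`, `ColumnRename`) are the same ones used elsewhere in this change.
+
+```python
+import os
+
+from eureka_ml_insights.core import DataProcessing
+from eureka_ml_insights.data_utils import ColumnRename, DataReader, SequenceTransform
+from eureka_ml_insights.configs import DataProcessingConfig, DataSetConfig  # assumed import path
+
+# Inside configure_pipeline: load a local jsonl file and rename columns for the later steps.
+self.data_processing_comp = DataProcessingConfig(
+    component_type=DataProcessing,
+    data_reader_config=DataSetConfig(
+        DataReader,
+        {
+            "path": "my_data/my_benchmark.jsonl",  # placeholder local path
+            "transform": SequenceTransform(
+                [ColumnRename(name_mapping={"question": "prompt", "answer": "ground_truth"})]
+            ),
+        },
+    ),
+    output_dir=os.path.join(self.log_dir, "data_processing_output"),
+)
+```
+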
-### 🔧 Configuring the Prompt Processing Component
+### 🪛 Configuring the Prompt Processing Component
This component inherits from the DataProcessing component and is used specifically for prompt processing tasks, such as applying a Jinja prompt template. If a prompt template is provided, the processed data will have a 'prompt' column that is expected by the inference component. Otherwise the input data is expected to already have a 'prompt' column. This component also reserves the "model_output" column for the model outputs so if it already exists in the input data, it will be removed.
In addition to the attributes of the DataProcessing component, the PromptProcessing component has the following attributes:
- `prompt_template_path` (OPTIONAL): This template is used to format your data for model inference in case you need prompt templating or system prompts. Provide your jinja prompt template path to this component. See for example `prompt_templates/basic.jinja`. The prompt template processing step adds a 'prompt' column to the processed data, which is expected by the inference component. If you do not need prompt templating, make sure your data already does have a 'prompt' column.
- `ignore_failure` (OPTIONAL): Whether to ignore the failure of prompt processing on a row and move on to the next, or to raise an exception. Default is False.
-### 🔧 Configuring the Inference Component
+### 🪛 Configuring the Inference Component
- `model_config`: Configuration of the model class to use for inference. You can find the available models in `models/`.
-- `data_config`: Configuration of the data_loader class to use for inference. You can find the available data classes in `data_utils/data.py`.
+- `data_loader_config`: Configuration of the data_loader class to use for inference. You can find the available data classes in `data_utils/data.py`.
- `output_dir`: This is the folder name where the model outputs will be saved. This folder will automatically be created under the experiment log directory and the model outputs will be saved in a file called `inference_result.jsonl`.
-### 🔧 Configuring the Evaluation Reporting Component
+### 🪛 Configuring the Evaluation Reporting Component
- `data_reader_config`: Configuration object for the DataReader that is used to load the data into a pandas dataframe. This is the same type of utility class used in the DataProcessing component.
- `metric_config`: a MetricConfig object to specify the metric class to use for evaluation. You can find the available metrics in `metrics/`. If you need to implement new metric classes, add them to this directory.
- `aggregator_configs`/`visualizer_configs`: List of configs for aggregators/visualizers to apply to the metric results. These are classes that take metric results, aggregate/analyze/visualize them, and save the outputs. You can find the available aggregators and visualizers in `metrics/reports.py`.
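+
+Schematically, the component can be configured as in the fragment below (again inside a `configure_pipeline` method). The metric and aggregator class names are placeholders to be replaced with real classes from `metrics/` and `metrics/reports.py`, and the config dataclass names and import paths are assumptions to be checked against `configs/config.py`.
+
+```python
+import os
+
+from eureka_ml_insights.core import EvalReporting
+from eureka_ml_insights.data_utils import DataReader
+from eureka_ml_insights.configs import (  # assumed import path; see configs/config.py
+    AggregatorConfig,
+    DataSetConfig,
+    EvalReportingConfig,
+    MetricConfig,
+)
+from eureka_ml_insights.metrics import MyAggregator, MyTaskMetric  # placeholders: pick real classes
+
+# Inside configure_pipeline: read the inference results, score them, and aggregate the scores.
+self.evalreporting_comp = EvalReportingConfig(
+    component_type=EvalReporting,
+    data_reader_config=DataSetConfig(
+        DataReader,
+        {"path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl")},
+    ),
+    metric_config=MetricConfig(MyTaskMetric),  # placeholder metric class
+    aggregator_configs=[
+        # Metric classes write their per-row results to a "<MetricClassName>_result" column.
+        AggregatorConfig(MyAggregator, {"column_names": ["MyTaskMetric_result"]}),  # placeholder args
+    ],
+    output_dir=os.path.join(self.log_dir, "eval_report"),
+)
+```
+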
@@ -99,11 +119,10 @@ In addition to the attributes of the DataProcessing component, the PromptProcess
# ✋ How to contribute:
- To contribute to the framework, please create a new branch.
-- Implement your pipeline configuration class under `configs` and any utility classes that your pipeline requires.
+- Implement your pipeline configuration class under `configs`, as well as any utility classes that your pipeline requires.
- Please add end-to-end tests for your contributions in the `tests` directory.
- Please add unit tests for any new utility classes you implement in the `tests` directory.
- Please add documentation to your classes and methods in form of docstrings.
-- Use `git add filename` to add the files you want to commit, and ONLY the files you want to commit.
- Then use `make format-inplace` to format the files you have changed. This will only work on files that git is tracking, so make sure to `git add` any newly created files before running this command.
- Use `make linters` to check any remaining style or format issues and fix them manually.
- Use `make test` to run the tests and make sure they all pass.
@@ -122,4 +141,11 @@ If you use this framework in your research, please cite the following paper:
url={TODO},
}
-```
\ No newline at end of file
+```
+# Responsible AI Considerations
+
+A cross-cutting dimension for all capability evaluations is the evaluation of several aspects of model behavior important for the responsible fielding of AI systems. These considerations include the fairness, reliability, safety, privacy, and security of models. While evaluations through the Toxigen dataset (included in Eureka-Bench) capture notions of representational fairness for different demographic groups and, to some extent, the ability of the model to generate safe language despite non-safe input triggers in the prompt, other aspects or nuances of fairness and safety require further evaluation and additional clarity, which we hope to integrate in future versions and welcome contributions for. We are also interested in expanding Eureka-Bench with tasks where fairness and bias can be studied in more benign settings that simulate how risks may appear when humans use AI to assist them in everyday tasks (e.g., creative writing, information search) and subtle language or visual biases encoded in training data might be reflected in the AI's assistance.
+
+A general rising concern in responsible AI evaluations is the quick turnaround between new benchmarks being released and then being included in content safety filters or in post-training datasets. Because of this, scores on benchmarks focused on responsible and safe deployment may appear to be unusually high for the most capable models. While the quick reaction is a positive development, from an evaluation and understanding perspective, the high scores indicate that the benchmarks are not sensitive enough to capture differences in the alignment and safety processes followed for different models. At the same time, it is also the case that fielding thresholds for responsible AI measurements can be inherently higher, and as such these evaluations will require a different interpretation lens. For example, a 5 percent error rate in instruction following for content length should not be weighed in the same way as a 5 percent error rate in detecting toxic content, or even a 5 percent success rate in jailbreak attacks. Therefore, successful and timely evaluations to this end depend on collaborative efforts that integrate red teaming, quantified evaluations, and human studies in the context of real-world applications.
+
+Finally, Eureka and the set of associated benchmarks are only the initial snapshot of an effort that aims at reliably measuring progress in AI. Our team is excited about further collaborations with the open-source and research communities, with the goal of sharing and extending current measurements for new capabilities and models. Our current roadmap involves enriching Eureka with more measurements around planning, reasoning, fairness, reliability and safety, and advanced multimodal capabilities for video and audio.
\ No newline at end of file
diff --git a/docs/figures/eureka_logo.png b/docs/figures/eureka_logo.png
new file mode 100644
index 0000000..2e1ec0d
Binary files /dev/null and b/docs/figures/eureka_logo.png differ
diff --git a/docs/figures/github.png b/docs/figures/github.png
new file mode 100644
index 0000000..2ed19ff
Binary files /dev/null and b/docs/figures/github.png differ
diff --git a/docs/figures/msr_blog.png b/docs/figures/msr_blog.png
new file mode 100644
index 0000000..190599a
Binary files /dev/null and b/docs/figures/msr_blog.png differ
diff --git a/eureka_ml_insights/configs/__init__.py b/eureka_ml_insights/configs/__init__.py
index 5a2b3de..6263718 100644
--- a/eureka_ml_insights/configs/__init__.py
+++ b/eureka_ml_insights/configs/__init__.py
@@ -24,32 +24,32 @@
KITAB_TWO_BOOK_CONSTRAINT_PIPELINE,
KITAB_TWO_BOOK_CONSTRAINT_PIPELINE_WITH_CONTEXT,
)
-from .mmmu import MMMU_PIPELINE
+from .mmmu import MMMU_BASELINE_PIPELINE
from .nondeterminism import (
Geo_Nondeterminism,
IFEval_Nondeterminism,
Kitab_Nondeterminism,
MMMU_Nondeterminism,
)
-from .spatial_understanding.object_detection import (
+from .image_understanding.object_detection import (
OBJECT_DETECTION_PAIRS_LOCAL_PIPELINE,
OBJECT_DETECTION_PAIRS_PIPELINE,
OBJECT_DETECTION_SINGLE_LOCAL_PIPELINE,
OBJECT_DETECTION_SINGLE_PIPELINE,
)
-from .spatial_understanding.object_recognition import (
+from .image_understanding.object_recognition import (
OBJECT_RECOGNITION_PAIRS_LOCAL_PIPELINE,
OBJECT_RECOGNITION_PAIRS_PIPELINE,
OBJECT_RECOGNITION_SINGLE_LOCAL_PIPELINE,
OBJECT_RECOGNITION_SINGLE_PIPELINE,
)
-from .spatial_understanding.spatial_reasoning import (
+from .image_understanding.spatial_reasoning import (
SPATIAL_REASONING_PAIRS_LOCAL_PIPELINE,
SPATIAL_REASONING_PAIRS_PIPELINE,
SPATIAL_REASONING_SINGLE_LOCAL_PIPELINE,
SPATIAL_REASONING_SINGLE_PIPELINE,
)
-from .spatial_understanding.visual_prompting import (
+from .image_understanding.visual_prompting import (
VISUAL_PROMPTING_PAIRS_LOCAL_PIPELINE,
VISUAL_PROMPTING_PAIRS_PIPELINE,
VISUAL_PROMPTING_SINGLE_LOCAL_PIPELINE,
@@ -115,7 +115,7 @@
IFEval_PIPELINE,
FlenQA_Experiment_Pipeline,
GEOMETER_PIPELINE,
- MMMU_PIPELINE,
+ MMMU_BASELINE_PIPELINE,
BBH_MCQ_ORCA_PIPELINE,
BBH_MCQ_OpenAI_PIPELINE,
KITAB_ONE_BOOK_CONSTRAINT_PIPELINE,
diff --git a/eureka_ml_insights/configs/spatial_understanding/__init__.py b/eureka_ml_insights/configs/image_understanding/__init__.py
similarity index 100%
rename from eureka_ml_insights/configs/spatial_understanding/__init__.py
rename to eureka_ml_insights/configs/image_understanding/__init__.py
diff --git a/eureka_ml_insights/configs/spatial_understanding/common.py b/eureka_ml_insights/configs/image_understanding/common.py
similarity index 100%
rename from eureka_ml_insights/configs/spatial_understanding/common.py
rename to eureka_ml_insights/configs/image_understanding/common.py
diff --git a/eureka_ml_insights/configs/spatial_understanding/object_detection.py b/eureka_ml_insights/configs/image_understanding/object_detection.py
similarity index 73%
rename from eureka_ml_insights/configs/spatial_understanding/object_detection.py
rename to eureka_ml_insights/configs/image_understanding/object_detection.py
index 6f556ca..40820c2 100644
--- a/eureka_ml_insights/configs/spatial_understanding/object_detection.py
+++ b/eureka_ml_insights/configs/image_understanding/object_detection.py
@@ -3,9 +3,9 @@
from eureka_ml_insights.configs.experiment_config import ExperimentConfig
from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing
from eureka_ml_insights.data_utils import (
- AzureDataReader,
- AzureJsonReader,
- AzureMMDataLoader,
+ HFDataReader,
+ HFJsonReader,
+ MMDataLoader,
ColumnRename,
CopyColumn,
DataReader,
@@ -52,20 +52,11 @@ def configure_pipeline(self, model_config, resume_from=None):
self.data_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
- AzureDataReader,
+ HFDataReader,
{
- "account_url": "https://aifeval.blob.core.windows.net/",
- "blob_container": "datasets",
- "blob_name": "msr_aif_object_detection_pairs/object_detection_val_long_prompt.jsonl",
- "transform": SequenceTransform(
- [
- ColumnRename(name_mapping={"query_text": "prompt", "target_text": "ground_truth"}),
- CopyColumn(column_name_src="images", column_name_dst="images_prepended"),
- PrependStringTransform(
- columns="images_prepended", string="msr_aif_object_detection_pairs/"
- ),
- ]
- ),
+ "path": "microsoft/IMAGE_UNDERSTANDING",
+ "split": "val",
+ "tasks": "object_detection_pairs",
},
),
output_dir=os.path.join(self.log_dir, "data_processing_output"),
@@ -76,22 +67,19 @@ def configure_pipeline(self, model_config, resume_from=None):
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
- AzureMMDataLoader,
+ MMDataLoader,
{
"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
- "account_url": "https://aifeval.blob.core.windows.net/",
- "blob_container": "datasets",
- "image_column_names": ["images_prepended"],
},
),
output_dir=os.path.join(self.log_dir, "inference_result"),
resume_from=resume_from,
)
- target_coco_json_reader = AzureJsonReader(
- account_url="https://aifeval.blob.core.windows.net/",
- blob_container="datasets",
- blob_name="msr_aif_object_detection_pairs/coco_instances.json",
+ target_coco_json_reader = HFJsonReader(
+ repo_id="microsoft/IMAGE_UNDERSTANDING",
+ repo_type="dataset",
+ filename="object_detection_pairs/coco_instances.json",
)
# Configure the evaluation and reporting component.
@@ -129,17 +117,14 @@ class OBJECT_DETECTION_SINGLE_PIPELINE(OBJECT_DETECTION_PAIRS_PIPELINE):
def configure_pipeline(self, model_config, resume_from=None):
config = super().configure_pipeline(model_config, resume_from)
- self.data_processing_comp.data_reader_config.init_args["blob_name"] = (
- "msr_aif_object_detection_single/object_detection_val_long_prompt.jsonl"
+ self.data_processing_comp.data_reader_config.init_args["tasks"] = (
+ "object_detection_single"
)
- self.data_processing_comp.data_reader_config.init_args["transform"].transforms[
- 2
- ].string = "msr_aif_object_detection_single/"
-
- target_coco_json_reader = AzureJsonReader(
- account_url="https://aifeval.blob.core.windows.net/",
- blob_container="datasets",
- blob_name="msr_aif_object_detection_single/coco_instances.json",
+
+ target_coco_json_reader = HFJsonReader(
+ repo_id="microsoft/IMAGE_UNDERSTANDING",
+ repo_type="dataset",
+ filename="object_detection_single/coco_instances.json",
)
self.evalreporting_comp.metric_config.init_args["target_coco_json_reader"] = target_coco_json_reader
diff --git a/eureka_ml_insights/configs/spatial_understanding/object_recognition.py b/eureka_ml_insights/configs/image_understanding/object_recognition.py
similarity index 80%
rename from eureka_ml_insights/configs/spatial_understanding/object_recognition.py
rename to eureka_ml_insights/configs/image_understanding/object_recognition.py
index 41ad53c..c238a8a 100644
--- a/eureka_ml_insights/configs/spatial_understanding/object_recognition.py
+++ b/eureka_ml_insights/configs/image_understanding/object_recognition.py
@@ -3,8 +3,8 @@
from eureka_ml_insights.configs.experiment_config import ExperimentConfig
from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing
from eureka_ml_insights.data_utils import (
- AzureDataReader,
- AzureMMDataLoader,
+ HFDataReader,
+ MMDataLoader,
ColumnRename,
DataReader,
PrependStringTransform,
@@ -53,17 +53,11 @@ def configure_pipeline(self, model_config, resume_from=None):
self.data_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
- AzureDataReader,
+ HFDataReader,
{
- "account_url": "https://aifeval.blob.core.windows.net/",
- "blob_container": "datasets",
- "blob_name": "msr_aif_spatial_reasoning_lrtb_pairs/recognition_val.jsonl",
- "transform": SequenceTransform(
- [
- ColumnRename(name_mapping={"query_text": "prompt", "target_text": "ground_truth"}),
- PrependStringTransform(columns="images", string="msr_aif_spatial_reasoning_lrtb_pairs/"),
- ]
- ),
+ "path": "microsoft/IMAGE_UNDERSTANDING",
+ "split": "val",
+ "tasks": "object_recognition_pairs",
},
),
output_dir=os.path.join(self.log_dir, "data_processing_output"),
@@ -74,12 +68,9 @@ def configure_pipeline(self, model_config, resume_from=None):
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
- AzureMMDataLoader,
+ MMDataLoader,
{
"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
- "account_url": "https://aifeval.blob.core.windows.net/",
- "blob_container": "datasets",
- "image_column_names": ["images"],
},
),
output_dir=os.path.join(self.log_dir, "inference_result"),
@@ -119,12 +110,9 @@ class OBJECT_RECOGNITION_SINGLE_PIPELINE(OBJECT_RECOGNITION_PAIRS_PIPELINE):
def configure_pipeline(self, model_config, resume_from=None):
config = super().configure_pipeline(model_config, resume_from)
- self.data_processing_comp.data_reader_config.init_args["blob_name"] = (
- "msr_aif_spatial_reasoning_lrtb_single/recognition_val.jsonl"
+ self.data_processing_comp.data_reader_config.init_args["tasks"] = (
+ "object_recognition_single"
)
- self.data_processing_comp.data_reader_config.init_args["transform"].transforms[
- 1
- ].string = "msr_aif_spatial_reasoning_lrtb_single/"
return config
diff --git a/eureka_ml_insights/configs/spatial_understanding/spatial_reasoning.py b/eureka_ml_insights/configs/image_understanding/spatial_reasoning.py
similarity index 82%
rename from eureka_ml_insights/configs/spatial_understanding/spatial_reasoning.py
rename to eureka_ml_insights/configs/image_understanding/spatial_reasoning.py
index f3bacfa..9f13ecf 100644
--- a/eureka_ml_insights/configs/spatial_understanding/spatial_reasoning.py
+++ b/eureka_ml_insights/configs/image_understanding/spatial_reasoning.py
@@ -5,8 +5,8 @@
from eureka_ml_insights.data_utils import (
AddColumnAndData,
ASTEvalTransform,
- AzureDataReader,
- AzureMMDataLoader,
+ HFDataReader,
+ MMDataLoader,
ColumnRename,
DataReader,
PrependStringTransform,
@@ -55,17 +55,11 @@ def configure_pipeline(self, model_config, resume_from=None):
self.data_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
- AzureDataReader,
+ HFDataReader,
{
- "account_url": "https://aifeval.blob.core.windows.net/",
- "blob_container": "datasets",
- "blob_name": "msr_aif_spatial_reasoning_lrtb_pairs/msr_aif_spatial_reasoning_lrtb_pairs.jsonl",
- "transform": SequenceTransform(
- [
- ColumnRename(name_mapping={"query_text": "prompt", "target_text": "ground_truth"}),
- PrependStringTransform(columns="images", string="msr_aif_spatial_reasoning_lrtb_pairs/"),
- ]
- ),
+ "path": "microsoft/IMAGE_UNDERSTANDING",
+ "split": "val",
+ "tasks": "spatial_reasoning_lrtb_pairs",
},
),
output_dir=os.path.join(self.log_dir, "data_processing_output"),
@@ -76,12 +70,9 @@ def configure_pipeline(self, model_config, resume_from=None):
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
- AzureMMDataLoader,
+ MMDataLoader,
{
"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
- "account_url": "https://aifeval.blob.core.windows.net/",
- "blob_container": "datasets",
- "image_column_names": ["images"],
},
),
output_dir=os.path.join(self.log_dir, "inference_result"),
@@ -129,12 +120,9 @@ class SPATIAL_REASONING_SINGLE_PIPELINE(SPATIAL_REASONING_PAIRS_PIPELINE):
def configure_pipeline(self, model_config, resume_from=None):
config = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
- self.data_processing_comp.data_reader_config.init_args["blob_name"] = (
- "msr_aif_spatial_reasoning_lrtb_single/msr_aif_spatial_reasoning_lrtb_single.jsonl"
+ self.data_processing_comp.data_reader_config.init_args["tasks"] = (
+ "spatial_reasoning_lrtb_single"
)
- self.data_processing_comp.data_reader_config.init_args["transform"].transforms[
- 1
- ].string = "msr_aif_spatial_reasoning_lrtb_single/"
self.evalreporting_comp.data_reader_config.init_args["transform"].transforms[
0
].data = "['left', 'right', 'top', 'bottom']"
diff --git a/eureka_ml_insights/configs/spatial_understanding/visual_prompting.py b/eureka_ml_insights/configs/image_understanding/visual_prompting.py
similarity index 81%
rename from eureka_ml_insights/configs/spatial_understanding/visual_prompting.py
rename to eureka_ml_insights/configs/image_understanding/visual_prompting.py
index 6fb81ab..bb9438d 100644
--- a/eureka_ml_insights/configs/spatial_understanding/visual_prompting.py
+++ b/eureka_ml_insights/configs/image_understanding/visual_prompting.py
@@ -3,8 +3,8 @@
from eureka_ml_insights.configs.experiment_config import ExperimentConfig
from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing
from eureka_ml_insights.data_utils import (
- AzureDataReader,
- AzureMMDataLoader,
+ HFDataReader,
+ MMDataLoader,
ColumnRename,
DataReader,
PrependStringTransform,
@@ -53,17 +53,11 @@ def configure_pipeline(self, model_config, resume_from=None):
self.data_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
- AzureDataReader,
+ HFDataReader,
{
- "account_url": "https://aifeval.blob.core.windows.net/",
- "blob_container": "datasets",
- "blob_name": "msr_aif_visual_prompting_pairs/visual_prompting_val.jsonl",
- "transform": SequenceTransform(
- [
- ColumnRename(name_mapping={"query_text": "prompt", "target_text": "ground_truth"}),
- PrependStringTransform(columns="images", string="msr_aif_visual_prompting_pairs/"),
- ]
- ),
+ "path": "microsoft/IMAGE_UNDERSTANDING",
+ "split": "val",
+ "tasks": "visual_prompting_pairs",
},
),
output_dir=os.path.join(self.log_dir, "data_processing_output"),
@@ -74,12 +68,9 @@ def configure_pipeline(self, model_config, resume_from=None):
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
- AzureMMDataLoader,
+ MMDataLoader,
{
"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
- "account_url": "https://aifeval.blob.core.windows.net/",
- "blob_container": "datasets",
- "image_column_names": ["images"],
},
),
output_dir=os.path.join(self.log_dir, "inference_result"),
@@ -119,12 +110,9 @@ class VISUAL_PROMPTING_SINGLE_PIPELINE(VISUAL_PROMPTING_PAIRS_PIPELINE):
def configure_pipeline(self, model_config, resume_from=None):
config = super().configure_pipeline(model_config, resume_from)
- self.data_processing_comp.data_reader_config.init_args["blob_name"] = (
- "msr_aif_visual_prompting_single/visual_prompting_val.jsonl"
+ self.data_processing_comp.data_reader_config.init_args["tasks"] = (
+ "visual_prompting_single"
)
- self.data_processing_comp.data_reader_config.init_args["transform"].transforms[
- 1
- ].string = "msr_aif_visual_prompting_single/"
return config
diff --git a/eureka_ml_insights/configs/mmmu.py b/eureka_ml_insights/configs/mmmu.py
index 169343d..2d2a3a2 100644
--- a/eureka_ml_insights/configs/mmmu.py
+++ b/eureka_ml_insights/configs/mmmu.py
@@ -32,7 +32,7 @@
)
-class MMMU_PIPELINE(ExperimentConfig):
+class MMMU_BASELINE_PIPELINE(ExperimentConfig):
"""
This defines an ExperimentConfig pipeline for the MMMU dataset.
    There is no model_config by default and the model config must be passed in via command line.
diff --git a/eureka_ml_insights/configs/nondeterminism.py b/eureka_ml_insights/configs/nondeterminism.py
index 9626120..7d1fc32 100644
--- a/eureka_ml_insights/configs/nondeterminism.py
+++ b/eureka_ml_insights/configs/nondeterminism.py
@@ -8,7 +8,7 @@
from .geometer import GEOMETER_PIPELINE
from .ifeval import IFEval_PIPELINE
from .kitab import KITAB_ONE_BOOK_CONSTRAINT_PIPELINE
-from .mmmu import MMMU_PIPELINE
+from .mmmu import MMMU_BASELINE_PIPELINE
class IFEval_Nondeterminism(IFEval_PIPELINE):
@@ -45,7 +45,7 @@ def configure_pipeline(self, **kwargs):
return config
-class MMMU_Nondeterminism(MMMU_PIPELINE):
+class MMMU_Nondeterminism(MMMU_BASELINE_PIPELINE):
def configure_pipeline(self, **kwargs):
config = super().configure_pipeline(**kwargs)
# Downsample the data and repeat each prompt 3 time
diff --git a/eureka_ml_insights/configs/specifications/spatial_understanding.txt b/eureka_ml_insights/configs/specifications/image_understanding.txt
similarity index 100%
rename from eureka_ml_insights/configs/specifications/spatial_understanding.txt
rename to eureka_ml_insights/configs/specifications/image_understanding.txt
diff --git a/eureka_ml_insights/configs/vision_language/maze.py b/eureka_ml_insights/configs/vision_language/maze.py
index f7c06fe..0f8da2e 100644
--- a/eureka_ml_insights/configs/vision_language/maze.py
+++ b/eureka_ml_insights/configs/vision_language/maze.py
@@ -3,8 +3,8 @@
from eureka_ml_insights.configs.experiment_config import ExperimentConfig
from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing
from eureka_ml_insights.data_utils import (
- AzureDataReader,
- AzureMMDataLoader,
+ HFDataReader,
+ MMDataLoader,
ColumnRename,
DataLoader,
DataReader,
@@ -47,14 +47,11 @@ def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None)
self.data_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
- AzureDataReader,
+ HFDataReader,
{
- "account_url": "https://aifeval.blob.core.windows.net/",
- "blob_container": "datasets",
- "blob_name": "spatial_reason_vlm_datasets/maze_dataset/n500/questions/base_500/gpt4-eval-g3-n500-a_dfs-h76195_seed_42_QA_merged.jsonl",
- "transform": PrependStringTransform(
- columns="image", string="spatial_reason_vlm_datasets/maze_dataset/n500/"
- ),
+ "path": "microsoft/VISION_LANGUAGE",
+ "split": "val",
+ "tasks": "maze",
},
),
output_dir=os.path.join(self.log_dir, "data_processing_output"),
@@ -65,12 +62,9 @@ def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None)
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
- AzureMMDataLoader,
+ MMDataLoader,
{
"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
- "account_url": "https://aifeval.blob.core.windows.net/",
- "blob_container": "datasets",
- "image_column_names": ["image"],
},
),
output_dir=os.path.join(self.log_dir, "inference_result"),
@@ -121,18 +115,9 @@ class MAZE_TEXTONLY_PIPELINE(MAZE_PIPELINE):
def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None) -> PipelineConfig:
config = super().configure_pipeline(model_config, resume_from)
- self.data_processing_comp.data_reader_config.init_args["blob_name"] = (
- "spatial_reason_vlm_datasets/maze_dataset/n500/questions/base_500/gpt4-eval-g3-n500-a_dfs-h76195_seed_42_QA_text_only_merged.jsonl"
- )
- self.data_processing_comp.data_reader_config.init_args["transform"] = None
-
- self.inference_comp.data_loader_config = DataSetConfig(
- DataLoader,
- {
- "path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
- },
+ self.data_processing_comp.data_reader_config.init_args["tasks"] = (
+ "maze_text_only"
)
-
return config
diff --git a/eureka_ml_insights/configs/vision_language/spatial_grid.py b/eureka_ml_insights/configs/vision_language/spatial_grid.py
index 0bfe853..92166eb 100644
--- a/eureka_ml_insights/configs/vision_language/spatial_grid.py
+++ b/eureka_ml_insights/configs/vision_language/spatial_grid.py
@@ -3,8 +3,8 @@
from eureka_ml_insights.configs.experiment_config import ExperimentConfig
from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing
from eureka_ml_insights.data_utils import (
- AzureDataReader,
- AzureMMDataLoader,
+ HFDataReader,
+ MMDataLoader,
ColumnRename,
DataLoader,
DataReader,
@@ -46,14 +46,11 @@ def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None)
self.data_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
- AzureDataReader,
+ HFDataReader,
{
- "account_url": "https://aifeval.blob.core.windows.net/",
- "blob_container": "datasets",
- "blob_name": "spatial_reason_vlm_datasets/grid_dataset/n500/questions/animal/test-g5-n500_seed_42_QA_merged.jsonl",
- "transform": PrependStringTransform(
- columns="image", string="spatial_reason_vlm_datasets/grid_dataset/n500/"
- ),
+ "path": "microsoft/VISION_LANGUAGE",
+ "split": "val",
+ "tasks": "spatial_grid",
},
),
output_dir=os.path.join(self.log_dir, "data_processing_output"),
@@ -64,12 +61,9 @@ def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None)
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
- AzureMMDataLoader,
+ MMDataLoader,
{
"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
- "account_url": "https://aifeval.blob.core.windows.net/",
- "blob_container": "datasets",
- "image_column_names": ["image"],
},
),
output_dir=os.path.join(self.log_dir, "inference_result"),
@@ -121,18 +115,9 @@ class SPATIAL_GRID_TEXTONLY_PIPELINE(SPATIAL_GRID_PIPELINE):
def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None) -> PipelineConfig:
config = super().configure_pipeline(model_config, resume_from)
- self.data_processing_comp.data_reader_config.init_args["blob_name"] = (
- "spatial_reason_vlm_datasets/grid_dataset/n500/questions/animal/test-g5-n500_seed_42_QA_text_only_merged.jsonl"
- )
- self.data_processing_comp.data_reader_config.init_args["transform"] = None
-
- self.inference_comp.data_loader_config = DataSetConfig(
- DataLoader,
- {
- "path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
- },
+ self.data_processing_comp.data_reader_config.init_args["tasks"] = (
+ "spatial_grid_text_only"
)
-
return config
diff --git a/eureka_ml_insights/configs/vision_language/spatial_map.py b/eureka_ml_insights/configs/vision_language/spatial_map.py
index ee85bd5..7a4cfdf 100644
--- a/eureka_ml_insights/configs/vision_language/spatial_map.py
+++ b/eureka_ml_insights/configs/vision_language/spatial_map.py
@@ -3,8 +3,8 @@
from eureka_ml_insights.configs.experiment_config import ExperimentConfig
from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing
from eureka_ml_insights.data_utils import (
- AzureDataReader,
- AzureMMDataLoader,
+ HFDataReader,
+ MMDataLoader,
ColumnRename,
DataLoader,
DataReader,
@@ -47,14 +47,11 @@ def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None)
self.data_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
- AzureDataReader,
+ HFDataReader,
{
- "account_url": "https://aifeval.blob.core.windows.net/",
- "blob_container": "datasets",
- "blob_name": "spatial_reason_vlm_datasets/spatial_loc_dataset/n500/questions/test/gpt4-eval-g6-n2500_QA_merged.jsonl",
- "transform": PrependStringTransform(
- columns="image", string="spatial_reason_vlm_datasets/spatial_loc_dataset/n500/"
- ),
+ "path": "microsoft/VISION_LANGUAGE",
+ "split": "val",
+ "tasks": "spatial_map",
},
),
output_dir=os.path.join(self.log_dir, "data_processing_output"),
@@ -65,12 +62,9 @@ def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None)
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
- AzureMMDataLoader,
+ MMDataLoader,
{
"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
- "account_url": "https://aifeval.blob.core.windows.net/",
- "blob_container": "datasets",
- "image_column_names": ["image"],
},
),
output_dir=os.path.join(self.log_dir, "inference_result"),
@@ -123,18 +117,9 @@ class SPATIAL_MAP_TEXTONLY_PIPELINE(SPATIAL_MAP_PIPELINE):
def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None) -> PipelineConfig:
config = super().configure_pipeline(model_config, resume_from)
- self.data_processing_comp.data_reader_config.init_args["blob_name"] = (
- "spatial_reason_vlm_datasets/spatial_loc_dataset/n500/questions/test/gpt4-eval-g6-n2500_QA_text_only_merged.jsonl"
- )
- self.data_processing_comp.data_reader_config.init_args["transform"] = None
-
- self.inference_comp.data_loader_config = DataSetConfig(
- DataLoader,
- {
- "path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
- },
+ self.data_processing_comp.data_reader_config.init_args["tasks"] = (
+ "spatial_map_text_only"
)
-
return config
diff --git a/eureka_ml_insights/data_utils/__init__.py b/eureka_ml_insights/data_utils/__init__.py
index 937093f..c9ec62d 100644
--- a/eureka_ml_insights/data_utils/__init__.py
+++ b/eureka_ml_insights/data_utils/__init__.py
@@ -7,6 +7,7 @@
HFDataReader,
JsonLinesWriter,
JsonReader,
+ HFJsonReader,
MMDataLoader,
TXTWriter,
)
@@ -38,6 +39,7 @@
__all__ = [
JsonLinesWriter,
JsonReader,
+ HFJsonReader,
AzureJsonReader,
TXTWriter,
CopyColumn,
diff --git a/eureka_ml_insights/data_utils/data.py b/eureka_ml_insights/data_utils/data.py
index d20ed30..6150e82 100644
--- a/eureka_ml_insights/data_utils/data.py
+++ b/eureka_ml_insights/data_utils/data.py
@@ -333,7 +333,7 @@ def __init__(
Initializes an AzureJsonReader.
args:
account_url: str, The Azure storage account URL.
- blob_container: str ,Azure storage container name.
+ blob_container: str, Azure storage container name.
blob_name: str, Azure storage blob name.
"""
self.blob_url = f"{account_url}/{blob_container}/{blob_name}"
@@ -351,6 +351,24 @@ def read(self) -> dict:
return data
+class HFJsonReader(JsonReader):
+ """
+ This is a DataReader that loads a json or jsonl data file from HuggingFace.
+ """
+ def __init__(self, repo_id, repo_type, filename):
+ """
+ Initializes an HFJsonReader.
+ args:
+ repo_id: str, The HF repo id.
+ repo_type: str, The HF repo_type.
+ filename: str, The HF filename.
+ """
+ from huggingface_hub import hf_hub_download
+
+ cached_file_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type=repo_type)
+ super().__init__(cached_file_path)
+
+
class Writer:
def __init__(self, out_path):
self.out_path = out_path
@@ -505,14 +523,17 @@ def _save_base64_to_image_file(self, image_base64: dict, cache_path: str) -> str
if image_base64:
# create path to save image
- file_path = os.path.join(cache_path, image_base64["path"])
-
+ file_path = os.path.join(cache_path, image_base64["path"])
+
# only do this if the image doesn't already exist
if not os.path.exists(file_path):
# base64 string to binary image data
buffered = BytesIO(image_base64["bytes"])
query_image = Image.open(buffered).convert("RGB")
- # save image
+
+            # save image and make the dir path if needed (needed for paths with new nested dirs)
+ dir_path = os.path.dirname(file_path)
+ os.makedirs(dir_path, exist_ok=True)
query_image.save(file_path)
return file_path
diff --git a/eureka_ml_insights/metrics/metrics_base.py b/eureka_ml_insights/metrics/metrics_base.py
index 18ea5b3..7494427 100644
--- a/eureka_ml_insights/metrics/metrics_base.py
+++ b/eureka_ml_insights/metrics/metrics_base.py
@@ -64,7 +64,7 @@ class DetectionMetric(Metric):
def validate_data(self, data):
"""This method checks if the data has the required fields."""
- assert "images" in data.columns, "Data does not have 'model_output' field."
+ assert "id" in data.columns, "Data does not have 'id' field."
assert "model_output" in data.columns, "Data does not have 'model_output' field."
assert "is_valid" in data.columns, "Data does not have 'is_valid' field."
return True
@@ -75,7 +75,7 @@ def evaluate(self, data):
tqdm.pandas()
data[self.__class__.__name__ + "_result"] = data.progress_apply(
- lambda x: self.__evaluate__(x["images"], x["model_output"], x["is_valid"]), axis=1
+ lambda x: self.__evaluate__(x["id"], x["model_output"], x["is_valid"]), axis=1
)
return data
diff --git a/eureka_ml_insights/metrics/spatial_and_layout_metrics.py b/eureka_ml_insights/metrics/spatial_and_layout_metrics.py
index 9e0d103..df1d9b1 100644
--- a/eureka_ml_insights/metrics/spatial_and_layout_metrics.py
+++ b/eureka_ml_insights/metrics/spatial_and_layout_metrics.py
@@ -160,15 +160,6 @@ def __init__(self, target_coco_json_reader: JsonReader):
self.coco.dataset = target_coco_json_reader.read()
self.coco.createIndex()
- # get the list of images
- coco_img_ids = self.coco.getImgIds()
- coco_imgs = self.coco.loadImgs(coco_img_ids)
- self.coco_file_name_to_id = {}
-
- # create a dict to look up image id by filename
- for c in coco_imgs:
- self.coco_file_name_to_id[c["file_name"]] = c["id"]
-
# get a lst of all cats
coco_cat_ids = self.coco.getCatIds()
coco_cats = self.coco.loadCats(coco_cat_ids)
@@ -178,13 +169,11 @@ def __init__(self, target_coco_json_reader: JsonReader):
for c in coco_cats:
self.coco_cat_name_to_id[c["name"]] = c["id"]
- def __evaluate__(self, images, answer_text, is_valid):
+ def __evaluate__(self, image_id, answer_text, is_valid):
if not is_valid:
return "none"
# load image info, need w and h
- image = images[0]
- image_id = self.coco_file_name_to_id[image]
img = self.coco.loadImgs(image_id)
w = img[0]["width"]
h = img[0]["height"]
@@ -216,7 +205,7 @@ def __evaluate__(self, images, answer_text, is_valid):
if wordnet_compare(label, cat):
annotation = {
- "image_id": self.coco_file_name_to_id[image],
+ "image_id": image_id,
"category_id": self.coco_cat_name_to_id[cat],
"bbox": box,
"score": confidence,
diff --git a/tests/pipeline_tests.py b/tests/pipeline_tests.py
index ac88330..fd9c679 100644
--- a/tests/pipeline_tests.py
+++ b/tests/pipeline_tests.py
@@ -18,7 +18,7 @@
KITAB_ONE_BOOK_CONSTRAINT_PIPELINE,
MAZE_PIPELINE,
MAZE_TEXTONLY_PIPELINE,
- MMMU_PIPELINE,
+ MMMU_BASELINE_PIPELINE,
OBJECT_DETECTION_SINGLE_PIPELINE,
OBJECT_RECOGNITION_SINGLE_PIPELINE,
SPATIAL_GRID_PIPELINE,
@@ -47,7 +47,6 @@
KitabTestModel,
MultipleChoiceTestModel,
SpatialReasoningTestModel,
- TestAzureMMDataLoader,
TestDataLoader,
TestKitabMetric,
TestMMDataLoader,
@@ -58,63 +57,63 @@
class TEST_SPATIAL_REASONING_PIPELINE(SPATIAL_REASONING_SINGLE_PIPELINE):
- # Test config the spatial reasoning benchmark with the SpatialReasoningTestModel and TestAzureMMDataLoader
+ # Test config the spatial reasoning benchmark with the SpatialReasoningTestModel and TestMMDataLoader
# with small sample data and a test model
def configure_pipeline(self):
model_config = ModelConfig(SpatialReasoningTestModel, {})
config = super().configure_pipeline(model_config=model_config)
self.inference_comp = config.component_configs[1]
- self.inference_comp.data_loader_config.class_name = TestAzureMMDataLoader
+ self.inference_comp.data_loader_config.class_name = TestMMDataLoader
self.inference_comp.data_loader_config.init_args["n_iter"] = N_ITER
return config
class TEST_OBJECT_DETECTION_PIPELINE(OBJECT_DETECTION_SINGLE_PIPELINE):
- # Test config the object detection benchmark with the DetectionTestModel and TestAzureMMDataLoader
+ # Test config the object detection benchmark with the DetectionTestModel and TestMMDataLoader
# with small sample data and a test model
def configure_pipeline(self, resume_from=None):
model_config = ModelConfig(DetectionTestModel, {})
config = super().configure_pipeline(model_config=model_config)
- self.inference_comp.data_loader_config.class_name = TestAzureMMDataLoader
+ self.inference_comp.data_loader_config.class_name = TestMMDataLoader
self.inference_comp.data_loader_config.init_args["n_iter"] = N_ITER
return config
class TEST_VISUAL_PROMPTING_PIPELINE(VISUAL_PROMPTING_SINGLE_PIPELINE):
- # Test config the visual prompting benchmark with the GenericTestModel and TestAzureMMDataLoader
+ # Test config the visual prompting benchmark with the GenericTestModel and TestMMDataLoader
# with small sample data and a test model
def configure_pipeline(self, resume_from=None):
model_config = ModelConfig(GenericTestModel, {})
config = super().configure_pipeline(model_config=model_config)
- self.inference_comp.data_loader_config.class_name = TestAzureMMDataLoader
+ self.inference_comp.data_loader_config.class_name = TestMMDataLoader
self.inference_comp.data_loader_config.init_args["n_iter"] = N_ITER
return config
class TEST_OBJECT_RECOGNITION_PIPELINE(OBJECT_RECOGNITION_SINGLE_PIPELINE):
- # Test config the object recognition benchmark with the GenericTestModel and TestAzureMMDataLoader
+ # Test config the object recognition benchmark with the GenericTestModel and TestMMDataLoader
# with small sample data and a test model
def configure_pipeline(self, resume_from=None):
model_config = ModelConfig(GenericTestModel, {})
config = super().configure_pipeline(model_config=model_config)
- self.inference_comp.data_loader_config.class_name = TestAzureMMDataLoader
+ self.inference_comp.data_loader_config.class_name = TestMMDataLoader
self.inference_comp.data_loader_config.init_args["n_iter"] = N_ITER
return config
class TEST_SPATIAL_GRID_PIPELINE(SPATIAL_GRID_PIPELINE):
- # Test config the spatial grid counting benchmark with the TestAzureMMDataLoader
+ # Test config the spatial grid counting benchmark with the TestMMDataLoader
# with small sample data and a test model
def configure_pipeline(self, resume_from=None):
model_config = ModelConfig(GenericTestModel, {})
config = super().configure_pipeline(model_config=model_config)
- self.inference_comp.data_loader_config.class_name = TestAzureMMDataLoader
+ self.inference_comp.data_loader_config.class_name = TestMMDataLoader
self.inference_comp.data_loader_config.init_args["n_iter"] = N_ITER
return config
class TEST_SPATIAL_GRID_TEXTONLY_PIPELINE(SPATIAL_GRID_TEXTONLY_PIPELINE):
- # Test config the spatial grid counting benchmark textonly version with the TestAzureMMDataLoader
+ # Test config the spatial grid counting benchmark textonly version with the TestDataLoader
# with small sample data and a test model
def configure_pipeline(self, resume_from=None):
model_config = ModelConfig(GenericTestModel, {})
@@ -125,18 +124,18 @@ def configure_pipeline(self, resume_from=None):
class TEST_SPATIAL_MAP_PIPELINE(SPATIAL_MAP_PIPELINE):
- # Test config the spatial map benchmark with the TestAzureMMDataLoader
+ # Test config the spatial map benchmark with the TestMMDataLoader
# with small sample data and a test model
def configure_pipeline(self, resume_from=None):
model_config = ModelConfig(GenericTestModel, {"model_name": "generic_test_model"})
config = super().configure_pipeline(model_config=model_config)
- self.inference_comp.data_loader_config.class_name = TestAzureMMDataLoader
+ self.inference_comp.data_loader_config.class_name = TestMMDataLoader
self.inference_comp.data_loader_config.init_args["n_iter"] = N_ITER
return config
class TEST_SPATIAL_MAP_TEXTONLY_PIPELINE(SPATIAL_MAP_TEXTONLY_PIPELINE):
- # Test config the spatial map benchmark textonly version with the TestAzureMMDataLoader
+ # Test config the spatial map benchmark textonly version with the TestDataLoader
# with small sample data and a test model
def configure_pipeline(self, resume_from=None):
model_config = ModelConfig(GenericTestModel, {"model_name": "generic_test_model"})
@@ -147,18 +146,18 @@ def configure_pipeline(self, resume_from=None):
class TEST_MAZE_PIPELINE(MAZE_PIPELINE):
- # Test config the maze benchmark with the TestAzureMMDataLoader
+ # Test config the maze benchmark with the TestMMDataLoader
# with small sample data and a test model
def configure_pipeline(self, resume_from=None):
model_config = ModelConfig(GenericTestModel, {})
config = super().configure_pipeline(model_config=model_config)
- self.inference_comp.data_loader_config.class_name = TestAzureMMDataLoader
+ self.inference_comp.data_loader_config.class_name = TestMMDataLoader
self.inference_comp.data_loader_config.init_args["n_iter"] = N_ITER
return config
class TEST_MAZE_TEXTONLY_PIPELINE(MAZE_TEXTONLY_PIPELINE):
- # Test config the maze benchmark textonly version with the TestAzureMMDataLoader
+ # Test config the maze benchmark textonly version with the TestDataLoader
# with small sample data and a test model
def configure_pipeline(self, resume_from=None):
model_config = ModelConfig(GenericTestModel, {})
@@ -248,7 +247,7 @@ def configure_pipeline(self):
return config
-class TEST_MMMU_PIPELINE(MMMU_PIPELINE):
+class TEST_MMMU_PIPELINE(MMMU_BASELINE_PIPELINE):
# Test config the MMMU benchmark with MultipleChoiceTestModel and TestMMDataLoader
def configure_pipeline(self, resume_from=None):
config = super().configure_pipeline(model_config=ModelConfig(MultipleChoiceTestModel, {}))
@@ -285,25 +284,21 @@ def test_outputs_exist(self) -> None:
self.assertEqual(n_aggregators, n_aggregator_files)
-@unittest.skipIf("skip_tests_with_auth" in os.environ, "Tests that require some auth are skipped.")
class SR1_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_SPATIAL_REASONING_PIPELINE().pipeline_config
-@unittest.skipIf("skip_tests_with_auth" in os.environ, "Tests that require some auth are skipped.")
class VP1_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_VISUAL_PROMPTING_PIPELINE().pipeline_config
-@unittest.skipIf("skip_tests_with_auth" in os.environ, "Tests that require some auth are skipped.")
class OR1_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_OBJECT_RECOGNITION_PIPELINE().pipeline_config
-@unittest.skipIf("skip_tests_with_auth" in os.environ, "Tests that require some auth are skipped.")
class OD1_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_OBJECT_DETECTION_PIPELINE().pipeline_config
@@ -314,43 +309,36 @@ def get_config(self):
return TEST_MMMU_PIPELINE().pipeline_config
-@unittest.skipIf("skip_tests_with_auth" in os.environ, "Tests that require some auth are skipped.")
class SPATIAL_GRID_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_SPATIAL_GRID_PIPELINE().pipeline_config
-@unittest.skipIf("skip_tests_with_auth" in os.environ, "Tests that require some auth are skipped.")
class SPATIAL_GRID_TEXTONLY_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_SPATIAL_GRID_TEXTONLY_PIPELINE().pipeline_config
-@unittest.skipIf("skip_tests_with_auth" in os.environ, "Tests that require some auth are skipped.")
class SPATIAL_MAP_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_SPATIAL_MAP_PIPELINE().pipeline_config
-@unittest.skipIf("skip_tests_with_auth" in os.environ, "Tests that require some auth are skipped.")
class SPATIAL_MAP_TEXTONLY_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_SPATIAL_MAP_TEXTONLY_PIPELINE().pipeline_config
-@unittest.skipIf("skip_tests_with_auth" in os.environ, "Tests that require some auth are skipped.")
class MAZE_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_MAZE_PIPELINE().pipeline_config
-@unittest.skipIf("skip_tests_with_auth" in os.environ, "Tests that require some auth are skipped.")
class MAZE_TEXTONLY_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_MAZE_TEXTONLY_PIPELINE().pipeline_config
-@unittest.skipIf("skip_tests_with_auth" in os.environ, "Tests that require some auth are skipped.")
class GR1_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_GEOMETRIC_REASONING_PIPELINE().pipeline_config
@@ -373,7 +361,6 @@ def test_labels(self):
)
-@unittest.skipIf("skip_tests_with_auth" in os.environ, "Tests that require some auth are skipped.")
class IFEval_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
self.test_pipeline = TEST_IFEval_PIPELINE()