Skip to content

Commit

Permalink
Moving image understanding and vision language datasets to HuggingFac…
Browse files Browse the repository at this point in the history
…e hosting and minor cleanup (#7)

* rename

* spatial_understanding rename
mmmu_pipeline rename
moving to HF data
added HF json reader

* removing unittest.skipIf("skip_tests_with_auth" in os.environ, "Azure tests are skipped.") from pipeline tests that no longer use Azure

---------

Co-authored-by: neel <[email protected]>
  • Loading branch information
neelsj and neel authored Sep 13, 2024
1 parent e6adf9c commit 65dc475
Show file tree
Hide file tree
Showing 19 changed files with 134 additions and 229 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -396,3 +396,5 @@ FodyWeavers.xsd

# JetBrains Rider
*.sln.iml
*.pyproj
/keys
12 changes: 6 additions & 6 deletions eureka_ml_insights/configs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,32 +24,32 @@
KITAB_TWO_BOOK_CONSTRAINT_PIPELINE,
KITAB_TWO_BOOK_CONSTRAINT_PIPELINE_WITH_CONTEXT,
)
from .mmmu import MMMU_PIPELINE
from .mmmu import MMMU_BASELINE_PIPELINE
from .nondeterminism import (
Geo_Nondeterminism,
IFEval_Nondeterminism,
Kitab_Nondeterminism,
MMMU_Nondeterminism,
)
from .spatial_understanding.object_detection import (
from .image_understanding.object_detection import (
OBJECT_DETECTION_PAIRS_LOCAL_PIPELINE,
OBJECT_DETECTION_PAIRS_PIPELINE,
OBJECT_DETECTION_SINGLE_LOCAL_PIPELINE,
OBJECT_DETECTION_SINGLE_PIPELINE,
)
from .spatial_understanding.object_recognition import (
from .image_understanding.object_recognition import (
OBJECT_RECOGNITION_PAIRS_LOCAL_PIPELINE,
OBJECT_RECOGNITION_PAIRS_PIPELINE,
OBJECT_RECOGNITION_SINGLE_LOCAL_PIPELINE,
OBJECT_RECOGNITION_SINGLE_PIPELINE,
)
from .spatial_understanding.spatial_reasoning import (
from .image_understanding.spatial_reasoning import (
SPATIAL_REASONING_PAIRS_LOCAL_PIPELINE,
SPATIAL_REASONING_PAIRS_PIPELINE,
SPATIAL_REASONING_SINGLE_LOCAL_PIPELINE,
SPATIAL_REASONING_SINGLE_PIPELINE,
)
from .spatial_understanding.visual_prompting import (
from .image_understanding.visual_prompting import (
VISUAL_PROMPTING_PAIRS_LOCAL_PIPELINE,
VISUAL_PROMPTING_PAIRS_PIPELINE,
VISUAL_PROMPTING_SINGLE_LOCAL_PIPELINE,
Expand Down Expand Up @@ -115,7 +115,7 @@
IFEval_PIPELINE,
FlenQA_Experiment_Pipeline,
GEOMETER_PIPELINE,
MMMU_PIPELINE,
MMMU_BASELINE_PIPELINE,
BBH_MCQ_ORCA_PIPELINE,
BBH_MCQ_OpenAI_PIPELINE,
KITAB_ONE_BOOK_CONSTRAINT_PIPELINE,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from eureka_ml_insights.configs.experiment_config import ExperimentConfig
from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing
from eureka_ml_insights.data_utils import (
AzureDataReader,
AzureJsonReader,
AzureMMDataLoader,
HFDataReader,
HFJsonReader,
MMDataLoader,
ColumnRename,
CopyColumn,
DataReader,
Expand Down Expand Up @@ -52,20 +52,11 @@ def configure_pipeline(self, model_config, resume_from=None):
self.data_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
AzureDataReader,
HFDataReader,
{
"account_url": "https://aifeval.blob.core.windows.net/",
"blob_container": "datasets",
"blob_name": "msr_aif_object_detection_pairs/object_detection_val_long_prompt.jsonl",
"transform": SequenceTransform(
[
ColumnRename(name_mapping={"query_text": "prompt", "target_text": "ground_truth"}),
CopyColumn(column_name_src="images", column_name_dst="images_prepended"),
PrependStringTransform(
columns="images_prepended", string="msr_aif_object_detection_pairs/"
),
]
),
"path": "microsoft/IMAGE_UNDERSTANDING",
"split": "val",
"tasks": "object_detection_pairs",
},
),
output_dir=os.path.join(self.log_dir, "data_processing_output"),
Expand All @@ -76,22 +67,19 @@ def configure_pipeline(self, model_config, resume_from=None):
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
AzureMMDataLoader,
MMDataLoader,
{
"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
"account_url": "https://aifeval.blob.core.windows.net/",
"blob_container": "datasets",
"image_column_names": ["images_prepended"],
},
),
output_dir=os.path.join(self.log_dir, "inference_result"),
resume_from=resume_from,
)

target_coco_json_reader = AzureJsonReader(
account_url="https://aifeval.blob.core.windows.net/",
blob_container="datasets",
blob_name="msr_aif_object_detection_pairs/coco_instances.json",
target_coco_json_reader = HFJsonReader(
repo_id="microsoft/IMAGE_UNDERSTANDING",
repo_type="dataset",
filename="object_detection_pairs/coco_instances.json",
)

# Configure the evaluation and reporting component.
Expand Down Expand Up @@ -129,17 +117,14 @@ class OBJECT_DETECTION_SINGLE_PIPELINE(OBJECT_DETECTION_PAIRS_PIPELINE):

def configure_pipeline(self, model_config, resume_from=None):
config = super().configure_pipeline(model_config, resume_from)
self.data_processing_comp.data_reader_config.init_args["blob_name"] = (
"msr_aif_object_detection_single/object_detection_val_long_prompt.jsonl"
self.data_processing_comp.data_reader_config.init_args["tasks"] = (
"object_detection_single"
)
self.data_processing_comp.data_reader_config.init_args["transform"].transforms[
2
].string = "msr_aif_object_detection_single/"

target_coco_json_reader = AzureJsonReader(
account_url="https://aifeval.blob.core.windows.net/",
blob_container="datasets",
blob_name="msr_aif_object_detection_single/coco_instances.json",

target_coco_json_reader = HFJsonReader(
repo_id="microsoft/IMAGE_UNDERSTANDING",
repo_type="dataset",
filename="object_detection_single/coco_instances.json",
)

self.evalreporting_comp.metric_config.init_args["target_coco_json_reader"] = target_coco_json_reader
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from eureka_ml_insights.configs.experiment_config import ExperimentConfig
from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing
from eureka_ml_insights.data_utils import (
AzureDataReader,
AzureMMDataLoader,
HFDataReader,
MMDataLoader,
ColumnRename,
DataReader,
PrependStringTransform,
Expand Down Expand Up @@ -53,17 +53,11 @@ def configure_pipeline(self, model_config, resume_from=None):
self.data_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
AzureDataReader,
HFDataReader,
{
"account_url": "https://aifeval.blob.core.windows.net/",
"blob_container": "datasets",
"blob_name": "msr_aif_spatial_reasoning_lrtb_pairs/recognition_val.jsonl",
"transform": SequenceTransform(
[
ColumnRename(name_mapping={"query_text": "prompt", "target_text": "ground_truth"}),
PrependStringTransform(columns="images", string="msr_aif_spatial_reasoning_lrtb_pairs/"),
]
),
"path": "microsoft/IMAGE_UNDERSTANDING",
"split": "val",
"tasks": "object_recognition_pairs",
},
),
output_dir=os.path.join(self.log_dir, "data_processing_output"),
Expand All @@ -74,12 +68,9 @@ def configure_pipeline(self, model_config, resume_from=None):
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
AzureMMDataLoader,
MMDataLoader,
{
"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
"account_url": "https://aifeval.blob.core.windows.net/",
"blob_container": "datasets",
"image_column_names": ["images"],
},
),
output_dir=os.path.join(self.log_dir, "inference_result"),
Expand Down Expand Up @@ -119,12 +110,9 @@ class OBJECT_RECOGNITION_SINGLE_PIPELINE(OBJECT_RECOGNITION_PAIRS_PIPELINE):

def configure_pipeline(self, model_config, resume_from=None):
config = super().configure_pipeline(model_config, resume_from)
self.data_processing_comp.data_reader_config.init_args["blob_name"] = (
"msr_aif_spatial_reasoning_lrtb_single/recognition_val.jsonl"
self.data_processing_comp.data_reader_config.init_args["tasks"] = (
"object_recognition_single"
)
self.data_processing_comp.data_reader_config.init_args["transform"].transforms[
1
].string = "msr_aif_spatial_reasoning_lrtb_single/"
return config


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from eureka_ml_insights.data_utils import (
AddColumnAndData,
ASTEvalTransform,
AzureDataReader,
AzureMMDataLoader,
HFDataReader,
MMDataLoader,
ColumnRename,
DataReader,
PrependStringTransform,
Expand Down Expand Up @@ -55,17 +55,11 @@ def configure_pipeline(self, model_config, resume_from=None):
self.data_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
AzureDataReader,
HFDataReader,
{
"account_url": "https://aifeval.blob.core.windows.net/",
"blob_container": "datasets",
"blob_name": "msr_aif_spatial_reasoning_lrtb_pairs/msr_aif_spatial_reasoning_lrtb_pairs.jsonl",
"transform": SequenceTransform(
[
ColumnRename(name_mapping={"query_text": "prompt", "target_text": "ground_truth"}),
PrependStringTransform(columns="images", string="msr_aif_spatial_reasoning_lrtb_pairs/"),
]
),
"path": "microsoft/IMAGE_UNDERSTANDING",
"split": "val",
"tasks": "spatial_reasoning_lrtb_pairs",
},
),
output_dir=os.path.join(self.log_dir, "data_processing_output"),
Expand All @@ -76,12 +70,9 @@ def configure_pipeline(self, model_config, resume_from=None):
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
AzureMMDataLoader,
MMDataLoader,
{
"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
"account_url": "https://aifeval.blob.core.windows.net/",
"blob_container": "datasets",
"image_column_names": ["images"],
},
),
output_dir=os.path.join(self.log_dir, "inference_result"),
Expand Down Expand Up @@ -129,12 +120,9 @@ class SPATIAL_REASONING_SINGLE_PIPELINE(SPATIAL_REASONING_PAIRS_PIPELINE):

def configure_pipeline(self, model_config, resume_from=None):
config = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
self.data_processing_comp.data_reader_config.init_args["blob_name"] = (
"msr_aif_spatial_reasoning_lrtb_single/msr_aif_spatial_reasoning_lrtb_single.jsonl"
self.data_processing_comp.data_reader_config.init_args["tasks"] = (
"spatial_reasoning_lrtb_single"
)
self.data_processing_comp.data_reader_config.init_args["transform"].transforms[
1
].string = "msr_aif_spatial_reasoning_lrtb_single/"
self.evalreporting_comp.data_reader_config.init_args["transform"].transforms[
0
].data = "['left', 'right', 'top', 'bottom']"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from eureka_ml_insights.configs.experiment_config import ExperimentConfig
from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing
from eureka_ml_insights.data_utils import (
AzureDataReader,
AzureMMDataLoader,
HFDataReader,
MMDataLoader,
ColumnRename,
DataReader,
PrependStringTransform,
Expand Down Expand Up @@ -53,17 +53,11 @@ def configure_pipeline(self, model_config, resume_from=None):
self.data_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
AzureDataReader,
HFDataReader,
{
"account_url": "https://aifeval.blob.core.windows.net/",
"blob_container": "datasets",
"blob_name": "msr_aif_visual_prompting_pairs/visual_prompting_val.jsonl",
"transform": SequenceTransform(
[
ColumnRename(name_mapping={"query_text": "prompt", "target_text": "ground_truth"}),
PrependStringTransform(columns="images", string="msr_aif_visual_prompting_pairs/"),
]
),
"path": "microsoft/IMAGE_UNDERSTANDING",
"split": "val",
"tasks": "visual_prompting_pairs",
},
),
output_dir=os.path.join(self.log_dir, "data_processing_output"),
Expand All @@ -74,12 +68,9 @@ def configure_pipeline(self, model_config, resume_from=None):
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
AzureMMDataLoader,
MMDataLoader,
{
"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
"account_url": "https://aifeval.blob.core.windows.net/",
"blob_container": "datasets",
"image_column_names": ["images"],
},
),
output_dir=os.path.join(self.log_dir, "inference_result"),
Expand Down Expand Up @@ -119,12 +110,9 @@ class VISUAL_PROMPTING_SINGLE_PIPELINE(VISUAL_PROMPTING_PAIRS_PIPELINE):

def configure_pipeline(self, model_config, resume_from=None):
config = super().configure_pipeline(model_config, resume_from)
self.data_processing_comp.data_reader_config.init_args["blob_name"] = (
"msr_aif_visual_prompting_single/visual_prompting_val.jsonl"
self.data_processing_comp.data_reader_config.init_args["tasks"] = (
"visual_prompting_single"
)
self.data_processing_comp.data_reader_config.init_args["transform"].transforms[
1
].string = "msr_aif_visual_prompting_single/"
return config


Expand Down
2 changes: 1 addition & 1 deletion eureka_ml_insights/configs/mmmu.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
)


class MMMU_PIPELINE(ExperimentConfig):
class MMMU_BASELINE_PIPELINE(ExperimentConfig):
"""
This defines an ExperimentConfig pipeline for the MMMU dataset.
There is no model_config by default and the model config must be passed in via command line.
Expand Down
4 changes: 2 additions & 2 deletions eureka_ml_insights/configs/nondeterminism.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from .geometer import GEOMETER_PIPELINE
from .ifeval import IFEval_PIPELINE
from .kitab import KITAB_ONE_BOOK_CONSTRAINT_PIPELINE
from .mmmu import MMMU_PIPELINE
from .mmmu import MMMU_BASELINE_PIPELINE


class IFEval_Nondeterminism(IFEval_PIPELINE):
Expand Down Expand Up @@ -45,7 +45,7 @@ def configure_pipeline(self, **kwargs):
return config


class MMMU_Nondeterminism(MMMU_PIPELINE):
class MMMU_Nondeterminism(MMMU_BASELINE_PIPELINE):
def configure_pipeline(self, **kwargs):
config = super().configure_pipeline(**kwargs)
# Downsample the data and repeat each prompt 3 times
Expand Down
Loading

0 comments on commit 65dc475

Please sign in to comment.