diff --git a/cubids/cubids.py b/cubids/cubids.py index 7b2f657f..8b236a15 100644 --- a/cubids/cubids.py +++ b/cubids/cubids.py @@ -22,12 +22,12 @@ import pandas as pd from bids.layout import parse_file_entities from bids.utils import listify -from sklearn.cluster import AgglomerativeClustering from tqdm import tqdm from cubids.config import load_config -from cubids.constants import ID_VARS, NON_KEY_ENTITIES +from cubids.constants import NON_KEY_ENTITIES from cubids.metadata_merge import check_merging_operations, group_by_acquisition_sets +from cubids import utils warnings.simplefilter(action="ignore", category=FutureWarning) bids.config.set_option("extension_initial_dot", True) @@ -343,7 +343,7 @@ def add_nifti_info(self): voxel_sizes = img.header.get_zooms() matrix_dims = img.shape # add nifti info to corresponding sidecars​ - sidecar = img_to_new_ext(str(path), ".json") + sidecar = utils.img_to_new_ext(str(path), ".json") if Path(sidecar).exists(): try: with open(sidecar) as f: @@ -430,9 +430,9 @@ def apply_tsv_changes(self, summary_tsv, files_tsv, new_prefix, raise_on_error=T # Get a source json file img_full_path = self.path + source_files.iloc[0].FilePath - source_json = img_to_new_ext(img_full_path, ".json") + source_json = utils.img_to_new_ext(img_full_path, ".json") for dest_nii in dest_files.FilePath: - dest_json = img_to_new_ext(self.path + dest_nii, ".json") + dest_json = utils.img_to_new_ext(self.path + dest_nii, ".json") if Path(dest_json).exists() and Path(source_json).exists(): merge_commands.append(f"bids-sidecar-merge {source_json} {dest_json}") @@ -475,7 +475,7 @@ def apply_tsv_changes(self, summary_tsv, files_tsv, new_prefix, raise_on_error=T new_key = entity_sets[orig_key_param] - new_entities = _entity_set_to_entities(new_key) + new_entities = utils._entity_set_to_entities(new_key) # generate new filenames according to new entity set self.change_filename(file_path, new_entities) @@ -544,7 +544,7 @@ def change_filename(self, filepath, entities): ----- This is the function I need to spend the most time on, since it has entities hardcoded. """ - new_path = build_path( + new_path = utils.build_path( filepath=filepath, entities=entities, out_dir=str(self.path), @@ -556,9 +556,9 @@ def change_filename(self, filepath, entities): suffix = entities["suffix"] - sub = get_entity_value(filepath, "sub") + sub = utils.get_entity_value(filepath, "sub") if self.is_longitudinal: - ses = get_entity_value(filepath, "ses") + ses = utils.get_entity_value(filepath, "ses") # Add the scan path + new path to the lists of old, new filenames self.old_filenames.append(filepath) @@ -577,21 +577,24 @@ def change_filename(self, filepath, entities): # ensure assoc not an IntendedFor reference if ".nii" not in str(assoc_path): self.old_filenames.append(assoc_path) - new_ext_path = img_to_new_ext(new_path, "".join(Path(assoc_path).suffixes)) + new_ext_path = utils.img_to_new_ext( + new_path, + "".join(Path(assoc_path).suffixes), + ) self.new_filenames.append(new_ext_path) # MAKE SURE THESE AREN'T COVERED BY get_associations!!! 
# Update DWI-specific files if "/dwi/" in filepath: # add the bval and bvec if there - bval_old = img_to_new_ext(filepath, ".bval") - bval_new = img_to_new_ext(new_path, ".bval") + bval_old = utils.img_to_new_ext(filepath, ".bval") + bval_new = utils.img_to_new_ext(new_path, ".bval") if Path(bval_old).exists() and bval_old not in self.old_filenames: self.old_filenames.append(bval_old) self.new_filenames.append(bval_new) - bvec_old = img_to_new_ext(filepath, ".bvec") - bvec_new = img_to_new_ext(new_path, ".bvec") + bvec_old = utils.img_to_new_ext(filepath, ".bvec") + bvec_new = utils.img_to_new_ext(new_path, ".bvec") if Path(bvec_old).exists() and bvec_old not in self.old_filenames: self.old_filenames.append(bvec_old) self.new_filenames.append(bvec_new) @@ -666,7 +669,7 @@ def change_filename(self, filepath, entities): self.IF_rename_paths.append(filename_with_if) # json_file = self.layout.get_file(filename_with_if) # data = json_file.get_dict() - data = get_sidecar_metadata(filename_with_if) + data = utils.get_sidecar_metadata(filename_with_if) if data == "Erroneous sidecar": print("Error parsing sidecar: ", filename_with_if) continue @@ -675,20 +678,20 @@ def change_filename(self, filepath, entities): # Coerce IntendedFor to a list. data["IntendedFor"] = listify(data["IntendedFor"]) for item in data["IntendedFor"]: - if item == _get_participant_relative_path(filepath): + if item == utils._get_participant_relative_path(filepath): # remove old filename data["IntendedFor"].remove(item) # add new filename - data["IntendedFor"].append(_get_participant_relative_path(new_path)) + data["IntendedFor"].append(utils._get_participant_relative_path(new_path)) - if item == _get_bidsuri(filepath, self.path): + if item == utils._get_bidsuri(filepath, self.path): # remove old filename data["IntendedFor"].remove(item) # add new filename - data["IntendedFor"].append(_get_bidsuri(new_path, self.path)) + data["IntendedFor"].append(utils._get_bidsuri(new_path, self.path)) # update the json with the new data dictionary - _update_json(filename_with_if, data) + utils._update_json(filename_with_if, data) # save IntendedFor purges so that you can datalad run the # remove association file commands on a clean dataset @@ -807,12 +810,12 @@ def _purge_associations(self, scans): # sub, ses, modality only (no self.path) if_scans = [] for scan in scans: - if_scans.append(_get_participant_relative_path(self.path + scan)) + if_scans.append(utils._get_participant_relative_path(self.path + scan)) for path in Path(self.path).rglob("sub-*/*/fmap/*.json"): # json_file = self.layout.get_file(str(path)) # data = json_file.get_dict() - data = get_sidecar_metadata(str(path)) + data = utils.get_sidecar_metadata(str(path)) if data == "Erroneous sidecar": print("Error parsing sidecar: ", str(path)) continue @@ -826,7 +829,7 @@ def _purge_associations(self, scans): data["IntendedFor"].remove(item) # update the json with the new data dictionary - _update_json(str(path), data) + utils._update_json(str(path), data) # save IntendedFor purges so that you can datalad run the # remove association file commands on a clean dataset @@ -855,14 +858,14 @@ def _purge_associations(self, scans): if ".nii" not in str(path): if "/dwi/" in str(path): # add the bval and bvec if there - if Path(img_to_new_ext(str(path), ".bval")).exists(): - to_remove.append(img_to_new_ext(str(path), ".bval")) - if Path(img_to_new_ext(str(path), ".bvec")).exists(): - to_remove.append(img_to_new_ext(str(path), ".bvec")) + if Path(utils.img_to_new_ext(str(path), 
".bval")).exists(): + to_remove.append(utils.img_to_new_ext(str(path), ".bval")) + if Path(utils.img_to_new_ext(str(path), ".bvec")).exists(): + to_remove.append(utils.img_to_new_ext(str(path), ".bvec")) if "/func/" in str(path): # add tsvs - tsv = img_to_new_ext(str(path), ".tsv").replace("_bold", "_events") + tsv = utils.img_to_new_ext(str(path), ".tsv").replace("_bold", "_events") if Path(tsv).exists(): to_remove.append(tsv) # add tsv json (if exists) @@ -954,8 +957,8 @@ def _cache_fieldmaps(self): files_to_fmaps = defaultdict(list) for fmap_file in tqdm(fmap_files): # intentions = listify(fmap_file.get_metadata().get("IntendedFor")) - fmap_json = img_to_new_ext(fmap_file.path, ".json") - metadata = get_sidecar_metadata(fmap_json) + fmap_json = utils.img_to_new_ext(fmap_file.path, ".json") + metadata = utils.get_sidecar_metadata(fmap_json) if metadata == "Erroneous sidecar": print("Error parsing sidecar: ", str(fmap_json)) continue @@ -996,7 +999,7 @@ def get_param_groups_from_entity_set(self, entity_set): """ if not self.fieldmaps_cached: raise Exception("Fieldmaps must be cached to find parameter groups.") - key_entities = _entity_set_to_entities(entity_set) + key_entities = utils._entity_set_to_entities(entity_set) key_entities["extension"] = ".nii[.gz]*" matching_files = self.layout.get( @@ -1007,7 +1010,7 @@ def get_param_groups_from_entity_set(self, entity_set): # entities do not also get added to matching_files to_include = [] for filepath in matching_files: - f_entity_set = _file_to_entity_set(filepath) + f_entity_set = utils._file_to_entity_set(filepath) if f_entity_set == entity_set: to_include.append(filepath) @@ -1023,7 +1026,7 @@ def get_param_groups_from_entity_set(self, entity_set): print(f"Unusual Modality Detected: {filepath}") modality = "other" - ret = _get_param_groups( + ret = utils._get_param_groups( to_include, self.fieldmap_lookup, entity_set, @@ -1210,14 +1213,14 @@ def get_param_groups_dataframes(self): param_group_summaries.append(param_summary) labeled_files.append(labeled_file_params) - big_df = _order_columns(pd.concat(labeled_files, ignore_index=True)) + big_df = utils._order_columns(pd.concat(labeled_files, ignore_index=True)) # make Filepaths relative to bids dir for row in range(len(big_df)): long_name = big_df.loc[row, "FilePath"] big_df.loc[row, "FilePath"] = long_name.replace(self.path, "") - summary = _order_columns(pd.concat(param_group_summaries, ignore_index=True)) + summary = utils._order_columns(pd.concat(param_group_summaries, ignore_index=True)) # create new col that strings key and param group together summary["KeyParamGroup"] = summary["EntitySet"] + "__" + summary["ParamGroup"].map(str) @@ -1274,7 +1277,7 @@ def get_param_groups_dataframes(self): if relational["IntendedForKey"]["display_mode"] == "bool": rename_cols.append("UsedAsFieldmap") - summary = assign_variants(summary, rename_cols) + summary = utils.assign_variants(summary, rename_cols) return big_df, summary @@ -1365,10 +1368,10 @@ def get_entity_sets(self): continue if str(path).endswith(".nii") or str(path).endswith(".nii.gz"): - entity_sets.update((_file_to_entity_set(path),)) + entity_sets.update((utils._file_to_entity_set(path),)) # Fill the dictionary of entity set, list of filenames pairrs - ret = _file_to_entity_set(path) + ret = utils._file_to_entity_set(path) if ret not in self.keys_files.keys(): self.keys_files[ret] = [] @@ -1410,7 +1413,7 @@ def change_metadata(self, filters, metadata): for bidsfile in files_to_change: # get the sidecar file # bidsjson_file = 
bidsfile.get_associations() - bidsjson_file = img_to_new_ext(str(bidsfile), ".json") + bidsjson_file = utils.img_to_new_ext(str(bidsfile), ".json") if not bidsjson_file: print("NO JSON FILES FOUND IN ASSOCIATIONS") continue @@ -1427,7 +1430,7 @@ def change_metadata(self, filters, metadata): sidecar.update(metadata) # write out - _update_json(json_file.path, sidecar) + utils._update_json(json_file.path, sidecar) def get_all_metadata_fields(self): """Return all metadata fields in a BIDS directory. @@ -1516,845 +1519,3 @@ def get_fieldmap_lookup(self): def get_layout(self): """Get layout.""" return self.layout - - -# XXX: Remove _validate_json? -def _validate_json(): - """Validate a JSON file's contents. - - This is currently not implemented, but would accept metadata as its param. - """ - # TODO: implement this or delete ??? - return True - - -def _update_json(json_file, metadata): - """Update a JSON file with the provided metadata. - - This function writes the given metadata to the specified JSON file if the - JSON data is valid. If the JSON data is invalid, it prints an error message. - - Parameters - ---------- - json_file : str - The path to the JSON file to be updated. - metadata : dict - The metadata to be written to the JSON file. - - Returns - ------- - None - """ - if _validate_json(): - with open(json_file, "w", encoding="utf-8") as f: - json.dump(metadata, f, ensure_ascii=False, indent=4) - else: - print("INVALID JSON DATA") - - -def _entity_set_to_entities(entity_set): - """Split an entity_set name into a pybids dictionary of entities. - - Parameters - ---------- - entity_set : str - A string representing a set of entities, where each entity is - separated by an underscore and each key-value pair is separated by a hyphen. - - Returns - ------- - dict - A dictionary where the keys are entity names and the values are entity values. - - Examples - -------- - >>> _entity_set_to_entities("sub-01_ses-02_task-rest") - {'sub': '01', 'ses': '02', 'task': 'rest'} - """ - return dict([group.split("-") for group in entity_set.split("_")]) - - -def _entities_to_entity_set(entities): - """Convert a pybids entities dictionary into an entity set name. - - Parameters - ---------- - entities : dict - A dictionary containing pybids entities where keys are entity names - and values are entity values. - - Returns - ------- - str - A string representing the entity set name, constructed by joining - the sorted entity keys and their corresponding values, separated by hyphens. - """ - group_keys = sorted(entities.keys() - NON_KEY_ENTITIES) - return "_".join([f"{key}-{entities[key]}" for key in group_keys]) - - -def _file_to_entity_set(filename): - """Identify and return the entity set of a BIDS valid filename. - - Parameters - ---------- - filename : str - The filename to parse for BIDS entities. - - Returns - ------- - set - A set of entities extracted from the filename. - """ - entities = parse_file_entities(str(filename)) - return _entities_to_entity_set(entities) - - -def _get_participant_relative_path(scan): - """Build the relative-from-subject version of a Path to a file. - - Parameters - ---------- - scan : str - The full path to the scan file. - - Returns - ------- - str - The relative path from the subject directory. - - Raises - ------ - ValueError - If the subject directory cannot be found in the path. - - Examples - -------- - >>> _get_participant_relative_path( - ... "/path/to/dset/sub-01/ses-01/func/sub-01_ses-01_bold.nii.gz", - ... 
) - 'ses-01/func/sub-01_ses-01_bold.nii.gz' - - >>> _get_participant_relative_path( - ... "/path/to/dset/sub-01/func/sub-01_bold.nii.gz", - ... ) - 'func/sub-01_bold.nii.gz' - - >>> _get_participant_relative_path( - ... "/path/to/dset/ses-01/func/ses-01_bold.nii.gz", - ... ) - Traceback (most recent call last): - ValueError: Could not find subject in ... - """ - parts = Path(scan).parts - # Find the first part that starts with "sub-" - for i, part in enumerate(parts): - if part.startswith("sub-"): - return "/".join(parts[i + 1 :]) - raise ValueError(f"Could not find subject in {scan}") - - -def _get_bidsuri(filename, dataset_root): - """Convert a file path to a BIDS URI. - - Parameters - ---------- - filename : str - The full path to the file within the BIDS dataset. - dataset_root : str - The root directory of the BIDS dataset. - - Returns - ------- - str - The BIDS URI corresponding to the given file path. - - Raises - ------ - ValueError - If the filename is not within the dataset_root. - - Examples - -------- - >>> _get_bidsuri("/path/to/bids/sub-01/ses-01/dataset_description.json", "/path/to/bids") - 'bids::sub-01/ses-01/dataset_description.json' - """ - if dataset_root in filename: - return filename.replace(dataset_root, "bids::").replace("bids::/", "bids::") - raise ValueError(f"Only local datasets are supported: {filename}") - - -def _get_param_groups( - files, - fieldmap_lookup, - entity_set_name, - grouping_config, - modality, - keys_files, -): - """Find a list of *parameter groups* from a list of files. - - For each file in `files`, find critical parameters for metadata. Then find - unique sets of these critical parameters. - - Parameters - ---------- - files : :obj:`list` of :obj:`str` - List of file names - fieldmap_lookup : :obj:`dict` - mapping of filename strings relative to the bids root - (e.g. "sub-X/ses-Y/func/sub-X_ses-Y_task-rest_bold.nii.gz") - grouping_config : :obj:`dict` - configuration for defining parameter groups - - Returns - ------- - ordered_labeled_files : :obj:`pandas.DataFrame` - A data frame with one row per file where the ParamGroup column - indicates which group each scan is a part of. - param_groups_with_counts : :obj:`pandas.DataFrame` - A data frame with param group summaries. 
- """ - if not files: - print("WARNING: no files for", entity_set_name) - return None, None - - # Split the config into separate parts - imaging_params = grouping_config.get("sidecar_params", {}) - imaging_params = imaging_params[modality] - - relational_params = grouping_config.get("relational_params", {}) - - derived_params = grouping_config.get("derived_params") - derived_params = derived_params[modality] - - imaging_params.update(derived_params) - - dfs = [] - # path needs to be relative to the root with no leading prefix - - for path in files: - # metadata = layout.get_metadata(path) - metadata = get_sidecar_metadata(img_to_new_ext(path, ".json")) - if metadata == "Erroneous sidecar": - print("Error parsing sidecar: ", img_to_new_ext(path, ".json")) - else: - intentions = metadata.get("IntendedFor", []) - slice_times = metadata.get("SliceTiming", []) - - wanted_keys = metadata.keys() & imaging_params - example_data = {key: metadata[key] for key in wanted_keys} - example_data["EntitySet"] = entity_set_name - - # Get the fieldmaps out and add their types - if "FieldmapKey" in relational_params: - fieldmap_types = sorted( - [_file_to_entity_set(fmap.path) for fmap in fieldmap_lookup[path]] - ) - - # check if config says columns or bool - if relational_params["FieldmapKey"]["display_mode"] == "bool": - if len(fieldmap_types) > 0: - example_data["HasFieldmap"] = True - else: - example_data["HasFieldmap"] = False - else: - for fmap_num, fmap_type in enumerate(fieldmap_types): - example_data[f"FieldmapKey{fmap_num:02d}"] = fmap_type - - # Add the number of slice times specified - if "NSliceTimes" in derived_params: - example_data["NSliceTimes"] = len(slice_times) - - example_data["FilePath"] = path - - # If it's a fieldmap, see what entity set it's intended to correct - if "IntendedForKey" in relational_params: - intended_entity_sets = sorted( - [_file_to_entity_set(intention) for intention in intentions] - ) - - # check if config says columns or bool - if relational_params["IntendedForKey"]["display_mode"] == "bool": - if len(intended_entity_sets) > 0: - example_data["UsedAsFieldmap"] = True - else: - example_data["UsedAsFieldmap"] = False - else: - for intention_num, intention_entity_set in enumerate(intended_entity_sets): - example_data[f"IntendedForKey{intention_num:02d}"] = intention_entity_set - - dfs.append(example_data) - - # Assign each file to a ParamGroup - - # round param groups based on precision - df = round_params(pd.DataFrame(dfs), grouping_config, modality) - - # cluster param groups based on tolerance - df = format_params(df, grouping_config, modality) - # param_group_cols = list(set(df.columns.to_list()) - set(["FilePath"])) - - # get the subset of columns to drop duplicates by - check_cols = [] - for col in list(df.columns): - if f"Cluster_{col}" not in list(df.columns) and col != "FilePath": - check_cols.append(col) - - # Find the unique ParamGroups and assign ID numbers in "ParamGroup"\ - try: - deduped = df.drop("FilePath", axis=1) - except Exception: - return "erroneous sidecar found" - - deduped = deduped.drop_duplicates(subset=check_cols, ignore_index=True) - deduped["ParamGroup"] = np.arange(deduped.shape[0]) + 1 - - # add the modality as a column - deduped["Modality"] = modality - - # add entity set count column (will delete later) - deduped["EntitySetCount"] = len(keys_files[entity_set_name]) - - # Add the ParamGroup to the whole list of files - labeled_files = pd.merge(df, deduped, on=check_cols) - - value_counts = labeled_files.ParamGroup.value_counts() - - 
param_group_counts = pd.DataFrame( - {"Counts": value_counts.to_numpy(), "ParamGroup": value_counts.index.to_numpy()} - ) - - param_groups_with_counts = pd.merge(deduped, param_group_counts, on=["ParamGroup"]) - - # Sort by counts and relabel the param groups - param_groups_with_counts.sort_values(by=["Counts"], inplace=True, ascending=False) - param_groups_with_counts["ParamGroup"] = np.arange(param_groups_with_counts.shape[0]) + 1 - - # Send the new, ordered param group ids to the files list - ordered_labeled_files = pd.merge( - df, param_groups_with_counts, on=check_cols, suffixes=("_x", "") - ) - - # sort ordered_labeled_files by param group - ordered_labeled_files.sort_values(by=["Counts"], inplace=True, ascending=False) - - return ordered_labeled_files, param_groups_with_counts - - -def round_params(param_group_df, config, modality): - """Round columns' values in a DataFrame according to requested precision. - - Parameters - ---------- - param_group_df : pandas.DataFrame - DataFrame containing the parameters to be rounded. - config : dict - Configuration dictionary containing rounding precision information. - modality : str - The modality key to access the relevant rounding precision settings in the config. - - Returns - ------- - pandas.DataFrame - DataFrame with the specified columns' values rounded to the requested precision. - """ - to_format = config["sidecar_params"][modality] - to_format.update(config["derived_params"][modality]) - - for column_name, column_fmt in to_format.items(): - if column_name not in param_group_df: - continue - - if "precision" in column_fmt: - if isinstance(param_group_df[column_name], float): - param_group_df[column_name] = param_group_df[column_name].round( - column_fmt["precision"] - ) - - return param_group_df - - -def get_sidecar_metadata(json_file): - """Get all metadata values in a file's sidecar. - - Transform JSON dictionary to Python dictionary. - - Parameters - ---------- - json_file : str - Path to the JSON sidecar file. - - Returns - ------- - dict or str - Returns a dictionary containing the metadata if the file is successfully read, - otherwise returns the string "Erroneous sidecar". - - Raises - ------ - Exception - If there is an error loading the JSON file. - """ - try: - with open(json_file) as json_file: - data = json.load(json_file) - return data - except Exception: - # print("Error loading sidecar: ", json_filename) - return "Erroneous sidecar" - - -def format_params(param_group_df, config, modality): - """Run AgglomerativeClustering on param groups and add columns to dataframe. - - Parameters - ---------- - param_group_df : :obj:`pandas.DataFrame` - A data frame with one row per file where the ParamGroup column - indicates which group each scan is a part of. - config : :obj:`dict` - Configuration for defining parameter groups. - This dictionary has two keys: ``'sidecar_params'`` and ``'derived_params'``. - modality : :obj:`str` - Modality of the scan. - This is used to select the correct configuration from the config dict. - - Returns - ------- - param_group_df : :obj:`pandas.DataFrame` - An updated version of the input data frame, - with a new column added for each element in the modality's - ``'sidecar_params'`` and ``'derived_params'`` dictionaries. - The new columns will have the name ``'Cluster_' + column_name``, - and will contain the cluster labels for each parameter group. - - Notes - ----- - ``'sidecar_params'`` is a dictionary of dictionaries, where keys are modalities. 
- The modality-wise dictionary's keys are names of BIDS fields to directly include - in the Parameter Groupings, - and the values describe the parameters by which those BIDS' fields are compared. - For example, - {"RepetitionTime": {"tolerance": 0.000001, "precision": 6, "suggest_variant_rename": True} - means that the RepetitionTime field should be compared across files and flagged as a - variant if it differs from others by 0.000001 or more. - - ``'derived_params'`` is a dictionary of dictionaries, where keys are modalities. - The modality-wise dictionary's keys are names of BIDS fields to derive from the - NIfTI header and include in the Parameter Groupings. - """ - to_format = config["sidecar_params"][modality] - to_format.update(config["derived_params"][modality]) - - for column_name, column_fmt in to_format.items(): - if column_name not in param_group_df: - continue - - if "tolerance" in column_fmt and len(param_group_df) > 1: - array = param_group_df[column_name].to_numpy().reshape(-1, 1) - - for i in range(len(array)): - if np.isnan(array[i, 0]): - array[i, 0] = -999 - - tolerance = to_format[column_name]["tolerance"] - clustering = AgglomerativeClustering( - n_clusters=None, distance_threshold=tolerance, linkage="complete" - ).fit(array) - - for i in range(len(array)): - if array[i, 0] == -999: - array[i, 0] = np.nan - - # now add clustering_labels as a column - param_group_df[f"Cluster_{column_name}"] = clustering.labels_ - - return param_group_df - - -def _order_columns(df): - """Organize columns of the summary and files DataFrames. - - Parameters - ---------- - df : pandas.DataFrame - The DataFrame whose columns need to be organized. - - Returns - ------- - pandas.DataFrame - The DataFrame with columns organized such that 'EntitySet' and - 'ParamGroup' are the first two columns, 'FilePath' is the last - column (if present), and the remaining columns are sorted - alphabetically. - - Notes - ----- - This is the only place where the constant ID_VARS is used, - and the strings in that constant are hardcoded here, - so we might not need that constant at all. - """ - cols = set(df.columns.to_list()) - non_id_cols = cols - ID_VARS - new_columns = ["EntitySet", "ParamGroup"] + sorted(non_id_cols) - if "FilePath" in cols: - new_columns.append("FilePath") - - df = df[new_columns] - - return df[new_columns] - - -def img_to_new_ext(img_path, new_ext): - """Convert an image file path to a new extension. - - Parameters - ---------- - img_path : str - The file path of the image to be converted. - new_ext : str - The new extension to be applied to the image file path. - - Returns - ------- - str - The file path with the new extension applied. - - Examples - -------- - >>> img_to_new_ext('/path/to/file_image.nii.gz', '.tsv') - '/path/to/file_events.tsv' - - >>> img_to_new_ext('/path/to/file_image.nii.gz', '.tsv.gz') - '/path/to/file_physio.tsv.gz' - - >>> img_to_new_ext('/path/to/file_image.nii.gz', '.json') - '/path/to/file_image.json' - - Notes - ----- - The hardcoded suffix associated with each extension may not be comprehensive. - BIDS has been extended a lot in recent years. - """ - # handle .tsv edge case - if new_ext == ".tsv": - # take out suffix - return img_path.rpartition("_")[0] + "_events" + new_ext - elif new_ext == ".tsv.gz": - return img_path.rpartition("_")[0] + "_physio" + new_ext - else: - return img_path.replace(".nii.gz", "").replace(".nii", "") + new_ext - - -def get_entity_value(path, key): - """Given a filepath and BIDS key name, return the value associated with the key. 
- - Parameters - ---------- - path : str - The file path to be parsed. - key : str - The BIDS key name to search for in the file path. - - Returns - ------- - str or None - The value associated with the BIDS key if found, otherwise None. - - Examples - -------- - >>> get_entity_value('/path/to/sub-01/ses-01/func/sub-01_ses-02_task-rest_bold.nii.gz', 'sub') - 'sub-01' - >>> get_entity_value('/path/to/sub-01/ses-02/func/sub-01_ses-02_task-rest_bold.nii.gz', 'ses') - 'ses-02' - """ - parts = Path(path).parts - for part in parts: - if part.startswith(key + "-"): - return part - - -def build_path(filepath, entities, out_dir, is_longitudinal): - """Build a new path for a file based on its BIDS entities. - - Parameters - ---------- - filepath : str - The original file path. - entities : dict - A dictionary of BIDS entities. - This should include all of the entities in the filename *except* for subject and session. - out_dir : str - The output directory for the new file. - is_longitudinal : bool - If True, add "ses" to file path. - - Returns - ------- - new_path : str - The new file path. - - Examples - -------- - >>> build_path( - ... "/input/sub-01/ses-01/anat/sub-01_ses-01_T1w.nii.gz", - ... {"acquisition": "VAR", "suffix": "T2w"}, - ... "/output", - ... True, - ... ) - '/output/sub-01/ses-01/anat/sub-01_ses-01_acq-VAR_T2w.nii.gz' - - The function does not add an extra leading zero to the run entity when it's a string. - >>> build_path( - ... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-01_bold.nii.gz", - ... {"task": "rest", "run": "2", "acquisition": "VAR", "suffix": "bold"}, - ... "/output", - ... True, - ... ) - '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_run-2_bold.nii.gz' - - The function adds an extra leading zero to the run entity when it's an integer - and the original filename has a leading zero. - >>> build_path( - ... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-00001_bold.nii.gz", - ... {"task": "rest", "run": 2, "acquisition": "VAR", "suffix": "bold"}, - ... "/output", - ... True, - ... ) - '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_run-00002_bold.nii.gz' - - The function does not add an extra leading zero to the run entity when it's an integer - and the original filename doesn't have a leading zero. - >>> build_path( - ... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-1_bold.nii.gz", - ... {"task": "rest", "run": 2, "acquisition": "VAR", "suffix": "bold"}, - ... "/output", - ... True, - ... ) - '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_run-2_bold.nii.gz' - - The function doesn't add an extra leading zero to the run entity when there isn't a zero. - >>> build_path( - ... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-1_bold.nii.gz", - ... {"task": "rest", "run": "2", "acquisition": "VAR", "suffix": "bold"}, - ... "/output", - ... True, - ... ) - '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_run-2_bold.nii.gz' - - Entities in the original path, but not the entity dictionary, are not included, - like run in this case. - >>> build_path( - ... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-01_bold.nii.gz", - ... {"task": "rest", "acquisition": "VAR", "suffix": "bold"}, - ... "/output", - ... True, - ... ) - '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_bold.nii.gz' - - Entities outside of the prescribed list are ignored, such as "subject"... - >>> build_path( - ... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-01_bold.nii.gz", - ... 
{"subject": "02", "task": "rest", "acquisition": "VAR", "suffix": "bold"}, - ... "/output", - ... True, - ... ) - '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_bold.nii.gz' - - or "echo". - >>> build_path( - ... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-01_bold.nii.gz", - ... {"task": "rest", "acquisition": "VAR", "echo": 1, "suffix": "bold"}, - ... "/output", - ... True, - ... ) - '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_bold.nii.gz' - - It can change the datatype, but will warn the user. - >>> build_path( - ... "/input/sub-01/ses-01/anat/sub-01_ses-01_asl.nii.gz", - ... {"datatype": "perf", "acquisition": "VAR", "suffix": "asl"}, - ... "/output", - ... True, - ... ) - WARNING: DATATYPE CHANGE DETECTED - '/output/sub-01/ses-01/perf/sub-01_ses-01_acq-VAR_asl.nii.gz' - - It also works for cross-sectional filename. - >>> build_path( - ... "/input/sub-01/func/sub-01_task-rest_run-01_bold.nii.gz", - ... {"task": "rest", "acquisition": "VAR", "suffix": "bold"}, - ... "/output", - ... False, - ... ) - '/output/sub-01/func/sub-01_task-rest_acq-VAR_bold.nii.gz' - """ - exts = Path(filepath).suffixes - old_ext = "".join(exts) - - suffix = entities["suffix"] - entity_file_keys = [] - - # Entities that may be in the filename? - file_keys = ["task", "acquisition", "direction", "reconstruction", "run"] - - for key in file_keys: - if key in list(entities.keys()): - entity_file_keys.append(key) - - sub = get_entity_value(filepath, "sub") - if sub is None: - raise ValueError(f"Could not extract subject from {filepath}") - - if is_longitudinal: - ses = get_entity_value(filepath, "ses") - if ses is None: - raise ValueError(f"Could not extract session from {filepath}") - - # Add leading zeros to run entity if it's an integer. - # If it's a string, respect the value provided. - if "run" in entities.keys() and isinstance(entities["run"], int): - # Infer the number of leading zeros needed from the original filename - n_leading = 2 # default to 1 leading zero - if "_run-" in filepath: - run_str = filepath.split("_run-")[1].split("_")[0] - n_leading = len(run_str) - entities["run"] = str(entities["run"]).zfill(n_leading) - - filename = "_".join([f"{key}-{entities[key]}" for key in entity_file_keys]) - filename = ( - filename.replace("acquisition", "acq") - .replace("direction", "dir") - .replace("reconstruction", "rec") - ) - if len(filename) > 0: - if is_longitudinal: - filename = f"{sub}_{ses}_{filename}_{suffix}{old_ext}" - elif not is_longitudinal: - filename = f"{sub}_{filename}_{suffix}{old_ext}" - else: - raise ValueError(f"Could not construct new filename for {filepath}") - - # CHECK TO SEE IF DATATYPE CHANGED - # datatype may be overridden/changed if the original file is located in the wrong folder. - dtypes = ["anat", "func", "perf", "fmap", "dwi"] - dtype_orig = "" - for dtype in dtypes: - if dtype in filepath: - dtype_orig = dtype - - if "datatype" in entities.keys(): - dtype_new = entities["datatype"] - if entities["datatype"] != dtype_orig: - print("WARNING: DATATYPE CHANGE DETECTED") - else: - dtype_new = dtype_orig - - # Construct the new filename - if is_longitudinal: - new_path = str(Path(out_dir) / sub / ses / dtype_new / filename) - elif not is_longitudinal: - new_path = str(Path(out_dir) / sub / dtype_new / filename) - - return new_path - - -def assign_variants(summary, rename_cols): - """Assign variant names to files based on differences from dominant group. 
- - Parameters - ---------- - summary : pandas.DataFrame - The summary DataFrame containing the metadata for each file. - The columns that are used include "ParamGroup", "EntitySet", - the columns in ``rename_cols``, - and any columns in ``rename_cols`` that are prefixed with "Cluster_". - rename_cols : list of str - A list of column names to use for renaming files. - The values in these columns will be compared against the dominant group - and labeled with a variant name if they differ. - - Returns - ------- - pandas.DataFrame - The updated summary DataFrame with a new column "RenameEntitySet" - containing the new entity set names for each file. - """ - # loop through summary tsv and create dom_dict - dom_dict = {} - for row in range(len(summary)): - # if dominant group identified - if str(summary.loc[row, "ParamGroup"]) == "1": - val = {} - # grab col, all vals send to dict - key = summary.loc[row, "EntitySet"] - for col in rename_cols: - summary[col] = summary[col].apply(str) - val[col] = summary.loc[row, col] - - if f"Cluster_{col}" in summary.columns: - val[f"Cluster_{col}"] = summary.loc[row, f"Cluster_{col}"] - - dom_dict[key] = val - - # now loop through again and ID variance - for row in range(len(summary)): - # check to see if renaming has already happened - renamed = False - entities = _entity_set_to_entities(summary.loc[row, "EntitySet"]) - if "VARIANT" in summary.loc[row, "EntitySet"]: - renamed = True - - if summary.loc[row, "ParamGroup"] != 1 and not renamed: - acq_str = "VARIANT" - # now we know we have a deviant param group - # check if TR is same as param group 1 - entity_set = summary.loc[row, "EntitySet"] - for col in rename_cols: - dom_entity_set = dom_dict[entity_set] - summary[col] = summary[col].apply(str) - - if f"Cluster_{col}" in dom_entity_set.keys(): - if summary.loc[row, f"Cluster_{col}"] != dom_entity_set[f"Cluster_{col}"]: - acq_str += col - elif summary.loc[row, col] != dom_entity_set[col]: - if col == "HasFieldmap": - if dom_entity_set[col] == "True": - acq_str += "NoFmap" - else: - acq_str += "HasFmap" - elif col == "UsedAsFieldmap": - if dom_entity_set[col] == "True": - acq_str += "Unused" - else: - acq_str += "IsUsed" - else: - acq_str += col - - if acq_str == "VARIANT": - acq_str += "Other" - - if "acquisition" in entities.keys(): - acq = f"acquisition-{entities['acquisition'] + acq_str}" - - new_name = summary.loc[row, "EntitySet"].replace( - f"acquisition-{entities['acquisition']}", - acq, - ) - else: - acq = f"acquisition-{acq_str}" - new_name = acq + "_" + summary.loc[row, "EntitySet"] - - summary.at[row, "RenameEntitySet"] = new_name - - # convert all "nan" to empty str - # so they don't show up in the summary tsv - if summary.loc[row, "RenameEntitySet"] == "nan": - summary.at[row, "RenameEntitySet"] = "" - - for col in rename_cols: - if summary.loc[row, col] == "nan": - summary.at[row, col] = "" - - return summary diff --git a/cubids/utils.py b/cubids/utils.py new file mode 100644 index 00000000..376c2244 --- /dev/null +++ b/cubids/utils.py @@ -0,0 +1,853 @@ +"""Utility functions for CuBIDS.""" + +import json +from pathlib import Path + +import numpy as np +import pandas as pd +from bids.layout import parse_file_entities +from sklearn.cluster import AgglomerativeClustering + +from cubids.constants import ID_VARS, NON_KEY_ENTITIES + + +# XXX: Remove _validate_json? +def _validate_json(): + """Validate a JSON file's contents. + + This is currently not implemented, but would accept metadata as its param. + """ + # TODO: implement this or delete ??? 
+ return True + + +def _update_json(json_file, metadata): + """Update a JSON file with the provided metadata. + + This function writes the given metadata to the specified JSON file if the + JSON data is valid. If the JSON data is invalid, it prints an error message. + + Parameters + ---------- + json_file : str + The path to the JSON file to be updated. + metadata : dict + The metadata to be written to the JSON file. + + Returns + ------- + None + """ + if _validate_json(): + with open(json_file, "w", encoding="utf-8") as f: + json.dump(metadata, f, ensure_ascii=False, indent=4) + else: + print("INVALID JSON DATA") + + +def _entity_set_to_entities(entity_set): + """Split an entity_set name into a pybids dictionary of entities. + + Parameters + ---------- + entity_set : str + A string representing a set of entities, where each entity is + separated by an underscore and each key-value pair is separated by a hyphen. + + Returns + ------- + dict + A dictionary where the keys are entity names and the values are entity values. + + Examples + -------- + >>> _entity_set_to_entities("sub-01_ses-02_task-rest") + {'sub': '01', 'ses': '02', 'task': 'rest'} + """ + return dict([group.split("-") for group in entity_set.split("_")]) + + +def _entities_to_entity_set(entities): + """Convert a pybids entities dictionary into an entity set name. + + Parameters + ---------- + entities : dict + A dictionary containing pybids entities where keys are entity names + and values are entity values. + + Returns + ------- + str + A string representing the entity set name, constructed by joining + the sorted entity keys and their corresponding values, separated by hyphens. + """ + group_keys = sorted(entities.keys() - NON_KEY_ENTITIES) + return "_".join([f"{key}-{entities[key]}" for key in group_keys]) + + +def _file_to_entity_set(filename): + """Identify and return the entity set of a BIDS valid filename. + + Parameters + ---------- + filename : str + The filename to parse for BIDS entities. + + Returns + ------- + set + A set of entities extracted from the filename. + """ + entities = parse_file_entities(str(filename)) + return _entities_to_entity_set(entities) + + +def _get_participant_relative_path(scan): + """Build the relative-from-subject version of a Path to a file. + + Parameters + ---------- + scan : str + The full path to the scan file. + + Returns + ------- + str + The relative path from the subject directory. + + Raises + ------ + ValueError + If the subject directory cannot be found in the path. + + Examples + -------- + >>> _get_participant_relative_path( + ... "/path/to/dset/sub-01/ses-01/func/sub-01_ses-01_bold.nii.gz", + ... ) + 'ses-01/func/sub-01_ses-01_bold.nii.gz' + + >>> _get_participant_relative_path( + ... "/path/to/dset/sub-01/func/sub-01_bold.nii.gz", + ... ) + 'func/sub-01_bold.nii.gz' + + >>> _get_participant_relative_path( + ... "/path/to/dset/ses-01/func/ses-01_bold.nii.gz", + ... ) + Traceback (most recent call last): + ValueError: Could not find subject in ... + """ + parts = Path(scan).parts + # Find the first part that starts with "sub-" + for i, part in enumerate(parts): + if part.startswith("sub-"): + return "/".join(parts[i + 1 :]) + raise ValueError(f"Could not find subject in {scan}") + + +def _get_bidsuri(filename, dataset_root): + """Convert a file path to a BIDS URI. + + Parameters + ---------- + filename : str + The full path to the file within the BIDS dataset. + dataset_root : str + The root directory of the BIDS dataset. 
+ + Returns + ------- + str + The BIDS URI corresponding to the given file path. + + Raises + ------ + ValueError + If the filename is not within the dataset_root. + + Examples + -------- + >>> _get_bidsuri("/path/to/bids/sub-01/ses-01/dataset_description.json", "/path/to/bids") + 'bids::sub-01/ses-01/dataset_description.json' + """ + if dataset_root in filename: + return filename.replace(dataset_root, "bids::").replace("bids::/", "bids::") + raise ValueError(f"Only local datasets are supported: {filename}") + + +def _get_param_groups( + files, + fieldmap_lookup, + entity_set_name, + grouping_config, + modality, + keys_files, +): + """Find a list of *parameter groups* from a list of files. + + For each file in `files`, find critical parameters for metadata. Then find + unique sets of these critical parameters. + + Parameters + ---------- + files : :obj:`list` of :obj:`str` + List of file names + fieldmap_lookup : :obj:`dict` + mapping of filename strings relative to the bids root + (e.g. "sub-X/ses-Y/func/sub-X_ses-Y_task-rest_bold.nii.gz") + grouping_config : :obj:`dict` + configuration for defining parameter groups + + Returns + ------- + ordered_labeled_files : :obj:`pandas.DataFrame` + A data frame with one row per file where the ParamGroup column + indicates which group each scan is a part of. + param_groups_with_counts : :obj:`pandas.DataFrame` + A data frame with param group summaries. + """ + if not files: + print("WARNING: no files for", entity_set_name) + return None, None + + # Split the config into separate parts + imaging_params = grouping_config.get("sidecar_params", {}) + imaging_params = imaging_params[modality] + + relational_params = grouping_config.get("relational_params", {}) + + derived_params = grouping_config.get("derived_params") + derived_params = derived_params[modality] + + imaging_params.update(derived_params) + + dfs = [] + # path needs to be relative to the root with no leading prefix + + for path in files: + # metadata = layout.get_metadata(path) + metadata = get_sidecar_metadata(img_to_new_ext(path, ".json")) + if metadata == "Erroneous sidecar": + print("Error parsing sidecar: ", img_to_new_ext(path, ".json")) + else: + intentions = metadata.get("IntendedFor", []) + slice_times = metadata.get("SliceTiming", []) + + wanted_keys = metadata.keys() & imaging_params + example_data = {key: metadata[key] for key in wanted_keys} + example_data["EntitySet"] = entity_set_name + + # Get the fieldmaps out and add their types + if "FieldmapKey" in relational_params: + fieldmap_types = sorted( + [_file_to_entity_set(fmap.path) for fmap in fieldmap_lookup[path]] + ) + + # check if config says columns or bool + if relational_params["FieldmapKey"]["display_mode"] == "bool": + if len(fieldmap_types) > 0: + example_data["HasFieldmap"] = True + else: + example_data["HasFieldmap"] = False + else: + for fmap_num, fmap_type in enumerate(fieldmap_types): + example_data[f"FieldmapKey{fmap_num:02d}"] = fmap_type + + # Add the number of slice times specified + if "NSliceTimes" in derived_params: + example_data["NSliceTimes"] = len(slice_times) + + example_data["FilePath"] = path + + # If it's a fieldmap, see what entity set it's intended to correct + if "IntendedForKey" in relational_params: + intended_entity_sets = sorted( + [_file_to_entity_set(intention) for intention in intentions] + ) + + # check if config says columns or bool + if relational_params["IntendedForKey"]["display_mode"] == "bool": + if len(intended_entity_sets) > 0: + example_data["UsedAsFieldmap"] = True + else: + 
example_data["UsedAsFieldmap"] = False + else: + for intention_num, intention_entity_set in enumerate(intended_entity_sets): + example_data[f"IntendedForKey{intention_num:02d}"] = intention_entity_set + + dfs.append(example_data) + + # Assign each file to a ParamGroup + + # round param groups based on precision + df = round_params(pd.DataFrame(dfs), grouping_config, modality) + + # cluster param groups based on tolerance + df = format_params(df, grouping_config, modality) + # param_group_cols = list(set(df.columns.to_list()) - set(["FilePath"])) + + # get the subset of columns to drop duplicates by + check_cols = [] + for col in list(df.columns): + if f"Cluster_{col}" not in list(df.columns) and col != "FilePath": + check_cols.append(col) + + # Find the unique ParamGroups and assign ID numbers in "ParamGroup"\ + try: + deduped = df.drop("FilePath", axis=1) + except Exception: + return "erroneous sidecar found" + + deduped = deduped.drop_duplicates(subset=check_cols, ignore_index=True) + deduped["ParamGroup"] = np.arange(deduped.shape[0]) + 1 + + # add the modality as a column + deduped["Modality"] = modality + + # add entity set count column (will delete later) + deduped["EntitySetCount"] = len(keys_files[entity_set_name]) + + # Add the ParamGroup to the whole list of files + labeled_files = pd.merge(df, deduped, on=check_cols) + + value_counts = labeled_files.ParamGroup.value_counts() + + param_group_counts = pd.DataFrame( + {"Counts": value_counts.to_numpy(), "ParamGroup": value_counts.index.to_numpy()} + ) + + param_groups_with_counts = pd.merge(deduped, param_group_counts, on=["ParamGroup"]) + + # Sort by counts and relabel the param groups + param_groups_with_counts.sort_values(by=["Counts"], inplace=True, ascending=False) + param_groups_with_counts["ParamGroup"] = np.arange(param_groups_with_counts.shape[0]) + 1 + + # Send the new, ordered param group ids to the files list + ordered_labeled_files = pd.merge( + df, param_groups_with_counts, on=check_cols, suffixes=("_x", "") + ) + + # sort ordered_labeled_files by param group + ordered_labeled_files.sort_values(by=["Counts"], inplace=True, ascending=False) + + return ordered_labeled_files, param_groups_with_counts + + +def round_params(param_group_df, config, modality): + """Round columns' values in a DataFrame according to requested precision. + + Parameters + ---------- + param_group_df : pandas.DataFrame + DataFrame containing the parameters to be rounded. + config : dict + Configuration dictionary containing rounding precision information. + modality : str + The modality key to access the relevant rounding precision settings in the config. + + Returns + ------- + pandas.DataFrame + DataFrame with the specified columns' values rounded to the requested precision. + """ + to_format = config["sidecar_params"][modality] + to_format.update(config["derived_params"][modality]) + + for column_name, column_fmt in to_format.items(): + if column_name not in param_group_df: + continue + + if "precision" in column_fmt: + if isinstance(param_group_df[column_name], float): + param_group_df[column_name] = param_group_df[column_name].round( + column_fmt["precision"] + ) + + return param_group_df + + +def get_sidecar_metadata(json_file): + """Get all metadata values in a file's sidecar. + + Transform JSON dictionary to Python dictionary. + + Parameters + ---------- + json_file : str + Path to the JSON sidecar file. 
+ + Returns + ------- + dict or str + Returns a dictionary containing the metadata if the file is successfully read, + otherwise returns the string "Erroneous sidecar". + + Raises + ------ + Exception + If there is an error loading the JSON file. + """ + try: + with open(json_file) as json_file: + data = json.load(json_file) + return data + except Exception: + # print("Error loading sidecar: ", json_filename) + return "Erroneous sidecar" + + +def format_params(param_group_df, config, modality): + """Run AgglomerativeClustering on param groups and add columns to dataframe. + + Parameters + ---------- + param_group_df : :obj:`pandas.DataFrame` + A data frame with one row per file where the ParamGroup column + indicates which group each scan is a part of. + config : :obj:`dict` + Configuration for defining parameter groups. + This dictionary has two keys: ``'sidecar_params'`` and ``'derived_params'``. + modality : :obj:`str` + Modality of the scan. + This is used to select the correct configuration from the config dict. + + Returns + ------- + param_group_df : :obj:`pandas.DataFrame` + An updated version of the input data frame, + with a new column added for each element in the modality's + ``'sidecar_params'`` and ``'derived_params'`` dictionaries. + The new columns will have the name ``'Cluster_' + column_name``, + and will contain the cluster labels for each parameter group. + + Notes + ----- + ``'sidecar_params'`` is a dictionary of dictionaries, where keys are modalities. + The modality-wise dictionary's keys are names of BIDS fields to directly include + in the Parameter Groupings, + and the values describe the parameters by which those BIDS' fields are compared. + For example, + {"RepetitionTime": {"tolerance": 0.000001, "precision": 6, "suggest_variant_rename": True} + means that the RepetitionTime field should be compared across files and flagged as a + variant if it differs from others by 0.000001 or more. + + ``'derived_params'`` is a dictionary of dictionaries, where keys are modalities. + The modality-wise dictionary's keys are names of BIDS fields to derive from the + NIfTI header and include in the Parameter Groupings. + """ + to_format = config["sidecar_params"][modality] + to_format.update(config["derived_params"][modality]) + + for column_name, column_fmt in to_format.items(): + if column_name not in param_group_df: + continue + + if "tolerance" in column_fmt and len(param_group_df) > 1: + array = param_group_df[column_name].to_numpy().reshape(-1, 1) + + for i in range(len(array)): + if np.isnan(array[i, 0]): + array[i, 0] = -999 + + tolerance = to_format[column_name]["tolerance"] + clustering = AgglomerativeClustering( + n_clusters=None, distance_threshold=tolerance, linkage="complete" + ).fit(array) + + for i in range(len(array)): + if array[i, 0] == -999: + array[i, 0] = np.nan + + # now add clustering_labels as a column + param_group_df[f"Cluster_{column_name}"] = clustering.labels_ + + return param_group_df + + +def _order_columns(df): + """Organize columns of the summary and files DataFrames. + + Parameters + ---------- + df : pandas.DataFrame + The DataFrame whose columns need to be organized. + + Returns + ------- + pandas.DataFrame + The DataFrame with columns organized such that 'EntitySet' and + 'ParamGroup' are the first two columns, 'FilePath' is the last + column (if present), and the remaining columns are sorted + alphabetically. 
+ + Notes + ----- + This is the only place where the constant ID_VARS is used, + and the strings in that constant are hardcoded here, + so we might not need that constant at all. + """ + cols = set(df.columns.to_list()) + non_id_cols = cols - ID_VARS + new_columns = ["EntitySet", "ParamGroup"] + sorted(non_id_cols) + if "FilePath" in cols: + new_columns.append("FilePath") + + df = df[new_columns] + + return df[new_columns] + + +def img_to_new_ext(img_path, new_ext): + """Convert an image file path to a new extension. + + Parameters + ---------- + img_path : str + The file path of the image to be converted. + new_ext : str + The new extension to be applied to the image file path. + + Returns + ------- + str + The file path with the new extension applied. + + Examples + -------- + >>> img_to_new_ext('/path/to/file_image.nii.gz', '.tsv') + '/path/to/file_events.tsv' + + >>> img_to_new_ext('/path/to/file_image.nii.gz', '.tsv.gz') + '/path/to/file_physio.tsv.gz' + + >>> img_to_new_ext('/path/to/file_image.nii.gz', '.json') + '/path/to/file_image.json' + + Notes + ----- + The hardcoded suffix associated with each extension may not be comprehensive. + BIDS has been extended a lot in recent years. + """ + # handle .tsv edge case + if new_ext == ".tsv": + # take out suffix + return img_path.rpartition("_")[0] + "_events" + new_ext + elif new_ext == ".tsv.gz": + return img_path.rpartition("_")[0] + "_physio" + new_ext + else: + return img_path.replace(".nii.gz", "").replace(".nii", "") + new_ext + + +def get_entity_value(path, key): + """Given a filepath and BIDS key name, return the value associated with the key. + + Parameters + ---------- + path : str + The file path to be parsed. + key : str + The BIDS key name to search for in the file path. + + Returns + ------- + str or None + The value associated with the BIDS key if found, otherwise None. + + Examples + -------- + >>> get_entity_value('/path/to/sub-01/ses-01/func/sub-01_ses-02_task-rest_bold.nii.gz', 'sub') + 'sub-01' + >>> get_entity_value('/path/to/sub-01/ses-02/func/sub-01_ses-02_task-rest_bold.nii.gz', 'ses') + 'ses-02' + """ + parts = Path(path).parts + for part in parts: + if part.startswith(key + "-"): + return part + + +def build_path(filepath, entities, out_dir, is_longitudinal): + """Build a new path for a file based on its BIDS entities. + + Parameters + ---------- + filepath : str + The original file path. + entities : dict + A dictionary of BIDS entities. + This should include all of the entities in the filename *except* for subject and session. + out_dir : str + The output directory for the new file. + is_longitudinal : bool + If True, add "ses" to file path. + + Returns + ------- + new_path : str + The new file path. + + Examples + -------- + >>> build_path( + ... "/input/sub-01/ses-01/anat/sub-01_ses-01_T1w.nii.gz", + ... {"acquisition": "VAR", "suffix": "T2w"}, + ... "/output", + ... True, + ... ) + '/output/sub-01/ses-01/anat/sub-01_ses-01_acq-VAR_T2w.nii.gz' + + The function does not add an extra leading zero to the run entity when it's a string. + >>> build_path( + ... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-01_bold.nii.gz", + ... {"task": "rest", "run": "2", "acquisition": "VAR", "suffix": "bold"}, + ... "/output", + ... True, + ... ) + '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_run-2_bold.nii.gz' + + The function adds an extra leading zero to the run entity when it's an integer + and the original filename has a leading zero. + >>> build_path( + ... 
"/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-00001_bold.nii.gz", + ... {"task": "rest", "run": 2, "acquisition": "VAR", "suffix": "bold"}, + ... "/output", + ... True, + ... ) + '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_run-00002_bold.nii.gz' + + The function does not add an extra leading zero to the run entity when it's an integer + and the original filename doesn't have a leading zero. + >>> build_path( + ... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-1_bold.nii.gz", + ... {"task": "rest", "run": 2, "acquisition": "VAR", "suffix": "bold"}, + ... "/output", + ... True, + ... ) + '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_run-2_bold.nii.gz' + + The function doesn't add an extra leading zero to the run entity when there isn't a zero. + >>> build_path( + ... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-1_bold.nii.gz", + ... {"task": "rest", "run": "2", "acquisition": "VAR", "suffix": "bold"}, + ... "/output", + ... True, + ... ) + '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_run-2_bold.nii.gz' + + Entities in the original path, but not the entity dictionary, are not included, + like run in this case. + >>> build_path( + ... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-01_bold.nii.gz", + ... {"task": "rest", "acquisition": "VAR", "suffix": "bold"}, + ... "/output", + ... True, + ... ) + '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_bold.nii.gz' + + Entities outside of the prescribed list are ignored, such as "subject"... + >>> build_path( + ... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-01_bold.nii.gz", + ... {"subject": "02", "task": "rest", "acquisition": "VAR", "suffix": "bold"}, + ... "/output", + ... True, + ... ) + '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_bold.nii.gz' + + or "echo". + >>> build_path( + ... "/input/sub-01/ses-01/func/sub-01_ses-01_task-rest_run-01_bold.nii.gz", + ... {"task": "rest", "acquisition": "VAR", "echo": 1, "suffix": "bold"}, + ... "/output", + ... True, + ... ) + '/output/sub-01/ses-01/func/sub-01_ses-01_task-rest_acq-VAR_bold.nii.gz' + + It can change the datatype, but will warn the user. + >>> build_path( + ... "/input/sub-01/ses-01/anat/sub-01_ses-01_asl.nii.gz", + ... {"datatype": "perf", "acquisition": "VAR", "suffix": "asl"}, + ... "/output", + ... True, + ... ) + WARNING: DATATYPE CHANGE DETECTED + '/output/sub-01/ses-01/perf/sub-01_ses-01_acq-VAR_asl.nii.gz' + + It also works for cross-sectional filename. + >>> build_path( + ... "/input/sub-01/func/sub-01_task-rest_run-01_bold.nii.gz", + ... {"task": "rest", "acquisition": "VAR", "suffix": "bold"}, + ... "/output", + ... False, + ... ) + '/output/sub-01/func/sub-01_task-rest_acq-VAR_bold.nii.gz' + """ + exts = Path(filepath).suffixes + old_ext = "".join(exts) + + suffix = entities["suffix"] + entity_file_keys = [] + + # Entities that may be in the filename? + file_keys = ["task", "acquisition", "direction", "reconstruction", "run"] + + for key in file_keys: + if key in list(entities.keys()): + entity_file_keys.append(key) + + sub = get_entity_value(filepath, "sub") + if sub is None: + raise ValueError(f"Could not extract subject from {filepath}") + + if is_longitudinal: + ses = get_entity_value(filepath, "ses") + if ses is None: + raise ValueError(f"Could not extract session from {filepath}") + + # Add leading zeros to run entity if it's an integer. + # If it's a string, respect the value provided. 
+ if "run" in entities.keys() and isinstance(entities["run"], int): + # Infer the number of leading zeros needed from the original filename + n_leading = 2 # default to 1 leading zero + if "_run-" in filepath: + run_str = filepath.split("_run-")[1].split("_")[0] + n_leading = len(run_str) + entities["run"] = str(entities["run"]).zfill(n_leading) + + filename = "_".join([f"{key}-{entities[key]}" for key in entity_file_keys]) + filename = ( + filename.replace("acquisition", "acq") + .replace("direction", "dir") + .replace("reconstruction", "rec") + ) + if len(filename) > 0: + if is_longitudinal: + filename = f"{sub}_{ses}_{filename}_{suffix}{old_ext}" + elif not is_longitudinal: + filename = f"{sub}_{filename}_{suffix}{old_ext}" + else: + raise ValueError(f"Could not construct new filename for {filepath}") + + # CHECK TO SEE IF DATATYPE CHANGED + # datatype may be overridden/changed if the original file is located in the wrong folder. + dtypes = ["anat", "func", "perf", "fmap", "dwi"] + dtype_orig = "" + for dtype in dtypes: + if dtype in filepath: + dtype_orig = dtype + + if "datatype" in entities.keys(): + dtype_new = entities["datatype"] + if entities["datatype"] != dtype_orig: + print("WARNING: DATATYPE CHANGE DETECTED") + else: + dtype_new = dtype_orig + + # Construct the new filename + if is_longitudinal: + new_path = str(Path(out_dir) / sub / ses / dtype_new / filename) + elif not is_longitudinal: + new_path = str(Path(out_dir) / sub / dtype_new / filename) + + return new_path + + +def assign_variants(summary, rename_cols): + """Assign variant names to files based on differences from dominant group. + + Parameters + ---------- + summary : pandas.DataFrame + The summary DataFrame containing the metadata for each file. + The columns that are used include "ParamGroup", "EntitySet", + the columns in ``rename_cols``, + and any columns in ``rename_cols`` that are prefixed with "Cluster_". + rename_cols : list of str + A list of column names to use for renaming files. + The values in these columns will be compared against the dominant group + and labeled with a variant name if they differ. + + Returns + ------- + pandas.DataFrame + The updated summary DataFrame with a new column "RenameEntitySet" + containing the new entity set names for each file. 
+ """ + # loop through summary tsv and create dom_dict + dom_dict = {} + for row in range(len(summary)): + # if dominant group identified + if str(summary.loc[row, "ParamGroup"]) == "1": + val = {} + # grab col, all vals send to dict + key = summary.loc[row, "EntitySet"] + for col in rename_cols: + summary[col] = summary[col].apply(str) + val[col] = summary.loc[row, col] + + if f"Cluster_{col}" in summary.columns: + val[f"Cluster_{col}"] = summary.loc[row, f"Cluster_{col}"] + + dom_dict[key] = val + + # now loop through again and ID variance + for row in range(len(summary)): + # check to see if renaming has already happened + renamed = False + entities = _entity_set_to_entities(summary.loc[row, "EntitySet"]) + if "VARIANT" in summary.loc[row, "EntitySet"]: + renamed = True + + if summary.loc[row, "ParamGroup"] != 1 and not renamed: + acq_str = "VARIANT" + # now we know we have a deviant param group + # check if TR is same as param group 1 + entity_set = summary.loc[row, "EntitySet"] + for col in rename_cols: + dom_entity_set = dom_dict[entity_set] + summary[col] = summary[col].apply(str) + + if f"Cluster_{col}" in dom_entity_set.keys(): + if summary.loc[row, f"Cluster_{col}"] != dom_entity_set[f"Cluster_{col}"]: + acq_str += col + elif summary.loc[row, col] != dom_entity_set[col]: + if col == "HasFieldmap": + if dom_entity_set[col] == "True": + acq_str += "NoFmap" + else: + acq_str += "HasFmap" + elif col == "UsedAsFieldmap": + if dom_entity_set[col] == "True": + acq_str += "Unused" + else: + acq_str += "IsUsed" + else: + acq_str += col + + if acq_str == "VARIANT": + acq_str += "Other" + + if "acquisition" in entities.keys(): + acq = f"acquisition-{entities['acquisition'] + acq_str}" + + new_name = summary.loc[row, "EntitySet"].replace( + f"acquisition-{entities['acquisition']}", + acq, + ) + else: + acq = f"acquisition-{acq_str}" + new_name = acq + "_" + summary.loc[row, "EntitySet"] + + summary.at[row, "RenameEntitySet"] = new_name + + # convert all "nan" to empty str + # so they don't show up in the summary tsv + if summary.loc[row, "RenameEntitySet"] == "nan": + summary.at[row, "RenameEntitySet"] = "" + + for col in rename_cols: + if summary.loc[row, col] == "nan": + summary.at[row, col] = "" + + return summary