Skip to content

Commit

Permalink
standardize metadata to include "identifier" instead of "repertoire_identifier"
Browse files Browse the repository at this point in the history
  • Loading branch information
pavlovicmilena committed Aug 3, 2021
1 parent 329456b commit 4f6928e
Show file tree
Hide file tree
Showing 9 changed files with 12 additions and 12 deletions.
2 changes: 1 addition & 1 deletion docs/source/galaxy/galaxy_intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,4 @@ you expected of the tool, and click 'Report'.

.. image:: ../_static/images/galaxy/report_bug.png
:alt: bug report
:width: 250
:width: 80%
3 changes: 2 additions & 1 deletion immuneML/IO/dataset_export/AIRRExporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,9 @@ def get_sequence_aa_field(region_type):
@staticmethod
def export_updated_metadata(dataset: RepertoireDataset, result_path: Path, repertoire_folder: str):
df = pd.read_csv(dataset.metadata_file, comment=Constants.COMMENT_SIGN)
identifiers = df["repertoire_identifier"].values.tolist() if "repertoire_identifier" in df.columns else dataset.get_example_ids()
identifiers = df["identifier"].values.tolist() if "identifier" in df.columns else dataset.get_example_ids()
df["filename"] = [str(Path(repertoire_folder) / f"{repertoire.data_filename.stem}.tsv") for repertoire in dataset.get_data()]
df['identifier'] = identifiers
df.to_csv(result_path / "metadata.csv", index=False)

@staticmethod
Expand Down
4 changes: 2 additions & 2 deletions immuneML/IO/dataset_import/ImmuneMLImport.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ def _update_receptor_paths(pickle_params, dataset: ElementDataset):
return dataset

@staticmethod
def _discover_repertoire_path(pickle_params, dataset):
dataset_dir = ImmuneMLImport._discover_dataset_dir(pickle_params)
def _discover_repertoire_path(params, dataset):
dataset_dir = ImmuneMLImport._discover_dataset_dir(params)

if len(list(dataset_dir.glob("*.npy"))) == len(dataset.repertoires):
path = dataset_dir
Expand Down
2 changes: 1 addition & 1 deletion immuneML/data_model/dataset/RepertoireDataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def build(cls, **kwargs):
filename = filename.parent.parent / Path(row['filename']).name
repertoire = Repertoire(data_filename=filename,
metadata_filename=filename.parent / f'{filename.stem}_metadata.yaml',
identifier=row['repertoire_identifier'])
identifier=row['identifier'])
repertoires.append(repertoire)

if "repertoire_ids" in kwargs.keys() and "repertoires" not in kwargs.keys() and kwargs['repertoire_ids'] is not None:
Expand Down
6 changes: 3 additions & 3 deletions immuneML/encodings/distance_encoding/DistanceEncoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,13 @@ def build_distance_matrix(self, dataset: RepertoireDataset, params: EncoderParam

def build_labels(self, dataset: RepertoireDataset, params: EncoderParams) -> dict:

lbl = ["repertoire_identifier"]
lbl = ["identifier"]
lbl.extend(params.label_config.get_labels_by_name())

tmp_labels = dataset.get_metadata(lbl, return_df=True)
tmp_labels = tmp_labels.iloc[pd.Index(tmp_labels['repertoire_identifier']).get_indexer(dataset.get_repertoire_ids())]
tmp_labels = tmp_labels.iloc[pd.Index(tmp_labels['identifier']).get_indexer(dataset.get_repertoire_ids())]
tmp_labels = tmp_labels.to_dict("list")
del tmp_labels["repertoire_identifier"]
del tmp_labels["identifier"]

return tmp_labels

Expand Down
2 changes: 1 addition & 1 deletion immuneML/environment/Constants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
class Constants:

VERSION = "2.0.2"
VERSION = "2.0.3"

# encoding constants
FEATURE_DELIMITER = "///"
Expand Down
1 change: 0 additions & 1 deletion immuneML/util/ImportHelper.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,6 @@ def import_receptors_by_id(df, identifier, chain_pair, metadata_columns) -> List
f"Missing {chain_pair.value[i]} chain for receptor with identifier {identifier}, this receptor will be omitted.")
return []

# todo add options like IRIS import: option to import all dual chains or just the first pair / all V genes when uncertain annotation, etc
# todo add possibility to import multiple chain combo's? (BCR heavy-light & heavy-kappa, as seen in 10xGenomics?)

return [ImportHelper.build_receptor_from_rows(first_row.iloc[0], second_row.iloc[0], identifier, chain_pair, metadata_columns)]
Expand Down
2 changes: 1 addition & 1 deletion immuneML/util/RepertoireBuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def build(sequences: list, path: Path, labels: dict = None, seq_metadata: list =

df = pd.DataFrame({**{"filename": [repertoire.data_filename for repertoire in repertoires],
"subject_id": subject_ids,
"repertoire_identifier": [repertoire.identifier for repertoire in repertoires]},
"identifier": [repertoire.identifier for repertoire in repertoires]},
**(labels if labels is not None else {})})
df.to_csv(path / "metadata.csv", index=False)

Expand Down
2 changes: 1 addition & 1 deletion immuneML/workflows/steps/SignalImplanter.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def _create_metadata_file(processed_repertoires: List[Repertoire], simulation_st

path = simulation_state.result_path / "metadata.csv"

new_df = pd.DataFrame([{**repertoire.metadata, **{'repertoire_identifier': repertoire.identifier}} for repertoire in processed_repertoires])
new_df = pd.DataFrame([{**repertoire.metadata, **{'identifier': repertoire.identifier}} for repertoire in processed_repertoires])
new_df.drop('field_list', axis=1, inplace=True)
new_df["filename"] = [repertoire.data_filename.name for repertoire in processed_repertoires]
new_df.to_csv(path, index=False)
Expand Down

0 comments on commit 4f6928e

Please sign in to comment.