Skip to content

Commit

Permalink
Merge pull request #17 from monarch-initiative/multiple-mapping
Browse files Browse the repository at this point in the history
Added support for applying multiple mapping files
  • Loading branch information
kevinschaper authored Jun 14, 2022
2 parents 0383538 + 4ef888c commit 255a5c9
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 13 deletions.
4 changes: 2 additions & 2 deletions cat_merge/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
@click.command()
@click.option('--name', help='Name of KG to merge')
@click.option('--input_dir', help='Optional directory containing node and edge files')
@click.option('--mapping', required=False, help='Optional SSSOM mapping file')
@click.option('--mapping', multiple=True, required=False, help='Optional SSSOM mapping file(s)')
@click.option('--output_dir', help='Directory to output knowledge graph')
@click.option('--qc_report', required=False, default=True,
help='Boolean for whether to generate a qc report (defaults to True')
def main(name, input_dir, mapping, output_dir, qc_report):
merge(name=name, input_dir=input_dir, mapping=mapping, output_dir=output_dir, qc_report=qc_report)
merge(name=name, input_dir=input_dir, mappings=mapping, output_dir=output_dir, qc_report=qc_report)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion cat_merge/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def read_dfs(files: List[str], add_provided_by: bool = True) -> List[pd.DataFram


def read_df(file: str, add_provided_by: bool = True):
df = pd.read_csv(file, sep="\t", dtype="string", lineterminator="\n", quoting=csv.QUOTE_NONE)
df = pd.read_csv(file, sep="\t", dtype="string", lineterminator="\n", quoting=csv.QUOTE_NONE, comment='#')

if add_provided_by:
df["provided_by"] = os.path.basename(file)
Expand Down
13 changes: 7 additions & 6 deletions cat_merge/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def merge(
input_dir: str = None, # Optional directory containing node and edge files
edges: List[str] = None, # Optional list of edge files
nodes: List[str] = None, # Optional list of node files
mapping: str = None, # Optional SSSOM mapping file
mappings: List[str] = None, # Optional list of SSSOM mapping files
output_dir: str = "merged-output", # Directory to output knowledge graph
merge_delimiter: str = "|", # Delimiter to use when merging categories and properties on duplicates
qc_report: bool = True
Expand All @@ -24,7 +24,7 @@ def merge(
input_dir: {input_dir}
nodes: {nodes}
edges: {edges}
mapping: {mapping}
mappings: {mappings}
output_dir: {output_dir}
""")

Expand All @@ -38,12 +38,13 @@ def merge(
node_dfs = read_dfs(node_files)
edge_dfs = read_dfs(edge_files)

mapping_df = None
if mapping is not None:
mapping_df = read_df(mapping, add_provided_by=False)
mapping_dfs = []
if mappings is not None:
for file in mappings:
mapping_dfs.append(read_df(file, add_provided_by=False))

print("Merging...")
kg = merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, mapping=mapping_df, merge_delimiter=merge_delimiter)
kg = merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, mapping_dfs=mapping_dfs, merge_delimiter=merge_delimiter)
write(
name=name,
kg=kg,
Expand Down
8 changes: 5 additions & 3 deletions cat_merge/merge_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from cat_merge.model.merged_kg import MergedKG
from cat_merge.mapping_utils import apply_mappings


def concat_dataframes(dataframes: List[DataFrame]) -> DataFrame:
    """Stack several dataframes on top of each other, row-wise.

    Args:
        dataframes: The frames to combine. ``pd.concat`` raises
            ``ValueError`` if this is empty.

    Returns:
        A single dataframe holding the rows of every input frame, in
        input order. ``ignore_index`` is not passed, so each row keeps the
        index label it had in its source frame and labels may repeat.
    """
    return pd.concat(dataframes, axis=0)

Expand All @@ -26,14 +27,15 @@ def get_dangling_edges(edges: DataFrame, nodes: DataFrame) -> DataFrame:
return dangling_edges


def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], mapping: DataFrame = None, merge_delimiter: str = "|") -> MergedKG:
def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], mapping_dfs: List[DataFrame] = None, merge_delimiter: str = "|") -> MergedKG:
all_nodes = concat_dataframes(node_dfs)
all_nodes.fillna("None", inplace=True)
all_edges = concat_dataframes(edge_dfs)
all_edges.fillna("None", inplace=True)

if mapping is not None:
all_edges = apply_mappings(all_edges, mapping)
if mapping_dfs is not None and len(mapping_dfs) > 0:
mapping_df = concat_dataframes(mapping_dfs)
all_edges = apply_mappings(all_edges, mapping_df)

duplicate_nodes = get_duplicate_rows(df=all_nodes)
dangling_edges = get_dangling_edges(edges=all_edges, nodes=all_nodes)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "cat-merge"
version = "0.1.13"
version = "0.1.14"
description = ""
authors = [
"Monarch Initiative <[email protected]>",
Expand Down

0 comments on commit 255a5c9

Please sign in to comment.