From c840af7008cdbf5157c84aade2bfbbe5adb2f49e Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Thu, 19 Dec 2024 14:16:05 -0500 Subject: [PATCH] making dont merge an option for graph spec merge strategy --- Common/kgx_file_merger.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/Common/kgx_file_merger.py b/Common/kgx_file_merger.py index b6d54159..8c14a000 100644 --- a/Common/kgx_file_merger.py +++ b/Common/kgx_file_merger.py @@ -45,14 +45,11 @@ def merge(self, for graph_source in chain(graph_spec.sources, graph_spec.subgraphs): if graph_source.merge_strategy == 'default': primary_sources.append(graph_source) + elif graph_source.merge_strategy == 'dont_merge_edges': + primary_sources.append(graph_source) elif graph_source.merge_strategy == 'connected_edge_subset': secondary_sources.append(graph_source) - # TODO we should be able to process a single primary source more efficiently (ie copy and paste it) - # if len(primary_sources) == 1: - # self.process_single_source(primary_sources[0], nodes_output_file_path, edges_output_file_path) - # else: - merge_metadata = { 'sources': {}, 'final_node_count': 0, @@ -110,10 +107,18 @@ def merge_primary_sources(self, merge_metadata["sources"][graph_source.id][source_filename]["nodes"] = nodes_count elif "edges" in file_path: - with jsonlines.open(file_path) as edges: - edges_count = graph_merger.merge_edges(edges) - merge_metadata["sources"][graph_source.id][source_filename]["edges"] = edges_count - + if graph_source.merge_strategy == "default": + with jsonlines.open(file_path) as edges: + edges_count = graph_merger.merge_edges(edges) + merge_metadata["sources"][graph_source.id][source_filename]["edges"] = edges_count + elif graph_source.merge_strategy == "dont_merge_edges": + # just copy the lines from the data source edges file verbatim without merging + with jsonlines.open(file_path) as edges, open(edges_out_file, 'a') as edges_out: + edges_count = 0 + for edge in edges: + edges_out.write(edge) + edges_count += 1 + merge_metadata["sources"][graph_source.id][source_filename]["edges"] = edges_count else: raise ValueError(f"Did not recognize file {file_path} for merging " f"from data source {graph_source.id}.") @@ -177,7 +182,7 @@ def __write_back_to_file(self, self.logger.debug(f'Writing merged edges to file...') edges_written = 0 - with open(edges_out_file, 'w') as edges_out: + with open(edges_out_file, 'a') as edges_out: for edge_line in graph_merger.get_merged_edges_jsonl(): edges_out.write(edge_line) edges_written += 1