Skip to content

Commit

Permalink
Merge pull request #11 from monarch-initiative/9-retain-or-add-delimiters-in-merged-fields
Browse files Browse the repository at this point in the history

Add option to modify merge delimiter for duplicate nodes
  • Loading branch information
kevinschaper authored May 12, 2022
2 parents 0fd40f7 + 0f99887 commit fab01f9
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 6 deletions.
5 changes: 3 additions & 2 deletions cat_merge/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ def merge(
input_dir: str = None,#typer.Option(None, help="Optional directory containing node and edge files"),
edges: List[str] = None,#typer.Option(None, help="Optional list of edge files"),
nodes: List[str] = None,#typer.Option(None, help="Optional list of node files"),
output_dir: str = "merged-output",#typer.Option("merge-output", help="Directory to output knowledge graph")
output_dir: str = "merged-output",#typer.Option("merged-output", help="Directory to output knowledge graph")
merge_delimiter: str = "|",#typer.Option("|", help="Delimiter to use when merging categories and properties on duplicates")
):

print(f"Merging KG files...\nName: {name} // input_dir: {input_dir} // nodes: {nodes} // edges: {edges} // output_dir: {output_dir}")
Expand All @@ -22,6 +23,6 @@ def merge(

write(
name=name,
kg=merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs),
kg=merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, merge_delimiter=merge_delimiter),
output_dir=output_dir
)
8 changes: 4 additions & 4 deletions cat_merge/merge_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ def get_duplicate_rows(df: DataFrame) -> DataFrame:
return df[df.index.duplicated(keep=False)]


def clean_nodes(nodes: DataFrame, merge_delimiter: str = " ") -> DataFrame:
    """Collapse node rows that share an id into a single row per id.

    The frame's index is moved into a column (renamed to ``id`` when it
    comes out as ``index``), exact duplicate rows are dropped, missing
    values are replaced with the string ``"None"``, and the remaining rows
    are grouped by ``id``. Within each group, the values of every other
    column are concatenated with ``merge_delimiter``.

    Args:
        nodes: Node table indexed by node id. Non-id columns are expected
            to hold strings, because values are combined with ``str.join``.
        merge_delimiter: Separator placed between merged values. The
            default is a single space, matching the behaviour before the
            delimiter became configurable.

    Returns:
        A new DataFrame indexed by ``id`` with one row per distinct id.
        The input frame is not modified.
    """
    # Use the copying variants instead of inplace=True so the caller's
    # frame keeps its original index and rows.
    nodes = nodes.reset_index()
    nodes = nodes.drop_duplicates()
    nodes = nodes.rename(columns={'index': 'id'})
    nodes = nodes.fillna("None")
    column_agg = {x: merge_delimiter.join for x in nodes.columns if x != 'id'}
    nodes = nodes.groupby(['id'], as_index=True).agg(column_agg)
    return nodes

Expand All @@ -31,15 +31,15 @@ def get_dangling_edges(edges: DataFrame, nodes: DataFrame) -> DataFrame:
return edges[~edges.subject.isin(nodes.index) | ~edges.object.isin(nodes.index)]


def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], merge_delimiter: str = "|") -> MergedKG:
    """Combine node and edge tables into a single merged knowledge graph.

    Args:
        edge_dfs: Edge tables to concatenate.
        node_dfs: Node tables to concatenate.
        merge_delimiter: Separator forwarded to ``clean_nodes`` for joining
            the values of nodes that share an id.

    Returns:
        A ``MergedKG`` holding the cleaned nodes and edges, together with
        the duplicate-node rows and dangling edges found in the raw input.
    """
    all_nodes = concat_dataframes(node_dfs)
    all_edges = concat_dataframes(edge_dfs)

    # Diagnostics are taken from the raw concatenated frames, before any
    # cleaning is applied to them.
    duplicate_nodes = get_duplicate_rows(df=all_nodes)
    dangling_edges = get_dangling_edges(edges=all_edges, nodes=all_nodes)

    nodes = clean_nodes(nodes=all_nodes, merge_delimiter=merge_delimiter)
    # clean_edges receives the merged nodes; presumably it filters edges
    # against them -- its body is not shown here, so confirm.
    edges = clean_edges(edges=all_edges, nodes=nodes)

    return MergedKG(nodes=nodes, edges=edges, duplicate_nodes=duplicate_nodes, dangling_edges=dangling_edges)

0 comments on commit fab01f9

Please sign in to comment.