Skip to content

Commit

Permalink
Merge pull request #12 from monarch-initiative/apply-mappings
Browse files Browse the repository at this point in the history
Apply mappings
  • Loading branch information
kevinschaper authored May 13, 2022
2 parents fab01f9 + b1a628c commit 99bca89
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 11 deletions.
20 changes: 16 additions & 4 deletions cat_merge/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from cat_merge.model.merged_kg import MergedKG


def get_files(filepath: str):
node_files = []
edge_files = []
Expand All @@ -17,15 +18,26 @@ def get_files(filepath: str):
edge_files.append(f"{filepath}/{file}")
return node_files, edge_files


def read_dfs(files: List[str], add_provided_by: bool = True) -> List[pd.DataFrame]:
    """Read each path in ``files`` with ``read_df`` and return the frames in order.

    Args:
        files: Paths of the files to load.
        add_provided_by: Forwarded unchanged to ``read_df`` for every file.

    Returns:
        One DataFrame per input path, in the same order as ``files``.
    """
    return [read_df(path, add_provided_by=add_provided_by) for path in files]


def read_df(file: str, add_provided_by: bool = True, index_column_is_id: bool = True):
    """Read one tab-separated file into a DataFrame of string columns.

    Args:
        file: Path of the TSV file to read.
        add_provided_by: When true, add a ``provided_by`` column holding the
            base name of ``file`` on every row.
        index_column_is_id: When true, use the ``id`` column as the index;
            otherwise keep the default positional index and leave every
            column (including any ``id``) as data.

    Returns:
        The loaded DataFrame.
    """
    # Options shared by both read modes; quoting is disabled so quote
    # characters in the data are kept verbatim.
    read_kwargs = dict(sep="\t", dtype="string", lineterminator="\n", quoting=csv.QUOTE_NONE)
    if index_column_is_id:
        read_kwargs["index_col"] = 'id'

    df = pd.read_csv(file, **read_kwargs)
    if index_column_is_id:
        df.index.name = 'id'

    # Applied on both paths, and the frame itself is always what is returned.
    if add_provided_by:
        df["provided_by"] = os.path.basename(file)
    return df


def write_df(df: pd.DataFrame, filename: str):
df.to_csv(filename, sep="\t")
Expand Down
17 changes: 17 additions & 0 deletions cat_merge/mapping_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import numpy as np
from pandas.core.frame import DataFrame


def apply_mappings(edges: DataFrame, mapping: DataFrame):
    """Rewrite edge subjects and objects through an identifier mapping.

    Args:
        edges: Edge table with ``subject`` and ``object`` columns. It is
            modified in place and also returned.
        mapping: Table with a ``subject_id`` column (identifier to replace)
            and an ``object_id`` column (its replacement).

    Returns:
        ``edges`` with ``subject``/``object`` remapped and two added columns,
        ``original_subject`` and ``original_object``. Each holds the
        pre-mapping value on rows that were changed and ``None`` on rows
        that were left as they were.
    """
    mapping_dict = mapping.set_index('subject_id')['object_id']

    for column in ('subject', 'object'):
        original = edges[column]
        # Assign the result back to the frame rather than calling
        # ``edges[column].replace(..., inplace=True)``: the in-place form acts
        # on a temporary Series and does not update ``edges`` when pandas
        # Copy-on-Write is active.
        edges[column] = original.replace(mapping_dict)
        # Keep the old identifier only where the mapping actually changed it.
        edges[f'original_{column}'] = np.where(edges[column] == original, None, original)

    return edges
7 changes: 6 additions & 1 deletion cat_merge/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ def merge(
input_dir: str = None,#typer.Option(None, help="Optional directory containing node and edge files"),
edges: List[str] = None,#typer.Option(None, help="Optional list of edge files"),
nodes: List[str] = None,#typer.Option(None, help="Optional list of node files"),
mapping: str = None,#typer.Option(None, help="Optional SSSOM mapping file")
output_dir: str = "merged-output",#typer.Option("merged-output", help="Directory to output knowledge graph")
merge_delimiter: str = "|",#typer.Option("|", help="Delimiter to use when merging categories and properties on duplicates")
):
Expand All @@ -21,8 +22,12 @@ def merge(
node_dfs = read_dfs(node_files)
edge_dfs = read_dfs(edge_files)

mapping_df = None
if mapping is not None:
mapping_df = read_df()

write(
name=name,
kg=merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, merge_delimiter=merge_delimiter),
kg=merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, mapping=mapping_df, merge_delimiter=merge_delimiter),
output_dir=output_dir
)
9 changes: 5 additions & 4 deletions cat_merge/merge_utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import pandas as pd
from pandas.core.frame import DataFrame
from typing import List
import os
from cat_merge.model.merged_kg import MergedKG

from cat_merge.mapping_utils import apply_mappings

def concat_dataframes(dataframes: List[DataFrame]) -> DataFrame:
    """Stack the given frames row-wise (``axis=0``) into a single DataFrame."""
    combined = pd.concat(dataframes, axis=0)
    return combined
Expand Down Expand Up @@ -31,11 +30,13 @@ def get_dangling_edges(edges: DataFrame, nodes: DataFrame) -> DataFrame:
return edges[~edges.subject.isin(nodes.index) | ~edges.object.isin(nodes.index)]


def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], merge_delimiter: str = "|") -> MergedKG:

def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], mapping: DataFrame = None, merge_delimiter: str = "|") -> MergedKG:
all_nodes = concat_dataframes(node_dfs)
all_edges = concat_dataframes(edge_dfs)

if mapping is not None:
all_edges = apply_mappings(all_edges, mapping)

duplicate_nodes = get_duplicate_rows(df=all_nodes)
dangling_edges = get_dangling_edges(edges=all_edges, nodes=all_nodes)

Expand Down
7 changes: 5 additions & 2 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@


# Borrowed from https://stackoverflow.com/questions/58771331/cleanly-hard-code-a-pandas-dataframe-into-a-python-script
def string_df(data: str):
df = pd.read_csv(StringIO(data), index_col='id', sep=r"\s+", engine='python')
def string_df(data: str, index_column_is_id=True):
    """Parse a whitespace-delimited table literal into a DataFrame.

    Args:
        data: Table text whose first line is the header row.
        index_column_is_id: When true, the ``id`` column becomes the index;
            otherwise the default positional index is used.
    """
    read_kwargs = {"sep": r"\s+", "engine": 'python'}
    if index_column_is_id:
        read_kwargs["index_col"] = 'id'
    return pd.read_csv(StringIO(data), **read_kwargs)
46 changes: 46 additions & 0 deletions tests/unit/test_apply_mappings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import pytest
from tests.test_utils import string_df
from cat_merge.mapping_utils import apply_mappings

@pytest.fixture
def edges():
    """Edge table fixture: four edges indexed by ``id`` with subject/object columns."""
    table = """\
id subject object
uuid:1 Gene:1 Disease:1
uuid:2 XGene:2 Disease:2
uuid:3 Gene:2 XDisease:3
uuid:4 XGene:3 XDisease:4
"""
    return string_df(table)


@pytest.fixture
def mapping():
    """Mapping table fixture: ``subject_id`` -> ``object_id`` rows, no id index."""
    table = """\
subject_id object_id
XGene:1 Gene:1
XGene:2 Gene:2
XGene:3 Gene:3
XDisease:1 Disease:1
XDisease:2 Disease:2
XDisease:3 Disease:3
XDisease:4 Disease:4
"""
    return string_df(table, index_column_is_id=False)


def test_apply_mappings(edges, mapping):
    """Check the subject/object values of edges uuid:3 and uuid:4 after mapping."""
    result = apply_mappings(edges, mapping)

    expected = {
        ('uuid:3', 'subject'): 'Gene:2',
        ('uuid:3', 'object'): 'Disease:3',
        ('uuid:4', 'subject'): 'Gene:3',
        ('uuid:4', 'object'): 'Disease:4',
    }
    for (edge_id, column), value in expected.items():
        assert result.loc[edge_id][column] == value


def test_original_subject_and_object(edges, mapping):
    """Check that pre-mapping identifiers are kept in the ``original_*`` columns."""
    result = apply_mappings(edges, mapping)

    expected = {
        ('uuid:2', 'original_subject'): 'XGene:2',
        ('uuid:3', 'original_object'): 'XDisease:3',
        ('uuid:4', 'original_object'): 'XDisease:4',
    }
    for (edge_id, column), value in expected.items():
        assert result.loc[edge_id][column] == value

0 comments on commit 99bca89

Please sign in to comment.