diff --git a/cat_merge/file_utils.py b/cat_merge/file_utils.py index 6f9ee5f..1617011 100644 --- a/cat_merge/file_utils.py +++ b/cat_merge/file_utils.py @@ -6,6 +6,7 @@ from cat_merge.model.merged_kg import MergedKG + def get_files(filepath: str): node_files = [] edge_files = [] @@ -17,15 +18,26 @@ def get_files(filepath: str): edge_files.append(f"{filepath}/{file}") return node_files, edge_files + def read_dfs(files: List[str], add_provided_by: bool = True) -> List[pd.DataFrame]: dataframes = [] for file in files: + dataframes.append(read_df(file, add_provided_by=add_provided_by)) + return dataframes + + +def read_df(file: str, add_provided_by: bool = True, index_column_is_id: bool = True): + + if index_column_is_id: df = pd.read_csv(file, sep="\t", dtype="string", lineterminator="\n", index_col='id', quoting=csv.QUOTE_NONE) df.index.name = 'id' - if add_provided_by: - df["provided_by"] = os.path.basename(file) - dataframes.append(df) - return dataframes + else: + df = pd.read_csv(file, sep="\t", dtype="string", lineterminator="\n", quoting=csv.QUOTE_NONE) + + if add_provided_by: + df["provided_by"] = os.path.basename(file) + return df + def write_df(df: pd.DataFrame, filename: str): df.to_csv(filename, sep="\t") diff --git a/cat_merge/mapping_utils.py b/cat_merge/mapping_utils.py new file mode 100644 index 0000000..2157735 --- /dev/null +++ b/cat_merge/mapping_utils.py @@ -0,0 +1,17 @@ +import numpy as np +from pandas.core.frame import DataFrame + + +def apply_mappings(edges: DataFrame, mapping: DataFrame): + + mapping_dict = mapping.set_index('subject_id')['object_id'] + + edges['original_subject'] = edges['subject'] + edges['subject'] = edges['subject'].replace(mapping_dict) + edges['original_subject'] = np.where(edges.subject == edges.original_subject, None, edges.original_subject) + + edges['original_object'] = edges['object'] + edges['object'] = edges['object'].replace(mapping_dict) + edges['original_object'] = np.where(edges.object == edges.original_object, None, 
edges.original_object) + + return edges diff --git a/cat_merge/merge.py b/cat_merge/merge.py index af45eff..b94df44 100644 --- a/cat_merge/merge.py +++ b/cat_merge/merge.py @@ -7,6 +7,7 @@ def merge( input_dir: str = None,#typer.Option(None, help="Optional directory containing node and edge files"), edges: List[str] = None,#typer.Option(None, help="Optional list of edge files"), nodes: List[str] = None,#typer.Option(None, help="Optional list of node files"), + mapping: str = None,#typer.Option(None, help="Optional SSSOM mapping file") output_dir: str = "merged-output",#typer.Option("merged-output", help="Directory to output knowledge graph") merge_delimiter: str = "|",#typer.Option("|", help="Delimiter to use when merging categories and properties on duplicates") ): @@ -21,8 +22,12 @@ def merge( node_dfs = read_dfs(node_files) edge_dfs = read_dfs(edge_files) + mapping_df = None + if mapping is not None: + mapping_df = read_df(mapping, index_column_is_id=False) + write( name=name, - kg=merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, merge_delimiter=merge_delimiter), + kg=merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, mapping=mapping_df, merge_delimiter=merge_delimiter), output_dir=output_dir ) diff --git a/cat_merge/merge_utils.py b/cat_merge/merge_utils.py index 0b96a1e..e7ac819 100644 --- a/cat_merge/merge_utils.py +++ b/cat_merge/merge_utils.py @@ -1,9 +1,8 @@ import pandas as pd from pandas.core.frame import DataFrame from typing import List -import os from cat_merge.model.merged_kg import MergedKG - +from cat_merge.mapping_utils import apply_mappings def concat_dataframes(dataframes: List[DataFrame]) -> DataFrame: return pd.concat(dataframes, axis=0) @@ -31,11 +30,13 @@ def get_dangling_edges(edges: DataFrame, nodes: DataFrame) -> DataFrame: return edges[~edges.subject.isin(nodes.index) | ~edges.object.isin(nodes.index)] -def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], merge_delimiter: str = "|") -> MergedKG: - +def merge_kg(edge_dfs: List[DataFrame], node_dfs: 
List[DataFrame], mapping: DataFrame = None, merge_delimiter: str = "|") -> MergedKG: all_nodes = concat_dataframes(node_dfs) all_edges = concat_dataframes(edge_dfs) + if mapping is not None: + all_edges = apply_mappings(all_edges, mapping) + duplicate_nodes = get_duplicate_rows(df=all_nodes) dangling_edges = get_dangling_edges(edges=all_edges, nodes=all_nodes) diff --git a/tests/test_utils.py b/tests/test_utils.py index b1ce976..88fb138 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,6 +3,9 @@ # Borrowed from https://stackoverflow.com/questions/58771331/cleanly-hard-code-a-pandas-dataframe-into-a-python-script -def string_df(data: str): - df = pd.read_csv(StringIO(data), index_col='id', sep=r"\s+", engine='python') +def string_df(data: str, index_column_is_id=True): + if index_column_is_id: + df = pd.read_csv(StringIO(data), index_col='id', sep=r"\s+", engine='python') + else: + df = pd.read_csv(StringIO(data), sep=r"\s+", engine='python') return df diff --git a/tests/unit/test_apply_mappings.py b/tests/unit/test_apply_mappings.py new file mode 100644 index 0000000..806fd16 --- /dev/null +++ b/tests/unit/test_apply_mappings.py @@ -0,0 +1,46 @@ +import pytest +from tests.test_utils import string_df +from cat_merge.mapping_utils import apply_mappings + +@pytest.fixture +def edges(): + edges = u"""\ + id subject object + uuid:1 Gene:1 Disease:1 + uuid:2 XGene:2 Disease:2 + uuid:3 Gene:2 XDisease:3 + uuid:4 XGene:3 XDisease:4 + """ + return string_df(edges) + + +@pytest.fixture +def mapping(): + mapping = u"""\ + subject_id object_id + XGene:1 Gene:1 + XGene:2 Gene:2 + XGene:3 Gene:3 + XDisease:1 Disease:1 + XDisease:2 Disease:2 + XDisease:3 Disease:3 + XDisease:4 Disease:4 + """ + return string_df(mapping, index_column_is_id=False) + + +def test_apply_mappings(edges, mapping): + mapped_edges = apply_mappings(edges, mapping) + + assert mapped_edges.loc['uuid:3']['subject'] == 'Gene:2' + assert mapped_edges.loc['uuid:3']['object'] == 'Disease:3' + 
assert mapped_edges.loc['uuid:4']['subject'] == 'Gene:3' + assert mapped_edges.loc['uuid:4']['object'] == 'Disease:4' + + +def test_original_subject_and_object(edges, mapping): + mapped_edges = apply_mappings(edges, mapping) + + assert mapped_edges.loc['uuid:2']['original_subject'] == 'XGene:2' + assert mapped_edges.loc['uuid:3']['original_object'] == 'XDisease:3' + assert mapped_edges.loc['uuid:4']['original_object'] == 'XDisease:4'