From 298e7207057fa4dda445db8243157fb0fd8b0f85 Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Mon, 2 May 2022 16:45:09 -0700 Subject: [PATCH 1/7] scaffolding --- tests/unit/test_apply_mappings.py | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 tests/unit/test_apply_mappings.py diff --git a/tests/unit/test_apply_mappings.py b/tests/unit/test_apply_mappings.py new file mode 100644 index 0000000..3374d17 --- /dev/null +++ b/tests/unit/test_apply_mappings.py @@ -0,0 +1,6 @@ +import pytest +from tests.test_utils import string_df + + +def test_apply_mappings(): + pass From 093e97696845d40233d589ca9db8729fd17cd144 Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Tue, 3 May 2022 12:07:12 -0700 Subject: [PATCH 2/7] more scaffolding --- cat_merge/mapping_utils.py | 7 +++++++ cat_merge/merge_utils.py | 1 - tests/unit/test_apply_mappings.py | 23 +++++++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 cat_merge/mapping_utils.py diff --git a/cat_merge/mapping_utils.py b/cat_merge/mapping_utils.py new file mode 100644 index 0000000..9fbbfcc --- /dev/null +++ b/cat_merge/mapping_utils.py @@ -0,0 +1,7 @@ +import pandas as pd +from pandas.core.frame import DataFrame +from typing import List + + +def apply_mappings(edges: DataFrame): + pass diff --git a/cat_merge/merge_utils.py b/cat_merge/merge_utils.py index a3dcef9..2a2d5da 100644 --- a/cat_merge/merge_utils.py +++ b/cat_merge/merge_utils.py @@ -1,7 +1,6 @@ import pandas as pd from pandas.core.frame import DataFrame from typing import List -import os from cat_merge.model.merged_kg import MergedKG diff --git a/tests/unit/test_apply_mappings.py b/tests/unit/test_apply_mappings.py index 3374d17..53f17e1 100644 --- a/tests/unit/test_apply_mappings.py +++ b/tests/unit/test_apply_mappings.py @@ -2,5 +2,28 @@ from tests.test_utils import string_df +@pytest.fixture +def edges(): + edges = u"""\ + id subject object + uuid:1 Gene:1 Disease:1 + uuid:2 XGene:2 Disease:2 + uuid:3 Gene:2 
XDisease:3 + uuid:2 XGene:3 XDisease:4 + """ + +@pytest.fixture +def mapping(): # owl:sameAs + mapping = u"""\ + subject_id predicate_id + XGene:1 Gene:1 + XGene:2 Gene:2 + XGene:3 Gene:3 + XDisease:1 Disease:1 + XDisease:2 Disease:2 + XDisease:3 Disease:3 + XDisease:4 Disease:4 + """ + def test_apply_mappings(): pass From 4d5080b116f52082c8b3b4d3b15f758512b4c048 Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Mon, 9 May 2022 21:05:49 -0700 Subject: [PATCH 3/7] some initial mapping tests --- cat_merge/mapping_utils.py | 7 +++++-- tests/test_utils.py | 7 +++++-- tests/unit/test_apply_mappings.py | 27 ++++++++++++++++++++++----- 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/cat_merge/mapping_utils.py b/cat_merge/mapping_utils.py index 9fbbfcc..fac6d86 100644 --- a/cat_merge/mapping_utils.py +++ b/cat_merge/mapping_utils.py @@ -3,5 +3,8 @@ from typing import List -def apply_mappings(edges: DataFrame): - pass +def apply_mappings(edges: DataFrame, mapping: DataFrame): + + # todo: apply mappings + + return edges diff --git a/tests/test_utils.py b/tests/test_utils.py index b1ce976..88fb138 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,6 +3,9 @@ # Borrowed from https://stackoverflow.com/questions/58771331/cleanly-hard-code-a-pandas-dataframe-into-a-python-script -def string_df(data: str): - df = pd.read_csv(StringIO(data), index_col='id', sep=r"\s+", engine='python') +def string_df(data: str, index_column_is_id=True): + if index_column_is_id: + df = pd.read_csv(StringIO(data), index_col='id', sep=r"\s+", engine='python') + else: + df = pd.read_csv(StringIO(data), sep=r"\s+", engine='python') return df diff --git a/tests/unit/test_apply_mappings.py b/tests/unit/test_apply_mappings.py index 53f17e1..b05cc82 100644 --- a/tests/unit/test_apply_mappings.py +++ b/tests/unit/test_apply_mappings.py @@ -1,6 +1,6 @@ import pytest from tests.test_utils import string_df - +from cat_merge.mapping_utils import apply_mappings @pytest.fixture def 
edges(): @@ -9,13 +9,15 @@ def edges(): uuid:1 Gene:1 Disease:1 uuid:2 XGene:2 Disease:2 uuid:3 Gene:2 XDisease:3 - uuid:2 XGene:3 XDisease:4 + uuid:4 XGene:3 XDisease:4 """ + return string_df(edges) + @pytest.fixture def mapping(): # owl:sameAs mapping = u"""\ - subject_id predicate_id + subject_id object_id XGene:1 Gene:1 XGene:2 Gene:2 XGene:3 Gene:3 @@ -24,6 +26,21 @@ def mapping(): # owl:sameAs XDisease:3 Disease:3 XDisease:4 Disease:4 """ + return string_df(mapping, index_column_is_id=False) + + +def test_apply_mappings(edges, mapping): + mapped_edges = apply_mappings(edges, mapping) + + assert mapped_edges.loc['uuid:3']['subject'] == 'Gene:2' + assert mapped_edges.loc['uuid:3']['object'] == 'Disease:3' + assert mapped_edges.loc['uuid:4']['subject'] == 'Gene:3' + assert mapped_edges.loc['uuid:4']['object'] == 'Disease:4' + + +def test_original_subject_and_object(edges, mapping): + mapped_edges = apply_mappings(edges, mapping) -def test_apply_mappings(): - pass + assert mapped_edges.loc['uuid:3']['original_subject'] == 'XGene:2' + assert mapped_edges.loc['uuid:3']['original_object'] == 'XDisease:3' + assert mapped_edges.loc['uuid:4']['original_object'] == 'XDisease:4' From e93cacdb185a257f4ba12fc0f2d285cdbac1d229 Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Thu, 12 May 2022 15:03:54 -0700 Subject: [PATCH 4/7] added pandas logic to apply mappings --- cat_merge/mapping_utils.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/cat_merge/mapping_utils.py b/cat_merge/mapping_utils.py index fac6d86..2157735 100644 --- a/cat_merge/mapping_utils.py +++ b/cat_merge/mapping_utils.py @@ -1,10 +1,17 @@ -import pandas as pd +import numpy as np from pandas.core.frame import DataFrame -from typing import List def apply_mappings(edges: DataFrame, mapping: DataFrame): - # todo: apply mappings + mapping_dict = mapping.set_index('subject_id')['object_id'] + + edges['original_subject'] = edges['subject'] + 
edges['subject'] = edges['subject'].replace(mapping_dict) + edges['original_subject'] = np.where(edges.subject == edges.original_subject, None, edges.original_subject) + + edges['original_object'] = edges['object'] + edges['object'] = edges['object'].replace(mapping_dict) + edges['original_object'] = np.where(edges.object == edges.original_object, None, edges.original_object) return edges From c5ce9f7f038591236fede60f21ce7da5861076ca Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Thu, 12 May 2022 15:15:07 -0700 Subject: [PATCH 5/7] a bug in the test! --- tests/unit/test_apply_mappings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_apply_mappings.py b/tests/unit/test_apply_mappings.py index b05cc82..806fd16 100644 --- a/tests/unit/test_apply_mappings.py +++ b/tests/unit/test_apply_mappings.py @@ -15,7 +15,7 @@ def edges(): @pytest.fixture -def mapping(): # owl:sameAs +def mapping(): mapping = u"""\ subject_id object_id XGene:1 Gene:1 @@ -41,6 +41,6 @@ def test_apply_mappings(edges, mapping): def test_original_subject_and_object(edges, mapping): mapped_edges = apply_mappings(edges, mapping) - assert mapped_edges.loc['uuid:3']['original_subject'] == 'XGene:2' + assert mapped_edges.loc['uuid:2']['original_subject'] == 'XGene:2' assert mapped_edges.loc['uuid:3']['original_object'] == 'XDisease:3' assert mapped_edges.loc['uuid:4']['original_object'] == 'XDisease:4' From 8e9d0c6b9fb648aa7317cfa60d4a876930ac2299 Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Thu, 12 May 2022 15:39:44 -0700 Subject: [PATCH 6/7] Adds support for mappings to the top level merge commands --- cat_merge/file_utils.py | 20 ++++++++++++++++---- cat_merge/merge.py | 7 ++++++- cat_merge/merge_utils.py | 7 +++++-- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/cat_merge/file_utils.py b/cat_merge/file_utils.py index 4b184f3..8138591 100644 --- a/cat_merge/file_utils.py +++ b/cat_merge/file_utils.py @@ -5,6 +5,7 @@ from cat_merge.model.merged_kg 
import MergedKG + def get_files(filepath: str): node_files = [] edge_files = [] @@ -16,15 +17,26 @@ def get_files(filepath: str): edge_files.append(f"{filepath}/{file}") return node_files, edge_files + def read_dfs(files: List[str], add_provided_by: bool = True) -> List[pd.DataFrame]: dataframes = [] for file in files: + dataframes.append(read_df(file, add_provided_by=add_provided_by)) + return dataframes + + +def read_df(file: str, add_provided_by: bool = True, index_column_is_id: bool = True): + + if index_column_is_id: df = pd.read_csv(file, sep="\t", dtype="string", lineterminator="\n", index_col='id') df.index.name = 'id' - if add_provided_by: - df["provided_by"] = os.path.basename(file) - dataframes.append(df) - return dataframes + else: + df = pd.read_csv(file, sep="\t", dtype="string", lineterminator="\n") + + if add_provided_by: + df["provided_by"] = os.path.basename(file) + return df + def write_df(df: pd.DataFrame, filename: str): df.to_csv(filename, sep="\t") diff --git a/cat_merge/merge.py b/cat_merge/merge.py index 51786ab..6eebda9 100644 --- a/cat_merge/merge.py +++ b/cat_merge/merge.py @@ -7,6 +7,7 @@ def merge( input_dir: str = None,#typer.Option(None, help="Optional directory containing node and edge files"), edges: List[str] = None,#typer.Option(None, help="Optional list of edge files"), nodes: List[str] = None,#typer.Option(None, help="Optional list of node files"), + mapping: str = None,#typer.Option(None, help="Optional SSSOM mapping file") output_dir: str = "merged-output",#typer.Option("merge-output", help="Directory to output knowledge graph") ): @@ -20,8 +21,12 @@ def merge( node_dfs = read_dfs(node_files) edge_dfs = read_dfs(edge_files) + mapping_df = None + if mapping is not None: + mapping_df = read_df() + write( name=name, - kg=merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs), + kg=merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, mapping=mapping_df), output_dir=output_dir ) diff --git a/cat_merge/merge_utils.py b/cat_merge/merge_utils.py 
index 2a2d5da..0db9b13 100644 --- a/cat_merge/merge_utils.py +++ b/cat_merge/merge_utils.py @@ -2,7 +2,7 @@ from pandas.core.frame import DataFrame from typing import List from cat_merge.model.merged_kg import MergedKG - +from cat_merge.mapping_utils import apply_mappings def concat_dataframes(dataframes: List[DataFrame]) -> DataFrame: return pd.concat(dataframes, axis=0) @@ -30,11 +30,14 @@ def get_dangling_edges(edges: DataFrame, nodes: DataFrame) -> DataFrame: return edges[~edges.subject.isin(nodes.index) | ~edges.object.isin(nodes.index)] -def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame]) -> MergedKG: +def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], mapping: DataFrame = None) -> MergedKG: all_nodes = concat_dataframes(node_dfs) all_edges = concat_dataframes(edge_dfs) + if mapping is not None: + all_edges = apply_mappings(all_edges, mapping) + duplicate_nodes = get_duplicate_rows(df=all_nodes) dangling_edges = get_dangling_edges(edges=all_edges, nodes=all_nodes) From bc2035b157317730800d1f91c834d058a7ff6e04 Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Thu, 12 May 2022 18:07:31 -0700 Subject: [PATCH 7/7] bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 28e67d1..91c6441 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "cat-merge" -version = "0.1.6" +version = "0.1.7" description = "" authors = [ "Monarch Initiative ",