Skip to content

Commit

Permalink
Merge branch 'apply-mappings' of https://github.com/monarch-initiative/cat-merge into apply-mappings
Browse files Browse the repository at this point in the history
  • Loading branch information
kevinschaper committed May 13, 2022
2 parents bc2035b + 9f5134e commit b1a628c
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 11 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
- name: Publish to PyPi
env:
PYPI_USERNAME: "@token"
PYPI_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
run: |
poetry config http-basic.pypi "__token__" "${PYPI_API_TOKEN}"
poetry publish
5 changes: 3 additions & 2 deletions cat_merge/file_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import csv
import os, tarfile
from pathlib import Path
import pandas as pd
Expand Down Expand Up @@ -28,10 +29,10 @@ def read_dfs(files: List[str], add_provided_by: bool = True) -> List[pd.DataFram
def read_df(file: str, add_provided_by: bool = True, index_column_is_id: bool = True):

if index_column_is_id:
df = pd.read_csv(file, sep="\t", dtype="string", lineterminator="\n", index_col='id')
df = pd.read_csv(file, sep="\t", dtype="string", lineterminator="\n", index_col='id', quoting=csv.QUOTE_NONE)
df.index.name = 'id'
else:
df = pd.read_csv(file, sep="\t", dtype="string", lineterminator="\n")
df = pd.read_csv(file, sep="\t", dtype="string", lineterminator="\n", quoting=csv.QUOTE_NONE)

if add_provided_by:
df["provided_by"] = os.path.basename(file)
Expand Down
5 changes: 3 additions & 2 deletions cat_merge/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ def merge(
edges: List[str] = None,#typer.Option(None, help="Optional list of edge files"),
nodes: List[str] = None,#typer.Option(None, help="Optional list of node files"),
mapping: str = None,#typer.Option(None, help="Optional SSSOM mapping file")
output_dir: str = "merged-output",#typer.Option("merge-output", help="Directory to output knowledge graph")
output_dir: str = "merged-output",#typer.Option("merged-output", help="Directory to output knowledge graph")
merge_delimiter: str = "|",#typer.Option("|", help="Delimiter to use when merging categories and properties on duplicates")
):

print(f"Merging KG files...\nName: {name} // input_dir: {input_dir} // nodes: {nodes} // edges: {edges} // output_dir: {output_dir}")
Expand All @@ -27,6 +28,6 @@ def merge(

write(
name=name,
kg=merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, mapping=mapping_df),
kg=merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, mapping=mapping_df, merge_delimiter=merge_delimiter),
output_dir=output_dir
)
9 changes: 4 additions & 5 deletions cat_merge/merge_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ def get_duplicate_rows(df: DataFrame) -> DataFrame:
return df[df.index.duplicated(keep=False)]


def clean_nodes(nodes: DataFrame) -> DataFrame:
def clean_nodes(nodes: DataFrame, merge_delimiter: str = " ") -> DataFrame:
nodes.reset_index(inplace=True)
nodes.drop_duplicates(inplace=True)
nodes = nodes.rename(columns={'index': 'id'})
nodes.fillna("None", inplace=True)
column_agg = {x: ' '.join for x in nodes.columns if x != 'id'}
column_agg = {x: merge_delimiter.join for x in nodes.columns if x != 'id'}
nodes = nodes.groupby(['id'], as_index=True).agg(column_agg)
return nodes

Expand All @@ -30,8 +30,7 @@ def get_dangling_edges(edges: DataFrame, nodes: DataFrame) -> DataFrame:
return edges[~edges.subject.isin(nodes.index) | ~edges.object.isin(nodes.index)]


def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], mapping: DataFrame) -> MergedKG:

def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], mapping: DataFrame = None, merge_delimiter: str = "|") -> MergedKG:
all_nodes = concat_dataframes(node_dfs)
all_edges = concat_dataframes(edge_dfs)

Expand All @@ -41,7 +40,7 @@ def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], mapping: Data
duplicate_nodes = get_duplicate_rows(df=all_nodes)
dangling_edges = get_dangling_edges(edges=all_edges, nodes=all_nodes)

nodes = clean_nodes(nodes=all_nodes)
nodes = clean_nodes(nodes=all_nodes, merge_delimiter=merge_delimiter)
edges = clean_edges(edges=all_edges, nodes=nodes)

return MergedKG(nodes=nodes, edges=edges, duplicate_nodes=duplicate_nodes, dangling_edges=dangling_edges)

0 comments on commit b1a628c

Please sign in to comment.