Skip to content

Commit

Permalink
Merge branch 'apply-mappings' of https://github.com/monarch-initiative/cat-merge into apply-mappings
Browse files Browse the repository at this point in the history
  • Loading branch information
kevinschaper committed May 13, 2022
2 parents bc2035b + 9f5134e commit b1a628c
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 11 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
- name: Publish to PyPi
env:
PYPI_USERNAME: "@token"
PYPI_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
run: |
poetry config http-basic.pypi "__token__" "${PYPI_API_TOKEN}"
poetry publish
5 changes: 3 additions & 2 deletions cat_merge/file_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import csv
import os, tarfile
from pathlib import Path
import pandas as pd
Expand Down Expand Up @@ -28,10 +29,10 @@ def read_dfs(files: List[str], add_provided_by: bool = True) -> List[pd.DataFram
def read_df(file: str, add_provided_by: bool = True, index_column_is_id: bool = True):

if index_column_is_id:
df = pd.read_csv(file, sep="\t", dtype="string", lineterminator="\n", index_col='id')
df = pd.read_csv(file, sep="\t", dtype="string", lineterminator="\n", index_col='id', quoting=csv.QUOTE_NONE)
df.index.name = 'id'
else:
df = pd.read_csv(file, sep="\t", dtype="string", lineterminator="\n")
df = pd.read_csv(file, sep="\t", dtype="string", lineterminator="\n", quoting=csv.QUOTE_NONE)

if add_provided_by:
df["provided_by"] = os.path.basename(file)
Expand Down
5 changes: 3 additions & 2 deletions cat_merge/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ def merge(
edges: List[str] = None,#typer.Option(None, help="Optional list of edge files"),
nodes: List[str] = None,#typer.Option(None, help="Optional list of node files"),
mapping: str = None,#typer.Option(None, help="Optional SSSOM mapping file")
output_dir: str = "merged-output",#typer.Option("merge-output", help="Directory to output knowledge graph")
output_dir: str = "merged-output",#typer.Option("merged-output", help="Directory to output knowledge graph")
merge_delimiter: str = "|",#typer.Option("|", help="Delimiter to use when merging categories and properties on duplicates")
):

print(f"Merging KG files...\nName: {name} // input_dir: {input_dir} // nodes: {nodes} // edges: {edges} // output_dir: {output_dir}")
Expand All @@ -27,6 +28,6 @@ def merge(

write(
name=name,
kg=merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, mapping=mapping_df),
kg=merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, mapping=mapping_df, merge_delimiter=merge_delimiter),
output_dir=output_dir
)
9 changes: 4 additions & 5 deletions cat_merge/merge_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ def get_duplicate_rows(df: DataFrame) -> DataFrame:
return df[df.index.duplicated(keep=False)]


def clean_nodes(nodes: DataFrame) -> DataFrame:
def clean_nodes(nodes: DataFrame, merge_delimiter: str = " ") -> DataFrame:
nodes.reset_index(inplace=True)
nodes.drop_duplicates(inplace=True)
nodes = nodes.rename(columns={'index': 'id'})
nodes.fillna("None", inplace=True)
column_agg = {x: ' '.join for x in nodes.columns if x != 'id'}
column_agg = {x: merge_delimiter.join for x in nodes.columns if x != 'id'}
nodes = nodes.groupby(['id'], as_index=True).agg(column_agg)
return nodes

Expand All @@ -30,8 +30,7 @@ def get_dangling_edges(edges: DataFrame, nodes: DataFrame) -> DataFrame:
return edges[~edges.subject.isin(nodes.index) | ~edges.object.isin(nodes.index)]


def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], mapping: DataFrame) -> MergedKG:

def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], mapping: DataFrame = None, merge_delimiter: str = "|") -> MergedKG:
all_nodes = concat_dataframes(node_dfs)
all_edges = concat_dataframes(edge_dfs)

Expand All @@ -41,7 +40,7 @@ def merge_kg(edge_dfs: List[DataFrame], node_dfs: List[DataFrame], mapping: Data
duplicate_nodes = get_duplicate_rows(df=all_nodes)
dangling_edges = get_dangling_edges(edges=all_edges, nodes=all_nodes)

nodes = clean_nodes(nodes=all_nodes)
nodes = clean_nodes(nodes=all_nodes, merge_delimiter=merge_delimiter)
edges = clean_edges(edges=all_edges, nodes=nodes)

return MergedKG(nodes=nodes, edges=edges, duplicate_nodes=duplicate_nodes, dangling_edges=dangling_edges)

0 comments on commit b1a628c

Please sign in to comment.