Skip to content

Commit

Permalink
Merge pull request #57 from monarch-initiative/read-df-comment-char
Browse files: browse the repository at this point in the history
Use comment characters when reading SSSOM files but not for reading kgx files
  • Loading branch information
kevinschaper authored Aug 21, 2023
2 parents c1f7685 + 31897f1 commit 3e5163c
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 7 deletions.
12 changes: 8 additions & 4 deletions cat_merge/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ def get_files(filepath: str, nodes_match: str = "_nodes", edges_match: str = "_e
return node_files, edge_files


def read_dfs(files: List[str], add_source_col: Optional[str] = "provided_by") -> List[pd.DataFrame]:
def read_dfs(files: List[str],
add_source_col: Optional[str] = "provided_by",
comment_character: str = None) -> List[pd.DataFrame]:
"""
Read a list of files into dataframes.
Expand All @@ -43,7 +45,7 @@ def read_dfs(files: List[str], add_source_col: Optional[str] = "provided_by") ->
"""
dataframes = []
for file in files:
dataframes.append(read_df(file, add_source_col, Path(file).stem))
dataframes.append(read_df(file, add_source_col, Path(file).stem, comment_character=comment_character))
return dataframes


Expand All @@ -68,19 +70,21 @@ def read_tar_dfs(tar: tarfile.TarFile, type_name, add_source_col: str = "provide

def read_df(fh: Union[str, IO[bytes]],
add_source_col: Optional[str] = "provided_by",
source_col_value: Optional[str] = None) -> pd.DataFrame:
source_col_value: Optional[str] = None,
comment_character: str = None) -> pd.DataFrame:
"""
Read a file into a dataframe.
Args:
fh (str or IO[bytes]): Path to the file, or an open binary file handle.
add_source_col (str, optional): Name of column to add to the dataframe with the name of the file.
source_col_value (Any, optional): Value to add to the source column.
comment_character (str, optional): Comment character; lines starting with it, and any text following it on a line, are not parsed. Defaults to None (no comment handling).
Returns:
pandas.DataFrame: Dataframe.
"""
df = pd.read_csv(fh, sep="\t", dtype="string", lineterminator="\n", quoting=csv.QUOTE_NONE, comment='#')
df = pd.read_csv(fh, sep="\t", dtype="string", lineterminator="\n", quoting=csv.QUOTE_NONE, comment=comment_character)
if add_source_col is not None:
df[add_source_col] = source_col_value
return df
Expand Down
4 changes: 2 additions & 2 deletions cat_merge/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def merge(
mappings: {mappings}
output_dir: {output_dir}
""")
if nodes is None != (edges is None) or source is None == (nodes is None):
if source is None and (nodes is None or edges is None):
raise ValueError("Wrong attributes: must specify both nodes & edges or source")

if source is not None and (nodes or edges):
Expand All @@ -72,7 +72,7 @@ def merge(

mapping_dfs = []
if mappings is not None:
mapping_dfs = read_dfs(mappings, add_source_col=None)
mapping_dfs = read_dfs(mappings, add_source_col=None, comment_character="#")

print("Merging...")
kg, qc = merge_kg(node_dfs=node_dfs, edge_dfs=edge_dfs, mapping_dfs=mapping_dfs)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "cat-merge"
version = "0.1.18"
version = "0.1.19"
description = ""
authors = [
"Monarch Initiative <[email protected]>",
Expand Down

0 comments on commit 3e5163c

Please sign in to comment.