Skip to content

Commit

Permalink
Add support for special paths in splits discovery (#236)
Browse files Browse the repository at this point in the history
This commit adds support for split paths prefixed with `:data_dir:`, which are resolved relative to the base data directory rather than a dataset's specific data directory.

* Fix missing default data_dir during splits discovery

* Add documentation for :data_dir: paths
  • Loading branch information
fdalvi authored Sep 18, 2023
1 parent e7d52db commit 3a329a1
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 7 deletions.
7 changes: 7 additions & 0 deletions docs/tutorials/adding_asset.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ def config():
# }
# }
# }
#
# If the Dataset you are using does not provide the splits you need by default,
# "custom_test_split" and "custom_train_split" can be used instead. These are
# usually strings, but their structure is dictated by the data loader of the
# specific dataset. The framework supports absolute paths, relative paths
# (resolved relative to `data_dir/*Dataset/`), and special paths prefixed with
# `:data_dir:`, which are resolved relative to `data_dir`.

def prompt(input_sample):
# This function receives an input_sample and pre-processes it into the
Expand Down
2 changes: 1 addition & 1 deletion llmebench/datasets/TyDiQA.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def metadata():
"license": "Apache License Version 2.0",
"splits": {
"dev": "tydiqa-goldp-dev-arabic.json",
"train": ":depends:ARCD/arcd-train.json",
"train": ":data_dir:ARCD/arcd-train.json",
},
"task_type": TaskType.QuestionAnswering,
}
2 changes: 1 addition & 1 deletion llmebench/datasets/UnifiedFCFactuality.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def metadata():
"license": "Research Purpose Only",
"splits": {
"test": "ramy_arabic_fact_checking.tsv",
"train": ":depends:ANSStance/claim/train.csv",
"train": ":data_dir:ANSStance/claim/train.csv",
},
"task_type": TaskType.Classification,
"class_labels": ["true", "false"],
Expand Down
2 changes: 1 addition & 1 deletion llmebench/datasets/UnifiedFCStance.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def metadata():
"license": "Research Purpose Only",
"splits": {
"test": "ramy_arabic_stance.jsonl",
"train": ":depends:ANSStance/stance/train.csv",
"train": ":data_dir:ANSStance/stance/train.csv",
},
"task_type": TaskType.Classification,
"class_labels": ["agree", "disagree", "discuss", "unrelated"],
Expand Down
2 changes: 1 addition & 1 deletion llmebench/datasets/XQuAD.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def metadata():
"license": "CC-BY-SA4.0",
"splits": {
"test": "xquad.ar.json",
"train": ":depends:ARCD/arcd-train.json",
"train": ":data_dir:ARCD/arcd-train.json",
},
"task_type": TaskType.QuestionAnswering,
}
7 changes: 4 additions & 3 deletions llmebench/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def get_data_paths(config, split):
assert split in ["train", "test"]

dataset_args = config.get("dataset_args", {})
dataset_args["data_dir"] = ""
dataset = config["dataset"](**dataset_args)

if split == "test":
Expand Down Expand Up @@ -129,10 +130,10 @@ def resolve_path(path, dataset, data_dir):
if not isinstance(data_dir, Path):
data_dir = Path(data_dir)

if not str(path).startswith(":depends:") and path.is_absolute():
if not str(path).startswith(":data_dir:") and path.is_absolute():
return path
elif str(path).startswith(":depends:"):
return data_dir / str(path)[len(":depends:") :]
elif str(path).startswith(":data_dir:"):
return data_dir / str(path)[len(":data_dir:") :]
else:
dataset_name = dataset.__class__.__name__
if dataset_name.endswith("Dataset"):
Expand Down

0 comments on commit 3a329a1

Please sign in to comment.