Update citations for various datasets (#208)
* updated bib entries

* Fix unicode issues in citations

---------

Co-authored-by: Fahim Imaduddin Dalvi <[email protected]>
firojalam and fdalvi authored Sep 10, 2023
1 parent 07f4bf6 commit aa0436b
Showing 11 changed files with 101 additions and 48 deletions.
22 changes: 15 additions & 7 deletions llmebench/datasets/ARCD.py
@@ -10,12 +10,20 @@ def __init__(self, **kwargs):
def metadata():
return {
"language": "ar",
"citation": """@misc{mozannar2019neural,
title={Neural Arabic Question Answering},
author={Hussein Mozannar and Karl El Hajal and Elie Maamary and Hazem Hajj},
year={2019},
eprint={1906.05394},
archivePrefix={arXiv},
primaryClass={cs.CL}
"citation": """@inproceedings{mozannar-etal-2019-neural,
title = "Neural {A}rabic Question Answering",
author = "Mozannar, Hussein and
Maamary, Elie and
El Hajal, Karl and
Hajj, Hazem",
booktitle = "Proceedings of the Fourth Arabic Natural Language Processing Workshop",
month = aug,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W19-4612",
doi = "10.18653/v1/W19-4612",
pages = "108--118",
abstract = "This paper tackles the problem of open domain factual Arabic question answering (QA) using Wikipedia as our knowledge source. This constrains the answer of any question to be a span of text in Wikipedia. Open domain QA for Arabic entails three challenges: annotated QA datasets in Arabic, large scale efficient information retrieval and machine reading comprehension. To deal with the lack of Arabic QA datasets we present the Arabic Reading Comprehension Dataset (ARCD) composed of 1,395 questions posed by crowdworkers on Wikipedia articles, and a machine translation of the Stanford Question Answering Dataset (Arabic-SQuAD). Our system for open domain question answering in Arabic (SOQAL) is based on two components: (1) a document retriever using a hierarchical TF-IDF approach and (2) a neural reading comprehension model using the pre-trained bi-directional transformer BERT. Our experiments on ARCD indicate the effectiveness of our approach with our BERT-based reader achieving a 61.3 F1 score, and our open domain system SOQAL achieving a 27.6 F1 score.",
}""",
}
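
For context, the metadata() shown in these hunks is consumed as a plain dict, so the new entry can be read back directly. A minimal sketch, assuming the class in llmebench/datasets/ARCD.py is named ARCDDataset (the class name is not shown in this diff):

from llmebench.datasets.ARCD import ARCDDataset  # class name assumed, not shown in the diff

meta = ARCDDataset.metadata()  # metadata() takes no self above, so presumably a static method
print(meta["language"])        # -> "ar"
print(meta["citation"])        # -> the @inproceedings entry added in this commit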
20 changes: 13 additions & 7 deletions llmebench/datasets/Emotion.py
@@ -8,13 +8,19 @@ def __init__(self, **kwargs):
def metadata():
return {
"language": "ar",
"citation": """@misc{hassan2022crosslingual,
title={Cross-lingual Emotion Detection},
author={Sabit Hassan and Shaden Shaar and Kareem Darwish},
year={2022},
eprint={2106.06017},
archivePrefix={arXiv},
primaryClass={cs.CL}
"citation": """@inproceedings{hassan-etal-2022-cross,
title = "Cross-lingual Emotion Detection",
author = "Hassan, Sabit and
Shaar, Shaden and
Darwish, Kareem",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.751",
pages = "6948--6958",
abstract = "Emotion detection can provide us with a window into understanding human behavior. Due to the complex dynamics of human emotions, however, constructing annotated datasets to train automated models can be expensive. Thus, we explore the efficacy of cross-lingual approaches that would use data from a source language to build models for emotion detection in a target language. We compare three approaches, namely: i) using inherently multilingual models; ii) translating training data into the target language; and iii) using an automatically tagged parallel corpus. In our study, we consider English as the source language with Arabic and Spanish as target languages. We study the effectiveness of different classification models such as BERT and SVMs trained with different features. Our BERT-based monolingual models that are trained on target language data surpass state-of-the-art (SOTA) by 4{\\%} and 5{\\%} absolute Jaccard score for Arabic and Spanish respectively. Next, we show that using cross-lingual approaches with English data alone, we can achieve more than 90{\\%} and 80{\\%} relative effectiveness of the Arabic and Spanish BERT models respectively. Lastly, we use LIME to analyze the challenges of training cross-lingual models for different language pairs.",
}""",
}

17 changes: 12 additions & 5 deletions llmebench/datasets/Khouja20Factuality.py
@@ -8,11 +8,18 @@ def __init__(self, **kwargs):
def metadata():
return {
"language": "ar",
"citation": """@article{khouja2020stance,
title={Stance prediction and claim verification: An Arabic perspective},
author={Khouja, Jude},
journal={arXiv preprint arXiv:2005.10410},
year={2020}
"citation": """@inproceedings{khouja-2020-stance,
title = "Stance Prediction and Claim Verification: An {A}rabic Perspective",
author = "Khouja, Jude",
booktitle = "Proceedings of the Third Workshop on Fact Extraction and VERification (FEVER)",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.fever-1.2",
doi = "10.18653/v1/2020.fever-1.2",
pages = "8--17",
abstract = "This work explores the application of textual entailment in news claim verification and stance prediction using a new corpus in Arabic. The publicly available corpus comes in two perspectives: a version consisting of 4,547 true and false claims and a version consisting of 3,786 pairs (claim, evidence). We describe the methodology for creating the corpus and the annotation process. Using the introduced corpus, we also develop two machine learning baselines for two proposed tasks: claim verification and stance prediction. Our best model utilizes pretraining (BERT) and achieves 76.7 F1 on the stance prediction task and 64.3 F1 on the claim verification task. Our preliminary experiments shed some light on the limits of automatic claim verification that relies on claims text only. Results hint that while the linguistic features and world knowledge learned during pretraining are useful for stance prediction, such learned representations from pretraining are insufficient for verifying claims without access to context or evidence.",
}""",
}

17 changes: 12 additions & 5 deletions llmebench/datasets/Khouja20Stance.py
@@ -8,11 +8,18 @@ def __init__(self, **kwargs):
def metadata():
return {
"language": "ar",
"citation": """@article{khouja2020stance,
title={Stance prediction and claim verification: An Arabic perspective},
author={Khouja, Jude},
journal={arXiv preprint arXiv:2005.10410},
year={2020}
"citation": """@inproceedings{khouja-2020-stance,
title = "Stance Prediction and Claim Verification: An {A}rabic Perspective",
author = "Khouja, Jude",
booktitle = "Proceedings of the Third Workshop on Fact Extraction and VERification (FEVER)",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.fever-1.2",
doi = "10.18653/v1/2020.fever-1.2",
pages = "8--17",
abstract = "This work explores the application of textual entailment in news claim verification and stance prediction using a new corpus in Arabic. The publicly available corpus comes in two perspectives: a version consisting of 4,547 true and false claims and a version consisting of 3,786 pairs (claim, evidence). We describe the methodology for creating the corpus and the annotation process. Using the introduced corpus, we also develop two machine learning baselines for two proposed tasks: claim verification and stance prediction. Our best model utilizes pretraining (BERT) and achieves 76.7 F1 on the stance prediction task and 64.3 F1 on the claim verification task. Our preliminary experiments shed some light on the limits of automatic claim verification that relies on claims text only. Results hint that while the linguistic features and world knowledge learned during pretraining are useful for stance prediction, such learned representations from pretraining are insufficient for verifying claims without access to context or evidence.",
}""",
}

2 changes: 1 addition & 1 deletion llmebench/datasets/Location.py
@@ -9,7 +9,7 @@ def metadata():
return {
"language": "ar",
"citation": """@inproceedings{mubarak2021ul2c,
-title={UL2C: Mapping user locations to countries on Arabic Twitter},
+title={{UL2C}: Mapping user locations to countries on Arabic Twitter},
author={Mubarak, Hamdy and Hassan, Sabit},
booktitle={Proceedings of the Sixth Arabic Natural Language Processing Workshop},
pages={145--153},
21 changes: 16 additions & 5 deletions llmebench/datasets/MLQA.py
@@ -10,10 +10,21 @@ def __init__(self, **kwargs):
def metadata():
return {
"language": "ar",
"citation": """@article{lewis2019mlqa,
title=MLQA: Evaluating Cross-lingual Extractive Question Answering,
author={Lewis, Patrick and Ouguz, Barlas and Rinott, Ruty and Riedel, Sebastian and Schwenk, Holger},
journal={arXiv preprint arXiv:1910.07475},
year={2019}
"citation": """@inproceedings{lewis-etal-2020-mlqa,
title = "{MLQA}: Evaluating Cross-lingual Extractive Question Answering",
author = "Lewis, Patrick and
Oguz, Barlas and
Rinott, Ruty and
Riedel, Sebastian and
Schwenk, Holger",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.acl-main.653",
doi = "10.18653/v1/2020.acl-main.653",
pages = "7315--7330",
abstract = "Question answering (QA) models have shown rapid progress enabled by the availability of large, high-quality benchmark datasets. Such annotated datasets are difficult and costly to collect, and rarely exist in languages other than English, making building QA systems that work well in other languages challenging. In order to develop such systems, it is crucial to invest in high quality multilingual evaluation benchmarks to measure progress. We present MLQA, a multi-way aligned extractive QA evaluation benchmark intended to spur research in this area. MLQA contains QA instances in 7 languages, English, Arabic, German, Spanish, Hindi, Vietnamese and Simplified Chinese. MLQA has over 12K instances in English and 5K in each other language, with each instance parallel between 4 languages on average. We evaluate state-of-the-art cross-lingual models and machine-translation-based baselines on MLQA. In all cases, transfer results are shown to be significantly behind training-language performance.",
}""",
}
3 changes: 1 addition & 2 deletions llmebench/datasets/NameInfo.py
@@ -8,8 +8,7 @@ def __init__(self, **kwargs):
def metadata():
return {
"language": "ar",
"citation": """@inproceedings{Under review...}
}""",
"citation": """@inproceedings{Under review...}""",
}

def get_data_sample(self):
29 changes: 23 additions & 6 deletions llmebench/datasets/OSACT4SubtaskA.py
@@ -8,12 +8,29 @@ def __init__(self, **kwargs):
def metadata():
return {
"language": "ar",
"citation": """@article{zampieri2020semeval,
title={SemEval-2020 task 12: Multilingual offensive language identification in social media (OffensEval 2020)},
author={Zampieri, Marcos and Nakov, Preslav and Rosenthal, Sara and Atanasova, Pepa and Karadzhov, Georgi and Mubarak, Hamdy and Derczynski, Leon and Pitenis, ...},
journal={arXiv preprint arXiv:2006.07235},
year={2020}
}""",
"citation": """
@inproceedings{zampieri-etal-2020-semeval,
title = "{S}em{E}val-2020 Task 12: Multilingual Offensive Language Identification in Social Media ({O}ffens{E}val 2020)",
author = {Zampieri, Marcos and
Nakov, Preslav and
Rosenthal, Sara and
Atanasova, Pepa and
Karadzhov, Georgi and
Mubarak, Hamdy and
Derczynski, Leon and
Pitenis, Zeses and
{\\c{C}}{\\"o}ltekin, {\\c{C}}a{\\u{g}}r{\\i}},
booktitle = "Proceedings of the Fourteenth Workshop on Semantic Evaluation",
month = dec,
year = "2020",
address = "Barcelona (online)",
publisher = "International Committee for Computational Linguistics",
url = "https://aclanthology.org/2020.semeval-1.188",
doi = "10.18653/v1/2020.semeval-1.188",
pages = "1425--1447",
abstract = "We present the results and the main findings of SemEval-2020 Task 12 on Multilingual Offensive Language Identification in Social Media (OffensEval-2020). The task included three subtasks corresponding to the hierarchical taxonomy of the OLID schema from OffensEval-2019, and it was offered in five languages: Arabic, Danish, English, Greek, and Turkish. OffensEval-2020 was one of the most popular tasks at SemEval-2020, attracting a large number of participants across all subtasks and languages: a total of 528 teams signed up to participate in the task, 145 teams submitted official runs on the test data, and 70 teams submitted system description papers.",
}
""",
}

def get_data_sample(self):
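
A note on the doubled backslashes in the author field above, which is presumably what the "Fix unicode issues in citations" bullet in the commit message refers to: the citation lives in a regular (non-raw) Python string, so each BibTeX accent command needs its backslash escaped once more. A minimal sketch:

# In a non-raw Python string, "\\c" stores the two characters \c, which is
# exactly what BibTeX needs to typeset a cedilla, umlaut, or breve.
name = """{\\c{C}}{\\"o}ltekin, {\\c{C}}a{\\u{g}}r{\\i}"""  # as written in the source
print(name)  # the string BibTeX sees: {\c{C}}{\"o}ltekin, {\c{C}}a{\u{g}}r{\i}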
2 changes: 1 addition & 1 deletion llmebench/datasets/QADI.py
@@ -9,7 +9,7 @@ def metadata():
return {
"language": "ar",
"citation": """@inproceedings{abdelali2021qadi,
-title={QADI: Arabic dialect identification in the wild},
+title={{QADI}: Arabic dialect identification in the wild},
author={Abdelali, Ahmed and Mubarak, Hamdy and Samih, Younes and Hassan, Sabit and Darwish, Kareem},
booktitle={Proceedings of the Sixth Arabic Natural Language Processing Workshop},
pages={1--10},
2 changes: 1 addition & 1 deletion llmebench/datasets/WANLP22T3Propaganda.py
@@ -15,7 +15,7 @@ def metadata():
return {
"language": "ar",
"citation": """@inproceedings{alam2022overview,
-title={Overview of the $\\{$WANLP$\\}$ 2022 Shared Task on Propaganda Detection in $\\{$A$\\}$ rabic},
+title={Overview of the $\\{$WANLP$\\}$ 2022 Shared Task on Propaganda Detection in $\\{$A$\\}$rabic},
author={Alam, Firoj and Mubarak, Hamdy and Zaghouani, Wajdi and Da San Martino, Giovanni and Nakov, Preslav and others},
booktitle={Proceedings of the The Seventh Arabic Natural Language Processing Workshop (WANLP)},
pages={108--118},
14 changes: 6 additions & 8 deletions llmebench/datasets/XQuAD.py
@@ -10,13 +10,11 @@ def __init__(self, **kwargs):
def metadata():
return {
"language": "ar",
"citation": """@article{Artetxe:etal:2019,
author={Mikel Artetxe and Sebastian Ruder and Dani Yogatama},
title={On the cross-lingual transferability of monolingual representations},
journal={CoRR},
volume={abs/1910.11856},
year={2019},
archivePrefix={arXiv},
eprint={1910.11856}
"citation": """@inproceedings{artetxe2020cross,
title={On the Cross-lingual Transferability of Monolingual Representations},
author={Artetxe, Mikel and Ruder, Sebastian and Yogatama, Dani},
booktitle={Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
pages={4623--4637},
year={2020}
}""",
}
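
A quick way to sanity-check entries like these is to round-trip them through a BibTeX parser. A minimal sketch, assuming the bibtexparser 1.x package is available (it is not a dependency shown in this commit); the sample entry is abbreviated from the XQuAD citation above:

import bibtexparser

sample = """@inproceedings{artetxe2020cross,
    title={On the Cross-lingual Transferability of Monolingual Representations},
    author={Artetxe, Mikel and Ruder, Sebastian and Yogatama, Dani},
    year={2020}
}"""

db = bibtexparser.loads(sample)               # parse the string into a BibDatabase
entry = db.entries[0]                         # fields come back as a plain dict
assert entry["ID"] == "artetxe2020cross"      # citation key
assert entry["ENTRYTYPE"] == "inproceedings"  # entry type
print(entry["title"])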
