From bc1837eb7576ad443af03c7a81649fcbbfd1da3d Mon Sep 17 00:00:00 2001 From: Felix Dittrich Date: Wed, 4 Dec 2024 14:00:41 +0100 Subject: [PATCH] [Bug] Fix vocabs and add corresponding test case (#1813) changes with vocab and documentation dec 10 --- docs/source/modules/datasets.rst | 10 +++++----- doctr/datasets/vocabs.py | 8 ++++---- tests/common/test_datasets_vocabs.py | 11 +++++++++++ 3 files changed, 20 insertions(+), 9 deletions(-) create mode 100644 tests/common/test_datasets_vocabs.py diff --git a/docs/source/modules/datasets.rst b/docs/source/modules/datasets.rst index 2104012d28..4a73772c0e 100644 --- a/docs/source/modules/datasets.rst +++ b/docs/source/modules/datasets.rst @@ -169,19 +169,19 @@ of vocabs. - 115 - абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ0123456789!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ґіїєҐІЇЄ₴ * - vietnamese - - 236 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ + - 234 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựíìỉĩịýỳỷỹỵÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰÍÌỈĨỊÝỲỶỸỴ * - hebrew - 123 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿אבגדהוזחטיכלמנסעפצקרשת₪ * - hindi - - 71 - - अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह०१२३४५६७८९।,?!:्ॐ॰॥॰ + - 68 + - अआइईउऊऋॠऌॡएऐओऔंःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह०१२३४५६७८९।,?!:्ॐ॰॥ * - bangla - 70 - অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ০১২৩৪৫৬৭৮৯ * - gujarati - - 106 + - 104 - અઆઇઈઉઊઋએઐઓઔઅંઅઃકખગઘચછજઝઞટઠડઢણતથદધનપફબભમયરલવશષસહળક્ષજ્ઞ૦૧૨૩૪૫૬૭૮૯૰ઽ◌ંઃ॥ૐ઼ઁ!"#$%&'()*+,-./:;<=>?@[\]^_{|}~ * - multilingual - 195 diff --git a/doctr/datasets/vocabs.py b/doctr/datasets/vocabs.py index 91c5b215d3..29716804a8 100644 --- a/doctr/datasets/vocabs.py +++ b/doctr/datasets/vocabs.py @@ -19,9 +19,9 @@ "arabic_digits": "٠١٢٣٤٥٦٧٨٩", "arabic_diacritics": "ًٌٍَُِّْ", "arabic_punctuation": "؟؛«»—", - "hindi_letters": "अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह", + "hindi_letters": "अआइईउऊऋॠऌॡएऐओऔंःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह", "hindi_digits": "०१२३४५६७८९", - "hindi_punctuation": "।,?!:्ॐ॰॥॰", + "hindi_punctuation": "।,?!:्ॐ॰॥", "gujarati_vowels": "અઆઇઈઉઊઋએઐઓઔઅંઅઃ", "gujarati_consonants":"કખગઘચછજઝઞટઠડઢણતથદધનપફબભમયરલવશષસહળક્ષજ્ઞ", "gujarati_digits":"૦૧૨૩૪૫૬૭૮૯", @@ -57,8 +57,8 @@ VOCABS["swedish"] = VOCABS["english"] + "åäöÅÄÖ" VOCABS["vietnamese"] = ( VOCABS["english"] - + "áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵ" - + "ÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ" + + "áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựíìỉĩịýỳỷỹỵ" + + "ÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰÍÌỈĨỊÝỲỶỸỴ" ) VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪" VOCABS["hindi"] = VOCABS["hindi_letters"] + VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"] diff --git a/tests/common/test_datasets_vocabs.py b/tests/common/test_datasets_vocabs.py new file mode 100644 index 0000000000..cd84bf7ac7 --- /dev/null +++ b/tests/common/test_datasets_vocabs.py @@ -0,0 +1,11 @@ +from collections import Counter + +from doctr.datasets import VOCABS + + +def test_vocabs_duplicates(): + for key, vocab in VOCABS.items(): + assert isinstance(vocab, str) + + duplicates = [char for char, count in Counter(vocab).items() if count > 1] + assert not duplicates, f"Duplicate characters in {key} vocab: {duplicates}"