From 5b20d8bb9544ebe50ebe3d30babd85223d27d0bb Mon Sep 17 00:00:00 2001
From: Philipp Schmid
Date: Fri, 11 Aug 2023 12:37:09 +0000
Subject: [PATCH 01/16] ppl

---
 easyllm/data/__init__.py          |   0
 easyllm/data/filters/__init__.py  |   1 +
 easyllm/data/filters/kenlm_ppl.py | 201 ++++++++++++++++++++++++++++++
 notebooks/data-filter.ipynb       |  91 ++++++++++++++
 pyproject.toml                    |   1 +
 5 files changed, 294 insertions(+)
 create mode 100644 easyllm/data/__init__.py
 create mode 100644 easyllm/data/filters/__init__.py
 create mode 100644 easyllm/data/filters/kenlm_ppl.py
 create mode 100644 notebooks/data-filter.ipynb

diff --git a/easyllm/data/__init__.py b/easyllm/data/__init__.py
new file mode 100644
index 0000000..e69de29

diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py
new file mode 100644
index 0000000..f411be4
--- /dev/null
+++ b/easyllm/data/filters/__init__.py
@@ -0,0 +1 @@
+from easyllm.data.filters.kenlm_ppl import PerplexityFilter

diff --git a/easyllm/data/filters/kenlm_ppl.py b/easyllm/data/filters/kenlm_ppl.py
new file mode 100644
index 0000000..fbfa362
--- /dev/null
+++ b/easyllm/data/filters/kenlm_ppl.py
@@ -0,0 +1,201 @@
+
+import os
+import re
+import unicodedata
+from typing import Dict
+import importlib.util
+from pydantic import BaseModel, ConfigDict
+from huggingface_hub import hf_hub_download
+
+_kenlm = importlib.util.find_spec("kenlm") is not None
+_sentencepiece = importlib.util.find_spec("sentencepiece") is not None
+
+if _kenlm and _sentencepiece:
+    import kenlm
+    import sentencepiece
+
+
+
+class SentencePiece:
+    def __init__(
+        self,
+        model: str,
+    ):
+        super().__init__()
+        self.sp = sentencepiece.SentencePieceProcessor()
+        self.sp.load(str(model))
+
+    def do(self, text: str) -> str:
+        tokenized = self.sp.encode_as_pieces(text)
+        return " ".join(tokenized)
+
+
+class KenlmModel:
+    digit_re: re.Pattern[str] = re.compile(r"\d")
+    unicode_punct: Dict[str, str] = {
+        "，": ",",
+        "。": ".",
+        "、": ",",
+        "„": '"',
+        "”": '"',
+        "“": '"',
+        "«": '"',
+        "»": '"',
+        "１": '"',
+        "」": '"',
+        "「": '"',
+        "《": '"',
+        "》": '"',
+        "´": "'",
+        "∶": ":",
+        "：": ":",
+        "？": "?",
+        "！": "!",
+        "（": "(",
+        "）": ")",
+        "；": ";",
+        "–": "-",
+        "—": " - ",
+        "．": ". ",
+        "～": "~",
+        "’": "'",
+        "…": "...",
+        "━": "-",
+        "〈": "<",
+        "〉": ">",
+        "【": "[",
+        "】": "]",
+        "％": "%",
+        "►": "-",
+    }
+    unicode_punct_re:re.Pattern = re.compile(f"[{''.join(unicode_punct.keys())}]")
+    non_printing_chars_re:re.Pattern = re.compile(
+        f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
+    )
+    model: kenlm.Model = None
+    tokenizer: SentencePiece = None
+    accent: bool = False
+    case: bool = False
+    numbers: bool = True
+    punct: int = 1
+
+    def __init__(
+        self,
+        model_path: str,
+        tokenizer_path: str,
+        lower_case: bool = False,
+        remove_accents: bool = False,
+        normalize_numbers: bool = True,
+        punctuation: int = 1,
+    ):
+        self.model = kenlm.Model(model_path)
+        self.tokenizer = SentencePiece(tokenizer_path)
+        self.accent = remove_accents
+        self.case = lower_case
+        self.numbers = normalize_numbers
+        self.punct = punctuation
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        language_or_path: str,
+    ):
+        try:
+            model = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.arpa.bin")
+            tokenizer = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.sp.model")
+        except:
+            raise ValueError(f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. Please train your own model and upload it to the hub.")
+
+
+        return cls(
+            model,
+            tokenizer,
+            False,
+            False,
+            True,
+            1,
+        )
+
+    def pp(self, log_score, length):
+        return 10.0 ** (-log_score / length)
+
+    def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
+        if normalize_cc_net:
+            doc = self.normalize(
+                doc,
+                accent=self.accent,
+                case=self.case,
+                numbers=self.numbers,
+                punct=self.punct,
+            )
+        # Tokenize (after normalizing): See https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/mine.py#L352 for full pipeline
+        doc = self.tokenizer.do(doc)
+        doc_log_score, doc_length = 0, 0
+        for line in doc.split("\n"):
+            log_score = self.model.score(line)
+            length = len(line.split()) + 1
+            doc_log_score += log_score
+            doc_length += length
+        return round(self.pp(doc_log_score, doc_length), 1)
+
+    def normalize(
+        self,
+        line: str,
+        accent: bool = True,
+        case: bool = True,
+        numbers: bool = True,
+        punct: int = 1,
+    ) -> str:
+        line = line.strip()
+        if not line:
+            return line
+        if case:
+            line = line.lower()
+        if accent:
+            line = self.strip_accents(line)
+        if numbers:
+            line = self.digit_re.sub("0", line)
+        if punct == 1:
+            line = self.replace_unicode_punct(line)
+        elif punct == 2:
+            line = self.remove_unicode_punct(line)
+        line = self.remove_non_printing_char(line)
+        return line
+
+    def strip_accents(self, line: str) -> str:
+        """Strips accents from a piece of text."""
+        nfd = unicodedata.normalize("NFD", line)
+        output = [c for c in nfd if unicodedata.category(c) != "Mn"]
+        if len(output) == len(line):
+            return line
+        return "".join(output)
+
+    def replace_unicode_punct(self, text: str) -> str:
+        return "".join(self.unicode_punct.get(c, c) for c in text)
+
+    def remove_unicode_punct(self, text: str) -> str:
+        """More aggressive version of replace_unicode_punct but also faster."""
+        return self.unicode_punct_re.sub("", text)
+
+    def remove_non_printing_char(self, text: str) -> str:
+        return self.non_printing_chars_re.sub("", text)
+
+
+class PerplexityFilter(BaseModel):
+    model: KenlmModel = None
+    min_threshold: int = 0
+    max_threshold: int = 1000
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def __init__(self,language:str,min_threshold:int=0,max_threshold:int=1000):
+        super().__init__()
+        self.min_threshold = min_threshold
+        self.max_threshold = max_threshold
+        self.model = KenlmModel.from_pretrained(language)
+
+
+    def __call__(self, doc: str) -> bool:
+        # returns True if the perplexity of the document outside of the threshold,
+        # meaning smaller than min_threshold or larger than max_threshold
+        return not self.min_threshold <= self.model.get_perplexity(doc) <= self.max_threshold
+    
\ No newline at end of file
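
Note: `kenlm.Model.score` returns a base-10 log probability for a whole line, so the `pp` helper above turns a summed log score back into a document-level perplexity. A worked example of the formula, with illustrative numbers only:

    # perplexity = 10 ** (-log_score / length); lower means "more Wikipedia-like"
    doc_log_score, doc_length = -42.0, 12  # summed log10 probability, token count
    perplexity = 10.0 ** (-doc_log_score / doc_length)
    print(round(perplexity, 1))  # 3162.3 -> outside the default 0..1000 band, so filtered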
diff --git a/notebooks/data-filter.ipynb b/notebooks/data-filter.ipynb
new file mode 100644
index 0000000..75e3b5f
--- /dev/null
+++ b/notebooks/data-filter.ipynb
@@ -0,0 +1,91 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# How to use EasyLLM Quality data filters\n",
+    "\n",
+    "EasyLLM's `data` package adds quality filters for preprocessing text data for improved pretraining. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install \"easyllm[data]\" --upgrade"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Perplexity filtering\n",
+    "\n",
+    "Perplexity filtering can be used to improve model quality, coherence, and training efficiency by removing confusing text segments and focusing model learning on more standard, comprehensible language.\n",
+    "Perplexity filtering is implemented using `KenLM` models trained on Wikipedia. You just need to provide your language id, e.g. `de`, and your perplexity `min_threshold` and `max_threshold`; the filter returns `True` if the perplexity of the text is outside of the thresholds and `False` otherwise.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "341.3\n",
+      "46793.5\n"
+     ]
+    }
+   ],
+   "source": [
+    "from easyllm.data.filters import PerplexityFilter\n",
+    "\n",
+    "ppl = PerplexityFilter(\"en\",min_threshold=10,max_threshold=1000)\n",
+    "\n",
+    "# Get perplexity\n",
+    "print(ppl.model.get_perplexity(\"I am very perplexed\"))\n",
+    "# 341.3 (low perplexity, since sentence style is formal and with no grammar mistakes)\n",
+    "\n",
+    "print(ppl.model.get_perplexity(\"im hella trippin\"))\n",
+    "# 46793.5 (high perplexity, since the sentence is colloquial and contains grammar mistakes)\n",
+    "\n",
+    "# testing the filter\n",
+    "assert ppl(\"I am very perplexed\") == False\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "dev",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

diff --git a/pyproject.toml b/pyproject.toml
index e5da485..4bc1c5b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ scripts = { easyllm = "easyllm.cli:main" }
 dependencies = ["pydantic==2.1.1", "nanoid==2.0.0", "huggingface-hub==0.16.4"]
 
 [project.optional-dependencies]
+data = ["datasets","https://github.com/kpu/kenlm/archive/master.zip","sentencepiece"]
 test = ["pytest", "ruff", "black", "isort", "mypy", "hatch"]
 dev = ["ruff", "black", "isort", "mypy", "hatch"]
 docs = [
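
Note: the `data` extra added above also pulls in `datasets`, so a typical way to apply the new filter at scale is through `datasets`' `filter` method. A minimal sketch — the dataset and column names are placeholders, not part of this patch:

    from datasets import load_dataset
    from easyllm.data.filters import PerplexityFilter

    ppl = PerplexityFilter("en", min_threshold=10, max_threshold=1000)
    ds = load_dataset("text", data_files="corpus.txt", split="train")
    # a filter returns True when a document should be dropped, so negate it to keep
    ds_clean = ds.filter(lambda example: not ppl(example["text"]))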
From aba974a95c4d30a39f6446a71ea9c118912342e9 Mon Sep 17 00:00:00 2001
From: Philipp Schmid
Date: Fri, 11 Aug 2023 12:58:27 +0000
Subject: [PATCH 02/16] non alpha numeric filter

---
 easyllm/data/filters/__init__.py          |  1 +
 easyllm/data/filters/kenlm_ppl.py         | 16 ++++++++--------
 easyllm/data/filters/non_alpha_numeric.py | 27 ++++++++++++++++++++++
 notebooks/data-filter.ipynb               | 39 +++++++++++++++++++++++
 4 files changed, 75 insertions(+), 8 deletions(-)
 create mode 100644 easyllm/data/filters/non_alpha_numeric.py

diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py
index f411be4..f589d8c 100644
--- a/easyllm/data/filters/__init__.py
+++ b/easyllm/data/filters/__init__.py
@@ -1 +1,2 @@
 from easyllm.data.filters.kenlm_ppl import PerplexityFilter
+from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter

diff --git a/easyllm/data/filters/kenlm_ppl.py b/easyllm/data/filters/kenlm_ppl.py
index fbfa362..55b56be 100644
--- a/easyllm/data/filters/kenlm_ppl.py
+++ b/easyllm/data/filters/kenlm_ppl.py
@@ -1,11 +1,11 @@
-
-import os
+import importlib.util
 import re
 import unicodedata
 from typing import Dict
-import importlib.util
-from pydantic import BaseModel, ConfigDict
+
 from huggingface_hub import hf_hub_download
+from pydantic import BaseModel, ConfigDict
 
 _kenlm = importlib.util.find_spec("kenlm") is not None
 _sentencepiece = importlib.util.find_spec("sentencepiece") is not None
@@ -99,14 +99,14 @@ def __init__(
     def from_pretrained(
         cls,
         language_or_path: str,
-    ): 
-        try: 
+    ):
+        try:
             model = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.arpa.bin")
             tokenizer = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.sp.model")
         except:
             raise ValueError(f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. Please train your own model and upload it to the hub.")
-        
-        
+
+
         return cls(
             model,
             tokenizer,

diff --git a/easyllm/data/filters/non_alpha_numeric.py b/easyllm/data/filters/non_alpha_numeric.py
new file mode 100644
index 0000000..194889d
--- /dev/null
+++ b/easyllm/data/filters/non_alpha_numeric.py
@@ -0,0 +1,27 @@
+import re
+
+from pydantic import BaseModel
+
+
+class NonAlphaNumericFilter(BaseModel):
+    """
+    Ref: Gopher (Rae et al., 2021)
+    Desc: If more than 20% of the document is non-alphanumeric then remove
+    """
+
+    name: str = "non_alpha_numeric"
+    regex: re.Pattern = re.compile("[^a-zA-Z0-9\s]")
+    cutoff_percentage: float = 0.2
+
+    def __call__(self, text):
+        num_characters = len(text)
+        # check if there are any characters in the text
+        if num_characters == 0:
+            return True
+        # calculate the percentage of non-alphanumeric characters
+        percentage = 1 - ((num_characters - len(self.regex.findall(text))) / num_characters)
+        # if the percentage is greater than the cutoff_percentage then remove
+        if percentage > self.cutoff_percentage:
+            return True
+        # otherwise keep
+        return False
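
Note: for intuition on the ratio computed above: in the notebook example below, `"This is a test!!!!!!!"` has 21 characters, 7 of which match `[^a-zA-Z0-9\s]`, so the non-alphanumeric share is 1 - ((21 - 7) / 21) ≈ 0.33. That is above the 0.2 cutoff, so the document is removed, while `"This is a test"` scores 0.0 and is kept.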
diff --git a/notebooks/data-filter.ipynb b/notebooks/data-filter.ipynb
index 75e3b5f..8a35c72 100644
--- a/notebooks/data-filter.ipynb
+++ b/notebooks/data-filter.ipynb
@@ -58,6 +58,45 @@
     "assert ppl(\"I am very perplexed\") == False\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## NonAlphaNumericFilter\n",
+    "\n",
+    "The `NonAlphaNumericFilter` removes documents based on the number of non-alphanumeric characters in the document. Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf), if the document has more than 20% non-alphanumeric characters, it is removed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "14\n",
+      "0\n",
+      "0.0\n",
+      "21\n",
+      "7\n",
+      "0.33333333333333337\n"
+     ]
+    }
+   ],
+   "source": [
+    "from easyllm.data.filters import NonAlphaNumericFilter\n",
+    "\n",
+    "nam = NonAlphaNumericFilter()\n",
+    "\n",
+    "# not filtered\n",
+    "assert nam(\"This is a test\") == False\n",
+    "\n",
+    "# filtered\n",
+    "assert nam(\"This is a test!!!!!!!\") == True\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
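
Note: every filter in this series follows the same convention — calling the filter returns True when the document should be dropped. That makes them straightforward to compose; a sketch:

    from easyllm.data.filters import NonAlphaNumericFilter, PerplexityFilter

    # run the cheap character-level check before the more expensive perplexity model
    filters = [NonAlphaNumericFilter(), PerplexityFilter("en", min_threshold=10, max_threshold=1000)]

    def keep(doc: str) -> bool:
        # keep a document only if no filter flags it for removal
        return not any(f(doc) for f in filters)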
From e9c922ed618d32381fc7e9ea2066723a009e2ba9 Mon Sep 17 00:00:00 2001
From: Philipp Schmid
Date: Fri, 11 Aug 2023 13:33:45 +0000
Subject: [PATCH 03/16] more filters

---
 easyllm/data/filters/__init__.py           |  2 +
 easyllm/data/filters/digit_to_character.py | 22 +++++++
 easyllm/data/filters/non_alpha_numeric.py  |  6 +-
 easyllm/data/filters/words_to_symbol.py    | 33 ++++++++++
 notebooks/data-filter.ipynb                | 74 ++++++++++++++++++----
 5 files changed, 121 insertions(+), 16 deletions(-)
 create mode 100644 easyllm/data/filters/digit_to_character.py
 create mode 100644 easyllm/data/filters/words_to_symbol.py

diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py
index f589d8c..7156fc4 100644
--- a/easyllm/data/filters/__init__.py
+++ b/easyllm/data/filters/__init__.py
@@ -1,2 +1,4 @@
 from easyllm.data.filters.kenlm_ppl import PerplexityFilter
 from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter
+from easyllm.data.filters.digit_to_character import DigitToCharacter
+from easyllm.data.filters.words_to_symbol import SymbolToWordFilter

diff --git a/easyllm/data/filters/digit_to_character.py b/easyllm/data/filters/digit_to_character.py
new file mode 100644
index 0000000..7916afc
--- /dev/null
+++ b/easyllm/data/filters/digit_to_character.py
@@ -0,0 +1,22 @@
+import re
+
+from pydantic import BaseModel
+
+
+class DigitToCharacter(BaseModel):
+    """
+    Desc: If more than 20% of the document are digits then remove
+    """
+
+    name: str = "digit_to_character"
+    remove_percentage: float = 0.2
+
+    def __call__(self, text):
+        digits = re.findall(r"\d", text)
+        num_digits = len(digits)
+        total_chars = len(text)
+        # remove empty documents and documents with too many digits
+        if not total_chars or num_digits / total_chars > self.remove_percentage:
+            return True
+        # otherwise keep
+        return False

diff --git a/easyllm/data/filters/non_alpha_numeric.py b/easyllm/data/filters/non_alpha_numeric.py
index 194889d..8f0ba8b 100644
--- a/easyllm/data/filters/non_alpha_numeric.py
+++ b/easyllm/data/filters/non_alpha_numeric.py
@@ -11,7 +11,7 @@ class NonAlphaNumericFilter(BaseModel):
 
     name: str = "non_alpha_numeric"
     regex: re.Pattern = re.compile("[^a-zA-Z0-9\s]")
-    cutoff_percentage: float = 0.2
+    remove_percentage: float = 0.2
 
     def __call__(self, text):
         num_characters = len(text)
@@ -20,8 +20,8 @@ def __call__(self, text):
             return True
         # calculate the percentage of non-alphanumeric characters
         percentage = 1 - ((num_characters - len(self.regex.findall(text))) / num_characters)
-        # if the percentage is greater than the cutoff_percentage then remove
-        if percentage > self.cutoff_percentage:
+        # if the percentage is greater than the remove_percentage then remove
+        if percentage > self.remove_percentage:
             return True
         # otherwise keep
         return False

diff --git a/easyllm/data/filters/words_to_symbol.py b/easyllm/data/filters/words_to_symbol.py
new file mode 100644
index 0000000..7539dec
--- /dev/null
+++ b/easyllm/data/filters/words_to_symbol.py
@@ -0,0 +1,33 @@
+import re
+
+from pydantic import BaseModel
+
+
+class SymbolToWordFilter(BaseModel):
+    """
+    Ref: Gopher (Rae et al., 2021)
+    Desc: If more than 10% of the document are symbols (hashes [#] or ellipsis (...)) then remove
+    """
+
+    name: str = "symbol_to_word"
+    regex: re.Pattern = r"(\#+|(\.{3,}))(?!\w)"
+    remove_percentage: float = 0.1
+
+    def __call__(self, text: str):
+        num_hashes = len(re.findall(r"\#+", text))
+        num_ellipses = len(re.findall(r"\.{3,}", text))
+        num_words = len(re.findall(r"\w+", text))
+
+        # check if there are any words in the text
+        if num_words == 0:
+            return True
+
+        hash_ratio = num_hashes / num_words
+        ellipses_ratio = num_ellipses / num_words
+
+        # if the percentage is greater than the remove_percentage then remove
+        if hash_ratio > self.remove_percentage or ellipses_ratio > self.remove_percentage:
+            return True
+
+        # otherwise keep
+        return False

diff --git a/notebooks/data-filter.ipynb b/notebooks/data-filter.ipynb
index 8a35c72..518e2f8 100644
--- a/notebooks/data-filter.ipynb
+++ b/notebooks/data-filter.ipynb
@@ -69,32 +69,80 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from easyllm.data.filters import NonAlphaNumericFilter\n",
+    "\n",
+    "nam = NonAlphaNumericFilter()\n",
+    "\n",
+    "# not filtered\n",
+    "assert nam(\"This is a test\") == False\n",
+    "\n",
+    "# filtered\n",
+    "assert nam(\"This is a test!!!!!!!\") == True\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## SymbolToWordFilter\n",
+    "\n",
+    "The `SymbolToWordFilter` removes any document with a symbol-to-word ratio greater than 0.1 for either the hash symbol or the ellipsis.Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from easyllm.data.filters import SymbolToWordFilter\n",
+    "\n",
+    "stw = SymbolToWordFilter()\n",
+    "\n",
+    "assert stw(\"This is a test\") == False\n",
+    "\n",
+    "assert stw(\"spam#spam#spam#spam#spam#spam#spam#spam\") == True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## DigitToCharacter\n",
+    "\n",
+    "The `DigitToCharacter` filter removes any document where more than 20% of the characters are digits."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "14\n", - "0\n", - "0.0\n", - "21\n", - "7\n", - "0.33333333333333337\n" + "num_digits: 13\n", + "total_chars: 66\n", + "num_digits / total_chars: 0.19696969696969696\n", + "num_digits: 10\n", + "total_chars: 18\n", + "num_digits / total_chars: 0.5555555555555556\n" ] } ], "source": [ - "from easyllm.data.filters import NonAlphaNumericFilter\n", + "from easyllm.data.filters import DigitToCharacter\n", "\n", - "nam = NonAlphaNumericFilter()\n", + "ntw = DigitToCharacter()\n", "\n", - "# not filtered\n", - "assert nam(\"This is a test\") == False\n", + "assert ntw(\"Hello 123 world 456 this text 789 contains 1234 numbers more words\") == False\n", "\n", - "# filtered\n", - "assert nam(\"This is a test!!!!!!!\") == True\n" + "assert ntw(\"Hello 34534 34534 \") == True\n" ] }, { From 332ba531bdb7ed1359097ec506109469a674a27a Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 14:05:24 +0000 Subject: [PATCH 04/16] more filters --- easyllm/data/filters/__init__.py | 5 + easyllm/data/filters/bulletpoint_ratio.py | 43 +++++++ easyllm/data/filters/longword.py | 21 ++++ easyllm/data/filters/parantheses_ration.py | 23 ++++ easyllm/data/filters/url_ratio.py | 24 ++++ easyllm/data/filters/whitespace_ration.py | 23 ++++ notebooks/data-filter.ipynb | 137 ++++++++++++++++++--- 7 files changed, 261 insertions(+), 15 deletions(-) create mode 100644 easyllm/data/filters/bulletpoint_ratio.py create mode 100644 easyllm/data/filters/longword.py create mode 100644 easyllm/data/filters/parantheses_ration.py create mode 100644 easyllm/data/filters/url_ratio.py create mode 100644 easyllm/data/filters/whitespace_ration.py diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py index 7156fc4..168ca92 100644 --- a/easyllm/data/filters/__init__.py +++ b/easyllm/data/filters/__init__.py @@ -1,4 +1,9 @@ +from easyllm.data.filters.bulletpoint_ratio import BulletpointRatioFilter from easyllm.data.filters.kenlm_ppl import PerplexityFilter +from easyllm.data.filters.longword import LongWordFilter from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter from easyllm.data.filters.digit_to_character import DigitToCharacter +from easyllm.data.filters.parantheses_ration import ParenthesesRationFilter +from easyllm.data.filters.url_ratio import UrlRatioFilter +from easyllm.data.filters.whitespace_ration import WhitespaceRatioFilter from easyllm.data.filters.words_to_symbol import SymbolToWordFilter diff --git a/easyllm/data/filters/bulletpoint_ratio.py b/easyllm/data/filters/bulletpoint_ratio.py new file mode 100644 index 0000000..1e02966 --- /dev/null +++ b/easyllm/data/filters/bulletpoint_ratio.py @@ -0,0 +1,43 @@ +import re +from typing import List + +from pydantic import BaseModel + + +class BulletpointRatioFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: If more than 90% of the document are bulletpoints then remove + """ + + name: str = "bulletpoint_ratio" + potential_bullet_points: List[str] = [ + "•", + "‣", + "⁃", + "⁌", + "⁍", + "∙", + "○", + "●", + "◘", + "◦", + "⦾", + "⦿", + "-", + ] + remove_percentage: float = 0.9 + + def __call__(self, text): + # split text into lines + lines = text.split("\n") + num_bullet_points = 0 + for line in lines: + # check if the line is a bullet point + if line.startswith(tuple(self.potential_bullet_points)): + num_bullet_points += 1 + # check if the ratio of bullet points 
to lines is greater than the remove percentage
+        if num_bullet_points / len(lines) > self.remove_percentage:
+            return True
+        # otherwise keep
+        return False

diff --git a/easyllm/data/filters/longword.py b/easyllm/data/filters/longword.py
new file mode 100644
index 0000000..ed59081
--- /dev/null
+++ b/easyllm/data/filters/longword.py
@@ -0,0 +1,21 @@
+import re
+
+from pydantic import BaseModel
+
+
+class LongWordFilter(BaseModel):
+    """
+    Ref: C4 Raffel et al.
+    Desc: If the document includes words with > 1000 characters then remove, e.g. js or minified files.
+    """
+
+    name: str = "long_word"
+    max_length: int = 1000
+
+    def __call__(self, text):
+        words = text.split()
+        max_len = max((len(word) for word in words), default=0)
+        if max_len > self.max_length:
+            return True
+        # otherwise keep
+        return False

diff --git a/easyllm/data/filters/parantheses_ration.py b/easyllm/data/filters/parantheses_ration.py
new file mode 100644
index 0000000..02c8e76
--- /dev/null
+++ b/easyllm/data/filters/parantheses_ration.py
@@ -0,0 +1,23 @@
+import re
+
+from pydantic import BaseModel
+
+
+class ParenthesesRationFilter(BaseModel):
+    """
+    Desc: If more than 10% of the document are parentheses then remove
+    """
+
+    name: str = "parentheses_ratio"
+    regex: re.Pattern = re.compile(r"\[|\]|\(|\)|{|}|⟨|⟩")
+    remove_percentage: float = 0.1
+
+    def __call__(self, text):
+        # count parentheses characters
+        parentheses_count = len(self.regex.findall(text))
+        sentence_length = len(text)
+        # check if the ratio of parentheses to text is greater than the remove percentage
+        if parentheses_count / sentence_length > self.remove_percentage:
+            return True
+        # otherwise keep
+        return False

diff --git a/easyllm/data/filters/url_ratio.py b/easyllm/data/filters/url_ratio.py
new file mode 100644
index 0000000..4571982
--- /dev/null
+++ b/easyllm/data/filters/url_ratio.py
@@ -0,0 +1,24 @@
+import re
+
+from pydantic import BaseModel
+
+
+class UrlRatioFilter(BaseModel):
+    """
+    Desc: If more than 20% of the document are urls then remove
+    """
+
+    name: str = "url_ratio"
+    regex: re.Pattern[
+        str
+    ] = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
+    remove_percentage: float = 0.2
+
+    def __call__(self, text):
+        # find all urls
+        urls = re.findall(self.regex, text)
+        # check if the ratio of urls to words is greater than the remove percentage
+        if len(urls) / len(text.split()) > self.remove_percentage:
+            return True
+        # otherwise keep
+        return False

diff --git a/easyllm/data/filters/whitespace_ration.py b/easyllm/data/filters/whitespace_ration.py
new file mode 100644
index 0000000..e9ff23a
--- /dev/null
+++ b/easyllm/data/filters/whitespace_ration.py
@@ -0,0 +1,23 @@
+import re
+
+from pydantic import BaseModel
+
+
+class WhitespaceRatioFilter(BaseModel):
+    """
+    Desc: If more than 25% of the document is whitespace then remove
+    """
+
+    name: str = "whitespace_ratio"
+    regex: re.Pattern = re.compile(r"\s")
+    remove_percentage: float = 0.25
+
+    def __call__(self, text):
+        # count whitespace characters
+        whitespace_count = len(self.regex.findall(text))
+        text_length = len(text)
+        # check if the ratio of whitespace to text is greater than the remove percentage
+        if whitespace_count / text_length > self.remove_percentage:
+            return True
+        # otherwise keep
+        return False
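
Note: the four new filters above all follow the same count-over-length pattern checked against a `remove_percentage`. Two quick sanity checks, derived directly from the class definitions above:

    from easyllm.data.filters import UrlRatioFilter, WhitespaceRatioFilter

    ur, wr = UrlRatioFilter(), WhitespaceRatioFilter()
    assert ur("plain text with no links") == False  # 0 URLs / 5 words = 0.0
    assert wr("spaced      out      text") == True  # 12 of 25 characters are whitespace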
"The `SymbolToWordFilter` removes any document with a symbol-to-word ratio greater than 0.1 for either the hash symbol or the ellipsis.Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf)" + "The `SymbolToWordFilter` removes any document with a symbol-to-word ratio greater than 0.1 for either the hash symbol or the ellipsis. Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf)" ] }, { @@ -121,20 +121,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "num_digits: 13\n", - "total_chars: 66\n", - "num_digits / total_chars: 0.19696969696969696\n", - "num_digits: 10\n", - "total_chars: 18\n", - "num_digits / total_chars: 0.5555555555555556\n" - ] - } - ], + "outputs": [], "source": [ "from easyllm.data.filters import DigitToCharacter\n", "\n", @@ -145,6 +132,126 @@ "assert ntw(\"Hello 34534 34534 \") == True\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## UrlRatioFilter\n", + "\n", + "The `UrlRatioFilter` removes any document where 20% of the document is a URL." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import UrlRatioFilter \n", + "\n", + "ur = UrlRatioFilter()\n", + "\n", + "assert ur(\"https://www.google.com\") == True\n", + "\n", + "assert ur(\"Example text with some urls http://www.example.com and more text https://www.example2.com and more text\") == False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## BulletpointRatioFilter \n", + "\n", + "The `BulletpointRatioFilter` removes documents that have more than 90% bulletpoints. Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import BulletpointRatioFilter\n", + "\n", + "br = BulletpointRatioFilter()\n", + "\n", + "assert br(\"This is a text with \\n- some bullets but\\nnot all\") == False\n", + "\n", + "assert br(\"- some bullets and\\n- some more\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## WhitespaceRatioFilter\n", + "\n", + "The `WhitespaceRatioFilter` is a filter that removes documents that more than 25% of the text is whitespace.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import WhitespaceRatioFilter\n", + "\n", + "wr = WhitespaceRatioFilter()\n", + "\n", + "assert wr(\"This is a test\") == False\n", + "\n", + "assert wr(\"Hello world! This text has extra whitespace.\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ParenthesesRationFilter\n", + "\n", + "The `ParenthesesRationFilter` is a filter that removes all sentences that have a parentheses ratio greater than 10%." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import ParenthesesRationFilter\n", + "\n", + "pr = ParenthesesRationFilter()\n", + "\n", + "assert pr(\"This is a normal sentence\") == False\n", + "\n", + "assert pr(\"This a (with ) ] {(e)\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LongWordFilter\n", + "\n", + "The `LongWordFilter` is a filter that removes documents that include words longer > 1000 character, e.g. js minfied files." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import LongWordFilter\n", + "\n", + "lw = LongWordFilter()\n", + "\n", + "assert lw(\"This is a test\") == False\n", + "\n", + "assert lw(f\"This is a test with a {'longword'*500}\") == True" + ] + }, { "cell_type": "code", "execution_count": null, From 78914d2ec5fe5838595387eaabfd33d312b2e827 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 14:40:21 +0000 Subject: [PATCH 05/16] more filters --- easyllm/data/filters/__init__.py | 2 + easyllm/data/filters/cookie_banner.py | 18 +++++++ easyllm/data/filters/length.py | 21 ++++++++ easyllm/data/filters/repeating.py | 53 +++++++++++++++++++ notebooks/data-filter.ipynb | 73 +++++++++++++++++++++++++++ 5 files changed, 167 insertions(+) create mode 100644 easyllm/data/filters/cookie_banner.py create mode 100644 easyllm/data/filters/length.py create mode 100644 easyllm/data/filters/repeating.py diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py index 168ca92..673bc7e 100644 --- a/easyllm/data/filters/__init__.py +++ b/easyllm/data/filters/__init__.py @@ -1,9 +1,11 @@ from easyllm.data.filters.bulletpoint_ratio import BulletpointRatioFilter from easyllm.data.filters.kenlm_ppl import PerplexityFilter +from easyllm.data.filters.length import LengthFilter from easyllm.data.filters.longword import LongWordFilter from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter from easyllm.data.filters.digit_to_character import DigitToCharacter from easyllm.data.filters.parantheses_ration import ParenthesesRationFilter +from easyllm.data.filters.repeating import RepeatedParagraphFilter, RepeatedLinesFilter from easyllm.data.filters.url_ratio import UrlRatioFilter from easyllm.data.filters.whitespace_ration import WhitespaceRatioFilter from easyllm.data.filters.words_to_symbol import SymbolToWordFilter diff --git a/easyllm/data/filters/cookie_banner.py b/easyllm/data/filters/cookie_banner.py new file mode 100644 index 0000000..0df4cbd --- /dev/null +++ b/easyllm/data/filters/cookie_banner.py @@ -0,0 +1,18 @@ +import re + +from pydantic import BaseModel + + +class CookieBannerFilter(BaseModel): + """ + Ref: C4 Raffel et al. + Desc: Removes documents if more than 40% of the documents include terms for cookies, tos, privacy policy, etc. Requires external list. 
+ """ + + name: str = "cookie_banner" + regex: re.Pattern = re.compile(r"(terms of use|privacy policy|copyright|all rights reserved)", re.IGNORECASE) + remove_percentage: float = 0.4 + + def __call__(self, text): + # check if the regex matches + raise NotImplementedError("CookieBannerFilter not implemented yet") diff --git a/easyllm/data/filters/length.py b/easyllm/data/filters/length.py new file mode 100644 index 0000000..4eaa9ff --- /dev/null +++ b/easyllm/data/filters/length.py @@ -0,0 +1,21 @@ +import re + +from pydantic import BaseModel + + +class LengthFilter(BaseModel): + """ + Desc: Removes documents below or above a certain length of words + """ + + name: str = "length" + min_length: int = 10 + max_length: int = 1_000_000 + + def __call__(self, text): + num_words = len(text.split()) + + if num_words < self.min_length or num_words > self.max_length: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/repeating.py b/easyllm/data/filters/repeating.py new file mode 100644 index 0000000..3df3077 --- /dev/null +++ b/easyllm/data/filters/repeating.py @@ -0,0 +1,53 @@ +import re + +from pydantic import BaseModel + + +class RepeatedLinesFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: If the document shrinks by > 30% after removing repeated lines then remove + """ + + name: str = "repeated_lines" + remove_percentage: float = 0.3 + + def __call__(self, text): + # split the text into lines + lines = text.split("\n") + # remove empty lines + lines = [line for line in lines if line.strip()] + if len(lines) == 0: + return True + # remove repeated lines + unique_lines = list(set(lines)) + # calculate the percentage of lines removed + if len(unique_lines) / len(lines) < self.remove_percentage: + return True + # otherwise keep + return False + + +class RepeatedParagraphFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: If the document shrinks by > 30% after removing repeated paragraphs then remove + """ + + name: str = "repeated_paragraph" + remove_percentage: float = 0.3 + + def __call__(self, text): + # split the text into lines + paragraphes = text.split("\n\n") + # remove empty paragraph + paragraphes = [p for p in paragraphes if p.strip()] + if len(paragraphes) == 0: + return True + # remove repeated paragraphes + unique_paragraphes = list(set(paragraphes)) + # calculate the percentage of lines removed + if len(unique_paragraphes) / len(paragraphes) < self.remove_percentage: + return True + # otherwise keep + return False diff --git a/notebooks/data-filter.ipynb b/notebooks/data-filter.ipynb index 0db8d1a..9814842 100644 --- a/notebooks/data-filter.ipynb +++ b/notebooks/data-filter.ipynb @@ -252,6 +252,79 @@ "assert lw(f\"This is a test with a {'longword'*500}\") == True" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LengthFilter\n", + "\n", + "The `LengthFilter` removes documents below or above a certain number of words. Not tokens since its more expensive to compute." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import LengthFilter\n", + "\n", + "l = LengthFilter(min_length=1, max_length=100)\n", + "\n", + "assert l(\"hello world\") == False\n", + "\n", + "assert l(\"hello world \" * 100) == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RepeatedParagraphFilter, RepeatedLinesFilter\n", + "\n", + "The `RepeatedParagraphFilter` & `RepeatedLinesFilter` remove documents which have more than 30% repeated lines or paragraphs. Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf) " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['hello', 'world']\n", + "1.0\n", + "['hello', 'hello', 'hello']\n", + "0.3333333333333333\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/home/ubuntu/easyllm/notebooks/data-filter.ipynb Cell 24\u001b[0m in \u001b[0;36m7\n\u001b[1;32m 4\u001b[0m rp \u001b[39m=\u001b[39m RepeatedParagraphFilter()\n\u001b[1;32m 6\u001b[0m \u001b[39massert\u001b[39;00m rl(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mworld\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[0;32m----> 7\u001b[0m \u001b[39massert\u001b[39;00m rl(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mhello\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 9\u001b[0m \u001b[39massert\u001b[39;00m rp(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39mworld\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[39massert\u001b[39;00m rp(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39mhello\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mTrue\u001b[39;00m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "from easyllm.data.filters import RepeatedLinesFilter, RepeatedParagraphFilter\n", + "\n", + "rl = RepeatedLinesFilter()\n", + "rp = RepeatedParagraphFilter()\n", + "\n", + "assert rl(\"hello\\nworld\") == False\n", + "assert rl(\"hello\\nhello\\nhello\") == True\n", + "\n", + "assert rp(\"hello\\n\\nworld\") == False\n", + "assert rp(\"hello\\n\\nhello\") == True" + ] + }, { "cell_type": "code", "execution_count": null, From 7f1392484a0d01ce05429187046e52a8310d892c Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 15:28:28 +0000 Subject: [PATCH 06/16] more filter --- easyllm/data/filters/__init__.py | 2 + easyllm/data/filters/n_gram.py | 31 ++++++++++++ easyllm/data/filters/punctuation.py | 27 ++++++++++ easyllm/data/filters/repeating.py | 2 - easyllm/data/filters/stop_word.py | 0 notebooks/data-filter.ipynb | 78 +++++++++++++++++++---------- 6 files changed, 112 insertions(+), 28 deletions(-) create mode 100644 easyllm/data/filters/n_gram.py create mode 100644 easyllm/data/filters/punctuation.py create mode 100644 
easyllm/data/filters/stop_word.py

diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py
index 673bc7e..b1ddc05 100644
--- a/easyllm/data/filters/__init__.py
+++ b/easyllm/data/filters/__init__.py
@@ -2,9 +2,11 @@
 from easyllm.data.filters.kenlm_ppl import PerplexityFilter
 from easyllm.data.filters.length import LengthFilter
 from easyllm.data.filters.longword import LongWordFilter
+from easyllm.data.filters.n_gram import TopNGramsFilter
 from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter
 from easyllm.data.filters.digit_to_character import DigitToCharacter
 from easyllm.data.filters.parantheses_ration import ParenthesesRationFilter
+from easyllm.data.filters.punctuation import PunctuationFilter
 from easyllm.data.filters.repeating import RepeatedParagraphFilter, RepeatedLinesFilter
 from easyllm.data.filters.url_ratio import UrlRatioFilter
 from easyllm.data.filters.whitespace_ration import WhitespaceRatioFilter

diff --git a/easyllm/data/filters/n_gram.py b/easyllm/data/filters/n_gram.py
new file mode 100644
index 0000000..0fa1f53
--- /dev/null
+++ b/easyllm/data/filters/n_gram.py
@@ -0,0 +1,31 @@
+from pydantic import BaseModel
+from itertools import chain
+from collections import Counter
+
+
+def get_ngrams(input_list, n):
+    return [item for item in zip(*[input_list[i:] for i in range(n)])]
+
+
+class TopNGramsFilter(BaseModel):
+    """
+    Ref: Gopher (Rae et al., 2021)
+    Desc: If the most common n-gram accounts for more than 20% of the document's n-grams then remove
+    """
+
+    name: str = "top_n_grams"
+    remove_percentage: float = 0.2
+    n: int = 2
+
+    def __call__(self, text):
+        words = text.split()
+        if len(words) <= self.n:
+            return True
+        ngrams = get_ngrams(words, self.n)
+        n_grams = Counter(ngrams)
+        most_common = n_grams.most_common(1)[0][0]
+
+        if n_grams[most_common] / len(n_grams) > self.remove_percentage:
+            return True
+        # otherwise keep
+        return False

diff --git a/easyllm/data/filters/punctuation.py b/easyllm/data/filters/punctuation.py
new file mode 100644
index 0000000..4a229e5
--- /dev/null
+++ b/easyllm/data/filters/punctuation.py
@@ -0,0 +1,27 @@
+import re
+from typing import List
+
+from pydantic import BaseModel
+
+
+class PunctuationFilter(BaseModel):
+    """
+    Ref: C4 Raffel et al.
+ Desc: If less than 15% of the sentences end with a punctuation mark then remove + """ + + name: str = "punctuation" + punctuations: List[str] = [".", "!", "?"] + remove_percentage: float = 0.15 + + def __call__(self, text): + sentences = text.split("\n") + # count the number of sentences not ending with a punctuation mark + num_sentences_wo_p = sum( + 1 for sentence in sentences if sentence[-1] not in self.punctuations + ) + # check if the ratio of sentences not ending with a punctuation mark is greater than the remove percentage + if num_sentences_wo_p / len(sentences) > self.remove_percentage: + return True + # otherwise keep + return False \ No newline at end of file diff --git a/easyllm/data/filters/repeating.py b/easyllm/data/filters/repeating.py index 3df3077..a37f5ca 100644 --- a/easyllm/data/filters/repeating.py +++ b/easyllm/data/filters/repeating.py @@ -1,5 +1,3 @@ -import re - from pydantic import BaseModel diff --git a/easyllm/data/filters/stop_word.py b/easyllm/data/filters/stop_word.py new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/data-filter.ipynb b/notebooks/data-filter.ipynb index 9814842..47bb67d 100644 --- a/notebooks/data-filter.ipynb +++ b/notebooks/data-filter.ipynb @@ -287,31 +287,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['hello', 'world']\n", - "1.0\n", - "['hello', 'hello', 'hello']\n", - "0.3333333333333333\n" - ] - }, - { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/home/ubuntu/easyllm/notebooks/data-filter.ipynb Cell 24\u001b[0m in \u001b[0;36m7\n\u001b[1;32m 4\u001b[0m rp \u001b[39m=\u001b[39m RepeatedParagraphFilter()\n\u001b[1;32m 6\u001b[0m \u001b[39massert\u001b[39;00m rl(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mworld\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[0;32m----> 7\u001b[0m \u001b[39massert\u001b[39;00m rl(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mhello\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 9\u001b[0m \u001b[39massert\u001b[39;00m rp(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39mworld\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[39massert\u001b[39;00m rp(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39mhello\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mTrue\u001b[39;00m\n", - "\u001b[0;31mAssertionError\u001b[0m: " - ] - } - ], + "outputs": [], "source": [ "from easyllm.data.filters import RepeatedLinesFilter, RepeatedParagraphFilter\n", "\n", @@ -319,10 +297,58 @@ "rp = RepeatedParagraphFilter()\n", "\n", "assert rl(\"hello\\nworld\") == False\n", - "assert rl(\"hello\\nhello\\nhello\") == True\n", + "assert rl(\"hello\\nhello\\nhello\\nhello\") == True\n", "\n", "assert rp(\"hello\\n\\nworld\") == False\n", - "assert rp(\"hello\\n\\nhello\") == True" + "assert rp(\"hello\\n\\nhello\\n\\nhello\\n\\nhello\") == 
True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TopNGramsFilter\n", + "\n", + "The `TopNGramsFilter` removes the document if the top n-gram makes more than 20% of the document." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import TopNGramsFilter\n", + "\n", + "tng = TopNGramsFilter()\n", + "\n", + "assert tng(\"This is a test for a longer sentence\") == False \n", + "\n", + "assert tng(\"The quick brown fox jumps over the lazy dog The quick brown\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PunctuationFilter\n", + "\n", + "The `PunctuationFilter` removes the document if more than 15% of the \"linebreaks\" don't contain any punctuation." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import PunctuationFilter\n", + "\n", + "pf = PunctuationFilter()\n", + "\n", + "assert pf(\"This is a sentence.\") == False\n", + "\n", + "assert pf(\"This is a sentence\\n But is not one.\\nNo oneyet.\") == True" ] }, { From e8108058dc24ace9943f8a342a0924750b36ba42 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 15:47:36 +0000 Subject: [PATCH 07/16] more filters --- easyllm/data/filters/__init__.py | 3 +- easyllm/data/filters/common_word.py | 29 +++++++++++++++++++ easyllm/data/filters/cookie_banner.py | 35 +++++++++++++++++++++++ easyllm/data/filters/punctuation.py | 33 ++++++++++++++++++---- easyllm/data/filters/stop_word.py | 0 notebooks/data-filter.ipynb | 40 +++++++++++++++++++++++---- 6 files changed, 129 insertions(+), 11 deletions(-) create mode 100644 easyllm/data/filters/common_word.py delete mode 100644 easyllm/data/filters/stop_word.py diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py index b1ddc05..9862e86 100644 --- a/easyllm/data/filters/__init__.py +++ b/easyllm/data/filters/__init__.py @@ -1,4 +1,5 @@ from easyllm.data.filters.bulletpoint_ratio import BulletpointRatioFilter +from easyllm.data.filters.common_word import CommonWordFilter from easyllm.data.filters.kenlm_ppl import PerplexityFilter from easyllm.data.filters.length import LengthFilter from easyllm.data.filters.longword import LongWordFilter @@ -6,7 +7,7 @@ from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter from easyllm.data.filters.digit_to_character import DigitToCharacter from easyllm.data.filters.parantheses_ration import ParenthesesRationFilter -from easyllm.data.filters.punctuation import PunctuationFilter +from easyllm.data.filters.punctuation import EllipsisFilter, PunctuationFilter from easyllm.data.filters.repeating import RepeatedParagraphFilter, RepeatedLinesFilter from easyllm.data.filters.url_ratio import UrlRatioFilter from easyllm.data.filters.whitespace_ration import WhitespaceRatioFilter diff --git a/easyllm/data/filters/common_word.py b/easyllm/data/filters/common_word.py new file mode 100644 index 0000000..f562020 --- /dev/null +++ b/easyllm/data/filters/common_word.py @@ -0,0 +1,29 @@ +from typing import List + +from pydantic import BaseModel + +COMMON_WORDS_EN = ["the", "be", "to", "of", "and", "that", "have", "with", "this"] +COMMON_WORDS_DE = ["der", "die", "das", "er" "sein", "zu", "ist", "war", "von", "und", "haben", "mit"] + + +class CommonWordFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: Makes sure that the document contains at least 2 common words if 
not remove + """ + + name: str = "common_word" + common_words: List[str] = COMMON_WORDS_EN + n: int = 2 + + def __call__(self, text): + words = text.split() + common_word_counter = 0 + # count the number of common words + for word in words: + if word.lower() in self.common_words: + common_word_counter += 1 + if common_word_counter >= self.n: + return False + # otherwise remove + return True diff --git a/easyllm/data/filters/cookie_banner.py b/easyllm/data/filters/cookie_banner.py index 0df4cbd..91ed1bb 100644 --- a/easyllm/data/filters/cookie_banner.py +++ b/easyllm/data/filters/cookie_banner.py @@ -2,6 +2,41 @@ from pydantic import BaseModel +policy_substrings = [ + "terms of use", + "privacy policy", + "cookie policy", + "uses cookies", + "privacy overview", + "use of cookies", + "use cookies", + "privacy & cookies policy", + "privacy and cookies policy", + "This website uses cookies to improve your experience while you " + "navigate through the website. Out of these cookies, the cookies " + "that are categorized as necessary are stored on your browser as they " + "are essential for the working of basic functionalities of the website. " + "We also use third-party cookies that help us analyze and understand how " + "you use this website. These cookies will be stored in your browser only " + "with your consent. You also have the option to opt-out of these " + "cookies. But opting out of some of these cookies may have an effect " + "on your browsing experience.".lower(), + "Necessary cookies are absolutely essential for the website to " + "function properly. This category only includes cookies that " + "ensures basic functionalities and security features of the website. " + "These cookies do not store any personal information.".lower(), + "Any cookies that may not be particularly necessary for the website " + "to function and is used specifically to collect user personal data " + "via analytics, ads, other embedded contents are termed as non-necessary " + "cookies. It is mandatory to procure user consent prior to running these " + "cookies on your website.".lower(), + "This site uses cookies, including for analytics, personalization, and " + "advertising purposes. For more information or to change your " + "cookie settings, click here.".lower(), + "If you continue to browse this site without changing your cookie " + "settings, you agree to this use. AcceptRead More".lower(), +] + class CookieBannerFilter(BaseModel): """ diff --git a/easyllm/data/filters/punctuation.py b/easyllm/data/filters/punctuation.py index 4a229e5..a0a1d53 100644 --- a/easyllm/data/filters/punctuation.py +++ b/easyllm/data/filters/punctuation.py @@ -1,4 +1,3 @@ -import re from typing import List from pydantic import BaseModel @@ -17,11 +16,35 @@ class PunctuationFilter(BaseModel): def __call__(self, text): sentences = text.split("\n") # count the number of sentences not ending with a punctuation mark - num_sentences_wo_p = sum( - 1 for sentence in sentences if sentence[-1] not in self.punctuations - ) + num_sentences_wo_p = sum(1 for sentence in sentences if sentence[-1] not in self.punctuations) # check if the ratio of sentences not ending with a punctuation mark is greater than the remove percentage if num_sentences_wo_p / len(sentences) > self.remove_percentage: return True # otherwise keep - return False \ No newline at end of file + return False + + +class EllipsisFilter(BaseModel): + """ + Ref: C4 Raffel et al. 
+    Desc: If more than 30% of the sentences end with an ellipsis then remove
+    """
+
+    name: str = "ellipsis"
+    ellipsis: List[str] = ["...", "[...]", "…", "(...)", "[…]", "-»", "read more..", "read more"]
+    remove_percentage: float = 0.3
+
+    def __call__(self, text):
+        sentences = text.split("\n")
+        # count the number of sentences ending with an ellipsis
+        ellipsis_counter = 0
+        for sentence in sentences:
+            for ellipsis in self.ellipsis:
+                if sentence.endswith(ellipsis):
+                    ellipsis_counter += 1
+                    break
+        # check if the ratio of sentences ending with an ellipsis is greater than the remove percentage
+        if ellipsis_counter / len(sentences) > self.remove_percentage:
+            return True
+        # otherwise keep
+        return False

diff --git a/easyllm/data/filters/stop_word.py b/easyllm/data/filters/stop_word.py
deleted file mode 100644
index e69de29..0000000

diff --git a/notebooks/data-filter.ipynb b/notebooks/data-filter.ipynb
index 47bb67d..ab940af 100644
--- a/notebooks/data-filter.ipynb
+++ b/notebooks/data-filter.ipynb
@@ -331,24 +331,54 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## PunctuationFilter\n",
+    "## PunctuationFilter & EllipsisFilter\n",
     "\n",
-    "The `PunctuationFilter` removes the document if more than 15% of the \"linebreaks\" don't contain any punctuation."
+    "The `PunctuationFilter` & `EllipsisFilter` remove the document if more than 15% of the lines don't end with a punctuation mark, or if more than 30% of the lines end with an ellipsis."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from easyllm.data.filters import PunctuationFilter\n",
+    "from easyllm.data.filters import PunctuationFilter, EllipsisFilter\n",
     "\n",
     "pf = PunctuationFilter()\n",
     "\n",
     "assert pf(\"This is a sentence.\") == False\n",
     "\n",
-    "assert pf(\"This is a sentence\\n But is not one.\\nNo oneyet.\") == True"
+    "assert pf(\"This is a sentence\\n But is not one.\\nNo oneyet.\") == True\n",
+    "\n",
+    "ef = EllipsisFilter()\n",
+    "\n",
+    "assert ef(\"This is a sentence.\") == False\n",
+    "\n",
+    "assert ef(\"This is a sentence\\n But is not one....\") == True"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## CommonWordFilter\n",
+    "\n",
+    "The `CommonWordFilter` removes documents if they don't include at least 2 common words."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import CommonWordFilter\n", + "\n", + "cw = CommonWordFilter()\n", + "\n", + "assert cw(\"This is a sentence with a common word.\") == False\n", + "\n", + "assert cw(\"cat dog mouse\") == True" ] }, { From 1f652a08d9b7c1d19d599770c38f8d150a4c86af Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 16:02:18 +0000 Subject: [PATCH 08/16] style --- easyllm/data/filters/__init__.py | 4 ++-- easyllm/data/filters/bulletpoint_ratio.py | 1 - easyllm/data/filters/kenlm_ppl.py | 4 ++-- easyllm/data/filters/length.py | 1 - easyllm/data/filters/longword.py | 1 - easyllm/data/filters/n_gram.py | 7 ++++--- easyllm/data/filters/non_alpha_numeric.py | 2 +- 7 files changed, 9 insertions(+), 11 deletions(-) diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py index 9862e86..d37d7b2 100644 --- a/easyllm/data/filters/__init__.py +++ b/easyllm/data/filters/__init__.py @@ -1,14 +1,14 @@ from easyllm.data.filters.bulletpoint_ratio import BulletpointRatioFilter from easyllm.data.filters.common_word import CommonWordFilter +from easyllm.data.filters.digit_to_character import DigitToCharacter from easyllm.data.filters.kenlm_ppl import PerplexityFilter from easyllm.data.filters.length import LengthFilter from easyllm.data.filters.longword import LongWordFilter from easyllm.data.filters.n_gram import TopNGramsFilter from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter -from easyllm.data.filters.digit_to_character import DigitToCharacter from easyllm.data.filters.parantheses_ration import ParenthesesRationFilter from easyllm.data.filters.punctuation import EllipsisFilter, PunctuationFilter -from easyllm.data.filters.repeating import RepeatedParagraphFilter, RepeatedLinesFilter +from easyllm.data.filters.repeating import RepeatedLinesFilter, RepeatedParagraphFilter from easyllm.data.filters.url_ratio import UrlRatioFilter from easyllm.data.filters.whitespace_ration import WhitespaceRatioFilter from easyllm.data.filters.words_to_symbol import SymbolToWordFilter diff --git a/easyllm/data/filters/bulletpoint_ratio.py b/easyllm/data/filters/bulletpoint_ratio.py index 1e02966..566d554 100644 --- a/easyllm/data/filters/bulletpoint_ratio.py +++ b/easyllm/data/filters/bulletpoint_ratio.py @@ -1,4 +1,3 @@ -import re from typing import List from pydantic import BaseModel diff --git a/easyllm/data/filters/kenlm_ppl.py b/easyllm/data/filters/kenlm_ppl.py index 55b56be..fb40cfa 100644 --- a/easyllm/data/filters/kenlm_ppl.py +++ b/easyllm/data/filters/kenlm_ppl.py @@ -103,8 +103,8 @@ def from_pretrained( try: model = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.arpa.bin") tokenizer = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.sp.model") - except: - raise ValueError(f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. Please train your own model and upload it to the hub.") + except Exception: + raise ValueError(f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. 
Please train your own model and upload it to the hub.") from None return cls( diff --git a/easyllm/data/filters/length.py b/easyllm/data/filters/length.py index 4eaa9ff..51646f2 100644 --- a/easyllm/data/filters/length.py +++ b/easyllm/data/filters/length.py @@ -1,4 +1,3 @@ -import re from pydantic import BaseModel diff --git a/easyllm/data/filters/longword.py b/easyllm/data/filters/longword.py index ed59081..f98ba38 100644 --- a/easyllm/data/filters/longword.py +++ b/easyllm/data/filters/longword.py @@ -1,4 +1,3 @@ -import re from pydantic import BaseModel diff --git a/easyllm/data/filters/n_gram.py b/easyllm/data/filters/n_gram.py index 0fa1f53..5523be3 100644 --- a/easyllm/data/filters/n_gram.py +++ b/easyllm/data/filters/n_gram.py @@ -1,10 +1,11 @@ -from pydantic import BaseModel -from itertools import chain from collections import Counter +from itertools import chain + +from pydantic import BaseModel def get_ngrams(input_list, n): - return [item for item in zip(*[input_list[i:] for i in range(n)])] + return list(zip(*[input_list[i:] for i in range(n)])) class TopNGramsFilter(BaseModel): diff --git a/easyllm/data/filters/non_alpha_numeric.py b/easyllm/data/filters/non_alpha_numeric.py index 8f0ba8b..81815f3 100644 --- a/easyllm/data/filters/non_alpha_numeric.py +++ b/easyllm/data/filters/non_alpha_numeric.py @@ -10,7 +10,7 @@ class NonAlphaNumericFilter(BaseModel): """ name: str = "non_alpha_numeric" - regex: re.Pattern = re.compile("[^a-zA-Z0-9\s]") + regex: re.Pattern = re.compile(r"[^a-zA-Z0-9\s]") remove_percentage: float = 0.2 def __call__(self, text): From ed573afb665102fa0acef821dfa31b7c34254d68 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 16:05:43 +0000 Subject: [PATCH 09/16] fix docs dependencies --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4bc1c5b..a9567f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ scripts = { easyllm = "easyllm.cli:main" } dependencies = ["pydantic==2.1.1", "nanoid==2.0.0", "huggingface-hub==0.16.4"] [project.optional-dependencies] -data = ["datasets","https://github.com/kpu/kenlm/archive/master.zip","sentencepiece"] +data = ["datasets","kenlm @ https://github.com/kpu/kenlm/archive/master.zip","sentencepiece"] test = ["pytest", "ruff", "black", "isort", "mypy", "hatch"] dev = ["ruff", "black", "isort", "mypy", "hatch"] docs = [ From c15240bc381b24d85251b12808d84a1bca8f2622 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 18:37:48 +0000 Subject: [PATCH 10/16] remote builds --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index a9567f7..84fb77c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,9 @@ Source = "https://github.com/unknown/hatch-demo" [tool.hatch.version] path = "easyllm/__init__.py" +[tool.hatch.metadata] +allow-direct-references = true + [project] name = "easyllm" description = "Description" From 323566dd5d4557178a0fc7aca839c4a202c5ae86 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 19:16:54 +0000 Subject: [PATCH 11/16] fix punctuation --- easyllm/data/filters/punctuation.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/easyllm/data/filters/punctuation.py b/easyllm/data/filters/punctuation.py index a0a1d53..0dd1297 100644 --- a/easyllm/data/filters/punctuation.py +++ b/easyllm/data/filters/punctuation.py @@ -16,9 +16,14 @@ class PunctuationFilter(BaseModel): def __call__(self, text): 
sentences = text.split("\n") # count the number of sentences not ending with a punctuation mark - num_sentences_wo_p = sum(1 for sentence in sentences if sentence[-1] not in self.punctuations) + punc_counter = 0 + for sentence in sentences: + for punc in self.punctuations: + if not sentence.endswith(punc): + punc_counter += 1 + break # check if the ratio of sentences not ending with a punctuation mark is greater than the remove percentage - if num_sentences_wo_p / len(sentences) > self.remove_percentage: + if punc_counter / len(sentences) > self.remove_percentage: return True # otherwise keep return False From aa9a2f9ee2cde331ba478a871c73585ba50d39d9 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 19:26:07 +0000 Subject: [PATCH 12/16] test --- easyllm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easyllm/__init__.py b/easyllm/__init__.py index e7e3057..ed7aec4 100644 --- a/easyllm/__init__.py +++ b/easyllm/__init__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2023-present philschmid # # SPDX-License-Identifier: MIT -__version__ = "0.5.0.dev0" +__version__ = "0.5.1.dev0" From f761f5b2fc830c40b151561d7e0bfc9526556579 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 19:35:28 +0000 Subject: [PATCH 13/16] fix punc --- easyllm/data/filters/punctuation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/easyllm/data/filters/punctuation.py b/easyllm/data/filters/punctuation.py index 0dd1297..91052e1 100644 --- a/easyllm/data/filters/punctuation.py +++ b/easyllm/data/filters/punctuation.py @@ -15,15 +15,15 @@ class PunctuationFilter(BaseModel): def __call__(self, text): sentences = text.split("\n") - # count the number of sentences not ending with a punctuation mark + # count the number of sentences ending with a punctuation mark punc_counter = 0 for sentence in sentences: for punc in self.punctuations: - if not sentence.endswith(punc): + if sentence.endswith(punc): punc_counter += 1 break # check if the ratio of sentences not ending with a punctuation mark is greater than the remove percentage - if punc_counter / len(sentences) > self.remove_percentage: + if 1 - (punc_counter / len(sentences)) > self.remove_percentage: return True # otherwise keep return False From 437b0dc0b3b8e25b723af228ac2f339b8201e0c7 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Sat, 12 Aug 2023 18:30:09 +0000 Subject: [PATCH 14/16] html extractor --- easyllm/data/extractor/__init__.py | 1 + easyllm/data/extractor/html_extractor.py | 24 ++++++++++++++++++ easyllm/data/filters/kenlm_ppl.py | 31 ++++++++++++------------ easyllm/data/filters/punctuation.py | 2 +- pyproject.toml | 2 +- 5 files changed, 42 insertions(+), 18 deletions(-) create mode 100644 easyllm/data/extractor/__init__.py create mode 100644 easyllm/data/extractor/html_extractor.py diff --git a/easyllm/data/extractor/__init__.py b/easyllm/data/extractor/__init__.py new file mode 100644 index 0000000..3f25c04 --- /dev/null +++ b/easyllm/data/extractor/__init__.py @@ -0,0 +1 @@ +from easyllm.data.extractor.html_extractor import HtmlExtractor diff --git a/easyllm/data/extractor/html_extractor.py b/easyllm/data/extractor/html_extractor.py new file mode 100644 index 0000000..41bf77f --- /dev/null +++ b/easyllm/data/extractor/html_extractor.py @@ -0,0 +1,24 @@ +from pydantic import BaseModel + +# +from inscriptis import get_text +from inscriptis.css_profiles import CSS_PROFILES +from inscriptis.model.config import ParserConfig +from readability import Document + 
+INSCRIPTIS_CONFIG = ParserConfig(css=CSS_PROFILES["strict"])
+
+
+class HtmlExtractor(BaseModel):
+    """
+    Desc: Extracts text from an HTML document using Mozilla's readability and inscriptis.
+    """
+
+    name: str = "html_extractor"
+    min_doc_length: int = 25
+
+    def __call__(self, document: str) -> str:
+        parsed_doc = Document(document, min_text_length=self.min_doc_length)
+        clean_html = parsed_doc.summary(html_partial=True)
+        content = get_text(clean_html, INSCRIPTIS_CONFIG).strip()
+        return content
diff --git a/easyllm/data/filters/kenlm_ppl.py b/easyllm/data/filters/kenlm_ppl.py
index fb40cfa..8cb4e1b 100644
--- a/easyllm/data/filters/kenlm_ppl.py
+++ b/easyllm/data/filters/kenlm_ppl.py
@@ -1,4 +1,3 @@
-
 import importlib.util
 import re
 import unicodedata
@@ -11,9 +10,8 @@
 _sentencepiece = importlib.util.find_spec("sentencepiece") is not None
 
 if _kenlm or not _sentencepiece:
-    import kenlm
-    import sentencepiece
-
+    import kenlm
+    import sentencepiece
 
 
 class SentencePiece:
@@ -68,10 +66,8 @@ class KenlmModel:
         "%": "%",
         "►": "-",
     }
-    unicode_punct_re:re.Pattern = re.compile(f"[{''.join(unicode_punct.keys())}]")
-    non_printing_chars_re:re.Pattern = re.compile(
-        f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
-    )
+    unicode_punct_re: re.Pattern = re.compile(f"[{''.join(unicode_punct.keys())}]")
+    non_printing_chars_re: re.Pattern = re.compile(f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]")
     model: kenlm.Model = None
     tokenizer: SentencePiece = None
     accent: bool = False
@@ -101,11 +97,12 @@ def from_pretrained(
         language_or_path: str,
     ):
         try:
-            model = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.arpa.bin")
-            tokenizer = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.sp.model")
+            model = hf_hub_download("philschmid/kenlm", filename=f"wikipedia/{language_or_path}.arpa.bin")
+            tokenizer = hf_hub_download("philschmid/kenlm", filename=f"wikipedia/{language_or_path}.sp.model")
         except Exception:
-            raise ValueError(f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. Please train your own model and upload it to the hub.") from None
+            raise ValueError(
+                f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. Please train your own model and upload it to the hub."
+ ) from None return cls( model, @@ -187,15 +184,17 @@ class PerplexityFilter(BaseModel): max_threshold: int = 1000 model_config = ConfigDict(arbitrary_types_allowed=True) - def __init__(self,language:str,min_threshold:int=0,max_threshold:int=1000): + def __init__(self, language: str, min_threshold: int = 0, max_threshold: int = 1000): super().__init__() self.min_threshold = min_threshold self.max_threshold = max_threshold self.model = KenlmModel.from_pretrained(language) - def __call__(self, doc: str) -> bool: # returns True if the perplexity of the document outside of the threshold, # meaning smaller than min_threshold or larger than max_threshold - return not self.min_threshold <= self.model.get_perplexity(doc) <= self.max_threshold - + perplexity = self.model.get_perplexity(doc) + if perplexity < self.min_threshold or perplexity > self.max_threshold: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/punctuation.py b/easyllm/data/filters/punctuation.py index 91052e1..17da9a2 100644 --- a/easyllm/data/filters/punctuation.py +++ b/easyllm/data/filters/punctuation.py @@ -23,7 +23,7 @@ def __call__(self, text): punc_counter += 1 break # check if the ratio of sentences not ending with a punctuation mark is greater than the remove percentage - if 1 - (punc_counter / len(sentences)) > self.remove_percentage: + if punc_counter / len(sentences) < self.remove_percentage: return True # otherwise keep return False diff --git a/pyproject.toml b/pyproject.toml index 84fb77c..83b7595 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ scripts = { easyllm = "easyllm.cli:main" } dependencies = ["pydantic==2.1.1", "nanoid==2.0.0", "huggingface-hub==0.16.4"] [project.optional-dependencies] -data = ["datasets","kenlm @ https://github.com/kpu/kenlm/archive/master.zip","sentencepiece"] +data = ["datasets","kenlm @ https://github.com/kpu/kenlm/archive/master.zip","sentencepiece","readability-lxml","inscriptis"] test = ["pytest", "ruff", "black", "isort", "mypy", "hatch"] dev = ["ruff", "black", "isort", "mypy", "hatch"] docs = [ From 02e6526b42469b7c0d5fbd1f5a53990cfbde4a32 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 24 Nov 2023 14:49:50 +0000 Subject: [PATCH 15/16] example --- notebooks/datasets/filter-dataset.ipynb | 2316 +++++++++++++++++++++++ 1 file changed, 2316 insertions(+) create mode 100644 notebooks/datasets/filter-dataset.ipynb diff --git a/notebooks/datasets/filter-dataset.ipynb b/notebooks/datasets/filter-dataset.ipynb new file mode 100644 index 0000000..a55f94a --- /dev/null +++ b/notebooks/datasets/filter-dataset.ipynb @@ -0,0 +1,2316 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip uninstall easyllm -y\n", + "%pip install git+https://github.com/philschmid/easyllm.git@datafilter --upgrade" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset('philschmid/oscar-2301-de-minhash-dedup',split=\"train\")\n", + "# ds = load_dataset('wikipedia','20220301.de',split=\"train\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Perplexity filtering \n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nach § 80 Abs. 
5 Satz 1 Halbsatz 2 VwGO kann das Gericht der Hauptsache die aufschiebende Wirkung der Klage ganz oder teilweise wiederherstellen. Ist die sofortige Vollziehung von der Behörde den formellen Anforderungen des § 80 Abs. 3 Satz 1 VwGO genügend angeordnet worden, so entscheidet das Gericht nach § 80 Abs. 5 Satz 1 Halbsatz 2 VwGO über die Wiederherstellung der aufschiebenden Wirkung der Klage auf der Grundlage einer eigenen Abwägung des Interesses des Antragstellers, von der Vollziehung des angefochtenen Verwaltungsakts bis zur endgültigen Entscheidung über seine Rechtmäßigkeit verschont zu bleiben, gegen das besondere öffentliche Interesse an dessen sofortiger Vollziehung (vgl. BVerwG, Beschl. v. 19.12.2014 - 7 VR 5.14 -, juris Rn. 9; Nds. OVG, Beschl. v. 10.09.2014 - 8 ME 87/14 -, juris Rn. 2). Im Rahmen der Interessenabwägung haben die Erfolgsaussichten des in der Hauptsache eingelegten Rechtsbehelfs eine entscheidende Bedeutung. Ergibt sich bei der im Rahmen des vorläufigen Rechtsschutzes gebotenen, aber grundsätzlich auch ausreichenden (vgl. Nds. OVG, Beschl. v. 16.8.2017 - 13 ME 173/17 -, juris Rn. 4, vgl. auch Beschl. v. 24.01.2018 - 7 ME 110/17 -, juris Rn. 28) summarischen Überprüfung, dass der Rechtsbehelf in der Hauptsache keinen Erfolg haben wird, weil sich der angegriffene Verwaltungsakt als offensichtlich rechtmäßig erweist, so überwiegt regelmäßig das öffentliche Interesse an der sofortigen Vollziehung des Verwaltungsakts. Erweist sich der Rechtsbehelf bei summarischer Überprüfung demgegenüber als offensichtlich erfolgreich, überwiegt regelmäßig das Interesse des Adressaten des Verwaltungsakts, von dessen Vollziehung vorerst verschont zu bleiben. Stellen sich die Erfolgsaussichten des Rechtsbehelfs hingegen als offen dar, so ist eine Abwägung der widerstreitenden Interessen erforderlich, bei der in Rechnung zu stellen ist, welche Gründe bei bestehender Unsicherheit im Hinblick auf die Erfolgsaussichten des Rechtsbehelfs für und gegen eine Aufrechterhaltung der sofortigen Vollziehung des Verwaltungsakts sprechen (vgl. Nds. OVG, Beschl. v. 10.5.2010 - 13 ME 181/09 -, juris Rn. 4). Außerdem ist zu berücksichtigen, dass die voraussichtliche Rechtmäßigkeit eines Verwaltungsakts für sich allein nur das allgemeine Interesse an seiner Vollziehung begründet, nicht aber zugleich auch deren, für die behördliche Anordnung nach § 80 Abs. 2 Satz 1 Nr. 4 VwGO erforderliche Dringlichkeit (vgl. grundlegend BVerfG, Beschl. v. 27.4.2005 - 1 BvR 223/05 -, NVwZ 2005, 1303; Beschl. v. 18.7.1973, - 1 BvR 23/73 -, BVerfGE 35, 382, 402; Nds. OVG, Beschl. v. 10.9.2014, a.a.O.; Finkelnburg/Dombert/Külpmann, Vorläufiger Rechtsschutz im Verwaltungsstreitverfahren, 7. Aufl., Rn. 757 f. 
m.w.N.).\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(ds[456][\"text\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8071b0d5472949deabe06d5600f46054",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "add url (num_proc=128):   0%|          | 0/53172498 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# df = ds.to_pandas()\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Plot the perplexity distribution of the dataframe\n",
+    "def plot_distribution(dfs):\n",
+    "    # Get the 5th, 50th and 95th percentiles\n",
+    "    q1 = dfs['perplexity'].quantile(.05)\n",
+    "    q2 = dfs['perplexity'].quantile(.5)\n",
+    "    q3 = dfs['perplexity'].quantile(.95)\n",
+    "\n",
+    "    # Create line chart\n",
+    "    counts, bins = np.histogram(dfs['perplexity'], bins=30000)\n",
+    "    bin_centers = 0.5*(bins[1:] + bins[:-1])\n",
+    "    plt.plot(bin_centers, counts)\n",
+    "\n",
+    "    # Add vertical lines for the percentiles\n",
+    "    plt.axvline(x=q1, color='r')\n",
+    "    plt.axvline(x=q2, color='g')\n",
+    "    plt.axvline(x=q3, color='b')\n",
+    "\n",
+    "    plt.title('Perplexity Distribution')\n",
+    "    plt.xlabel('Perplexity')\n",
+    "    plt.ylabel('Frequency')\n",
+    "    plt.xscale('log')\n",
+    "\n",
+    "    plt.show()\n",
+    "\n",
+    "plot_distribution(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Get some random samples from the dataset with low and high perplexity."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Low: 3.3\n",
+      "High: 1155.9099999999978\n"
+     ]
+    }
+   ],
+   "source": [
+    "low = df.perplexity.quantile(0)\n",
+    "high = df.perplexity.quantile(0.9)\n",
+    "\n",
+    "print(f'Low: {low}')\n",
+    "print(f'High: {high}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "_lowest sample:_\n",
+    "```\n",
+    "'Die Skulptur Madonna mit Kind in der katholischen Kirche St-Lucien in Angy, einer französischen Gemeinde im Département Oise in der Region Hauts-de-France, wurde im dritten Viertel des 14. Jahrhunderts geschaffen. Im Jahr 1912 wurde die gotische Skulptur als Monument historique in die Liste der geschützten Objekte (Base Palissy) in Frankreich aufgenommen.\\nDie 1,10 Meter hohe Skulptur aus Kalkstein ist farbig gefasst. Maria hält das Jesuskind auf dem linken Arm. Sein Gesicht wendet sich in Richtung des Betrachters. Maria, mit bäuerlichem Gesicht und roten Wangen, trägt auf ihrem Haupt eine Krone. Die vielen Falten von ihrem Kleid geben ihrer Erscheinung eine Fülle.'\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Filter strategy\n",
+    "\n",
+    "1. NonAlphaNumericFilter: remove documents with more than 20% non-alphanumeric characters\n",
+    "2. ParenthesesRationFilter: remove documents with more than 5% parentheses\n",
+    "3. PunctuationFilter: remove documents where more than 15% of lines lack end punctuation\n",
+    "4. EllipsisFilter: remove documents where more than 30% of lines end with an ellipsis\n",
+    "5. LengthFilter: filter out short documents (< 5 words)\n",
+    "6. LongWordFilter: remove documents with extremely long words (e.g. minified JS)\n",
+    "7. CommonWordFilter: check that the text reads as coherent sentences (maybe not needed)\n",
+    "8. RepeatedLinesFilter: remove documents with more than 30% repeated lines\n",
+    "9. WhitespaceRatioFilter: remove documents with more than 25% whitespace\n",
+    "10. UrlRatioFilter: remove documents with more than 20% URLs\n",
+    "11. 
PerplexityFilter: remove documents with perplexity > 1000\n",
+    "\n",
+    "\n",
+    "TODO: find a law example which is super long and filter it"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "wikipedia_filters = [\n",
+    "    NonAlphaNumericFilter(),\n",
+    "    LengthFilter(min_length=10),\n",
+    "    CommonWordFilter(common_words=COMMON_WORDS_DE),\n",
+    "    UrlRatioFilter(),\n",
+    "    PerplexityFilter(language=\"de\",min_threshold=0,max_threshold=perplexity_threshold)\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "da7067303e814628917d39b02cab5c0e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "filter documents... (num_proc=128):   0%|          | 0/53172498 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "def apply_filters(x):\n",
+    "    # each filter returns True when the document should be removed\n",
+    "    for f in wikipedia_filters:\n",
+    "        if f(x[\"text\"]):\n",
+    "            return False\n",
+    "    # additionally drop extremely long documents\n",
+    "    if len(x[\"text\"]) > 300_000:\n",
+    "        return False\n",
+    "    return True\n",
+    "\n",
+    "\n",
+    "# Dataset.filter keeps rows where the function returns True; our filters return True\n",
+    "# when a document should be removed, so apply_filters inverts their result\n",
+    "ds = ds.filter(apply_filters,num_proc=os.cpu_count(),\n",
+    "        desc=\"filter documents...\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "37786e0a47ec4375a7495bc1c5ed7ff2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Saving the dataset (0/564 shards):   0%|          | 0/44401239 [00:00<?, ?it/s]"

From f761f5b2fc830c40b151561d7e0bfc9526556579 Mon Sep 17 00:00:00 2001
From: Philipp Schmid
Date: Fri, 24 Nov 2023 14:50:56 +0000
Subject: [PATCH 16/16] happy quality

---
 easyllm/data/extractor/html_extractor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/easyllm/data/extractor/html_extractor.py b/easyllm/data/extractor/html_extractor.py
index 41bf77f..6a1d4a3 100644
--- a/easyllm/data/extractor/html_extractor.py
+++ b/easyllm/data/extractor/html_extractor.py
@@ -1,9 +1,8 @@
-from pydantic import BaseModel
-
 #
 from inscriptis import get_text
 from inscriptis.css_profiles import CSS_PROFILES
 from inscriptis.model.config import ParserConfig
+from pydantic import BaseModel
 from readability import Document
 
 INSCRIPTIS_CONFIG = ParserConfig(css=CSS_PROFILES["strict"])
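Taken together, these patches provide an extraction step (`HtmlExtractor`) and a set of quality filters that compose into a single preprocessing pass. The following is a minimal sketch of that composition, not part of the patches themselves: the sample HTML string and the small `COMMON_WORDS_DE` stand-in list are illustrative assumptions (in practice you would use a real frequency list), and the thresholds simply mirror the defaults used above.

```python
import os

from datasets import load_dataset

from easyllm.data.extractor import HtmlExtractor
from easyllm.data.filters import (
    CommonWordFilter,
    LengthFilter,
    NonAlphaNumericFilter,
    PerplexityFilter,
    UrlRatioFilter,
)

# HtmlExtractor turns a raw page into plain text before any filtering
extractor = HtmlExtractor()
text = extractor("<html><body><p>Ein kurzer Beispieltext.</p></body></html>")

# Assumption: illustrative stand-in list, not shipped with the library
COMMON_WORDS_DE = ["der", "die", "das", "und", "ist", "nicht", "ein", "mit"]

filters = [
    NonAlphaNumericFilter(),
    LengthFilter(min_length=10),
    CommonWordFilter(common_words=COMMON_WORDS_DE),
    UrlRatioFilter(),
    PerplexityFilter(language="de", min_threshold=0, max_threshold=1000),
]


def keep(example):
    # Each filter returns True when the document should be removed, while
    # Dataset.filter keeps rows where the function returns True, so negate.
    return not any(f(example["text"]) for f in filters)


ds = load_dataset("wikipedia", "20220301.de", split="train")
ds = ds.filter(keep, num_proc=os.cpu_count(), desc="filter documents...")
```

Because every filter implements `__call__` and uses the same convention (return `True` to drop the document), swapping filters in and out of the list requires no other changes to the pipeline.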