From 5b20d8bb9544ebe50ebe3d30babd85223d27d0bb Mon Sep 17 00:00:00 2001
From: Philipp Schmid
Date: Fri, 11 Aug 2023 12:37:09 +0000
Subject: [PATCH 01/16] ppl

---
 easyllm/data/__init__.py          |   0
 easyllm/data/filters/__init__.py  |   1 +
 easyllm/data/filters/kenlm_ppl.py | 201 ++++++++++++++++++++++++++++++
 notebooks/data-filter.ipynb       |  91 ++++++++++++++
 pyproject.toml                    |   1 +
 5 files changed, 294 insertions(+)
 create mode 100644 easyllm/data/__init__.py
 create mode 100644 easyllm/data/filters/__init__.py
 create mode 100644 easyllm/data/filters/kenlm_ppl.py
 create mode 100644 notebooks/data-filter.ipynb

diff --git a/easyllm/data/__init__.py b/easyllm/data/__init__.py
new file mode 100644
index 0000000..e69de29

diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py
new file mode 100644
index 0000000..f411be4
--- /dev/null
+++ b/easyllm/data/filters/__init__.py
@@ -0,0 +1 @@
+from easyllm.data.filters.kenlm_ppl import PerplexityFilter

diff --git a/easyllm/data/filters/kenlm_ppl.py b/easyllm/data/filters/kenlm_ppl.py
new file mode 100644
index 0000000..fbfa362
--- /dev/null
+++ b/easyllm/data/filters/kenlm_ppl.py
@@ -0,0 +1,201 @@
+
+import os
+import re
+import unicodedata
+from typing import Dict
+import importlib.util
+from pydantic import BaseModel, ConfigDict
+from huggingface_hub import hf_hub_download
+
+_kenlm = importlib.util.find_spec("kenlm") is not None
+_sentencepiece = importlib.util.find_spec("sentencepiece") is not None
+
+if _kenlm and _sentencepiece:
+    import kenlm
+    import sentencepiece
+
+
+
+class SentencePiece:
+    def __init__(
+        self,
+        model: str,
+    ):
+        super().__init__()
+        self.sp = sentencepiece.SentencePieceProcessor()
+        self.sp.load(str(model))
+
+    def do(self, text: str) -> str:
+        tokenized = self.sp.encode_as_pieces(text)
+        return " ".join(tokenized)
+
+
+class KenlmModel:
+    digit_re: re.Pattern[str] = re.compile(r"\d")
+    unicode_punct: Dict[str, str] = {
+        "，": ",",
+        "。": ".",
+        "、": ",",
+        "„": '"',
+        "”": '"',
+        "“": '"',
+        "«": '"',
+        "»": '"',
+        "１": '"',
+        "」": '"',
+        "「": '"',
+        "《": '"',
+        "》": '"',
+        "´": "'",
+        "∶": ":",
+        "：": ":",
+        "？": "?",
+        "！": "!",
+        "（": "(",
+        "）": ")",
+        "；": ";",
+        "–": "-",
+        "—": " - ",
+        "．": ". ",
+        "～": "~",
+        "’": "'",
+        "…": "...",
+        "━": "-",
+        "〈": "<",
+        "〉": ">",
+        "【": "[",
+        "】": "]",
+        "％": "%",
+        "►": "-",
+    }
+    unicode_punct_re:re.Pattern = re.compile(f"[{''.join(unicode_punct.keys())}]")
+    non_printing_chars_re:re.Pattern = re.compile(
+        f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
+    )
+    model: kenlm.Model = None
+    tokenizer: SentencePiece = None
+    accent: bool = False
+    case: bool = False
+    numbers: bool = True
+    punct: int = 1
+
+    def __init__(
+        self,
+        model_path: str,
+        tokenizer_path: str,
+        lower_case: bool = False,
+        remove_accents: bool = False,
+        normalize_numbers: bool = True,
+        punctuation: int = 1,
+    ):
+        self.model = kenlm.Model(model_path)
+        self.tokenizer = SentencePiece(tokenizer_path)
+        self.accent = remove_accents
+        self.case = lower_case
+        self.numbers = normalize_numbers
+        self.punct = punctuation
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        language_or_path: str,
+    ):
+        try:
+            model = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.arpa.bin")
+            tokenizer = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.sp.model")
+        except:
+            raise ValueError(f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. Please train your own model and upload it to the hub.")
+
+
+        return cls(
+            model,
+            tokenizer,
+            False,
+            False,
+            True,
+            1,
+        )
+
+    def pp(self, log_score, length):
+        return 10.0 ** (-log_score / length)
+
+    def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
+        if normalize_cc_net:
+            doc = self.normalize(
+                doc,
+                accent=self.accent,
+                case=self.case,
+                numbers=self.numbers,
+                punct=self.punct,
+            )
+        # Tokenize (after normalizing): See https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/mine.py#L352 for full pipeline
+        doc = self.tokenizer.do(doc)
+        doc_log_score, doc_length = 0, 0
+        for line in doc.split("\n"):
+            log_score = self.model.score(line)
+            length = len(line.split()) + 1
+            doc_log_score += log_score
+            doc_length += length
+        return round(self.pp(doc_log_score, doc_length), 1)
+
+    def normalize(
+        self,
+        line: str,
+        accent: bool = True,
+        case: bool = True,
+        numbers: bool = True,
+        punct: int = 1,
+    ) -> str:
+        line = line.strip()
+        if not line:
+            return line
+        if case:
+            line = line.lower()
+        if accent:
+            line = self.strip_accents(line)
+        if numbers:
+            line = self.digit_re.sub("0", line)
+        if punct == 1:
+            line = self.replace_unicode_punct(line)
+        elif punct == 2:
+            line = self.remove_unicode_punct(line)
+        line = self.remove_non_printing_char(line)
+        return line
+
+    def strip_accents(self, line: str) -> str:
+        """Strips accents from a piece of text."""
+        nfd = unicodedata.normalize("NFD", line)
+        output = [c for c in nfd if unicodedata.category(c) != "Mn"]
+        if len(output) == len(line):
+            return line
+        return "".join(output)
+
+    def replace_unicode_punct(self, text: str) -> str:
+        return "".join(self.unicode_punct.get(c, c) for c in text)
+
+    def remove_unicode_punct(self, text: str) -> str:
+        """More aggressive version of replace_unicode_punct but also faster."""
+        return self.unicode_punct_re.sub("", text)
+
+    def remove_non_printing_char(self, text: str) -> str:
+        return self.non_printing_chars_re.sub("", text)
+
+
+class PerplexityFilter(BaseModel):
+    model: KenlmModel = None
+    min_threshold: int = 0
+    max_threshold: int = 1000
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def __init__(self,language:str,min_threshold:int=0,max_threshold:int=1000):
+        super().__init__()
+        self.min_threshold = min_threshold
+        self.max_threshold = max_threshold
+        self.model = KenlmModel.from_pretrained(language)
+
+
+    def __call__(self, doc: str) -> bool:
+        # returns True if the perplexity of the document outside of the threshold,
+        # meaning smaller than min_threshold or larger than max_threshold
+        return not self.min_threshold <= self.model.get_perplexity(doc) <= self.max_threshold
+    
\ No newline at end of file
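
Note: `kenlm.Model.score` returns a base-10 log probability for a whole line, so the `pp` helper above turns a summed log score back into a document-level perplexity. A worked example of the formula, with illustrative numbers only:

    # perplexity = 10 ** (-log_score / length); lower means "more Wikipedia-like"
    doc_log_score, doc_length = -42.0, 12  # summed log10 probability, token count
    perplexity = 10.0 ** (-doc_log_score / doc_length)
    print(round(perplexity, 1))  # 3162.3 -> outside the default 0..1000 band, so filtered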
diff --git a/notebooks/data-filter.ipynb b/notebooks/data-filter.ipynb
new file mode 100644
index 0000000..75e3b5f
--- /dev/null
+++ b/notebooks/data-filter.ipynb
@@ -0,0 +1,91 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# How to use EasyLLM Quality data filters\n",
+    "\n",
+    "EasyLLM's `data` package adds quality filters for preprocessing text data for improved pretraining. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install \"easyllm[data]\" --upgrade"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Perplexity filtering\n",
+    "\n",
+    "Perplexity filtering can be used to improve model quality, coherence, and training efficiency by removing confusing text segments and focusing model learning on more standard, comprehensible language.\n",
+    "Perplexity filtering is implemented using `KenLM` models trained on Wikipedia. You just need to provide your language id, e.g. `de`, and your perplexity `min_threshold` and `max_threshold`; the filter returns `True` if the perplexity of the text is outside of the thresholds and `False` otherwise.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "341.3\n",
+      "46793.5\n"
+     ]
+    }
+   ],
+   "source": [
+    "from easyllm.data.filters import PerplexityFilter\n",
+    "\n",
+    "ppl = PerplexityFilter(\"en\",min_threshold=10,max_threshold=1000)\n",
+    "\n",
+    "# Get perplexity\n",
+    "print(ppl.model.get_perplexity(\"I am very perplexed\"))\n",
+    "# 341.3 (low perplexity, since sentence style is formal and with no grammar mistakes)\n",
+    "\n",
+    "print(ppl.model.get_perplexity(\"im hella trippin\"))\n",
+    "# 46793.5 (high perplexity, since the sentence is colloquial and contains grammar mistakes)\n",
+    "\n",
+    "# testing the filter\n",
+    "assert ppl(\"I am very perplexed\") == False\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "dev",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

diff --git a/pyproject.toml b/pyproject.toml
index e5da485..4bc1c5b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ scripts = { easyllm = "easyllm.cli:main" }
 dependencies = ["pydantic==2.1.1", "nanoid==2.0.0", "huggingface-hub==0.16.4"]
 
 [project.optional-dependencies]
+data = ["datasets","https://github.com/kpu/kenlm/archive/master.zip","sentencepiece"]
 test = ["pytest", "ruff", "black", "isort", "mypy", "hatch"]
 dev = ["ruff", "black", "isort", "mypy", "hatch"]
 docs = [
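
Note: the `data` extra added above also pulls in `datasets`, so a typical way to apply the new filter at scale is through `datasets`' `filter` method. A minimal sketch — the dataset and column names are placeholders, not part of this patch:

    from datasets import load_dataset
    from easyllm.data.filters import PerplexityFilter

    ppl = PerplexityFilter("en", min_threshold=10, max_threshold=1000)
    ds = load_dataset("text", data_files="corpus.txt", split="train")
    # a filter returns True when a document should be dropped, so negate it to keep
    ds_clean = ds.filter(lambda example: not ppl(example["text"]))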
From aba974a95c4d30a39f6446a71ea9c118912342e9 Mon Sep 17 00:00:00 2001
From: Philipp Schmid
Date: Fri, 11 Aug 2023 12:58:27 +0000
Subject: [PATCH 02/16] non alpha numeric filter

---
 easyllm/data/filters/__init__.py          |  1 +
 easyllm/data/filters/kenlm_ppl.py         | 16 ++++++++--------
 easyllm/data/filters/non_alpha_numeric.py | 27 ++++++++++++++++++++++
 notebooks/data-filter.ipynb               | 39 +++++++++++++++++++++++
 4 files changed, 75 insertions(+), 8 deletions(-)
 create mode 100644 easyllm/data/filters/non_alpha_numeric.py

diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py
index f411be4..f589d8c 100644
--- a/easyllm/data/filters/__init__.py
+++ b/easyllm/data/filters/__init__.py
@@ -1 +1,2 @@
 from easyllm.data.filters.kenlm_ppl import PerplexityFilter
+from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter

diff --git a/easyllm/data/filters/kenlm_ppl.py b/easyllm/data/filters/kenlm_ppl.py
index fbfa362..55b56be 100644
--- a/easyllm/data/filters/kenlm_ppl.py
+++ b/easyllm/data/filters/kenlm_ppl.py
@@ -1,11 +1,11 @@
-
-import os
+import importlib.util
 import re
 import unicodedata
 from typing import Dict
-import importlib.util
-from pydantic import BaseModel, ConfigDict
+
 from huggingface_hub import hf_hub_download
+from pydantic import BaseModel, ConfigDict
 
 _kenlm = importlib.util.find_spec("kenlm") is not None
 _sentencepiece = importlib.util.find_spec("sentencepiece") is not None
@@ -99,14 +99,14 @@ def __init__(
     def from_pretrained(
         cls,
         language_or_path: str,
-    ): 
-        try: 
+    ):
+        try:
             model = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.arpa.bin")
             tokenizer = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.sp.model")
         except:
             raise ValueError(f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. Please train your own model and upload it to the hub.")
-        
-        
+
+
         return cls(
             model,
             tokenizer,

diff --git a/easyllm/data/filters/non_alpha_numeric.py b/easyllm/data/filters/non_alpha_numeric.py
new file mode 100644
index 0000000..194889d
--- /dev/null
+++ b/easyllm/data/filters/non_alpha_numeric.py
@@ -0,0 +1,27 @@
+import re
+
+from pydantic import BaseModel
+
+
+class NonAlphaNumericFilter(BaseModel):
+    """
+    Ref: Gopher (Rae et al., 2021)
+    Desc: If more than 20% of the document is non-alphanumeric then remove
+    """
+
+    name: str = "non_alpha_numeric"
+    regex: re.Pattern = re.compile("[^a-zA-Z0-9\s]")
+    cutoff_percentage: float = 0.2
+
+    def __call__(self, text):
+        num_characters = len(text)
+        # check if there are any characters in the text
+        if num_characters == 0:
+            return True
+        # calculate the percentage of non-alphanumeric characters
+        percentage = 1 - ((num_characters - len(self.regex.findall(text))) / num_characters)
+        # if the percentage is greater than the cutoff_percentage then remove
+        if percentage > self.cutoff_percentage:
+            return True
+        # otherwise keep
+        return False
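
Note: for intuition on the ratio computed above: in the notebook example below, `"This is a test!!!!!!!"` has 21 characters, 7 of which match `[^a-zA-Z0-9\s]`, so the non-alphanumeric share is 1 - ((21 - 7) / 21) ≈ 0.33. That is above the 0.2 cutoff, so the document is removed, while `"This is a test"` scores 0.0 and is kept.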
diff --git a/notebooks/data-filter.ipynb b/notebooks/data-filter.ipynb
index 75e3b5f..8a35c72 100644
--- a/notebooks/data-filter.ipynb
+++ b/notebooks/data-filter.ipynb
@@ -58,6 +58,45 @@
     "assert ppl(\"I am very perplexed\") == False\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## NonAlphaNumericFilter\n",
+    "\n",
+    "The `NonAlphaNumericFilter` removes documents based on the number of non-alphanumeric characters in the document. Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf), if the document has more than 20% non-alphanumeric characters, it is removed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "14\n",
+      "0\n",
+      "0.0\n",
+      "21\n",
+      "7\n",
+      "0.33333333333333337\n"
+     ]
+    }
+   ],
+   "source": [
+    "from easyllm.data.filters import NonAlphaNumericFilter\n",
+    "\n",
+    "nam = NonAlphaNumericFilter()\n",
+    "\n",
+    "# not filtered\n",
+    "assert nam(\"This is a test\") == False\n",
+    "\n",
+    "# filtered\n",
+    "assert nam(\"This is a test!!!!!!!\") == True\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
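
Note: every filter in this series follows the same convention — calling the filter returns True when the document should be dropped. That makes them straightforward to compose; a sketch:

    from easyllm.data.filters import NonAlphaNumericFilter, PerplexityFilter

    # run the cheap character-level check before the more expensive perplexity model
    filters = [NonAlphaNumericFilter(), PerplexityFilter("en", min_threshold=10, max_threshold=1000)]

    def keep(doc: str) -> bool:
        # keep a document only if no filter flags it for removal
        return not any(f(doc) for f in filters)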
From e9c922ed618d32381fc7e9ea2066723a009e2ba9 Mon Sep 17 00:00:00 2001
From: Philipp Schmid
Date: Fri, 11 Aug 2023 13:33:45 +0000
Subject: [PATCH 03/16] more filters

---
 easyllm/data/filters/__init__.py           |  2 +
 easyllm/data/filters/digit_to_character.py | 22 +++++++
 easyllm/data/filters/non_alpha_numeric.py  |  6 +-
 easyllm/data/filters/words_to_symbol.py    | 33 ++++++++++
 notebooks/data-filter.ipynb                | 74 ++++++++++++++++++----
 5 files changed, 121 insertions(+), 16 deletions(-)
 create mode 100644 easyllm/data/filters/digit_to_character.py
 create mode 100644 easyllm/data/filters/words_to_symbol.py

diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py
index f589d8c..7156fc4 100644
--- a/easyllm/data/filters/__init__.py
+++ b/easyllm/data/filters/__init__.py
@@ -1,2 +1,4 @@
 from easyllm.data.filters.kenlm_ppl import PerplexityFilter
 from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter
+from easyllm.data.filters.digit_to_character import DigitToCharacter
+from easyllm.data.filters.words_to_symbol import SymbolToWordFilter

diff --git a/easyllm/data/filters/digit_to_character.py b/easyllm/data/filters/digit_to_character.py
new file mode 100644
index 0000000..7916afc
--- /dev/null
+++ b/easyllm/data/filters/digit_to_character.py
@@ -0,0 +1,22 @@
+import re
+
+from pydantic import BaseModel
+
+
+class DigitToCharacter(BaseModel):
+    """
+    Desc: If more than 20% of the document are digits then remove
+    """
+
+    name: str = "digit_to_character"
+    remove_percentage: float = 0.2
+
+    def __call__(self, text):
+        digits = re.findall(r"\d", text)
+        num_digits = len(digits)
+        total_chars = len(text)
+        # remove empty documents and documents with too many digits
+        if not total_chars or num_digits / total_chars > self.remove_percentage:
+            return True
+        # otherwise keep
+        return False

diff --git a/easyllm/data/filters/non_alpha_numeric.py b/easyllm/data/filters/non_alpha_numeric.py
index 194889d..8f0ba8b 100644
--- a/easyllm/data/filters/non_alpha_numeric.py
+++ b/easyllm/data/filters/non_alpha_numeric.py
@@ -11,7 +11,7 @@ class NonAlphaNumericFilter(BaseModel):
 
     name: str = "non_alpha_numeric"
     regex: re.Pattern = re.compile("[^a-zA-Z0-9\s]")
-    cutoff_percentage: float = 0.2
+    remove_percentage: float = 0.2
 
     def __call__(self, text):
         num_characters = len(text)
@@ -20,8 +20,8 @@ def __call__(self, text):
             return True
         # calculate the percentage of non-alphanumeric characters
         percentage = 1 - ((num_characters - len(self.regex.findall(text))) / num_characters)
-        # if the percentage is greater than the cutoff_percentage then remove
-        if percentage > self.cutoff_percentage:
+        # if the percentage is greater than the remove_percentage then remove
+        if percentage > self.remove_percentage:
             return True
         # otherwise keep
         return False

diff --git a/easyllm/data/filters/words_to_symbol.py b/easyllm/data/filters/words_to_symbol.py
new file mode 100644
index 0000000..7539dec
--- /dev/null
+++ b/easyllm/data/filters/words_to_symbol.py
@@ -0,0 +1,33 @@
+import re
+
+from pydantic import BaseModel
+
+
+class SymbolToWordFilter(BaseModel):
+    """
+    Ref: Gopher (Rae et al., 2021)
+    Desc: If more than 10% of the document are symbols (hashes [#] or ellipsis (...)) then remove
+    """
+
+    name: str = "symbol_to_word"
+    regex: re.Pattern = r"(\#+|(\.{3,}))(?!\w)"
+    remove_percentage: float = 0.1
+
+    def __call__(self, text: str):
+        num_hashes = len(re.findall(r"\#+", text))
+        num_ellipses = len(re.findall(r"\.{3,}", text))
+        num_words = len(re.findall(r"\w+", text))
+
+        # check if there are any words in the text
+        if num_words == 0:
+            return True
+
+        hash_ratio = num_hashes / num_words
+        ellipses_ratio = num_ellipses / num_words
+
+        # if the percentage is greater than the remove_percentage then remove
+        if hash_ratio > self.remove_percentage or ellipses_ratio > self.remove_percentage:
+            return True
+
+        # otherwise keep
+        return False

diff --git a/notebooks/data-filter.ipynb b/notebooks/data-filter.ipynb
index 8a35c72..518e2f8 100644
--- a/notebooks/data-filter.ipynb
+++ b/notebooks/data-filter.ipynb
@@ -69,32 +69,80 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from easyllm.data.filters import NonAlphaNumericFilter\n",
+    "\n",
+    "nam = NonAlphaNumericFilter()\n",
+    "\n",
+    "# not filtered\n",
+    "assert nam(\"This is a test\") == False\n",
+    "\n",
+    "# filtered\n",
+    "assert nam(\"This is a test!!!!!!!\") == True\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## SymbolToWordFilter\n",
+    "\n",
+    "The `SymbolToWordFilter` removes any document with a symbol-to-word ratio greater than 0.1 for either the hash symbol or the ellipsis.Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from easyllm.data.filters import SymbolToWordFilter\n",
+    "\n",
+    "stw = SymbolToWordFilter()\n",
+    "\n",
+    "assert stw(\"This is a test\") == False\n",
+    "\n",
+    "assert stw(\"spam#spam#spam#spam#spam#spam#spam#spam\") == True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## DigitToCharacter\n",
+    "\n",
+    "The `DigitToCharacter` filter removes any document where more than 20% of the characters are digits."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "14\n", - "0\n", - "0.0\n", - "21\n", - "7\n", - "0.33333333333333337\n" + "num_digits: 13\n", + "total_chars: 66\n", + "num_digits / total_chars: 0.19696969696969696\n", + "num_digits: 10\n", + "total_chars: 18\n", + "num_digits / total_chars: 0.5555555555555556\n" ] } ], "source": [ - "from easyllm.data.filters import NonAlphaNumericFilter\n", + "from easyllm.data.filters import DigitToCharacter\n", "\n", - "nam = NonAlphaNumericFilter()\n", + "ntw = DigitToCharacter()\n", "\n", - "# not filtered\n", - "assert nam(\"This is a test\") == False\n", + "assert ntw(\"Hello 123 world 456 this text 789 contains 1234 numbers more words\") == False\n", "\n", - "# filtered\n", - "assert nam(\"This is a test!!!!!!!\") == True\n" + "assert ntw(\"Hello 34534 34534 \") == True\n" ] }, { From 332ba531bdb7ed1359097ec506109469a674a27a Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 14:05:24 +0000 Subject: [PATCH 04/16] more filters --- easyllm/data/filters/__init__.py | 5 + easyllm/data/filters/bulletpoint_ratio.py | 43 +++++++ easyllm/data/filters/longword.py | 21 ++++ easyllm/data/filters/parantheses_ration.py | 23 ++++ easyllm/data/filters/url_ratio.py | 24 ++++ easyllm/data/filters/whitespace_ration.py | 23 ++++ notebooks/data-filter.ipynb | 137 ++++++++++++++++++--- 7 files changed, 261 insertions(+), 15 deletions(-) create mode 100644 easyllm/data/filters/bulletpoint_ratio.py create mode 100644 easyllm/data/filters/longword.py create mode 100644 easyllm/data/filters/parantheses_ration.py create mode 100644 easyllm/data/filters/url_ratio.py create mode 100644 easyllm/data/filters/whitespace_ration.py diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py index 7156fc4..168ca92 100644 --- a/easyllm/data/filters/__init__.py +++ b/easyllm/data/filters/__init__.py @@ -1,4 +1,9 @@ +from easyllm.data.filters.bulletpoint_ratio import BulletpointRatioFilter from easyllm.data.filters.kenlm_ppl import PerplexityFilter +from easyllm.data.filters.longword import LongWordFilter from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter from easyllm.data.filters.digit_to_character import DigitToCharacter +from easyllm.data.filters.parantheses_ration import ParenthesesRationFilter +from easyllm.data.filters.url_ratio import UrlRatioFilter +from easyllm.data.filters.whitespace_ration import WhitespaceRatioFilter from easyllm.data.filters.words_to_symbol import SymbolToWordFilter diff --git a/easyllm/data/filters/bulletpoint_ratio.py b/easyllm/data/filters/bulletpoint_ratio.py new file mode 100644 index 0000000..1e02966 --- /dev/null +++ b/easyllm/data/filters/bulletpoint_ratio.py @@ -0,0 +1,43 @@ +import re +from typing import List + +from pydantic import BaseModel + + +class BulletpointRatioFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: If more than 90% of the document are bulletpoints then remove + """ + + name: str = "bulletpoint_ratio" + potential_bullet_points: List[str] = [ + "•", + "‣", + "⁃", + "⁌", + "⁍", + "∙", + "○", + "●", + "◘", + "◦", + "⦾", + "⦿", + "-", + ] + remove_percentage: float = 0.9 + + def __call__(self, text): + # split text into lines + lines = text.split("\n") + num_bullet_points = 0 + for line in lines: + # check if the line is a bullet point + if line.startswith(tuple(self.potential_bullet_points)): + num_bullet_points += 1 + # check if the ratio of bullet points 
to lines is greater than the remove percentage
+        if num_bullet_points / len(lines) > self.remove_percentage:
+            return True
+        # otherwise keep
+        return False

diff --git a/easyllm/data/filters/longword.py b/easyllm/data/filters/longword.py
new file mode 100644
index 0000000..ed59081
--- /dev/null
+++ b/easyllm/data/filters/longword.py
@@ -0,0 +1,21 @@
+import re
+
+from pydantic import BaseModel
+
+
+class LongWordFilter(BaseModel):
+    """
+    Ref: C4 Raffel et al.
+    Desc: If the document includes words with > 1000 characters then remove, e.g. js or minified files.
+    """
+
+    name: str = "long_word"
+    max_length: int = 1000
+
+    def __call__(self, text):
+        words = text.split()
+        max_len = max((len(word) for word in words), default=0)
+        if max_len > self.max_length:
+            return True
+        # otherwise keep
+        return False

diff --git a/easyllm/data/filters/parantheses_ration.py b/easyllm/data/filters/parantheses_ration.py
new file mode 100644
index 0000000..02c8e76
--- /dev/null
+++ b/easyllm/data/filters/parantheses_ration.py
@@ -0,0 +1,23 @@
+import re
+
+from pydantic import BaseModel
+
+
+class ParenthesesRationFilter(BaseModel):
+    """
+    Desc: If more than 10% of the document are parentheses then remove
+    """
+
+    name: str = "parentheses_ratio"
+    regex: re.Pattern = re.compile(r"\[|\]|\(|\)|{|}|⟨|⟩")
+    remove_percentage: float = 0.1
+
+    def __call__(self, text):
+        # count parentheses characters
+        parentheses_count = len(self.regex.findall(text))
+        sentence_length = len(text)
+        # check if the ratio of parentheses to text is greater than the remove percentage
+        if parentheses_count / sentence_length > self.remove_percentage:
+            return True
+        # otherwise keep
+        return False

diff --git a/easyllm/data/filters/url_ratio.py b/easyllm/data/filters/url_ratio.py
new file mode 100644
index 0000000..4571982
--- /dev/null
+++ b/easyllm/data/filters/url_ratio.py
@@ -0,0 +1,24 @@
+import re
+
+from pydantic import BaseModel
+
+
+class UrlRatioFilter(BaseModel):
+    """
+    Desc: If more than 20% of the document are urls then remove
+    """
+
+    name: str = "url_ratio"
+    regex: re.Pattern[
+        str
+    ] = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
+    remove_percentage: float = 0.2
+
+    def __call__(self, text):
+        # find all urls
+        urls = re.findall(self.regex, text)
+        # check if the ratio of urls to words is greater than the remove percentage
+        if len(urls) / len(text.split()) > self.remove_percentage:
+            return True
+        # otherwise keep
+        return False

diff --git a/easyllm/data/filters/whitespace_ration.py b/easyllm/data/filters/whitespace_ration.py
new file mode 100644
index 0000000..e9ff23a
--- /dev/null
+++ b/easyllm/data/filters/whitespace_ration.py
@@ -0,0 +1,23 @@
+import re
+
+from pydantic import BaseModel
+
+
+class WhitespaceRatioFilter(BaseModel):
+    """
+    Desc: If more than 25% of the document is whitespace then remove
+    """
+
+    name: str = "whitespace_ratio"
+    regex: re.Pattern = re.compile(r"\s")
+    remove_percentage: float = 0.25
+
+    def __call__(self, text):
+        # count whitespace characters
+        whitespace_count = len(self.regex.findall(text))
+        text_length = len(text)
+        # check if the ratio of whitespace to text is greater than the remove percentage
+        if whitespace_count / text_length > self.remove_percentage:
+            return True
+        # otherwise keep
+        return False
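
Note: the four new filters above all follow the same count-over-length pattern checked against a `remove_percentage`. Two quick sanity checks, derived directly from the class definitions above:

    from easyllm.data.filters import UrlRatioFilter, WhitespaceRatioFilter

    ur, wr = UrlRatioFilter(), WhitespaceRatioFilter()
    assert ur("plain text with no links") == False  # 0 URLs / 5 words = 0.0
    assert wr("spaced      out      text") == True  # 12 of 25 characters are whitespace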
"The `SymbolToWordFilter` removes any document with a symbol-to-word ratio greater than 0.1 for either the hash symbol or the ellipsis.Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf)" + "The `SymbolToWordFilter` removes any document with a symbol-to-word ratio greater than 0.1 for either the hash symbol or the ellipsis. Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf)" ] }, { @@ -121,20 +121,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "num_digits: 13\n", - "total_chars: 66\n", - "num_digits / total_chars: 0.19696969696969696\n", - "num_digits: 10\n", - "total_chars: 18\n", - "num_digits / total_chars: 0.5555555555555556\n" - ] - } - ], + "outputs": [], "source": [ "from easyllm.data.filters import DigitToCharacter\n", "\n", @@ -145,6 +132,126 @@ "assert ntw(\"Hello 34534 34534 \") == True\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## UrlRatioFilter\n", + "\n", + "The `UrlRatioFilter` removes any document where 20% of the document is a URL." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import UrlRatioFilter \n", + "\n", + "ur = UrlRatioFilter()\n", + "\n", + "assert ur(\"https://www.google.com\") == True\n", + "\n", + "assert ur(\"Example text with some urls http://www.example.com and more text https://www.example2.com and more text\") == False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## BulletpointRatioFilter \n", + "\n", + "The `BulletpointRatioFilter` removes documents that have more than 90% bulletpoints. Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import BulletpointRatioFilter\n", + "\n", + "br = BulletpointRatioFilter()\n", + "\n", + "assert br(\"This is a text with \\n- some bullets but\\nnot all\") == False\n", + "\n", + "assert br(\"- some bullets and\\n- some more\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## WhitespaceRatioFilter\n", + "\n", + "The `WhitespaceRatioFilter` is a filter that removes documents that more than 25% of the text is whitespace.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import WhitespaceRatioFilter\n", + "\n", + "wr = WhitespaceRatioFilter()\n", + "\n", + "assert wr(\"This is a test\") == False\n", + "\n", + "assert wr(\"Hello world! This text has extra whitespace.\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ParenthesesRationFilter\n", + "\n", + "The `ParenthesesRationFilter` is a filter that removes all sentences that have a parentheses ratio greater than 10%." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import ParenthesesRationFilter\n", + "\n", + "pr = ParenthesesRationFilter()\n", + "\n", + "assert pr(\"This is a normal sentence\") == False\n", + "\n", + "assert pr(\"This a (with ) ] {(e)\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LongWordFilter\n", + "\n", + "The `LongWordFilter` is a filter that removes documents that include words longer > 1000 character, e.g. js minfied files." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import LongWordFilter\n", + "\n", + "lw = LongWordFilter()\n", + "\n", + "assert lw(\"This is a test\") == False\n", + "\n", + "assert lw(f\"This is a test with a {'longword'*500}\") == True" + ] + }, { "cell_type": "code", "execution_count": null, From 78914d2ec5fe5838595387eaabfd33d312b2e827 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 14:40:21 +0000 Subject: [PATCH 05/16] more filters --- easyllm/data/filters/__init__.py | 2 + easyllm/data/filters/cookie_banner.py | 18 +++++++ easyllm/data/filters/length.py | 21 ++++++++ easyllm/data/filters/repeating.py | 53 +++++++++++++++++++ notebooks/data-filter.ipynb | 73 +++++++++++++++++++++++++++ 5 files changed, 167 insertions(+) create mode 100644 easyllm/data/filters/cookie_banner.py create mode 100644 easyllm/data/filters/length.py create mode 100644 easyllm/data/filters/repeating.py diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py index 168ca92..673bc7e 100644 --- a/easyllm/data/filters/__init__.py +++ b/easyllm/data/filters/__init__.py @@ -1,9 +1,11 @@ from easyllm.data.filters.bulletpoint_ratio import BulletpointRatioFilter from easyllm.data.filters.kenlm_ppl import PerplexityFilter +from easyllm.data.filters.length import LengthFilter from easyllm.data.filters.longword import LongWordFilter from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter from easyllm.data.filters.digit_to_character import DigitToCharacter from easyllm.data.filters.parantheses_ration import ParenthesesRationFilter +from easyllm.data.filters.repeating import RepeatedParagraphFilter, RepeatedLinesFilter from easyllm.data.filters.url_ratio import UrlRatioFilter from easyllm.data.filters.whitespace_ration import WhitespaceRatioFilter from easyllm.data.filters.words_to_symbol import SymbolToWordFilter diff --git a/easyllm/data/filters/cookie_banner.py b/easyllm/data/filters/cookie_banner.py new file mode 100644 index 0000000..0df4cbd --- /dev/null +++ b/easyllm/data/filters/cookie_banner.py @@ -0,0 +1,18 @@ +import re + +from pydantic import BaseModel + + +class CookieBannerFilter(BaseModel): + """ + Ref: C4 Raffel et al. + Desc: Removes documents if more than 40% of the documents include terms for cookies, tos, privacy policy, etc. Requires external list. 
+ """ + + name: str = "cookie_banner" + regex: re.Pattern = re.compile(r"(terms of use|privacy policy|copyright|all rights reserved)", re.IGNORECASE) + remove_percentage: float = 0.4 + + def __call__(self, text): + # check if the regex matches + raise NotImplementedError("CookieBannerFilter not implemented yet") diff --git a/easyllm/data/filters/length.py b/easyllm/data/filters/length.py new file mode 100644 index 0000000..4eaa9ff --- /dev/null +++ b/easyllm/data/filters/length.py @@ -0,0 +1,21 @@ +import re + +from pydantic import BaseModel + + +class LengthFilter(BaseModel): + """ + Desc: Removes documents below or above a certain length of words + """ + + name: str = "length" + min_length: int = 10 + max_length: int = 1_000_000 + + def __call__(self, text): + num_words = len(text.split()) + + if num_words < self.min_length or num_words > self.max_length: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/repeating.py b/easyllm/data/filters/repeating.py new file mode 100644 index 0000000..3df3077 --- /dev/null +++ b/easyllm/data/filters/repeating.py @@ -0,0 +1,53 @@ +import re + +from pydantic import BaseModel + + +class RepeatedLinesFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: If the document shrinks by > 30% after removing repeated lines then remove + """ + + name: str = "repeated_lines" + remove_percentage: float = 0.3 + + def __call__(self, text): + # split the text into lines + lines = text.split("\n") + # remove empty lines + lines = [line for line in lines if line.strip()] + if len(lines) == 0: + return True + # remove repeated lines + unique_lines = list(set(lines)) + # calculate the percentage of lines removed + if len(unique_lines) / len(lines) < self.remove_percentage: + return True + # otherwise keep + return False + + +class RepeatedParagraphFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: If the document shrinks by > 30% after removing repeated paragraphs then remove + """ + + name: str = "repeated_paragraph" + remove_percentage: float = 0.3 + + def __call__(self, text): + # split the text into lines + paragraphes = text.split("\n\n") + # remove empty paragraph + paragraphes = [p for p in paragraphes if p.strip()] + if len(paragraphes) == 0: + return True + # remove repeated paragraphes + unique_paragraphes = list(set(paragraphes)) + # calculate the percentage of lines removed + if len(unique_paragraphes) / len(paragraphes) < self.remove_percentage: + return True + # otherwise keep + return False diff --git a/notebooks/data-filter.ipynb b/notebooks/data-filter.ipynb index 0db8d1a..9814842 100644 --- a/notebooks/data-filter.ipynb +++ b/notebooks/data-filter.ipynb @@ -252,6 +252,79 @@ "assert lw(f\"This is a test with a {'longword'*500}\") == True" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LengthFilter\n", + "\n", + "The `LengthFilter` removes documents below or above a certain number of words. Not tokens since its more expensive to compute." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import LengthFilter\n", + "\n", + "l = LengthFilter(min_length=1, max_length=100)\n", + "\n", + "assert l(\"hello world\") == False\n", + "\n", + "assert l(\"hello world \" * 100) == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RepeatedParagraphFilter, RepeatedLinesFilter\n", + "\n", + "The `RepeatedParagraphFilter` & `RepeatedLinesFilter` remove documents which have more than 30% repeated lines or paragraphs. Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf) " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['hello', 'world']\n", + "1.0\n", + "['hello', 'hello', 'hello']\n", + "0.3333333333333333\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/home/ubuntu/easyllm/notebooks/data-filter.ipynb Cell 24\u001b[0m in \u001b[0;36m7\n\u001b[1;32m 4\u001b[0m rp \u001b[39m=\u001b[39m RepeatedParagraphFilter()\n\u001b[1;32m 6\u001b[0m \u001b[39massert\u001b[39;00m rl(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mworld\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[0;32m----> 7\u001b[0m \u001b[39massert\u001b[39;00m rl(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mhello\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 9\u001b[0m \u001b[39massert\u001b[39;00m rp(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39mworld\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[39massert\u001b[39;00m rp(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39mhello\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mTrue\u001b[39;00m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "from easyllm.data.filters import RepeatedLinesFilter, RepeatedParagraphFilter\n", + "\n", + "rl = RepeatedLinesFilter()\n", + "rp = RepeatedParagraphFilter()\n", + "\n", + "assert rl(\"hello\\nworld\") == False\n", + "assert rl(\"hello\\nhello\\nhello\") == True\n", + "\n", + "assert rp(\"hello\\n\\nworld\") == False\n", + "assert rp(\"hello\\n\\nhello\") == True" + ] + }, { "cell_type": "code", "execution_count": null, From 7f1392484a0d01ce05429187046e52a8310d892c Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 15:28:28 +0000 Subject: [PATCH 06/16] more filter --- easyllm/data/filters/__init__.py | 2 + easyllm/data/filters/n_gram.py | 31 ++++++++++++ easyllm/data/filters/punctuation.py | 27 ++++++++++ easyllm/data/filters/repeating.py | 2 - easyllm/data/filters/stop_word.py | 0 notebooks/data-filter.ipynb | 78 +++++++++++++++++++---------- 6 files changed, 112 insertions(+), 28 deletions(-) create mode 100644 easyllm/data/filters/n_gram.py create mode 100644 easyllm/data/filters/punctuation.py create mode 100644 
easyllm/data/filters/stop_word.py

diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py
index 673bc7e..b1ddc05 100644
--- a/easyllm/data/filters/__init__.py
+++ b/easyllm/data/filters/__init__.py
@@ -2,9 +2,11 @@
 from easyllm.data.filters.kenlm_ppl import PerplexityFilter
 from easyllm.data.filters.length import LengthFilter
 from easyllm.data.filters.longword import LongWordFilter
+from easyllm.data.filters.n_gram import TopNGramsFilter
 from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter
 from easyllm.data.filters.digit_to_character import DigitToCharacter
 from easyllm.data.filters.parantheses_ration import ParenthesesRationFilter
+from easyllm.data.filters.punctuation import PunctuationFilter
 from easyllm.data.filters.repeating import RepeatedParagraphFilter, RepeatedLinesFilter
 from easyllm.data.filters.url_ratio import UrlRatioFilter
 from easyllm.data.filters.whitespace_ration import WhitespaceRatioFilter

diff --git a/easyllm/data/filters/n_gram.py b/easyllm/data/filters/n_gram.py
new file mode 100644
index 0000000..0fa1f53
--- /dev/null
+++ b/easyllm/data/filters/n_gram.py
@@ -0,0 +1,31 @@
+from pydantic import BaseModel
+from itertools import chain
+from collections import Counter
+
+
+def get_ngrams(input_list, n):
+    return [item for item in zip(*[input_list[i:] for i in range(n)])]
+
+
+class TopNGramsFilter(BaseModel):
+    """
+    Ref: Gopher (Rae et al., 2021)
+    Desc: If the most common n-gram accounts for more than 20% of the document's n-grams then remove
+    """
+
+    name: str = "top_n_grams"
+    remove_percentage: float = 0.2
+    n: int = 2
+
+    def __call__(self, text):
+        words = text.split()
+        if len(words) <= self.n:
+            return True
+        ngrams = get_ngrams(words, self.n)
+        n_grams = Counter(ngrams)
+        most_common = n_grams.most_common(1)[0][0]
+
+        if n_grams[most_common] / len(n_grams) > self.remove_percentage:
+            return True
+        # otherwise keep
+        return False

diff --git a/easyllm/data/filters/punctuation.py b/easyllm/data/filters/punctuation.py
new file mode 100644
index 0000000..4a229e5
--- /dev/null
+++ b/easyllm/data/filters/punctuation.py
@@ -0,0 +1,27 @@
+import re
+from typing import List
+
+from pydantic import BaseModel
+
+
+class PunctuationFilter(BaseModel):
+    """
+    Ref: C4 Raffel et al.
+ Desc: If less than 15% of the sentences end with a punctuation mark then remove + """ + + name: str = "punctuation" + punctuations: List[str] = [".", "!", "?"] + remove_percentage: float = 0.15 + + def __call__(self, text): + sentences = text.split("\n") + # count the number of sentences not ending with a punctuation mark + num_sentences_wo_p = sum( + 1 for sentence in sentences if sentence[-1] not in self.punctuations + ) + # check if the ratio of sentences not ending with a punctuation mark is greater than the remove percentage + if num_sentences_wo_p / len(sentences) > self.remove_percentage: + return True + # otherwise keep + return False \ No newline at end of file diff --git a/easyllm/data/filters/repeating.py b/easyllm/data/filters/repeating.py index 3df3077..a37f5ca 100644 --- a/easyllm/data/filters/repeating.py +++ b/easyllm/data/filters/repeating.py @@ -1,5 +1,3 @@ -import re - from pydantic import BaseModel diff --git a/easyllm/data/filters/stop_word.py b/easyllm/data/filters/stop_word.py new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/data-filter.ipynb b/notebooks/data-filter.ipynb index 9814842..47bb67d 100644 --- a/notebooks/data-filter.ipynb +++ b/notebooks/data-filter.ipynb @@ -287,31 +287,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['hello', 'world']\n", - "1.0\n", - "['hello', 'hello', 'hello']\n", - "0.3333333333333333\n" - ] - }, - { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/home/ubuntu/easyllm/notebooks/data-filter.ipynb Cell 24\u001b[0m in \u001b[0;36m7\n\u001b[1;32m 4\u001b[0m rp \u001b[39m=\u001b[39m RepeatedParagraphFilter()\n\u001b[1;32m 6\u001b[0m \u001b[39massert\u001b[39;00m rl(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mworld\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[0;32m----> 7\u001b[0m \u001b[39massert\u001b[39;00m rl(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mhello\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 9\u001b[0m \u001b[39massert\u001b[39;00m rp(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39mworld\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[39massert\u001b[39;00m rp(\u001b[39m\"\u001b[39m\u001b[39mhello\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39mhello\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39m==\u001b[39m \u001b[39mTrue\u001b[39;00m\n", - "\u001b[0;31mAssertionError\u001b[0m: " - ] - } - ], + "outputs": [], "source": [ "from easyllm.data.filters import RepeatedLinesFilter, RepeatedParagraphFilter\n", "\n", @@ -319,10 +297,58 @@ "rp = RepeatedParagraphFilter()\n", "\n", "assert rl(\"hello\\nworld\") == False\n", - "assert rl(\"hello\\nhello\\nhello\") == True\n", + "assert rl(\"hello\\nhello\\nhello\\nhello\") == True\n", "\n", "assert rp(\"hello\\n\\nworld\") == False\n", - "assert rp(\"hello\\n\\nhello\") == True" + "assert rp(\"hello\\n\\nhello\\n\\nhello\\n\\nhello\") == 
True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TopNGramsFilter\n", + "\n", + "The `TopNGramsFilter` removes the document if the top n-gram makes more than 20% of the document." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import TopNGramsFilter\n", + "\n", + "tng = TopNGramsFilter()\n", + "\n", + "assert tng(\"This is a test for a longer sentence\") == False \n", + "\n", + "assert tng(\"The quick brown fox jumps over the lazy dog The quick brown\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PunctuationFilter\n", + "\n", + "The `PunctuationFilter` removes the document if more than 15% of the \"linebreaks\" don't contain any punctuation." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import PunctuationFilter\n", + "\n", + "pf = PunctuationFilter()\n", + "\n", + "assert pf(\"This is a sentence.\") == False\n", + "\n", + "assert pf(\"This is a sentence\\n But is not one.\\nNo oneyet.\") == True" ] }, { From e8108058dc24ace9943f8a342a0924750b36ba42 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 15:47:36 +0000 Subject: [PATCH 07/16] more filters --- easyllm/data/filters/__init__.py | 3 +- easyllm/data/filters/common_word.py | 29 +++++++++++++++++++ easyllm/data/filters/cookie_banner.py | 35 +++++++++++++++++++++++ easyllm/data/filters/punctuation.py | 33 ++++++++++++++++++---- easyllm/data/filters/stop_word.py | 0 notebooks/data-filter.ipynb | 40 +++++++++++++++++++++++---- 6 files changed, 129 insertions(+), 11 deletions(-) create mode 100644 easyllm/data/filters/common_word.py delete mode 100644 easyllm/data/filters/stop_word.py diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py index b1ddc05..9862e86 100644 --- a/easyllm/data/filters/__init__.py +++ b/easyllm/data/filters/__init__.py @@ -1,4 +1,5 @@ from easyllm.data.filters.bulletpoint_ratio import BulletpointRatioFilter +from easyllm.data.filters.common_word import CommonWordFilter from easyllm.data.filters.kenlm_ppl import PerplexityFilter from easyllm.data.filters.length import LengthFilter from easyllm.data.filters.longword import LongWordFilter @@ -6,7 +7,7 @@ from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter from easyllm.data.filters.digit_to_character import DigitToCharacter from easyllm.data.filters.parantheses_ration import ParenthesesRationFilter -from easyllm.data.filters.punctuation import PunctuationFilter +from easyllm.data.filters.punctuation import EllipsisFilter, PunctuationFilter from easyllm.data.filters.repeating import RepeatedParagraphFilter, RepeatedLinesFilter from easyllm.data.filters.url_ratio import UrlRatioFilter from easyllm.data.filters.whitespace_ration import WhitespaceRatioFilter diff --git a/easyllm/data/filters/common_word.py b/easyllm/data/filters/common_word.py new file mode 100644 index 0000000..f562020 --- /dev/null +++ b/easyllm/data/filters/common_word.py @@ -0,0 +1,29 @@ +from typing import List + +from pydantic import BaseModel + +COMMON_WORDS_EN = ["the", "be", "to", "of", "and", "that", "have", "with", "this"] +COMMON_WORDS_DE = ["der", "die", "das", "er" "sein", "zu", "ist", "war", "von", "und", "haben", "mit"] + + +class CommonWordFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: Makes sure that the document contains at least 2 common words if 
not remove + """ + + name: str = "common_word" + common_words: List[str] = COMMON_WORDS_EN + n: int = 2 + + def __call__(self, text): + words = text.split() + common_word_counter = 0 + # count the number of common words + for word in words: + if word.lower() in self.common_words: + common_word_counter += 1 + if common_word_counter >= self.n: + return False + # otherwise remove + return True diff --git a/easyllm/data/filters/cookie_banner.py b/easyllm/data/filters/cookie_banner.py index 0df4cbd..91ed1bb 100644 --- a/easyllm/data/filters/cookie_banner.py +++ b/easyllm/data/filters/cookie_banner.py @@ -2,6 +2,41 @@ from pydantic import BaseModel +policy_substrings = [ + "terms of use", + "privacy policy", + "cookie policy", + "uses cookies", + "privacy overview", + "use of cookies", + "use cookies", + "privacy & cookies policy", + "privacy and cookies policy", + "This website uses cookies to improve your experience while you " + "navigate through the website. Out of these cookies, the cookies " + "that are categorized as necessary are stored on your browser as they " + "are essential for the working of basic functionalities of the website. " + "We also use third-party cookies that help us analyze and understand how " + "you use this website. These cookies will be stored in your browser only " + "with your consent. You also have the option to opt-out of these " + "cookies. But opting out of some of these cookies may have an effect " + "on your browsing experience.".lower(), + "Necessary cookies are absolutely essential for the website to " + "function properly. This category only includes cookies that " + "ensures basic functionalities and security features of the website. " + "These cookies do not store any personal information.".lower(), + "Any cookies that may not be particularly necessary for the website " + "to function and is used specifically to collect user personal data " + "via analytics, ads, other embedded contents are termed as non-necessary " + "cookies. It is mandatory to procure user consent prior to running these " + "cookies on your website.".lower(), + "This site uses cookies, including for analytics, personalization, and " + "advertising purposes. For more information or to change your " + "cookie settings, click here.".lower(), + "If you continue to browse this site without changing your cookie " + "settings, you agree to this use. AcceptRead More".lower(), +] + class CookieBannerFilter(BaseModel): """ diff --git a/easyllm/data/filters/punctuation.py b/easyllm/data/filters/punctuation.py index 4a229e5..a0a1d53 100644 --- a/easyllm/data/filters/punctuation.py +++ b/easyllm/data/filters/punctuation.py @@ -1,4 +1,3 @@ -import re from typing import List from pydantic import BaseModel @@ -17,11 +16,35 @@ class PunctuationFilter(BaseModel): def __call__(self, text): sentences = text.split("\n") # count the number of sentences not ending with a punctuation mark - num_sentences_wo_p = sum( - 1 for sentence in sentences if sentence[-1] not in self.punctuations - ) + num_sentences_wo_p = sum(1 for sentence in sentences if sentence[-1] not in self.punctuations) # check if the ratio of sentences not ending with a punctuation mark is greater than the remove percentage if num_sentences_wo_p / len(sentences) > self.remove_percentage: return True # otherwise keep - return False \ No newline at end of file + return False + + +class EllipsisFilter(BaseModel): + """ + Ref: C4 Raffel et al. 
+    Desc: If more than 30% of the sentences end with an ellipsis then remove
+    """
+
+    name: str = "ellipsis"
+    ellipsis: List[str] = ["...", "[...]", "…", "(...)", "[…]", "-»", "read more..", "read more"]
+    remove_percentage: float = 0.3
+
+    def __call__(self, text):
+        sentences = text.split("\n")
+        # count the number of sentences ending with an ellipsis
+        ellipsis_counter = 0
+        for sentence in sentences:
+            for ellipsis in self.ellipsis:
+                if sentence.endswith(ellipsis):
+                    ellipsis_counter += 1
+                    break
+        # check if the ratio of sentences ending with an ellipsis is greater than the remove percentage
+        if ellipsis_counter / len(sentences) > self.remove_percentage:
+            return True
+        # otherwise keep
+        return False

diff --git a/easyllm/data/filters/stop_word.py b/easyllm/data/filters/stop_word.py
deleted file mode 100644
index e69de29..0000000

diff --git a/notebooks/data-filter.ipynb b/notebooks/data-filter.ipynb
index 47bb67d..ab940af 100644
--- a/notebooks/data-filter.ipynb
+++ b/notebooks/data-filter.ipynb
@@ -331,24 +331,54 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## PunctuationFilter\n",
+    "## PunctuationFilter & EllipsisFilter\n",
     "\n",
-    "The `PunctuationFilter` removes the document if more than 15% of the \"linebreaks\" don't contain any punctuation."
+    "The `PunctuationFilter` & `EllipsisFilter` remove the document if more than 15% of the lines don't end with a punctuation mark, or if more than 30% of the lines end with an ellipsis."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from easyllm.data.filters import PunctuationFilter\n",
+    "from easyllm.data.filters import PunctuationFilter, EllipsisFilter\n",
     "\n",
     "pf = PunctuationFilter()\n",
     "\n",
     "assert pf(\"This is a sentence.\") == False\n",
     "\n",
-    "assert pf(\"This is a sentence\\n But is not one.\\nNo oneyet.\") == True"
+    "assert pf(\"This is a sentence\\n But is not one.\\nNo oneyet.\") == True\n",
+    "\n",
+    "ef = EllipsisFilter()\n",
+    "\n",
+    "assert ef(\"This is a sentence.\") == False\n",
+    "\n",
+    "assert ef(\"This is a sentence\\n But is not one....\") == True"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## CommonWordFilter\n",
+    "\n",
+    "The `CommonWordFilter` removes documents if they don't include at least 2 common words."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import CommonWordFilter\n", + "\n", + "cw = CommonWordFilter()\n", + "\n", + "assert cw(\"This is a sentence with a common word.\") == False\n", + "\n", + "assert cw(\"cat dog mouse\") == True" ] }, { From 1f652a08d9b7c1d19d599770c38f8d150a4c86af Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 16:02:18 +0000 Subject: [PATCH 08/16] style --- easyllm/data/filters/__init__.py | 4 ++-- easyllm/data/filters/bulletpoint_ratio.py | 1 - easyllm/data/filters/kenlm_ppl.py | 4 ++-- easyllm/data/filters/length.py | 1 - easyllm/data/filters/longword.py | 1 - easyllm/data/filters/n_gram.py | 7 ++++--- easyllm/data/filters/non_alpha_numeric.py | 2 +- 7 files changed, 9 insertions(+), 11 deletions(-) diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py index 9862e86..d37d7b2 100644 --- a/easyllm/data/filters/__init__.py +++ b/easyllm/data/filters/__init__.py @@ -1,14 +1,14 @@ from easyllm.data.filters.bulletpoint_ratio import BulletpointRatioFilter from easyllm.data.filters.common_word import CommonWordFilter +from easyllm.data.filters.digit_to_character import DigitToCharacter from easyllm.data.filters.kenlm_ppl import PerplexityFilter from easyllm.data.filters.length import LengthFilter from easyllm.data.filters.longword import LongWordFilter from easyllm.data.filters.n_gram import TopNGramsFilter from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter -from easyllm.data.filters.digit_to_character import DigitToCharacter from easyllm.data.filters.parantheses_ration import ParenthesesRationFilter from easyllm.data.filters.punctuation import EllipsisFilter, PunctuationFilter -from easyllm.data.filters.repeating import RepeatedParagraphFilter, RepeatedLinesFilter +from easyllm.data.filters.repeating import RepeatedLinesFilter, RepeatedParagraphFilter from easyllm.data.filters.url_ratio import UrlRatioFilter from easyllm.data.filters.whitespace_ration import WhitespaceRatioFilter from easyllm.data.filters.words_to_symbol import SymbolToWordFilter diff --git a/easyllm/data/filters/bulletpoint_ratio.py b/easyllm/data/filters/bulletpoint_ratio.py index 1e02966..566d554 100644 --- a/easyllm/data/filters/bulletpoint_ratio.py +++ b/easyllm/data/filters/bulletpoint_ratio.py @@ -1,4 +1,3 @@ -import re from typing import List from pydantic import BaseModel diff --git a/easyllm/data/filters/kenlm_ppl.py b/easyllm/data/filters/kenlm_ppl.py index 55b56be..fb40cfa 100644 --- a/easyllm/data/filters/kenlm_ppl.py +++ b/easyllm/data/filters/kenlm_ppl.py @@ -103,8 +103,8 @@ def from_pretrained( try: model = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.arpa.bin") tokenizer = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.sp.model") - except: - raise ValueError(f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. Please train your own model and upload it to the hub.") + except Exception: + raise ValueError(f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. 
Please train your own model and upload it to the hub.") from None return cls( diff --git a/easyllm/data/filters/length.py b/easyllm/data/filters/length.py index 4eaa9ff..51646f2 100644 --- a/easyllm/data/filters/length.py +++ b/easyllm/data/filters/length.py @@ -1,4 +1,3 @@ -import re from pydantic import BaseModel diff --git a/easyllm/data/filters/longword.py b/easyllm/data/filters/longword.py index ed59081..f98ba38 100644 --- a/easyllm/data/filters/longword.py +++ b/easyllm/data/filters/longword.py @@ -1,4 +1,3 @@ -import re from pydantic import BaseModel diff --git a/easyllm/data/filters/n_gram.py b/easyllm/data/filters/n_gram.py index 0fa1f53..5523be3 100644 --- a/easyllm/data/filters/n_gram.py +++ b/easyllm/data/filters/n_gram.py @@ -1,10 +1,11 @@ -from pydantic import BaseModel -from itertools import chain from collections import Counter +from itertools import chain + +from pydantic import BaseModel def get_ngrams(input_list, n): - return [item for item in zip(*[input_list[i:] for i in range(n)])] + return list(zip(*[input_list[i:] for i in range(n)])) class TopNGramsFilter(BaseModel): diff --git a/easyllm/data/filters/non_alpha_numeric.py b/easyllm/data/filters/non_alpha_numeric.py index 8f0ba8b..81815f3 100644 --- a/easyllm/data/filters/non_alpha_numeric.py +++ b/easyllm/data/filters/non_alpha_numeric.py @@ -10,7 +10,7 @@ class NonAlphaNumericFilter(BaseModel): """ name: str = "non_alpha_numeric" - regex: re.Pattern = re.compile("[^a-zA-Z0-9\s]") + regex: re.Pattern = re.compile(r"[^a-zA-Z0-9\s]") remove_percentage: float = 0.2 def __call__(self, text): From ed573afb665102fa0acef821dfa31b7c34254d68 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 16:05:43 +0000 Subject: [PATCH 09/16] fix docs dependencies --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4bc1c5b..a9567f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ scripts = { easyllm = "easyllm.cli:main" } dependencies = ["pydantic==2.1.1", "nanoid==2.0.0", "huggingface-hub==0.16.4"] [project.optional-dependencies] -data = ["datasets","https://github.com/kpu/kenlm/archive/master.zip","sentencepiece"] +data = ["datasets","kenlm @ https://github.com/kpu/kenlm/archive/master.zip","sentencepiece"] test = ["pytest", "ruff", "black", "isort", "mypy", "hatch"] dev = ["ruff", "black", "isort", "mypy", "hatch"] docs = [ From c15240bc381b24d85251b12808d84a1bca8f2622 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 18:37:48 +0000 Subject: [PATCH 10/16] remote builds --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index a9567f7..84fb77c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,9 @@ Source = "https://github.com/unknown/hatch-demo" [tool.hatch.version] path = "easyllm/__init__.py" +[tool.hatch.metadata] +allow-direct-references = true + [project] name = "easyllm" description = "Description" From 323566dd5d4557178a0fc7aca839c4a202c5ae86 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 19:16:54 +0000 Subject: [PATCH 11/16] fix punctuation --- easyllm/data/filters/punctuation.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/easyllm/data/filters/punctuation.py b/easyllm/data/filters/punctuation.py index a0a1d53..0dd1297 100644 --- a/easyllm/data/filters/punctuation.py +++ b/easyllm/data/filters/punctuation.py @@ -16,9 +16,14 @@ class PunctuationFilter(BaseModel): def __call__(self, text): 
sentences = text.split("\n") # count the number of sentences not ending with a punctuation mark - num_sentences_wo_p = sum(1 for sentence in sentences if sentence[-1] not in self.punctuations) + punc_counter = 0 + for sentence in sentences: + for punc in self.punctuations: + if not sentence.endswith(punc): + punc_counter += 1 + break # check if the ratio of sentences not ending with a punctuation mark is greater than the remove percentage - if num_sentences_wo_p / len(sentences) > self.remove_percentage: + if punc_counter / len(sentences) > self.remove_percentage: return True # otherwise keep return False From aa9a2f9ee2cde331ba478a871c73585ba50d39d9 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 19:26:07 +0000 Subject: [PATCH 12/16] test --- easyllm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easyllm/__init__.py b/easyllm/__init__.py index e7e3057..ed7aec4 100644 --- a/easyllm/__init__.py +++ b/easyllm/__init__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2023-present philschmid # # SPDX-License-Identifier: MIT -__version__ = "0.5.0.dev0" +__version__ = "0.5.1.dev0" From f761f5b2fc830c40b151561d7e0bfc9526556579 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 11 Aug 2023 19:35:28 +0000 Subject: [PATCH 13/16] fix punc --- easyllm/data/filters/punctuation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/easyllm/data/filters/punctuation.py b/easyllm/data/filters/punctuation.py index 0dd1297..91052e1 100644 --- a/easyllm/data/filters/punctuation.py +++ b/easyllm/data/filters/punctuation.py @@ -15,15 +15,15 @@ class PunctuationFilter(BaseModel): def __call__(self, text): sentences = text.split("\n") - # count the number of sentences not ending with a punctuation mark + # count the number of sentences ending with a punctuation mark punc_counter = 0 for sentence in sentences: for punc in self.punctuations: - if not sentence.endswith(punc): + if sentence.endswith(punc): punc_counter += 1 break # check if the ratio of sentences not ending with a punctuation mark is greater than the remove percentage - if punc_counter / len(sentences) > self.remove_percentage: + if 1 - (punc_counter / len(sentences)) > self.remove_percentage: return True # otherwise keep return False From 437b0dc0b3b8e25b723af228ac2f339b8201e0c7 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Sat, 12 Aug 2023 18:30:09 +0000 Subject: [PATCH 14/16] html extractor --- easyllm/data/extractor/__init__.py | 1 + easyllm/data/extractor/html_extractor.py | 24 ++++++++++++++++++ easyllm/data/filters/kenlm_ppl.py | 31 ++++++++++++------------ easyllm/data/filters/punctuation.py | 2 +- pyproject.toml | 2 +- 5 files changed, 42 insertions(+), 18 deletions(-) create mode 100644 easyllm/data/extractor/__init__.py create mode 100644 easyllm/data/extractor/html_extractor.py diff --git a/easyllm/data/extractor/__init__.py b/easyllm/data/extractor/__init__.py new file mode 100644 index 0000000..3f25c04 --- /dev/null +++ b/easyllm/data/extractor/__init__.py @@ -0,0 +1 @@ +from easyllm.data.extractor.html_extractor import HtmlExtractor diff --git a/easyllm/data/extractor/html_extractor.py b/easyllm/data/extractor/html_extractor.py new file mode 100644 index 0000000..41bf77f --- /dev/null +++ b/easyllm/data/extractor/html_extractor.py @@ -0,0 +1,24 @@ +from pydantic import BaseModel + +# +from inscriptis import get_text +from inscriptis.css_profiles import CSS_PROFILES +from inscriptis.model.config import ParserConfig +from readability import Document + 
+INSCRIPTIS_CONFIG = ParserConfig(css=CSS_PROFILES["strict"])
+
+
+class HtmlExtractor(BaseModel):
+    """
+    Desc: Extracts text from an HTML document using Mozilla's readability and inscriptis.
+    """
+
+    name: str = "html_extractor"
+    min_doc_length: int = 25
+
+    def __call__(self, document: str) -> str:
+        parsed_doc = Document(document, min_text_length=self.min_doc_length)
+        clean_html = parsed_doc.summary(html_partial=True)
+        content = get_text(clean_html, INSCRIPTIS_CONFIG).strip()
+        return content
diff --git a/easyllm/data/filters/kenlm_ppl.py b/easyllm/data/filters/kenlm_ppl.py
index fb40cfa..8cb4e1b 100644
--- a/easyllm/data/filters/kenlm_ppl.py
+++ b/easyllm/data/filters/kenlm_ppl.py
@@ -1,4 +1,3 @@
-
 import importlib.util
 import re
 import unicodedata
@@ -11,9 +10,8 @@
 _sentencepiece = importlib.util.find_spec("sentencepiece") is not None
 
 if _kenlm or not _sentencepiece:
-    import kenlm
-    import sentencepiece
-
+    import kenlm
+    import sentencepiece
 
 
 class SentencePiece:
@@ -68,10 +66,8 @@ class KenlmModel:
         "%": "%",
         "►": "-",
     }
-    unicode_punct_re:re.Pattern = re.compile(f"[{''.join(unicode_punct.keys())}]")
-    non_printing_chars_re:re.Pattern = re.compile(
-        f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
-    )
+    unicode_punct_re: re.Pattern = re.compile(f"[{''.join(unicode_punct.keys())}]")
+    non_printing_chars_re: re.Pattern = re.compile(f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]")
     model: kenlm.Model = None
     tokenizer: SentencePiece = None
     accent: bool = False
@@ -101,11 +97,12 @@ def from_pretrained(
         language_or_path: str,
     ):
         try:
-            model = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.arpa.bin")
-            tokenizer = hf_hub_download("philschmid/kenlm",filename=f"wikipedia/{language_or_path}.sp.model")
+            model = hf_hub_download("philschmid/kenlm", filename=f"wikipedia/{language_or_path}.arpa.bin")
+            tokenizer = hf_hub_download("philschmid/kenlm", filename=f"wikipedia/{language_or_path}.sp.model")
         except Exception:
-            raise ValueError(f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. Please train your own model and upload it to the hub.") from None
+            raise ValueError(
+                f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. Please train your own model and upload it to the hub."
+ ) from None return cls( model, @@ -187,15 +184,17 @@ class PerplexityFilter(BaseModel): max_threshold: int = 1000 model_config = ConfigDict(arbitrary_types_allowed=True) - def __init__(self,language:str,min_threshold:int=0,max_threshold:int=1000): + def __init__(self, language: str, min_threshold: int = 0, max_threshold: int = 1000): super().__init__() self.min_threshold = min_threshold self.max_threshold = max_threshold self.model = KenlmModel.from_pretrained(language) - def __call__(self, doc: str) -> bool: # returns True if the perplexity of the document outside of the threshold, # meaning smaller than min_threshold or larger than max_threshold - return not self.min_threshold <= self.model.get_perplexity(doc) <= self.max_threshold - + perplexity = self.model.get_perplexity(doc) + if perplexity < self.min_threshold or perplexity > self.max_threshold: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/punctuation.py b/easyllm/data/filters/punctuation.py index 91052e1..17da9a2 100644 --- a/easyllm/data/filters/punctuation.py +++ b/easyllm/data/filters/punctuation.py @@ -23,7 +23,7 @@ def __call__(self, text): punc_counter += 1 break # check if the ratio of sentences not ending with a punctuation mark is greater than the remove percentage - if 1 - (punc_counter / len(sentences)) > self.remove_percentage: + if punc_counter / len(sentences) < self.remove_percentage: return True # otherwise keep return False diff --git a/pyproject.toml b/pyproject.toml index 84fb77c..83b7595 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ scripts = { easyllm = "easyllm.cli:main" } dependencies = ["pydantic==2.1.1", "nanoid==2.0.0", "huggingface-hub==0.16.4"] [project.optional-dependencies] -data = ["datasets","kenlm @ https://github.com/kpu/kenlm/archive/master.zip","sentencepiece"] +data = ["datasets","kenlm @ https://github.com/kpu/kenlm/archive/master.zip","sentencepiece","readability-lxml","inscriptis"] test = ["pytest", "ruff", "black", "isort", "mypy", "hatch"] dev = ["ruff", "black", "isort", "mypy", "hatch"] docs = [ From 02e6526b42469b7c0d5fbd1f5a53990cfbde4a32 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Fri, 24 Nov 2023 14:49:50 +0000 Subject: [PATCH 15/16] example --- notebooks/datasets/filter-dataset.ipynb | 2316 +++++++++++++++++++++++ 1 file changed, 2316 insertions(+) create mode 100644 notebooks/datasets/filter-dataset.ipynb diff --git a/notebooks/datasets/filter-dataset.ipynb b/notebooks/datasets/filter-dataset.ipynb new file mode 100644 index 0000000..a55f94a --- /dev/null +++ b/notebooks/datasets/filter-dataset.ipynb @@ -0,0 +1,2316 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip uninstall easyllm -y\n", + "%pip install git+https://github.com/philschmid/easyllm.git@datafilter --upgrade" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset('philschmid/oscar-2301-de-minhash-dedup',split=\"train\")\n", + "# ds = load_dataset('wikipedia','20220301.de',split=\"train\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Perplexity filtering \n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nach § 80 Abs. 
5 Satz 1 Halbsatz 2 VwGO kann das Gericht der Hauptsache die aufschiebende Wirkung der Klage ganz oder teilweise wiederherstellen. Ist die sofortige Vollziehung von der Behörde den formellen Anforderungen des § 80 Abs. 3 Satz 1 VwGO genügend angeordnet worden, so entscheidet das Gericht nach § 80 Abs. 5 Satz 1 Halbsatz 2 VwGO über die Wiederherstellung der aufschiebenden Wirkung der Klage auf der Grundlage einer eigenen Abwägung des Interesses des Antragstellers, von der Vollziehung des angefochtenen Verwaltungsakts bis zur endgültigen Entscheidung über seine Rechtmäßigkeit verschont zu bleiben, gegen das besondere öffentliche Interesse an dessen sofortiger Vollziehung (vgl. BVerwG, Beschl. v. 19.12.2014 - 7 VR 5.14 -, juris Rn. 9; Nds. OVG, Beschl. v. 10.09.2014 - 8 ME 87/14 -, juris Rn. 2). Im Rahmen der Interessenabwägung haben die Erfolgsaussichten des in der Hauptsache eingelegten Rechtsbehelfs eine entscheidende Bedeutung. Ergibt sich bei der im Rahmen des vorläufigen Rechtsschutzes gebotenen, aber grundsätzlich auch ausreichenden (vgl. Nds. OVG, Beschl. v. 16.8.2017 - 13 ME 173/17 -, juris Rn. 4, vgl. auch Beschl. v. 24.01.2018 - 7 ME 110/17 -, juris Rn. 28) summarischen Überprüfung, dass der Rechtsbehelf in der Hauptsache keinen Erfolg haben wird, weil sich der angegriffene Verwaltungsakt als offensichtlich rechtmäßig erweist, so überwiegt regelmäßig das öffentliche Interesse an der sofortigen Vollziehung des Verwaltungsakts. Erweist sich der Rechtsbehelf bei summarischer Überprüfung demgegenüber als offensichtlich erfolgreich, überwiegt regelmäßig das Interesse des Adressaten des Verwaltungsakts, von dessen Vollziehung vorerst verschont zu bleiben. Stellen sich die Erfolgsaussichten des Rechtsbehelfs hingegen als offen dar, so ist eine Abwägung der widerstreitenden Interessen erforderlich, bei der in Rechnung zu stellen ist, welche Gründe bei bestehender Unsicherheit im Hinblick auf die Erfolgsaussichten des Rechtsbehelfs für und gegen eine Aufrechterhaltung der sofortigen Vollziehung des Verwaltungsakts sprechen (vgl. Nds. OVG, Beschl. v. 10.5.2010 - 13 ME 181/09 -, juris Rn. 4). Außerdem ist zu berücksichtigen, dass die voraussichtliche Rechtmäßigkeit eines Verwaltungsakts für sich allein nur das allgemeine Interesse an seiner Vollziehung begründet, nicht aber zugleich auch deren, für die behördliche Anordnung nach § 80 Abs. 2 Satz 1 Nr. 4 VwGO erforderliche Dringlichkeit (vgl. grundlegend BVerfG, Beschl. v. 27.4.2005 - 1 BvR 223/05 -, NVwZ 2005, 1303; Beschl. v. 18.7.1973, - 1 BvR 23/73 -, BVerfGE 35, 382, 402; Nds. OVG, Beschl. v. 10.9.2014, a.a.O.; Finkelnburg/Dombert/Külpmann, Vorläufiger Rechtsschutz im Verwaltungsstreitverfahren, 7. Aufl., Rn. 757 f. 
m.w.N.).\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(ds[456][\"text\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8071b0d5472949deabe06d5600f46054",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "add url (num_proc=128):   0%|          | 0/53172498 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# df = ds.to_pandas()\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Plot the perplexity distribution of the dataframe\n",
+    "def plot_distribution(dfs):\n",
+    "    # Get the 5th, 50th and 95th percentiles\n",
+    "    q1 = dfs['perplexity'].quantile(.05)\n",
+    "    q2 = dfs['perplexity'].quantile(.5)\n",
+    "    q3 = dfs['perplexity'].quantile(.95)\n",
+    "\n",
+    "    # Create line chart\n",
+    "    counts, bins = np.histogram(dfs['perplexity'], bins=30000)\n",
+    "    bin_centers = 0.5*(bins[1:] + bins[:-1])\n",
+    "    plt.plot(bin_centers, counts)\n",
+    "\n",
+    "    # Add vertical lines for the percentiles\n",
+    "    plt.axvline(x=q1, color='r')\n",
+    "    plt.axvline(x=q2, color='g')\n",
+    "    plt.axvline(x=q3, color='b')\n",
+    "\n",
+    "    plt.title('Perplexity Distribution')\n",
+    "    plt.xlabel('Perplexity')\n",
+    "    plt.ylabel('Frequency')\n",
+    "    plt.xscale('log')\n",
+    "\n",
+    "    plt.show()\n",
+    "\n",
+    "plot_distribution(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Get some random samples from the dataset with low and high perplexity."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Low: 3.3\n",
+      "High: 1155.9099999999978\n"
+     ]
+    }
+   ],
+   "source": [
+    "low = df.perplexity.quantile(0)\n",
+    "high = df.perplexity.quantile(0.9)\n",
+    "\n",
+    "print(f'Low: {low}')\n",
+    "print(f'High: {high}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "_lowest sample:_\n",
+    "```\n",
+    "'Die Skulptur Madonna mit Kind in der katholischen Kirche St-Lucien in Angy, einer französischen Gemeinde im Département Oise in der Region Hauts-de-France, wurde im dritten Viertel des 14. Jahrhunderts geschaffen. Im Jahr 1912 wurde die gotische Skulptur als Monument historique in die Liste der geschützten Objekte (Base Palissy) in Frankreich aufgenommen.\\nDie 1,10 Meter hohe Skulptur aus Kalkstein ist farbig gefasst. Maria hält das Jesuskind auf dem linken Arm. Sein Gesicht wendet sich in Richtung des Betrachters. Maria, mit bäuerlichem Gesicht und roten Wangen, trägt auf ihrem Haupt eine Krone. Die vielen Falten von ihrem Kleid geben ihrer Erscheinung eine Fülle.'\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Filter strategy\n",
+    "\n",
+    "1. NonAlphaNumericFilter: remove documents with more than 20% non-alphanumeric characters\n",
+    "2. ParenthesesRationFilter: remove documents with more than 5% parentheses\n",
+    "3. PunctuationFilter: remove documents where more than 15% of lines lack end punctuation\n",
+    "4. EllipsisFilter: remove documents where more than 30% of lines end with an ellipsis\n",
+    "5. LengthFilter: filter out short documents (< 5 words)\n",
+    "6. LongWordFilter: remove documents with extremely long words (e.g. minified JS)\n",
+    "7. CommonWordFilter: check that the text reads as coherent sentences (maybe not needed)\n",
+    "8. RepeatedLinesFilter: remove documents with more than 30% repeated lines\n",
+    "9. WhitespaceRatioFilter: remove documents with more than 25% whitespace\n",
+    "10. UrlRatioFilter: remove documents with more than 20% URLs\n",
+    "11. 
PerplexityFilter: remove documents with perplexity > 1000\n",
+    "\n",
+    "\n",
+    "TODO: find a law example which is super long and filter it"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "wikipedia_filters = [\n",
+    "    NonAlphaNumericFilter(),\n",
+    "    LengthFilter(min_length=10),\n",
+    "    CommonWordFilter(common_words=COMMON_WORDS_DE),\n",
+    "    UrlRatioFilter(),\n",
+    "    PerplexityFilter(language=\"de\",min_threshold=0,max_threshold=perplexity_threshold)\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "da7067303e814628917d39b02cab5c0e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "filter documents... (num_proc=128):   0%|          | 0/53172498 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "def apply_filters(x):\n",
+    "    # each filter returns True when the document should be removed\n",
+    "    for f in wikipedia_filters:\n",
+    "        if f(x[\"text\"]):\n",
+    "            return False\n",
+    "    # additionally drop extremely long documents\n",
+    "    if len(x[\"text\"]) > 300_000:\n",
+    "        return False\n",
+    "    return True\n",
+    "\n",
+    "\n",
+    "# Dataset.filter keeps rows where the function returns True; our filters return True\n",
+    "# when a document should be removed, so apply_filters inverts their result\n",
+    "ds = ds.filter(apply_filters,num_proc=os.cpu_count(),\n",
+    "        desc=\"filter documents...\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "37786e0a47ec4375a7495bc1c5ed7ff2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Saving the dataset (0/564 shards):   0%|          | 0/44401239 [00:00<?, ?it/s]"

From f761f5b2fc830c40b151561d7e0bfc9526556579 Mon Sep 17 00:00:00 2001
From: Philipp Schmid
Date: Fri, 24 Nov 2023 14:50:56 +0000
Subject: [PATCH 16/16] happy quality

---
 easyllm/data/extractor/html_extractor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/easyllm/data/extractor/html_extractor.py b/easyllm/data/extractor/html_extractor.py
index 41bf77f..6a1d4a3 100644
--- a/easyllm/data/extractor/html_extractor.py
+++ b/easyllm/data/extractor/html_extractor.py
@@ -1,9 +1,8 @@
-from pydantic import BaseModel
-
 #
 from inscriptis import get_text
 from inscriptis.css_profiles import CSS_PROFILES
 from inscriptis.model.config import ParserConfig
+from pydantic import BaseModel
 from readability import Document
 
 INSCRIPTIS_CONFIG = ParserConfig(css=CSS_PROFILES["strict"])
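Taken together, these patches provide an extraction step (`HtmlExtractor`) and a set of quality filters that compose into a single preprocessing pass. The following is a minimal sketch of that composition, not part of the patches themselves: the sample HTML string and the small `COMMON_WORDS_DE` stand-in list are illustrative assumptions (in practice you would use a real frequency list), and the thresholds simply mirror the defaults used above.

```python
import os

from datasets import load_dataset

from easyllm.data.extractor import HtmlExtractor
from easyllm.data.filters import (
    CommonWordFilter,
    LengthFilter,
    NonAlphaNumericFilter,
    PerplexityFilter,
    UrlRatioFilter,
)

# HtmlExtractor turns a raw page into plain text before any filtering
extractor = HtmlExtractor()
text = extractor("<html><body><p>Ein kurzer Beispieltext.</p></body></html>")

# Assumption: illustrative stand-in list, not shipped with the library
COMMON_WORDS_DE = ["der", "die", "das", "und", "ist", "nicht", "ein", "mit"]

filters = [
    NonAlphaNumericFilter(),
    LengthFilter(min_length=10),
    CommonWordFilter(common_words=COMMON_WORDS_DE),
    UrlRatioFilter(),
    PerplexityFilter(language="de", min_threshold=0, max_threshold=1000),
]


def keep(example):
    # Each filter returns True when the document should be removed, while
    # Dataset.filter keeps rows where the function returns True, so negate.
    return not any(f(example["text"]) for f in filters)


ds = load_dataset("wikipedia", "20220301.de", split="train")
ds = ds.filter(keep, num_proc=os.cpu_count(), desc="filter documents...")
```

Because every filter implements `__call__` and uses the same convention (return `True` to drop the document), swapping filters in and out of the list requires no other changes to the pipeline.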