diff --git a/easyllm/data/__init__.py b/easyllm/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/easyllm/data/extractor/__init__.py b/easyllm/data/extractor/__init__.py new file mode 100644 index 0000000..3f25c04 --- /dev/null +++ b/easyllm/data/extractor/__init__.py @@ -0,0 +1 @@ +from easyllm.data.extractor.html_extractor import HtmlExtractor diff --git a/easyllm/data/extractor/html_extractor.py b/easyllm/data/extractor/html_extractor.py new file mode 100644 index 0000000..6a1d4a3 --- /dev/null +++ b/easyllm/data/extractor/html_extractor.py @@ -0,0 +1,23 @@ +# +from inscriptis import get_text +from inscriptis.css_profiles import CSS_PROFILES +from inscriptis.model.config import ParserConfig +from pydantic import BaseModel +from readability import Document + +INSCRIPTIS_CONFIG = ParserConfig(css=CSS_PROFILES["strict"]) + + +class HtmlExtractor(BaseModel): + """ + Desc: Extracts text from the HTML document using Mozilla's Readability and inscriptis. + """ + + name: str = "html_extractor" + min_doc_length: int = 25 + + def __call__(self, document: str) -> str: + parsed_doc = Document(document, min_text_length=self.min_doc_length) + clean_html = parsed_doc.summary(html_partial=True) + content = get_text(clean_html, INSCRIPTIS_CONFIG).strip() + return content diff --git a/easyllm/data/filters/__init__.py b/easyllm/data/filters/__init__.py new file mode 100644 index 0000000..d37d7b2 --- /dev/null +++ b/easyllm/data/filters/__init__.py @@ -0,0 +1,14 @@ +from easyllm.data.filters.bulletpoint_ratio import BulletpointRatioFilter +from easyllm.data.filters.common_word import CommonWordFilter +from easyllm.data.filters.digit_to_character import DigitToCharacter +from easyllm.data.filters.kenlm_ppl import PerplexityFilter +from easyllm.data.filters.length import LengthFilter +from easyllm.data.filters.longword import LongWordFilter +from easyllm.data.filters.n_gram import TopNGramsFilter +from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter +from easyllm.data.filters.parantheses_ration import ParenthesesRationFilter +from easyllm.data.filters.punctuation import EllipsisFilter, PunctuationFilter +from easyllm.data.filters.repeating import RepeatedLinesFilter, RepeatedParagraphFilter +from easyllm.data.filters.url_ratio import UrlRatioFilter +from easyllm.data.filters.whitespace_ration import WhitespaceRatioFilter +from easyllm.data.filters.words_to_symbol import SymbolToWordFilter diff --git a/easyllm/data/filters/bulletpoint_ratio.py b/easyllm/data/filters/bulletpoint_ratio.py new file mode 100644 index 0000000..566d554 --- /dev/null +++ b/easyllm/data/filters/bulletpoint_ratio.py @@ -0,0 +1,42 @@ +from typing import List + +from pydantic import BaseModel + + +class BulletpointRatioFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: If more than 90% of the document are bulletpoints then remove + """ + + name: str = "bulletpoint_ratio" + potential_bullet_points: List[str] = [ + "•", + "‣", + "⁃", + "⁌", + "⁍", + "∙", + "○", + "●", + "◘", + "◦", + "⦾", + "⦿", + "-", + ] + remove_percentage: float = 0.9 + + def __call__(self, text): + # split text into lines + lines = text.split("\n") + num_bullet_points = 0 + for line in lines: + # check if the line is a bullet point + if line.startswith(tuple(self.potential_bullet_points)): + num_bullet_points += 1 + # check if the ratio of bullet points to lines is greater than the remove percentage + if num_bullet_points / len(lines) > self.remove_percentage: + return True + # otherwise keep + return False
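The `HtmlExtractor` added above is not exercised in the notebooks further down, so here is a minimal usage sketch. It is hedged: the HTML snippet and the expected output are illustrative assumptions, and `readability-lxml`, `inscriptis`, and `pydantic` need to be installed.

```python
from easyllm.data.extractor import HtmlExtractor

extractor = HtmlExtractor()

# Made-up HTML page for illustration; real inputs would be full crawled documents.
html = """
<html>
  <body>
    <nav>Home | About | Contact</nav>
    <article>
      <p>EasyLLM ships small, composable quality filters for pretraining data.</p>
    </article>
  </body>
</html>
"""

# Readability keeps the main article content, inscriptis renders it to plain text.
text = extractor(html)
print(text)  # expected: the article sentence, with navigation boilerplate stripped
```

The exact output depends on Readability's heuristics, but the call pattern (construct once, then call the instance on each raw HTML document) is the same for all extractors and filters in this PR.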
diff --git a/easyllm/data/filters/common_word.py b/easyllm/data/filters/common_word.py new file mode 100644 index 0000000..f562020 --- /dev/null +++ b/easyllm/data/filters/common_word.py @@ -0,0 +1,29 @@ +from typing import List + +from pydantic import BaseModel + +COMMON_WORDS_EN = ["the", "be", "to", "of", "and", "that", "have", "with", "this"] +COMMON_WORDS_DE = ["der", "die", "das", "er", "sein", "zu", "ist", "war", "von", "und", "haben", "mit"] + + +class CommonWordFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: Makes sure that the document contains at least 2 common words, if not remove + """ + + name: str = "common_word" + common_words: List[str] = COMMON_WORDS_EN + n: int = 2 + + def __call__(self, text): + words = text.split() + common_word_counter = 0 + # count the number of common words + for word in words: + if word.lower() in self.common_words: + common_word_counter += 1 + if common_word_counter >= self.n: + return False + # otherwise remove + return True diff --git a/easyllm/data/filters/cookie_banner.py b/easyllm/data/filters/cookie_banner.py new file mode 100644 index 0000000..91ed1bb --- /dev/null +++ b/easyllm/data/filters/cookie_banner.py @@ -0,0 +1,53 @@ +import re + +from pydantic import BaseModel + +policy_substrings = [ + "terms of use", + "privacy policy", + "cookie policy", + "uses cookies", + "privacy overview", + "use of cookies", + "use cookies", + "privacy & cookies policy", + "privacy and cookies policy", + "This website uses cookies to improve your experience while you " + "navigate through the website. Out of these cookies, the cookies " + "that are categorized as necessary are stored on your browser as they " + "are essential for the working of basic functionalities of the website. " + "We also use third-party cookies that help us analyze and understand how " + "you use this website. These cookies will be stored in your browser only " + "with your consent. You also have the option to opt-out of these " + "cookies. But opting out of some of these cookies may have an effect " + "on your browsing experience.".lower(), + "Necessary cookies are absolutely essential for the website to " + "function properly. This category only includes cookies that " + "ensures basic functionalities and security features of the website. " + "These cookies do not store any personal information.".lower(), + "Any cookies that may not be particularly necessary for the website " + "to function and is used specifically to collect user personal data " + "via analytics, ads, other embedded contents are termed as non-necessary " + "cookies. It is mandatory to procure user consent prior to running these " + "cookies on your website.".lower(), + "This site uses cookies, including for analytics, personalization, and " + "advertising purposes. For more information or to change your " + "cookie settings, click here.".lower(), + "If you continue to browse this site without changing your cookie " + "settings, you agree to this use. AcceptRead More".lower(), +] + + +class CookieBannerFilter(BaseModel): + """ + Ref: C4 Raffel et al. + Desc: Removes documents if more than 40% of the document includes terms for cookies, tos, privacy policy, etc. Requires external list.
+ """ + + name: str = "cookie_banner" + regex: re.Pattern = re.compile(r"(terms of use|privacy policy|copyright|all rights reserved)", re.IGNORECASE) + remove_percentage: float = 0.4 + + def __call__(self, text): + # check if the regex matches + raise NotImplementedError("CookieBannerFilter not implemented yet") diff --git a/easyllm/data/filters/digit_to_character.py b/easyllm/data/filters/digit_to_character.py new file mode 100644 index 0000000..7916afc --- /dev/null +++ b/easyllm/data/filters/digit_to_character.py @@ -0,0 +1,22 @@ +import re + +from pydantic import BaseModel + + +class DigitToCharacter(BaseModel): + """ + Desc: If more than 20% of the document are digits then remove + """ + + name: str = "digit_to_character" + remove_percentage: float = 0.2 + + def __call__(self, text): + digits = re.findall(r"\d", text) + num_digits = len(digits) + total_chars = len(text) + # check if there are any characters in the text + if num_digits / total_chars > self.remove_percentage: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/kenlm_ppl.py b/easyllm/data/filters/kenlm_ppl.py new file mode 100644 index 0000000..8cb4e1b --- /dev/null +++ b/easyllm/data/filters/kenlm_ppl.py @@ -0,0 +1,200 @@ +import importlib.util +import re +import unicodedata +from typing import Dict + +from huggingface_hub import hf_hub_download +from pydantic import BaseModel, ConfigDict + +_kenlm = importlib.util.find_spec("kenlm") is not None +_sentencepiece = importlib.util.find_spec("sentencepiece") is not None + +if _kenlm and _sentencepiece: + import kenlm + import sentencepiece + + +class SentencePiece: + def __init__( + self, + model: str, + ): + super().__init__() + self.sp = sentencepiece.SentencePieceProcessor() + self.sp.load(str(model)) + + def do(self, text: str) -> str: + tokenized = self.sp.encode_as_pieces(text) + return " ".join(tokenized) + + +class KenlmModel: + digit_re: re.Pattern[str] = re.compile(r"\d") + unicode_punct: Dict[str, str] = { + "，": ",", + "。": ".", + "、": ",", + "„": '"', + "”": '"', + "“": '"', + "«": '"', + "»": '"', + "１": '"', + "」": '"', + "「": '"', + "《": '"', + "》": '"', + "´": "'", + "∶": ":", + "：": ":", + "？": "?", + "！": "!", + "（": "(", + "）": ")", + "；": ";", + "–": "-", + "—": " - ", + "．": ". ",
+ "～": "~", + "’": "'", + "…": "...", + "━": "-", + "〈": "<", + "〉": ">", + "【": "[", + "】": "]", + "％": "%", + "►": "-", + } + unicode_punct_re: re.Pattern = re.compile(f"[{''.join(unicode_punct.keys())}]") + non_printing_chars_re: re.Pattern = re.compile(f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]") + model: kenlm.Model = None + tokenizer: SentencePiece = None + accent: bool = False + case: bool = False + numbers: bool = True + punct: int = 1 + + def __init__( + self, + model_path: str, + tokenizer_path: str, + lower_case: bool = False, + remove_accents: bool = False, + normalize_numbers: bool = True, + punctuation: int = 1, + ): + self.model = kenlm.Model(model_path) + self.tokenizer = SentencePiece(tokenizer_path) + self.accent = remove_accents + self.case = lower_case + self.numbers = normalize_numbers + self.punct = punctuation + + @classmethod + def from_pretrained( + cls, + language_or_path: str, + ): + try: + model = hf_hub_download("philschmid/kenlm", filename=f"wikipedia/{language_or_path}.arpa.bin") + tokenizer = hf_hub_download("philschmid/kenlm", filename=f"wikipedia/{language_or_path}.sp.model") + except Exception: + raise ValueError( + f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. Please train your own model and upload it to the hub." + ) from None + + return cls( + model, + tokenizer, + False, + False, + True, + 1, + ) + + def pp(self, log_score, length): + return 10.0 ** (-log_score / length) + + def get_perplexity(self, doc: str, normalize_cc_net: bool = True): + if normalize_cc_net: + doc = self.normalize( + doc, + accent=self.accent, + case=self.case, + numbers=self.numbers, + punct=self.punct, + ) + # Tokenize (after normalizing): See https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/mine.py#L352 for full pipeline + doc = self.tokenizer.do(doc) + doc_log_score, doc_length = 0, 0 + for line in doc.split("\n"): + log_score = self.model.score(line) + length = len(line.split()) + 1 + doc_log_score += log_score + doc_length += length + return round(self.pp(doc_log_score, doc_length), 1) + + def normalize( + self, + line: str, + accent: bool = True, + case: bool = True, + numbers: bool = True, + punct: int = 1, + ) -> str: + line = line.strip() + if not line: + return line + if case: + line = line.lower() + if accent: + line = self.strip_accents(line) + if numbers: + line = self.digit_re.sub("0", line) + if punct == 1: + line = self.replace_unicode_punct(line) + elif punct == 2: + line = self.remove_unicode_punct(line) + line = self.remove_non_printing_char(line) + return line + + def strip_accents(self, line: str) -> str: + """Strips accents from a piece of text.""" + nfd = unicodedata.normalize("NFD", line) + output = [c for c in nfd if unicodedata.category(c) != "Mn"] + if len(output) == len(line): + return line + return "".join(output) + + def replace_unicode_punct(self, text: str) -> str: + return "".join(self.unicode_punct.get(c, c) for c in text) + + def remove_unicode_punct(self, text: str) -> str: + """More aggressive version of replace_unicode_punct but also faster.""" + return self.unicode_punct_re.sub("", text) + + def remove_non_printing_char(self, text: str) -> str: + return self.non_printing_chars_re.sub("", text) + + +class PerplexityFilter(BaseModel): + model: KenlmModel = None + min_threshold: int = 0 + max_threshold: int = 1000 + model_config = ConfigDict(arbitrary_types_allowed=True) + + def __init__(self, language: str, min_threshold: int = 0,
max_threshold: int = 1000): + super().__init__() + self.min_threshold = min_threshold + self.max_threshold = max_threshold + self.model = KenlmModel.from_pretrained(language) + + def __call__(self, doc: str) -> bool: + # returns True if the perplexity of the document is outside of the threshold, + # meaning smaller than min_threshold or larger than max_threshold + perplexity = self.model.get_perplexity(doc) + if perplexity < self.min_threshold or perplexity > self.max_threshold: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/length.py b/easyllm/data/filters/length.py new file mode 100644 index 0000000..51646f2 --- /dev/null +++ b/easyllm/data/filters/length.py @@ -0,0 +1,20 @@ + +from pydantic import BaseModel + + +class LengthFilter(BaseModel): + """ + Desc: Removes documents below or above a certain length of words + """ + + name: str = "length" + min_length: int = 10 + max_length: int = 1_000_000 + + def __call__(self, text): + num_words = len(text.split()) + + if num_words < self.min_length or num_words > self.max_length: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/longword.py b/easyllm/data/filters/longword.py new file mode 100644 index 0000000..f98ba38 --- /dev/null +++ b/easyllm/data/filters/longword.py @@ -0,0 +1,20 @@ + +from pydantic import BaseModel + + +class LongWordFilter(BaseModel): + """ + Ref: C4 Raffel et al. + Desc: Documents that include words with > 1000 characters are removed, e.g. js or minified files. + """ + + name: str = "long_word" + max_length: int = 1000 + + def __call__(self, text): + words = text.split() + max_len = max(len(word) for word in words) + if max_len > self.max_length: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/n_gram.py b/easyllm/data/filters/n_gram.py new file mode 100644 index 0000000..5523be3 --- /dev/null +++ b/easyllm/data/filters/n_gram.py @@ -0,0 +1,32 @@ +from collections import Counter +from itertools import chain + +from pydantic import BaseModel + + +def get_ngrams(input_list, n): + return list(zip(*[input_list[i:] for i in range(n)])) + + +class TopNGramsFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: If the document shrinks by > 20% after removing top n-grams then remove + """ + + name: str = "top_n_grams" + remove_percentage: float = 0.2 + n: int = 2 + + def __call__(self, text): + words = text.split() + if len(words) <= self.n: + return True + ngrams = get_ngrams(words, self.n) + n_grams = Counter(chain(ngrams)) + most_common = n_grams.most_common(1)[0][0] + + if n_grams[most_common] / len(n_grams) > self.remove_percentage: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/non_alpha_numeric.py b/easyllm/data/filters/non_alpha_numeric.py new file mode 100644 index 0000000..81815f3 --- /dev/null +++ b/easyllm/data/filters/non_alpha_numeric.py @@ -0,0 +1,27 @@ +import re + +from pydantic import BaseModel + + +class NonAlphaNumericFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: If more than 20% of the document is non-alphanumeric then remove + """ + + name: str = "non_alpha_numeric" + regex: re.Pattern = re.compile(r"[^a-zA-Z0-9\s]") + remove_percentage: float = 0.2 + + def __call__(self, text): + num_characters = len(text) + # check if there are any characters in the text + if num_characters == 0: + return True + # calculate the percentage of non-alphanumeric characters + percentage = 1 - ((num_characters - len(self.regex.findall(text))) / num_characters) + # if
the percentage is greater than the remove_percentage then remove + if percentage > self.remove_percentage: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/parantheses_ration.py b/easyllm/data/filters/parantheses_ration.py new file mode 100644 index 0000000..02c8e76 --- /dev/null +++ b/easyllm/data/filters/parantheses_ration.py @@ -0,0 +1,23 @@ +import re + +from pydantic import BaseModel + + +class ParenthesesRationFilter(BaseModel): + """ + Desc: If more than 10% of the document are Parentheses then remove + """ + + name: str = "parentheses_ratio" + regex: re.Pattern = re.compile(r"\[|\]|\(|\)|{|}|⟨|⟩") + remove_percentage: float = 0.1 + + def __call__(self, text): + # parentheses characters + parentheses_count = len(self.regex.findall(text)) + sentence_length = len(text) + # check if the ratio of parentheses to text is greater than the remove percentage + if parentheses_count / sentence_length > self.remove_percentage: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/punctuation.py b/easyllm/data/filters/punctuation.py new file mode 100644 index 0000000..17da9a2 --- /dev/null +++ b/easyllm/data/filters/punctuation.py @@ -0,0 +1,55 @@ +from typing import List + +from pydantic import BaseModel + + +class PunctuationFilter(BaseModel): + """ + Ref: C4 Raffel et al. + Desc: If less than 15% of the sentences end with a punctuation mark then remove + """ + + name: str = "punctuation" + punctuations: List[str] = [".", "!", "?"] + remove_percentage: float = 0.15 + + def __call__(self, text): + sentences = text.split("\n") + # count the number of sentences ending with a punctuation mark + punc_counter = 0 + for sentence in sentences: + for punc in self.punctuations: + if sentence.endswith(punc): + punc_counter += 1 + break + # check if the ratio of sentences not ending with a punctuation mark is greater than the remove percentage + if punc_counter / len(sentences) < self.remove_percentage: + return True + # otherwise keep + return False + + +class EllipsisFilter(BaseModel): + """ + Ref: C4 Raffel et al. 
+ Desc: If more than 30% of the sentences end with an ellipsis then remove + """ + + name: str = "ellipsis" + ellipsis: List[str] = ["...", "[...]", "…", "(...)", "[…]", "-»", "read more..", "read more"] + remove_percentage: float = 0.3 + + def __call__(self, text): + sentences = text.split("\n") + # count the number of sentences ending with an ellipsis + ellipsis_counter = 0 + for sentence in sentences: + for ellipsis in self.ellipsis: + if sentence.endswith(ellipsis): + ellipsis_counter += 1 + break + # check if the ratio of sentences ending with an ellipsis is greater than the remove percentage + if ellipsis_counter / len(sentences) > self.remove_percentage: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/repeating.py b/easyllm/data/filters/repeating.py new file mode 100644 index 0000000..a37f5ca --- /dev/null +++ b/easyllm/data/filters/repeating.py @@ -0,0 +1,51 @@ +from pydantic import BaseModel + + +class RepeatedLinesFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: If the document shrinks by > 30% after removing repeated lines then remove + """ + + name: str = "repeated_lines" + remove_percentage: float = 0.3 + + def __call__(self, text): + # split the text into lines + lines = text.split("\n") + # remove empty lines + lines = [line for line in lines if line.strip()] + if len(lines) == 0: + return True + # remove repeated lines + unique_lines = list(set(lines)) + # calculate the percentage of lines removed + if len(unique_lines) / len(lines) < self.remove_percentage: + return True + # otherwise keep + return False + + +class RepeatedParagraphFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: If the document shrinks by > 30% after removing repeated paragraphs then remove + """ + + name: str = "repeated_paragraph" + remove_percentage: float = 0.3 + + def __call__(self, text): + # split the text into paragraphs + paragraphes = text.split("\n\n") + # remove empty paragraphs + paragraphes = [p for p in paragraphes if p.strip()] + if len(paragraphes) == 0: + return True + # remove repeated paragraphs + unique_paragraphes = list(set(paragraphes)) + # calculate the percentage of paragraphs removed + if len(unique_paragraphes) / len(paragraphes) < self.remove_percentage: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/url_ratio.py b/easyllm/data/filters/url_ratio.py new file mode 100644 index 0000000..4571982 --- /dev/null +++ b/easyllm/data/filters/url_ratio.py @@ -0,0 +1,24 @@ +import re + +from pydantic import BaseModel + + +class UrlRatioFilter(BaseModel): + """ + Desc: If more than 20% of the document are urls then remove + """ + + name: str = "url_ratio" + regex: re.Pattern[ + str + ] = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)" + remove_percentage: float = 0.2 + + def __call__(self, text): + # find all urls + urls = re.findall(self.regex, text) + # check if the ratio of urls to words is greater than the remove percentage + if len(urls) / len(text.split()) > self.remove_percentage: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/whitespace_ration.py b/easyllm/data/filters/whitespace_ration.py new file mode 100644 index 0000000..e9ff23a --- /dev/null +++ b/easyllm/data/filters/whitespace_ration.py @@ -0,0 +1,23 @@ +import re + +from pydantic import BaseModel + + +class WhitespaceRatioFilter(BaseModel): + """ + Desc: If more than 25% of the document is whitespace then remove + """ + + name: str =
"whitespace_ratio" + regex: re.Pattern = re.compile(r"\s") + remove_percentage: float = 0.25 + + def __call__(self, text): + # whitespace characters + whitespace_count = len(self.regex.findall(text)) + text_length = len(text) + # check if the ratio of whitespace to text is greater than the remove percentage + if whitespace_count / text_length > self.remove_percentage: + return True + # otherwise keep + return False diff --git a/easyllm/data/filters/words_to_symbol.py b/easyllm/data/filters/words_to_symbol.py new file mode 100644 index 0000000..7539dec --- /dev/null +++ b/easyllm/data/filters/words_to_symbol.py @@ -0,0 +1,33 @@ +import re + +from pydantic import BaseModel + + +class SymbolToWordFilter(BaseModel): + """ + Ref: Gopher (Rae et al., 2021) + Desc: If more than 10% of the document are symbols (hashes [#] or ellipsis (...)) then remove + """ + + name: str = "symbol_to_word" + regex: re.Pattern = r"(\#+|(\.{3,}))(?!\w)" + remove_percentage: float = 0.1 + + def __call__(self, text: str): + num_hashes = len(re.findall(r"\#+", text)) + num_ellipses = len(re.findall(r"\.{3,}", text)) + num_words = len(re.findall(r"\w+", text)) + + # check if there are any words in the text + if num_words == 0: + return True + + hash_ratio = num_hashes / num_words + ellipses_ratio = num_ellipses / num_words + + # if the percentage is greater than the remove_percentage then remove + if hash_ratio > self.remove_percentage or ellipses_ratio > self.remove_percentage: + return True + + # otherwise keep + return False diff --git a/notebooks/data-filter.ipynb b/notebooks/data-filter.ipynb new file mode 100644 index 0000000..ab940af --- /dev/null +++ b/notebooks/data-filter.ipynb @@ -0,0 +1,414 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to use EasyLLM Quality data filters\n", + "\n", + "EasyLLMs `data` package adds quality filters for preprocessing text data for improved pretraining. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install \"easyllm[data]\" --upgrade" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Perplexity filtering\n", + "\n", + "Perplexity filtering can be used to improve model quality, coherence, and training efficiency by removing confusing text segments and focusing model learning on more standard, comprehensible language.\n", + "Perplexity filtering is implemented using `KenLM` models trained on wikipedia. You just need to provide your language id, e.g. 
`de` and your perplexity `min_threshold` and `max_threshold` the filter will return `True` if the perplexity of the text outside of the threshold `False` otherwise.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "341.3\n", + "46793.5\n" + ] + } + ], + "source": [ + "from easyllm.data.filters import PerplexityFilter\n", + "\n", + "ppl = PerplexityFilter(\"en\",min_threshold=10,max_threshold=1000)\n", + "\n", + "# Get perplexity\n", + "print(ppl.model.get_perplexity(\"I am very perplexed\"))\n", + "# 341.3 (low perplexity, since sentence style is formal and with no grammar mistakes)\n", + "\n", + "print(ppl.model.get_perplexity(\"im hella trippin\"))\n", + "# 46793.5 (high perplexity, since the sentence is colloquial and contains grammar mistakes)\n", + "\n", + "# testing the filter\n", + "assert ppl(\"I am very perplexed\") == False\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## NonAlphaNumericFilter\n", + "\n", + "The `NonAlphaNumericFilter` removes documents based on the number of non-alphanumeric characters in the document. Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf), if the document has more then 20% non-alphanumeric characters, it is removed." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import NonAlphaNumericFilter\n", + "\n", + "nam = NonAlphaNumericFilter()\n", + "\n", + "# not filtered\n", + "assert nam(\"This is a test\") == False\n", + "\n", + "# filtered\n", + "assert nam(\"This is a test!!!!!!!\") == True\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## SymbolToWordFilter\n", + "\n", + "The `SymbolToWordFilter` removes any document with a symbol-to-word ratio greater than 0.1 for either the hash symbol or the ellipsis. Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import SymbolToWordFilter\n", + "\n", + "stw = SymbolToWordFilter()\n", + "\n", + "assert stw(\"This is a test\") == False\n", + "\n", + "assert stw(\"spam#spam#spam#spam#spam#spam#spam#spam\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## NumbersToCharacterFilter\n", + "\n", + "The `NumbersToCharacterFilter` removes any document where the 20% of the document are numbers." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import DigitToCharacter\n", + "\n", + "ntw = DigitToCharacter()\n", + "\n", + "assert ntw(\"Hello 123 world 456 this text 789 contains 1234 numbers more words\") == False\n", + "\n", + "assert ntw(\"Hello 34534 34534 \") == True\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## UrlRatioFilter\n", + "\n", + "The `UrlRatioFilter` removes any document where 20% of the document is a URL." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import UrlRatioFilter \n", + "\n", + "ur = UrlRatioFilter()\n", + "\n", + "assert ur(\"https://www.google.com\") == True\n", + "\n", + "assert ur(\"Example text with some urls http://www.example.com and more text https://www.example2.com and more text\") == False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## BulletpointRatioFilter \n", + "\n", + "The `BulletpointRatioFilter` removes documents that have more than 90% bulletpoints. Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import BulletpointRatioFilter\n", + "\n", + "br = BulletpointRatioFilter()\n", + "\n", + "assert br(\"This is a text with \\n- some bullets but\\nnot all\") == False\n", + "\n", + "assert br(\"- some bullets and\\n- some more\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## WhitespaceRatioFilter\n", + "\n", + "The `WhitespaceRatioFilter` is a filter that removes documents that more than 25% of the text is whitespace.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import WhitespaceRatioFilter\n", + "\n", + "wr = WhitespaceRatioFilter()\n", + "\n", + "assert wr(\"This is a test\") == False\n", + "\n", + "assert wr(\"Hello world! This text has extra whitespace.\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ParenthesesRationFilter\n", + "\n", + "The `ParenthesesRationFilter` is a filter that removes all sentences that have a parentheses ratio greater than 10%." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import ParenthesesRationFilter\n", + "\n", + "pr = ParenthesesRationFilter()\n", + "\n", + "assert pr(\"This is a normal sentence\") == False\n", + "\n", + "assert pr(\"This a (with ) ] {(e)\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LongWordFilter\n", + "\n", + "The `LongWordFilter` is a filter that removes documents that include words longer > 1000 character, e.g. js minfied files." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import LongWordFilter\n", + "\n", + "lw = LongWordFilter()\n", + "\n", + "assert lw(\"This is a test\") == False\n", + "\n", + "assert lw(f\"This is a test with a {'longword'*500}\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LengthFilter\n", + "\n", + "The `LengthFilter` removes documents below or above a certain number of words. Not tokens since its more expensive to compute." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import LengthFilter\n", + "\n", + "l = LengthFilter(min_length=1, max_length=100)\n", + "\n", + "assert l(\"hello world\") == False\n", + "\n", + "assert l(\"hello world \" * 100) == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RepeatedParagraphFilter, RepeatedLinesFilter\n", + "\n", + "The `RepeatedParagraphFilter` & `RepeatedLinesFilter` remove documents which have more than 30% repeated lines or paragraphs. Based on [Gopher (Rae et al., 2021)](https://arxiv.org/pdf/2112.11446.pdf) " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import RepeatedLinesFilter, RepeatedParagraphFilter\n", + "\n", + "rl = RepeatedLinesFilter()\n", + "rp = RepeatedParagraphFilter()\n", + "\n", + "assert rl(\"hello\\nworld\") == False\n", + "assert rl(\"hello\\nhello\\nhello\\nhello\") == True\n", + "\n", + "assert rp(\"hello\\n\\nworld\") == False\n", + "assert rp(\"hello\\n\\nhello\\n\\nhello\\n\\nhello\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TopNGramsFilter\n", + "\n", + "The `TopNGramsFilter` removes the document if the top n-gram makes more than 20% of the document." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import TopNGramsFilter\n", + "\n", + "tng = TopNGramsFilter()\n", + "\n", + "assert tng(\"This is a test for a longer sentence\") == False \n", + "\n", + "assert tng(\"The quick brown fox jumps over the lazy dog The quick brown\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PunctuationFilter & EllipsisFilter\n", + "\n", + "The `PunctuationFilter` & `EllipsisFilter` removes the document if more than 15% of the \"linebreaks\" don't contain any punctuation or if more than 30% of the \"linebreaks\" contain an ellipsis." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import PunctuationFilter, EllipsisFilter\n", + "\n", + "pf = PunctuationFilter()\n", + "\n", + "assert pf(\"This is a sentence.\") == False\n", + "\n", + "assert pf(\"This is a sentence\\n But is not one.\\nNo oneyet.\") == True\n", + "\n", + "ef = EllipsisFilter()\n", + "\n", + "assert ef(\"This is a sentence.\") == False\n", + "\n", + "assert ef(\"This is a sentence\\n But is not one....\") == True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CommonWordFilter\n", + "\n", + "The `CommonWordFilter` removes documents if they don't include atleast 2 common words." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from easyllm.data.filters import CommonWordFilter\n", + "\n", + "cw = CommonWordFilter()\n", + "\n", + "assert cw(\"This is a sentence with a common word.\") == False\n", + "\n", + "assert cw(\"cat dog mouse\") == True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/datasets/filter-dataset.ipynb b/notebooks/datasets/filter-dataset.ipynb new file mode 100644 index 0000000..a55f94a --- /dev/null +++ b/notebooks/datasets/filter-dataset.ipynb @@ -0,0 +1,2316 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip uninstall easyllm -y\n", + "%pip install git+https://github.com/philschmid/easyllm.git@datafilter --upgrade" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset('philschmid/oscar-2301-de-minhash-dedup',split=\"train\")\n", + "# ds = load_dataset('wikipedia','20220301.de',split=\"train\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Perplexity filtering \n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nach § 80 Abs. 5 Satz 1 Halbsatz 2 VwGO kann das Gericht der Hauptsache die aufschiebende Wirkung der Klage ganz oder teilweise wiederherstellen. Ist die sofortige Vollziehung von der Behörde den formellen Anforderungen des § 80 Abs. 3 Satz 1 VwGO genügend angeordnet worden, so entscheidet das Gericht nach § 80 Abs. 5 Satz 1 Halbsatz 2 VwGO über die Wiederherstellung der aufschiebenden Wirkung der Klage auf der Grundlage einer eigenen Abwägung des Interesses des Antragstellers, von der Vollziehung des angefochtenen Verwaltungsakts bis zur endgültigen Entscheidung über seine Rechtmäßigkeit verschont zu bleiben, gegen das besondere öffentliche Interesse an dessen sofortiger Vollziehung (vgl. BVerwG, Beschl. v. 19.12.2014 - 7 VR 5.14 -, juris Rn. 9; Nds. OVG, Beschl. v. 10.09.2014 - 8 ME 87/14 -, juris Rn. 2). Im Rahmen der Interessenabwägung haben die Erfolgsaussichten des in der Hauptsache eingelegten Rechtsbehelfs eine entscheidende Bedeutung. Ergibt sich bei der im Rahmen des vorläufigen Rechtsschutzes gebotenen, aber grundsätzlich auch ausreichenden (vgl. Nds. OVG, Beschl. v. 16.8.2017 - 13 ME 173/17 -, juris Rn. 4, vgl. auch Beschl. v. 24.01.2018 - 7 ME 110/17 -, juris Rn. 28) summarischen Überprüfung, dass der Rechtsbehelf in der Hauptsache keinen Erfolg haben wird, weil sich der angegriffene Verwaltungsakt als offensichtlich rechtmäßig erweist, so überwiegt regelmäßig das öffentliche Interesse an der sofortigen Vollziehung des Verwaltungsakts. 
Erweist sich der Rechtsbehelf bei summarischer Überprüfung demgegenüber als offensichtlich erfolgreich, überwiegt regelmäßig das Interesse des Adressaten des Verwaltungsakts, von dessen Vollziehung vorerst verschont zu bleiben. Stellen sich die Erfolgsaussichten des Rechtsbehelfs hingegen als offen dar, so ist eine Abwägung der widerstreitenden Interessen erforderlich, bei der in Rechnung zu stellen ist, welche Gründe bei bestehender Unsicherheit im Hinblick auf die Erfolgsaussichten des Rechtsbehelfs für und gegen eine Aufrechterhaltung der sofortigen Vollziehung des Verwaltungsakts sprechen (vgl. Nds. OVG, Beschl. v. 10.5.2010 - 13 ME 181/09 -, juris Rn. 4). Außerdem ist zu berücksichtigen, dass die voraussichtliche Rechtmäßigkeit eines Verwaltungsakts für sich allein nur das allgemeine Interesse an seiner Vollziehung begründet, nicht aber zugleich auch deren, für die behördliche Anordnung nach § 80 Abs. 2 Satz 1 Nr. 4 VwGO erforderliche Dringlichkeit (vgl. grundlegend BVerfG, Beschl. v. 27.4.2005 - 1 BvR 223/05 -, NVwZ 2005, 1303; Beschl. v. 18.7.1973, - 1 BvR 23/73 -, BVerfGE 35, 382, 402; Nds. OVG, Beschl. v. 10.9.2014, a.a.O.; Finkelnburg/Dombert/Külpmann, Vorläufiger Rechtsschutz im Verwaltungsstreitverfahren, 7. Aufl., Rn. 757 f. m.w.N.).\n" + ] + } + ], + "source": [ + "print(ds[456][\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8071b0d5472949deabe06d5600f46054", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "add url (num_proc=128): 0%| | 0/53172498 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# df = ds.to_pandas()\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Example dataframe\n", + "def plot_distribution(dfs):\n", + " # Get summary stats and quartiles\n", + " q1 = dfs['perplexity'].quantile(.05)\n", + " q2 = dfs['perplexity'].quantile(.5)\n", + " q3 = dfs['perplexity'].quantile(.95)\n", + "\n", + " # Create line chart \n", + " counts, bins = np.histogram(dfs['perplexity'], bins=30000)\n", + " bin_centers = 0.5*(bins[1:] + bins[:-1])\n", + " plt.plot(bin_centers, counts)\n", + "\n", + " # Add vertical lines for quartiles \n", + " plt.axvline(x=q1, color='r')\n", + " plt.axvline(x=q2, color='g')\n", + " plt.axvline(x=q3, color='b')\n", + "\n", + " plt.title('Perplexity Distribution')\n", + " plt.xlabel('Perplexity')\n", + " plt.ylabel('Frequency')\n", + " plt.xscale('log')\n", + "\n", + " plt.show()\n", + "\n", + "plot_distribution(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "get some random samples from the dataset with low and high perplexity" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Low: 3.3\n", + "High: 1155.9099999999978\n" + ] + } + ], + "source": [ + "low = df.perplexity.quantile(0)\n", + "high = df.perplexity.quantile(0.9)\n", + "\n", + "print(f'Low: {low}')\n", + "print(f'High: {high}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "_lowest sample:_\n", + "```\n", + "'Die Skulptur Madonna mit Kind in der katholischen Kirche St-Lucien in Angy, einer französischen Gemeinde im Département Oise in der Region Hauts-de-France, wurde im dritten Viertel des 14. Jahrhunderts geschaffen. 
Im Jahr 1912 wurde die gotische Skulptur als Monument historique in die Liste der geschützten Objekte (Base Palissy) in Frankreich aufgenommen.\\nDie 1,10 Meter hohe Skulptur aus Kalkstein ist farbig gefasst. Maria hält das Jesuskind auf dem linken Arm. Sein Gesicht wendet sich in Richtung des Betrachters. Maria, mit bäuerlichem Gesicht und roten Wangen, trägt auf ihrem Haupt eine Krone. Die vielen Falten von ihrem Kleid geben ihrer Erscheinung eine Fülle.'\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filterstrategy: \n", + "\n", + "1. AlphanumericFilter remove sentence with more than 20% alphanumeric\n", + "2. ParenthesesRationFilter remove sentence with more than 5% parentheses\n", + "3. PunctuationFilter remove sentence with more than 15% missing punctuation\n", + "4. EllipsisFilter remove sentence with more than 30% ellipsis\n", + "5. LengthFilter: filter short documets < 5 words\n", + "6. LongWordFilter: for js stuff\n", + "7. CommonWordFilter: check if coherent sentence maybe not needed\n", + "8. RepeatedLinesFilter: remove repeated lines 30%\n", + "9. WhitespaceRatioFilter: remove sentence with more than 25% whitespace\n", + "10. UrlRatioFilter: remove sentence with more than 20% url\n", + "11. PerplexityFilter: remove sentence with perplexity > 1000\n", + "\n", + "\n", + "TODO: find law example which is super long and to filter it " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "wikipedia_filters = [\n", + " NonAlphaNumericFilter(),\n", + " LengthFilter(min_length=10),\n", + " CommonWordFilter(common_words=COMMON_WORDS_DE),\n", + " UrlRatioFilter(),\n", + " PerplexityFilter(language=\"de\",min_threshold=0,max_threshold=perplexity_threshold)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "da7067303e814628917d39b02cab5c0e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "filter documents... (num_proc=128): 0%| | 0/53172498 [00:00 300_000:\n", + " return False\n", + " return True\n", + "\n", + "\n", + "# datasets filters keeps true elements, meaning if the filter is we want to set it to false\n", + "ds = ds.filter(apply_filters,num_proc=os.cpu_count(),\n", + " desc=\"filter documents...\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "37786e0a47ec4375a7495bc1c5ed7ff2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Saving the dataset (0/564 shards): 0%| | 0/44401239 [00:00
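For a quick end-to-end impression of the filters added in this PR, the sketch below applies a handful of them to a 🤗 Datasets corpus. It is hedged: the filter selection, the thresholds, and the `text` column are illustrative assumptions, not part of the PR.

```python
import os

from datasets import load_dataset

from easyllm.data.filters import (
    LengthFilter,
    LongWordFilter,
    NonAlphaNumericFilter,
    UrlRatioFilter,
)

# Each filter returns True when a document should be removed.
filters = [
    NonAlphaNumericFilter(),
    LengthFilter(min_length=10),
    LongWordFilter(),
    UrlRatioFilter(),
]


def keep_document(example):
    # Dataset.filter keeps rows for which the function returns True,
    # so a row survives only if no filter flags it for removal.
    return not any(f(example["text"]) for f in filters)


# Illustrative corpus with a "text" column; swap in your own dataset.
ds = load_dataset("wikipedia", "20220301.de", split="train")
ds = ds.filter(keep_document, num_proc=os.cpu_count(), desc="filter documents...")
print(ds)
```

This mirrors the `apply_filters` pattern used in `notebooks/datasets/filter-dataset.ipynb`, just with a smaller, explicitly named filter list.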