Skip to content

Commit

Permalink
Merge pull request #33 from philschmid/datafilter
Browse files Browse the repository at this point in the history
Datafilter
  • Loading branch information
philschmid authored Nov 24, 2023
2 parents a651e9d + ed3a068 commit 1f37a93
Show file tree
Hide file tree
Showing 22 changed files with 3,426 additions and 0 deletions.
Empty file added easyllm/data/__init__.py
Empty file.
1 change: 1 addition & 0 deletions easyllm/data/extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from easyllm.data.extractor.html_extractor import HtmlExtractor
23 changes: 23 additions & 0 deletions easyllm/data/extractor/html_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#
from inscriptis import get_text
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.model.config import ParserConfig
from pydantic import BaseModel
from readability import Document

# Shared inscriptis parser configuration; the "strict" CSS profile controls
# how HTML structure is mapped to plain-text layout.
INSCRIPTIS_CONFIG = ParserConfig(css=CSS_PROFILES["strict"])


class HtmlExtractor(BaseModel):
    """
    Desc: Extracts the main text from an HTML document using Mozilla's
    Readability algorithm (python-readability) to isolate the content
    and inscriptis to convert the cleaned HTML into plain text.
    """

    # Identifier for this extractor (e.g. when used in a pipeline).
    name: str = "html_extractor"
    # Forwarded to readability's `min_text_length` parameter below.
    min_doc_length: int = 25

    def __call__(self, document: str) -> str:
        """Return the extracted plain text of the given raw HTML document."""
        # Readability isolates the main content block of the page.
        parsed_doc = Document(document, min_text_length=self.min_doc_length)
        # html_partial=True returns just the content fragment, without
        # the surrounding <html>/<body> scaffolding.
        clean_html = parsed_doc.summary(html_partial=True)
        # Convert the cleaned HTML fragment to text and trim outer whitespace.
        content = get_text(clean_html, INSCRIPTIS_CONFIG).strip()
        return content
14 changes: 14 additions & 0 deletions easyllm/data/filters/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from easyllm.data.filters.bulletpoint_ratio import BulletpointRatioFilter
from easyllm.data.filters.common_word import CommonWordFilter
from easyllm.data.filters.digit_to_character import DigitToCharacter
from easyllm.data.filters.kenlm_ppl import PerplexityFilter
from easyllm.data.filters.length import LengthFilter
from easyllm.data.filters.longword import LongWordFilter
from easyllm.data.filters.n_gram import TopNGramsFilter
from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter
from easyllm.data.filters.parantheses_ration import ParenthesesRationFilter
from easyllm.data.filters.punctuation import EllipsisFilter, PunctuationFilter
from easyllm.data.filters.repeating import RepeatedLinesFilter, RepeatedParagraphFilter
from easyllm.data.filters.url_ratio import UrlRatioFilter
from easyllm.data.filters.whitespace_ration import WhitespaceRatioFilter
from easyllm.data.filters.words_to_symbol import SymbolToWordFilter
42 changes: 42 additions & 0 deletions easyllm/data/filters/bulletpoint_ratio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from typing import List

from pydantic import BaseModel


class BulletpointRatioFilter(BaseModel):
    """
    Ref: Gopher (Rae et al., 2021)
    Desc: If more than 90% of the document's lines start with a bullet
    point, the document is flagged for removal.
    """

    # Identifier for this filter.
    name: str = "bulletpoint_ratio"
    # Characters that commonly introduce a bullet-point line.
    potential_bullet_points: List[str] = [
        "•",
        "‣",
        "⁃",
        "⁌",
        "⁍",
        "∙",
        "○",
        "●",
        "◘",
        "◦",
        "⦾",
        "⦿",
        "-",
    ]
    # Remove the document when the bullet-line ratio exceeds this value.
    remove_percentage: float = 0.9

    def __call__(self, text):
        """Return True to remove the document, False to keep it."""
        # split text into lines
        lines = text.split("\n")
        # Build the prefix tuple once instead of on every iteration;
        # str.startswith accepts a tuple of candidate prefixes.
        bullet_prefixes = tuple(self.potential_bullet_points)
        num_bullet_points = sum(1 for line in lines if line.startswith(bullet_prefixes))
        # str.split("\n") always yields at least one element, so the
        # division below cannot raise ZeroDivisionError.
        return num_bullet_points / len(lines) > self.remove_percentage
29 changes: 29 additions & 0 deletions easyllm/data/filters/common_word.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from typing import List

from pydantic import BaseModel

# Very frequent function words used by CommonWordFilter to verify that a
# document contains natural language.
COMMON_WORDS_EN = ["the", "be", "to", "of", "and", "that", "have", "with", "this"]
# BUG FIX: the original list was missing a comma between "er" and "sein",
# so implicit string concatenation produced the single bogus entry "ersein"
# and neither word could ever match.
COMMON_WORDS_DE = ["der", "die", "das", "er", "sein", "zu", "ist", "war", "von", "und", "haben", "mit"]


class CommonWordFilter(BaseModel):
    """
    Ref: Gopher (Rae et al., 2021)
    Desc: Keeps the document only if it contains at least `n` common words,
    otherwise flags it for removal.
    """

    name: str = "common_word"
    common_words: List[str] = COMMON_WORDS_EN
    n: int = 2

    def __call__(self, text):
        """Return True to remove the document, False to keep it."""
        matches = 0
        for token in text.split():
            # bool is an int, so this adds 1 for a common word, 0 otherwise
            matches += token.lower() in self.common_words
            # enough common words found -> keep the document
            if matches >= self.n:
                return False
        # too few common words -> remove
        return True
53 changes: 53 additions & 0 deletions easyllm/data/filters/cookie_banner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import re

from pydantic import BaseModel

# Substrings (all lowercase) that indicate cookie-banner / legal boilerplate
# text; intended as the external list consumed by CookieBannerFilter below.
policy_substrings = [
    "terms of use",
    "privacy policy",
    "cookie policy",
    "uses cookies",
    "privacy overview",
    "use of cookies",
    "use cookies",
    "privacy & cookies policy",
    "privacy and cookies policy",
    "This website uses cookies to improve your experience while you "
    "navigate through the website. Out of these cookies, the cookies "
    "that are categorized as necessary are stored on your browser as they "
    "are essential for the working of basic functionalities of the website. "
    "We also use third-party cookies that help us analyze and understand how "
    "you use this website. These cookies will be stored in your browser only "
    "with your consent. You also have the option to opt-out of these "
    "cookies. But opting out of some of these cookies may have an effect "
    "on your browsing experience.".lower(),
    "Necessary cookies are absolutely essential for the website to "
    "function properly. This category only includes cookies that "
    "ensures basic functionalities and security features of the website. "
    "These cookies do not store any personal information.".lower(),
    "Any cookies that may not be particularly necessary for the website "
    "to function and is used specifically to collect user personal data "
    "via analytics, ads, other embedded contents are termed as non-necessary "
    "cookies. It is mandatory to procure user consent prior to running these "
    "cookies on your website.".lower(),
    "This site uses cookies, including for analytics, personalization, and "
    "advertising purposes. For more information or to change your "
    "cookie settings, click here.".lower(),
    "If you continue to browse this site without changing your cookie "
    "settings, you agree to this use. AcceptRead More".lower(),
]


class CookieBannerFilter(BaseModel):
    """
    Ref: C4 Raffel et al.
    Desc: Removes documents if more than 40% of the documents include terms for cookies, tos, privacy policy, etc. Requires external list.
    """

    name: str = "cookie_banner"
    # Pattern for common legal-boilerplate phrases, presumably intended to be
    # combined with `policy_substrings` once the filter is implemented.
    regex: re.Pattern = re.compile(r"(terms of use|privacy policy|copyright|all rights reserved)", re.IGNORECASE)
    remove_percentage: float = 0.4

    def __call__(self, text):
        # check if the regex matches
        # NOTE(review): stub — calling this filter always raises; the regex
        # and threshold above are currently unused.
        raise NotImplementedError("CookieBannerFilter not implemented yet")
22 changes: 22 additions & 0 deletions easyllm/data/filters/digit_to_character.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import re

from pydantic import BaseModel


class DigitToCharacter(BaseModel):
    """
    Desc: Removes the document when more than 20% of its characters
    are digits.
    """

    # Identifier for this filter.
    name: str = "digit_to_character"
    # Remove the document when the digit ratio exceeds this value.
    remove_percentage: float = 0.2

    def __call__(self, text):
        """Return True to remove the document, False to keep it."""
        num_digits = len(re.findall(r"\d", text))
        total_chars = len(text)
        # BUG FIX: guard against empty input — the original divided by
        # len(text) unconditionally and raised ZeroDivisionError on "".
        # An empty document has no digits, so keep it here (length-based
        # filters are responsible for empty docs).
        if total_chars == 0:
            return False
        return num_digits / total_chars > self.remove_percentage
200 changes: 200 additions & 0 deletions easyllm/data/filters/kenlm_ppl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
import importlib.util
import re
import unicodedata
from typing import Dict

from huggingface_hub import hf_hub_download
from pydantic import BaseModel, ConfigDict

_kenlm = importlib.util.find_spec("kenlm") is not None
_sentencepiece = importlib.util.find_spec("sentencepiece") is not None

if _kenlm or not _sentencepiece:
import kenlm
import sentencepiece


class SentencePiece:
    """Thin wrapper around a serialized SentencePiece tokenizer model."""

    def __init__(
        self,
        model: str,
    ):
        super().__init__()
        # Load the tokenizer model from the given file path.
        self.sp = sentencepiece.SentencePieceProcessor()
        self.sp.load(str(model))

    # NOTE(review): the original annotations were `text: dict -> dict`, but
    # the body clearly consumes and produces a string.
    def do(self, text: str) -> str:
        """Tokenize `text` into pieces and return them joined by spaces."""
        tokenized = self.sp.encode_as_pieces(text)
        return " ".join(tokenized)


class KenlmModel:
    """
    A KenLM language model paired with a SentencePiece tokenizer, used to
    score document perplexity after cc_net-style text normalization.
    """

    # Matches a single decimal digit; used to normalize all digits to "0".
    digit_re: re.Pattern[str] = re.compile(r"\d")
    # Unicode punctuation mapped to ASCII-ish replacements.
    unicode_punct: Dict[str, str] = {
        ",": ",",
        "。": ".",
        "、": ",",
        "„": '"',
        "”": '"',
        "“": '"',
        "«": '"',
        "»": '"',
        # NOTE(review): mapping the digit "1" to a quote looks like a
        # mojibake artifact inherited from the upstream table — confirm
        # against cc_net. It is mostly shadowed anyway because normalize()
        # replaces digits with "0" before punctuation replacement.
        "1": '"',
        "」": '"',
        "「": '"',
        "《": '"',
        "》": '"',
        "´": "'",
        "∶": ":",
        ":": ":",
        "?": "?",
        "!": "!",
        "(": "(",
        ")": ")",
        ";": ";",
        "–": "-",
        "—": " - ",
        ".": ". ",
        "~": "~",
        "’": "'",
        "…": "...",
        "━": "-",
        "〈": "<",
        "〉": ">",
        "【": "[",
        "】": "]",
        "%": "%",
        "►": "-",
    }
    unicode_punct_re: re.Pattern = re.compile(f"[{''.join(unicode_punct.keys())}]")
    # C0 (0-31) and C1 (127-159) control characters to strip.
    non_printing_chars_re: re.Pattern = re.compile(f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]")
    model: kenlm.Model = None
    tokenizer: SentencePiece = None
    accent: bool = False
    case: bool = False
    numbers: bool = True
    punct: int = 1

    def __init__(
        self,
        model_path: str,
        tokenizer_path: str,
        lower_case: bool = False,
        remove_accents: bool = False,
        normalize_numbers: bool = True,
        punctuation: int = 1,
    ):
        """
        Load the KenLM model and SentencePiece tokenizer from local paths and
        store the normalization options applied before scoring.
        """
        self.model = kenlm.Model(model_path)
        self.tokenizer = SentencePiece(tokenizer_path)
        self.accent = remove_accents
        self.case = lower_case
        self.numbers = normalize_numbers
        self.punct = punctuation

    @classmethod
    def from_pretrained(
        cls,
        language_or_path: str,
    ):
        """
        Download a Wikipedia-trained model + tokenizer for the given language
        from the `philschmid/kenlm` hub repository and build a KenlmModel
        with the default normalization settings.

        Raises:
            ValueError: if no model exists for `language_or_path` on the hub.
        """
        try:
            model = hf_hub_download("philschmid/kenlm", filename=f"wikipedia/{language_or_path}.arpa.bin")
            tokenizer = hf_hub_download("philschmid/kenlm", filename=f"wikipedia/{language_or_path}.sp.model")
        except Exception:
            raise ValueError(
                f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. Please train your own model and upload it to the hub."
            ) from None

        return cls(
            model,
            tokenizer,
            False,
            False,
            True,
            1,
        )

    def pp(self, log_score, length):
        """Convert a total log10 score and token count into perplexity."""
        return 10.0 ** (-log_score / length)

    def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
        """Return the (rounded) perplexity of `doc` under the KenLM model."""
        if normalize_cc_net:
            doc = self.normalize(
                doc,
                accent=self.accent,
                case=self.case,
                numbers=self.numbers,
                punct=self.punct,
            )
        # Tokenize (after normalizing): See https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/mine.py#L352 for full pipeline
        doc = self.tokenizer.do(doc)
        doc_log_score, doc_length = 0, 0
        for line in doc.split("\n"):
            log_score = self.model.score(line)
            # +1 accounts for the implicit end-of-sentence token scored by
            # KenLM — presumably; confirm against the kenlm scoring docs.
            length = len(line.split()) + 1
            doc_log_score += log_score
            doc_length += length
        return round(self.pp(doc_log_score, doc_length), 1)

    def normalize(
        self,
        line: str,
        accent: bool = True,
        case: bool = True,
        numbers: bool = True,
        punct: int = 1,
    ) -> str:
        """
        Apply cc_net-style normalization: lowercasing, accent stripping,
        digit squashing to "0", punctuation replacement (punct == 1) or
        removal (punct == 2), and control-character removal.
        """
        line = line.strip()
        if not line:
            return line
        if case:
            line = line.lower()
        if accent:
            line = self.strip_accents(line)
        if numbers:
            line = self.digit_re.sub("0", line)
        if punct == 1:
            line = self.replace_unicode_punct(line)
        elif punct == 2:
            line = self.remove_unicode_punct(line)
        line = self.remove_non_printing_char(line)
        return line

    def strip_accents(self, line: str) -> str:
        """Strips accents (combining marks) from a piece of text."""
        nfd = unicodedata.normalize("NFD", line)
        # BUG FIX: the original had `if len(output) == line: return line`,
        # an int-vs-str comparison that is always False — a dead branch.
        # The joined result was therefore always returned; keep exactly that
        # behavior and drop the broken comparison.
        return "".join(c for c in nfd if unicodedata.category(c) != "Mn")

    def replace_unicode_punct(self, text: str) -> str:
        """Replace each mapped punctuation character, keep everything else."""
        return "".join(self.unicode_punct.get(c, c) for c in text)

    def remove_unicode_punct(self, text: str) -> str:
        """More aggressive version of replace_unicode_punct but also faster."""
        return self.unicode_punct_re.sub("", text)

    def remove_non_printing_char(self, text: str) -> str:
        """Strip C0/C1 control characters from the text."""
        return self.non_printing_chars_re.sub("", text)


class PerplexityFilter(BaseModel):
    """
    Desc: Removes documents whose KenLM perplexity lies outside the
    inclusive band [min_threshold, max_threshold].
    """

    # Lazily set in __init__ via KenlmModel.from_pretrained.
    model: KenlmModel = None
    min_threshold: int = 0
    max_threshold: int = 1000
    # KenlmModel is not a pydantic model, so arbitrary field types must be allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __init__(self, language: str, min_threshold: int = 0, max_threshold: int = 1000):
        # pydantic requires BaseModel.__init__ to run before fields are assigned.
        super().__init__()
        self.min_threshold = min_threshold
        self.max_threshold = max_threshold
        # Downloads the model/tokenizer from the hub at construction time.
        self.model = KenlmModel.from_pretrained(language)

    def __call__(self, doc: str) -> bool:
        # returns True if the perplexity of the document outside of the threshold,
        # meaning smaller than min_threshold or larger than max_threshold
        perplexity = self.model.get_perplexity(doc)
        if perplexity < self.min_threshold or perplexity > self.max_threshold:
            return True
        # otherwise keep
        return False
Loading

0 comments on commit 1f37a93

Please sign in to comment.