Skip to content

Commit

Permalink
Merge pull request #33 from philschmid/datafilter
Browse files Browse the repository at this point in the history
Datafilter
  • Loading branch information
philschmid authored Nov 24, 2023
2 parents a651e9d + ed3a068 commit 1f37a93
Show file tree
Hide file tree
Showing 22 changed files with 3,426 additions and 0 deletions.
Empty file added easyllm/data/__init__.py
Empty file.
1 change: 1 addition & 0 deletions easyllm/data/extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from easyllm.data.extractor.html_extractor import HtmlExtractor
23 changes: 23 additions & 0 deletions easyllm/data/extractor/html_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#
from inscriptis import get_text
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.model.config import ParserConfig
from pydantic import BaseModel
from readability import Document

# Shared inscriptis parser configuration; the "strict" CSS profile controls
# how HTML structure is mapped to plain-text layout.
INSCRIPTIS_CONFIG = ParserConfig(css=CSS_PROFILES["strict"])


class HtmlExtractor(BaseModel):
    """
    Desc: Extracts the main text from an HTML document using Mozilla's
    Readability algorithm (python-readability) to isolate the content
    and inscriptis to convert the cleaned HTML into plain text.
    """

    # Identifier for this extractor (e.g. when used in a pipeline).
    name: str = "html_extractor"
    # Forwarded to readability's `min_text_length` parameter below.
    min_doc_length: int = 25

    def __call__(self, document: str) -> str:
        """Return the extracted plain text of the given raw HTML document."""
        # Readability isolates the main content block of the page.
        parsed_doc = Document(document, min_text_length=self.min_doc_length)
        # html_partial=True returns just the content fragment, without
        # the surrounding <html>/<body> scaffolding.
        clean_html = parsed_doc.summary(html_partial=True)
        # Convert the cleaned HTML fragment to text and trim outer whitespace.
        content = get_text(clean_html, INSCRIPTIS_CONFIG).strip()
        return content
14 changes: 14 additions & 0 deletions easyllm/data/filters/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from easyllm.data.filters.bulletpoint_ratio import BulletpointRatioFilter
from easyllm.data.filters.common_word import CommonWordFilter
from easyllm.data.filters.digit_to_character import DigitToCharacter
from easyllm.data.filters.kenlm_ppl import PerplexityFilter
from easyllm.data.filters.length import LengthFilter
from easyllm.data.filters.longword import LongWordFilter
from easyllm.data.filters.n_gram import TopNGramsFilter
from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter
from easyllm.data.filters.parantheses_ration import ParenthesesRationFilter
from easyllm.data.filters.punctuation import EllipsisFilter, PunctuationFilter
from easyllm.data.filters.repeating import RepeatedLinesFilter, RepeatedParagraphFilter
from easyllm.data.filters.url_ratio import UrlRatioFilter
from easyllm.data.filters.whitespace_ration import WhitespaceRatioFilter
from easyllm.data.filters.words_to_symbol import SymbolToWordFilter
42 changes: 42 additions & 0 deletions easyllm/data/filters/bulletpoint_ratio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from typing import List

from pydantic import BaseModel


class BulletpointRatioFilter(BaseModel):
    """
    Ref: Gopher (Rae et al., 2021)
    Desc: If more than 90% of the document's lines start with a bullet
    point, the document is flagged for removal.
    """

    # Identifier for this filter.
    name: str = "bulletpoint_ratio"
    # Characters that commonly introduce a bullet-point line.
    potential_bullet_points: List[str] = [
        "•",
        "‣",
        "⁃",
        "⁌",
        "⁍",
        "∙",
        "○",
        "●",
        "◘",
        "◦",
        "⦾",
        "⦿",
        "-",
    ]
    # Remove the document when the bullet-line ratio exceeds this value.
    remove_percentage: float = 0.9

    def __call__(self, text):
        """Return True to remove the document, False to keep it."""
        # split text into lines
        lines = text.split("\n")
        # Build the prefix tuple once instead of on every iteration;
        # str.startswith accepts a tuple of candidate prefixes.
        bullet_prefixes = tuple(self.potential_bullet_points)
        num_bullet_points = sum(1 for line in lines if line.startswith(bullet_prefixes))
        # str.split("\n") always yields at least one element, so the
        # division below cannot raise ZeroDivisionError.
        return num_bullet_points / len(lines) > self.remove_percentage
29 changes: 29 additions & 0 deletions easyllm/data/filters/common_word.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from typing import List

from pydantic import BaseModel

# Very frequent function words used by CommonWordFilter to verify that a
# document contains natural language.
COMMON_WORDS_EN = ["the", "be", "to", "of", "and", "that", "have", "with", "this"]
# BUG FIX: the original list was missing a comma between "er" and "sein",
# so implicit string concatenation produced the single bogus entry "ersein"
# and neither word could ever match.
COMMON_WORDS_DE = ["der", "die", "das", "er", "sein", "zu", "ist", "war", "von", "und", "haben", "mit"]


class CommonWordFilter(BaseModel):
    """
    Ref: Gopher (Rae et al., 2021)
    Desc: Keeps the document only if it contains at least `n` common words,
    otherwise flags it for removal.
    """

    name: str = "common_word"
    common_words: List[str] = COMMON_WORDS_EN
    n: int = 2

    def __call__(self, text):
        """Return True to remove the document, False to keep it."""
        matches = 0
        for token in text.split():
            # bool is an int, so this adds 1 for a common word, 0 otherwise
            matches += token.lower() in self.common_words
            # enough common words found -> keep the document
            if matches >= self.n:
                return False
        # too few common words -> remove
        return True
53 changes: 53 additions & 0 deletions easyllm/data/filters/cookie_banner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import re

from pydantic import BaseModel

# Substrings (all lowercase) that indicate cookie-banner / legal boilerplate
# text; intended as the external list consumed by CookieBannerFilter below.
policy_substrings = [
    "terms of use",
    "privacy policy",
    "cookie policy",
    "uses cookies",
    "privacy overview",
    "use of cookies",
    "use cookies",
    "privacy & cookies policy",
    "privacy and cookies policy",
    "This website uses cookies to improve your experience while you "
    "navigate through the website. Out of these cookies, the cookies "
    "that are categorized as necessary are stored on your browser as they "
    "are essential for the working of basic functionalities of the website. "
    "We also use third-party cookies that help us analyze and understand how "
    "you use this website. These cookies will be stored in your browser only "
    "with your consent. You also have the option to opt-out of these "
    "cookies. But opting out of some of these cookies may have an effect "
    "on your browsing experience.".lower(),
    "Necessary cookies are absolutely essential for the website to "
    "function properly. This category only includes cookies that "
    "ensures basic functionalities and security features of the website. "
    "These cookies do not store any personal information.".lower(),
    "Any cookies that may not be particularly necessary for the website "
    "to function and is used specifically to collect user personal data "
    "via analytics, ads, other embedded contents are termed as non-necessary "
    "cookies. It is mandatory to procure user consent prior to running these "
    "cookies on your website.".lower(),
    "This site uses cookies, including for analytics, personalization, and "
    "advertising purposes. For more information or to change your "
    "cookie settings, click here.".lower(),
    "If you continue to browse this site without changing your cookie "
    "settings, you agree to this use. AcceptRead More".lower(),
]


class CookieBannerFilter(BaseModel):
    """
    Ref: C4 Raffel et al.
    Desc: Removes documents if more than 40% of the documents include terms for cookies, tos, privacy policy, etc. Requires external list.
    """

    name: str = "cookie_banner"
    # Pattern for common legal-boilerplate phrases, presumably intended to be
    # combined with `policy_substrings` once the filter is implemented.
    regex: re.Pattern = re.compile(r"(terms of use|privacy policy|copyright|all rights reserved)", re.IGNORECASE)
    remove_percentage: float = 0.4

    def __call__(self, text):
        # check if the regex matches
        # NOTE(review): stub — calling this filter always raises; the regex
        # and threshold above are currently unused.
        raise NotImplementedError("CookieBannerFilter not implemented yet")
22 changes: 22 additions & 0 deletions easyllm/data/filters/digit_to_character.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import re

from pydantic import BaseModel


class DigitToCharacter(BaseModel):
    """
    Desc: Removes the document when more than 20% of its characters
    are digits.
    """

    # Identifier for this filter.
    name: str = "digit_to_character"
    # Remove the document when the digit ratio exceeds this value.
    remove_percentage: float = 0.2

    def __call__(self, text):
        """Return True to remove the document, False to keep it."""
        num_digits = len(re.findall(r"\d", text))
        total_chars = len(text)
        # BUG FIX: guard against empty input — the original divided by
        # len(text) unconditionally and raised ZeroDivisionError on "".
        # An empty document has no digits, so keep it here (length-based
        # filters are responsible for empty docs).
        if total_chars == 0:
            return False
        return num_digits / total_chars > self.remove_percentage
200 changes: 200 additions & 0 deletions easyllm/data/filters/kenlm_ppl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
import importlib.util
import re
import unicodedata
from typing import Dict

from huggingface_hub import hf_hub_download
from pydantic import BaseModel, ConfigDict

_kenlm = importlib.util.find_spec("kenlm") is not None
_sentencepiece = importlib.util.find_spec("sentencepiece") is not None

if _kenlm or not _sentencepiece:
import kenlm
import sentencepiece


class SentencePiece:
    """Thin wrapper around a serialized SentencePiece tokenizer model."""

    def __init__(
        self,
        model: str,
    ):
        super().__init__()
        # Load the tokenizer model from the given file path.
        self.sp = sentencepiece.SentencePieceProcessor()
        self.sp.load(str(model))

    # NOTE(review): the original annotations were `text: dict -> dict`, but
    # the body clearly consumes and produces a string.
    def do(self, text: str) -> str:
        """Tokenize `text` into pieces and return them joined by spaces."""
        tokenized = self.sp.encode_as_pieces(text)
        return " ".join(tokenized)


class KenlmModel:
    """
    A KenLM language model paired with a SentencePiece tokenizer, used to
    score document perplexity after cc_net-style text normalization.
    """

    # Matches a single decimal digit; used to normalize all digits to "0".
    digit_re: re.Pattern[str] = re.compile(r"\d")
    # Unicode punctuation mapped to ASCII-ish replacements.
    unicode_punct: Dict[str, str] = {
        ",": ",",
        "。": ".",
        "、": ",",
        "„": '"',
        "”": '"',
        "“": '"',
        "«": '"',
        "»": '"',
        # NOTE(review): mapping the digit "1" to a quote looks like a
        # mojibake artifact inherited from the upstream table — confirm
        # against cc_net. It is mostly shadowed anyway because normalize()
        # replaces digits with "0" before punctuation replacement.
        "1": '"',
        "」": '"',
        "「": '"',
        "《": '"',
        "》": '"',
        "´": "'",
        "∶": ":",
        ":": ":",
        "?": "?",
        "!": "!",
        "(": "(",
        ")": ")",
        ";": ";",
        "–": "-",
        "—": " - ",
        ".": ". ",
        "~": "~",
        "’": "'",
        "…": "...",
        "━": "-",
        "〈": "<",
        "〉": ">",
        "【": "[",
        "】": "]",
        "%": "%",
        "►": "-",
    }
    unicode_punct_re: re.Pattern = re.compile(f"[{''.join(unicode_punct.keys())}]")
    # C0 (0-31) and C1 (127-159) control characters to strip.
    non_printing_chars_re: re.Pattern = re.compile(f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]")
    model: kenlm.Model = None
    tokenizer: SentencePiece = None
    accent: bool = False
    case: bool = False
    numbers: bool = True
    punct: int = 1

    def __init__(
        self,
        model_path: str,
        tokenizer_path: str,
        lower_case: bool = False,
        remove_accents: bool = False,
        normalize_numbers: bool = True,
        punctuation: int = 1,
    ):
        """
        Load the KenLM model and SentencePiece tokenizer from local paths and
        store the normalization options applied before scoring.
        """
        self.model = kenlm.Model(model_path)
        self.tokenizer = SentencePiece(tokenizer_path)
        self.accent = remove_accents
        self.case = lower_case
        self.numbers = normalize_numbers
        self.punct = punctuation

    @classmethod
    def from_pretrained(
        cls,
        language_or_path: str,
    ):
        """
        Download a Wikipedia-trained model + tokenizer for the given language
        from the `philschmid/kenlm` hub repository and build a KenlmModel
        with the default normalization settings.

        Raises:
            ValueError: if no model exists for `language_or_path` on the hub.
        """
        try:
            model = hf_hub_download("philschmid/kenlm", filename=f"wikipedia/{language_or_path}.arpa.bin")
            tokenizer = hf_hub_download("philschmid/kenlm", filename=f"wikipedia/{language_or_path}.sp.model")
        except Exception:
            raise ValueError(
                f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. Please train your own model and upload it to the hub."
            ) from None

        return cls(
            model,
            tokenizer,
            False,
            False,
            True,
            1,
        )

    def pp(self, log_score, length):
        """Convert a total log10 score and token count into perplexity."""
        return 10.0 ** (-log_score / length)

    def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
        """Return the (rounded) perplexity of `doc` under the KenLM model."""
        if normalize_cc_net:
            doc = self.normalize(
                doc,
                accent=self.accent,
                case=self.case,
                numbers=self.numbers,
                punct=self.punct,
            )
        # Tokenize (after normalizing): See https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/mine.py#L352 for full pipeline
        doc = self.tokenizer.do(doc)
        doc_log_score, doc_length = 0, 0
        for line in doc.split("\n"):
            log_score = self.model.score(line)
            # +1 accounts for the implicit end-of-sentence token scored by
            # KenLM — presumably; confirm against the kenlm scoring docs.
            length = len(line.split()) + 1
            doc_log_score += log_score
            doc_length += length
        return round(self.pp(doc_log_score, doc_length), 1)

    def normalize(
        self,
        line: str,
        accent: bool = True,
        case: bool = True,
        numbers: bool = True,
        punct: int = 1,
    ) -> str:
        """
        Apply cc_net-style normalization: lowercasing, accent stripping,
        digit squashing to "0", punctuation replacement (punct == 1) or
        removal (punct == 2), and control-character removal.
        """
        line = line.strip()
        if not line:
            return line
        if case:
            line = line.lower()
        if accent:
            line = self.strip_accents(line)
        if numbers:
            line = self.digit_re.sub("0", line)
        if punct == 1:
            line = self.replace_unicode_punct(line)
        elif punct == 2:
            line = self.remove_unicode_punct(line)
        line = self.remove_non_printing_char(line)
        return line

    def strip_accents(self, line: str) -> str:
        """Strips accents (combining marks) from a piece of text."""
        nfd = unicodedata.normalize("NFD", line)
        # BUG FIX: the original had `if len(output) == line: return line`,
        # an int-vs-str comparison that is always False — a dead branch.
        # The joined result was therefore always returned; keep exactly that
        # behavior and drop the broken comparison.
        return "".join(c for c in nfd if unicodedata.category(c) != "Mn")

    def replace_unicode_punct(self, text: str) -> str:
        """Replace each mapped punctuation character, keep everything else."""
        return "".join(self.unicode_punct.get(c, c) for c in text)

    def remove_unicode_punct(self, text: str) -> str:
        """More aggressive version of replace_unicode_punct but also faster."""
        return self.unicode_punct_re.sub("", text)

    def remove_non_printing_char(self, text: str) -> str:
        """Strip C0/C1 control characters from the text."""
        return self.non_printing_chars_re.sub("", text)


class PerplexityFilter(BaseModel):
    """
    Desc: Removes documents whose KenLM perplexity lies outside the
    inclusive band [min_threshold, max_threshold].
    """

    # Lazily set in __init__ via KenlmModel.from_pretrained.
    model: KenlmModel = None
    min_threshold: int = 0
    max_threshold: int = 1000
    # KenlmModel is not a pydantic model, so arbitrary field types must be allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __init__(self, language: str, min_threshold: int = 0, max_threshold: int = 1000):
        # pydantic requires BaseModel.__init__ to run before fields are assigned.
        super().__init__()
        self.min_threshold = min_threshold
        self.max_threshold = max_threshold
        # Downloads the model/tokenizer from the hub at construction time.
        self.model = KenlmModel.from_pretrained(language)

    def __call__(self, doc: str) -> bool:
        # returns True if the perplexity of the document outside of the threshold,
        # meaning smaller than min_threshold or larger than max_threshold
        perplexity = self.model.get_perplexity(doc)
        if perplexity < self.min_threshold or perplexity > self.max_threshold:
            return True
        # otherwise keep
        return False
Loading

0 comments on commit 1f37a93

Please sign in to comment.