Merge pull request #33 from philschmid/datafilter
Datafilter
Showing 22 changed files with 3,426 additions and 0 deletions.
@@ -0,0 +1 @@
```python
from easyllm.data.extractor.html_extractor import HtmlExtractor
```
@@ -0,0 +1,23 @@
```python
#
from inscriptis import get_text
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.model.config import ParserConfig
from pydantic import BaseModel
from readability import Document

INSCRIPTIS_CONFIG = ParserConfig(css=CSS_PROFILES["strict"])


class HtmlExtractor(BaseModel):
    """
    Desc: Extracts text from an HTML document using Mozilla's Readability and inscriptis.
    """

    name: str = "html_extractor"
    min_doc_length: int = 25

    def __call__(self, document: str) -> str:
        parsed_doc = Document(document, min_text_length=self.min_doc_length)
        clean_html = parsed_doc.summary(html_partial=True)
        content = get_text(clean_html, INSCRIPTIS_CONFIG).strip()
        return content
```
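A minimal usage sketch for the extractor; the HTML snippet is invented for illustration, and on such a tiny document Readability's candidate selection may keep the whole body:

```python
from easyllm.data.extractor import HtmlExtractor

extractor = HtmlExtractor()

# Invented example document; Readability keeps the main article content,
# inscriptis renders it to plain text
html = "<html><body><article><p>" + "Web documents need cleaning before training. " * 3 + "</p></article></body></html>"
print(extractor(html))  # roughly: "Web documents need cleaning before training. ..." without markup
```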
@@ -0,0 +1,14 @@
```python
from easyllm.data.filters.bulletpoint_ratio import BulletpointRatioFilter
from easyllm.data.filters.common_word import CommonWordFilter
from easyllm.data.filters.digit_to_character import DigitToCharacter
from easyllm.data.filters.kenlm_ppl import PerplexityFilter
from easyllm.data.filters.length import LengthFilter
from easyllm.data.filters.longword import LongWordFilter
from easyllm.data.filters.n_gram import TopNGramsFilter
from easyllm.data.filters.non_alpha_numeric import NonAlphaNumericFilter
from easyllm.data.filters.parantheses_ration import ParenthesesRationFilter
from easyllm.data.filters.punctuation import EllipsisFilter, PunctuationFilter
from easyllm.data.filters.repeating import RepeatedLinesFilter, RepeatedParagraphFilter
from easyllm.data.filters.url_ratio import UrlRatioFilter
from easyllm.data.filters.whitespace_ration import WhitespaceRatioFilter
from easyllm.data.filters.words_to_symbol import SymbolToWordFilter
```
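All filters share one convention: calling the filter returns `True` when the document should be removed. A small composition sketch (the `pipeline` list and `keep` helper are illustrative, not part of the package):

```python
from easyllm.data.filters import CommonWordFilter, DigitToCharacter

# True from any filter means "remove this document"
pipeline = [CommonWordFilter(), DigitToCharacter()]

def keep(document: str) -> bool:
    return not any(f(document) for f in pipeline)

docs = ["the model and the data have to match", "4711 0815 1337 42 999 12345"]
print([doc for doc in docs if keep(doc)])  # only the first document survives
```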
@@ -0,0 +1,42 @@
```python
from typing import List

from pydantic import BaseModel


class BulletpointRatioFilter(BaseModel):
    """
    Ref: Gopher (Rae et al., 2021)
    Desc: If more than 90% of the lines in the document are bullet points, remove it
    """

    name: str = "bulletpoint_ratio"
    potential_bullet_points: List[str] = [
        "•",
        "‣",
        "⁃",
        "⁌",
        "⁍",
        "∙",
        "○",
        "●",
        "◘",
        "◦",
        "⦾",
        "⦿",
        "-",
    ]
    remove_percentage: float = 0.9

    def __call__(self, text):
        # split text into lines
        lines = text.split("\n")
        num_bullet_points = 0
        for line in lines:
            # check if the line is a bullet point
            if line.startswith(tuple(self.potential_bullet_points)):
                num_bullet_points += 1
        # remove if the ratio of bullet points to lines is greater than the remove percentage
        if num_bullet_points / len(lines) > self.remove_percentage:
            return True
        # otherwise keep
        return False
```
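A quick sanity check of the filter's behavior:

```python
from easyllm.data.filters import BulletpointRatioFilter

bullet_filter = BulletpointRatioFilter()

bullet_doc = "\n".join(["- first", "- second", "- third", "- fourth"])
prose_doc = "A paragraph of ordinary prose.\n- with a single bullet"

print(bullet_filter(bullet_doc))  # True  -> 4/4 lines are bullets, remove
print(bullet_filter(prose_doc))   # False -> 1/2 lines, below the 0.9 cutoff
```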
@@ -0,0 +1,29 @@
```python
from typing import List

from pydantic import BaseModel

COMMON_WORDS_EN = ["the", "be", "to", "of", "and", "that", "have", "with", "this"]
COMMON_WORDS_DE = ["der", "die", "das", "er", "sein", "zu", "ist", "war", "von", "und", "haben", "mit"]


class CommonWordFilter(BaseModel):
    """
    Ref: Gopher (Rae et al., 2021)
    Desc: Makes sure the document contains at least 2 common words; if not, remove it
    """

    name: str = "common_word"
    common_words: List[str] = COMMON_WORDS_EN
    n: int = 2

    def __call__(self, text):
        words = text.split()
        common_word_counter = 0
        # count common words and keep as soon as n of them have been seen
        for word in words:
            if word.lower() in self.common_words:
                common_word_counter += 1
            if common_word_counter >= self.n:
                return False
        # otherwise remove
        return True
```
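For example:

```python
from easyllm.data.filters import CommonWordFilter

common_word_filter = CommonWordFilter()

print(common_word_filter("The cat and the dog sleep."))  # False -> keep, at least 2 common words
print(common_word_filter("Lorem ipsum dolor sit amet"))  # True  -> remove, none found
```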
@@ -0,0 +1,53 @@
```python
import re

from pydantic import BaseModel

policy_substrings = [
    "terms of use",
    "privacy policy",
    "cookie policy",
    "uses cookies",
    "privacy overview",
    "use of cookies",
    "use cookies",
    "privacy & cookies policy",
    "privacy and cookies policy",
    "This website uses cookies to improve your experience while you "
    "navigate through the website. Out of these cookies, the cookies "
    "that are categorized as necessary are stored on your browser as they "
    "are essential for the working of basic functionalities of the website. "
    "We also use third-party cookies that help us analyze and understand how "
    "you use this website. These cookies will be stored in your browser only "
    "with your consent. You also have the option to opt-out of these "
    "cookies. But opting out of some of these cookies may have an effect "
    "on your browsing experience.".lower(),
    "Necessary cookies are absolutely essential for the website to "
    "function properly. This category only includes cookies that "
    "ensures basic functionalities and security features of the website. "
    "These cookies do not store any personal information.".lower(),
    "Any cookies that may not be particularly necessary for the website "
    "to function and is used specifically to collect user personal data "
    "via analytics, ads, other embedded contents are termed as non-necessary "
    "cookies. It is mandatory to procure user consent prior to running these "
    "cookies on your website.".lower(),
    "This site uses cookies, including for analytics, personalization, and "
    "advertising purposes. For more information or to change your "
    "cookie settings, click here.".lower(),
    "If you continue to browse this site without changing your cookie "
    "settings, you agree to this use. AcceptRead More".lower(),
]


class CookieBannerFilter(BaseModel):
    """
    Ref: C4 (Raffel et al., 2020)
    Desc: Removes a document if more than 40% of it consists of cookie, terms-of-use, or privacy-policy boilerplate. Requires an external list.
    """

    name: str = "cookie_banner"
    regex: re.Pattern = re.compile(r"(terms of use|privacy policy|copyright|all rights reserved)", re.IGNORECASE)
    remove_percentage: float = 0.4

    def __call__(self, text):
        # check if the regex matches
        raise NotImplementedError("CookieBannerFilter not implemented yet")
```
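The `__call__` body is intentionally left unimplemented in this PR. One plausible completion, sketched purely as an assumption about the intended behavior (this is not the author's implementation), would compare the share of characters covered by policy phrases against `remove_percentage`:

```python
import re

# Hypothetical sketch, not part of the PR: flag a document when policy/cookie
# phrases cover more than 40% of its characters.
POLICY_RE = re.compile(r"(terms of use|privacy policy|copyright|all rights reserved)", re.IGNORECASE)

def looks_like_cookie_banner(text: str, remove_percentage: float = 0.4) -> bool:
    if not text:
        return False
    matched_chars = sum(len(m.group(0)) for m in POLICY_RE.finditer(text))
    return matched_chars / len(text) > remove_percentage
```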
@@ -0,0 +1,22 @@
```python
import re

from pydantic import BaseModel


class DigitToCharacter(BaseModel):
    """
    Desc: If more than 20% of the characters in the document are digits, remove it
    """

    name: str = "digit_to_character"
    remove_percentage: float = 0.2

    def __call__(self, text):
        digits = re.findall(r"\d", text)
        num_digits = len(digits)
        total_chars = len(text)
        # check if there are any characters in the text to avoid division by zero
        if total_chars == 0:
            return True
        if num_digits / total_chars > self.remove_percentage:
            return True
        # otherwise keep
        return False
```
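For example:

```python
from easyllm.data.filters import DigitToCharacter

digit_filter = DigitToCharacter()

print(digit_filter("Call 0711 555 4567 now, 99% off!"))    # True  -> digit-heavy, remove
print(digit_filter("Mostly words with a single 7 in it"))  # False -> keep
```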
@@ -0,0 +1,200 @@
```python
import importlib.util
import re
import unicodedata
from typing import Dict

from huggingface_hub import hf_hub_download
from pydantic import BaseModel, ConfigDict

_kenlm = importlib.util.find_spec("kenlm") is not None
_sentencepiece = importlib.util.find_spec("sentencepiece") is not None

if _kenlm and _sentencepiece:
    import kenlm
    import sentencepiece


class SentencePiece:
    def __init__(
        self,
        model: str,
    ):
        super().__init__()
        self.sp = sentencepiece.SentencePieceProcessor()
        self.sp.load(str(model))

    def do(self, text: str) -> str:
        tokenized = self.sp.encode_as_pieces(text)
        return " ".join(tokenized)


class KenlmModel:
    digit_re: re.Pattern[str] = re.compile(r"\d")
    unicode_punct: Dict[str, str] = {
        "，": ",",
        "。": ".",
        "、": ",",
        "„": '"',
        "”": '"',
        "“": '"',
        "«": '"',
        "»": '"',
        "１": '"',
        "」": '"',
        "「": '"',
        "《": '"',
        "》": '"',
        "´": "'",
        "∶": ":",
        "：": ":",
        "？": "?",
        "！": "!",
        "（": "(",
        "）": ")",
        "；": ";",
        "–": "-",
        "—": " - ",
        "．": ". ",
        "～": "~",
        "’": "'",
        "…": "...",
        "━": "-",
        "〈": "<",
        "〉": ">",
        "【": "[",
        "】": "]",
        "％": "%",
        "►": "-",
    }
    unicode_punct_re: re.Pattern = re.compile(f"[{''.join(unicode_punct.keys())}]")
    non_printing_chars_re: re.Pattern = re.compile(f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]")
    model: kenlm.Model = None
    tokenizer: SentencePiece = None
    accent: bool = False
    case: bool = False
    numbers: bool = True
    punct: int = 1

    def __init__(
        self,
        model_path: str,
        tokenizer_path: str,
        lower_case: bool = False,
        remove_accents: bool = False,
        normalize_numbers: bool = True,
        punctuation: int = 1,
    ):
        self.model = kenlm.Model(model_path)
        self.tokenizer = SentencePiece(tokenizer_path)
        self.accent = remove_accents
        self.case = lower_case
        self.numbers = normalize_numbers
        self.punct = punctuation

    @classmethod
    def from_pretrained(
        cls,
        language_or_path: str,
    ):
        try:
            model = hf_hub_download("philschmid/kenlm", filename=f"wikipedia/{language_or_path}.arpa.bin")
            tokenizer = hf_hub_download("philschmid/kenlm", filename=f"wikipedia/{language_or_path}.sp.model")
        except Exception:
            raise ValueError(
                f"KenLM model for {language_or_path} not found at https://huggingface.co/philschmid/kenlm. Please train your own model and upload it to the hub."
            ) from None

        return cls(
            model,
            tokenizer,
            False,
            False,
            True,
            1,
        )

    def pp(self, log_score, length):
        # perplexity = 10^(-log10(P) / N), since kenlm scores are log10 probabilities
        return 10.0 ** (-log_score / length)

    def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
        if normalize_cc_net:
            doc = self.normalize(
                doc,
                accent=self.accent,
                case=self.case,
                numbers=self.numbers,
                punct=self.punct,
            )
        # Tokenize (after normalizing): See https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/mine.py#L352 for full pipeline
        doc = self.tokenizer.do(doc)
        doc_log_score, doc_length = 0, 0
        for line in doc.split("\n"):
            log_score = self.model.score(line)
            length = len(line.split()) + 1
            doc_log_score += log_score
            doc_length += length
        return round(self.pp(doc_log_score, doc_length), 1)

    def normalize(
        self,
        line: str,
        accent: bool = True,
        case: bool = True,
        numbers: bool = True,
        punct: int = 1,
    ) -> str:
        line = line.strip()
        if not line:
            return line
        if case:
            line = line.lower()
        if accent:
            line = self.strip_accents(line)
        if numbers:
            line = self.digit_re.sub("0", line)
        if punct == 1:
            line = self.replace_unicode_punct(line)
        elif punct == 2:
            line = self.remove_unicode_punct(line)
        line = self.remove_non_printing_char(line)
        return line

    def strip_accents(self, line: str) -> str:
        """Strips accents from a piece of text."""
        nfd = unicodedata.normalize("NFD", line)
        output = [c for c in nfd if unicodedata.category(c) != "Mn"]
        if len(output) == len(line):
            return line
        return "".join(output)

    def replace_unicode_punct(self, text: str) -> str:
        return "".join(self.unicode_punct.get(c, c) for c in text)

    def remove_unicode_punct(self, text: str) -> str:
        """More aggressive version of replace_unicode_punct but also faster."""
        return self.unicode_punct_re.sub("", text)

    def remove_non_printing_char(self, text: str) -> str:
        return self.non_printing_chars_re.sub("", text)


class PerplexityFilter(BaseModel):
    model: KenlmModel = None
    min_threshold: int = 0
    max_threshold: int = 1000
    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __init__(self, language: str, min_threshold: int = 0, max_threshold: int = 1000):
        super().__init__()
        self.min_threshold = min_threshold
        self.max_threshold = max_threshold
        self.model = KenlmModel.from_pretrained(language)

    def __call__(self, doc: str) -> bool:
        # returns True if the perplexity of the document is outside the thresholds,
        # i.e. smaller than min_threshold or larger than max_threshold
        perplexity = self.model.get_perplexity(doc)
        if perplexity < self.min_threshold or perplexity > self.max_threshold:
            return True
        # otherwise keep
        return False
```
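A usage sketch for the perplexity filter. It assumes an `en` model is available under `philschmid/kenlm` (the repo path hard-coded in `from_pretrained`), and the exact perplexity values will vary with the model:

```python
from easyllm.data.filters import PerplexityFilter

# Downloads wikipedia/en.arpa.bin and wikipedia/en.sp.model from philschmid/kenlm on first use
ppl_filter = PerplexityFilter(language="en", min_threshold=0, max_threshold=1000)

print(ppl_filter("The quick brown fox jumps over the lazy dog."))  # False -> fluent text, keep
print(ppl_filter("fox dog quick the jumps zxqv brr"))              # likely True -> high perplexity, remove
```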