Pre-processing options work for PreTrainedVectorizers [resolves #307] #308

Open · wants to merge 15 commits into base: develop
484 changes: 457 additions & 27 deletions sadedegel/bblock/doc.py

Large diffs are not rendered by default.

270 changes: 252 additions & 18 deletions sadedegel/extension/sklearn.py
@@ -17,17 +17,49 @@ def check_type_all(X, expected_type=str):


def check_type(v, expected_type, error_msg: str) -> None:
"""Check variable type
@param v: Variable to be checked
@param expected_type: Expected type of variable
@param error_msg: Error message
"""Checks and compares of variable types of given variables.

Parameters
----------
v: Any
Variable to be checked.
expected_type: type
Expected type of the variable.
error_msg: str
Error message.

Returns
-------
None

Raises
------
ValueError
If v is not an instance of expected_type.
"""
if not isinstance(v, expected_type):
raise ValueError(error_msg)
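
As an illustration, the vectorizers below use this helper to validate constructor arguments. A minimal sketch (values are arbitrary):

    prefix_range = (3, 5)
    check_type(prefix_range, tuple,
               f"prefix_range should be of tuple type. {type(prefix_range)} found.")  # passes silently
    check_type([3, 5], tuple, "prefix_range should be of tuple type.")  # raises ValueError
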


class OnlinePipeline(Pipeline):
def partial_fit(self, X, y=None, **kwargs):
"""Implements minibatch type of training for given estimator.

Parameters
----------
X: {array-like, sparse matrix} of shape (n_samples, n_features)
Training data for the current minibatch.
y: array-like of shape (n_samples,), default=None
Target values.
**kwargs: dict
Remaining keyword arguments passed to each step's partial_fit.

Returns
-------
self: object
Fitted estimator.

"""
for i, step in enumerate(self.steps):
name, est = step
est.partial_fit(X, y, **kwargs)
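
A hedged usage sketch: both steps below are defined later in this file, and their partial_fit implementations are no-ops, so this mainly illustrates the calling pattern (batches is a hypothetical iterable of lists of strings):

    pipe = OnlinePipeline([('text2doc', Text2Doc('icu')),
                           ('hash', HashVectorizer(n_features=2 ** 16))])
    for batch in batches:
        pipe.partial_fit(batch)  # calls partial_fit on every step, in order
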
@@ -37,6 +69,32 @@ def partial_fit(self, X, y=None, **kwargs):


class Text2Doc(BaseEstimator, TransformerMixin):
"""Creates text2doc converter for given tokenizer.

Parameters
----------
tokenizer: str, default='icu'
Name of the tokenizer to use.
hashtag: bool, default=False
Whether to keep hashtags as single tokens instead of splitting them.
mention: bool, default=False
Whether to keep mentions as single tokens instead of splitting them.
emoji: bool, default=False
Whether to keep emojis as single tokens instead of splitting them into symbols.
emoticon: bool, default=False
Whether to keep emoticons as single tokens instead of splitting them into symbols.
progress_tracking: bool, default=True
Whether to display a progress bar during transformation.

Methods
-------
fit()
    No-op; returns self (present for scikit-learn compatibility).
partial_fit()
    No-op; returns self (present for scikit-learn compatibility).
transform()
    Transforms a given list of strings into a list of sadedegel Doc objects, which can be tokenized.
"""

Doc = None

def __init__(self, tokenizer="icu", hashtag=False, mention=False, emoji=False, emoticon=False,
Expand All @@ -52,14 +110,19 @@ def __init__(self, tokenizer="icu", hashtag=False, mention=False, emoji=False, e
self.init()

    def init(self):
-        if Text2Doc.Doc is None:
-            if hasattr(self, 'hashtag') and hasattr(self, 'mention') and hasattr(self, 'emoji') and hasattr(
-                    self, 'emoticon'):
        if hasattr(self, 'hashtag') and hasattr(self, 'mention') and hasattr(self, 'emoji') and hasattr(
                self, 'emoticon'):
            if Text2Doc.Doc is None:
                Text2Doc.Doc = DocBuilder(tokenizer=self.tokenizer, tokenizer__hashtag=self.hashtag,
                                          tokenizer__mention=self.mention, tokenizer__emoji=self.emoji,
                                          tokenizer__emoticon=self.emoticon)
-        else:
-            Text2Doc.Doc = DocBuilder(tokenizer=self.tokenizer, tokenizer__hashtag=False,
            else:
                if Text2Doc.Doc.tokenizer.hashtag != self.hashtag or Text2Doc.Doc.tokenizer.mention != self.mention or Text2Doc.Doc.tokenizer.emoji != self.emoji or Text2Doc.Doc.tokenizer.emoticon != self.emoticon:
                    Text2Doc.Doc = DocBuilder(tokenizer=self.tokenizer, tokenizer__hashtag=self.hashtag,
                                              tokenizer__mention=self.mention, tokenizer__emoji=self.emoji,
                                              tokenizer__emoticon=self.emoticon)
        else:
            Text2Doc.Doc = DocBuilder(tokenizer=self.tokenizer, tokenizer__hashtag=False,
                                      tokenizer__mention=False, tokenizer__emoji=False,
                                      tokenizer__emoticon=False)
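
The rebuild check above is central to this PR: Doc is cached at class level, so a Text2Doc constructed later with different preprocessing flags must rebuild the shared DocBuilder instead of silently reusing the old options. A hedged illustration:

    t1 = Text2Doc('icu', hashtag=True)   # builds Text2Doc.Doc with hashtag tokenization enabled
    t2 = Text2Doc('icu', hashtag=False)  # flags differ, so init() rebuilds Text2Doc.Doc
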

@@ -70,6 +133,23 @@ def partial_fit(self, X, y=None, **kwargs):
return self

def transform(self, X, y=None):
"""Transforms given list of strings into list of sadedegel's Doc objects which can be tokenized.

Parameters
----------
X: array-like
List of strings to be transformed.

Returns
-------
docs: array-like
List of sadedegel.bblock.doc.Document objects.

Raises
------
ValueError
If X contains elements that are not strings.
"""
if isinstance(X, list):
check_type_all(X)
n_total = len(X)
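
A hedged usage sketch of Text2Doc (the input string is illustrative):

    t2d = Text2Doc(tokenizer='icu', hashtag=True, mention=True)
    docs = t2d.transform(["Merhaba dünya! #sadedegel @sadedegel"])
    # docs[0] is a sadedegel Doc whose tokens keep '#sadedegel' and '@sadedegel' intact
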
@@ -101,6 +181,28 @@ def partial_fit(self, X, y=None, **kwargs):


class HashVectorizer(BaseEstimator, TransformerMixin):
"""Coverts a collection of text documents to a matrix of occurrences.

Parameters
----------
n_features: int, default=1048576
The number of features (columns) in the output matrix.
Small values increase the likelihood of hash collisions.
prefix_range: tuple (min_n, max_n), default=(3, 5)
The lower and upper boundary of the range of n-values for character prefixes.
alternate_sign: bool, default=True
When True, an alternating sign is added to the features so as to
approximately conserve the inner product in the hashed space.

Methods
-------
fit()
    No-op; returns self (present for scikit-learn compatibility).
partial_fit()
    No-op; returns self (present for scikit-learn compatibility).
transform()
    Takes sadedegel Doc objects and returns a matrix of hashed features.
"""
def __init__(self, n_features: int = 1048576, prefix_range: tuple = (3, 5), *, alternate_sign: bool = True):
check_type(prefix_range, tuple, f"prefix_range should be of tuple type. {type(prefix_range)} found.")
self.n_features = n_features
@@ -115,6 +217,17 @@ def partial_fit(self, X, y=None, **kwargs):
return self

def transform(self, docs):
"""Takes sadedegel doc objects and returns matrix of hashed features.

Parameters
----------
docs: sadedegel.bblock.doc.Document or list
List of sadedegel.bblock.doc.Document objects or single Doc object.
Returns
-------
csr_matrix: array-like
scipy.sparse.csr of shape (n_samples, n_features)
"""
def feature_iter():
if hasattr(self, 'prefix_range'):
for d in docs:
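
A hedged usage sketch chaining Text2Doc and HashVectorizer (2 ** 16 is an arbitrary size; smaller values risk more hash collisions):

    docs = Text2Doc('icu').transform(["ilk metin", "ikinci metin"])
    X = HashVectorizer(n_features=2 ** 16, prefix_range=(3, 5)).transform(docs)
    # X: scipy.sparse.csr_matrix of shape (2, 65536)
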
@@ -130,9 +243,35 @@ def feature_iter():


class TfidfVectorizer(SadedegelVectorizer):
"""Transforms a count matrix to a normalized tf or tf-idf representation.
Parameters
----------
tf_method: str, default='raw'
Type of term frequency method. Can be one of:
['raw', 'binary', 'freq', 'log_norm', 'double_norm']
idf_method: str, default='probabilistic'
Type of inverse document frequency method. Can be one of:
['smooth', 'probabilistic', 'unary']
drop_stopwords: bool, default=True
Whether to drop or keep stopwords.
lowercase: bool, default=True
Whether to lowercase or keep original cases of given text.
drop_suffix: bool, default=True
Whether to drop suffixes or keep original version of the text.
drop_punct: bool, default=True
Whether to drop or keep punctuations.
show_progress: bool, default=True
Whether to display a progress bar.

Methods
-------
transform()
Transforms a list of sadedegel documents into a matrix of tf-idf features built from pretrained corpus statistics.
"""
def __init__(self, *, tf_method='raw', idf_method='probabilistic', drop_stopwords=True,
lowercase=True,
drop_suffix=True, drop_punct=True, show_progress=True):

super().__init__()

self.tf_method = tf_method
@@ -144,6 +283,22 @@ def __init__(self, *, tf_method='raw', idf_method='probabilistic', drop_stopword
self.show_progress = show_progress

def transform(self, X, y=None):
"""Takes list of sadedegel documents and returns matrix of tf-idf features which is pretrained.

Parameters
----------
X: array-like
List of sadedegel.bblock.doc.Document objects.

Returns
-------
csr_matrix: scipy.sparse.csr_matrix
Sparse matrix of tf-idf features with shape (n_samples, n_features).

Raises
------
ValueError
If X contains elements that are not Document objects.
"""
if isinstance(X, list):
check_type_all(X, Document)
n_total = len(X)
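
A hedged usage sketch ('freq' and 'smooth' are among the options listed above; docs is assumed to be a list of Document objects produced by Text2Doc):

    tfidf = TfidfVectorizer(tf_method='freq', idf_method='smooth')
    X = tfidf.transform(docs)  # scipy.sparse.csr_matrix, one row per document
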
@@ -181,10 +336,41 @@ def transform(self, X, y=None):


class BM25Vectorizer(SadedegelVectorizer):
"""sadedegel's 'Best Match 25' implementation.

Parameters
----------
tf_method: str, default='raw'
Type of term frequency method. Can be one of:
['raw', 'binary', 'freq', 'log_norm', 'double_norm']
idf_method: str, default='probabilistic'
Type of inverse document frequency method. Can be one of:
['smooth', 'probabilistic', 'unary']
k1: float, default=1.25
Determines the term frequency saturation.
b: float, default=0.75
Degree of document-length normalization applied to term frequencies.
delta: float, default=0
Constant added to term scores for lower bounding (as in BM25+).
drop_stopwords: bool, default=True
Whether to drop or keep stopwords.
lowercase: bool, default=True
Whether to lowercase or keep original cases of given text.
drop_suffix: bool, default=True
Whether to drop suffixes or keep original version of the text.
drop_punct: bool, default=True
Whether to drop or keep punctuations.
show_progress: bool, default=True
Whether to display a progress bar.

Methods
-------
transform()
Transforms a list of sadedegel documents into a matrix of BM25 features.
"""
def __init__(self, *, tf_method='raw', idf_method='probabilistic', k1=1.25, b=0.75, delta=0,
drop_stopwords=True,
lowercase=True, drop_suffix=True, drop_punct=True, show_progress=True):

super().__init__()

self.tf_method = tf_method
@@ -199,6 +385,21 @@ def __init__(self, *, tf_method='raw', idf_method='probabilistic', k1=1.25, b=0.
self.delta = delta

def transform(self, X, y=None):
"""Takes list of sadedegel documents and returns matrix of features using BM25.

Parameters
----------
X: array-like
List of sadedegel.bblock.doc.Document objects.

Returns
-------
csr_matrix: scipy.sparse.csr_matrix
Sparse matrix of BM25 features with shape (n_samples, n_features).

Raises
------
ValueError
If X contains elements that are not Document objects.
"""
if isinstance(X, list):
check_type_all(X, Document)
n_total = len(X)
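
A hedged usage sketch. Under the common BM25+ convention (an assumption about sadedegel's internals, not verified here), each term contributes roughly idf(t) * (tf * (k1 + 1) / (tf + k1 * (1 - b + b * |d| / avgdl)) + delta), so delta places a lower bound on the score of any matching term:

    bm25 = BM25Vectorizer(k1=1.25, b=0.75, delta=1.0)  # delta=1.0 enables lower bounding
    X = bm25.transform(docs)  # docs: list of sadedegel Document objects (assumed)
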
@@ -238,28 +439,61 @@ def transform(self, X, y=None):


class PreTrainedVectorizer(BaseEstimator, TransformerMixin):
-    Doc = None
"""Pretrained vectorizer using transformer based embeddings.

Parameters
----------
model: str
Name of the HuggingFace pretrained model to use.
do_sents: bool, default=False
Whether to return per-sentence embeddings instead of per-document embeddings.
show_progress: bool, default=True
Whether to display a progress bar.

Methods
-------
transform()
Transforms a list of sadedegel documents into a matrix of pretrained transformer embeddings.
"""

    def __init__(self, model: str, do_sents = False, show_progress = True):
-    def __init__(self, model: str, do_sents = False, progress_tracking = True):
        super().__init__()
        self.model = model
        self.do_sents = do_sents
-        self.progress_tracking = progress_tracking
        self.show_progress = show_progress

def fit(self, X, y=None):
return self

def transform(self, X, y=None):
-        if PreTrainedVectorizer.Doc is None:
-            PreTrainedVectorizer.Doc = DocBuilder()
"""Takes list of sadedegel documents consists of sentences and returns matrix of features which is pretrained.

Parameters
----------
X: array-like
List of strings.
Returns
-------
csr_matrix: array-like
scipy.sparse.csr of shape (n_samples, n_features)
"""
if isinstance(X, list):
check_type_all(X, Document)
n_total = len(X)
else:
X1, X2, X = tee(X, 3)

check_type_all(X1, Document)
n_total = sum((1 for _ in X2))

vecs = []
n_total = 0
-        for text in tqdm(X, disable=not hasattr(self, 'progress_tracking') or not self.progress_tracking, unit="doc"):
-            d = PreTrainedVectorizer.Doc(text)
-            vecs.append(d.get_pretrained_embedding(architecture=self.model, do_sents=self.do_sents))
        for doc in track(X, description="Vectorizing document(s)", update_period=1,
                         disable=not self.show_progress):
            vecs.append(doc.get_pretrained_embedding(architecture=self.model, do_sents=self.do_sents))
            if self.do_sents:
-                n_total += len(d)
                n_total += len(doc)
else:
n_total += 1
vector_shape = vecs[0].shape[1]
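
A hedged usage sketch (the model name below is hypothetical; pass any architecture that sadedegel's get_pretrained_embedding accepts):

    ptv = PreTrainedVectorizer(model='bert_32k_cased', do_sents=True, show_progress=True)
    X = ptv.transform(docs)  # one row per sentence when do_sents=True, otherwise per document
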