
Commit

rename tokenizer class name.
kourgeorge committed Nov 28, 2022
1 parent b849b12 commit c61b03e
Showing 3 changed files with 19 additions and 15 deletions.
src/compcor/corpus_metrics.py: 22 changes (11 additions, 11 deletions)
@@ -17,10 +17,10 @@
from compcor.text_embedder import TextTokenizer, TextEmbedder
import compcor.utils as utils
from compcor.utils import Corpus, TCorpus
-from compcor.text_tokenizer_embedder import TextTokenizerEmbedder
+from compcor.text_tokenizer_embedder import STTokenizerEmbedder


-def ttest_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = TextTokenizerEmbedder()):
+def ttest_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = STTokenizerEmbedder()):
# calculate mean and covariance statistics

embeddings1, embeddings2 = utils.get_corpora_embeddings(corpus1, corpus2, model)
@@ -29,7 +29,7 @@ def ttest_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = TextT
return 1 - np.nanmean(res.pvalue)


-def IRPR_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = TextTokenizerEmbedder()):
+def IRPR_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = STTokenizerEmbedder()):
embeddings1, embeddings2 = utils.get_corpora_embeddings(corpus1, corpus2, model)

cosine = np.clip(cosine_similarity(embeddings1, embeddings2), -1, 1)
@@ -40,7 +40,7 @@ def IRPR_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = TextTo
return 2 * (precision * recall) / (precision + recall)


-def classifier_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = TextTokenizerEmbedder()):
+def classifier_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = STTokenizerEmbedder()):
# distance between corpora is the F1 score of a classifier trained to classify membership of a random sample of each
embeddings1, embeddings2 = utils.get_corpora_embeddings(corpus1, corpus2, model)

@@ -71,7 +71,7 @@ def classifier_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder =
return correct


-def medoid_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = TextTokenizerEmbedder()):
+def medoid_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = STTokenizerEmbedder()):
embeddings1, embeddings2 = utils.get_corpora_embeddings(corpus1, corpus2, model)

# calculate mean and covariance statistics
@@ -84,7 +84,7 @@ def medoid_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = Text
return cosine


-def fid_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = TextTokenizerEmbedder()):
+def fid_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = STTokenizerEmbedder()):
embeddings1, embeddings2 = utils.get_corpora_embeddings(corpus1, corpus2, model)
# TODO: needs a note explaining what the resulting calculation is. Is it an overlap/probability as approximated by Gaussian curve
# Note that the paper says FID is a F1 score but this is a different calculation (unless it is in effect an F1 score)
@@ -108,14 +108,14 @@ def fid_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = TextTok
return fid


-def mauve_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = TextTokenizerEmbedder()):
+def mauve_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = STTokenizerEmbedder()):
embeddings1, embeddings2 = utils.get_corpora_embeddings(corpus1, corpus2, model)

out = mauve.compute_mauve(p_features=embeddings1, q_features=embeddings2, device_id=0, verbose=False)
return 1 - out.mauve


-def pr_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = TextTokenizerEmbedder(), nearest_k=5):
+def pr_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = STTokenizerEmbedder(), nearest_k=5):
embeddings1, embeddings2 = utils.get_corpora_embeddings(corpus1, corpus2, model)

metric = compute_prdc(real_features=np.vstack(embeddings1),
@@ -127,7 +127,7 @@ def pr_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = TextToke
return 1 - 2 * (precision * recall) / (precision + recall)


-def dc_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = TextTokenizerEmbedder(), nearest_k=5):
+def dc_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = STTokenizerEmbedder(), nearest_k=5):
embeddings1, embeddings2 = utils.get_corpora_embeddings(corpus1, corpus2, model)

metric = compute_prdc(real_features=np.vstack(embeddings1),
@@ -140,7 +140,7 @@ def dc_distance(corpus1: Corpus, corpus2: Corpus, model: TextEmbedder = TextToke
return 1 - 2 * (density * coverage) / (density + coverage)


-def chi_square_distance(corpus1: TCorpus, corpus2: TCorpus, tokenizer: TextTokenizer = TextTokenizerEmbedder(),
+def chi_square_distance(corpus1: TCorpus, corpus2: TCorpus, tokenizer: TextTokenizer = STTokenizerEmbedder(),
top=5000):
# calculate p-value of chi-square test between frequency counts of top most frequent shared tokens between corpora
# note, does not normalize for the size of the corpora, so most common tokens may reflect more the larger corpus
@@ -173,7 +173,7 @@ def chi_square_distance(corpus1: TCorpus, corpus2: TCorpus, tokenizer: TextToken
return 1-scipy.stats.chi2.cdf(chi_stat, 2 * (len(common_words) - 1))


-def zipf_distance(corpus1: TCorpus, corpus2: TCorpus, tokenizer: TextTokenizer = TextTokenizerEmbedder()):
+def zipf_distance(corpus1: TCorpus, corpus2: TCorpus, tokenizer: TextTokenizer = STTokenizerEmbedder()):
tokens1, tokens2 = utils.get_corpora_tokens(corpus1, corpus2, tokenizer)

zipf1 = utils.zipf_coeff(tokens1)
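For orientation, a minimal usage sketch of what this rename means for callers of corpus_metrics: every distance function now constructs STTokenizerEmbedder() as its default model. The two toy corpora below are made up for illustration; only the import paths and function signatures come from the diff above.

import compcor.corpus_metrics as corpus_metrics
from compcor.text_tokenizer_embedder import STTokenizerEmbedder

# Two small illustrative corpora (plain lists of sentences).
setA = ["The cat sat on the mat.", "Dogs bark at strangers.", "It rained all day."]
setB = ["A cat rested on a rug.", "The dog barked loudly.", "Sunshine lasted all afternoon."]

# Passing the embedder explicitly is equivalent to relying on the new default argument.
distance = corpus_metrics.medoid_distance(corpus1=setA, corpus2=setB, model=STTokenizerEmbedder())
print("medoid_distance={}".format(distance))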
src/compcor/example.py: 10 changes (7 additions, 3 deletions)
@@ -1,5 +1,5 @@
import compcor.corpus_metrics as corpus_metrics
-from compcor.text_tokenizer_embedder import TextTokenizerEmbedder
+from compcor.text_tokenizer_embedder import STTokenizerEmbedder

### Example code

@@ -55,7 +55,9 @@
distance = corpus_metrics.zipf_distance(corpus1=setA, corpus2=setB)
print("zipf_distance={}".format(distance))

-embedder = TextTokenizerEmbedder()
+print("Comparing corpora on embedding data...")
+
+embedder = STTokenizerEmbedder(embedding_model_name="all-MiniLM-L12-v2")
embeddingA = embedder.embed_sentences(setA)
embeddingB = embedder.embed_sentences(setB)

@@ -77,7 +79,9 @@
distance = corpus_metrics.ttest_distance(corpus1=embeddingA, corpus2=embeddingB)
print("ttest_distance={}".format(distance))

-embedder = TextTokenizerEmbedder()
+
+
+embedder = STTokenizerEmbedder()
tokensA = embedder.tokenize_sentences(setA)
tokensB = embedder.tokenize_sentences(setB)

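For existing users of example.py-style code, the migration is only the class name; a before/after sketch follows. The sentence list is illustrative, and the default model name is taken from the class definition in the next file.

# Before this commit:
# from compcor.text_tokenizer_embedder import TextTokenizerEmbedder
# embedder = TextTokenizerEmbedder()

# After this commit:
from compcor.text_tokenizer_embedder import STTokenizerEmbedder

embedder = STTokenizerEmbedder()  # defaults to embedding_model_name="all-MiniLM-L6-v2"
sentences = ["An illustrative sentence.", "Another illustrative sentence."]
embeddings = embedder.embed_sentences(sentences)   # feeds the embedding-based metrics
tokens = embedder.tokenize_sentences(sentences)    # feeds chi_square_distance and zipf_distance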
src/compcor/text_tokenizer_embedder.py: 2 changes (1 addition, 1 deletion)
@@ -6,7 +6,7 @@
from sklearn import preprocessing


-class TextTokenizerEmbedder(TextEmbedder, TextTokenizer):
+class STTokenizerEmbedder(TextEmbedder, TextTokenizer):
def __init__(self, embedding_model_name="all-MiniLM-L6-v2"):
self.model_name = embedding_model_name
self.embedder: SentenceTransformer = None
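Only the class header changes in this file; the body is untouched by the commit. As a rough, hypothetical sketch of how such a SentenceTransformer wrapper is commonly structured: the lazy loading, the _load helper, and the method bodies below are assumptions for illustration, not the actual compcor implementation, and the real class also inherits from TextEmbedder and TextTokenizer.

from sentence_transformers import SentenceTransformer

class STTokenizerEmbedder:
    def __init__(self, embedding_model_name="all-MiniLM-L6-v2"):
        self.model_name = embedding_model_name
        self.embedder: SentenceTransformer = None  # assumed to be loaded lazily on first use

    def _load(self):
        # Hypothetical helper: load the sentence-transformers model once, on demand.
        if self.embedder is None:
            self.embedder = SentenceTransformer(self.model_name)

    def embed_sentences(self, sentences):
        self._load()
        return self.embedder.encode(sentences)

    def tokenize_sentences(self, sentences):
        self._load()
        return [self.embedder.tokenizer.tokenize(s) for s in sentences]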
