Various optimizations for faster import time
louismartin committed Sep 2, 2020
1 parent 28b8698 commit c1b2cd2
Showing 4 changed files with 36 additions and 22 deletions.
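Every change in this commit applies the same technique: heavy third-party imports (tseval, nltk) are moved from module level into the functions that need them, so `import easse` no longer pays their cost up front. A quick way to check the effect, assuming easse is installed locally (this snippet is illustrative and not part of the commit; CPython's `python -X importtime` flag gives a finer per-module breakdown):

    import time

    # Run in a fresh interpreter so sys.modules does not already hold the
    # package; the measured time is the cold import cost this commit reduces.
    start = time.perf_counter()
    import easse.cli  # noqa: E402  (imported after timing starts on purpose)
    print(f"import easse.cli: {time.perf_counter() - start:.2f}s")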
2 changes: 1 addition & 1 deletion easse/cli.py
@@ -150,7 +150,7 @@ def evaluate_system_output(
         metrics_scores["f1_token"] = corpus_f1_token(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
 
     if analysis:
-        from easse.annotation.word_level import corpus_analyse_operations  # Lazy inline import for performance
+        from easse.annotation.word_level import corpus_analyse_operations  # Lazy inline import for faster import time
         metrics_scores["word_level_analysis"] = corpus_analyse_operations(orig_sents, sys_sents, refs_sents,
                                                                           verbose=False, as_str=True)
 
50 changes: 31 additions & 19 deletions easse/quality_estimation.py
@@ -1,9 +1,5 @@
 from typing import List
 
-from tseval.feature_extraction import (get_compression_ratio, count_sentence_splits, get_levenshtein_similarity,
-                                       is_exact_match, get_additions_proportion, get_deletions_proportion,
-                                       get_wordrank_score, wrap_single_sentence_vectorizer)
-
 from easse.utils.preprocessing import normalize
 

@@ -16,20 +12,36 @@ def get_average(vectorizer, orig_sentences, sys_sentences):
     return cumsum / count
 
 
-def corpus_quality_estimation(orig_sentences: List[str], sys_sentences: List[str],
-                              lowercase: bool = False, tokenizer: str = '13a'):
-    orig_sentences = [normalize(sent, lowercase, tokenizer) for sent in orig_sentences]
-    sys_sentences = [normalize(sent, lowercase, tokenizer) for sent in sys_sentences]
+def corpus_quality_estimation(orig_sentences: List[str],
+                              sys_sentences: List[str],
+                              lowercase: bool = False,
+                              tokenizer: str = '13a'):
+    orig_sentences = [
+        normalize(sent, lowercase, tokenizer) for sent in orig_sentences
+    ]
+    sys_sentences = [
+        normalize(sent, lowercase, tokenizer) for sent in sys_sentences
+    ]
+    # Lazy inline import for faster import time
+    from tseval.feature_extraction import (
+        get_compression_ratio, count_sentence_splits,
+        get_levenshtein_similarity, is_exact_match, get_additions_proportion,
+        get_deletions_proportion, get_wordrank_score,
+        wrap_single_sentence_vectorizer)
     return {
-        'Compression ratio': get_average(get_compression_ratio, orig_sentences, sys_sentences),
-        'Sentence splits': get_average(count_sentence_splits, orig_sentences, sys_sentences),
-        'Levenshtein similarity': get_average(get_levenshtein_similarity, orig_sentences, sys_sentences),
-        'Exact copies': get_average(is_exact_match, orig_sentences, sys_sentences),
-        'Additions proportion': get_average(get_additions_proportion, orig_sentences, sys_sentences),
-        'Deletions proportion': get_average(get_deletions_proportion, orig_sentences, sys_sentences),
-        'Lexical complexity score': get_average(
-            wrap_single_sentence_vectorizer(get_wordrank_score),
-            orig_sentences,
-            sys_sentences
-        ),
+        'Compression ratio':
+            get_average(get_compression_ratio, orig_sentences, sys_sentences),
+        'Sentence splits':
+            get_average(count_sentence_splits, orig_sentences, sys_sentences),
+        'Levenshtein similarity':
+            get_average(get_levenshtein_similarity, orig_sentences, sys_sentences),
+        'Exact copies':
+            get_average(is_exact_match, orig_sentences, sys_sentences),
+        'Additions proportion':
+            get_average(get_additions_proportion, orig_sentences, sys_sentences),
+        'Deletions proportion':
+            get_average(get_deletions_proportion, orig_sentences, sys_sentences),
+        'Lexical complexity score':
+            get_average(wrap_single_sentence_vectorizer(get_wordrank_score),
+                        orig_sentences, sys_sentences),
     }
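With the tseval import now inside corpus_quality_estimation, only the first call pays the import cost; Python caches loaded modules in sys.modules, so the inline `from tseval.feature_extraction import ...` is a cheap cache hit on every later call. A minimal sketch of that caching behaviour, using the standard-library json module as a stand-in for a heavy dependency so the snippet runs anywhere:

    import sys
    import time

    def lazy_feature():
        import json  # inline import: resolved from sys.modules after the first call
        return json.dumps({'ok': True})

    t0 = time.perf_counter()
    lazy_feature()   # first call may trigger the real import
    t1 = time.perf_counter()
    lazy_feature()   # later calls hit the module cache
    t2 = time.perf_counter()
    assert 'json' in sys.modules  # the module stays cached process-wide
    print(f"first: {t1 - t0:.6f}s, second: {t2 - t1:.6f}s")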
4 changes: 3 additions & 1 deletion easse/report.py
@@ -6,7 +6,6 @@
 import pandas as pd
 import plotly.express as px
 from sacrebleu import corpus_bleu
-from tseval.feature_extraction import get_levenshtein_similarity, get_compression_ratio, count_sentences
 from yattag import Doc, indent
 
 from easse.fkgl import corpus_fkgl
@@ -17,6 +16,7 @@
 from easse.utils.helpers import add_dicts
 from easse.utils.text import to_words, count_words
 from easse.annotation.lcs import get_lcs
+from easse.utils.text import count_sentences
 
 
 def get_all_scores(
@@ -83,6 +83,7 @@ def get_random_html_id():
 
 
 def get_qualitative_examples_html(orig_sents, sys_sents, refs_sents):
+    from tseval.feature_extraction import get_levenshtein_similarity, get_compression_ratio  # Inline lazy import for performance
     title_key_print = [
         ('Randomly sampled simplifications',
          lambda c, s, refs: 0,
@@ -210,6 +211,7 @@ def get_plotly_histogram(orig_sents, sys_sents, ref_sents, feature_extractor, fe
 
 
 def get_plots_html(orig_sents, sys_sents, ref_sents):
+    from tseval.feature_extraction import get_levenshtein_similarity, get_compression_ratio  # Inline lazy import for performance
     doc = Doc()
     features = {
         'Compression ratio': get_compression_ratio,
2 changes: 1 addition & 1 deletion easse/utils/text.py
@@ -11,7 +11,7 @@ def count_words(text):
 
 
 def to_sentences(text, language='english'):
-    import nltk  # Lazy inline import because NLTK takes ~1s to load
+    import nltk  # Lazy inline import for faster import time
     try:
         tokenizer = nltk.data.load(f'tokenizers/punkt/{language}.pickle')
     except LookupError:
except LookupError:
Expand Down
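The `except LookupError:` branch of to_sentences is collapsed in the diff above. A typical completion of this pattern (shown here as a hypothetical sketch, not the committed code) downloads the missing punkt model once and retries, so neither NLTK nor its data is touched at import time:

    def to_sentences(text, language='english'):
        import nltk  # lazy: keeps `import easse` from loading NLTK

        try:
            tokenizer = nltk.data.load(f'tokenizers/punkt/{language}.pickle')
        except LookupError:
            nltk.download('punkt')  # hypothetical handler: fetch the model, then retry
            tokenizer = nltk.data.load(f'tokenizers/punkt/{language}.pickle')
        return tokenizer.tokenize(text)  # punkt tokenizers expose .tokenize()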
