From c1b2cd26bd19d0abb9388ee54fc5c700295347a3 Mon Sep 17 00:00:00 2001
From: Louis MARTIN
Date: Thu, 9 Jul 2020 02:53:50 -0700
Subject: [PATCH] Various optimizations for faster import time

---
 easse/cli.py                |  2 +-
 easse/quality_estimation.py | 50 +++++++++++++++++++++++--------------
 easse/report.py             |  4 ++-
 easse/utils/text.py         |  2 +-
 4 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/easse/cli.py b/easse/cli.py
index 810a8b7..0a7d2a2 100644
--- a/easse/cli.py
+++ b/easse/cli.py
@@ -150,7 +150,7 @@ def evaluate_system_output(
         metrics_scores["f1_token"] = corpus_f1_token(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
 
     if analysis:
-        from easse.annotation.word_level import corpus_analyse_operations  # Lazy inline import for performance
+        from easse.annotation.word_level import corpus_analyse_operations  # Lazy inline import for faster import time
         metrics_scores["word_level_analysis"] = corpus_analyse_operations(orig_sents, sys_sents, refs_sents,
                                                                           verbose=False, as_str=True)
 
diff --git a/easse/quality_estimation.py b/easse/quality_estimation.py
index f7134e1..6149616 100644
--- a/easse/quality_estimation.py
+++ b/easse/quality_estimation.py
@@ -1,9 +1,5 @@
 from typing import List
 
-from tseval.feature_extraction import (get_compression_ratio, count_sentence_splits, get_levenshtein_similarity,
-                                       is_exact_match, get_additions_proportion, get_deletions_proportion,
-                                       get_wordrank_score, wrap_single_sentence_vectorizer)
-
 from easse.utils.preprocessing import normalize
 
 
@@ -16,20 +12,36 @@ def get_average(vectorizer, orig_sentences, sys_sentences):
     return cumsum / count
 
 
-def corpus_quality_estimation(orig_sentences: List[str], sys_sentences: List[str],
-                              lowercase: bool = False, tokenizer: str = '13a'):
-    orig_sentences = [normalize(sent, lowercase, tokenizer) for sent in orig_sentences]
-    sys_sentences = [normalize(sent, lowercase, tokenizer) for sent in sys_sentences]
+def corpus_quality_estimation(orig_sentences: List[str],
+                              sys_sentences: List[str],
+                              lowercase: bool = False,
+                              tokenizer: str = '13a'):
+    orig_sentences = [
+        normalize(sent, lowercase, tokenizer) for sent in orig_sentences
+    ]
+    sys_sentences = [
+        normalize(sent, lowercase, tokenizer) for sent in sys_sentences
+    ]
+    # Lazy inline import for faster import time
+    from tseval.feature_extraction import (
+        get_compression_ratio, count_sentence_splits,
+        get_levenshtein_similarity, is_exact_match, get_additions_proportion,
+        get_deletions_proportion, get_wordrank_score,
+        wrap_single_sentence_vectorizer)
     return {
-        'Compression ratio': get_average(get_compression_ratio, orig_sentences, sys_sentences),
-        'Sentence splits': get_average(count_sentence_splits, orig_sentences, sys_sentences),
-        'Levenshtein similarity': get_average(get_levenshtein_similarity, orig_sentences, sys_sentences),
-        'Exact copies': get_average(is_exact_match, orig_sentences, sys_sentences),
-        'Additions proportion': get_average(get_additions_proportion, orig_sentences, sys_sentences),
-        'Deletions proportion': get_average(get_deletions_proportion, orig_sentences, sys_sentences),
-        'Lexical complexity score': get_average(
-            wrap_single_sentence_vectorizer(get_wordrank_score),
-            orig_sentences,
-            sys_sentences
-        ),
+        'Compression ratio':
+        get_average(get_compression_ratio, orig_sentences, sys_sentences),
+        'Sentence splits':
+        get_average(count_sentence_splits, orig_sentences, sys_sentences),
+        'Levenshtein similarity':
+        get_average(get_levenshtein_similarity, orig_sentences, sys_sentences),
+        'Exact copies':
+        get_average(is_exact_match, orig_sentences, sys_sentences),
+        'Additions proportion':
+        get_average(get_additions_proportion, orig_sentences, sys_sentences),
+        'Deletions proportion':
+        get_average(get_deletions_proportion, orig_sentences, sys_sentences),
+        'Lexical complexity score':
+        get_average(wrap_single_sentence_vectorizer(get_wordrank_score),
+                    orig_sentences, sys_sentences),
     }
diff --git a/easse/report.py b/easse/report.py
index 64a28d8..ba452de 100644
--- a/easse/report.py
+++ b/easse/report.py
@@ -6,7 +6,6 @@
 import pandas as pd
 import plotly.express as px
 from sacrebleu import corpus_bleu
-from tseval.feature_extraction import get_levenshtein_similarity, get_compression_ratio, count_sentences
 from yattag import Doc, indent
 
 from easse.fkgl import corpus_fkgl
@@ -17,6 +16,7 @@
 from easse.utils.helpers import add_dicts
 from easse.utils.text import to_words, count_words
 from easse.annotation.lcs import get_lcs
+from easse.utils.text import count_sentences
 
 
 def get_all_scores(
@@ -83,6 +83,7 @@ def get_random_html_id():
 
 
 def get_qualitative_examples_html(orig_sents, sys_sents, refs_sents):
+    from tseval.feature_extraction import get_levenshtein_similarity, get_compression_ratio  # Inline lazy import for performance
     title_key_print = [
         ('Randomly sampled simplifications', lambda c, s, refs: 0,
@@ -210,6 +211,7 @@ def get_plotly_histogram(orig_sents, sys_sents, ref_sents, feature_extractor, fe
 
 
 def get_plots_html(orig_sents, sys_sents, ref_sents):
+    from tseval.feature_extraction import get_levenshtein_similarity, get_compression_ratio  # Inline lazy import for performance
     doc = Doc()
     features = {
         'Compression ratio': get_compression_ratio,
diff --git a/easse/utils/text.py b/easse/utils/text.py
index e931044..988f348 100644
--- a/easse/utils/text.py
+++ b/easse/utils/text.py
@@ -11,7 +11,7 @@ def count_words(text):
 
 
 def to_sentences(text, language='english'):
-    import nltk  # Lazy inline import because NLTK takes ~1s to load
+    import nltk  # Lazy inline import for faster import time
     try:
         tokenizer = nltk.data.load(f'tokenizers/punkt/{language}.pickle')
     except LookupError:
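
Note (illustration only, not part of the patch): a minimal, self-contained sketch of the lazy inline import pattern applied throughout these changes. The `to_sentences` helper mirrors easse/utils/text.py; the `__main__` timing block and its sample sentences are assumed usage added here to show that the ~1s cost of importing nltk is paid on the first call rather than at module import time. It assumes nltk and its punkt model are already installed.

import time


def to_sentences(text, language='english'):
    # Lazy inline import for faster import time: nltk (~1s to load) is only
    # imported once sentence splitting is actually requested.
    import nltk
    tokenizer = nltk.data.load(f'tokenizers/punkt/{language}.pickle')
    return tokenizer.tokenize(text)


if __name__ == '__main__':
    start = time.perf_counter()
    print(to_sentences('EASSE evaluates sentence simplification. It reports several metrics.'))
    print(f'first call (pays the nltk import): {time.perf_counter() - start:.2f}s')

    start = time.perf_counter()
    print(to_sentences('Later calls hit the module cache. They are fast.'))
    print(f'second call: {time.perf_counter() - start:.2f}s')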