Skip to content

Commit

Permalink
Added French language (#58)
Browse files Browse the repository at this point in the history
* added support for french language

* Bump version to 0.7.2

* Update CHANGELOG.md

Co-authored-by: Baltazar Bieniek <[email protected]>
  • Loading branch information
bbieniek and Baltazar Bieniek authored Jun 3, 2021
1 parent b2445d8 commit a3f26f8
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 26 deletions.
14 changes: 12 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Changelog

## [Unreleased](https://github.com/nedap/deidentify/tree/HEAD)

[Full Changelog](https://github.com/nedap/deidentify/compare/v0.7.1...HEAD)

**Closed issues:**

- Surrogation output [\#56](https://github.com/nedap/deidentify/issues/56)
- Surrogate Evaluation Script [\#55](https://github.com/nedap/deidentify/issues/55)
- Surrogation Methods [\#54](https://github.com/nedap/deidentify/issues/54)

## [v0.7.1](https://github.com/nedap/deidentify/tree/v0.7.1) (2021-02-15)

[Full Changelog](https://github.com/nedap/deidentify/compare/v0.7.0...v0.7.1)
Expand All @@ -16,7 +26,7 @@

## [v0.7.0](https://github.com/nedap/deidentify/tree/v0.7.0) (2020-12-16)

[Full Changelog](https://github.com/nedap/deidentify/compare/model_bilstmcrf_ons_fast-v0.2.0...v0.7.0)
[Full Changelog](https://github.com/nedap/deidentify/compare/v0.6.1...v0.7.0)

**Merged pull requests:**

Expand Down Expand Up @@ -131,7 +141,7 @@

## [v0.3.0](https://github.com/nedap/deidentify/tree/v0.3.0) (2020-01-16)

[Full Changelog](https://github.com/nedap/deidentify/compare/model_crf_ons_tuned-v0.1.0...v0.3.0)
[Full Changelog](https://github.com/nedap/deidentify/compare/355d23ac80b64d277ae70daf8e9914c6671f2256...v0.3.0)

**Merged pull requests:**

Expand Down
2 changes: 1 addition & 1 deletion deidentify/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pathlib import Path

__version__ = "0.7.1"
__version__ = "0.7.2"

cache_root = Path(Path.home(), ".deidentify")
24 changes: 17 additions & 7 deletions deidentify/evaluation/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
# More info on the warning: https://github.com/explosion/spaCy/issues/5727
warnings.filterwarnings('ignore', message=r'.*W030.*')


def flatten(lists):
    """Flatten one level of nesting: [[a, b], [c]] -> [a, b, c]."""
    flat = []
    for sublist in lists:
        flat.extend(sublist)
    return flat

Expand All @@ -33,7 +34,8 @@ def __init__(self, gold: List[Document], predicted: List[Document], language='nl
self.gold = gold
self.predicted = predicted

self.tags = sorted(list(set(ann.tag for doc in gold for ann in doc.annotations)))
self.tags = sorted(
list(set(ann.tag for doc in gold for ann in doc.annotations)))

if language not in self.supported_languages():
logger.warning(
Expand All @@ -43,13 +45,16 @@ def __init__(self, gold: List[Document], predicted: List[Document], language='nl
if language == 'nl':
from deidentify.tokenizer.tokenizer_ons import TokenizerOns
self.tokenizer = TokenizerOns(disable=('tagger', 'parser', 'ner'))
if language == 'fr':
from deidentify.tokenizer.tokenizer_fr import TokenizerFR
self.tokenizer = TokenizerFR(disable=('tagger', 'parser', 'ner'))
else:
from deidentify.tokenizer.tokenizer_en import TokenizerEN
self.tokenizer = TokenizerEN(disable=('tagger', 'parser', 'ner'))

@staticmethod
def supported_languages():
    """Return the language codes this evaluator can tokenize for.

    Fix: diff residue left both the old ``return ('nl', 'en')`` and the
    new ``return ('nl', 'en', 'fr')`` in the body; the second return was
    unreachable. Keep only the current tuple (French was added in this
    change alongside TokenizerFR).
    """
    return ('nl', 'en', 'fr')

def entity_level(self):
metric = Metric('entity level')
Expand Down Expand Up @@ -78,11 +83,13 @@ def token_level(self):
metric = Metric('token level')

tags_gold = flatten(self.token_annotations(doc) for doc in self.gold)
tags_pred = flatten(self.token_annotations(doc) for doc in self.predicted)
tags_pred = flatten(self.token_annotations(doc)
for doc in self.predicted)

cm = confusion_matrix(tags_gold, tags_pred, labels=self.tags + ['O'])

row_sum, col_sum, cm_sum = np.sum(cm, axis=0), np.sum(cm, axis=1), np.sum(cm)
row_sum, col_sum, cm_sum = np.sum(
cm, axis=0), np.sum(cm, axis=1), np.sum(cm)
for i, tag in enumerate(self.tags):
tp = cm[i, i]
fp = row_sum[i] - cm[i, i]
Expand All @@ -99,8 +106,10 @@ def token_level(self):
def token_level_blind(self):
metric = Metric('token (blind)')

tags_gold = flatten(self.token_annotations(doc, tag_blind=True) for doc in self.gold)
tags_pred = flatten(self.token_annotations(doc, tag_blind=True) for doc in self.predicted)
tags_gold = flatten(self.token_annotations(
doc, tag_blind=True) for doc in self.gold)
tags_pred = flatten(self.token_annotations(
doc, tag_blind=True) for doc in self.predicted)
# convert labels: ENT => 1, else => 0
tags_gold = list(map(lambda tag: int(tag == ENTITY_TAG), tags_gold))
tags_pred = list(map(lambda tag: int(tag == ENTITY_TAG), tags_pred))
Expand All @@ -116,7 +125,8 @@ def token_level_blind(self):

def token_annotations(self, doc, tag_blind=False, entity_tag=ENTITY_TAG):
parsed = self.tokenizer.parse_text(doc.text)
entities = [(int(ann.start), int(ann.end), ann.tag) for ann in doc.annotations]
entities = [(int(ann.start), int(ann.end), ann.tag)
for ann in doc.annotations]
biluo_tags = biluo_tags_from_offsets(parsed, entities)

tags = []
Expand Down
31 changes: 22 additions & 9 deletions deidentify/methods/bilstmcrf/run_bilstmcrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ def get_embeddings(corpus_name: str,
word_embeddings = 'nl'
contextual_forward = 'nl-forward'
contextual_backward = 'nl-backward'
elif language == 'fr':
logger.info('Use French embeddings')
word_embeddings = 'fr'
contextual_forward = 'fr-forward'
contextual_backward = 'fr-backward'
else:
logger.info('Use English embeddings')
word_embeddings = 'glove'
Expand Down Expand Up @@ -92,7 +97,8 @@ def get_model(corpus: flair.data.Corpus,
contextual_backward_path=contextual_backward_path
)

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
embeddings: StackedEmbeddings = StackedEmbeddings(
embeddings=embedding_types)
tagger: SequenceTagger = SequenceTagger(hidden_size=256,
embeddings=embeddings,
tag_dictionary=tag_dictionary,
Expand All @@ -112,7 +118,8 @@ def main(args):
logger.info('Get sentences...')
train_sents, train_docs = flair_utils.standoff_to_flair_sents(corpus.train, tokenizer,
verbose=True)
dev_sents, dev_docs = flair_utils.standoff_to_flair_sents(corpus.dev, tokenizer, verbose=True)
dev_sents, dev_docs = flair_utils.standoff_to_flair_sents(
corpus.dev, tokenizer, verbose=True)
test_sents, test_docs = flair_utils.standoff_to_flair_sents(corpus.test, tokenizer,
verbose=True)

Expand Down Expand Up @@ -146,7 +153,8 @@ def main(args):
if not args.train_with_dev:
# Model performance is judged by dev data, so we also pick the best performing model
# according to the dev score to make our final predictions.
tagger = SequenceTagger.load(join(model_dir, 'flair', 'best-model.pt'))
tagger = SequenceTagger.load(
join(model_dir, 'flair', 'best-model.pt'))
else:
# Training is stopped if train loss converges - here, we do not have a "best model" and
# use the final model to make predictions.
Expand All @@ -156,18 +164,22 @@ def main(args):
make_predictions(tagger, flair_corpus)

train_utils.save_predictions(corpus_name=corpus.name, run_id=args.run_id,
train=flair_utils.flair_sents_to_standoff(train_sents, train_docs),
dev=flair_utils.flair_sents_to_standoff(dev_sents, dev_docs),
train=flair_utils.flair_sents_to_standoff(
train_sents, train_docs),
dev=flair_utils.flair_sents_to_standoff(
dev_sents, dev_docs),
test=flair_utils.flair_sents_to_standoff(test_sents, test_docs))


def arg_parser():
parser = argparse.ArgumentParser()
parser.add_argument("corpus", choices=CORPUS_PATH.keys(), help="Corpus identifier.")
parser.add_argument("corpus", choices=CORPUS_PATH.keys(),
help="Corpus identifier.")
parser.add_argument("run_id", help="Run identifier")
parser.add_argument("--train_with_dev", help="Use dev set during training",
action='store_true')
parser.add_argument("--model_file", help="Load existing model instead of training new.")
parser.add_argument(
"--model_file", help="Load existing model instead of training new.")
parser.add_argument("--pooled_contextual_embeddings",
help="Boolean flag whether to use pooled variant of FlairEmbeddings.",
action='store_true')
Expand All @@ -176,7 +188,7 @@ def arg_parser():
parser.add_argument("--contextual_backward_path",
help="Path to contextual string embeddings (backward)")
parser.add_argument("--embedding_lang",
choices=['en', 'nl'],
choices=['en', 'nl', 'fr'],
help="Specify language of embeddings.")
parser.add_argument("--fine_tune",
help="Fine tune an existing model (has to be passed with --model_file)",
Expand All @@ -187,5 +199,6 @@ def arg_parser():
if __name__ == '__main__':
ARGS = arg_parser()
ARGS.run_id = 'bilstmcrf_' + ARGS.run_id
logger.add(join(train_utils.model_dir(ARGS.corpus, ARGS.run_id), 'training.log'))
logger.add(join(train_utils.model_dir(
ARGS.corpus, ARGS.run_id), 'training.log'))
main(ARGS)
19 changes: 12 additions & 7 deletions deidentify/methods/bilstmcrf/run_bilstmcrf_training_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,21 @@ def main(args, model_dir):
logger.info('Loaded corpus: {}'.format(corpus))

logger.info('Get sentences...')
train_sents, _ = flair_utils.standoff_to_flair_sents(corpus.train, tokenizer, verbose=True)
dev_sents, _ = flair_utils.standoff_to_flair_sents(corpus.dev, tokenizer, verbose=True)
train_sents, _ = flair_utils.standoff_to_flair_sents(
corpus.train, tokenizer, verbose=True)
dev_sents, _ = flair_utils.standoff_to_flair_sents(
corpus.dev, tokenizer, verbose=True)
test_sents, test_docs = flair_utils.standoff_to_flair_sents(corpus.test,
tokenizer, verbose=True)

train_sents = train_sents + dev_sents
train_sents_filtered = list(filter(lambda sent: not _ignore_sentence(sent), train_sents))
train_sents_filtered = list(
filter(lambda sent: not _ignore_sentence(sent), train_sents))

sample_size = int(len(train_sents_filtered) * args.train_sample_frac)
rs = RandomState(seed=args.random_seed)
train_sents_sample = rs.choice(train_sents_filtered, replace=False, size=sample_size).tolist()
train_sents_sample = rs.choice(
train_sents_filtered, replace=False, size=sample_size).tolist()
logger.info('Train with fraction of training data: {} sents out of {} sentences ({}%)',
sample_size, len(train_sents_filtered), args.train_sample_frac)

Expand Down Expand Up @@ -78,7 +82,8 @@ def main(args, model_dir):

def arg_parser():
parser = argparse.ArgumentParser()
parser.add_argument("corpus", choices=CORPUS_PATH.keys(), help="Corpus identifier.")
parser.add_argument("corpus", choices=CORPUS_PATH.keys(),
help="Corpus identifier.")
parser.add_argument("run_id", help="Run identifier")
parser.add_argument("--train_sample_frac",
help="Fraction of the training data to use.",
Expand All @@ -92,8 +97,8 @@ def arg_parser():
help="If passed, the final model is saved.",
action='store_true')
parser.add_argument("--embedding_lang",
choices = ['en','nl'],
help="Specify language of embeddings.")
choices=['en', 'nl', 'fr'],
help="Specify language of embeddings.")
return parser.parse_args()


Expand Down
11 changes: 11 additions & 0 deletions deidentify/tokenizer/tokenizer_fr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""French tokenizer for deidentify, backed by a spaCy pipeline."""
import spacy

from deidentify.tokenizer import Tokenizer

# Shared module-level pipeline: loaded once at import time and reused by
# every TokenizerFR instance.
# NOTE(review): requires the 'fr_core_news_sm' spaCy model to be installed;
# import of this module fails otherwise.
NLP = spacy.load('fr_core_news_sm')


class TokenizerFR(Tokenizer):
    """Tokenizer for French text (mirrors the Dutch/English tokenizers)."""

    def parse_text(self, text: str) -> spacy.tokens.doc.Doc:
        """Run the shared spaCy pipeline on ``text`` and return the parsed Doc."""
        return NLP(text)
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ dependencies:
- spacy==2.3.5
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz#egg=en_core_web_sm==2.3.1
- https://github.com/explosion/spacy-models/releases/download/nl_core_news_sm-2.3.0/nl_core_news_sm-2.3.0.tar.gz#egg=nl_core_news_sm==2.3.0
- https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz#egg=fr_core_news_sm==2.3.0
- deduce==1.0.2
- py-dateinfer==0.4.5
- loguru==0.5.3
Expand Down

0 comments on commit a3f26f8

Please sign in to comment.