From a0dc465b128c7b5967560d3199b72a41639269a2 Mon Sep 17 00:00:00 2001 From: Viswanatha Reddy Gajjala Date: Sun, 27 Jun 2021 01:30:58 +0530 Subject: [PATCH] Added num2words --- transformations/num2words/README.md | 19 ++++++ transformations/num2words/__init__.py | 1 + transformations/num2words/test.json | 50 ++++++++++++++++ transformations/num2words/transformation.py | 66 +++++++++++++++++++++ 4 files changed, 136 insertions(+) create mode 100644 transformations/num2words/README.md create mode 100644 transformations/num2words/__init__.py create mode 100644 transformations/num2words/test.json create mode 100644 transformations/num2words/transformation.py diff --git a/transformations/num2words/README.md b/transformations/num2words/README.md new file mode 100644 index 000000000..6eac4c3fc --- /dev/null +++ b/transformations/num2words/README.md @@ -0,0 +1,19 @@ +# Numbers2Words Transformation 🦎 + ⌨️ → 🐍 +This transformation converts the numbers/floats in the given sentence/paragraph to word format. + +Author name: Viswanatha Reddy Gajjala +Author email: viswanatha.g15@iiits.in + +## What type of a transformation is this? +This transformation acts like a perturbation to test robustness. +Input: 2 times 2 is 4. +Output: two times two is four. + +## What tasks does it intend to benefit? +This perturbation would benefit all tasks which have a sentence/paragraph/document as input like text classification, +text generation, etc. + +This transformation can be used to augment the dataset that contains numerical values. It helps to analyze a model's performance on questions which require numerical understanding. + +## What are the limitations of this transformation? +The transformation's outputs are too simple to be used for data augmentation. Unlike a paraphraser, it is not capable of generating linguistically diverse text. 
\ No newline at end of file diff --git a/transformations/num2words/__init__.py b/transformations/num2words/__init__.py new file mode 100644 index 000000000..0a79241bb --- /dev/null +++ b/transformations/num2words/__init__.py @@ -0,0 +1 @@ +from .transformation import * \ No newline at end of file diff --git a/transformations/num2words/test.json b/transformations/num2words/test.json new file mode 100644 index 000000000..7720f2151 --- /dev/null +++ b/transformations/num2words/test.json @@ -0,0 +1,50 @@ +{ + "type": "num2words", + "test_cases": [ + { + "class": "num2words", + "inputs": { + "sentence": "He ate 0.25 of the pizza within the first 5 minutes." + }, + "outputs": [{ + "sentence": "He ate zero and 25/100 of the pizza within the first 5 minutes." + }] + }, + { + "class": "num2words", + "inputs": { + "sentence": "He has returned from his office." + }, + "outputs": [{ + "sentence": "He has returned from his office." + }] + }, + { + "class": "num2words", + "inputs": { + "sentence": "She has bought 100 apples." + }, + "outputs": [{ + "sentence": "She has bought one hundred apples." + }] + }, + { + "class": "num2words", + "inputs": { + "sentence": "2 times 2 is 4." + }, + "outputs": [{ + "sentence": "two times two is four." + }] + }, + { + "class": "num2words", + "inputs": { + "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization." + }, + "outputs": [{ + "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization." 
# NOTE: the collapsed patch line this module was recovered from also carried
# the closing brackets of transformations/num2words/test.json ("}] } ] }") and
# the diff header for transformations/num2words/transformation.py.
"""Perturbation that spells out the numbers found in an English sentence."""

import math
import re
from fractions import Fraction

import inflect
import spacy

from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType

# Entities containing any of these characters are left untouched
# (same character set as the original "[_]|[-]|[:]|[/]|[(]|[)]" pattern).
_UNSUPPORTED_CHARS = re.compile(r"[_\-:/()]")


class Numbers2Words:
    """Rewrite the CARDINAL entities of a text as words.

    Entities are located with the spaCy ``en_core_web_sm`` pipeline; the
    number-to-words conversion itself is delegated to ``inflect``.
    """

    nlp = None

    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")

    @staticmethod
    def int2words(n, p=inflect.engine()):
        """Return the integer ``n`` spelled out in words.

        ``p`` is the inflect engine to use. The default engine is created
        once, when the class body is evaluated, and is deliberately shared
        by every call.
        """
        return " ".join(p.number_to_words(n, wantlist=True, andword=" "))

    def float2words(self, float_value):
        """Spell out a number, expressing any fractional part in hundredths.

        ``"100"`` gives ``"one hundred"`` and ``"0.25"`` gives
        ``"zero and 25/100"``. The value is rounded to two decimals.

        Args:
            float_value: a number, or a string holding one.

        Returns:
            The spelled-out value.

        Raises:
            ValueError: if the value cannot be parsed or is not finite.
        """
        text = str(float_value).strip()
        # float() validates the syntax and lets us reject nan/inf (and
        # magnitudes beyond float range) before doing exact arithmetic.
        if not math.isfinite(float(text)):
            raise ValueError(
                f"cannot convert non-finite value to words: {float_value!r}"
            )

        # Work on the exact decimal value rather than on the repr of a
        # rounded float: the repr drops trailing zeros ("0.5" would yield
        # 5/100) and switches to exponent notation for large magnitudes.
        exact = Fraction(text)
        hundredths = round(abs(exact) * 100)  # exact, round-half-even
        whole, cents = divmod(hundredths, 100)

        words = self.int2words(whole)
        if exact < 0 and hundredths:
            words = "minus " + words
        if cents:
            words += f" and {cents}/100"
        return words

    def _entity_to_words(self, entity):
        """Return the words for a convertible entity, or ``None`` to skip it."""
        if entity.label_ != "CARDINAL" or _UNSUPPORTED_CHARS.search(entity.text):
            return None

        # Drop thousands separators ("1,000" -> "1000").
        candidate = entity.text.replace(",", "")
        if not (candidate.isdigit() or "." in candidate):
            return None

        try:
            return self.float2words(candidate)
        except ValueError:
            # Entity contains a "." but is not purely numeric
            # (e.g. "3.5 million"): leave the original text alone.
            return None

    def __call__(self, input_text: str) -> str:
        """Return ``input_text`` with its numeric CARDINAL entities spelled out.

        Each entity is replaced at its own character offsets, so converting
        "2" never touches the "2" inside "25" or "12".
        """
        doc = self.nlp(input_text)

        pieces = []
        cursor = 0
        for entity in doc.ents:
            words = self._entity_to_words(entity)
            if words is None:
                continue
            pieces.append(input_text[cursor:entity.start_char])
            pieces.append(words)
            cursor = entity.end_char
        pieces.append(input_text[cursor:])

        return "".join(pieces)


class Num2Words(SentenceOperation):
    """Sentence operation that spells out the numbers in a sentence."""

    tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION]
    languages = ["en"]

    def __init__(self, verbose=False):
        super().__init__(verbose=verbose)
        self.transform = Numbers2Words()

    def generate(self, sentence: str):
        """Return a one-element list holding the perturbed ``sentence``."""
        result = self.transform(sentence)
        if self.verbose:
            print(f"Perturbed Input from {self.name()} : {result}")
        return [result]


"""
# Sample code to demonstrate usage. Can also assist in adding test cases.
if __name__ == '__main__':
    Num2Words(verbose=True).generate('she has bought hundred apples.')
    Num2Words(verbose=True).generate('she has bought 100 apples.')
    Num2Words(verbose=True).generate('she has bought 100.55 apples.')
"""