def package_name(name: str, identifier: str) -> Optional[Dict]:
    """Return the dataset entry for *name*, tagged with its display form.

    The name is looked up through the module-level ``nd`` dataset and the
    sub-entry stored under *identifier* is selected. When that entry exists,
    the title-cased name is stored in it under the ``'name'`` key.

    Args:
        name: Name to look up, or ``None`` when no name was extracted.
        identifier: Key of the search result to return (the callers in this
            module pass ``'first_name'`` or ``'last_name'``).

    Returns:
        The selected entry with an added ``'name'`` key, or ``None`` when
        *name* is ``None`` or the dataset has no entry under *identifier*.
    """
    if name is None:
        return None
    entry = nd.search(name)[identifier]
    if entry is None:
        return None
    # The entry returned by the dataset is annotated in place.
    entry['name'] = name.title()
    return entry
def try_to_split_with_two_last_names(nd: NameDataset, email: str):
    """Split an address made of one first name followed or preceded by two last names.

    ``extract_names_from_email`` is run on every proper prefix of *email* and
    the name it reports most often becomes the first candidate. That
    candidate is removed from *email* and the remainder is split once more to
    obtain the other two candidates. The three pairwise calls to
    ``_infer_first_and_last_names`` then vote on which candidate is the first
    name; the two remaining candidates are returned as last names, ordered by
    where they occur in *email*.

    Args:
        nd: Dataset passed through to the helper functions.
        email: Address (or bare local part) to analyse.

    Returns:
        A ``(first_name, last_name1, last_name2)`` tuple, or
        ``(None, None, None)`` when three distinct names cannot be found.
    """
    not_found = (None, None, None)

    votes = Counter()
    for end in range(1, len(email)):
        first_name, last_name = extract_names_from_email(nd, email[:end])
        if first_name is not None:
            votes[first_name] += 1
        if last_name is not None:
            votes[last_name] += 1
    if not votes:
        return not_found

    candidate1 = votes.most_common(1)[0][0]
    candidate2, candidate3 = extract_names_from_email(nd, email.replace(candidate1, ''))
    # The remainder may yield fewer than two names, or repeat one we already
    # have; without three distinct candidates there is nothing to arbitrate.
    if candidate2 is None or candidate3 is None:
        return not_found
    candidates = (candidate1, candidate2, candidate3)
    if len(set(candidates)) < 3:
        return not_found

    fn1, _ = _infer_first_and_last_names(candidate1, candidate2, nd)
    fn2, _ = _infer_first_and_last_names(candidate1, candidate3, nd)
    fn3, _ = _infer_first_and_last_names(candidate2, candidate3, nd)
    real_first_name = Counter([fn1, fn2, fn3]).most_common(1)[0][0]

    # Keep candidate order (instead of going through a set) so the result is
    # deterministic, then order the two last names by position in the address.
    last_names = [c for c in candidates if c != real_first_name]
    # Candidates are lower-cased by extract_names_from_email, so compare
    # against a lower-cased address; find() does not raise when a candidate
    # is not a literal substring of the address.
    haystack = email.lower()
    last_name1, last_name2 = sorted(last_names, key=haystack.find)
    return real_first_name, last_name1, last_name2
None and fn_2 is not None and ln_2 is not None: - score_1 = _compute_score(fn_1) + _compute_score(ln_1) - score_2 = _compute_score(fn_2) + _compute_score(ln_2) - if score_2 > score_1: - first_name, last_name = last_name, first_name + first_name, last_name = _infer_first_and_last_names(first_name, last_name, nd) if first_name is not None: first_name = first_name.lower() @@ -133,3 +153,16 @@ def extract_names_from_email(nd: NameDataset, email: str): last_name = last_name.lower() return first_name, last_name + + +def _infer_first_and_last_names(first_name, last_name, nd): + fn_1 = nd.search(first_name)['first_name'] + ln_1 = nd.search(last_name)['last_name'] + fn_2 = nd.search(first_name)['last_name'] + ln_2 = nd.search(last_name)['first_name'] + if fn_1 is not None and ln_1 is not None and fn_2 is not None and ln_2 is not None: + score_1 = _compute_score(fn_1) + _compute_score(ln_1) + score_2 = _compute_score(fn_2) + _compute_score(ln_2) + if score_2 > score_1: + first_name, last_name = last_name, first_name + return first_name, last_name diff --git a/tests/test_from_emails.py b/tests/test_from_emails.py index 785624f..77a39aa 100644 --- a/tests/test_from_emails.py +++ b/tests/test_from_emails.py @@ -1,12 +1,205 @@ import unittest from names_dataset import NameDataset -from names_dataset.emails import extract_names_from_email +from names_dataset.emails import extract_names_from_email, try_to_split_with_two_last_names class TestEmail(unittest.TestCase): - def test_1(self): + def test_with_three_3(self): + inputs = [ + 'perezmartiisabel', + 'isabelmartiperez', + 'martiperezisabel', + 'isabelperezmarti', + + 'garciafernandezmaria', + 'mariafernandezgarcia', + + 'gonzalezlopezana', + 'analopezgonzalez', + + 'rodriguezhernandezjuan', + 'juanhernandezrodriguez', + + 'suarezdominguezcarlos', + 'carlosdominguezsuarez', + + 'sanchezruizlucia', + 'luciaruizsanchez', + + 'gomeznunezmiguel', + 'miguelnunezgomez', + + ] + + outputs = [ + ['isabel', 'perez', 'marti'], + ['isabel', 
def test_with_three_2(self):
    """Check three-name splitting on every ordering of first name and last names.

    Each case pairs a concatenated local part with the expected
    ``(first_name, last_name, last_name2)`` result. Keeping input and
    expectation together prevents the two from drifting out of step, and
    ``subTest`` lets every case report independently.
    """
    cases = [
        ('torresmoralesines', ('ines', 'torres', 'morales')),

        ('perezmartiisabel', ('isabel', 'perez', 'marti')),
        ('isabelmartiperez', ('isabel', 'marti', 'perez')),
        ('martiperezisabel', ('isabel', 'marti', 'perez')),
        ('isabelperezmarti', ('isabel', 'perez', 'marti')),

        ('garciafernandezmaria', ('maria', 'garcia', 'fernandez')),
        ('mariafernandezgarcia', ('maria', 'fernandez', 'garcia')),
        ('fernandezgarciamaria', ('maria', 'fernandez', 'garcia')),
        ('mariagarciafernandez', ('maria', 'garcia', 'fernandez')),

        ('gonzalezlopezana', ('ana', 'gonzalez', 'lopez')),
        ('analopezgonzalez', ('ana', 'lopez', 'gonzalez')),
        ('lopezgonzalezana', ('ana', 'lopez', 'gonzalez')),
        ('anagonzalezlopez', ('ana', 'gonzalez', 'lopez')),

        ('rodriguezhernandezjuan', ('juan', 'rodriguez', 'hernandez')),
        ('juanhernandezrodriguez', ('juan', 'hernandez', 'rodriguez')),
        ('hernandezrodriguezjuan', ('juan', 'hernandez', 'rodriguez')),
        ('juanrodriguezhernandez', ('juan', 'rodriguez', 'hernandez')),

        ('suarezdominguezcarlos', ('carlos', 'suarez', 'dominguez')),
        ('carlosdominguezsuarez', ('carlos', 'dominguez', 'suarez')),
        ('dominguezsuarezcarlos', ('carlos', 'dominguez', 'suarez')),
        ('carlossuarezdominguez', ('carlos', 'suarez', 'dominguez')),

        ('sanchezruizlucia', ('lucia', 'sanchez', 'ruiz')),
        ('luciaruizsanchez', ('lucia', 'ruiz', 'sanchez')),
        ('ruizsanchezlucia', ('lucia', 'ruiz', 'sanchez')),
        ('luciasanchezruiz', ('lucia', 'sanchez', 'ruiz')),

        ('gomeznunezmiguel', ('miguel', 'gomez', 'nunez')),
        ('miguelnunezgomez', ('miguel', 'nunez', 'gomez')),
        ('nunezgomezmiguel', ('miguel', 'nunez', 'gomez')),
        ('miguelgomeznunez', ('miguel', 'gomez', 'nunez')),

        ('moralestorresines', ('ines', 'morales', 'torres')),
        ('inestorresmorales', ('ines', 'torres', 'morales')),
    ]

    nd = NameDataset()
    for local_part, expected in cases:
        with self.subTest(email=local_part):
            actual = tuple(try_to_split_with_two_last_names(nd, local_part))
            self.assertEqual(expected, actual)
self.assertEqual(output_[0], first_name) + self.assertEqual(output_[1], last_name) + self.assertEqual(output_[2], last_name2) + print('[OK]') + + def test_with_two(self): inputs = [ 'info@skysense.jp', 'isabelle.remy.fr@gmail.com', @@ -28,6 +221,7 @@ def test_1(self): 'remy_j@example.com', 'j_remy123@example.com', 'philippe.remy1@example.com', + 'perezmarti', ] inputs2 = [] for i in inputs: @@ -54,6 +248,7 @@ def test_1(self): ['remy', None], [None, 'remy'], ['philippe', 'remy'], + ['perez', 'marti'], ] nd = NameDataset()