From 6d79fee8c64e08ac95da0065bc6f050510803390 Mon Sep 17 00:00:00 2001
From: philipperemy <premy.enseirb@gmail.com>
Date: Wed, 9 Oct 2024 21:39:02 +0900
Subject: [PATCH] emails: split names that contain two last names

---
 api/server.py             |  35 ++++---
 names_dataset/emails.py   |  55 ++++++++---
 tests/test_from_emails.py | 199 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 261 insertions(+), 28 deletions(-)

diff --git a/api/server.py b/api/server.py
index 2ed212b..65d69a3 100644
--- a/api/server.py
+++ b/api/server.py
@@ -1,14 +1,14 @@
 import json
 import logging
 import sys
-from typing import Union
+from typing import Union, Optional, Dict
 
 from flask import Flask, request
 from paste.translogger import TransLogger
 from waitress import serve
 
 from names_dataset import NameDataset, NameWrapper
-from names_dataset.emails import extract_names_from_email
+from names_dataset.emails import extract_names_from_email, try_to_split_with_two_last_names
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(
@@ -47,6 +47,16 @@ def str2bool(s: Union[bool, str]) -> bool:
         return False
 
 
+def package_name(name: str, identifier: str) -> Optional[Dict]:
+    """Look up `name` under `identifier` and tag the hit with its title-cased form."""
+    if name is None:
+        return None
+    result = nd.search(name)[identifier]
+    if result is not None:
+        result['name'] = name.title()
+    return result
+
+
 @app.route('/split', methods=['GET'])
 def split():
     try:
@@ -59,21 +69,16 @@ def split():
             )
         else:
             first_name, last_name = extract_names_from_email(nd, q)
-            if first_name is not None:
-                result_first_name = nd.search(first_name)['first_name']
-                if result_first_name is not None:
-                    result_first_name['name'] = first_name
-            else:
-                result_first_name = None
-            if last_name is not None:
-                result_last_name = nd.search(last_name)['last_name']
-                if result_last_name is not None:
-                    result_last_name['name'] = last_name
-            else:
-                result_last_name = None
+            last_name2 = None
+            if first_name is None or last_name is None:
+                first_name, last_name, last_name2 = try_to_split_with_two_last_names(nd, q)
+            result_first_name = package_name(first_name, 'first_name')
+            result_last_name = package_name(last_name, 'last_name')
+            result_last_name2 = package_name(last_name2, 'last_name')
             result = {
                 'first_name': result_first_name,
-                'last_name': result_last_name
+                'last_name': result_last_name,
+                'last_name2': result_last_name2
             }
             return generate_output({'result': result}, status=True)
     except Exception as e:
diff --git a/names_dataset/emails.py b/names_dataset/emails.py
index e42aa0d..0955850 100644
--- a/names_dataset/emails.py
+++ b/names_dataset/emails.py
@@ -1,4 +1,5 @@
 import re
+from collections import Counter
 from typing import Dict
 
 import numpy as np
@@ -60,10 +61,37 @@ def _general_score(nd: NameDataset, candidate: str):
     return float('-inf')
 
 
+def try_to_split_with_two_last_names(nd: NameDataset, email: str):
+    """Split a concatenated name into (first_name, last_name1, last_name2).
+
+    Last names are returned in the order they appear in `email`; the result is
+    (None, None, None) when three distinct names cannot be identified.
+    """
+    c = Counter()
+    for i in range(1, len(email)):
+        # Vote for every name recognised in each proper prefix of the input.
+        c.update(n for n in extract_names_from_email(nd, email[0:i]) if n is not None)
+    most_common = c.most_common(1)
+    if not most_common:
+        return None, None, None
+    candidate1 = most_common[0][0]
+    candidate2, candidate3 = extract_names_from_email(nd, email.replace(candidate1, ''))
+    candidates = {candidate1, candidate2, candidate3}
+    # A missing or repeated candidate used to end in IndexError/TypeError below
+    # (last_names[1], email.index(None)); report "no split" instead.
+    if None in candidates or len(candidates) < 3:
+        return None, None, None
+
+    fn1, _ = _infer_first_and_last_names(candidate1, candidate2, nd)
+    fn2, _ = _infer_first_and_last_names(candidate1, candidate3, nd)
+    fn3, _ = _infer_first_and_last_names(candidate2, candidate3, nd)
+    real_first_name = Counter([fn1, fn2, fn3]).most_common(1)[0][0]
+    last_name1, last_name2 = sorted(candidates - {real_first_name}, key=email.index)
+    return real_first_name, last_name1, last_name2
+
+
 def extract_names_from_email(nd: NameDataset, email: str):
     email = email.strip()
-    if '' in email:
-        email = email.split(' ')[0]
     if '@' not in email:
         email += '@gmail.com'
 
@@ -116,15 +144,7 @@ def extract_names_from_email(nd: NameDataset, email: str):
         last_name = None
 
     if first_name is not None and last_name is not None:
-        fn_1 = nd.search(first_name)['first_name']
-        ln_1 = nd.search(last_name)['last_name']
-        fn_2 = nd.search(first_name)['last_name']
-        ln_2 = nd.search(last_name)['first_name']
-        if fn_1 is not None and ln_1 is not None and fn_2 is not None and ln_2 is not None:
-            score_1 = _compute_score(fn_1) + _compute_score(ln_1)
-            score_2 = _compute_score(fn_2) + _compute_score(ln_2)
-            if score_2 > score_1:
-                first_name, last_name = last_name, first_name
+        first_name, last_name = _infer_first_and_last_names(first_name, last_name, nd)
 
     if first_name is not None:
         first_name = first_name.lower()
@@ -133,3 +153,16 @@ def extract_names_from_email(nd: NameDataset, email: str):
         last_name = last_name.lower()
 
     return first_name, last_name
+
+
+def _infer_first_and_last_names(first_name, last_name, nd):
+    """Return the pair, swapped when the dataset scores the reversed roles higher."""
+    first_hit, last_hit = nd.search(first_name), nd.search(last_name)  # one lookup per name
+    fn_1, fn_2 = first_hit['first_name'], first_hit['last_name']
+    ln_1, ln_2 = last_hit['last_name'], last_hit['first_name']
+    if all(entry is not None for entry in (fn_1, ln_1, fn_2, ln_2)):
+        score_1 = _compute_score(fn_1) + _compute_score(ln_1)
+        score_2 = _compute_score(fn_2) + _compute_score(ln_2)
+        if score_2 > score_1:
+            first_name, last_name = last_name, first_name
+    return first_name, last_name
diff --git a/tests/test_from_emails.py b/tests/test_from_emails.py
index 785624f..77a39aa 100644
--- a/tests/test_from_emails.py
+++ b/tests/test_from_emails.py
@@ -1,12 +1,205 @@
 import unittest
 
 from names_dataset import NameDataset
-from names_dataset.emails import extract_names_from_email
+from names_dataset.emails import extract_names_from_email, try_to_split_with_two_last_names
 
 
 class TestEmail(unittest.TestCase):
 
-    def test_1(self):
+    def test_with_three_3(self):
+        """Split 'first name + two last names' strings written without separators.
+
+        Each case pairs the raw input with the expected (first, last, last2).
+        """
+        cases = [
+            # isabel + perez / marti
+            ('perezmartiisabel',
+             ('isabel', 'perez', 'marti')),
+            ('isabelmartiperez',
+             ('isabel', 'marti', 'perez')),
+            ('martiperezisabel',
+             ('isabel', 'marti', 'perez')),
+            ('isabelperezmarti',
+             ('isabel', 'perez', 'marti')),
+
+            # maria + garcia / fernandez
+            ('garciafernandezmaria',
+             ('maria', 'garcia', 'fernandez')),
+            ('mariafernandezgarcia',
+             ('maria', 'fernandez', 'garcia')),
+
+            # ana + gonzalez / lopez
+            ('gonzalezlopezana',
+             ('ana', 'gonzalez', 'lopez')),
+            ('analopezgonzalez',
+             ('ana', 'lopez', 'gonzalez')),
+
+            # juan + rodriguez / hernandez
+            ('rodriguezhernandezjuan',
+             ('juan', 'rodriguez', 'hernandez')),
+            ('juanhernandezrodriguez',
+             ('juan', 'hernandez', 'rodriguez')),
+
+            # carlos + suarez / dominguez
+            ('suarezdominguezcarlos',
+             ('carlos', 'suarez', 'dominguez')),
+            ('carlosdominguezsuarez',
+             ('carlos', 'dominguez', 'suarez')),
+
+            # lucia + sanchez / ruiz
+            ('sanchezruizlucia',
+             ('lucia', 'sanchez', 'ruiz')),
+            ('luciaruizsanchez',
+             ('lucia', 'ruiz', 'sanchez')),
+
+            # miguel + gomez / nunez
+            ('gomeznunezmiguel',
+             ('miguel', 'gomez', 'nunez')),
+            ('miguelnunezgomez',
+             ('miguel', 'nunez', 'gomez')),
+        ]
+
+        nd = NameDataset()
+        for raw, (want_first, want_last, want_last2) in cases:
+            # Keep only the text before any '@'.
+            local_part = raw.split('@')[0]
+            first_name, last_name, last_name2 = try_to_split_with_two_last_names(nd, local_part)
+            print(local_part)
+            print('output=', first_name, last_name, last_name2)
+            print('expected=', want_first, want_last, want_last2)
+            self.assertEqual(want_first, first_name)
+            self.assertEqual(want_last, last_name)
+            self.assertEqual(want_last2, last_name2)
+            print('[OK]')
+
+    def test_with_three_2(self):
+        """Split 'first name + two last names' strings in all four tested orders.
+
+        Each case pairs the raw input with the expected (first, last, last2).
+        """
+        cases = [
+            # ines + torres / morales
+            ('torresmoralesines',
+             ('ines', 'torres', 'morales')),
+
+            # isabel + perez / marti
+            ('perezmartiisabel',
+             ('isabel', 'perez', 'marti')),
+            ('isabelmartiperez',
+             ('isabel', 'marti', 'perez')),
+            ('martiperezisabel',
+             ('isabel', 'marti', 'perez')),
+            ('isabelperezmarti',
+             ('isabel', 'perez', 'marti')),
+
+            # maria + garcia / fernandez
+            ('garciafernandezmaria',
+             ('maria', 'garcia', 'fernandez')),
+            ('mariafernandezgarcia',
+             ('maria', 'fernandez', 'garcia')),
+            ('fernandezgarciamaria',
+             ('maria', 'fernandez', 'garcia')),
+            ('mariagarciafernandez',
+             ('maria', 'garcia', 'fernandez')),
+
+            # ana + gonzalez / lopez
+            ('gonzalezlopezana',
+             ('ana', 'gonzalez', 'lopez')),
+            ('analopezgonzalez',
+             ('ana', 'lopez', 'gonzalez')),
+            ('lopezgonzalezana',
+             ('ana', 'lopez', 'gonzalez')),
+            ('anagonzalezlopez',
+             ('ana', 'gonzalez', 'lopez')),
+
+            # juan + rodriguez / hernandez
+            ('rodriguezhernandezjuan',
+             ('juan', 'rodriguez', 'hernandez')),
+            ('juanhernandezrodriguez',
+             ('juan', 'hernandez', 'rodriguez')),
+            ('hernandezrodriguezjuan',
+             ('juan', 'hernandez', 'rodriguez')),
+            ('juanrodriguezhernandez',
+             ('juan', 'rodriguez', 'hernandez')),
+
+            # carlos + suarez / dominguez
+            ('suarezdominguezcarlos',
+             ('carlos', 'suarez', 'dominguez')),
+            ('carlosdominguezsuarez',
+             ('carlos', 'dominguez', 'suarez')),
+            ('dominguezsuarezcarlos',
+             ('carlos', 'dominguez', 'suarez')),
+            ('carlossuarezdominguez',
+             ('carlos', 'suarez', 'dominguez')),
+
+            # lucia + sanchez / ruiz
+            ('sanchezruizlucia',
+             ('lucia', 'sanchez', 'ruiz')),
+            ('luciaruizsanchez',
+             ('lucia', 'ruiz', 'sanchez')),
+            ('ruizsanchezlucia',
+             ('lucia', 'ruiz', 'sanchez')),
+            ('luciasanchezruiz',
+             ('lucia', 'sanchez', 'ruiz')),
+
+            # miguel + gomez / nunez
+            ('gomeznunezmiguel',
+             ('miguel', 'gomez', 'nunez')),
+            ('miguelnunezgomez',
+             ('miguel', 'nunez', 'gomez')),
+            ('nunezgomezmiguel',
+             ('miguel', 'nunez', 'gomez')),
+            ('miguelgomeznunez',
+             ('miguel', 'gomez', 'nunez')),
+
+            # ines + morales / torres
+            ('moralestorresines',
+             ('ines', 'morales', 'torres')),
+            ('inestorresmorales',
+             ('ines', 'torres', 'morales')),
+        ]
+
+        nd = NameDataset()
+        for raw, (want_first, want_last, want_last2) in cases:
+            local_part = raw.split('@')[0]
+            first_name, last_name, last_name2 = try_to_split_with_two_last_names(nd, local_part)
+            print(local_part)
+            print('output=', first_name, last_name, last_name2)
+            print('expected=', want_first, want_last, want_last2)
+            self.assertEqual(want_first, first_name)
+            self.assertEqual(want_last, last_name)
+            self.assertEqual(want_last2, last_name2)
+            print('[OK]')
+
+    def test_with_three_1(self):
+        """Split the four tested orderings of isabel / perez / marti.
+
+        Each case pairs the raw input with the expected (first, last, last2).
+        """
+        cases = [
+            ('perezmartiisabel',
+             ('isabel', 'perez', 'marti')),
+            ('isabelmartiperez',
+             ('isabel', 'marti', 'perez')),
+            ('martiperezisabel',
+             ('isabel', 'marti', 'perez')),
+            ('isabelperezmarti',
+             ('isabel', 'perez', 'marti')),
+        ]
+
+        nd = NameDataset()
+        for raw, (want_first, want_last, want_last2) in cases:
+            # Keep only the text before any '@'.
+            local_part = raw.split('@')[0]
+            first_name, last_name, last_name2 = try_to_split_with_two_last_names(nd, local_part)
+            print(local_part)
+            print('output=', first_name, last_name, last_name2)
+            self.assertEqual(want_first, first_name)
+            self.assertEqual(want_last, last_name)
+            self.assertEqual(want_last2, last_name2)
+            print('[OK]')
+
+    def test_with_two(self):
         inputs = [
             'info@skysense.jp',
             'isabelle.remy.fr@gmail.com',
@@ -28,6 +221,7 @@ def test_1(self):
             'remy_j@example.com',
             'j_remy123@example.com',
             'philippe.remy1@example.com',
+            'perezmarti',
         ]
         inputs2 = []
         for i in inputs:
@@ -54,6 +248,7 @@ def test_1(self):
             ['remy', None],
             [None, 'remy'],
             ['philippe', 'remy'],
+            ['perez', 'marti'],
         ]
 
         nd = NameDataset()