From df7b02b24b4b24b10beb256fa5661b84d0148202 Mon Sep 17 00:00:00 2001 From: "Luiz Otavio V. B. Oliveira" Date: Fri, 14 Jun 2019 09:19:25 +0200 Subject: [PATCH 1/5] Generate the files to call the language_classifier --- postal/lang_classifier.py | 13 +++ postal/pylangclassifier.c | 184 ++++++++++++++++++++++++++++++++++++++ setup.py | 7 ++ 3 files changed, 204 insertions(+) create mode 100644 postal/lang_classifier.py create mode 100644 postal/pylangclassifier.c diff --git a/postal/lang_classifier.py b/postal/lang_classifier.py new file mode 100644 index 0000000..e68c5b4 --- /dev/null +++ b/postal/lang_classifier.py @@ -0,0 +1,13 @@ +"""Python bindings to the libpostal language classifier.""" +from postal import _langclassifier +from postal.utils.encoding import safe_decode + + +def classify_lang_address(address): + """ + Classify the language of an address. + + @param address: the address as either Unicode or a UTF-8 encoded string + """ + address = safe_decode(address, 'utf-8') + return _langclassifier.classify_lang_address(address) diff --git a/postal/pylangclassifier.c b/postal/pylangclassifier.c new file mode 100644 index 0000000..34fa6a9 --- /dev/null +++ b/postal/pylangclassifier.c @@ -0,0 +1,184 @@ +#include <Python.h> +#include <libpostal/libpostal.h> +#include "pyutils.h" + +#if PY_MAJOR_VERSION >= 3 +#define IS_PY3K +#endif + +struct module_state { + PyObject *error; +}; + + +#ifdef IS_PY3K + #define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) +#else + #define GETSTATE(m) (&_state) + static struct module_state _state; +#endif + + +static PyObject *py_classify_lang_address(PyObject *self, PyObject *args, PyObject *keywords) { + PyObject *arg_input; + + PyObject *result = NULL; + + static char *kwlist[] = {"address", NULL}; + + + if (!PyArg_ParseTupleAndKeywords(args, keywords, + "O|OO:pyparser", kwlist, + &arg_input)) { + return 0; + } + + char *input = PyObject_to_string(arg_input); + + if (input == NULL) { + return NULL; + } + + + +// if (!address_dictionary_module_setup(NULL) || 
!transliteration_module_setup(NULL) || !language_classifier_module_setup(dir)) { +// log_error("Could not load language classifiers\n"); +// exit(EXIT_FAILURE); +// } + +// libpostal_address_parser_response_t *parsed = libpostal_parse_address(input, options); +// if (parsed == NULL) { +// goto exit_free_country; +// } +// +// result = PyList_New((Py_ssize_t)parsed->num_components); +// if (!result) { +// goto exit_destroy_response; +// } +// +// for (int i = 0; i < parsed->num_components; i++) { +// char *component = parsed->components[i]; +// char *label = parsed->labels[i]; +// PyObject *component_unicode = PyUnicode_DecodeUTF8((const char *)component, strlen(component), "strict"); +// if (component_unicode == NULL) { +// Py_DECREF(result); +// goto exit_destroy_response; +// } +// +// PyObject *label_unicode = PyUnicode_DecodeUTF8((const char *)label, strlen(label), "strict"); +// if (label_unicode == NULL) { +// Py_DECREF(component_unicode); +// Py_DECREF(result); +// goto exit_destroy_response; +// } +// PyObject *tuple = Py_BuildValue("(OO)", component_unicode, label_unicode); +// if (tuple == NULL) { +// Py_DECREF(component_unicode); +// Py_DECREF(label_unicode); +// goto exit_destroy_response; +// } +// +// // Note: PyList_SetItem steals a reference, so don't worry about DECREF +// PyList_SetItem(result, (Py_ssize_t)i, tuple); +// +// Py_DECREF(component_unicode); +// Py_DECREF(label_unicode); +// } +// +// exit_destroy_response: +// libpostal_address_parser_response_destroy(parsed); +// exit_free_country: +// if (country != NULL) { +// free(country); +// } +// exit_free_language: +// if (language != NULL) { +// free(language); +// } +// exit_free_input: +// if (input != NULL) { +// free(input); +// } +// return result; +} + +static PyMethodDef langclassifier_methods[] = { + {"classify_lang_address", (PyCFunction)py_classify_lang_address, METH_VARARGS | METH_KEYWORDS, + "classify_lang_address(text)"}, + {NULL, NULL}, +}; + +#ifdef IS_PY3K + +static int 
langclassifier_traverse(PyObject *m, visitproc visit, void *arg) { + Py_VISIT(GETSTATE(m)->error); + return 0; +} + +static int langclassifier_clear(PyObject *m) { + Py_CLEAR(GETSTATE(m)->error); +// libpostal_teardown(); + libpostal_teardown_language_classifier(); + return 0; +} + +static struct PyModuleDef module_def = { + PyModuleDef_HEAD_INIT, + "_langclassifier", + NULL, + sizeof(struct module_state), + langclassifier_methods, + NULL, + langclassifier_traverse, + langclassifier_clear, + NULL +}; + +#define INITERROR return NULL + +PyObject * +PyInit__langclassifier(void) { +#else + +#define INITERROR return + +//void cleanup_libpostal(void) { +// libpostal_teardown(); +// libpostal_teardown_parser(); +//} + +void init_parser(void) { + #endif + + #ifdef IS_PY3K + PyObject *module = PyModule_Create(&module_def); + #else + PyObject *module = Py_InitModule("_langclassifier", parser_methods); + #endif + + if (module == NULL) { + INITERROR; + } + struct module_state *st = GETSTATE(module); + + st->error = PyErr_NewException("_langclassifier.Error", NULL, NULL); + if (st->error == NULL) { + Py_DECREF(module); + INITERROR; + } + + if (!libpostal_setup() || !libpostal_setup_language_classifier()) { + PyErr_SetString(PyExc_TypeError, + "Error loading libpostal data"); + } + + #ifndef IS_PY3K + Py_AtExit(&cleanup_libpostal); + #endif + + + #ifdef IS_PY3K + return module; + #endif +} + diff --git a/setup.py b/setup.py index 6ded97b..96cfe3f 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,13 @@ def main(): library_dirs=['/usr/local/lib'], extra_compile_args=['-std=c99'], ), + Extension('postal._langclassifier', + sources=['postal/pylangclassifier.c', 'postal/pyutils.c'], + libraries=['postal'], + include_dirs=['/usr/local/include'], + library_dirs=['/usr/local/lib'], + extra_compile_args=['-std=c99'], + ), Extension('postal._token_types', sources=['postal/pytokentypes.c'], libraries=['postal'], From 0c0da049e4c5c464472f2aac8a6d97428c381249 Mon Sep 17 00:00:00 2001 From: 
"Luiz Otavio V. B. Oliveira" Date: Fri, 14 Jun 2019 10:08:49 +0200 Subject: [PATCH 2/5] First (untested) version --- postal/pylangclassifier.c | 117 +++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 66 deletions(-) diff --git a/postal/pylangclassifier.c b/postal/pylangclassifier.c index 34fa6a9..9b8be9f 100644 --- a/postal/pylangclassifier.c +++ b/postal/pylangclassifier.c @@ -11,6 +11,12 @@ struct module_state { }; +typedef struct language_classifier_response { + Py_ssize_t num_languages; + char **languages; + double *probs; +} language_classifier_response_t; + #ifdef IS_PY3K #define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) #else @@ -39,67 +45,45 @@ static PyObject *py_classify_lang_address(PyObject *self, PyObject *args, PyObje return NULL; } + language_classifier_response_t *response = classify_languages(input); + + if (response == NULL) { + goto exit_free_input; + } + + result = PyList_New((Py_ssize_t)response->num_languages); + if (!result) { + goto exit_destroy_response; + } + + for (int i = 0; i < response->num_languages; i++) { + char *language = response->languages[i]; + double prob = response->probs[i]; + PyObject *language_unicode = PyUnicode_DecodeUTF8((const char *)language, strlen(language), "strict"); + if (language_unicode == NULL) { + Py_DECREF(result); + goto exit_destroy_response; + } + PyObject *tuple = Py_BuildValue("(OO)", language_unicode, prob); + if (tuple == NULL) { + Py_DECREF(language_unicode); + goto exit_destroy_response; + } -// if (!address_dictionary_module_setup(NULL) || !transliteration_module_setup(NULL) || !language_classifier_module_setup(dir)) { -// log_error("Could not load language classifiers\n"); -// exit(EXIT_FAILURE); -// } - -// libpostal_address_parser_response_t *parsed = libpostal_parse_address(input, options); -// if (parsed == NULL) { -// goto exit_free_country; -// } -// -// result = PyList_New((Py_ssize_t)parsed->num_components); -// if (!result) { -// goto 
exit_destroy_response; -// } -// -// for (int i = 0; i < parsed->num_components; i++) { -// char *component = parsed->components[i]; -// char *label = parsed->labels[i]; -// PyObject *component_unicode = PyUnicode_DecodeUTF8((const char *)component, strlen(component), "strict"); -// if (component_unicode == NULL) { -// Py_DECREF(result); -// goto exit_destroy_response; -// } -// -// PyObject *label_unicode = PyUnicode_DecodeUTF8((const char *)label, strlen(label), "strict"); -// if (label_unicode == NULL) { -// Py_DECREF(component_unicode); -// Py_DECREF(result); -// goto exit_destroy_response; -// } -// PyObject *tuple = Py_BuildValue("(OO)", component_unicode, label_unicode); -// if (tuple == NULL) { -// Py_DECREF(component_unicode); -// Py_DECREF(label_unicode); -// goto exit_destroy_response; -// } -// -// // Note: PyList_SetItem steals a reference, so don't worry about DECREF -// PyList_SetItem(result, (Py_ssize_t)i, tuple); -// -// Py_DECREF(component_unicode); -// Py_DECREF(label_unicode); -// } -// -// exit_destroy_response: -// libpostal_address_parser_response_destroy(parsed); -// exit_free_country: -// if (country != NULL) { -// free(country); -// } -// exit_free_language: -// if (language != NULL) { -// free(language); -// } -// exit_free_input: -// if (input != NULL) { -// free(input); -// } -// return result; + // Note: PyList_SetItem steals a reference, so don't worry about DECREF + PyList_SetItem(result, (Py_ssize_t)i, tuple); + + Py_DECREF(language_unicode); + } + + exit_destroy_response: + language_classifier_response_destroy(response); + exit_free_input: + if (input != NULL) { + free(input); + } + return result; } static PyMethodDef langclassifier_methods[] = { @@ -117,7 +101,7 @@ static int langclassifier_traverse(PyObject *m, visitproc visit, void *arg) { static int langclassifier_clear(PyObject *m) { Py_CLEAR(GETSTATE(m)->error); -// libpostal_teardown(); + libpostal_teardown(); libpostal_teardown_language_classifier(); return 0; } @@ -142,12 
+126,12 @@ PyInit__langclassifier(void) { #define INITERROR return -//void cleanup_libpostal(void) { -// libpostal_teardown(); -// libpostal_teardown_parser(); -//} +void cleanup_libpostal(void) { + libpostal_teardown(); + libpostal_teardown_language_classifier(); +} -void init_parser(void) { +void init_langclassifier(void) { #endif #ifdef IS_PY3K @@ -167,7 +151,8 @@ void init_parser(void) { INITERROR; } - if (!libpostal_setup() || !libpostal_setup_language_classifier()) { + + if (!libpostal_setup() || libpostal_setup_language_classifier()) { PyErr_SetString(PyExc_TypeError, "Error loading libpostal data"); } From 5abfd01c74eee48f2e6edb40072a6dff8f3a47d0 Mon Sep 17 00:00:00 2001 From: "Luiz Otavio V. B. Oliveira" Date: Fri, 21 Jun 2019 16:35:49 +0200 Subject: [PATCH 3/5] Version tested locally --- postal/pylangclassifier.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/postal/pylangclassifier.c b/postal/pylangclassifier.c index 9b8be9f..f4ebd23 100644 --- a/postal/pylangclassifier.c +++ b/postal/pylangclassifier.c @@ -30,12 +30,7 @@ static PyObject *py_classify_lang_address(PyObject *self, PyObject *args, PyObje PyObject *result = NULL; - static char *kwlist[] = {"address", NULL}; - - - if (!PyArg_ParseTupleAndKeywords(args, keywords, - "O|OO:pyparser", kwlist, - &arg_input)) { + if (!PyArg_ParseTuple(args, "O:pylangclassifier", &arg_input)) { return 0; } @@ -45,7 +40,7 @@ static PyObject *py_classify_lang_address(PyObject *self, PyObject *args, PyObje return NULL; } - language_classifier_response_t *response = classify_languages(input); + libpostal_language_classifier_response_t *response = libpostal_classify_language(input); if (response == NULL) { goto exit_free_input; @@ -65,7 +60,7 @@ static PyObject *py_classify_lang_address(PyObject *self, PyObject *args, PyObje goto exit_destroy_response; } - PyObject *tuple = Py_BuildValue("(OO)", language_unicode, prob); + PyObject *tuple = Py_BuildValue("(Od)", language_unicode, 
prob); if (tuple == NULL) { Py_DECREF(language_unicode); goto exit_destroy_response; @@ -78,7 +73,7 @@ static PyObject *py_classify_lang_address(PyObject *self, PyObject *args, PyObje } exit_destroy_response: - language_classifier_response_destroy(response); + libpostal_language_classifier_response_destroy(response); exit_free_input: if (input != NULL) { free(input); @@ -152,7 +147,7 @@ void init_langclassifier(void) { } - if (!libpostal_setup() || libpostal_setup_language_classifier()) { + if (!libpostal_setup() || !libpostal_setup_language_classifier()) { PyErr_SetString(PyExc_TypeError, "Error loading libpostal data"); } @@ -165,5 +160,4 @@ void init_langclassifier(void) { #ifdef IS_PY3K return module; #endif -} - +} \ No newline at end of file From 849622747a22bac142200aa528ff5aed76b7acac Mon Sep 17 00:00:00 2001 From: "Luiz Otavio V. B. Oliveira" Date: Wed, 27 Nov 2019 15:42:10 +0100 Subject: [PATCH 4/5] Check for exception in cases where address=None --- postal/lang_classifier.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/postal/lang_classifier.py b/postal/lang_classifier.py index e68c5b4..2206a92 100644 --- a/postal/lang_classifier.py +++ b/postal/lang_classifier.py @@ -2,7 +2,6 @@ from postal import _langclassifier from postal.utils.encoding import safe_decode - def classify_lang_address(address): """ Classify the language of an address. @@ -10,4 +9,7 @@ def classify_lang_address(address): @param address: the address as either Unicode or a UTF-8 encoded string """ address = safe_decode(address, 'utf-8') - return _langclassifier.classify_lang_address(address) + try: + return _langclassifier.classify_lang_address(address) + except SystemError: + return None From fdd36b75429e1ae39c30b4621fecb4c20083057f Mon Sep 17 00:00:00 2001 From: "Luiz Otavio V. B. 
Oliveira" Date: Tue, 3 Dec 2019 20:47:46 +0100 Subject: [PATCH 5/5] Add tests for classify_lang_address --- postal/tests/test_lang_classifier.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 postal/tests/test_lang_classifier.py diff --git a/postal/tests/test_lang_classifier.py b/postal/tests/test_lang_classifier.py new file mode 100644 index 0000000..19339c8 --- /dev/null +++ b/postal/tests/test_lang_classifier.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +"""Test pypostal language classification.""" + +from __future__ import unicode_literals + +import unittest +from postal.lang_classifier import classify_lang_address + + +class TestLangClassfier(unittest.TestCase): + """Test libpostal language classifier from Python.""" + def test_parses(self): + cases = ( + ('Rua casemiro osorio, 123', {'pt': 1.0}), + ('Street Oudenoord, 1234', {'en': 0.76, 'nl': 0.23}), + ('Oudenoord, 1234', {'nl': 1.0}) + ) + + """Language classifier tests.""" + for address, lang_expected in cases: + lang = classify_lang_address(address) + # Round probabilities to two decimals so they compare exactly + lang = {k: round(v, 2) for k, v in lang} + self.assertDictEqual(lang, lang_expected) + + +if __name__ == '__main__': + unittest.main()