From 096d95739163ed81df126d8991caa7db0b5e9cae Mon Sep 17 00:00:00 2001
From: "Luiz Otavio V. B. Oliveira"
Date: Fri, 14 Jun 2019 09:19:25 +0200
Subject: [PATCH 1/3] Python bindings for the language classifier API

---
 postal/lang_classifier.py            |  17 +++
 postal/pylangclassifier.c            | 170 +++++++++++++++++++++++++++
 postal/tests/test_lang_classifier.py |  28 +++++
 setup.py                             |   7 ++
 4 files changed, 222 insertions(+)
 create mode 100644 postal/lang_classifier.py
 create mode 100644 postal/pylangclassifier.c
 create mode 100644 postal/tests/test_lang_classifier.py

diff --git a/postal/lang_classifier.py b/postal/lang_classifier.py
new file mode 100644
index 0000000..2206a92
--- /dev/null
+++ b/postal/lang_classifier.py
@@ -0,0 +1,17 @@
+"""Python bindings to the libpostal language classifier."""
+from postal import _langclassifier
+from postal.utils.encoding import safe_decode
+
+def classify_lang_address(address):
+    """
+    Classify the language of an address.
+
+    @param address: the address as either Unicode or a UTF-8 encoded string
+    @return: list of (language, probability) tuples, or None if the C
+        extension call fails with SystemError
+    """
+    address = safe_decode(address, 'utf-8')
+    try:
+        return _langclassifier.classify_lang_address(address)
+    except SystemError:
+        return None
diff --git a/postal/pylangclassifier.c b/postal/pylangclassifier.c
new file mode 100644
index 0000000..f4ebd23
--- /dev/null
+++ b/postal/pylangclassifier.c
@@ -0,0 +1,170 @@
+#include <Python.h>
+#include <libpostal/libpostal.h>
+#include "pyutils.h"
+
+#if PY_MAJOR_VERSION >= 3
+#define IS_PY3K
+#endif
+
+struct module_state {
+    PyObject *error;
+};
+
+
+typedef struct language_classifier_response {
+    Py_ssize_t num_languages;
+    char **languages;
+    double *probs;
+} language_classifier_response_t;
+
+#ifdef IS_PY3K
+    #define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
+#else
+    #define GETSTATE(m) (&_state)
+    static struct module_state _state;
+#endif
+
+
+/*
+ * classify_lang_address(text): classify the language of an address string.
+ *
+ * Returns a new list of (language, probability) tuples, or NULL on failure.
+ * If building an item fails, the partial list is released before returning.
+ */
+static PyObject *py_classify_lang_address(PyObject *self, PyObject *args, PyObject *keywords) {
+    PyObject *arg_input;
+
+    PyObject *result = NULL;
+
+    if (!PyArg_ParseTuple(args, "O:pylangclassifier", &arg_input)) {
+        return NULL;
+    }
+
+    char *input = PyObject_to_string(arg_input);
+
+    if (input == NULL) {
+        return NULL;
+    }
+
+    libpostal_language_classifier_response_t *response = libpostal_classify_language(input);
+
+    if (response == NULL) {
+        goto exit_free_input;
+    }
+
+    result = PyList_New((Py_ssize_t)response->num_languages);
+    if (!result) {
+        goto exit_destroy_response;
+    }
+
+    for (int i = 0; i < response->num_languages; i++) {
+        char *language = response->languages[i];
+        double prob = response->probs[i];
+        PyObject *language_unicode = PyUnicode_DecodeUTF8((const char *)language, strlen(language), "strict");
+        if (language_unicode == NULL) {
+            Py_CLEAR(result);
+            goto exit_destroy_response;
+        }
+
+        PyObject *tuple = Py_BuildValue("(Od)", language_unicode, prob);
+        if (tuple == NULL) {
+            Py_DECREF(language_unicode);
+            Py_CLEAR(result);
+            goto exit_destroy_response;
+        }
+
+        // Note: PyList_SetItem steals a reference, so don't worry about DECREF
+        PyList_SetItem(result, (Py_ssize_t)i, tuple);
+
+        Py_DECREF(language_unicode);
+    }
+
+    exit_destroy_response:
+        libpostal_language_classifier_response_destroy(response);
+    exit_free_input:
+        if (input != NULL) {
+            free(input);
+        }
+        return result;
+}
+
+static PyMethodDef langclassifier_methods[] = {
+    {"classify_lang_address", (PyCFunction)py_classify_lang_address, METH_VARARGS | METH_KEYWORDS,
+     "classify_lang_address(text)"},
+    {NULL, NULL},
+};
+
+#ifdef IS_PY3K
+
+static int langclassifier_traverse(PyObject *m, visitproc visit, void *arg) {
+    Py_VISIT(GETSTATE(m)->error);
+    return 0;
+}
+
+static int langclassifier_clear(PyObject *m) {
+    Py_CLEAR(GETSTATE(m)->error);
+    libpostal_teardown();
+    libpostal_teardown_language_classifier();
+    return 0;
+}
+
+static struct PyModuleDef module_def = {
+    PyModuleDef_HEAD_INIT,
+    "_langclassifier",
+    NULL,
+    sizeof(struct module_state),
+    langclassifier_methods,
+    NULL,
+    langclassifier_traverse,
+    langclassifier_clear,
+    NULL
+};
+
+#define INITERROR return NULL
+
+PyObject *
+PyInit__langclassifier(void) {
+#else
+
+#define INITERROR return
+
+void cleanup_libpostal(void) {
+    libpostal_teardown();
+    libpostal_teardown_language_classifier();
+}
+
+void init_langclassifier(void) {
+    #endif
+
+    #ifdef IS_PY3K
+    PyObject *module = PyModule_Create(&module_def);
+    #else
+    PyObject *module = Py_InitModule("_langclassifier", parser_methods);
+    #endif
+
+    if (module == NULL) {
+        INITERROR;
+    }
+    struct module_state *st = GETSTATE(module);
+
+    st->error = PyErr_NewException("_langclassifier.Error", NULL, NULL);
+    if (st->error == NULL) {
+        Py_DECREF(module);
+        INITERROR;
+    }
+
+
+    if (!libpostal_setup() || !libpostal_setup_language_classifier()) {
+        PyErr_SetString(PyExc_TypeError,
+                        "Error loading libpostal data");
+    }
+
+    #ifndef IS_PY3K
+    Py_AtExit(&cleanup_libpostal);
+    #endif
+
+
+    #ifdef IS_PY3K
+    return module;
+    #endif
+}
\ No newline at end of file
diff --git a/postal/tests/test_lang_classifier.py b/postal/tests/test_lang_classifier.py
new file mode 100644
index 0000000..19339c8
--- /dev/null
+++ b/postal/tests/test_lang_classifier.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+"""Test pypostal language classification."""
+
+from __future__ import unicode_literals
+
+import unittest
+from postal.lang_classifier import classify_lang_address
+
+
+class TestLangClassifier(unittest.TestCase):
+    """Test libpostal language classifier from Python."""
+    def test_parses(self):
+        """Language classifier tests."""
+        cases = (
+            ('Rua casemiro osorio, 123', {'pt': 1.0}),
+            ('Street Oudenoord, 1234', {'en': 0.76, 'nl': 0.23}),
+            ('Oudenoord, 1234', {'nl': 1.0})
+        )
+
+        for address, lang_expected in cases:
+            lang = classify_lang_address(address)
+            # Round probabilities
+            lang = {k: round(v, 2) for k, v in lang}
+            self.assertDictEqual(lang, lang_expected)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/setup.py b/setup.py
index 6ded97b..96cfe3f 100644
--- a/setup.py
+++ b/setup.py
@@ -40,6 +40,13 @@ def main():
                       library_dirs=['/usr/local/lib'],
                       extra_compile_args=['-std=c99'],
                       ),
+            Extension('postal._langclassifier',
+                      sources=['postal/pylangclassifier.c', 'postal/pyutils.c'],
+                      libraries=['postal'],
+                      include_dirs=['/usr/local/include'],
+                      library_dirs=['/usr/local/lib'],
+                      extra_compile_args=['-std=c99'],
+                      ),
             Extension('postal._token_types',
                       sources=['postal/pytokentypes.c'],
                       libraries=['postal'],
From 8bd1079afa3aed93a4acbc8da8b7e4b3c8b44fa0 Mon Sep 17 00:00:00 2001
From: sbrugman
Date: Tue, 8 Sep 2020 19:55:08 +0200
Subject: [PATCH 2/3] Fix references and readme update

---
 README.md                 | 3 +++
 postal/pylangclassifier.c | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 1991aa4..8b4d18a 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,9 @@ expand_address('Quatre vingt douze Ave des Champs-Élysées')
 
 from postal.parser import parse_address
 parse_address('The Book Club 100-106 Leonard St, Shoreditch, London, Greater London, EC2A 4RH, United Kingdom')
+
+from postal.lang_classifier import classify_lang_address
+classify_lang_address('Quatre vingt douze Ave des Champs-Élysées')
 ```
 
 Installation
diff --git a/postal/pylangclassifier.c b/postal/pylangclassifier.c
index f4ebd23..338691e 100644
--- a/postal/pylangclassifier.c
+++ b/postal/pylangclassifier.c
@@ -46,7 +46,7 @@ static PyObject *py_classify_lang_address(PyObject *self, PyObject *args, PyObje
         return NULL;
     }
 
-    libpostal_language_classifier_response_t *response = libpostal_classify_language(input);
+    language_classifier_response_t *response = libpostal_classify_language(input);
 
     if (response == NULL) {
         goto exit_free_input;
@@ -139,7 +139,7 @@ void init_langclassifier(void) {
     #ifdef IS_PY3K
     PyObject *module = PyModule_Create(&module_def);
     #else
-    PyObject *module = Py_InitModule("_langclassifier", parser_methods);
+    PyObject *module = Py_InitModule("_langclassifier", langclassifier_methods);
     #endif
 
     if (module == NULL) {
From 143eaaf3d97aeab02b6131713506a627ad84636d Mon Sep 17 00:00:00 2001
From: sbrugman
Date: Tue, 8 Sep 2020 20:38:48 +0200
Subject: [PATCH 3/3] Readme update installation instructions conform https://github.com/openvenues/libpostal

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8b4d18a..f4effda 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ git clone https://github.com/openvenues/libpostal
 cd libpostal
 ./bootstrap.sh
 ./configure --datadir=[...some dir with a few GB of space...]
-make
+make -j4
 sudo make install
 
 # On Linux it's probably a good idea to run