From 6b8bf75ffe20535f3bc89d3187591ab1c851f483 Mon Sep 17 00:00:00 2001 From: Nikolai Kondrashov Date: Tue, 8 Sep 2020 16:38:10 +0300 Subject: [PATCH 1/3] Add parse_json() and parse_json_file() Add an implementation of parse_json() function accepting either text or a text iterator and producing an iterable returning parsed values. Add a naive implementation of parse_json_file() function accepting a text file object and producing an iterable returning parsed values. This allows parsing JSON and JSON streams without passing them through a program. --- jq.pyx | 105 ++++++++++++++++++++++++++++++++++++++++++++++ tests/jq_tests.py | 66 +++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+) diff --git a/jq.pyx b/jq.pyx index d3f9067..43f1bc4 100644 --- a/jq.pyx +++ b/jq.pyx @@ -109,6 +109,71 @@ cdef object _jv_to_python(jv value): return python_value +class JSONParseError(Exception): + """A failure to parse JSON""" + + +cdef class _JSONParser(object): + cdef jv_parser* _parser + cdef object _text_iter + cdef object _bytes + + def __dealloc__(self): + jv_parser_free(self._parser) + + def __cinit__(self, text_iter): + self._parser = jv_parser_new(0) + self._text_iter = text_iter + self._bytes = None + + def __iter__(self): + return self + + def __next__(self): + """ + Retrieve next parsed JSON value. + + Returns: + The next parsed JSON value. + + Raises: + JSONParseError: failed parsing the input JSON. + StopIteration: no more values available. 
+ """ + cdef jv value + while True: + # If we have no bytes to parse + if self._bytes is None: + # Ready some more + self._ready_next_bytes() + # Parse whatever we've readied, if any + value = jv_parser_next(self._parser) + if jv_is_valid(value): + return _jv_to_python(value) + elif jv_invalid_has_msg(jv_copy(value)): + error_message = jv_invalid_get_msg(value) + message = jv_string_value(error_message).decode("utf8") + jv_free(error_message) + raise JSONParseError(message) + else: + jv_free(value) + # If we didn't ready any bytes + if self._bytes is None: + raise StopIteration + self._bytes = None + + cdef bint _ready_next_bytes(self) except 1: + cdef char* cbytes + try: + self._bytes = next(self._text_iter).encode("utf8") + cbytes = PyBytes_AsString(self._bytes) + jv_parser_set_buf(self._parser, cbytes, len(cbytes), 1) + except StopIteration: + self._bytes = None + jv_parser_set_buf(self._parser, "", 0, 0) + return 0 + + def compile(object program, args=None): cdef object program_bytes = program.encode("utf8") return _Program(program_bytes, args=args) @@ -356,6 +421,46 @@ def text(program, value=_NO_VALUE, text=_NO_VALUE): return compile(program).input(value, text=text).text() +def parse_json(text=_NO_VALUE, text_iter=_NO_VALUE): + """ + Parse a JSON stream. + Either "text" or "text_iter" must be specified. + + Args: + text: A string containing the JSON stream to parse. + text_iter: An iterator returning strings - pieces of the JSON stream + to parse. + + Returns: + An iterator returning parsed values. + + Raises: + JSONParseError: failed parsing the input JSON stream. + """ + if (text is _NO_VALUE) == (text_iter is _NO_VALUE): + raise ValueError("Either the text or text_iter argument should be set") + return _JSONParser(text_iter + if text_iter is not _NO_VALUE + else _iter((text,))) + + +def parse_json_file(fp): + """ + Parse a JSON stream file. + + Args: + fp: The file-like object to read the JSON stream from. + Must be in text mode. 
+ + Returns: + An iterator returning parsed values. + + Raises: + JSONParseError: failed parsing the JSON stream. + """ + return parse_json(text=fp.read()) + + # Support the 0.1.x API for backwards compatibility def jq(object program): return compile(program) diff --git a/tests/jq_tests.py b/tests/jq_tests.py index 5b20391..acde6fd 100644 --- a/tests/jq_tests.py +++ b/tests/jq_tests.py @@ -4,6 +4,7 @@ from nose.tools import istest, assert_equal, assert_is, assert_raises +import io import jq @@ -204,6 +205,71 @@ def program_string_can_be_retrieved_from_program(): program = jq.compile(".") assert_equal(".", program.program_string) +@istest +def parse_json_both_text_and_text_iter_accepted(): + assert_equal(True, next(jq.parse_json(text="true"))) + assert_equal(True, next(jq.parse_json(text_iter=iter(["true"])))) + +@istest +def parse_json_file_works(): + fp = io.StringIO('{"abc": "def"}') + assert_equal([dict(abc="def")], list(jq.parse_json_file(fp))) + +@istest +def parse_json_empty_text_iter_stops(): + assert_raises(StopIteration, next, jq.parse_json(text_iter=iter([]))) + assert_raises(StopIteration, next, jq.parse_json(text_iter=iter([""]))) + assert_raises(StopIteration, next, jq.parse_json(text_iter=iter(["", ""]))) + +@istest +def parse_json_single_complete_text_iter_works(): + assert_equal(False, next(jq.parse_json(text_iter=iter(["false"])))) + assert_equal(True, next(jq.parse_json(text_iter=iter(["true"])))) + assert_equal(42, next(jq.parse_json(text_iter=iter(["42"])))) + assert_equal(-42, next(jq.parse_json(text_iter=iter(["-42"])))) + assert_equal("42", next(jq.parse_json(text_iter=iter(['"42"'])))) + assert_equal([42], next(jq.parse_json(text_iter=iter(["[42]"])))) + assert_equal(dict(a=42), + next(jq.parse_json(text_iter=iter(['{"a": 42}'])))) + +@istest +def parse_json_multi_complete_text_iter_works(): + assert_equal(False, next(jq.parse_json(text_iter=iter(["fa", "lse"])))) + assert_equal(True, next(jq.parse_json(text_iter=iter(["tr", "ue"])))) + 
assert_equal(42, next(jq.parse_json(text_iter=iter(["4", "2"])))) + assert_equal(-42, next(jq.parse_json(text_iter=iter(["-4", "2"])))) + assert_equal("42", next(jq.parse_json(text_iter=iter(['"4', '2"'])))) + assert_equal([42], next(jq.parse_json(text_iter=iter(["[4", "2]"])))) + assert_equal(dict(a=42), + next(jq.parse_json(text_iter=iter(['{"a":', ' 42}'])))) + +@istest +def parse_json_single_incomplete_text_iter_breaks(): + assert_raises(jq.JSONParseError, next, + jq.parse_json(text_iter=iter(["fals"]))) + assert_raises(jq.JSONParseError, next, + jq.parse_json(text_iter=iter(["tru"]))) + assert_raises(jq.JSONParseError, next, + jq.parse_json(text_iter=iter(["-"]))) + assert_raises(jq.JSONParseError, next, + jq.parse_json(text_iter=iter(['"42']))) + assert_raises(jq.JSONParseError, next, + jq.parse_json(text_iter=iter(["[42"]))) + assert_raises(jq.JSONParseError, next, + jq.parse_json(text_iter=iter(['{"a": 42']))) + +@istest +def parse_json_multi_incomplete_text_iter_breaks(): + assert_raises(jq.JSONParseError, next, + jq.parse_json(text_iter=iter(["fa", "ls"]))) + assert_raises(jq.JSONParseError, next, + jq.parse_json(text_iter=iter(["tr", "u"]))) + assert_raises(jq.JSONParseError, next, + jq.parse_json(text_iter=iter(['"4', '2']))) + assert_raises(jq.JSONParseError, next, + jq.parse_json(text_iter=iter(["[4", "2"]))) + assert_raises(jq.JSONParseError, next, + jq.parse_json(text_iter=iter(['{"a":', ' 42']))) @istest class TestJvToPython(object): From 42f317eef44cbdc75325af5ad44a854eaf4399b6 Mon Sep 17 00:00:00 2001 From: Nikolai Kondrashov Date: Wed, 9 Sep 2020 17:31:08 +0300 Subject: [PATCH 2/3] Use PyBytes_AsStringAndSize() Let Python give us the length of the "bytes" it already knows, instead of doing an strlen(). This improves performance a bit. 
--- jq.pyx | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/jq.pyx b/jq.pyx index 43f1bc4..c1dfbde 100644 --- a/jq.pyx +++ b/jq.pyx @@ -2,6 +2,7 @@ import json import threading from cpython.bytes cimport PyBytes_AsString +from cpython.bytes cimport PyBytes_AsStringAndSize cdef extern from "jv.h": @@ -164,10 +165,11 @@ cdef class _JSONParser(object): cdef bint _ready_next_bytes(self) except 1: cdef char* cbytes + cdef ssize_t clen try: self._bytes = next(self._text_iter).encode("utf8") - cbytes = PyBytes_AsString(self._bytes) - jv_parser_set_buf(self._parser, cbytes, len(cbytes), 1) + PyBytes_AsStringAndSize(self._bytes, &cbytes, &clen) + jv_parser_set_buf(self._parser, cbytes, clen, 1) except StopIteration: self._bytes = None jv_parser_set_buf(self._parser, "", 0, 0) @@ -360,8 +362,10 @@ cdef class _ResultIterator(object): self._bytes_input = bytes_input self._ready = False cdef jv_parser* parser = jv_parser_new(0) - cdef char* cbytes_input = PyBytes_AsString(bytes_input) - jv_parser_set_buf(parser, cbytes_input, len(cbytes_input), 0) + cdef char* cbytes_input + cdef ssize_t clen_input + PyBytes_AsStringAndSize(bytes_input, &cbytes_input, &clen_input) + jv_parser_set_buf(parser, cbytes_input, clen_input, 0) self._parser = parser def __iter__(self): From 11a81a7065abc663a4e832ab91002a9074abde02 Mon Sep 17 00:00:00 2001 From: Nikolai Kondrashov Date: Tue, 15 Sep 2020 14:14:01 +0300 Subject: [PATCH 3/3] parser: Accept bytes as input In addition to (Unicode) strings, also accept "bytes" (and corresponding iterators) as input to the parser. This allows skipping the decode/encode step when reading raw data from a file or socket, e.g. with os.read(). This introduces a small but measurable performance increase for such cases.
--- jq.pyx | 14 +++++++++----- tests/jq_tests.py | 4 +++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/jq.pyx b/jq.pyx index c1dfbde..863ee17 100644 --- a/jq.pyx +++ b/jq.pyx @@ -167,7 +167,11 @@ cdef class _JSONParser(object): cdef char* cbytes cdef ssize_t clen try: - self._bytes = next(self._text_iter).encode("utf8") + text = next(self._text_iter) + if isinstance(text, bytes): + self._bytes = text + else: + self._bytes = text.encode("utf8") PyBytes_AsStringAndSize(self._bytes, &cbytes, &clen) jv_parser_set_buf(self._parser, cbytes, clen, 1) except StopIteration: @@ -431,9 +435,10 @@ def parse_json(text=_NO_VALUE, text_iter=_NO_VALUE): Either "text" or "text_iter" must be specified. Args: - text: A string containing the JSON stream to parse. - text_iter: An iterator returning strings - pieces of the JSON stream - to parse. + text: A string or bytes object containing the JSON stream to + parse. + text_iter: An iterator returning strings or bytes - pieces of the + JSON stream to parse. Returns: An iterator returning parsed values. @@ -454,7 +459,6 @@ def parse_json_file(fp): Args: fp: The file-like object to read the JSON stream from. - Must be in text mode. Returns: An iterator returning parsed values. diff --git a/tests/jq_tests.py b/tests/jq_tests.py index acde6fd..18bf3b8 100644 --- a/tests/jq_tests.py +++ b/tests/jq_tests.py @@ -206,9 +206,11 @@ def program_string_can_be_retrieved_from_program(): assert_equal(".", program.program_string) @istest -def parse_json_both_text_and_text_iter_accepted(): +def parse_json_all_inputs_accepted(): assert_equal(True, next(jq.parse_json(text="true"))) assert_equal(True, next(jq.parse_json(text_iter=iter(["true"])))) + assert_equal(True, next(jq.parse_json(text=b"true"))) + assert_equal(True, next(jq.parse_json(text_iter=iter([b"true"])))) @istest def parse_json_file_works():