Skip to content

Commit

Permalink
Apply suggestions from code review
Browse files Browse the repository at this point in the history
Co-authored-by: kirkrodrigues <[email protected]>
  • Loading branch information
LinZhihao-723 and kirkrodrigues authored Jun 26, 2024
1 parent db6b089 commit 6696553
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 34 deletions.
48 changes: 19 additions & 29 deletions components/core/src/clp/ffi/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@

namespace clp::ffi {
/**
* Validates whether the given string is UTF-8 encoded, and escapes any characters to generate to
* make the string compatible with JSON specification.
* Validates whether the given string is UTF-8 encoded, and escapes any characters to make the
* string compatible with the JSON specification.
* @param raw The raw string to escape.
* @return The escaped string on success.
* @return std::nullopt if the string contains none-UTF8 encoded byte sequence.
* @return std::nullopt if the string contains any non-UTF-8-encoded byte sequences.
*/
[[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw
) -> std::optional<std::string>;
Expand All @@ -27,8 +27,7 @@ namespace clp::ffi {
/**
* Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using
* the given handler.
* @tparam EscapeHandler Method to optionally escape any ASCII character in the string. Signature:
* (std::string_view::const_iterator it_ascii_char) -> void
* @tparam EscapeHandler Method to optionally escape any ASCII character in the string.
* @param src
* @param escape_handler
* @return Whether the input is a valid UTF-8 encoded string.
Expand All @@ -39,25 +38,16 @@ generic_validate_utf8_string(std::string_view src, EscapeHandler escape_handler)

namespace utils_hpp {
/**
* Validates whether the given byte is a valid UTF-8 header with continuation bytes, and set code
* point and code point range accordingly.
* The valid code point range is defined as following:
* .----------------------------------------------------------.
* | Continuation Length | First Code Point | Last Code Point |
* |---------------------|------------------|-----------------|
* | 1 Byte | 0x80 | 0x7FF |
* | 2 Byte | 0x800 | 0xFFFF |
* | 3 Byte | 0x10000 | 0x10FFFF |
* |---------------------|------------------|-----------------|
* @param header Input byte to validate
* @param num_continuation_bytes Outputs the number of continuation bytes corresponded to the header
* byte, if the header is valid.
* @param code_point Outputs the code extracted from the header byte, if the header is valid.
* @param code_point_lower_bound Outputs the lower bound of the valid code point range corresponded
* with the header byte, if the header if valid.
* @param code_point_upper_bound Outputs the upper bound of the valid code point range corresponded
* with the header byte, if the header if valid.
* @return Whether the input byte is a valid header byte.
* Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character, parses
* the byte, and returns the parsed properties as well as associated properties.
* @param header Byte to validate.
* @param num_continuation_bytes Returns the number of continuation bytes expected.
* @param code_point Returns the code point bits parsed from the lead byte.
* @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8
* character.
* @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8
* character.
* @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character.
*/
[[nodiscard]] auto validate_header_byte_and_set_code_point(
uint8_t header,
Expand All @@ -75,16 +65,16 @@ namespace utils_hpp {

/*
* @param byte
* @return Whether the input byte is a valid UTF-8 continuation byte. A valid UTF-8 continuation
* byte should match 0b10xx_xxxx.
* @return Whether the input byte is a valid UTF-8 continuation byte.
*/
[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;

/**
* Updates the code point by applying the payload of the given continuation byte.
* Parses the code-point bits from the given continuation byte and combines them with the given
* code point.
* @param code_point
* @param continuation_byte
* @return Updated code point.
* @return The updated code point.
*/
[[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
} // namespace utils_hpp
Expand Down Expand Up @@ -130,7 +120,7 @@ auto generic_validate_utf8_string(std::string_view src, EscapeHandler escape_han
}

if (0 != num_continuation_bytes_to_validate) {
// Incomplete continuation byte sequence
// Incomplete UTF-8 character
return false;
}

Expand Down
11 changes: 6 additions & 5 deletions components/core/tests/test-ffi_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@ namespace {
* Gets an expected escaped string by first convert the raw string into a json string and then dumps
* the a printable string using nlohmann::json.
* @param raw
* @return Escaped string dumped by nlohmann::json, with surrounding '"' dropped.
* @return The input string after escaping any characters that are invalid in JSON strings.
*/
[[nodiscard]] auto get_expected_escaped_string(std::string_view raw) -> std::string;

/**
* Generates a UTF-8 encoded byte sequence of a given code point with the given number of
* continuation bytes. The range of the code point is not validated, which means the generated byte
* sequence can be overlong.
* Generates a UTF-8 encoded byte sequence with the given code point and number of continuation
* bytes. The range of the code point is not validated, which means the generated byte sequence can
* be invalid (overlong or exceeding the valid range of UTF-8 code points).
* @param code_point
* @param num_continuation_bytes
* @return The encoded UTF-8 byte sequence.
Expand Down Expand Up @@ -98,7 +98,7 @@ TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") {
actual = validate_and_escape_utf8_string(test_str);
REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));

// Test valid UTF8 chars with continuation bytes
// Test valid UTF-8 chars with continuation bytes
std::vector<std::string> const valid_utf8{
"\n",
"\xF0\xA0\x80\x8F", // https://en.wiktionary.org/wiki/%F0%A0%80%8F
Expand Down Expand Up @@ -147,6 +147,7 @@ TEST_CASE("escape_utf8_string_with_invalid_continuation", "[ffi][utils]") {
test_str = valid_utf8_byte_sequence;
constexpr char cInvalidHeaderByte{'\xFF'};
test_str.front() = cInvalidHeaderByte;
REQUIRE((false == is_utf8_encoded(test_str)));
REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));

// Test invalid continuation bytes
Expand Down

0 comments on commit 6696553

Please sign in to comment.