-
Notifications
You must be signed in to change notification settings - Fork 72
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add support for validating and escaping UTF-8 strings. (#453)
Co-authored-by: kirkrodrigues <[email protected]>
- Loading branch information
1 parent
8a2c0a8
commit 249816b
Showing
6 changed files
with
533 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
#include "utils.hpp" | ||
|
||
#include <array> | ||
#include <cstddef> | ||
#include <cstdint> | ||
#include <cstdio> | ||
#include <optional> | ||
#include <string> | ||
#include <string_view> | ||
#include <tuple> | ||
|
||
#include "../utf8_utils.hpp" | ||
|
||
using std::string; | ||
using std::string_view; | ||
|
||
namespace clp::ffi { | ||
/**
 * Validates that `raw` is UTF-8 encoded and produces a copy with the characters escaped by
 * `validate_and_append_escaped_utf8_string`.
 * @param raw The raw string to validate and escape.
 * @return The escaped string if `raw` is valid UTF-8.
 * @return std::nullopt otherwise.
 */
auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
    string escaped;
    // Reserve 1.5x the input size up front to leave headroom for escape sequences.
    escaped.reserve(raw.size() + (raw.size() / 2));
    if (validate_and_append_escaped_utf8_string(raw, escaped)) {
        return escaped;
    }
    return std::nullopt;
}
|
||
auto validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool { | ||
string_view::const_iterator next_char_to_copy_it{src.cbegin()}; | ||
|
||
auto escape_handler = [&](string_view::const_iterator it) -> void { | ||
// Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the last byte | ||
// used by `snprintf` to append '\0' | ||
constexpr size_t cControlCharacterBufSize{7}; | ||
std::array<char, cControlCharacterBufSize> buf{}; | ||
std::string_view escaped_char; | ||
bool escape_required{true}; | ||
switch (*it) { | ||
case '\b': | ||
escaped_char = "\\b"; | ||
break; | ||
case '\t': | ||
escaped_char = "\\t"; | ||
break; | ||
case '\n': | ||
escaped_char = "\\n"; | ||
break; | ||
case '\f': | ||
escaped_char = "\\f"; | ||
break; | ||
case '\r': | ||
escaped_char = "\\r"; | ||
break; | ||
case '\\': | ||
escaped_char = "\\\\"; | ||
break; | ||
case '"': | ||
escaped_char = "\\\""; | ||
break; | ||
default: { | ||
constexpr uint8_t cLargestControlCharacter{0x1F}; | ||
auto const byte{static_cast<uint8_t>(*it)}; | ||
if (cLargestControlCharacter >= byte) { | ||
std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", byte); | ||
escaped_char = {buf.data(), buf.size() - 1}; | ||
} else { | ||
escape_required = false; | ||
} | ||
break; | ||
} | ||
} | ||
if (escape_required) { | ||
dst.append(next_char_to_copy_it, it); | ||
dst += escaped_char; | ||
next_char_to_copy_it = it + 1; | ||
} | ||
}; | ||
|
||
if (false == validate_utf8_string(src, escape_handler)) { | ||
return false; | ||
} | ||
|
||
if (src.cend() != next_char_to_copy_it) { | ||
dst.append(next_char_to_copy_it, src.cend()); | ||
} | ||
|
||
return true; | ||
} | ||
} // namespace clp::ffi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#ifndef CLP_FFI_UTILS_HPP | ||
#define CLP_FFI_UTILS_HPP | ||
|
||
#include <optional> | ||
#include <string> | ||
#include <string_view> | ||
|
||
namespace clp::ffi { | ||
/** | ||
* Validates whether the given string is UTF-8 encoded, and escapes any characters to make the | ||
* string compatible with the JSON specification. | ||
* @param raw The raw string to escape. | ||
* @return The escaped string on success. | ||
* @return std::nullopt if the string contains any non-UTF-8-encoded byte sequences. | ||
*/ | ||
[[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw | ||
) -> std::optional<std::string>; | ||
|
||
/** | ||
* Validates whether `src` is UTF-8 encoded, and appends `src` to `dst` while escaping any | ||
* characters to make the appended string compatible with the JSON specification. | ||
* @param src The string to validate and escape. | ||
* @param dst Returns `dst` with an escaped version of `src` appended. | ||
* @return Whether `src` is a valid UTF-8-encoded string. NOTE: Even if `src` is not UTF-8 encoded, | ||
* `dst` may be modified. | ||
*/ | ||
[[nodiscard]] auto | ||
validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool; | ||
} // namespace clp::ffi | ||
|
||
#endif // CLP_FFI_UTILS_HPP |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#include "utf8_utils.hpp" | ||
|
||
#include <cstddef> | ||
#include <cstdint> | ||
#include <string_view> | ||
|
||
namespace clp { | ||
auto is_utf8_encoded(std::string_view str) -> bool { | ||
auto escape_handler = []([[maybe_unused]] std::string_view::const_iterator it) -> void {}; | ||
return validate_utf8_string(str, escape_handler); | ||
} | ||
|
||
namespace utf8_utils_internal { | ||
auto parse_and_validate_lead_byte( | ||
uint8_t byte, | ||
size_t& num_continuation_bytes, | ||
uint32_t& code_point, | ||
uint32_t& code_point_lower_bound, | ||
uint32_t& code_point_upper_bound | ||
) -> bool { | ||
if ((byte & cFourByteUtf8CharHeaderMask) == cFourByteUtf8CharHeader) { | ||
num_continuation_bytes = 3; | ||
code_point = (~cFourByteUtf8CharHeaderMask & byte); | ||
code_point_lower_bound = cFourByteUtf8CharCodePointLowerBound; | ||
code_point_upper_bound = cFourByteUtf8CharCodePointUpperBound; | ||
} else if ((byte & cThreeByteUtf8CharHeaderMask) == cThreeByteUtf8CharHeader) { | ||
num_continuation_bytes = 2; | ||
code_point = (~cThreeByteUtf8CharHeaderMask & byte); | ||
code_point_lower_bound = cThreeByteUtf8CharCodePointLowerBound; | ||
code_point_upper_bound = cThreeByteUtf8CharCodePointUpperBound; | ||
} else if ((byte & cTwoByteUtf8CharHeaderMask) == cTwoByteUtf8CharHeader) { | ||
num_continuation_bytes = 1; | ||
code_point = (~cTwoByteUtf8CharHeaderMask & byte); | ||
code_point_lower_bound = cTwoByteUtf8CharCodePointLowerBound; | ||
code_point_upper_bound = cTwoByteUtf8CharCodePointUpperBound; | ||
} else { | ||
return false; | ||
} | ||
return true; | ||
} | ||
|
||
auto is_ascii_char(uint8_t byte) -> bool { | ||
return cOneByteUtf8CharCodePointUpperBound >= byte; | ||
} | ||
|
||
auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool { | ||
return (byte & cUtf8ContinuationByteMask) == cUtf8ContinuationByteHeader; | ||
} | ||
|
||
auto parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t { | ||
return (code_point << cUtf8NumContinuationByteCodePointBits) | ||
+ (continuation_byte & cUtf8ContinuationByteCodePointMask); | ||
} | ||
} // namespace utf8_utils_internal | ||
} // namespace clp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
#ifndef CLP_UTF8_UTILS_HPP | ||
#define CLP_UTF8_UTILS_HPP | ||
|
||
#include <cstddef>
#include <cstdint>
#include <string_view>
#include <type_traits>
|
||
namespace clp { | ||
// Constants
// NOTE: These are declared `inline constexpr` since they live in a header: each constant is then a
// single entity shared by every translation unit rather than one internal-linkage copy per unit.

// Lead byte signature: a lead byte `b` starts an N-byte character if `(b & Mask) == Header`.
inline constexpr uint8_t cTwoByteUtf8CharHeaderMask{0xE0};  // 0b111x_xxxx
inline constexpr uint8_t cTwoByteUtf8CharHeader{0xC0};  // 0b110x_xxxx
inline constexpr uint8_t cThreeByteUtf8CharHeaderMask{0xF0};  // 0b1111_xxxx
inline constexpr uint8_t cThreeByteUtf8CharHeader{0xE0};  // 0b1110_xxxx
inline constexpr uint8_t cFourByteUtf8CharHeaderMask{0xF8};  // 0b1111_1xxx
inline constexpr uint8_t cFourByteUtf8CharHeader{0xF0};  // 0b1111_0xxx

// Code point ranges (inclusive)
inline constexpr uint32_t cOneByteUtf8CharCodePointLowerBound{0};
inline constexpr uint32_t cOneByteUtf8CharCodePointUpperBound{0x7F};
inline constexpr uint32_t cTwoByteUtf8CharCodePointLowerBound{0x80};
inline constexpr uint32_t cTwoByteUtf8CharCodePointUpperBound{0x7FF};
inline constexpr uint32_t cThreeByteUtf8CharCodePointLowerBound{0x800};
inline constexpr uint32_t cThreeByteUtf8CharCodePointUpperBound{0xFFFF};
inline constexpr uint32_t cFourByteUtf8CharCodePointLowerBound{0x1'0000};
inline constexpr uint32_t cFourByteUtf8CharCodePointUpperBound{0x10'FFFF};

// Continuation byte: `(b & Mask) == Header` identifies one; `b & CodePointMask` extracts its
// payload bits.
inline constexpr uint32_t cUtf8ContinuationByteMask{0xC0};
inline constexpr uint32_t cUtf8ContinuationByteHeader{0x80};
inline constexpr uint32_t cUtf8ContinuationByteCodePointMask{0x3F};
inline constexpr uint8_t cUtf8NumContinuationByteCodePointBits{6};
|
||
/** | ||
* Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using | ||
* the given handler. | ||
* @tparam EscapeHandler Method to optionally escape any ASCII character in the string. | ||
* @param src | ||
* @param escape_handler | ||
* @return Whether the input is a valid UTF-8 encoded string. | ||
*/ | ||
template <typename EscapeHandler> | ||
requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator> | ||
[[nodiscard]] auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool; | ||
|
||
/** | ||
* @param str | ||
* @return Whether the input is a valid UTF-8 encoded string. | ||
*/ | ||
[[nodiscard]] auto is_utf8_encoded(std::string_view str) -> bool; | ||
|
||
namespace utf8_utils_internal { | ||
/** | ||
* Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character, parses | ||
* the byte, and returns the parsed properties as well as associated properties. | ||
* @param byte Byte to validate. | ||
* @param num_continuation_bytes Returns the number of continuation bytes expected. | ||
* @param code_point Returns the code point bits parsed from the lead byte. | ||
* @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8 | ||
* character. | ||
* @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8 | ||
* character. | ||
* @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character. | ||
*/ | ||
[[nodiscard]] auto parse_and_validate_lead_byte( | ||
uint8_t byte, | ||
size_t& num_continuation_bytes, | ||
uint32_t& code_point, | ||
uint32_t& code_point_lower_bound, | ||
uint32_t& code_point_upper_bound | ||
) -> bool; | ||
|
||
/** | ||
* @param byte | ||
* @return Whether the given byte is a valid ASCII character. | ||
*/ | ||
[[nodiscard]] auto is_ascii_char(uint8_t byte) -> bool; | ||
|
||
/**
* @param byte | ||
* @return Whether the input byte is a valid UTF-8 continuation byte. | ||
*/ | ||
[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool; | ||
|
||
/** | ||
* Parses the code-point bits from the given continuation byte and combines them with the given | ||
* code point. | ||
* @param code_point | ||
* @param continuation_byte | ||
* @return The updated code point. | ||
*/ | ||
[[nodiscard]] auto | ||
parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t; | ||
} // namespace utf8_utils_internal | ||
|
||
template <typename EscapeHandler> | ||
requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator> | ||
auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool { | ||
size_t num_continuation_bytes_to_validate{0}; | ||
uint32_t code_point{}; | ||
uint32_t code_point_lower_bound{}; | ||
uint32_t code_point_upper_bound{}; | ||
|
||
// NOLINTNEXTLINE(readability-qualified-auto) | ||
for (auto it{src.cbegin()}; it != src.cend(); ++it) { | ||
auto const byte{static_cast<uint8_t>(*it)}; | ||
if (0 == num_continuation_bytes_to_validate) { | ||
if (utf8_utils_internal::is_ascii_char(byte)) { | ||
escape_handler(it); | ||
} else if (false | ||
== utf8_utils_internal::parse_and_validate_lead_byte( | ||
byte, | ||
num_continuation_bytes_to_validate, | ||
code_point, | ||
code_point_lower_bound, | ||
code_point_upper_bound | ||
)) | ||
{ | ||
return false; | ||
} | ||
} else { | ||
if (false == utf8_utils_internal::is_valid_utf8_continuation_byte(byte)) { | ||
return false; | ||
} | ||
code_point = utf8_utils_internal::parse_continuation_byte(code_point, byte); | ||
--num_continuation_bytes_to_validate; | ||
if (0 == num_continuation_bytes_to_validate | ||
&& (code_point < code_point_lower_bound || code_point_upper_bound < code_point)) | ||
{ | ||
return false; | ||
} | ||
} | ||
} | ||
|
||
if (0 != num_continuation_bytes_to_validate) { | ||
// Incomplete UTF-8 character | ||
return false; | ||
} | ||
|
||
return true; | ||
} | ||
} // namespace clp | ||
|
||
#endif // CLP_UTF8_UTILS_HPP |
Oops, something went wrong.