From 0eaddf2aaa8def53276e7d8f7697d6473b37b594 Mon Sep 17 00:00:00 2001 From: Colin Rofls Date: Mon, 17 Jun 2024 13:33:13 -0400 Subject: [PATCH] [text-format] Fix parsing of string literals This renames `next_byte_value` to `next_str_lit_bytes`, which may now return between 1 and 4 bytes per call, reflecting the variable-length nature of the UTF-8 encoding. --- protobuf-support/src/lexer/lexer_impl.rs | 74 ++++++++++++++++++------ protobuf-support/src/lexer/str_lit.rs | 8 +-- 2 files changed, 60 insertions(+), 22 deletions(-) diff --git a/protobuf-support/src/lexer/lexer_impl.rs b/protobuf-support/src/lexer/lexer_impl.rs index f0d6a9609..0add903c1 100644 --- a/protobuf-support/src/lexer/lexer_impl.rs +++ b/protobuf-support/src/lexer/lexer_impl.rs @@ -67,6 +67,15 @@ impl From for LexerError { } } +/// The raw bytes for a single char or escape sequence in a string literal +/// +/// The raw bytes are available via an `into_iter` implementation. +pub struct DecodedBytes { + // a single char can be up to 4 bytes when encoded in UTF-8 + buf: [u8; 4], + len: u8, +} + #[derive(Copy, Clone)] pub struct Lexer<'a> { language: ParserLanguage, @@ -440,24 +449,24 @@ impl<'a> Lexer<'a> { // octEscape = '\' octalDigit octalDigit octalDigit // charEscape = '\' ( "a" | "b" | "f" | "n" | "r" | "t" | "v" | '\' | "'" | '"' ) // quote = "'" | '"' - pub fn next_byte_value(&mut self) -> LexerResult<u8> { + pub fn next_str_lit_bytes(&mut self) -> LexerResult<DecodedBytes> { match self.next_char()? { '\\' => { match self.next_char()? 
{ - '\'' => Ok(b'\''), - '"' => Ok(b'"'), - '\\' => Ok(b'\\'), - 'a' => Ok(b'\x07'), - 'b' => Ok(b'\x08'), - 'f' => Ok(b'\x0c'), - 'n' => Ok(b'\n'), - 'r' => Ok(b'\r'), - 't' => Ok(b'\t'), - 'v' => Ok(b'\x0b'), + '\'' => Ok(b'\''.into()), + '"' => Ok(b'"'.into()), + '\\' => Ok(b'\\'.into()), + 'a' => Ok(b'\x07'.into()), + 'b' => Ok(b'\x08'.into()), + 'f' => Ok(b'\x0c'.into()), + 'n' => Ok(b'\n'.into()), + 'r' => Ok(b'\r'.into()), + 't' => Ok(b'\t'.into()), + 'v' => Ok(b'\x0b'.into()), 'x' => { let d1 = self.next_hex_digit()? as u8; let d2 = self.next_hex_digit()? as u8; - Ok(((d1 << 4) | d2) as u8) + Ok((((d1 << 4) | d2) as u8).into()) } d if d >= '0' && d <= '7' => { let mut r = d as u8 - b'0'; @@ -467,16 +476,14 @@ impl<'a> Lexer<'a> { Ok(d) => r = (r << 3) + d as u8, } } - Ok(r) + Ok(r.into()) } // https://github.com/google/protobuf/issues/4562 - // TODO: overflow - c => Ok(c as u8), + c => Ok(c.into()), } } '\n' | '\0' => Err(LexerError::IncorrectInput), - // TODO: check overflow - c => Ok(c as u8), + c => Ok(c.into()), } } @@ -530,7 +537,7 @@ impl<'a> Lexer<'a> { }; first = false; while self.lookahead_char() != Some(q) { - self.next_byte_value()?; + self.next_str_lit_bytes()?; } self.next_char_expect_eq(q)?; @@ -663,6 +670,37 @@ impl<'a> Lexer<'a> { } } +impl From<u8> for DecodedBytes { + fn from(value: u8) -> Self { + DecodedBytes { + buf: [value, 0, 0, 0], + len: 1, + } + } +} + +impl From<char> for DecodedBytes { + fn from(value: char) -> Self { + let mut this = DecodedBytes { + buf: [0; 4], + len: 0, + }; + let len = value.encode_utf8(&mut this.buf).len(); + this.len = len as _; + this + } +} + +// Implementing `IntoIterator` means that `DecodedBytes` works with `Vec::extend`. 
+impl IntoIterator for DecodedBytes { + type Item = u8; + type IntoIter = std::iter::Take<std::array::IntoIter<u8, 4>>; + + fn into_iter(self) -> Self::IntoIter { + self.buf.into_iter().take(self.len as _) + } +} + #[cfg(test)] mod test { use super::*; diff --git a/protobuf-support/src/lexer/str_lit.rs b/protobuf-support/src/lexer/str_lit.rs index 0e51a16bf..840c9eb33 100644 --- a/protobuf-support/src/lexer/str_lit.rs +++ b/protobuf-support/src/lexer/str_lit.rs @@ -32,9 +32,9 @@ impl StrLit { let mut lexer = Lexer::new(&self.escaped, ParserLanguage::Json); let mut r = Vec::new(); while !lexer.eof() { - r.push( + r.extend( lexer - .next_byte_value() + .next_str_lit_bytes() .map_err(|_| StrLitDecodeError::OtherError)?, ); } @@ -45,9 +45,9 @@ impl StrLit { let mut lexer = Lexer::new(&self.escaped, ParserLanguage::Json); let mut r = Vec::new(); while !lexer.eof() { - r.push( + r.extend( lexer - .next_byte_value() + .next_str_lit_bytes() .map_err(|_| StrLitDecodeError::OtherError)?, ); }