From 558534ba054a2dc65241afbde74541f091321ccc Mon Sep 17 00:00:00 2001 From: Noemi <45180344+unflxw@users.noreply.github.com> Date: Fri, 20 Dec 2024 11:57:41 +0100 Subject: [PATCH 1/2] Add simple benchmark tool Since `#[bench]` is nightly-only, add a simple binary that can be modified to test the performance of specific queries as needed. --- src/bin/bench.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 src/bin/bench.rs diff --git a/src/bin/bench.rs b/src/bin/bench.rs new file mode 100644 index 0000000..821af21 --- /dev/null +++ b/src/bin/bench.rs @@ -0,0 +1,22 @@ +fn long_insert_into_values_query(rows: usize) -> String { + let mut query = r#"INSERT INTO "table_name" ("one","two","three") VALUES "#.to_owned(); + for i in 0..rows { + query.push_str(&format!("({}, {}, {}),", i, i + 1, i + 2)); + } + query.push_str("(0, 0, 0);"); + query +} + +extern crate sql_lexer; + +fn main() { + for rows in [200, 2000, 20000] { + println!("Sanitising long_insert_into_values_query with {rows} rows"); + let query = long_insert_into_values_query(rows); + let start = std::time::Instant::now(); + let output = sql_lexer::sanitize_string(query); + let elapsed = start.elapsed(); + println!("Output: {}", output); + println!("Elapsed: {:?}", elapsed); + } +} From 225b1c51b37b5ea8b239b77f212ace65f3b95694 Mon Sep 17 00:00:00 2001 From: Noemi <45180344+unflxw@users.noreply.github.com> Date: Fri, 20 Dec 2024 12:24:06 +0100 Subject: [PATCH 2/2] Avoid `Vec::remove` with `None` token Using `Vec::remove(i)` causes the `n` tokens between position `i` and the end of the vector to be shifted, which is an `O(n)` operation. When done in a loop, as it is done in our `INSERT INTO ... VALUES` repeated list sanitisation, it makes the overall performance `O(n^2)`. To avoid using `Vec::remove`, implement a `Token::None` value that represents a "non-token", a gap to be ignored in the token list. Replace calls to `Vec::remove(i)` with replacing the token at position `i` with `Token::None`. Replace calls to `Vec::insert` for the same reason -- luckily, when we insert a token, it's a replacement for a different token, so it's possible to rewrite them in that manner. These are rarely done in a loop, though, so the impact on sanitisation performance is much more limited. --- src/lib.rs | 4 ++++ src/sanitizer.rs | 29 +++++++++++++++++++++-------- src/writer.rs | 1 + 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index cae497f..b67f7b9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -137,8 +137,12 @@ pub enum Token { SquareBracketClose, Colon, Semicolon, + // Used by the sanitizer to replace values Placeholder, + // Used by the sanitizer to replace repeated value lists Ellipsis, + // Used by the sanitizer to remove tokens + None, Null, True, False, diff --git a/src/sanitizer.rs b/src/sanitizer.rs index b9ff10c..7f510d7 100644 --- a/src/sanitizer.rs +++ b/src/sanitizer.rs @@ -97,14 +97,17 @@ impl SqlSanitizer { if keep { kept += 1; } else { + let mut removed = 1; self.remove(pos + kept); while kept > 0 { kept -= 1; + removed += 1; self.remove(pos + kept); } + pos += removed; } } - self.sql.tokens.insert(start_pos, Token::Ellipsis); + self.ellipsis(start_pos); } Token::ParentheseClose | Token::SquareBracketClose => state = State::Default, Token::Dot if state == State::JoinOn => (), @@ -146,23 +149,28 @@ impl SqlSanitizer { match self.sql.tokens[pos] { Token::ParentheseClose => break, Token::SquareBracketClose => break, - _ => self.remove(pos), + _ => { + self.remove(pos); + pos += 1; + } } } - self.sql.tokens.insert(start_pos, Token::Placeholder); + self.placeholder(start_pos); } _ => (), } } // Remove comments Token::Comment(_) => { - self.sql.tokens.remove(pos); + self.remove(pos); if self.sql.tokens.get(pos - 1) == Some(&Token::Space) { - self.sql.tokens.remove(pos - 1); + self.remove(pos - 1); } } // Spaces don't influence the state by default Token::Space => (), + // Non-tokens don't influence the state + Token::None => (), // Keep state the same if we're in a insert values or keyword scope state _ if state == State::InsertValues || state == State::KeywordScopeStarted => (), // Reset state to default if there were no matches @@ -176,12 +184,17 @@ impl SqlSanitizer { } fn remove(&mut self, position: usize) { - self.sql.tokens.remove(position); + self.sql.tokens[position] = Token::None; } + // Replaces the token at position `position` with a placeholder. fn placeholder(&mut self, position: usize) { - self.sql.tokens.remove(position); - self.sql.tokens.insert(position, Token::Placeholder); + self.sql.tokens[position] = Token::Placeholder; + } + + // Replaces the token at position `position` with an ellipsis. + fn ellipsis(&mut self, position: usize) { + self.sql.tokens[position] = Token::Ellipsis; } } diff --git a/src/writer.rs b/src/writer.rs index 70aa426..2579ce2 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -171,6 +171,7 @@ impl SqlWriter { Token::Semicolon => out.push(';'), Token::Placeholder => out.push('?'), Token::Ellipsis => out.push_str("..."), + Token::None => {} Token::Null => out.push_str("NULL"), Token::True => out.push_str("TRUE"), Token::False => out.push_str("FALSE"),