From 5ca00b71acec3116bc9a0336686e55204f248d7c Mon Sep 17 00:00:00 2001 From: Pablo Rodriguez Mira <36644554+PabloRMira@users.noreply.github.com> Date: Fri, 30 Apr 2021 23:29:23 +0200 Subject: [PATCH] [FIX] Table names that include the substring select in them break subsequent formatting (#167) --- docs/additional_tests.html | 101 ++++++++++++++++++++++++++++++++++ docs/utils.html | 38 ++++++------- nbs/00_core.ipynb | 15 +---- nbs/02_utils.ipynb | 7 ++- nbs/99_additional_tests.ipynb | 90 +++++++++++++++++++++++++++++- sql_formatter/core.py | 4 +- sql_formatter/utils.py | 6 +- 7 files changed, 223 insertions(+), 38 deletions(-) diff --git a/docs/additional_tests.html b/docs/additional_tests.html index 4ac6f9c..64fd8f3 100644 --- a/docs/additional_tests.html +++ b/docs/additional_tests.html @@ -119,6 +119,107 @@

format_sql + + {% endraw %} + + {% raw %} + +
+
+ +
+
+
assert_and_print(
+    format_sql(
+"""SELECT var 
+    FROM table_selection as a 
+    LEFT JOIN table2 as b ON a.id = b.id 
+    LEFT JOIN table3 as c ON a.id = c.id 
+    ORDER BY 1
+"""),
+"""
+SELECT var
+FROM   table_selection as a
+    LEFT JOIN table2 as b
+        ON a.id = b.id
+    LEFT JOIN table3 as c
+        ON a.id = c.id
+ORDER BY 1
+""".strip()
+)
+
+ +
+
+
+ +
+
+ +
+ +
+
SELECT var
+FROM   table_selection as a
+    LEFT JOIN table2 as b
+        ON a.id = b.id
+    LEFT JOIN table3 as c
+        ON a.id = c.id
+ORDER BY 1
+
+
+
+ +
+
+ +
+ {% endraw %} + +
+
+

utils

+
+
+
+
+
+

split_query

+
+
+
+ {% raw %} + +
+
+ +
+
+
assert_and_print(
+    split_query("select var from table_selection"),
+    [{"string": "select var ", "comment": False, "quote": False, "select": True}, 
+     {"string": "from table_selection", "comment": False, "quote": False, "select": False}]
+)
+
+ +
+
+
+ +
+
+ +
+ +
+
[{'string': 'select var ', 'comment': False, 'quote': False, 'select': True}, {'string': 'from table_selection', 'comment': False, 'quote': False, 'select': False}]
+
+
+
+ +
+
+
{% endraw %} diff --git a/docs/utils.html b/docs/utils.html index 22e3c50..6a27489 100644 --- a/docs/utils.html +++ b/docs/utils.html @@ -1256,7 +1256,7 @@

split_query -

split_apply_concat[source]

split_apply_concat(s, f)

+

split_apply_concat[source]

split_apply_concat(s, f)

Split query s, apply function f and concatenate strings

@@ -1330,7 +1330,7 @@

Split by comment / no
-

split_comment_quote[source]

split_comment_quote(s)

+

split_comment_quote[source]

split_comment_quote(s)

Split query s into dictionaries with keys 'string', 'comment' and 'quote'

@@ -1458,7 +1458,7 @@

Split by comment / non-comment -

split_comment[source]

split_comment(s)

+

split_comment[source]

split_comment(s)

Split query s into dictionaries with keys 'string', 'comment'

@@ -1542,7 +1542,7 @@

Get posit
-

identify_in_sql[source]

identify_in_sql(regex, s)

+

identify_in_sql[source]

identify_in_sql(regex, s)

Find positions of regex (str or list) in string s ignoring comment and text in quotes

@@ -1716,7 +1716,7 @@

Split individual queries ba
-

split_by_semicolon[source]

split_by_semicolon(s)

+

split_by_semicolon[source]

split_by_semicolon(s)

Split string s by semicolon but not between parenthesis or in comments

@@ -1801,7 +1801,7 @@

split_by_semicolon -

replace_newline_chars[source]

replace_newline_chars(s)

+

replace_newline_chars[source]

replace_newline_chars(s)

Replace newline characters in s by whitespace but not in the comments

@@ -1877,7 +1877,7 @@

Substitute regex i
-

sub_in_sql[source]

sub_in_sql(regex, repl, s)

+

sub_in_sql[source]

sub_in_sql(regex, repl, s)

Subsitute regex with repl in query s ignoring comments and text in quotes

@@ -1951,7 +1951,7 @@

Add whitespaces after comma -

add_whitespaces_after_comma[source]

add_whitespaces_after_comma(s)

+

add_whitespaces_after_comma[source]

add_whitespaces_after_comma(s)

Add whitespace after comma in query s if there is no whitespace

@@ -2096,7 +2096,7 @@

add_whitespaces_af
-

identify_end_of_fields[source]

identify_end_of_fields(s)

+

identify_end_of_fields[source]

identify_end_of_fields(s)

Identify end of fields in query s

@@ -2328,7 +2328,7 @@

identify_end_of_fields<
-

add_newline_indentation[source]

add_newline_indentation(s, indentation)

+

add_newline_indentation[source]

add_newline_indentation(s, indentation)

Add newline and indentation for end of fields in query s

@@ -2501,7 +2501,7 @@

Handling subqueries -

extract_outer_subquery[source]

extract_outer_subquery(s)

+

extract_outer_subquery[source]

extract_outer_subquery(s)

Extract outer subquery in query s

@@ -2554,7 +2554,7 @@

extract_outer_subquery<
-

format_subquery[source]

format_subquery(s, previous_s)

+

format_subquery[source]

format_subquery(s, previous_s)

Format subquery in line s based on indentation on previous_s

@@ -2592,7 +2592,7 @@

Query identification -

check_sql_query[source]

check_sql_query(s)

+

check_sql_query[source]

check_sql_query(s)

Checks whether s is a SQL query based on match of CREATE TABLE / VIEW or SELECT ignoring comments and text in quotes

@@ -2790,7 +2790,7 @@

Marker to not format
-

check_skip_marker[source]

check_skip_marker(s)

+

check_skip_marker[source]

check_skip_marker(s)

Checks whether user set marker /skip-formatter/ to not format query

@@ -2874,7 +2874,7 @@

Check lines were CREATE
-

identify_create_table_view[source]

identify_create_table_view(s)

+

identify_create_table_view[source]

identify_create_table_view(s)

Identify positions of CREATE .. TABLE / VIEW statements

@@ -2949,7 +2949,7 @@

identify_create_tab
-

count_lines[source]

count_lines(s)

+

count_lines[source]

count_lines(s)

Count the number of lines in s

@@ -3024,7 +3024,7 @@

count_lines -

find_line_number[source]

find_line_number(s, positions)

+

find_line_number[source]

find_line_number(s, positions)

Find line number in s out of positions

@@ -3099,7 +3099,7 @@

find_line_number -

disimilarity[source]

disimilarity(str1, str2)

+

disimilarity[source]

disimilarity(str1, str2)

Calculate disimilarity between two strings by word

@@ -3193,7 +3193,7 @@

disimilarity -

assign_comment[source]

assign_comment(fs, cds)

+

assign_comment[source]

assign_comment(fs, cds)

Assign comments in list of dictionaries cds to formatted string fs using Jaccard distance

The comment dictionaries cds should contain the keys "comment" and "preceding" (string)

diff --git a/nbs/00_core.ipynb b/nbs/00_core.ipynb index 0e1d394..359bf4f 100644 --- a/nbs/00_core.ipynb +++ b/nbs/00_core.ipynb @@ -4,16 +4,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "#hide\n", "%load_ext autoreload\n", @@ -465,8 +456,8 @@ " split_s = split_query(s) # split by comment and non comment\n", " split_s = compress_dicts(split_s, [\"comment\", \"select\"])\n", " # compile regex before loop\n", - " create_re = re.compile(\"create\", flags=re.I)\n", - " select_re = re.compile(\"select\", flags=re.I)\n", + " create_re = re.compile(r\"\\bcreate\\b\", flags=re.I)\n", + " select_re = re.compile(r\"\\bselect\\b\", flags=re.I)\n", " for statement in statements:\n", " if create_re.match(statement): # special case CREATE with AS capitalize as well\n", " create_sub = re.compile(rf\"\\s*({statement} )(.*) as\\b\", flags=re.I)\n", diff --git a/nbs/02_utils.ipynb b/nbs/02_utils.ipynb index 38afb51..9341b2e 100644 --- a/nbs/02_utils.ipynb +++ b/nbs/02_utils.ipynb @@ -911,9 +911,11 @@ " comment_region = False # start with non-quote\n", " s_comp = [] # container for string components\n", " start = 0\n", + " select_re = re.compile(r\"^[\\n\\s\\]\\(]*\\bselect\\b\\s$\")\n", + " from_re = re.compile(r\"^[\\n\\s\\]]\\bfrom\\b\\s$\")\n", " # loop over character positions\n", " for i, c in enumerate(s):\n", - " if s_low[i:i+6] == \"select\" and k == 0: # k = 0 -> no comment\n", + " if select_re.match(s_low[max(i-1, 0):i+7]) and k == 0: # k = 0 -> no comment\n", " s_comp.append({\n", " \"string\": s[start:i], \n", " \"comment\": comment_region, \n", @@ -922,7 +924,7 @@ " })\n", " start = i\n", " select_region = True # after select starts the select region\n", - " elif s_low[i:i+4] == \"from\" and k == 0:\n", + " elif from_re.match(s_low[max(i-1, 0):i+5]) and k == 0:\n", " select_open = False\n", " s_comp.append({\n", " \"string\": s[start:i], \n", @@ -2787,6 +2789,7 @@ "Converted 01_format_file.ipynb.\n", "Converted 02_utils.ipynb.\n", "Converted 03_validation.ipynb.\n", + "Converted 99_additional_tests.ipynb.\n", "Converted index.ipynb.\n" ] } diff --git a/nbs/99_additional_tests.ipynb b/nbs/99_additional_tests.ipynb index 413528e..aec8e75 100644 --- a/nbs/99_additional_tests.ipynb +++ b/nbs/99_additional_tests.ipynb @@ -103,7 +103,95 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT var\n", + "FROM table_selection as a\n", + " LEFT JOIN table2 as b\n", + " ON a.id = b.id\n", + " LEFT JOIN table3 as c\n", + " ON a.id = c.id\n", + "ORDER BY 1\n" + ] + } + ], + "source": [ + "assert_and_print(\n", + " format_sql(\n", + "\"\"\"SELECT var \n", + " FROM table_selection as a \n", + " LEFT JOIN table2 as b ON a.id = b.id \n", + " LEFT JOIN table3 as c ON a.id = c.id \n", + " ORDER BY 1\n", + "\"\"\"),\n", + "\"\"\"\n", + "SELECT var\n", + "FROM table_selection as a\n", + " LEFT JOIN table2 as b\n", + " ON a.id = b.id\n", + " LEFT JOIN table3 as c\n", + " ON a.id = c.id\n", + "ORDER BY 1\n", + "\"\"\".strip()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## utils" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### split_query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'string': 'select var ', 'comment': False, 'quote': False, 'select': True}, {'string': 'from table_selection', 'comment': False, 'quote': False, 'select': False}]\n" + ] + } + ], + "source": [ + "assert_and_print(\n", + " split_query(\"select var from table_selection\"),\n", + " [{\"string\": \"select var \", \"comment\": False, \"quote\": False, \"select\": True}, \n", + " {\"string\": \"from table_selection\", \"comment\": False, \"quote\": False, \"select\": False}]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Converted 00_core.ipynb.\n", + "Converted 01_format_file.ipynb.\n", + "Converted 02_utils.ipynb.\n", + "Converted 03_validation.ipynb.\n", + "Converted 99_additional_tests.ipynb.\n", + "Converted index.ipynb.\n" + ] + } + ], "source": [ "#hide\n", "from nbdev.export import notebook2script\n", diff --git a/sql_formatter/core.py b/sql_formatter/core.py index 63178d6..ab3c3fb 100644 --- a/sql_formatter/core.py +++ b/sql_formatter/core.py @@ -51,8 +51,8 @@ def preformat_statements(s): split_s = split_query(s) # split by comment and non comment split_s = compress_dicts(split_s, ["comment", "select"]) # compile regex before loop - create_re = re.compile("create", flags=re.I) - select_re = re.compile("select", flags=re.I) + create_re = re.compile(r"\bcreate\b", flags=re.I) + select_re = re.compile(r"\bselect\b", flags=re.I) for statement in statements: if create_re.match(statement): # special case CREATE with AS capitalize as well create_sub = re.compile(rf"\s*({statement} )(.*) as\b", flags=re.I) diff --git a/sql_formatter/utils.py b/sql_formatter/utils.py index 128a44d..a9d3723 100644 --- a/sql_formatter/utils.py +++ b/sql_formatter/utils.py @@ -218,9 +218,11 @@ def split_query(s): comment_region = False # start with non-quote s_comp = [] # container for string components start = 0 + select_re = re.compile(r"^[\n\s\]\(]*\bselect\b\s$") + from_re = re.compile(r"^[\n\s\]]\bfrom\b\s$") # loop over character positions for i, c in enumerate(s): - if s_low[i:i+6] == "select" and k == 0: # k = 0 -> no comment + if select_re.match(s_low[max(i-1, 0):i+7]) and k == 0: # k = 0 -> no comment s_comp.append({ "string": s[start:i], "comment": comment_region, @@ -229,7 +231,7 @@ def split_query(s): }) start = i select_region = True # after select starts the select region - elif s_low[i:i+4] == "from" and k == 0: + elif from_re.match(s_low[max(i-1, 0):i+5]) and k == 0: select_open = False s_comp.append({ "string": s[start:i],