From 83abf20b774a5d5354bada2fe044cc3dfaaf4daa Mon Sep 17 00:00:00 2001 From: Pablo Rodriguez Mira <36644554+PabloRMira@users.noreply.github.com> Date: Sat, 23 Jan 2021 22:33:16 +0100 Subject: [PATCH] [MNT] Robustify comment assignment (#141) --- docs/utils.html | 6 +++--- nbs/02_utils.ipynb | 17 +++++++++++------ sql_formatter/utils.py | 13 +++++++++---- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/docs/utils.html b/docs/utils.html index c84908e..ac3cc60 100644 --- a/docs/utils.html +++ b/docs/utils.html @@ -2776,7 +2776,7 @@
jaccard_distance
-assign_comment
[source]+
assign_comment
(fs
,cds
)
assign_comment
[source]
assign_comment
(fs
,cds
)Assign comments in list of dictionaries
cds
to formatted stringfs
using Jaccard distanceThe comment dictionaries
@@ -2818,8 +2818,8 @@cds
should contain the keys "comment" and "preceding" (string)
assign_comment
""".strip(), [ {"comment": "/* some comment */[C]", "preceding": "select asdf, qwer, "}, - {"comment": "-- comment there[C]", "preceding": "case when asdf = 1"}, - {"comment": "-- comment here[C]", "preceding": "and asdf = 2"}, + {"comment": "-- comment there[C]", "preceding": "case when asdf = 1 "}, + {"comment": "-- comment here[C]", "preceding": "and asdf = 2 "}, {"comment": "/* bla bla */[C]", "preceding": "then 2 when asdf = 3 then 3"} ] ), diff --git a/nbs/02_utils.ipynb b/nbs/02_utils.ipynb index 2217510..907c8f1 100644 --- a/nbs/02_utils.ipynb +++ b/nbs/02_utils.ipynb @@ -2141,8 +2141,12 @@ "#export\n", "def jaccard_distance(str1, str2):\n", " \"Calculate the Jaccard distance between two strings by word\"\n", - " set1 = set(str1.split())\n", - " set2 = set(str2.split())\n", + " split1 = re.split(r\"(?:\\s|,)\", str1)\n", + " split1 = [sp for sp in split1 if sp != \"\"]\n", + " split2 = re.split(r\"(?:\\s|,)\", str2)\n", + " split2 = [sp for sp in split2 if sp != \"\"] \n", + " set1 = set(split1)\n", + " set2 = set(split2)\n", " return float(len(set1 & set2) / len(set1 | set2))" ] }, @@ -2168,9 +2172,10 @@ " match_beginn_cs = re.compile(r\"^\\[CS\\]\")\n", " replace_select = re.compile(r\"\\b(?:select distinct |select )\", flags=re.I)\n", " # loop on comments to be assigned\n", - " for d in cds:\n", + " for i, d in enumerate(cds):\n", + " cum_preceding = \"\".join([d[\"preceding\"] for d in cds[0:i+1]])\n", " cp_list = [\n", - " jaccard_distance(replace_and_or.sub(\"\", s.strip()), d[\"preceding\"])\n", + " jaccard_distance(replace_and_or.sub(\"\", s.strip()), cum_preceding)\n", " for s in accumulate([s for s in fsplit_s], operator.add)\n", " ]\n", " # get line number with maximal jaccard distance (most similar)\n", @@ -2226,8 +2231,8 @@ "\"\"\".strip(),\n", " [\n", " {\"comment\": \"/* some comment */[C]\", \"preceding\": \"select asdf, qwer, \"},\n", - " {\"comment\": \"-- comment there[C]\", \"preceding\": \"case when asdf = 1\"},\n", - " {\"comment\": \"-- comment here[C]\", \"preceding\": \"and asdf = 2\"},\n", + " {\"comment\": \"-- comment there[C]\", \"preceding\": \"case when asdf = 1 \"},\n", + " {\"comment\": \"-- comment here[C]\", \"preceding\": \"and asdf = 2 \"},\n", " {\"comment\": \"/* bla bla */[C]\", \"preceding\": \"then 2 when asdf = 3 then 3\"}\n", " ]\n", " ),\n", diff --git a/sql_formatter/utils.py b/sql_formatter/utils.py index e2614e4..5b03171 100644 --- a/sql_formatter/utils.py +++ b/sql_formatter/utils.py @@ -601,8 +601,12 @@ def find_line_number(s, positions): # Cell def jaccard_distance(str1, str2): "Calculate the Jaccard distance between two strings by word" - set1 = set(str1.split()) - set2 = set(str2.split()) + split1 = re.split(r"(?:\s|,)", str1) + split1 = [sp for sp in split1 if sp != ""] + split2 = re.split(r"(?:\s|,)", str2) + split2 = [sp for sp in split2 if sp != ""] + set1 = set(split1) + set2 = set(split2) return float(len(set1 & set2) / len(set1 | set2)) # Cell @@ -621,9 +625,10 @@ def assign_comment(fs, cds): match_beginn_cs = re.compile(r"^\[CS\]") replace_select = re.compile(r"\b(?:select distinct |select )", flags=re.I) # loop on comments to be assigned - for d in cds: + for i, d in enumerate(cds): + cum_preceding = "".join([d["preceding"] for d in cds[0:i+1]]) cp_list = [ - jaccard_distance(replace_and_or.sub("", s.strip()), d["preceding"]) + jaccard_distance(replace_and_or.sub("", s.strip()), cum_preceding) for s in accumulate([s for s in fsplit_s], operator.add) ] # get line number with maximal jaccard distance (most similar)