From 111051707458151273ccd7a3c85a5d737745d1d8 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Wed, 17 Jul 2024 10:52:59 +0200 Subject: [PATCH 01/14] rewrote squeeze in Rascal with reified classes --- src/org/rascalmpl/library/String.rsc | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc index de466de5272..58713a0a1c0 100644 --- a/src/org/rascalmpl/library/String.rsc +++ b/src/org/rascalmpl/library/String.rsc @@ -20,6 +20,7 @@ module String extend Exception; import List; +import ParseTree; @synopsis{All functions in this module that have a charset parameter use this as default.} private str DEFAULT_CHARSET = "UTF-8"; @@ -522,8 +523,29 @@ squeeze("hello", "el"); ``` } @javaClass{org.rascalmpl.library.Prelude} +@deprecated{Use the other squeence function that accepts Rascal character classes.} public java str squeeze(str src, str charSet); +@synopsis{Squeeze repeated occurrences of characters.} +@description{ +Squeeze repeated occurrences in `src` of characters, if they are a member of `charSet`, removed. + +* `src` is any string +* `&CharClass` is a character class type such as `[a-z]` (a type that is a subtype of the class of all characters `![]`) +} +@pitfalls{ +* `![]` excludes the `0` character, so we can never squeeze the unicode codepoint `0`. We _can_ squeeze the number `0` of course, using `#[0-9]` for example. +} +@examples{ +```rascal-shell +import String; +squeeze("hello", #[el]); +``` +} +public str squeeze(str src, type[&CharClass <: ![]] _) = visit(src) { + case /+/ => c + when &CharClass _ := Tree::char(charAt(c, 0)) +}; @synopsis{Split a string into a list of strings based on a literal separator.} From 2835f74235e64bf7393ff4a53731383723570e82 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Wed, 17 Jul 2024 18:25:03 +0200 Subject: [PATCH 02/14] fixed problem with bound --- src/org/rascalmpl/library/String.rsc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc index 58713a0a1c0..30593c3fb04 100644 --- a/src/org/rascalmpl/library/String.rsc +++ b/src/org/rascalmpl/library/String.rsc @@ -528,7 +528,7 @@ public java str squeeze(str src, str charSet); @synopsis{Squeeze repeated occurrences of characters.} @description{ -Squeeze repeated occurrences in `src` of characters, if they are a member of `charSet`, removed. +Squeeze repeated occurrences in `src` of characters, if they are a member of `&CharClass`, removed. * `src` is any string * `&CharClass` is a character class type such as `[a-z]` (a type that is a subtype of the class of all characters `![]`) @@ -542,7 +542,7 @@ import String; squeeze("hello", #[el]); ``` } -public str squeeze(str src, type[&CharClass <: ![]] _) = visit(src) { +public str squeeze(str src, type[&CharClass] _:type[![]] _) = visit(src) { case /+/ => c when &CharClass _ := Tree::char(charAt(c, 0)) }; From 983fc2f52ce92429c6bdab34ac8841a9b8180b3b Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Wed, 17 Jul 2024 18:33:38 +0200 Subject: [PATCH 03/14] duplicated tests for the other squeeze function --- .../library/lang/rascal/tests/basic/Strings1.rsc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc b/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc index 0a679380687..c50e0bd14b1 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc @@ -264,6 +264,22 @@ test bool tstSqueezeCase3() = squeeze("aabcc", "a-c") == "abc"; test bool tstSqueezeCase4() = squeeze("aabbcc", "a-c") == "abc"; test bool tstSqueezeCase5() = squeeze("aaabc", "a-c") == "abc"; +// second squeeze +test bool tstSqueeze1(str S) = // !:= squeeze(S, #[a-zA-Z]); +test bool tstSqueeze2(str S) = squeeze(S, #[]) == S; +test bool tstSqueeze3(str S) { + if (// := S) { + return // := squeeze(S, #[0-9]); + } + return true; +} + +test bool tstSqueezeUnicode() = squeeze("Hi ๐Ÿ๐ŸWorld", #[๐Ÿ]) == "Hi ๐ŸWorld"; +test bool tstSqueezeCase1() = squeeze("abc", #[a-c]) == "abc"; +test bool tstSqueezeCase2() = squeeze("aabc", #[a-c]) == "abc"; +test bool tstSqueezeCase3() = squeeze("aabcc", #[a-c]) == "abc"; +test bool tstSqueezeCase4() = squeeze("aabbcc", #[a-c]) == "abc"; +test bool tstSqueezeCase5() = squeeze("aaabc", #[a-c]) == "abc"; test bool tstStartsWith(str S1, str S2) = startsWith(S1+S2, S1); From 58b2283e2630ee6096f8063922eb68c0d65d6d81 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Thu, 18 Jul 2024 09:26:18 +0200 Subject: [PATCH 04/14] this fixes #2005 for the existing squeeze function on strings, but the new one is still broken on unicode for a different reason --- .../interpreter/TraversalEvaluator.java | 17 +++++++++-------- .../lang/rascal/tests/basic/Strings1.rsc | 18 +++++++++--------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/org/rascalmpl/interpreter/TraversalEvaluator.java b/src/org/rascalmpl/interpreter/TraversalEvaluator.java index 612699e119e..505432e64d3 100644 --- a/src/org/rascalmpl/interpreter/TraversalEvaluator.java +++ b/src/org/rascalmpl/interpreter/TraversalEvaluator.java @@ -45,6 +45,7 @@ import org.rascalmpl.interpreter.utils.Cases.CaseBlock; import org.rascalmpl.types.RascalType; import org.rascalmpl.interpreter.utils.Names; +import org.rascalmpl.values.IRascalValueFactory; import org.rascalmpl.values.RascalValueFactory; import org.rascalmpl.values.parsetrees.ITree; import org.rascalmpl.values.parsetrees.TreeAdapter; @@ -689,11 +690,11 @@ public IValue traverseTop(IValue subject, CaseBlockList casesOrRules, TraverseRe * Performance issue: we create a lot of garbage by producing all these substrings. 
*/ public IValue traverseString(IValue subject, CaseBlockList casesOrRules, TraverseResult tr){ - String subjectString = ((IString) subject).getValue(); + IString subjectString = (IString) subject; int len = subjectString.length(); int subjectCursor = 0; int subjectCursorForResult = 0; - StringBuffer replacementString = null; + IString replacementString = null; boolean hasMatched = false; boolean hasChanged = false; @@ -703,7 +704,7 @@ public IValue traverseString(IValue subject, CaseBlockList casesOrRules, Travers while (subjectCursor < len) { try { - IString substring = eval.getValueFactory().string(subjectString.substring(subjectCursor, len)); + IString substring = subjectString.substring(subjectCursor, len); IValue subresult = substring; tr.matched = false; tr.changed = false; @@ -741,16 +742,16 @@ else if (lastPattern instanceof LiteralPattern || lastPattern instanceof TypedVa // Create replacementString when this is the first replacement if (replacementString == null) { - replacementString = new StringBuffer(); + replacementString = IRascalValueFactory.getInstance().string(""); } // Copy string before the match to the replacement string for (; subjectCursorForResult < subjectCursor + start; subjectCursorForResult++){ - replacementString.append(subjectString.charAt(subjectCursorForResult)); + replacementString = replacementString.concat(IRascalValueFactory.getInstance().string(subjectString.charAt(subjectCursorForResult))); } subjectCursorForResult = subjectCursor + end; // Copy replacement into replacement string - replacementString.append(((IString)repl).getValue()); + replacementString = replacementString.concat((IString) repl); tr.matched = true; tr.changed = true; @@ -770,9 +771,9 @@ else if (lastPattern instanceof LiteralPattern || lastPattern instanceof TypedVa // Copy remaining characters of subject string into replacement string for (; subjectCursorForResult < len; subjectCursorForResult++){ - 
replacementString.append(subjectString.charAt(subjectCursorForResult)); + replacementString = replacementString.concat(IRascalValueFactory.getInstance().string(subjectString.charAt(subjectCursorForResult))); } - return eval.getValueFactory().string(replacementString.toString()); + return replacementString; } } diff --git a/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc b/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc index c50e0bd14b1..652b897c330 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc @@ -265,21 +265,21 @@ test bool tstSqueezeCase4() = squeeze("aabbcc", "a-c") == "abc"; test bool tstSqueezeCase5() = squeeze("aaabc", "a-c") == "abc"; // second squeeze -test bool tstSqueeze1(str S) = // !:= squeeze(S, #[a-zA-Z]); -test bool tstSqueeze2(str S) = squeeze(S, #[]) == S; -test bool tstSqueeze3(str S) { +test bool tstSqueeze21(str S) = // !:= squeeze(S, #[a-zA-Z]); +test bool tstSqueeze22(str S) = squeeze(S, #[]) == S; +test bool tstSqueeze23(str S) { if (// := S) { return // := squeeze(S, #[0-9]); } return true; } -test bool tstSqueezeUnicode() = squeeze("Hi ๐Ÿ๐ŸWorld", #[๐Ÿ]) == "Hi ๐ŸWorld"; -test bool tstSqueezeCase1() = squeeze("abc", #[a-c]) == "abc"; -test bool tstSqueezeCase2() = squeeze("aabc", #[a-c]) == "abc"; -test bool tstSqueezeCase3() = squeeze("aabcc", #[a-c]) == "abc"; -test bool tstSqueezeCase4() = squeeze("aabbcc", #[a-c]) == "abc"; -test bool tstSqueezeCase5() = squeeze("aaabc", #[a-c]) == "abc"; +test bool tstSqueeze2Unicode() = squeeze("Hi ๐Ÿ๐ŸWorld", #[๐Ÿ]) == "Hi ๐ŸWorld"; +test bool tstSqueeze2Case1() = squeeze("abc", #[a-c]) == "abc"; +test bool tstSqueeze2Case2() = squeeze("aabc", #[a-c]) == "abc"; +test bool tstSqueeze2Case3() = squeeze("aabcc", #[a-c]) == "abc"; +test bool tstSqueeze2Case4() = squeeze("aabbcc", #[a-c]) == "abc"; +test bool tstSqueeze2Case5() = squeeze("aaabc", #[a-c]) == 
"abc"; test bool tstStartsWith(str S1, str S2) = startsWith(S1+S2, S1); From 67568f2fa11563a379347c3c15213a6f224bb787 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Sun, 21 Jul 2024 11:12:18 +0200 Subject: [PATCH 05/14] Revert "this fixes #2005 for the existing squeeze function on strings, but the new one is still broken on unicode for a different reason" This reverts commit 58b2283e2630ee6096f8063922eb68c0d65d6d81. --- .../interpreter/TraversalEvaluator.java | 17 ++++++++--------- .../lang/rascal/tests/basic/Strings1.rsc | 18 +++++++++--------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/org/rascalmpl/interpreter/TraversalEvaluator.java b/src/org/rascalmpl/interpreter/TraversalEvaluator.java index 505432e64d3..612699e119e 100644 --- a/src/org/rascalmpl/interpreter/TraversalEvaluator.java +++ b/src/org/rascalmpl/interpreter/TraversalEvaluator.java @@ -45,7 +45,6 @@ import org.rascalmpl.interpreter.utils.Cases.CaseBlock; import org.rascalmpl.types.RascalType; import org.rascalmpl.interpreter.utils.Names; -import org.rascalmpl.values.IRascalValueFactory; import org.rascalmpl.values.RascalValueFactory; import org.rascalmpl.values.parsetrees.ITree; import org.rascalmpl.values.parsetrees.TreeAdapter; @@ -690,11 +689,11 @@ public IValue traverseTop(IValue subject, CaseBlockList casesOrRules, TraverseRe * Performance issue: we create a lot of garbage by producing all these substrings. 
*/ public IValue traverseString(IValue subject, CaseBlockList casesOrRules, TraverseResult tr){ - IString subjectString = (IString) subject; + String subjectString = ((IString) subject).getValue(); int len = subjectString.length(); int subjectCursor = 0; int subjectCursorForResult = 0; - IString replacementString = null; + StringBuffer replacementString = null; boolean hasMatched = false; boolean hasChanged = false; @@ -704,7 +703,7 @@ public IValue traverseString(IValue subject, CaseBlockList casesOrRules, Travers while (subjectCursor < len) { try { - IString substring = subjectString.substring(subjectCursor, len); + IString substring = eval.getValueFactory().string(subjectString.substring(subjectCursor, len)); IValue subresult = substring; tr.matched = false; tr.changed = false; @@ -742,16 +741,16 @@ else if (lastPattern instanceof LiteralPattern || lastPattern instanceof TypedVa // Create replacementString when this is the first replacement if (replacementString == null) { - replacementString = IRascalValueFactory.getInstance().string(""); + replacementString = new StringBuffer(); } // Copy string before the match to the replacement string for (; subjectCursorForResult < subjectCursor + start; subjectCursorForResult++){ - replacementString = replacementString.concat(IRascalValueFactory.getInstance().string(subjectString.charAt(subjectCursorForResult))); + replacementString.append(subjectString.charAt(subjectCursorForResult)); } subjectCursorForResult = subjectCursor + end; // Copy replacement into replacement string - replacementString = replacementString.concat((IString) repl); + replacementString.append(((IString)repl).getValue()); tr.matched = true; tr.changed = true; @@ -771,9 +770,9 @@ else if (lastPattern instanceof LiteralPattern || lastPattern instanceof TypedVa // Copy remaining characters of subject string into replacement string for (; subjectCursorForResult < len; subjectCursorForResult++){ - replacementString = 
replacementString.concat(IRascalValueFactory.getInstance().string(subjectString.charAt(subjectCursorForResult))); + replacementString.append(subjectString.charAt(subjectCursorForResult)); } - return replacementString; + return eval.getValueFactory().string(replacementString.toString()); } } diff --git a/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc b/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc index 652b897c330..c50e0bd14b1 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc @@ -265,21 +265,21 @@ test bool tstSqueezeCase4() = squeeze("aabbcc", "a-c") == "abc"; test bool tstSqueezeCase5() = squeeze("aaabc", "a-c") == "abc"; // second squeeze -test bool tstSqueeze21(str S) = // !:= squeeze(S, #[a-zA-Z]); -test bool tstSqueeze22(str S) = squeeze(S, #[]) == S; -test bool tstSqueeze23(str S) { +test bool tstSqueeze1(str S) = // !:= squeeze(S, #[a-zA-Z]); +test bool tstSqueeze2(str S) = squeeze(S, #[]) == S; +test bool tstSqueeze3(str S) { if (// := S) { return // := squeeze(S, #[0-9]); } return true; } -test bool tstSqueeze2Unicode() = squeeze("Hi ๐Ÿ๐ŸWorld", #[๐Ÿ]) == "Hi ๐ŸWorld"; -test bool tstSqueeze2Case1() = squeeze("abc", #[a-c]) == "abc"; -test bool tstSqueeze2Case2() = squeeze("aabc", #[a-c]) == "abc"; -test bool tstSqueeze2Case3() = squeeze("aabcc", #[a-c]) == "abc"; -test bool tstSqueeze2Case4() = squeeze("aabbcc", #[a-c]) == "abc"; -test bool tstSqueeze2Case5() = squeeze("aaabc", #[a-c]) == "abc"; +test bool tstSqueezeUnicode() = squeeze("Hi ๐Ÿ๐ŸWorld", #[๐Ÿ]) == "Hi ๐ŸWorld"; +test bool tstSqueezeCase1() = squeeze("abc", #[a-c]) == "abc"; +test bool tstSqueezeCase2() = squeeze("aabc", #[a-c]) == "abc"; +test bool tstSqueezeCase3() = squeeze("aabcc", #[a-c]) == "abc"; +test bool tstSqueezeCase4() = squeeze("aabbcc", #[a-c]) == "abc"; +test bool tstSqueezeCase5() = squeeze("aaabc", #[a-c]) == "abc"; test bool 
tstStartsWith(str S1, str S2) = startsWith(S1+S2, S1); From 05e695022b9d92d9d361ee1783421af11a5baff9 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Mon, 18 Nov 2024 20:23:02 +0100 Subject: [PATCH 06/14] added more documentation to vis::Graphs --- src/org/rascalmpl/library/vis/Graphs.rsc | 25 ++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/org/rascalmpl/library/vis/Graphs.rsc b/src/org/rascalmpl/library/vis/Graphs.rsc index b24a4d3ce8b..40d27f19345 100644 --- a/src/org/rascalmpl/library/vis/Graphs.rsc +++ b/src/org/rascalmpl/library/vis/Graphs.rsc @@ -90,7 +90,7 @@ default str defaultNodeLabeler(&T v) = ""; alias EdgeLabeler[&T]= str (&T _source, &T _target); str defaultEdgeLabeler(&T _source, &T _target) = ""; - +@synopsis{Produces an overall cytoscape.js wrapper which is sent as JSON to the client side.} Cytoscape cytoscape(list[CytoData] \data, \CytoLayout \layout=\defaultCoseLayout(), CytoStyle nodeStyle=defaultNodeStyle(), CytoStyle edgeStyle=defaultEdgeStyle()) = cytoscape( elements=\data, @@ -101,6 +101,7 @@ Cytoscape cytoscape(list[CytoData] \data, \CytoLayout \layout=\defaultCoseLayout \layout=\layout ); +@synopsis{Translates different types of Rascal data conveniently to the Cytoscape.js data format.} list[CytoData] graphData(rel[loc x, loc y] v, NodeLinker[loc] nodeLinker=defaultNodeLinker, NodeLabeler[loc] nodeLabeler=defaultNodeLabeler, EdgeLabeler[loc] edgeLabeler=defaultEdgeLabeler) = [cytodata(\node("", label=nodeLabeler(e), editor="")) | e <- {*v, *v}] + [cytodata(\edge("", "", label=edgeLabeler(from, to))) | <- v] @@ -169,6 +170,7 @@ data CytoNodeShape | \polygon() ; +@synopsis{Overall cytoscape.js object for sending to the client side.} data Cytoscape = cytoscape( list[CytoData] elements = [], @@ -242,6 +244,11 @@ data CytoStyleOf CytoStyleOf cytoNodeStyleOf(CytoStyle style) = cytoNodeStyleOf(selector=\node(), style=style); CytoStyleOf cytoEdgeStyleOf(CytoStyle style) = 
cytoEdgeStyleOf(selector=\edge(), style=style); +@synopsis{Instantiates a default node style} +@description{ +Because the JSON writer can not instantiate default values for keyword fields, +we have to do it manually here. +} CytoStyle defaultNodeStyle() = cytoNodeStyle( width = "label", @@ -256,6 +263,11 @@ CytoStyle defaultNodeStyle() \text-valign = CytoVerticalAlign::\center() ); +@synopsis{Instantiates a default edge style} +@description{ +Because the JSON writer can not instantiate default values for keyword fields +we have to do it manually here. +} CytoStyle defaultEdgeStyle() = cytoEdgeStyle( width = 3, @@ -433,7 +445,16 @@ Response (Request) graphServer(Cytoscape ch) { return reply; } -@synopsis{default HTML wrapper for a chart} +@synopsis{default HTML wrapper for a cytoscape.js graph} +@description{ +This client features: +* cytoscape.js loading with cytoscape-dagre and dagre present. +* fetching of graph data via `http://localhost/cytoscape` URL +* clickable links in every node that has an 'editor' data field that holds a `loc`, via the `http://localhost/editor?src=loc` URL +* full screen graph view + +This client mirrors the server defined by ((graphServer)). +} private HTMLElement plotHTML() = html([ head([ From 9b6d182a375e3849f43d5cfeb60edb1c8d251193 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Wed, 20 Nov 2024 17:24:51 +0100 Subject: [PATCH 07/14] finalized ancient TODO in char2int --- src/org/rascalmpl/library/String.rsc | 2 +- .../values/parsetrees/SymbolFactory.java | 48 +++++++++++-------- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc index 5a61188792e..a7a570fcaef 100644 --- a/src/org/rascalmpl/library/String.rsc +++ b/src/org/rascalmpl/library/String.rsc @@ -523,7 +523,7 @@ squeeze("hello", "el"); ``` } @javaClass{org.rascalmpl.library.Prelude} -@deprecated{Use the other squeence function that accepts Rascal character classes.} +@deprecated{Use the other squeeze function that accepts Rascal character class syntax.} public java str squeeze(str src, str charSet); @synopsis{Squeeze repeated occurrences of characters.} diff --git a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java index 22e96b5b231..666064ea692 100644 --- a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java +++ b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java @@ -15,6 +15,7 @@ import java.util.List; +import org.apache.commons.lang3.ArrayUtils; import org.rascalmpl.ast.CaseInsensitiveStringConstant; import org.rascalmpl.ast.Char; import org.rascalmpl.ast.Class; @@ -338,30 +339,35 @@ else if (range.isFromTo()) { private static IValue char2int(Char character) { String s = ((Char.Lexical) character).getString(); if (s.startsWith("\\")) { - if (s.length() > 1 && java.lang.Character.isDigit(s.charAt(1))) { // octal escape - // TODO - throw new NotYetImplemented("octal escape sequence in character class types"); + if (ArrayUtils.contains(new int[] { 'a', 'u', 'U'}, s.charAt(1))) { + // lexical UnicodeEscape + // = utf16: "\\" [u] [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f] + // | utf32: "\\" [U] (("0" [0-9 A-F a-f]) | "10") [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f] // 24 bits + // | 
ascii: "\\" [a] [0-7] [0-9A-Fa-f] + // ; + return factory.integer(Integer.parseUnsignedInt(s.substring(2), 16)); } - if (s.length() > 1 && s.charAt(1) == 'u') { // octal escape - // TODO - throw new NotYetImplemented("unicode escape sequence in character class types"); - } - char cha = s.charAt(1); - switch (cha) { - case 't': return factory.integer('\t'); - case 'n': return factory.integer('\n'); - case 'r': return factory.integer('\r'); - case '\"' : return factory.integer('\"'); - case '\'' : return factory.integer('\''); - case '-' : return factory.integer('-'); - case '<' : return factory.integer('<'); - case '>' : return factory.integer('>'); - case '\\' : return factory.integer('\\'); + else { + char cha = s.charAt(1); + switch (cha) { + case 't': return factory.integer('\t'); + case 'n': return factory.integer('\n'); + case 'r': return factory.integer('\r'); + case '\"' : return factory.integer('\"'); + case '\'' : return factory.integer('\''); + case '-' : return factory.integer('-'); + case '<' : return factory.integer('<'); + case '>' : return factory.integer('>'); + case '\\' : return factory.integer('\\'); + default: + return factory.integer(cha); + } } - s = s.substring(1); } - char cha = s.charAt(0); - return factory.integer(cha); + else { + char cha = s.charAt(0); + return factory.integer(cha); + } } public static IConstructor charClass(int ch) { From a2b3dee7553cf27eef1fd04b8d634c46091f60d6 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Wed, 20 Nov 2024 17:39:49 +0100 Subject: [PATCH 08/14] fixed comments --- src/org/rascalmpl/library/String.rsc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc index a7a570fcaef..96c0ab4c522 100644 --- a/src/org/rascalmpl/library/String.rsc +++ b/src/org/rascalmpl/library/String.rsc @@ -531,10 +531,12 @@ public java str squeeze(str src, str charSet); Squeeze repeated occurrences in `src` of characters, if they are a member of `&CharClass`, removed. * `src` is any string -* `&CharClass` is a character class type such as `[a-z]` (a type that is a subtype of the class of all characters `![]`) +* `&CharClass` is a reified character class type such as `#[a-z]` (a type that is a subtype of the class of all characters `#![]`) } @pitfalls{ -* `![]` excludes the `0` character, so we can never squeeze the unicode codepoint `0`. We _can_ squeeze the number `0` of course, using `#[0-9]` for example. +* `![]` excludes the `0` character, so we can not squeeze the unicode codepoint `0` using `![]`. +We use `#[\U000000-\U10FFFF]` to include the `0` character. +* Do not confuse the character `0` (codepoint 48) with the zero codepoint: `#[0] != #[\a00]` } @examples{ ```rascal-shell From 228a6f40643a1ed03110a07ffccb01058d78e0cf Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Wed, 20 Nov 2024 17:44:06 +0100 Subject: [PATCH 09/14] better docs --- src/org/rascalmpl/library/String.rsc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc index 96c0ab4c522..1665f17c119 100644 --- a/src/org/rascalmpl/library/String.rsc +++ b/src/org/rascalmpl/library/String.rsc @@ -515,11 +515,16 @@ public java str trim(str s); Squeeze repeated occurrences in `src` of characters in `charSet` removed. 
See http://commons.apache.org/lang/api-2.6/index.html?org/apache/commons/lang/text/package-summary.html[Apache] for the allowed syntax in `charSet`. + +This function was deprecated because it introduces yet another syntax for character classes, with +specific escapes next to already existing notation in syntax symbols and regular expressions. +Better use the other ((squeeze)) function. } @examples{ ```rascal-shell import String; squeeze("hello", "el"); +squeeze("hello", "el") == squeeze("hello", #[el]); ``` } @javaClass{org.rascalmpl.library.Prelude} @@ -531,7 +536,11 @@ public java str squeeze(str src, str charSet); Squeeze repeated occurrences in `src` of characters, if they are a member of `&CharClass`, removed. * `src` is any string -* `&CharClass` is a reified character class type such as `#[a-z]` (a type that is a subtype of the class of all characters `#![]`) +* `&CharClass` is a reified character class type such as `[a-z]` (a type that is a subtype of the class of all characters `![]`) +* To pass in a char-class type use the type reifier operator: `#[a-z]` or `#![]` +} +@benefits{ +* to squeeze all characters use the universal character class: `#![]` (the negation of the empty class). } @pitfalls{ * `![]` excludes the `0` character, so we can not squeeze the unicode codepoint `0` using `![]`. From e56bf912f93e4f664919fe96888c703f7d9421ef Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Wed, 20 Nov 2024 19:01:33 +0100 Subject: [PATCH 10/14] fixed unicode issues in character class reification code --- .../lang/rascal/tests/basic/Strings1.rsc | 18 +++++++++--------- .../values/parsetrees/SymbolFactory.java | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc b/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc index c50e0bd14b1..96485906099 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc @@ -265,21 +265,21 @@ test bool tstSqueezeCase4() = squeeze("aabbcc", "a-c") == "abc"; test bool tstSqueezeCase5() = squeeze("aaabc", "a-c") == "abc"; // second squeeze -test bool tstSqueeze1(str S) = // !:= squeeze(S, #[a-zA-Z]); -test bool tstSqueeze2(str S) = squeeze(S, #[]) == S; -test bool tstSqueeze3(str S) { +test bool tstSqueeze1CC(str S) = // !:= squeeze(S, #[a-zA-Z]); +test bool tstSqueeze2CC(str S) = squeeze(S, #[]) == S; +test bool tstSqueeze3CC(str S) { if (// := S) { return // := squeeze(S, #[0-9]); } return true; } -test bool tstSqueezeUnicode() = squeeze("Hi ๐Ÿ๐ŸWorld", #[๐Ÿ]) == "Hi ๐ŸWorld"; -test bool tstSqueezeCase1() = squeeze("abc", #[a-c]) == "abc"; -test bool tstSqueezeCase2() = squeeze("aabc", #[a-c]) == "abc"; -test bool tstSqueezeCase3() = squeeze("aabcc", #[a-c]) == "abc"; -test bool tstSqueezeCase4() = squeeze("aabbcc", #[a-c]) == "abc"; -test bool tstSqueezeCase5() = squeeze("aaabc", #[a-c]) == "abc"; +test bool tstSqueezeUnicodeCC() = squeeze("Hi ๐Ÿ๐ŸWorld", #[๐Ÿ]) == "Hi ๐ŸWorld"; +test bool tstSqueezeCase1CC() = squeeze("abc", #[a-c]) == "abc"; +test bool tstSqueezeCase2CC() = squeeze("aabc", #[a-c]) == "abc"; +test bool tstSqueezeCase3CC() = squeeze("aabcc", #[a-c]) == "abc"; +test bool tstSqueezeCase4CC() = squeeze("aabbcc", #[a-c]) == "abc"; +test bool tstSqueezeCase5CC() = squeeze("aaabc", #[a-c]) == "abc"; test 
bool tstStartsWith(str S1, str S2) = startsWith(S1+S2, S1); diff --git a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java index 666064ea692..ee6faf40483 100644 --- a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java +++ b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java @@ -348,7 +348,7 @@ private static IValue char2int(Char character) { return factory.integer(Integer.parseUnsignedInt(s.substring(2), 16)); } else { - char cha = s.charAt(1); + int cha = s.codePointAt(1); switch (cha) { case 't': return factory.integer('\t'); case 'n': return factory.integer('\n'); @@ -365,7 +365,7 @@ private static IValue char2int(Char character) { } } else { - char cha = s.charAt(0); + int cha = s.codePointAt(0); return factory.integer(cha); } } From f6969e9b0d6faabf6b2ea16eb15776fef451601b Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Thu, 21 Nov 2024 08:43:36 +0100 Subject: [PATCH 11/14] improved docs --- src/org/rascalmpl/library/String.rsc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc index 1665f17c119..06621d38738 100644 --- a/src/org/rascalmpl/library/String.rsc +++ b/src/org/rascalmpl/library/String.rsc @@ -541,10 +541,16 @@ Squeeze repeated occurrences in `src` of characters, if they are a member of `&C } @benefits{ * to squeeze all characters use the universal character class: `#![]` (the negation of the empty class). +* this function is type-safe; you can only pass in correct reified character classes like `#[A-Za-z]`. } @pitfalls{ -* `![]` excludes the `0` character, so we can not squeeze the unicode codepoint `0` using `![]`. -We use `#[\U000000-\U10FFFF]` to include the `0` character. +* `![]` excludes the 0'th unicode character, so we can not squeeze the unicode codepoint `0` using this function. 
+If you really need to squeeze 0 then it's best to write your own: +```rascal +visit (x) { + case /<dot:.>+/ => "\a00" when dot == "\a00" +} +```` * Do not confuse the character `0` (codepoint 48) with the zero codepoint: `#[0] != #[\a00]` } @examples{ From 11f6a426f989f99f1759f443702ca31777c04e40 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Thu, 21 Nov 2024 08:51:16 +0100 Subject: [PATCH 12/14] removed deprecation explanation which is unnecessary --- src/org/rascalmpl/library/String.rsc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc index 06621d38738..132c288a987 100644 --- a/src/org/rascalmpl/library/String.rsc +++ b/src/org/rascalmpl/library/String.rsc @@ -515,15 +515,12 @@ public java str trim(str s); Squeeze repeated occurrences in `src` of characters in `charSet` removed. See http://commons.apache.org/lang/api-2.6/index.html?org/apache/commons/lang/text/package-summary.html[Apache] for the allowed syntax in `charSet`. - -This function was deprecated because it introduces yet another syntax for character classes, with -specific escapes next to already existing notation in syntax symbols and regular expressions. -Better use the other ((squeeze)) function. } @examples{ ```rascal-shell import String; squeeze("hello", "el"); +// the other squeeze function uses character class types instead: squeeze("hello", "el") == squeeze("hello", #[el]); ``` } From 252a9306d308d1a045941a0e0fcd32cb2bcd3b59 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Thu, 21 Nov 2024 11:16:47 +0100 Subject: [PATCH 13/14] fixes #2009 --- .../rascal/grammar/tests/CharactersTests.rsc | 46 +++++++ .../values/parsetrees/SymbolFactory.java | 117 ++++-------------- 2 files changed, 73 insertions(+), 90 deletions(-) diff --git a/src/org/rascalmpl/library/lang/rascal/grammar/tests/CharactersTests.rsc b/src/org/rascalmpl/library/lang/rascal/grammar/tests/CharactersTests.rsc index 7b9fc42153f..0ce18b21159 100644 --- a/src/org/rascalmpl/library/lang/rascal/grammar/tests/CharactersTests.rsc +++ b/src/org/rascalmpl/library/lang/rascal/grammar/tests/CharactersTests.rsc @@ -2,6 +2,7 @@ module lang::rascal::grammar::tests::CharactersTests import lang::rascal::grammar::definition::Characters; import ParseTree; +import String; test bool testFlip() = \new-char-class([range(2,2), range(1,1)]) == \char-class([range(1,2)]); test bool testMerge() = \new-char-class([range(3,4), range(2,2), range(1,1)]) == \char-class([range(1,4)]); @@ -24,3 +25,48 @@ test bool testDiff1() = difference(\char-class([range(10,30)]), \char-class([ran test bool testDiff2() = difference(\char-class([range(10,30), range(40,50)]), \char-class([range(25,45)])) ==\char-class( [range(10,24), range(46,50)]); +test bool asciiEscape() = \char-class([range(0,127)]) == #[\a00-\a7F].symbol; +test bool utf16Escape() = \char-class([range(0,65535)]) == #[\u0000-\uFFFF].symbol; +test bool utf32Escape() = \char-class([range(0,1114111)]) == #[\U000000-\U10FFFF].symbol; +test bool highLowSurrogateRange1() = \char-class([range(9312,12991)]) == #[①-㊿].symbol; +test bool highLowSurrogateRange2() = \char-class([range(127829,127829)]) == #[🍕].symbol; +test bool differentEscapesSameResult1() = #[\a00-\a7F] == #[\u0000-\u007F]; +test bool differentEscapesSameResult2() = #[\a00-\a7F] == #[\U000000-\U00007F]; + +/* to avoid a known ambiguity */ +alias NotAZ = ![A-Z]; + +test bool unicodeCharacterClassSubtype1() { + Tree t = char(charAt("⑭", 0)); + + if ([①-㊿] circled := t) { 
+ assert [⑭] _ := circled; + assert NotAZ _ := circled; + return true; + } + + return false; +} + +test bool unicodeCharacterClassSubtype2() { + Tree t = char(charAt("🍕", 0)); + + if ([🍕] pizza := t) { + assert [\a00-🍕] _ := pizza; + assert NotAZ _ := pizza; + return true; + } + + return false; +} + +test bool literalAsciiEscape1() = lit("\n") == #"\a0A".symbol; +test bool literalAsciiEscape2() = lit("w") == #"\a77".symbol; +test bool literalAsciiEscape3() = lit("\f") == #"\a0C".symbol; +test bool literalAsciiEscape4() = lit("\n") == #"\n".symbol; +@ignore{vallang must re-introduce the \f notation} +test bool literalAsciiEscape5() = lit("\f") == #"\f".symbol; +test bool literalUtf16Escape() = lit("\n") == #"\u000A".symbol; +test bool literalUtf32Escape1() = lit("\n") == #"\U00000A".symbol; +test bool literalUtf32Escape2() = lit("🍕") == #"\U01F355".symbol; + diff --git a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java index ee6faf40483..cec5444b924 100644 --- a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java +++ b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java @@ -13,6 +13,8 @@ *******************************************************************************/ package org.rascalmpl.values.parsetrees; +import java.io.IOException; +import java.io.StringReader; import java.util.List; import org.apache.commons.lang3.ArrayUtils; @@ -35,6 +37,8 @@ import io.usethesource.vallang.IString; import io.usethesource.vallang.IValue; import io.usethesource.vallang.IValueFactory; +import io.usethesource.vallang.exceptions.FactTypeUseException; +import io.usethesource.vallang.io.StandardTextReader; import org.rascalmpl.values.RascalValueFactory; import org.rascalmpl.values.ValueFactoryFactory; @@ -198,101 +202,34 @@ private static IValue symbolAST2SymbolConstructor(Sym symbol, boolean lex, Strin } private static IValue literal2Symbol(StringConstant sep) { - String lit = 
((StringConstant.Lexical) sep).getString(); - StringBuilder builder = new StringBuilder(lit.length()); - - // TODO: did we deal with all escapes here? probably not! - for (int i = 1; i < lit.length() - 1; i++) { - if (lit.charAt(i) == '\\') { - i++; - switch (lit.charAt(i)) { - case 'b': - builder.append('\b'); - break; - case 'f': - builder.append('\f'); - break; - case 'n': - builder.append('\n'); - break; - case 't': - builder.append('\t'); - break; - case 'r': - builder.append('\r'); - break; - case '\\': - builder.append('\\'); - break; - case '\"': - builder.append('\"'); - break; - case '>': - builder.append('>'); - break; - case '<': - builder.append('<'); - break; - case '\'': - builder.append('\''); - break; - case 'u': - while (lit.charAt(i++) == 'u'); - builder.append((char) Integer.decode("0x" + lit.substring(i, i+4)).intValue()); - i+=4; - break; - default: - // octal escape - int a = lit.charAt(i++); - int b = lit.charAt(i++); - int c = lit.charAt(i); - builder.append( (char) (100 * a + 10 * b + c)); - } - } - else { - builder.append(lit.charAt(i)); - } + try { + String lit = ((StringConstant.Lexical) sep).getString(); + // this should be the exact notation for string literals in vallang + IValue string = new StandardTextReader().read(factory, new StringReader(lit)); + + return factory.constructor(RascalValueFactory.Symbol_Lit, string); + } + catch (FactTypeUseException | IOException e) { + // this would mean Rascal's syntax definition for string constants is not aligned with vallang's string notation + throw new RuntimeException("Internal error: parsed stringconstant notation does not coincide with vallang stringconstant notation"); } - - return factory.constructor(RascalValueFactory.Symbol_Lit, factory.string(builder.toString())); } private static IValue ciliteral2Symbol(CaseInsensitiveStringConstant constant) { - String lit = ((CaseInsensitiveStringConstant.Lexical) constant).getString(); - StringBuilder builder = new StringBuilder(lit.length()); 
- - for (int i = 1; i < lit.length() - 1; i++) { - if (lit.charAt(i) == '\\') { - i++; - switch (lit.charAt(i)) { - case 'n': - builder.append('\n'); - break; - case 't': - builder.append('\t'); - break; - case 'r': - builder.append('\r'); - break; - case '\\': - builder.append('\\'); - break; - case '\"': - builder.append('\''); - break; - default: - int a = lit.charAt(i++); - int b = lit.charAt(i++); - int c = lit.charAt(i); - builder.append( (char) (100 * a + 10 * b + c)); - } - } - else { - builder.append(lit.charAt(i)); - } + try { + String lit = ((CaseInsensitiveStringConstant.Lexical) constant).getString(); + // replace single quotes by double quotes first + lit = "\"" + lit.substring(1, lit.length() - 1) + "\""; + + // this should be the exact notation for string literals in vallang + IValue string = new StandardTextReader().read(factory, new StringReader(lit)); + + return factory.constructor(RascalValueFactory.Symbol_Cilit, string); + } + catch (FactTypeUseException | IOException e) { + // this would mean Rascal's syntax definition for string constants is not aligned with vallang's string notation + throw new RuntimeException("Internal error: parsed stringconstant notation does not coincide with vallang stringconstant notation"); } - - return factory.constructor(RascalValueFactory.Symbol_Lit, factory.string(builder.toString())); } private static IConstructor charclass2Symbol(Class cc) { From 2b24936cd629fcad102ace6c0d35a6cb074570e7 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Thu, 21 Nov 2024 11:23:23 +0100 Subject: [PATCH 14/14] added tests for escapes in ci literals and fixed bug that was triggered by those --- .../lang/rascal/grammar/tests/CharactersTests.rsc | 9 +++++++++ src/org/rascalmpl/values/parsetrees/SymbolFactory.java | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/org/rascalmpl/library/lang/rascal/grammar/tests/CharactersTests.rsc b/src/org/rascalmpl/library/lang/rascal/grammar/tests/CharactersTests.rsc index 0ce18b21159..e55a5db50aa 100644 --- a/src/org/rascalmpl/library/lang/rascal/grammar/tests/CharactersTests.rsc +++ b/src/org/rascalmpl/library/lang/rascal/grammar/tests/CharactersTests.rsc @@ -70,3 +70,12 @@ test bool literalUtf16Escape() = lit("\n") == #"\u000A".symbol; test bool literalUtf32Escape1() = lit("\n") == #"\U00000A".symbol; test bool literalUtf32Escape2() = lit("🍕") == #"\U01F355".symbol; +test bool ciliteralAsciiEscape1() = cilit("\n") == #'\a0A'.symbol; +test bool ciliteralAsciiEscape2() = cilit("w") == #'\a77'.symbol; +test bool ciliteralAsciiEscape3() = cilit("\f") == #'\a0C'.symbol; +test bool ciliteralAsciiEscape4() = cilit("\n") == #'\n'.symbol; +@ignore{vallang must re-introduce the \f notation} +test bool ciliteralAsciiEscape5() = cilit("\f") == #'\f'.symbol; +test bool ciliteralUtf16Escape() = cilit("\n") == #'\u000A'.symbol; +test bool ciliteralUtf32Escape1() = cilit("\n") == #'\U00000A'.symbol; +test bool ciliteralUtf32Escape2() = cilit("🍕") == #'\U01F355'.symbol; diff --git a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java index cec5444b924..d3dac7eb4b1 100644 --- a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java +++ b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java @@ -77,7 +77,7 @@ private static IValue symbolAST2SymbolConstructor(Sym symbol, boolean lex, Strin boolean noExpand = lex || layout == null; if (symbol.isCaseInsensitiveLiteral()) { - return 
factory.constructor(RascalValueFactory.Symbol_Cilit, ciliteral2Symbol(symbol.getCistring())); + return ciliteral2Symbol(symbol.getCistring()); } if (symbol.isCharacterClass()) { Class cc = symbol.getCharClass(); @@ -220,7 +220,7 @@ private static IValue ciliteral2Symbol(CaseInsensitiveStringConstant constant) { String lit = ((CaseInsensitiveStringConstant.Lexical) constant).getString(); // replace single quotes by double quotes first lit = "\"" + lit.substring(1, lit.length() - 1) + "\""; - + // this should be the exact notation for string literals in vallang IValue string = new StandardTextReader().read(factory, new StringReader(lit));