diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc index 0312d201946..132c288a987 100644 --- a/src/org/rascalmpl/library/String.rsc +++ b/src/org/rascalmpl/library/String.rsc @@ -20,6 +20,7 @@ module String extend Exception; import List; +import ParseTree; @synopsis{All functions in this module that have a charset parameter use this as default.} private str DEFAULT_CHARSET = "UTF-8"; @@ -519,11 +520,46 @@ for the allowed syntax in `charSet`. ```rascal-shell import String; squeeze("hello", "el"); +// the other squeeze function uses character class types instead: +squeeze("hello", "el") == squeeze("hello", #[el]); ``` } @javaClass{org.rascalmpl.library.Prelude} +@deprecated{Use the other squeeze function that accepts Rascal character class syntax.} public java str squeeze(str src, str charSet); +@synopsis{Squeeze repeated occurrences of characters.} +@description{ +Repeated occurrences in `src` of characters that are a member of `&CharClass` are squeezed into a single occurrence. + +* `src` is any string +* `&CharClass` is a reified character class type such as `[a-z]` (a type that is a subtype of the class of all characters `![]`) +* To pass in a char-class type use the type reifier operator: `#[a-z]` or `#![]` +} +@benefits{ +* to squeeze all characters use the universal character class: `#![]` (the negation of the empty class). +* this function is type-safe; you can only pass in correct reified character classes like `#[A-Za-z]`. +} +@pitfalls{ +* `![]` excludes the 0'th unicode character, so we can not squeeze the unicode codepoint `0` using this function. 
+If you really need to squeeze 0 then it's best to write your own: +```rascal +visit (x) { + case /+/ => "\a00" when dot == "\a00" +} +``` +* Do not confuse the character `0` (codepoint 48) with the zero codepoint: `#[0] != #[\a00]` +} +@examples{ +```rascal-shell +import String; +squeeze("hello", #[el]); +``` +} +public str squeeze(str src, type[&CharClass] _:type[![]] _) = visit(src) { + case /+/ => c + when &CharClass _ := Tree::char(charAt(c, 0)) +}; @synopsis{Split a string into a list of strings based on a literal separator.} diff --git a/src/org/rascalmpl/library/lang/rascal/grammar/tests/CharactersTests.rsc b/src/org/rascalmpl/library/lang/rascal/grammar/tests/CharactersTests.rsc index 7b9fc42153f..e55a5db50aa 100644 --- a/src/org/rascalmpl/library/lang/rascal/grammar/tests/CharactersTests.rsc +++ b/src/org/rascalmpl/library/lang/rascal/grammar/tests/CharactersTests.rsc @@ -2,6 +2,7 @@ module lang::rascal::grammar::tests::CharactersTests import lang::rascal::grammar::definition::Characters; import ParseTree; +import String; test bool testFlip() = \new-char-class([range(2,2), range(1,1)]) == \char-class([range(1,2)]); test bool testMerge() = \new-char-class([range(3,4), range(2,2), range(1,1)]) == \char-class([range(1,4)]); @@ -24,3 +25,57 @@ test bool testDiff1() = difference(\char-class([range(10,30)]), \char-class([ran test bool testDiff2() = difference(\char-class([range(10,30), range(40,50)]), \char-class([range(25,45)])) ==\char-class( [range(10,24), range(46,50)]); +test bool asciiEscape() = \char-class([range(0,127)]) == #[\a00-\a7F].symbol; +test bool utf16Escape() = \char-class([range(0,65535)]) == #[\u0000-\uFFFF].symbol; +test bool utf32Escape() = \char-class([range(0,1114111)]) == #[\U000000-\U10FFFF].symbol; +test bool highLowSurrogateRange1() = \char-class([range(9312,12991)]) == #[①-㊿].symbol; +test bool highLowSurrogateRange2() = \char-class([range(127829,127829)]) == #[🍕].symbol; +test bool differentEscapesSameResult1() = #[\a00-\a7F] 
== #[\u0000-\u007F]; +test bool differentEscapesSameResult2() = #[\a00-\a7F] == #[\U000000-\U00007F]; + +/* to avoid a known ambiguity */ +alias NotAZ = ![A-Z]; + +test bool unicodeCharacterClassSubtype1() { + Tree t = char(charAt("⑭", 0)); + + if ([①-㊿] circled := t) { + assert [⑭] _ := circled; + assert NotAZ _ := circled; + return true; + } + + return false; +} + +test bool unicodeCharacterClassSubtype2() { + Tree t = char(charAt("🍕", 0)); + + if ([🍕] pizza := t) { + assert [\a00-🍕] _ := pizza; + assert NotAZ _ := pizza; + return true; + } + + return false; +} + +test bool literalAsciiEscape1() = lit("\n") == #"\a0A".symbol; +test bool literalAsciiEscape2() = lit("w") == #"\a77".symbol; +test bool literalAsciiEscape3() = lit("\f") == #"\a0C".symbol; +test bool literalAsciiEscape4() = lit("\n") == #"\n".symbol; +@ignore{vallang must re-introduce the \f notation} +test bool literalAsciiEscape5() = lit("\f") == #"\f".symbol; +test bool literalUtf16Escape() = lit("\n") == #"\u000A".symbol; +test bool literalUtf32Escape1() = lit("\n") == #"\U00000A".symbol; +test bool literalUtf32Escape2() = lit("🍕") == #"\U01F355".symbol; + +test bool ciliteralAsciiEscape1() = cilit("\n") == #'\a0A'.symbol; +test bool ciliteralAsciiEscape2() = cilit("w") == #'\a77'.symbol; +test bool ciliteralAsciiEscape3() = cilit("\f") == #'\a0C'.symbol; +test bool ciliteralAsciiEscape4() = cilit("\n") == #'\n'.symbol; +@ignore{vallang must re-introduce the \f notation} +test bool ciliteralAsciiEscape5() = cilit("\f") == #'\f'.symbol; +test bool ciliteralUtf16Escape() = cilit("\n") == #'\u000A'.symbol; +test bool ciliteralUtf32Escape1() = cilit("\n") == #'\U00000A'.symbol; +test bool ciliteralUtf32Escape2() = cilit("🍕") == #'\U01F355'.symbol; diff --git a/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc b/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc index 0a679380687..96485906099 100644 --- 
a/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc @@ -264,6 +264,22 @@ test bool tstSqueezeCase3() = squeeze("aabcc", "a-c") == "abc"; test bool tstSqueezeCase4() = squeeze("aabbcc", "a-c") == "abc"; test bool tstSqueezeCase5() = squeeze("aaabc", "a-c") == "abc"; +// second squeeze +test bool tstSqueeze1CC(str S) = // !:= squeeze(S, #[a-zA-Z]); +test bool tstSqueeze2CC(str S) = squeeze(S, #[]) == S; +test bool tstSqueeze3CC(str S) { + if (// := S) { + return // := squeeze(S, #[0-9]); + } + return true; +} + +test bool tstSqueezeUnicodeCC() = squeeze("Hi 🍝🍝World", #[🍝]) == "Hi 🍝World"; +test bool tstSqueezeCase1CC() = squeeze("abc", #[a-c]) == "abc"; +test bool tstSqueezeCase2CC() = squeeze("aabc", #[a-c]) == "abc"; +test bool tstSqueezeCase3CC() = squeeze("aabcc", #[a-c]) == "abc"; +test bool tstSqueezeCase4CC() = squeeze("aabbcc", #[a-c]) == "abc"; +test bool tstSqueezeCase5CC() = squeeze("aaabc", #[a-c]) == "abc"; test bool tstStartsWith(str S1, str S2) = startsWith(S1+S2, S1); diff --git a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java index 22e96b5b231..d3dac7eb4b1 100644 --- a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java +++ b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java @@ -13,8 +13,11 @@ *******************************************************************************/ package org.rascalmpl.values.parsetrees; +import java.io.IOException; +import java.io.StringReader; import java.util.List; +import org.apache.commons.lang3.ArrayUtils; import org.rascalmpl.ast.CaseInsensitiveStringConstant; import org.rascalmpl.ast.Char; import org.rascalmpl.ast.Class; @@ -34,6 +37,8 @@ import io.usethesource.vallang.IString; import io.usethesource.vallang.IValue; import io.usethesource.vallang.IValueFactory; +import io.usethesource.vallang.exceptions.FactTypeUseException; +import 
io.usethesource.vallang.io.StandardTextReader; import org.rascalmpl.values.RascalValueFactory; import org.rascalmpl.values.ValueFactoryFactory; @@ -72,7 +77,7 @@ private static IValue symbolAST2SymbolConstructor(Sym symbol, boolean lex, Strin boolean noExpand = lex || layout == null; if (symbol.isCaseInsensitiveLiteral()) { - return factory.constructor(RascalValueFactory.Symbol_Cilit, ciliteral2Symbol(symbol.getCistring())); + return ciliteral2Symbol(symbol.getCistring()); } if (symbol.isCharacterClass()) { Class cc = symbol.getCharClass(); @@ -197,101 +202,34 @@ private static IValue symbolAST2SymbolConstructor(Sym symbol, boolean lex, Strin } private static IValue literal2Symbol(StringConstant sep) { - String lit = ((StringConstant.Lexical) sep).getString(); - StringBuilder builder = new StringBuilder(lit.length()); - - // TODO: did we deal with all escapes here? probably not! - for (int i = 1; i < lit.length() - 1; i++) { - if (lit.charAt(i) == '\\') { - i++; - switch (lit.charAt(i)) { - case 'b': - builder.append('\b'); - break; - case 'f': - builder.append('\f'); - break; - case 'n': - builder.append('\n'); - break; - case 't': - builder.append('\t'); - break; - case 'r': - builder.append('\r'); - break; - case '\\': - builder.append('\\'); - break; - case '\"': - builder.append('\"'); - break; - case '>': - builder.append('>'); - break; - case '<': - builder.append('<'); - break; - case '\'': - builder.append('\''); - break; - case 'u': - while (lit.charAt(i++) == 'u'); - builder.append((char) Integer.decode("0x" + lit.substring(i, i+4)).intValue()); - i+=4; - break; - default: - // octal escape - int a = lit.charAt(i++); - int b = lit.charAt(i++); - int c = lit.charAt(i); - builder.append( (char) (100 * a + 10 * b + c)); - } - } - else { - builder.append(lit.charAt(i)); - } + try { + String lit = ((StringConstant.Lexical) sep).getString(); + // this should be the exact notation for string literals in vallang + IValue string = new 
StandardTextReader().read(factory, new StringReader(lit)); + + return factory.constructor(RascalValueFactory.Symbol_Lit, string); + } + catch (FactTypeUseException | IOException e) { + // this would mean Rascal's syntax definition for string constants is not aligned with vallang's string notation + throw new RuntimeException("Internal error: parsed stringconstant notation does not coincide with vallang stringconstant notation"); } - - return factory.constructor(RascalValueFactory.Symbol_Lit, factory.string(builder.toString())); } private static IValue ciliteral2Symbol(CaseInsensitiveStringConstant constant) { - String lit = ((CaseInsensitiveStringConstant.Lexical) constant).getString(); - StringBuilder builder = new StringBuilder(lit.length()); - - for (int i = 1; i < lit.length() - 1; i++) { - if (lit.charAt(i) == '\\') { - i++; - switch (lit.charAt(i)) { - case 'n': - builder.append('\n'); - break; - case 't': - builder.append('\t'); - break; - case 'r': - builder.append('\r'); - break; - case '\\': - builder.append('\\'); - break; - case '\"': - builder.append('\''); - break; - default: - int a = lit.charAt(i++); - int b = lit.charAt(i++); - int c = lit.charAt(i); - builder.append( (char) (100 * a + 10 * b + c)); - } - } - else { - builder.append(lit.charAt(i)); - } + try { + String lit = ((CaseInsensitiveStringConstant.Lexical) constant).getString(); + // replace single quotes by double quotes first + lit = "\"" + lit.substring(1, lit.length() - 1) + "\""; + + // this should be the exact notation for string literals in vallang + IValue string = new StandardTextReader().read(factory, new StringReader(lit)); + + return factory.constructor(RascalValueFactory.Symbol_Cilit, string); + } + catch (FactTypeUseException | IOException e) { + // this would mean Rascal's syntax definition for string constants is not aligned with vallang's string notation + throw new RuntimeException("Internal error: parsed stringconstant notation does not coincide with vallang 
stringconstant notation"); } - - return factory.constructor(RascalValueFactory.Symbol_Lit, factory.string(builder.toString())); } private static IConstructor charclass2Symbol(Class cc) { @@ -338,30 +276,35 @@ else if (range.isFromTo()) { private static IValue char2int(Char character) { String s = ((Char.Lexical) character).getString(); if (s.startsWith("\\")) { - if (s.length() > 1 && java.lang.Character.isDigit(s.charAt(1))) { // octal escape - // TODO - throw new NotYetImplemented("octal escape sequence in character class types"); - } - if (s.length() > 1 && s.charAt(1) == 'u') { // octal escape - // TODO - throw new NotYetImplemented("unicode escape sequence in character class types"); + if (ArrayUtils.contains(new int[] { 'a', 'u', 'U'}, s.charAt(1))) { + // lexical UnicodeEscape + // = utf16: "\\" [u] [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f] + // | utf32: "\\" [U] (("0" [0-9 A-F a-f]) | "10") [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f] // 24 bits + // | ascii: "\\" [a] [0-7] [0-9A-Fa-f] + // ; + return factory.integer(Integer.parseUnsignedInt(s.substring(2), 16)); } - char cha = s.charAt(1); - switch (cha) { - case 't': return factory.integer('\t'); - case 'n': return factory.integer('\n'); - case 'r': return factory.integer('\r'); - case '\"' : return factory.integer('\"'); - case '\'' : return factory.integer('\''); - case '-' : return factory.integer('-'); - case '<' : return factory.integer('<'); - case '>' : return factory.integer('>'); - case '\\' : return factory.integer('\\'); + else { + int cha = s.codePointAt(1); + switch (cha) { + case 't': return factory.integer('\t'); + case 'n': return factory.integer('\n'); + case 'r': return factory.integer('\r'); + case '\"' : return factory.integer('\"'); + case '\'' : return factory.integer('\''); + case '-' : return factory.integer('-'); + case '<' : return factory.integer('<'); + case '>' : return factory.integer('>'); + case '\\' : return factory.integer('\\'); + default: + 
return factory.integer(cha); + } } - s = s.substring(1); } - char cha = s.charAt(0); - return factory.integer(cha); + else { + int cha = s.codePointAt(0); + return factory.integer(cha); + } } public static IConstructor charClass(int ch) {