Skip to content

Commit

Permalink
Merge pull request #2002 from usethesource/squeeze-charclass
Browse files Browse the repository at this point in the history
rewrote squeeze in Rascal with reified classes
  • Loading branch information
jurgenvinju authored Nov 21, 2024
2 parents 1690da7 + 2b24936 commit edfb516
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 112 deletions.
36 changes: 36 additions & 0 deletions src/org/rascalmpl/library/String.rsc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ module String

extend Exception;
import List;
import ParseTree;

@synopsis{All functions in this module that have a charset parameter use this as default.}
private str DEFAULT_CHARSET = "UTF-8";
Expand Down Expand Up @@ -519,11 +520,46 @@ for the allowed syntax in `charSet`.
```rascal-shell
import String;
squeeze("hello", "el");
// the other squeeze function uses character class types instead:
squeeze("hello", "el") == squeeze("hello", #[el]);
```
}
@javaClass{org.rascalmpl.library.Prelude}
@deprecated{Use the other squeeze function that accepts Rascal character class syntax.}
public java str squeeze(str src, str charSet);
@synopsis{Squeeze repeated occurrences of characters.}
@description{
Squeeze repeated occurrences of characters in `src`: a run of the same character is reduced to a single occurrence if that character is a member of `&CharClass`.
* `src` is any string
* `&CharClass` is a reified character class type such as `[a-z]` (a type that is a subtype of the class of all characters `![]`)
* To pass in a char-class type use the type reifier operator: `#[a-z]` or `#![]`
}
@benefits{
* to squeeze all characters use the universal character class: `#![]` (the negation of the empty class).
* this function is type-safe; you can only pass in correct reified character classes like `#[A-Za-z]`.
}
@pitfalls{
* `![]` excludes the 0'th Unicode character, so we cannot squeeze the Unicode codepoint `0` using this function.
If you really need to squeeze codepoint 0 then it's best to write your own:
```rascal
visit (x) {
case /<dot:.>+/ => "\a00" when dot == "\a00"
}
```
* Do not confuse the character `0` (codepoint 48) with the zero codepoint: `#[0] != #[\a00]`
}
@examples{
```rascal-shell
import String;
squeeze("hello", #[el]);
```
}
// Reduces every run of a repeated character in `src` to a single occurrence, but only for
// characters that match the character class bound to &CharClass.
// The second argument is matched with `_` on both sides, so its value is never read; it only
// serves to bind the type parameter &CharClass to the reified class the caller passes in.
public str squeeze(str src, type[&CharClass] _:type[![]] _) = visit(src) {
// <c:.> captures one character; <c>+ requires one or more further copies of that same text
case /<c:.><c>+/ => c
// guard: rewrite only if the captured character, wrapped as a `char` tree node, matches &CharClass
// (presumably charAt yields the codepoint that Tree::char expects -- confirm against ParseTree)
when &CharClass _ := Tree::char(charAt(c, 0))
};
@synopsis{Split a string into a list of strings based on a literal separator.}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ module lang::rascal::grammar::tests::CharactersTests

import lang::rascal::grammar::definition::Characters;
import ParseTree;
import String;

test bool testFlip() = \new-char-class([range(2,2), range(1,1)]) == \char-class([range(1,2)]);
test bool testMerge() = \new-char-class([range(3,4), range(2,2), range(1,1)]) == \char-class([range(1,4)]);
Expand All @@ -24,3 +25,57 @@ test bool testDiff1() = difference(\char-class([range(10,30)]), \char-class([ran
test bool testDiff2() = difference(\char-class([range(10,30), range(40,50)]), \char-class([range(25,45)])) ==\char-class( [range(10,24), range(46,50)]);


test bool asciiEscape() = \char-class([range(0,127)]) == #[\a00-\a7F].symbol;
test bool utf16Escape() = \char-class([range(0,65535)]) == #[\u0000-\uFFFF].symbol;
test bool utf32Escape() = \char-class([range(0,1114111)]) == #[\U000000-\U10FFFF].symbol;
test bool highLowSurrogateRange1() = \char-class([range(9312,12991)]) == #[①-㊿].symbol;
test bool highLowSurrogateRange2() = \char-class([range(127829,127829)]) == #[🍕].symbol;
test bool differentEscapesSameResult1() = #[\a00-\a7F] == #[\u0000-\u007F];
test bool differentEscapesSameResult2() = #[\a00-\a7F] == #[\U000000-\U00007F];

/* to avoid a known ambiguity */
alias NotAZ = ![A-Z];

// Builds a `char` tree for "⑭" and checks that it matches the range class [①-㊿]; the value
// bound by that match must then also match the single-character class [⑭] and the NotAZ alias.
test bool unicodeCharacterClassSubtype1() {
Tree t = char(charAt("⑭", 0));

// the test fails (returns false) if the character does not match [①-㊿] at all
if ([①-㊿] circled := t) {
assert [⑭] _ := circled;
assert NotAZ _ := circled;
return true;
}

return false;
}

// Same check as the circled-number test, but with "🍕" (codepoint 127829 per highLowSurrogateRange2),
// i.e. a character outside the 16-bit range: it must match [🍕], then [\a00-🍕] and NotAZ.
test bool unicodeCharacterClassSubtype2() {
Tree t = char(charAt("🍕", 0));

// the test fails (returns false) if the character does not match [🍕] at all
if ([🍕] pizza := t) {
assert [\a00-🍕] _ := pizza;
assert NotAZ _ := pizza;
return true;
}

return false;
}

test bool literalAsciiEscape1() = lit("\n") == #"\a0A".symbol;
test bool literalAsciiEscape2() = lit("w") == #"\a77".symbol;
test bool literalAsciiEscape3() = lit("\f") == #"\a0C".symbol;
test bool literalAsciiEscape4() = lit("\n") == #"\n".symbol;
@ignore{vallang must re-introduce the \f notation}
test bool literalAsciiEscape5() = lit("\f") == #"\f".symbol;
test bool literalUtf16Escape() = lit("\n") == #"\u000A".symbol;
test bool literalUtf32Escape1() = lit("\n") == #"\U00000A".symbol;
test bool literalUtf32Escape2() = lit("🍕") == #"\U01F355".symbol;

test bool ciliteralAsciiEscape1() = cilit("\n") == #'\a0A'.symbol;
test bool ciliteralAsciiEscape2() = cilit("w") == #'\a77'.symbol;
test bool ciliteralAsciiEscape3() = cilit("\f") == #'\a0C'.symbol;
test bool ciliteralAsciiEscape4() = cilit("\n") == #'\n'.symbol;
@ignore{vallang must re-introduce the \f notation}
test bool ciliteralAsciiEscape5() = cilit("\f") == #'\f'.symbol;
test bool ciliteralUtf16Escape() = cilit("\n") == #'\u000A'.symbol;
test bool ciliteralUtf32Escape1() = cilit("\n") == #'\U00000A'.symbol;
test bool ciliteralUtf32Escape2() = cilit("🍕") == #'\U01F355'.symbol;
16 changes: 16 additions & 0 deletions src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,22 @@ test bool tstSqueezeCase3() = squeeze("aabcc", "a-c") == "abc";
test bool tstSqueezeCase4() = squeeze("aabbcc", "a-c") == "abc";
test bool tstSqueezeCase5() = squeeze("aaabc", "a-c") == "abc";

// second squeeze
// After squeezing with #[a-zA-Z], no ASCII letter may be directly followed by a copy of itself.
test bool tstSqueeze1CC(str S) = /<c:[a-zA-Z]><c>/ !:= squeeze(S, #[a-zA-Z]);
// Squeezing with the empty character class must return the input unchanged.
test bool tstSqueeze2CC(str S) = squeeze(S, #[]) == S;
// If S contains a doubled ASCII letter, squeezing only digits (#[0-9]) must leave a doubled
// occurrence of that letter in the result.
test bool tstSqueeze3CC(str S) {
if (/<c:[a-zA-Z]><c>/ := S) {
// <c> presumably refers to the letter captured by the enclosing match -- confirm regex scoping rules
return /<c><c>/ := squeeze(S, #[0-9]);
}
// nothing to verify when S has no doubled letter
return true;
}

test bool tstSqueezeUnicodeCC() = squeeze("Hi 🍝🍝World", #[🍝]) == "Hi 🍝World";
test bool tstSqueezeCase1CC() = squeeze("abc", #[a-c]) == "abc";
test bool tstSqueezeCase2CC() = squeeze("aabc", #[a-c]) == "abc";
test bool tstSqueezeCase3CC() = squeeze("aabcc", #[a-c]) == "abc";
test bool tstSqueezeCase4CC() = squeeze("aabbcc", #[a-c]) == "abc";
test bool tstSqueezeCase5CC() = squeeze("aaabc", #[a-c]) == "abc";

test bool tstStartsWith(str S1, str S2) = startsWith(S1+S2, S1);

Expand Down
167 changes: 55 additions & 112 deletions src/org/rascalmpl/values/parsetrees/SymbolFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@
*******************************************************************************/
package org.rascalmpl.values.parsetrees;

import java.io.IOException;
import java.io.StringReader;
import java.util.List;

import org.apache.commons.lang3.ArrayUtils;
import org.rascalmpl.ast.CaseInsensitiveStringConstant;
import org.rascalmpl.ast.Char;
import org.rascalmpl.ast.Class;
Expand All @@ -34,6 +37,8 @@
import io.usethesource.vallang.IString;
import io.usethesource.vallang.IValue;
import io.usethesource.vallang.IValueFactory;
import io.usethesource.vallang.exceptions.FactTypeUseException;
import io.usethesource.vallang.io.StandardTextReader;

import org.rascalmpl.values.RascalValueFactory;
import org.rascalmpl.values.ValueFactoryFactory;
Expand Down Expand Up @@ -72,7 +77,7 @@ private static IValue symbolAST2SymbolConstructor(Sym symbol, boolean lex, Strin
boolean noExpand = lex || layout == null;

if (symbol.isCaseInsensitiveLiteral()) {
return factory.constructor(RascalValueFactory.Symbol_Cilit, ciliteral2Symbol(symbol.getCistring()));
return ciliteral2Symbol(symbol.getCistring());
}
if (symbol.isCharacterClass()) {
Class cc = symbol.getCharClass();
Expand Down Expand Up @@ -197,101 +202,34 @@ private static IValue symbolAST2SymbolConstructor(Sym symbol, boolean lex, Strin
}

private static IValue literal2Symbol(StringConstant sep) {
String lit = ((StringConstant.Lexical) sep).getString();
StringBuilder builder = new StringBuilder(lit.length());

// TODO: did we deal with all escapes here? probably not!
for (int i = 1; i < lit.length() - 1; i++) {
if (lit.charAt(i) == '\\') {
i++;
switch (lit.charAt(i)) {
case 'b':
builder.append('\b');
break;
case 'f':
builder.append('\f');
break;
case 'n':
builder.append('\n');
break;
case 't':
builder.append('\t');
break;
case 'r':
builder.append('\r');
break;
case '\\':
builder.append('\\');
break;
case '\"':
builder.append('\"');
break;
case '>':
builder.append('>');
break;
case '<':
builder.append('<');
break;
case '\'':
builder.append('\'');
break;
case 'u':
while (lit.charAt(i++) == 'u');
builder.append((char) Integer.decode("0x" + lit.substring(i, i+4)).intValue());
i+=4;
break;
default:
// octal escape
int a = lit.charAt(i++);
int b = lit.charAt(i++);
int c = lit.charAt(i);
builder.append( (char) (100 * a + 10 * b + c));
}
}
else {
builder.append(lit.charAt(i));
}
try {
String lit = ((StringConstant.Lexical) sep).getString();
// this should be the exact notation for string literals in vallang
IValue string = new StandardTextReader().read(factory, new StringReader(lit));

return factory.constructor(RascalValueFactory.Symbol_Lit, string);
}
catch (FactTypeUseException | IOException e) {
// this would mean Rascal's syntax definition for string constants is not aligned with vallang's string notation
throw new RuntimeException("Internal error: parsed stringconstant notation does not coincide with vallang stringconstant notation");
}

return factory.constructor(RascalValueFactory.Symbol_Lit, factory.string(builder.toString()));
}

private static IValue ciliteral2Symbol(CaseInsensitiveStringConstant constant) {
String lit = ((CaseInsensitiveStringConstant.Lexical) constant).getString();
StringBuilder builder = new StringBuilder(lit.length());

for (int i = 1; i < lit.length() - 1; i++) {
if (lit.charAt(i) == '\\') {
i++;
switch (lit.charAt(i)) {
case 'n':
builder.append('\n');
break;
case 't':
builder.append('\t');
break;
case 'r':
builder.append('\r');
break;
case '\\':
builder.append('\\');
break;
case '\"':
builder.append('\'');
break;
default:
int a = lit.charAt(i++);
int b = lit.charAt(i++);
int c = lit.charAt(i);
builder.append( (char) (100 * a + 10 * b + c));
}
}
else {
builder.append(lit.charAt(i));
}
try {
String lit = ((CaseInsensitiveStringConstant.Lexical) constant).getString();
// replace single quotes by double quotes first
lit = "\"" + lit.substring(1, lit.length() - 1) + "\"";

// this should be the exact notation for string literals in vallang
IValue string = new StandardTextReader().read(factory, new StringReader(lit));

return factory.constructor(RascalValueFactory.Symbol_Cilit, string);
}
catch (FactTypeUseException | IOException e) {
// this would mean Rascal's syntax definition for string constants is not aligned with vallang's string notation
throw new RuntimeException("Internal error: parsed stringconstant notation does not coincide with vallang stringconstant notation");
}

return factory.constructor(RascalValueFactory.Symbol_Lit, factory.string(builder.toString()));
}

private static IConstructor charclass2Symbol(Class cc) {
Expand Down Expand Up @@ -338,30 +276,35 @@ else if (range.isFromTo()) {
private static IValue char2int(Char character) {
String s = ((Char.Lexical) character).getString();
if (s.startsWith("\\")) {
if (s.length() > 1 && java.lang.Character.isDigit(s.charAt(1))) { // octal escape
// TODO
throw new NotYetImplemented("octal escape sequence in character class types");
}
if (s.length() > 1 && s.charAt(1) == 'u') { // octal escape
// TODO
throw new NotYetImplemented("unicode escape sequence in character class types");
if (ArrayUtils.contains(new int[] { 'a', 'u', 'U'}, s.charAt(1))) {
// lexical UnicodeEscape
// = utf16: "\\" [u] [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f]
// | utf32: "\\" [U] (("0" [0-9 A-F a-f]) | "10") [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f] // 24 bits
// | ascii: "\\" [a] [0-7] [0-9A-Fa-f]
// ;
return factory.integer(Integer.parseUnsignedInt(s.substring(2), 16));
}
char cha = s.charAt(1);
switch (cha) {
case 't': return factory.integer('\t');
case 'n': return factory.integer('\n');
case 'r': return factory.integer('\r');
case '\"' : return factory.integer('\"');
case '\'' : return factory.integer('\'');
case '-' : return factory.integer('-');
case '<' : return factory.integer('<');
case '>' : return factory.integer('>');
case '\\' : return factory.integer('\\');
else {
int cha = s.codePointAt(1);
switch (cha) {
case 't': return factory.integer('\t');
case 'n': return factory.integer('\n');
case 'r': return factory.integer('\r');
case '\"' : return factory.integer('\"');
case '\'' : return factory.integer('\'');
case '-' : return factory.integer('-');
case '<' : return factory.integer('<');
case '>' : return factory.integer('>');
case '\\' : return factory.integer('\\');
default:
return factory.integer(cha);
}
}
s = s.substring(1);
}
char cha = s.charAt(0);
return factory.integer(cha);
else {
int cha = s.codePointAt(0);
return factory.integer(cha);
}
}

public static IConstructor charClass(int ch) {
Expand Down

0 comments on commit edfb516

Please sign in to comment.