Skip to content

Commit

Permalink
Merge pull request #65 from b3b00/feature/stringDelimiterEscaping
Browse files Browse the repository at this point in the history
string improvements
  • Loading branch information
b3b00 authored Mar 8, 2018
2 parents 99847ed + be991b0 commit 2bfbb25
Show file tree
Hide file tree
Showing 5 changed files with 286 additions and 80 deletions.
126 changes: 115 additions & 11 deletions ParserTests/GenericLexerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,17 @@ public enum Extensions
[Lexeme(GenericToken.Extension)]
DATE,

[Lexeme(GenericToken.Extension)]
CHAINE,

[Lexeme(GenericToken.Double)]
DOUBLE,


}





public static class ExtendedGenericLexer
Expand Down Expand Up @@ -52,8 +58,6 @@ public static void AddExtension(Extensions token, LexemeAttribute lexem, Generic
{
if (token == Extensions.DATE)
{


NodeCallback<GenericToken> callback = (FSMMatch<GenericToken> match) =>
{
match.Properties[GenericLexer<Extensions>.DerivedToken] = Extensions.DATE;
Expand All @@ -66,18 +70,52 @@ public static void AddExtension(Extensions token, LexemeAttribute lexem, Generic
fsmBuilder.GoTo(GenericLexer<Extensions>.in_double)
.Transition('.', CheckDate)
.Mark("start_date")
.RepetitionTransition(4, "[0-9]")
// .RangeTransition('0','9')
// .Mark("y1")
// .RangeTransition('0','9')
// .Mark("y2")
// .RangeTransition('0','9')
// .Mark("y3")
// .RangeTransition('0','9')
// .Mark("y4")
.RepetitionTransition(4, "[0-9]")
.End(GenericToken.Extension)
.CallBack(callback);
}
else if (token == Extensions.CHAINE) {
NodeCallback<GenericToken> callback = (FSMMatch<GenericToken> match) =>
{
match.Properties[GenericLexer<Extensions>.DerivedToken] = Extensions.CHAINE;
return match;
};

char quote = '\'';
NodeAction collapseDelimiter = (string value) => {
if (value.EndsWith(""+quote+quote)) {
return value.Substring(0,value.Length-2)+quote;
}
return value;
};

var exceptQuote = new char[]{quote};
string in_string = "in_string_same";
string escaped = "escaped_same";
string delim = "delim_same";

var fsmBuilder = lexer.FSMBuilder;

fsmBuilder.GoTo(GenericLexer<Extensions>.start)
.Transition(quote)
.Mark(in_string)
.ExceptTransitionTo(exceptQuote,in_string)
.Transition(quote)

.Mark(escaped)
.End(GenericToken.String)
.CallBack(callback)
.Transition(quote)

.Mark(delim)
.Action(collapseDelimiter)
.ExceptTransitionTo(exceptQuote,in_string);
fsmBuilder.GoTo(delim)
.TransitionTo(quote,escaped)

.ExceptTransitionTo(exceptQuote,in_string);

}
}

}
Expand Down Expand Up @@ -115,6 +153,17 @@ public enum DefaultQuotedString
DefaultString
}

/// <summary>
/// Lexer definition with a single string token whose two lexeme parameters
/// are both the single quote <c>'</c>.
/// </summary>
public enum SelfEscapedString
{
    // The two string parameters are identical; TestSelfEscapedString tokenizes
    // single-quoted literals (including a doubled quote) against this enum.
    [Lexeme(GenericToken.String, "'", "'")]
    STRING
}

/// <summary>
/// Lexer definition where one token value carries two string lexemes:
/// one parameterized with <c>'</c> twice, and one with no parameters.
/// </summary>
public enum ManyString
{
    // NOTE(review): the parameterless lexeme presumably relies on the builder's
    // default delimiter; TestManyString feeds it a double-quoted literal - confirm.
    [Lexeme(GenericToken.String, "'", "'")]
    [Lexeme(GenericToken.String)]
    STRING
}

public enum AlphaId
{
[Lexeme(GenericToken.Identifier, IdentifierType.Alpha)]
Expand Down Expand Up @@ -151,8 +200,22 @@ public void TestExtensions()
Assert.Equal("20.02.2018", tokens[0].Value);
Assert.Equal(Extensions.DOUBLE, tokens[1].TokenID);
Assert.Equal("3.14", tokens[1].Value);

tokens = lexer.Tokenize("'that''s it'").ToList();
Assert.Equal(1,tokens.Count);
Token<Extensions> tok = tokens[0];
Assert.Equal(Extensions.CHAINE,tok.TokenID);
Assert.Equal("'that's it'",tokens[0].Value);

tokens = lexer.Tokenize("'et voilà'").ToList();
Assert.Equal(1,tokens.Count);
tok = tokens[0];
Assert.Equal(Extensions.CHAINE,tok.TokenID);
Assert.Equal("'et voilà'",tokens[0].Value);
}



[Fact]
public void TestAlphaId()
{
Expand Down Expand Up @@ -253,6 +316,47 @@ public void TestDefaultQuotedString()
Assert.Equal(source, tok.StringWithoutQuotes);
}

/// <summary>
/// Builds a lexer for <see cref="SelfEscapedString"/> and checks that each
/// single-quoted source produces exactly one STRING token with the expected value.
/// </summary>
[Fact]
public void TestSelfEscapedString()
{
    var buildResult = LexerBuilder.BuildLexer<SelfEscapedString>(new BuildResult<ILexer<SelfEscapedString>>());
    Assert.False(buildResult.IsError);

    var lexer = buildResult.Result as GenericLexer<SelfEscapedString>;
    Assert.NotNull(lexer);

    // Each entry is { source text, expected token value }.
    // The first case expects the doubled quote to come back as a single quote.
    var cases = new[]
    {
        new[] { "'that''s it'", "'that's it'" },
        new[] { "'et voilà'", "'et voilà'" }
    };

    foreach (var testCase in cases)
    {
        var tokens = lexer.Tokenize(testCase[0]).ToList();
        Assert.Equal(1, tokens.Count);

        Token<SelfEscapedString> token = tokens[0];
        Assert.Equal(SelfEscapedString.STRING, token.TokenID);
        Assert.Equal(testCase[1], token.Value);
    }
}

/// <summary>
/// Builds a lexer for <see cref="ManyString"/> and checks that a source holding
/// a double-quoted literal followed by a single-quoted literal yields two STRING
/// tokens whose values equal the original literals.
/// </summary>
[Fact]
public void TestManyString()
{
    var buildResult = LexerBuilder.BuildLexer<ManyString>(new BuildResult<ILexer<ManyString>>());
    Assert.False(buildResult.IsError);

    string doubleQuoted = "\"hello \\\"world \"";
    string singleQuoted = "'that''s it'";

    var tokens = buildResult.Result.Tokenize(doubleQuoted + " " + singleQuoted).ToList();
    Assert.Equal(2, tokens.Count);

    // NOTE(review): the single-quoted literal is expected back verbatim here
    // (doubled quote kept), unlike in TestSelfEscapedString - confirm this is intended.
    var expectedValues = new[] { doubleQuoted, singleQuoted };
    for (int i = 0; i < expectedValues.Length; i++)
    {
        Token<ManyString> token = tokens[i];
        Assert.Equal(ManyString.STRING, token.TokenID);
        Assert.Equal(expectedValues[i], token.Value);
    }
}

[Fact]
public void TestBadLetterStringDelimiter()
{
Expand Down
98 changes: 77 additions & 21 deletions sly/lexer/GenericLexer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ public class GenericLexer<IN> : ILexer<IN> where IN : struct
public static string single_line_comment_start = "single_line_comment_start";

public static string multi_line_comment_start = "multi_line_comment_start";

protected FSMLexer<GenericToken, GenericToken> LexerFsm;

protected BuildExtension<IN> ExtensionBuilder;
Expand All @@ -71,7 +72,9 @@ public class GenericLexer<IN> : ILexer<IN> where IN : struct
public FSMLexerBuilder<GenericToken, GenericToken> FSMBuilder;


protected char StringDelimiter;
protected char StringDelimiterChar;
protected char EscapeStringDelimiterChar;
protected int StringCounter = 0;

public string SingleLineComment { get; set; }
public string MultiLineCommentStart { get; set; }
Expand All @@ -82,14 +85,14 @@ public GenericLexer(IdentifierType idType = IdentifierType.Alpha, BuildExtension
{
InitializeStaticLexer(idType, staticTokens);
derivedTokens = new Dictionary<GenericToken, Dictionary<string, IN>>();
ExtensionBuilder = extensionBuilder;
ExtensionBuilder = extensionBuilder;
}


private void InitializeStaticLexer(IdentifierType idType = IdentifierType.Alpha, params GenericToken[] staticTokens)
{
FSMBuilder = new FSMLexerBuilder<GenericToken, GenericToken>();

StringCounter = 0;

// conf
FSMBuilder.IgnoreWS()
Expand Down Expand Up @@ -287,18 +290,34 @@ public void AddKeyWord(IN token, string keyword)
}


public void AddStringLexem(IN token, string stringDelimiter)
{
public void AddStringLexem(IN token, string stringDelimiter, string escapeDelimiterChar = "\\")
{

if (string.IsNullOrEmpty(stringDelimiter) || stringDelimiter.Length > 1)
{
throw new InvalidLexerException($"bad lexem {stringDelimiter} : StringToken lexeme <{token.ToString()}> must be 1 character length.");
throw new InvalidLexerException($"bad lexem {stringDelimiter} : StringToken lexeme delimiter char <{token.ToString()}> must be 1 character length.");
}
if (char.IsLetterOrDigit(stringDelimiter[0]))
{
throw new InvalidLexerException($"bad lexem {stringDelimiter} : SugarToken lexeme <{token.ToString()}> can not start with a letter.");
throw new InvalidLexerException($"bad lexem {stringDelimiter} : StringToken lexeme delimiter char <{token.ToString()}> can not start with a letter.");
}

if (string.IsNullOrEmpty(escapeDelimiterChar) || escapeDelimiterChar.Length > 1)
{
throw new InvalidLexerException($"bad lexem {escapeDelimiterChar} : StringToken lexeme escape char <{token.ToString()}> must be 1 character length.");
}
if (char.IsLetterOrDigit(escapeDelimiterChar[0]))
{
throw new InvalidLexerException($"bad lexem {escapeDelimiterChar} : StringToken lexeme escape char lexeme <{token.ToString()}> can not start with a letter.");
}

StringCounter++;

StringDelimiterChar = stringDelimiter[0];

EscapeStringDelimiterChar = escapeDelimiterChar[0];


StringDelimiter = stringDelimiter[0];

NodeCallback<GenericToken> callback = (FSMMatch<GenericToken> match) =>
{
Expand All @@ -309,18 +328,55 @@ public void AddStringLexem(IN token, string stringDelimiter)
return match;
};

FSMBuilder.GoTo(start);
FSMBuilder.Transition(StringDelimiter, GenericToken.String)
.Mark(in_string)
.ExceptTransitionTo(new char[] { StringDelimiter, '\\' }, in_string, GenericToken.String)
.Transition('\\', GenericToken.String)
.Mark(escape_string)
.AnyTransitionTo(' ', in_string, GenericToken.String)
.Transition(StringDelimiter, GenericToken.String)
if (StringDelimiterChar != EscapeStringDelimiterChar)
{

FSMBuilder.GoTo(start);
FSMBuilder.Transition(StringDelimiterChar, GenericToken.String)
.Mark(in_string+StringCounter)
.ExceptTransitionTo(new char[] { StringDelimiterChar, EscapeStringDelimiterChar }, in_string+StringCounter, GenericToken.String)
.Transition(EscapeStringDelimiterChar, GenericToken.String)
.Mark(escape_string+StringCounter)
.AnyTransitionTo(' ', in_string+StringCounter, GenericToken.String)
.Transition(StringDelimiterChar, GenericToken.String)
.End(GenericToken.String)
.Mark(string_end+StringCounter)
.CallBack(callback);
FSMBuilder.Fsm.StringDelimiter = StringDelimiterChar;
}
else {
NodeAction collapseDelimiter = (string value) => {
if (value.EndsWith(""+StringDelimiterChar+StringDelimiterChar)) {
return value.Substring(0,value.Length-2)+StringDelimiterChar;
}
return value;
};

var exceptDelimiter = new char[]{StringDelimiterChar};
string in_string = "in_string_same";
string escaped = "escaped_same";
string delim = "delim_same";

FSMBuilder.GoTo(start)
.Transition(StringDelimiterChar)
.Mark(in_string+StringCounter)
.ExceptTransitionTo(exceptDelimiter,in_string+StringCounter)
.Transition(StringDelimiterChar)

.Mark(escaped+StringCounter)
.End(GenericToken.String)
.Mark(string_end)
.CallBack(callback);
FSMBuilder.Fsm.StringDelimiter = StringDelimiter;
.CallBack(callback)
.Transition(StringDelimiterChar)

.Mark(delim+StringCounter)
.Action(collapseDelimiter)
.ExceptTransitionTo(exceptDelimiter,in_string+StringCounter);

FSMBuilder.GoTo(delim+StringCounter)
.TransitionTo(StringDelimiterChar,escaped+StringCounter)

.ExceptTransitionTo(exceptDelimiter,in_string+StringCounter);
}

}
public void AddSugarLexem(IN token, string specialValue)
Expand Down Expand Up @@ -411,7 +467,7 @@ public void ConsumeComment(Token<GenericToken> comment, string source)
}
else
{
newColumn = LexerFsm.CurrentColumn+lines[0].Length+MultiLineCommentEnd.Length;
newColumn = LexerFsm.CurrentColumn + lines[0].Length + MultiLineCommentEnd.Length;
}


Expand All @@ -424,7 +480,7 @@ public Token<IN> Transcode(FSMMatch<GenericToken> match)
var tok = new Token<IN>();
tok.Value = match.Result.Value;
tok.Position = match.Result.Position;
tok.StringDelimiter = StringDelimiter;
tok.StringDelimiter = StringDelimiterChar;
tok.TokenID = (IN)match.Properties[DerivedToken];
return tok;
}
Expand Down
19 changes: 15 additions & 4 deletions sly/lexer/LexerBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -264,12 +264,23 @@ private static BuildResult<ILexer<IN>> BuildGenericLexer<IN>(Dictionary<IN, List
{
if (lexem.GenericTokenParameters != null && lexem.GenericTokenParameters.Length > 0)
{
try {
lexer.AddStringLexem(tokenID, lexem.GenericTokenParameters[0]);
try
{
string delimiter = lexem.GenericTokenParameters[0];
if (lexem.GenericTokenParameters.Length > 1)
{
string escape = lexem.GenericTokenParameters[1];
lexer.AddStringLexem(tokenID, delimiter, escape);
}
else
{
lexer.AddStringLexem(tokenID, delimiter);
}
}
catch (Exception e) {
catch (Exception e)
{
result.IsError = true;
result.AddError(new InitializationError(ErrorLevel.FATAL,e.Message));
result.AddError(new InitializationError(ErrorLevel.FATAL, e.Message));
}
}
else
Expand Down
Loading

0 comments on commit 2bfbb25

Please sign in to comment.