forked from links-lang/links
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlexer.mll
407 lines (365 loc) · 16.2 KB
/
lexer.mll
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
(*
We can't tokenize Links using an FSM because of the following
combination of properties:
1. XML literals must be tokenized differently from regular code.
For example, most operator characters in Links have no special
meaning when they occur in an XML literal.
2. An XML literal can contain nested Links expressions (using the
"unquote" characters '{' and '}').
3. Links code can contain nested XML literals.
4. XML literal elements can contain other XML literal elements.
5. Braces can be arbitrarily nested within Links code.
We therefore need the power of a PDA. This is achieved by
maintaining a stack of lexers, which is manipulated by the semantic
actions. The particularly notable manipulations are:
1. On encountering '{' we push an expression lexer onto the stack.
On encountering '}' we pop the lexer, and carry on lexing in
the "mode" we were in before.
2. On encountering the start of an XML literal we push an xml lexer.
On encountering an XML closing tag we pop the lexer and carry on
lexing in the "mode" we were in before.
There are some other details (involving start tags and attributes)
that are handled similarly to the above.
*)
{
open Lexing
open Utility
open Parser
(* Constructors are not first class in OCaml *)
(* Eta-expanded wrappers around the Parser's operator-token constructors,
   so the precedence tables below can store them as ordinary functions of
   type [string -> Parser.token]. One triple (non/left/right assoc) per
   precedence level 0-9, plus prefix/postfix operators. *)
let infix0 x = INFIX0 x
let infixl0 x = INFIXL0 x
let infixr0 x = INFIXR0 x
let infix1 x = INFIX1 x
let infixl1 x = INFIXL1 x
let infixr1 x = INFIXR1 x
let infix2 x = INFIX2 x
let infixl2 x = INFIXL2 x
let infixr2 x = INFIXR2 x
let infix3 x = INFIX3 x
let infixl3 x = INFIXL3 x
let infixr3 x = INFIXR3 x
let infix4 x = INFIX4 x
let infixl4 x = INFIXL4 x
let infixr4 x = INFIXR4 x
let infix5 x = INFIX5 x
let infixl5 x = INFIXL5 x
let infixr5 x = INFIXR5 x
let infix6 x = INFIX6 x
let infixl6 x = INFIXL6 x
let infixr6 x = INFIXR6 x
let infix7 x = INFIX7 x
let infixl7 x = INFIXL7 x
let infixr7 x = INFIXR7 x
let infix8 x = INFIX8 x
let infixl8 x = INFIXL8 x
let infixr8 x = INFIXR8 x
let infix9 x = INFIX9 x
let infixl9 x = INFIXL9 x
let infixr9 x = INFIXR9 x
let prefix x = PREFIXOP x
let postfix x = POSTFIXOP x
(* [precs] maps a precedence level (its index, 0-9) to the triple of
   token constructors (non-assoc, left-assoc, right-assoc) for that
   level.  Indexed with [List.nth] in [lexer_context#setprec]. *)
let precs =
[
infix0, infixl0, infixr0;
infix1, infixl1, infixr1;
infix2, infixl2, infixr2;
infix3, infixl3, infixr3;
infix4, infixl4, infixr4;
infix5, infixl5, infixr5;
infix6, infixl6, infixr6;
infix7, infixl7, infixr7;
infix8, infixl8, infixr8;
infix9, infixl9, infixr9;
]
(* Initial operator table: operator name -> token constructor.
   Extended at parse time by infix/infixl/infixr/prefix/postfix
   declarations via [lexer_context#setprec]. *)
let initial_optable =
(* lifted from the definition of Haskell *)
[
"!" , infix9;
"^" , infixr8;
"^^" , infixr8;
"**" , infixr8;
"*" , infixl7;
"/" , infixl7;
"+" , infixl6;
"*." , infixl7;
"/." , infixl7;
"+." , infixl6;
"-" , infixl6;
".-" , infixl6;
"::" , infixr5;
"++" , infixr5;
"==" , infix4;
"<>" , infix4;
"<" , infix4;
"<=" , infix4;
">=" , infix4;
">" , infix4;
"&&" , infixr3;
"||" , infixr2;
">>" , infixl1;
]
(* Operators absent from the table default to left-associative, level 9. *)
let default_precedence = infixl9
(* Per-parse mutable state: the operator-precedence table and the stack of
   active lexers that implements the PDA described in the header comment.
   The lexer on top of [lexers] is the one the parser driver calls next. *)
class lexer_context =
object
val mutable optable = initial_optable
val lexers = Stack.create ()
(* Look up the token constructor for operator [name] and apply it,
   falling back to [default_precedence] for unknown operators.
   NOTE(review): only [NotFound _] (from Utility) is caught here, while
   stdlib [List.assoc] raises [Not_found] — presumably Utility shadows
   the raising behavior; confirm (the keyword lookup in [lex] below
   catches both exceptions). *)
method precedence name =
try List.assoc name optable name
with NotFound _ -> default_precedence name
(* Record a fixity declaration: bind [name] at [level] with the given
   associativity.  [List.nth precs level] raises if [level] is outside
   0-9; new bindings are simply consed on, shadowing older ones. *)
method setprec (assoc : [`None|`Left|`Right|`Pre|`Post]) level name =
let value = match List.nth precs level, assoc with
| (a,_,_), `None -> a
| (_,a,_), `Left -> a
| (_,_,a), `Right -> a
| _, `Pre -> prefix
| _, `Post -> postfix
in
optable <- (name, value) :: optable
(* Stack manipulation used by the semantic actions below. *)
method push_lexer (lexer : Lexing.lexbuf -> Parser.token) =
Stack.push lexer lexers
(* Returns the popped lexer (call sites discard the result). *)
method pop_lexer =
Stack.pop lexers
method next_lexer =
Stack.top lexers
end
let fresh_context () = new lexer_context
(* [bump_lines lexbuf n] advances the line counter of [lexbuf]'s current
   position by [n]; called by the rules below whenever they consume
   newline characters, so error positions stay accurate. *)
let bump_lines lexbuf n =
  let pos = lexbuf.lex_curr_p in
  lexbuf.lex_curr_p <- { pos with pos_lnum = pos.pos_lnum + n }
let count_newlines = StringUtils.count '\n'
(* Reserved words mapped to their parser tokens.  Consulted by the
   identifier rule in [lex] below, and used to reject keywords appearing
   as backquoted infix operators. *)
let keywords = [
"alien" , ALIEN;
"as" , AS;
"case" , CASE;
"client" , CLIENT;
"database" , DATABASE;
"default" , DEFAULT;
"delete" , DELETE;
"else" , ELSE;
"escape" , ESCAPE;
"false" , FALSE;
"for" , FOR;
"forall" , FORALL;
"from" , FROM;
"fun" , FUN;
"formlet" , FORMLET;
"if" , IF;
"in" , IN;
"open" , OPEN;
"yields" , YIELDS;
"insert" , INSERT;
"linfun" , LINFUN;
"module" , MODULE;
"mu" , MU;
"native" , NATIVE;
"nu" , NU;
"offer" , OFFER;
"orderby" , ORDERBY;
"op" , OP;
"page" , PAGE;
"query" , QUERY;
"readonly" , READONLY;
"receive" , RECEIVE;
"returning", RETURNING;
"select" , SELECT;
"server" , SERVER;
"set" , SET;
"sig" , SIG;
"spawn" , SPAWN;
"spawnClient" , SPAWNCLIENT;
"spawnAt" , SPAWNAT;
"spawnAngel" , SPAWNANGEL;
"spawnAngelAt" , SPAWNANGELAT;
"spawnWait", SPAWNWAIT;
"switch" , SWITCH;
"table" , TABLE;
"TableHandle", TABLEHANDLE;
"true" , TRUE;
"typename" , TYPENAME;
"update" , UPDATE;
"values" , VALUES;
"var" , VAR;
"where" , WHERE;
"with" , WITH;
(* SAND *)
"tablekeys" , TABLEKEYS;
]
(* Raised when no rule accepts the input: (offending lexeme, its end position). *)
exception LexicalError of (string * Lexing.position)
}
(* Named regular expressions shared by the rules below. *)
let def_id = (['a'-'z' 'A'-'Z'] ['a'-'z' 'A'-'Z' '_' '0'-'9']*)
let module_name = (['A'-'Z'] (['A'-'Z' 'a'-'z'])*)
(* Escape payloads: \NNN octal (first digit 0-3) and \xHH hex. *)
let octal_code = (['0'-'3']['0'-'7']['0'-'7'])
let hex_code = (['0'-'9''a'-'f''A'-'F']['0'-'9''a'-'f''A'-'F'])
(* Qualified name as used in XML tags: '#' alone, or colon-separated ids. *)
let def_qname = ('#' | def_id (':' def_id)*)
let def_integer = (['1'-'9'] ['0'-'9']* | '0')
let def_float = (def_integer '.' ['0'-'9']+ ('e' ('-')? def_integer)?)
let def_blank = [' ' '\t' '\r' '\n']
(* One character inside a single- or double-quoted literal: anything but
   quote/backslash, or a recognised backslash escape. *)
let char_contents = ([^ '\"' '\\']|"\\\"" |"\\\\" | "\\n" | "\\r" | "\\t" | ('\\' octal_code) | ('\\' ['x' 'X'] hex_code))
let string_contents = char_contents*
let regexrepl_fsa = [^ '{' '/']* (* this regex is too restrictive. But can't seem to get a more precise one to work :( *)
let regex_flags = ['l' 'n' 'g']*
(* NOTE(review): the first alternative below renders as an empty char
   literal '', which is not valid OCaml — a character was probably lost
   in transcription; confirm against the upstream source. *)
let directive_prefix = ['' '@' '$']
let xml_opening = ('<' def_id)
let xml_closing_tag = ('<' '/' def_id '>')
(* Characters that may start / continue a user-defined operator. *)
let initopchar = [ '!' '$' '&' '*' '+' '/' '<' '=' '>' '@' '\\' '^' '-' ]
let opchar = [ '.' '!' '$' '&' '*' '+' '/' '<' '=' '>' '@' '\\' '^' '-' ]
(* Each lexer when called must return exactly one token and possibly
modify the stack of remaining lexers. The lexer on top of the stack
will be called next; when each action starts it's the current lexer.
Each rule takes two arguments: the currently operative precedence
table and the stack.
*)
(* Main expression-mode lexer: returns exactly one token per call.
   Relies on ocamllex disambiguation: the longest match wins, and among
   equal-length matches the earliest rule wins.  Rule order below is
   therefore load-bearing (e.g. "End" vs def_id, "=~" vs '=', "<-"/"<--"
   vs '<', and the fixity keywords vs def_id). *)
rule lex ctxt nl = parse
| eof { END }
(* '#' line comment: skip to end of line (the newline itself is handled
   by the '\n' rule on the next call). *)
| '#' ([^ '\n'] *) { lex ctxt nl lexbuf }
| "End" { END }
| ';' { SEMICOLON }
(* REPL/directive syntax: prefix character followed by an identifier. *)
| directive_prefix (def_id as id) { KEYWORD id}
(* [nl] is the caller-supplied newline hook; also keep positions honest. *)
| '\n' { nl (); bump_lines lexbuf 1; lex ctxt nl lexbuf }
| '_' { UNDERSCORE }
| '=' { EQ }
| "->" { RARROW }
| "~>" { SQUIGRARROW }
| "-@" { LOLLI }
| "~@" { SQUIGLOLLI }
| "=>" { FATRARROW }
| "-." { MINUSDOT }
| '-' { MINUS }
| '(' { LPAREN }
| ')' { RPAREN }
(* Braces open a nested expression scope: push this same lexer so '}'
   can pop back to whatever mode enclosed us (see header comment). *)
| "{|" { ctxt#push_lexer (lex ctxt nl); LBRACEBAR }
| '{' { ctxt#push_lexer (lex ctxt nl); LBRACE }
| "|}" { ctxt#pop_lexer (* fall back *); BARRBRACE }
| '}' { ctxt#pop_lexer (* fall back *); RBRACE }
| "<->" { LRARROW }
| "<--" { LLARROW }
| "<-" { LARROW }
| "<|" { LEFTTRIANGLE }
| "|>" { RIGHTTRIANGLE }
(* '<' immediately followed by a tag name starts an XML literal. *)
| '<' (def_qname as id) { (* come back here after scanning the start tag *)
ctxt#push_lexer (starttag ctxt nl); LXML id }
| "[|" { LBRACKETBAR }
| "|]" { BARRBRACKET }
| '[' { LBRACKET }
| ']' { RBRACKET }
| "[+|" { LBRACKETPLUSBAR }
| "|+]" { BARPLUSRBRACKET }
| "[&|" { LBRACKETAMPBAR }
| "|&]" { BARAMPRBRACKET }
| "||" { BARBAR }
| "&&" { AMPAMP }
| '|' { VBAR }
| '~' { TILDE }
(* Regex match operator: switch into the regex dispatcher. *)
| "=~" { ctxt#push_lexer (regex' ctxt nl); EQUALSTILDE }
| ',' { COMMA }
| '.' { DOT }
| ".." { DOTDOT }
| "::" { COLONCOLON }
| ':' { COLON }
| '!' { BANG }
| '?' { QUESTION }
| "%" def_id as var { PERCENTVAR var }
| '%' { PERCENT }
(* User-defined operator: token (and precedence) comes from the table. *)
| initopchar opchar * as op { ctxt#precedence op }
(* Backquoted identifier used as an infix operator; keywords and
   capitalised names are rejected. *)
| '`' (def_id as var) '`' { if List.mem_assoc var keywords || Char.isUpper var.[0] then
raise (LexicalError (lexeme lexbuf, lexeme_end_p lexbuf))
else ctxt#precedence var }
(* Character literal: must decode to exactly one character.
   decode_escapes is presumably from Utility — TODO confirm. *)
| "'" (char_contents as c) "'" { let c' = decode_escapes c in
if String.length c' = 1 then CHAR (c'.[0])
else raise (LexicalError (lexeme lexbuf, lexeme_end_p lexbuf)) }
| def_integer as var { UINTEGER (int_of_string var) }
| def_float as var { UFLOAT (float_of_string var) }
| ('\"' (string_contents as var) '\"'){ STRING (decode_escapes var) }
(* Fixity declarations carry the setprec callback in their payload so
   the parser can update the operator table. *)
| "infix" { INFIX ctxt#setprec }
| "infixl" { INFIXL ctxt#setprec }
| "infixr" { INFIXR ctxt#setprec }
| "prefix" { PREFIX ctxt#setprec }
| "postfix" { POSTFIX ctxt#setprec }
(* Plain identifier: keyword if in the table, otherwise a constructor
   (capitalised) or variable. *)
| def_id as var { try List.assoc var keywords
with Not_found | NotFound _ ->
if Char.isUpper var.[0] then CONSTRUCTOR var
else VARIABLE var }
| def_blank { lex ctxt nl lexbuf }
| _ { raise (LexicalError (lexeme lexbuf, lexeme_end_p lexbuf)) }
(* Lexer for the inside of an XML start tag (after [lex] has consumed
   '<name' and pushed us): attribute names, '=', and attribute values. *)
and starttag ctxt nl = parse
| eof { END }
| def_qname as var { VARIABLE var }
| '=' { EQ }
| '#' ([^ '\n'] *) { starttag ctxt nl lexbuf }
(* End of the start tag: replace ourselves with the element-body lexer. *)
| '>' { (* Switch to `xmllex' *)
ctxt#pop_lexer; ctxt#push_lexer (xmllex ctxt nl); RXML }
| '"' { (* Come back here after scanning the attr value *)
ctxt#push_lexer (attrlex ctxt nl); LQUOTE }
| '{' { (* Come back here after scanning the attribute block *)
ctxt#push_lexer (lex ctxt nl); LBRACE }
(* Self-closing tag: no element body, return to the enclosing mode. *)
| "/>" { ctxt#pop_lexer (* fall back *); SLASHRXML }
| '\n' { nl () ; bump_lines lexbuf 1; starttag ctxt nl lexbuf }
| def_blank { starttag ctxt nl lexbuf }
| _ { raise (LexicalError (lexeme lexbuf, lexeme_end_p lexbuf)) }
(* Lexer for XML element content (between '>' and the closing tag).
   Most characters are CDATA; braces unquote back into Links code. *)
and xmllex ctxt nl = parse
| eof { END }
(* Doubled braces are the escape for literal '{' / '}' in content. *)
| "{{" { CDATA "{" }
| "}}" { CDATA "}" }
(* A lone '}' here is an unbalanced unquote: reject it. *)
| "}" { raise (LexicalError (lexeme lexbuf, lexeme_end_p lexbuf)) }
| [^ '{' '}' '<' '&' ]* as cdata { bump_lines lexbuf (count_newlines cdata); CDATA cdata }
| "&" { CDATA "&" }
(* A bare '<' (not followed by a tag name or '/') is plain content;
   longest-match makes the tag rules below win whenever they apply. *)
| "<" { CDATA "<" }
| ">" { CDATA ">" }
| "{|" { (* scan the expression, then back here *)
ctxt#push_lexer (lex ctxt nl); LBRACEBAR }
| '{' { (* scan the expression, then back here *)
ctxt#push_lexer (lex ctxt nl); LBRACE }
(* Closing tag ends this element: pop back to the enclosing mode. *)
| "</" (def_qname as var) '>' { (* fall back *)
ctxt#pop_lexer; ENDTAG var }
| '<' (def_qname as var) { (* switch to `starttag' to handle the nested xml, then back here *)
ctxt#push_lexer (starttag ctxt nl); LXML var }
| _ { raise (LexicalError (lexeme lexbuf, lexeme_end_p lexbuf)) }
(* Lexer for a double-quoted attribute value (pushed by [starttag] on '"').
   Literal text becomes STRING; '{' unquotes into Links code. *)
and attrlex ctxt nl = parse
| eof { END }
| '"' { (* fall back *)
ctxt#pop_lexer; RQUOTE }
| '{' { (* scan the expression, then back here *)
ctxt#push_lexer (lex ctxt nl); LBRACE }
| [^ '{' '"']* as string { bump_lines lexbuf (count_newlines string); STRING string }
| _ { raise (LexicalError (lexeme lexbuf, lexeme_end_p lexbuf)) }
(* Dispatcher entered after '=~': decides between a match regex (/.../)
   and a substitution (s/.../.../).  [regex] pops TWO lexers when it sees
   its closing '/', so for s/// we push [regexrepl] twice: the first pop
   pair removes [regex] plus one [regexrepl], leaving one [regexrepl] to
   scan the replacement; its own closing '/' then pops the remaining
   [regexrepl] and this dispatcher — net stack balance of zero.
   NOTE(review): no catch-all rule here — any other character makes the
   generated lexer fail with Failure "lexing: empty token" rather than
   LexicalError; confirm this is intended. *)
and regex' ctxt nl = parse
| eof { END }
| '/' { ctxt#push_lexer (regex ctxt nl); SLASH }
| "s/" { ctxt#push_lexer (regexrepl ctxt nl); (* push twice is intentional *)
ctxt#push_lexer (regexrepl ctxt nl);
ctxt#push_lexer (regex ctxt nl);
SSLASH }
| '#' ([^ '\n'] *) { regex' ctxt nl lexbuf }
| '\n' { nl (); bump_lines lexbuf 1; regex' ctxt nl lexbuf }
| def_blank { regex' ctxt nl lexbuf }
(* Lexer for the body of a regex pattern.  The closing '/' pops two
   lexers (this one and the regex' dispatcher beneath it — or, in the
   s/// case, one of the queued regexrepl lexers; see regex' above).
   Since regex_flags may be empty, the bare '/' rule wins ties by rule
   order, and '/' with flags wins by longest match. *)
and regex ctxt nl = parse
| eof { END }
| '/' { ctxt#pop_lexer; ctxt#pop_lexer; SLASH }
| '/' (regex_flags as f) { ctxt#pop_lexer; ctxt#pop_lexer; SLASHFLAGS (f) }
| '.' { DOT }
(* Character range [a-z]: only the single-range form is recognised. *)
| '[' (_ as f) '-' (_ as t) ']' { RANGE (f,t) }
| '?' { QUESTION }
| '*' { STAR }
| '+' { PLUS }
| '|' { ALTERNATE }
| '^' { CARET }
| '$' { DOLLAR }
| '(' { LPAREN }
| ')' { RPAREN }
| '{' { (* scan the expression, then back here *)
ctxt#push_lexer (lex ctxt nl); LBRACE }
(* Backslash escapes the next character, whatever it is. *)
| '\\' (_ as c) { QUOTEDMETA (String.make 1 c)}
| '\n' { nl (); bump_lines lexbuf 1; regex ctxt nl lexbuf }
| def_blank { regex ctxt nl lexbuf }
(* Any other single character is a literal. *)
| (_ as c) { STRING (String.make 1 c) }
(* Lexer for the replacement part of s/pat/repl/.  The closing '/' pops
   this lexer and the regex' dispatcher (see regex' for the accounting).
   NOTE(review): regexrepl_fsa can match the empty string, so ocamllex
   will warn about this rule; ties with '{' / '/' are resolved in their
   favour by longest match. *)
and regexrepl ctxt nl = parse
| eof { END }
| regexrepl_fsa as var { REGEXREPL(var) }
| '{' { (* scan the expression, then back here *)
ctxt#push_lexer (lex ctxt nl); LBRACE }
| '/' { ctxt#pop_lexer; ctxt#pop_lexer; SLASH}
| '/' (regex_flags as f) { ctxt#pop_lexer; ctxt#pop_lexer; SLASHFLAGS (f) }
{
(* Entry point handed to the parser: seeds [ctxt]'s stack with the
   expression lexer, then returns a function that always dispatches to
   whichever lexer is currently on top of the stack.  [newline_hook] is
   called on every newline consumed (used e.g. for interactive input). *)
let lexer ctxt ~newline_hook =
ctxt#push_lexer (lex ctxt newline_hook);
fun lexbuf -> ctxt#next_lexer lexbuf
}