This repository has been archived by the owner on Apr 5, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 8
/
py_tokenizer.erl
177 lines (159 loc) · 9.45 KB
/
py_tokenizer.erl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
-module(py_tokenizer).
-export([tokenize/2, all_errors/1]).
% Turns a character list into a list of tokens.
%
% Input    - the source text as a list of characters.
% Position - {Filename, Line, Column} of the first character of Input.
%
% Returns the tokens in source order. Scanning is delegated to
% tokenize_one/2, which is called repeatedly until no input is left.
tokenize(Input, Position) ->
    tokenize_acc(Input, Position, []).

% Worker for tokenize/2: collects tokens in reverse order and flips the
% list once the input is exhausted.
tokenize_acc([], _Position, Acc) ->
    lists:reverse(Acc);
tokenize_acc(Input, Position, Acc) ->
    {Token, Remaining, NextPosition} = tokenize_one(Input, Position),
    tokenize_acc(Remaining, NextPosition, [Token | Acc]).
% Scans a single token from the front of the input.
%
% The second argument is the position {Filename, Line, Column} of the
% first character of the input. The result is always a triple
% {Token, RemainingInput, PositionAfterToken}.
%
% Clause order matters: longer matches (triple quotes, two-character
% operators) must be tried before their one-character prefixes.

% end of input
tokenize_one([], Pos) ->
    {{eof, Pos, Pos}, [], Pos};
% triple-quoted strings: three identical quote characters in a row
tokenize_one([Quote, Quote, Quote | Tail], {File, Line, Col} = Start)
  when Quote == $'; Quote == $" ->
    match_large_string(Quote, [], Tail, Start, {File, Line, Col + 3});
% single-line strings
tokenize_one([Quote | Tail], {File, Line, Col} = Start)
  when Quote == $'; Quote == $" ->
    match_string(Quote, [], Tail, Start, {File, Line, Col + 1});
% carriage return: skipped, but still counted as a column
tokenize_one([$\r | Tail], {File, Line, Col}) ->
    tokenize_one(Tail, {File, Line, Col + 1});
% line feed: emits a new_line token and moves to column 1 of the next line
tokenize_one([$\n | Tail], {File, Line, _Col} = Here) ->
    {{new_line, Here}, Tail, {File, Line + 1, 1}};
% blanks in column 1 start a leading_space (indentation) token
tokenize_one([Ws | Tail], {File, Line, 1} = Start)
  when Ws == $\s; Ws == $\t ->
    match_leading_space([Ws], Tail, Start, {File, Line, 2});
% blanks anywhere else are skipped
tokenize_one([Ws | Tail], {File, Line, Col})
  when Ws == $\s; Ws == $\t ->
    tokenize_one(Tail, {File, Line, Col + 1});
% line comment
tokenize_one([$# | Tail], Pos) ->
    match_line_comment(Tail, Pos);
% identifiers and keywords: start with a letter or an underscore
tokenize_one([Ch | Tail], {File, Line, Col} = Start)
  when Ch == $_; Ch >= $a, Ch =< $z; Ch >= $A, Ch =< $Z ->
    match_symbol([Ch], Tail, Start, {File, Line, Col + 1});
% numbers, including the ".5" form that starts with a dot
tokenize_one([$., Digit | Tail], {File, Line, Col} = Start)
  when Digit >= $0, Digit =< $9 ->
    match_number([$., Digit], Tail, Start, {File, Line, Col + 2});
tokenize_one([Digit | Tail], {File, Line, Col} = Start)
  when Digit >= $0, Digit =< $9 ->
    match_number([Digit], Tail, Start, {File, Line, Col + 1});
% two-character punctuation; the token value is a string
tokenize_one([$*, $* | Tail], {File, Line, Col} = Start) ->
    {{punctuation, Start, "**"}, Tail, {File, Line, Col + 2}};
tokenize_one([$>, $= | Tail], {File, Line, Col} = Start) ->
    {{punctuation, Start, ">="}, Tail, {File, Line, Col + 2}};
% "=>" is reported as ">="
tokenize_one([$=, $> | Tail], {File, Line, Col} = Start) ->
    {{punctuation, Start, ">="}, Tail, {File, Line, Col + 2}};
tokenize_one([$<, $= | Tail], {File, Line, Col} = Start) ->
    {{punctuation, Start, "<="}, Tail, {File, Line, Col + 2}};
% "=<" is reported as "<="
tokenize_one([$=, $< | Tail], {File, Line, Col} = Start) ->
    {{punctuation, Start, "<="}, Tail, {File, Line, Col + 2}};
tokenize_one([$!, $= | Tail], {File, Line, Col} = Start) ->
    {{punctuation, Start, "!="}, Tail, {File, Line, Col + 2}};
tokenize_one([$=, $= | Tail], {File, Line, Col} = Start) ->
    {{punctuation, Start, "=="}, Tail, {File, Line, Col + 2}};
% one-character punctuation; the token value is the character itself
% (an integer), not a one-element string
tokenize_one([Ch | Tail], {File, Line, Col} = Start)
  when Ch == $:; Ch == $=; Ch == $+; Ch == $-; Ch == $*; Ch == $/; Ch == $\\;
       Ch == $(; Ch == $); Ch == $.; Ch == $,; Ch == $[; Ch == $]; Ch == ${;
       Ch == $}; Ch == $>; Ch == $<; Ch == $@; Ch == $%; Ch == $; ->
    {{punctuation, Start, Ch}, Tail, {File, Line, Col + 1}};
% anything else is reported as an error token and skipped
tokenize_one([Ch | Tail], {File, Line, Col} = Start) ->
    {{error, Start, unexpected_char, Ch}, Tail, {File, Line, Col + 1}}.
% Scans the body of a triple-quoted string.
%
% Mark          - the quote character; three in a row close the string.
% Parsed        - characters already collected, in source order.
% Input         - remaining input, positioned just after the opening quotes.
% StartPosition - position of the opening quotes, stored in the token.
% Position      - {Filename, Line, Column} of the head of Input.
%
% Returns {{string, StartPosition, Contents}, Rest, NewPosition}, or
% {{error, Position, unexpected_end_of_file}, [], Position} when the input
% ends before the closing quotes. Line feeds are kept in the contents and
% advance the line counter. Backslash escapes are not interpreted here.
match_large_string(Mark, Parsed, Input, StartPosition, Position) ->
    % Characters are collected in reverse and flipped once at the end, so the
    % cost is linear in the string length instead of quadratic.
    match_large_string_acc(Mark, lists:reverse(Parsed), Input, StartPosition, Position).

% Worker for match_large_string/5; RevParsed holds the contents reversed.
match_large_string_acc(Mark, RevParsed, [Mark, Mark, Mark | Rest], StartPosition, {Filename, Line, Column}) ->
    {{string, StartPosition, lists:reverse(RevParsed)}, Rest, {Filename, Line, Column + 3}};
match_large_string_acc(Mark, RevParsed, [$\n | Rest], StartPosition, {Filename, Line, _Column}) ->
    match_large_string_acc(Mark, [$\n | RevParsed], Rest, StartPosition, {Filename, Line + 1, 1});
match_large_string_acc(Mark, RevParsed, [C | Rest], StartPosition, {Filename, Line, Column}) ->
    match_large_string_acc(Mark, [C | RevParsed], Rest, StartPosition, {Filename, Line, Column + 1});
match_large_string_acc(_Mark, _RevParsed, [], _StartPosition, Position) ->
    {{error, Position, unexpected_end_of_file}, [], Position}.
% Maps the character that follows a backslash in a string literal to the
% character it stands for. Returns the atom illegal for any character that
% is not a supported escape.
%
% The \x and \u escapes are not handled here; match_string/5 intercepts
% them before calling this function.
translate_char($n) -> $\n;
translate_char($t) -> $\t;
translate_char($r) -> $\r;
% \a (bell) has no Erlang character escape, so its code is spelled out
translate_char($a) -> 7;
translate_char($b) -> $\b;
translate_char($f) -> $\f;
translate_char($v) -> $\v;
translate_char($\\) -> $\\;
translate_char($/) -> $/;
translate_char($') -> $';
translate_char($") -> $";
translate_char(_Other) -> illegal.
% Scans the body of a single-line string literal.
%
% Mark          - the quote character that closes the string.
% Parsed        - characters already collected, in source order.
% Input         - remaining input, positioned after the opening quote.
% StartPosition - position of the opening quote, stored in the token.
% Position      - {Filename, Line, Column} of the head of Input.
%
% Returns {{string, StartPosition, Contents}, Rest, NewPosition} on success.
% Error tokens are returned for an unsupported escape (unexpected_char), a
% line feed inside the string (unexpected_end_of_line) and input that ends
% before the closing quote (unexpected_end_of_file). After an error the
% caller resumes scanning right behind the offending character.
match_string(Mark, Parsed, [Mark | Rest], StartPosition, {Filename, Line, Column}) ->
    {{string, StartPosition, Parsed}, Rest, {Filename, Line, Column + 1}};
% \x and \u introduce a hexadecimal character code
match_string(Mark, Parsed, [$\\, C | Rest], StartPosition, {Filename, Line, Column})
  when C == $x; C == $u ->
    match_hex_string(Mark, Parsed, [], Rest, StartPosition, {Filename, Line, Column + 2});
% every other backslash escape is looked up in translate_char/1
match_string(Mark, Parsed, [$\\, C | Rest], StartPosition, {Filename, Line, Column}) ->
    case translate_char(C) of
        illegal ->
            % the error points at the character after the backslash
            {{error, {Filename, Line, Column + 1}, unexpected_char, C}, Rest, {Filename, Line, Column + 2}};
        TranslatedC ->
            match_string(Mark, Parsed ++ [TranslatedC], Rest, StartPosition, {Filename, Line, Column + 2})
    end;
match_string(_Mark, _Parsed, [$\n | Rest], _StartPosition, {Filename, Line, Column}) ->
    {{error, {Filename, Line, Column}, unexpected_end_of_line}, Rest, {Filename, Line + 1, 1}};
% input ended before the closing quote: report it instead of crashing with
% function_clause, the same way match_large_string/5 does
match_string(_Mark, _Parsed, [], _StartPosition, Position) ->
    {{error, Position, unexpected_end_of_file}, [], Position};
match_string(Mark, Parsed, [C | Rest], StartPosition, {Filename, Line, Column}) ->
    match_string(Mark, Parsed ++ [C], Rest, StartPosition, {Filename, Line, Column + 1}).
% Scans the hexadecimal digits of a \x or \u escape inside a string literal
% and then hands control back to match_string/5.
%
% Mark          - the quote character of the enclosing string.
% Parsed        - string contents collected so far, in source order.
% ParsedHex     - hex digits collected so far for this escape.
% Input         - remaining input, positioned after the "\x" / "\u".
% StartPosition - position of the opening quote of the enclosing string.
% Position      - {Filename, Line, Column} of the head of Input.
%
% All consecutive hex digits are consumed; the digit count is not limited.
% If no digit follows the escape an error token is returned:
% unexpected_end_of_string when the closing quote comes next,
% unexpected_end_of_file when the input ends, unexpected_char otherwise.
match_hex_string(Mark, Parsed, ParsedHex, [N | Rest], StartPosition, {Filename, Line, Column})
  when N >= $0, N =< $9; N >= $a, N =< $f; N >= $A, N =< $F ->
    match_hex_string(Mark, Parsed, ParsedHex ++ [N], Rest, StartPosition, {Filename, Line, Column + 1});
% Mark is bound by the first argument, so this clause only matches the
% closing quote. The quote is consumed, hence the column moves on by one.
match_hex_string(Mark, _Parsed, [], [Mark | Rest], _StartPosition, {Filename, Line, Column}) ->
    {{error, {Filename, Line, Column}, unexpected_end_of_string}, Rest, {Filename, Line, Column + 1}};
match_hex_string(_Mark, _Parsed, [], [C | Rest], _StartPosition, {Filename, Line, Column}) ->
    {{error, {Filename, Line, Column}, unexpected_char, C}, Rest, {Filename, Line, Column + 1}};
% input ended right after the escape introducer; without this clause
% list_to_integer([], 16) below would raise badarg
match_hex_string(_Mark, _Parsed, [], [], _StartPosition, Position) ->
    {{error, Position, unexpected_end_of_file}, [], Position};
% at least one digit was read: append the code point and continue the string
match_hex_string(Mark, Parsed, ParsedHex, Rest, StartPosition, Position) ->
    match_string(Mark, Parsed ++ [list_to_integer(ParsedHex, 16)], Rest, StartPosition, Position).
% Collects the run of spaces and tabs at the start of a line.
%
% Indent - blanks collected so far, in source order.
% Start  - position of the first blank, stored in the token.
% The last argument is the {Filename, Line, Column} of the next character.
%
% Returns {{leading_space, Start, Indent}, Rest, Position} as soon as the
% next character is neither a space nor a tab, or the input is exhausted.
match_leading_space(Indent, [Blank | Tail], Start, {File, Line, Col})
  when Blank =:= $\s; Blank =:= $\t ->
    match_leading_space(Indent ++ [Blank], Tail, Start, {File, Line, Col + 1});
match_leading_space(Indent, Tail, Start, Pos) ->
    {{leading_space, Start, Indent}, Tail, Pos}.
% Skips a line comment and returns the first token found after it.
%
% The first argument is the input following the '#'; the second is the
% current {Filename, Line, Column}. The result has the same shape as that
% of tokenize_one/2, because scanning continues there.
%
% The line feed that ends the comment is consumed here, so no new_line
% token is emitted for it.
% NOTE(review): confirm the parser relies on this for trailing comments.
match_line_comment([$\n | Rest], {Filename, Line, _Column}) ->
    tokenize_one(Rest, {Filename, Line + 1, 1});
% a comment on the last line without a trailing line feed: let
% tokenize_one/2 produce its end-of-input token instead of crashing
match_line_comment([], Position) ->
    tokenize_one([], Position);
match_line_comment([_ | Rest], {Filename, Line, Column}) ->
    match_line_comment(Rest, {Filename, Line, Column + 1}).
% Classifies an identifier: returns keyword when it is one of the reserved
% words listed below, symbol otherwise.
symbol_type(Symbol) ->
    Keywords = ["class", "def", "end", "import", "pass", "global", "from",
                "for", "in", "if", "else", "elif", "and", "or", "not", "as"],
    case lists:member(Symbol, Keywords) of
        true -> keyword;
        false -> symbol
    end.
% Collects the remaining characters of an identifier or keyword.
%
% Name  - characters collected so far, in source order.
% Start - position of the first character, stored in the token.
% The last argument is the {Filename, Line, Column} of the next character.
%
% Letters, digits and underscores extend the name. Anything else ends it,
% and the token is tagged keyword or symbol by symbol_type/1.
match_symbol(Name, [Ch | Tail], Start, {File, Line, Col})
  when Ch == $_; Ch >= $0, Ch =< $9; Ch >= $a, Ch =< $z; Ch >= $A, Ch =< $Z ->
    match_symbol(Name ++ [Ch], Tail, Start, {File, Line, Col + 1});
match_symbol(Name, Tail, Start, Pos) ->
    {{symbol_type(Name), Start, Name}, Tail, Pos}.
% Collects the remaining characters of a numeric literal.
%
% Digits - characters collected so far, in source order.
% Start  - position of the first character, stored in the token.
% The last argument is the {Filename, Line, Column} of the next character.
%
% Decimal digits extend the literal. A dot switches to match_float/4. An
% "x" directly after a lone "0" switches to match_hex/4. Anything else ends
% the literal, which is returned as {integer, Start, Digits}.
match_number(Digits, [D | Tail], Start, {File, Line, Col}) when D >= $0, D =< $9 ->
    match_number(Digits ++ [D], Tail, Start, {File, Line, Col + 1});
match_number(Digits, [$. | Tail], Start, {File, Line, Col}) ->
    match_float(Digits ++ [$.], Tail, Start, {File, Line, Col + 1});
match_number("0", [$x | Tail], Start, {File, Line, Col}) ->
    match_hex("0x", Tail, Start, {File, Line, Col + 1});
match_number(Digits, Tail, Start, Pos) ->
    {{integer, Start, Digits}, Tail, Pos}.
% Collects the fractional digits of a floating point literal.
%
% Text  - characters collected so far (up to and including the dot).
% Start - position of the first character, stored in the token.
% The last argument is the {Filename, Line, Column} of the next character.
%
% Returns {{float, Start, Text}, Rest, Position} at the first character
% that is not a decimal digit, or at the end of the input.
match_float(Text, [D | Tail], Start, {File, Line, Col}) when D >= $0, D =< $9 ->
    match_float(Text ++ [D], Tail, Start, {File, Line, Col + 1});
match_float(Text, Tail, Start, Pos) ->
    {{float, Start, Text}, Tail, Pos}.
% Collects the digits of a hexadecimal integer literal.
%
% Parsed        - characters collected so far, starting with "0x".
% StartPosition - position of the leading "0", stored in the token.
% The last argument is the {Filename, Line, Column} of the next character.
%
% Only 0-9, a-f and A-F are hex digits, the same set match_hex_string/6
% accepts. Any other character, including the letters g-z, ends the
% literal. Returns {{hex, StartPosition, Parsed}, Rest, Position}.
match_hex(Parsed, [N | Rest], StartPosition, {Filename, Line, Column})
  when N >= $0, N =< $9; N >= $a, N =< $f; N >= $A, N =< $F ->
    match_hex(Parsed ++ [N], Rest, StartPosition, {Filename, Line, Column + 1});
match_hex(Parsed, Rest, StartPosition, Position) ->
    {{hex, StartPosition, Parsed}, Rest, Position}.
% Returns the error tokens contained in a token list, in their original
% order. An error token is a tuple tagged error with either three elements
% {error, Position, Reason} or four {error, Position, Reason, Detail}.
% Every other element is dropped.
all_errors(Tokens) ->
    lists:filter(
        fun({error, _Position, _Reason}) -> true;
           ({error, _Position, _Reason, _Detail}) -> true;
           (_Other) -> false
        end,
        Tokens).