-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathEBNF.parse.txt
256 lines (220 loc) · 9.56 KB
/
EBNF.parse.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
EBNF for EBNF ignoring white space and comments (I was planning on figuring that out later)
SYNTAX = SYNTAX RULE, (: SYNTAX RULE :).
SYNTAX RULE = META IDENTIFIER, '=', DEFINITIONS LIST, '.' .
DEFINITIONS LIST = SINGLE DEFINITION, (: '/', SINGLE DEFINITION :).
SINGLE DEFINITION = TERM, (: ',', TERM :).
TERM = FACTOR, (/ '-', EXCEPTION /) .
EXCEPTION = FACTOR .
FACTOR = (/ INTEGER, '*' /), PRIMARY.
PRIMARY = OPTIONAL SEQUENCE / REPEATED SEQUENCE
/ SPECIAL SEQUENCE / GROUPED SEQUENCE
/ META IDENTIFIER / TERMINAL / EMPTY.
EMPTY = .
OPTIONAL SEQUENCE = '(/', DEFINITIONS LIST, '/)' .
REPEATED SEQUENCE = '(:', DEFINITIONS LIST, ':)' .
GROUPED SEQUENCE = '(', DEFINITIONS LIST, ')'.
TERMINAL = "'", CHARACTER - "'", (: CHARACTER - "'" :), "'"
/ '"', CHARACTER - '"', (: CHARACTER - '"' :), '"'.
META IDENTIFIER = LETTER, (: LETTER / DIGIT :).
INTEGER = DIGIT, (: DIGIT :).
SPECIAL SEQUENCE = '?', (: CHARACTER - '?' :),'?'.
Numbering of the primary states in the state machine:
(I'm not sure if ending should really have a state, maybe they should all just have the same state since the state they actually will transition too will from from the state stack)
SYNTAX = 1 SYNTAX RULE, 2 (: SYNTAX RULE :).
SYNTAX RULE = 3 META IDENTIFIER, 4 '=', 5 DEFINITIONS LIST, 6 '.' 7.
DEFINITIONS LIST = 8 SINGLE DEFINITION, 9(: '/', 10 SINGLE DEFINITION :).
SINGLE DEFINITION = 11 TERM, 12 (: ',',13 TERM :).
TERM = 14 FACTOR, 15 (/ '-', 16 EXCEPTION /) . 17
EXCEPTION = 18 FACTOR 19.
FACTOR = 20 (/ INTEGER, 21 '*' /), 22 PRIMARY. 23
PRIMARY = 24 OPTIONAL SEQUENCE / REPEATED SEQUENCE
/ SPECIAL SEQUENCE / GROUPED SEQUENCE
/ META IDENTIFIER / TERMINAL / EMPTY. 25
EMPTY = 26.
OPTIONAL SEQUENCE = 27'(', 28 '/', 29 DEFINITIONS LIST, 30 '/', 31 ')' 32.
REPEATED SEQUENCE = 33'(',34':', 35 DEFINITIONS LIST, 36 ':', 37 ')' 38.
GROUPED SEQUENCE = 39'(', 40 DEFINITIONS LIST, 41 ')'. 42
TERMINAL = 43 "'", 44 CHARACTER - "'", 45 (: CHARACTER - "'" :), "'" 46
/ 43 '"', 47 CHARACTER - '"', 48 (: CHARACTER - '"' :), '"'.46
META IDENTIFIER = 50 LETTER, 51 (: LETTER / DIGIT :).
INTEGER = 52 DIGIT, 53 (: DIGIT :).
SPECIAL SEQUENCE = 54 '?', 55 (: CHARACTER - '?' :),'?'. 56
LETTER DIGIT and CHARACTER should be defined as well. However, EBNF speficies seferal built in IDENTIFIERS and I think they should be included and for the sake of manually parsing this EBNF I don't think it's critical. (Especially because CHARACTER is encoding dependant and it would matter if we're parsing by byte of character then which is not something I really want to deal with yet)
Beginnings of me manually generating the state transition table.
Notation:
StateTransitionList = (: StateTranstion, NewLine :)
(* the state is the current state, then followed by all possible transitions *)
StateTranstion = State, CharacterList, Transition, (: " |", CharacterList, transition)
(* each character list is the list of characters / state states that could be valid to allow the transition that follows it *)
CharacterList = ( CharacterListTerm / "[",CharacterListOptions,"]" ), (: StateP :)
CharacterListOptions = CharacterList, " | ", CharacterList , (: " | ", CharacterList :)
CharacterListTerm = CHAR, (: CHAR :)
(* L is for LETTER, D for DIGIT, ~ for a long character list that is context dependant that I can only use because I'm processing this manually, Symbol any of the characters that appear in the terminal strings in the EBNF *)
CHAR = "L" / "D" / "~" / Symbol
StateP = State | "*"
(* the transitions are change state, consume character and change state, push a state on the stack and change state, and pop to a state off the stack.
Transition = (" -> ", State)|(" => ", State)|(" -v ", State)|(" -^ ", State)
1 L -v 3 2
2 L -v 3 2 | E 0 -^ 0
3 L -v 50 4
4 = => 5
5 .,/-D(?'"L -v 8 6
6 . => 7
7 [L 1 | LE 2] -^ 2
8 [. 5 | / 29 | : 35 | ) 40 | ,/-D(?'"L] -v 11 9
9 / => 10 | . 5 -^ 6 | / 29 -^ 30 | : 35 -^ 36 | ) 40 -^ 41
10 [. 5 | / 29 | : 35 | ) 40 | ,/-D(?'"L] -v 11 9
11 [[. 5 | / 29 | : 35 | ) 40]* | ,/-D(?'"L] -v 14 12
12 , => 13 | [/ | . 5 | / 29 | : 35 | ) 40]* -^ 9
13 [[. 5 | / 29 | : 35 | ) 40]* | ,/-D(?'"L] -v 14 12
14 [[[. 5 | / 29 | : 35 | ) 40]* | ,]* | /-D(?'"L] -v 20 15
15 - => 16 | [[. 5 | / 29 | : 35 | ) 40]** | ,/] > 17
16 [[[/ | . 5 | / 29 | : 35 | ) 40]* | ,]* | D(?'"L]-v 18 17
17 [[/ | . 5 | / 29 | : 35 | ) 40]* | ,]* -^ 12
18 [[[/ | . 5 | / 29 | : 35 | ) 40]* | ,]** | D(?'"L]-v 20 19
19 [[/ | . 5 | / 29 | : 35 | ) 40]* | ,]** -^ 17
20 D -v 52 21 | [[[[/ | . 5 | / 29 | : 35 | ) 40]* | ,]* | -]14 | [[/ | . 5 | / 29 | : 35 | ) 40]* | ,]**18 | (?'"L] > 22
21 * => 22
22 [[[[/ | . 5 | / 29 | : 35 | ) 40]* | ,]* | -]14 | [[/ | . 5 | / 29 | : 35 | ) 40]* | ,]**18 | (?'"L]-v 24 23
23 [[[/ | . 5 | / 29 | : 35 | ) 40]* | ,]* | -]14 -^ 15 | [[/ | . 5 | / 29 | : 35 | ) 40]* | ,]** 18 -^ 19
24 ( -v 27 25 | ( -v 33 25 | ? -v 54 25 | ( -v 39 25 | L -v 50 25 | '" -v 43 25 | [[[[/ | . 5 | / 29 | : 35 | ) 40]* | ,]* | -]14 | [[/ | . 5 | / 29 | : 35 | ) 40]* | ,]**18]* -v 26 25
25 [[[[/ | . 5 | / 29 | : 35 | ) 40]* | ,]* | -]14 | [[/ | . 5 | / 29 | : 35 | ) 40]* | ,]**18]* -^ 23
26 [[[[/ | . 5 | / 29 | : 35 | ) 40]* | ,]* | -]14 | [[/ | . 5 | / 29 | : 35 | ) 40]* | ,]**18]** -^ 25
27 ( => 28
28 / => 29
29 ,/-D(?'"L] -v 8 30
30 / => 31
31 ) => 32
32 [[[[/ | . 5 | / 29 | : 35 | ) 40]* | ,]* | -]14 | [[/ | . 5 | / 29 | : 35 | ) 40]* | ,]**18]** -^ 25
33 ( => 34
34 : => 35
35 :,/-D(?'"L -v 8 36
36 : => 37
37 ) => 38
38 [[[[/ | . 5 | / 29 | : 35 | ) 40]* | ,]* | -]14 | [[/ | . 5 | / 29 | : 35 | ) 40]* | ,]**18]** -^ 25
39 ( => 40
40 ),/-D(?'"L -v 8 41
41 ) => 42
42 [[[[/ | . 5 | / 29 | : 35 | ) 40]* | ,]* | -]14 | [[/ | . 5 | / 29 | : 35 | ) 40]* | ,]**18]** -^ 25
43 ' => 44 | " => 47
44 ~ => 45
45 ~ => 45 | ' => 46
46 [[[[/ | . 5 | / 29 | : 35 | ) 40]* | ,]* | -]14 | [[/ | . 5 | / 29 | : 35 | ) 40]* | ,]**18]** -^ 25
47 ~ => 48
48 ~ => 48 | " => 46
50 L => 51
51 LD => 51 | = 3 -^ 4 | [[[[/ | . 5 | / 29 | : 35 | ) 40]* | ,]* | -]14 | [[/ | . 5 | / 29 | : 35 | ) 40]* | ,]**18]*24 -^ 25
52 D => 53
53 D => 53 | * * -^ 21
54 ? => 55
55 ~ => 55 | ? => 56
56 [[[[/ | . 5 | / 29 | : 35 | ) 40]* | ,]* | -]14 | [[/ | . 5 | / 29 | : 35 | ) 40]* | ,]**18]** -^ 25
Beginnings of my analysis of the ambiguity:
States 24 and 9 have ambiguous state transitions as they have multiple transitions that have overlapping character lists
So the idea is to create a new state that represents the superposition of the states that they would transition to
Then when there is a differentiating character go back and fix the output tree to match the correct state transitions without having to reparse all of the intermediate text.
24 ( -V=> 100 r1 100=28|34|40
100 /=> 29 | :=> 35 | ),/-D(?'"L -v 8
100 : =v 33 ( => 34 : => 35 | ),-D(?'"L =v 39 ( => 40 -v 8 | / => 101
101= -v 27 (=> 28 / => 29 | -v 39 (=> 40 -v 8 -v 11 -v 14 -v 20 > 22 -v 24 -v 26 -^ 25 -^ 23 -^15 > 17 -^ 12 -^ 9 / => 10
101 ) -v 11 | ,/-D(?'"L] => 102
102 -v 27 (=> 28 / => 29 -v 8 -v 11 | -v 39 (=> 40 -v 8 -v 11 -v 14 -v 20 > 22 -v 24 -v 26 -^ 25 -^ 23 -^15 > 17 -^ 12 -^ 9 / => 10 -v 11
r1
Sudo Code for the parser:
struct syntaxNode
{
state state;
syntaxNode next;
syntaxNode parent;
syntaxNode firstChild;
int textStart;
int textEnd;
}
struct stateTransition{
state to;
state returnTo;
reformer r;
bool consume;
bool recordStart;
bool recordEnd;
}
struct reformer{
int upBeforeChange;
int backtrack;
reformStep first;
}
struct reformStep{
state changeTo;
bool stepDownAfter;
bool consume;
bool stepUpAfter;
reformer next;
}
//magically populate states and chars
stateTransition[states][chars] stm;
//magically populate stm
int end=str.length;
syntaxNode currentSN=new syntaxNode;
pos=-1;
stateTransition itit= new stateTransition{
to:1,
returnTo:Null,
reformer:Null,
consume:true,
recordStart:true,
recordEnd:false
}
stateTransition st= itit;
while(st && st.to!=DONE){
if(st.consume){ //update now so pos will be acurately reflected
pos++;
if(pos>=end){
ch=EOF;
}else{
ch=str[pos];
}
}
if(st.returnTo){ //decend into the next Identifier.
returnStack.push(st.returnTo);
s=st.to;
if(startRecord(s)){ // we don't actually want to record every state in the state tree but this part isn't fully thought through and needs to be revised
currentSN.state=s;
currentSN.textStart=pos;
currentSN.firstChild=new syntaxNode;
currentSN.firstChild.parent=currentSN;
currentSN=currentSN.firstChild;
}
}else{
if(st.to){
s=st.to;
if(endRecord(s)){ //dito
currentSN.textEnd=pos;
currentSN.next=new syntaxNode;
currentSN.next.parent=currentSN.parent;
currentSN=currentSN.next;
currentSN.state=s;
currentSN.textStart=pos;
}
}else{
s=returnStack.pop();
currentSN.next=NULL;
currentSN=currentSN.parent;
currentSN.next=new syntaxNode;
currentSN.next.parent=currentSN.parent;
currentSN=currentSN.next;
currentSN.state=s=returnStack.pop();
currentSN.textStart=pos;
}
}
//stuff to deal with fixing the abiguities. really not done at all
if(st.reformer){
}
while(r){
i=r.upBeforeChange;
while(i--){
currentSN=currentSN.parent;
}
r=r.next;
}
st=stm[s][ch];
}