"""
Lexical structure of the calculator input languages.
Programming languges and related formal notations are
divided into at least three levels of descripton:
lexical, syntactic, and semantic. The semantic
structure may be further divided into static and
dynamic semantics.
The lexical structure of a programming language is the way
an input text is divided into a individual "tokens" or
"lexemes" like identifiers, operator symbols, and
numeric and string literals. White space and comments
are not lexemes. For example, if the input is
"(3 * 5)/x", the lexemes should be
["(", "3", "*", "5", ")", "/", "x" ]
Beware 5-3 can read as '5' followed by '-3'; write it
as 5 - 3.
"""
import io
# Any is needed for the Token value annotation below
from typing import Any, Sequence
# We use regular expressions (re) for the patterns that
# match lexemes
import re
# An Enum is a special kind of class used to enumerate
# a finite set of values.
from enum import Enum
import logging
logging.basicConfig()
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
# To the extent possible, we would like to describe
# the lexical structure in tables that are easily
# edited, rather than in the details of procedural code.
# We will make a pattern for each distinct token. These
# are 'raw' strings to make it easier to 'escape' some
# characters that are special in regular expressions.
#
# Conventions:
# UPPER = token
# error = error (name is exactly "error")
# ignore = comment or whitespace (name is exactly "ignore")
#
# Note these patterns are for both llcalc.py and rpncalc.py;
# some of the tokens are used only by one or the other.
#
# Order matters: e.g., INT must precede MINUS so that -5
# is read as one negative integer, not MINUS followed by INT
# (see the doctest-style sketch just after the enum).
# error should be the last pattern.
#
class TokenCat(Enum):
    ignore = r"\s+|#.*"     # Whitespace and comments
    INT = r"\-?[0-9]+"
    PLUS = r"\+"
    MINUS = r"-"
    TIMES = r"\*"
    DIV = r"/"
    NEG = r"~"
    ABS = r"@"
    ASSIGN = r"="
    IF = r"if"
    VAR = r"[a-zA-Z_][a-zA-Z_]*"
    LPAREN = r"\("
    RPAREN = r"\)"
    error = "."             # catch-all for errors
    END = "###SHOULD NOT MATCH###"  # Not really a pattern
def all_token_re() -> str:
    r"""Create a regular expression that matches ALL of the tokens in TokenCat.
    The pattern will look like
        "(?:\+)|(?:\*)|...|(?:[0-9]+)"
    i.e., each token pattern P is enclosed in the non-capturing
    group (?:P) and all the groups are combined as alternatives
    with | .
    """
    return "|".join([f"(?:{cat.value})" for cat in TokenCat])

TOKENS_PAT = re.compile(all_token_re())
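
# How the combined pattern behaves on raw input (a sketch): findall
# returns every match, including whitespace matched by 'ignore';
# lex() below filters those out.
#
#   >>> TOKENS_PAT.findall("(3 * 5)/x")
#   ['(', '3', ' ', '*', ' ', '5', ')', '/', 'x']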
class LexicalError(Exception):
    """Raised when we can't extract tokens from the input"""
    pass

class Token(object):
    """One token from the input stream"""

    def __init__(self, value: Any, kind: TokenCat):
        self.value = value
        self.kind = kind

    def __repr__(self) -> str:
        return f"Token('{self.value}', {self.kind})"

    def __str__(self) -> str:
        return repr(self)

END = Token("End of Input", TokenCat.END)
class TokenStream(object):
    """
    Provides the tokens within a stream one-by-one.
    Example usage:
        f = open("my_input_file")
        stream = TokenStream(f)
        while stream.has_more():
            token = stream.take()      # Removes token from front of stream
            lookahead = stream.peek()  # Returns token without removing it
            # Do something with the token
    """

    def __init__(self, f: io.IOBase):
        self.file = f
        self.tokens = []
        self._check_fill()
        log.debug("Tokens: {}".format(self.tokens))

    def __str__(self) -> str:
        # self.tokens holds Token objects, so convert each to str
        # before joining
        return "[{}]".format("|".join(str(tok) for tok in self.tokens))

    def _check_fill(self):
        while len(self.tokens) == 0:
            # We could read more than one line before hitting
            # a token, but the loop will be broken if we
            # hit end of file
            line = self.file.readline()
            if len(line) == 0:
                # End of file, leave zero tokens in buffer
                break
            self.tokens = lex(line.strip())
            log.debug("Refilled, tokens: {}".format(self.tokens))
            # Note this might also leave zero tokens in buffer,
            # but in that case the outer while loop will attempt
            # to refill it until we either get some tokens
            # or hit end of file

    def has_more(self) -> bool:
        """True if there are more tokens in the stream"""
        self._check_fill()
        return len(self.tokens) > 0

    def peek(self) -> Token:
        """Examine next token without consuming it."""
        self._check_fill()
        if len(self.tokens) > 0:
            token = self.tokens[0]
        else:
            token = END
        return token

    def take(self) -> Token:
        """Consume next token"""
        self._check_fill()
        if len(self.tokens) > 0:
            token = self.tokens.pop(0)
        else:
            token = END
        return token
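
# Sketch: any file-like object works, so an in-memory io.StringIO is
# convenient for quick checks of peek() (non-consuming) vs take():
#
#   >>> stream = TokenStream(io.StringIO("x = 3"))
#   >>> stream.peek().value
#   'x'
#   >>> stream.take().value
#   'x'
#   >>> stream.take().value
#   '='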
def lex(s: str) -> Sequence[Token]:
    """Break string into a list of Token objects"""
    words = TOKENS_PAT.findall(s)
    tokens = []
    for word in words:
        token = classify(word)
        if token.kind == TokenCat.ignore:
            log.debug(f"Skipping {token}")
            continue
        tokens.append(token)
    return tokens
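
# Sketch: 'ignore' matches (whitespace and comments) are found by
# findall but dropped here, so a commented line lexes cleanly:
#
#   >>> [t.value for t in lex("x = 3  # set x")]
#   ['x', '=', '3']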
def classify(word: str) -> Token:
    """Convert a textual token into a Token object
    with a value and category.
    """
    log.debug(f"Classifying token '{word}'")
    for kind in TokenCat:
        log.debug(f"Checking '{word}' for token class '{kind}'")
        pattern = kind.value
        if re.fullmatch(pattern, word):
            log.debug(f"Classified as {kind}")
            if kind.name == "error":
                raise LexicalError(f"Unrecognized character '{word}'")
            return Token(word, kind)
    raise LexicalError(f"Unrecognized token '{word}'")
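
# Doctest-style sketch of classify(), including the error path:
#
#   >>> classify("if").kind
#   <TokenCat.IF: 'if'>
#   >>> classify("$")
#   Traceback (most recent call last):
#   ...
#   LexicalError: Unrecognized character '$'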
###
if __name__ == "__main__":
    # Simple smoke test
    text = io.StringIO("3*(4 + 24)+12")
    tokens = TokenStream(text)
    while tokens.has_more():
        print(f"Token: {tokens.take()}")
    input("Press enter to continue")