# tokenizer.py
"""
Taken from the NAYA project

https://github.com/danielyule/naya

Copyright (c) 2019 Daniel Yule
"""


class TokenType:
    """Kinds of token yielded by :func:`tokenize` (first item of each tuple)."""

    OPERATOR = 0
    STRING = 1
    NUMBER = 2
    BOOLEAN = 3
    NULL = 4


class State:
    """States of the character-level state machine driven by :func:`tokenize`."""

    WHITESPACE = 0
    INTEGER_0 = 1
    INTEGER_SIGN = 2
    INTEGER = 3
    INTEGER_EXP = 4
    INTEGER_EXP_0 = 5
    FLOATING_POINT_0 = 6
    # NOTE: 7 is unused in the original numbering; values are kept as-is.
    FLOATING_POINT = 8
    STRING = 9
    STRING_ESCAPE = 10
    STRING_END = 11
    TRUE_1 = 12
    TRUE_2 = 13
    TRUE_3 = 14
    FALSE_1 = 15
    FALSE_2 = 16
    FALSE_3 = 17
    FALSE_4 = 18
    NULL_1 = 19
    NULL_2 = 20
    NULL_3 = 21
    UNICODE_1 = 22
    UNICODE_2 = 23
    UNICODE_3 = 24
    UNICODE_4 = 25


# Single-character structural tokens.
_OPERATORS = "{}[]:,"

# Explicit ASCII digit sets: str.isdigit() / int() would also accept non-ASCII
# Unicode digits, which are not valid in JSON numbers or \u escapes.
_DIGITS = "0123456789"
_NONZERO_DIGITS = "123456789"
_HEX_VALUES = {digit: int(digit, 16) for digit in "0123456789abcdefABCDEF"}

# Character following a backslash -> character appended to the string.
_SIMPLE_ESCAPES = {
    "\\": "\\",
    "\"": "\"",
    "/": "/",
    "b": "\b",
    "f": "\f",
    "n": "\n",
    "t": "\t",
    "r": "\r",
}

# state -> (expected character, next state, token completed by it or None)
_LITERAL_STEPS = {
    State.TRUE_1: ("r", State.TRUE_2, None),
    State.TRUE_2: ("u", State.TRUE_3, None),
    State.TRUE_3: ("e", State.WHITESPACE, (TokenType.BOOLEAN, True)),
    State.FALSE_1: ("a", State.FALSE_2, None),
    State.FALSE_2: ("l", State.FALSE_3, None),
    State.FALSE_3: ("s", State.FALSE_4, None),
    State.FALSE_4: ("e", State.WHITESPACE, (TokenType.BOOLEAN, False)),
    State.NULL_1: ("u", State.NULL_2, None),
    State.NULL_2: ("l", State.NULL_3, None),
    State.NULL_3: ("l", State.WHITESPACE, (TokenType.NULL, None)),
}

# state -> (place value of the hex digit read in that state, next state)
_UNICODE_STEPS = {
    State.UNICODE_1: (4096, State.UNICODE_2),
    State.UNICODE_2: (256, State.UNICODE_3),
    State.UNICODE_3: (16, State.UNICODE_4),
    State.UNICODE_4: (1, State.STRING),
}

# States in which the machine is still inside an unclosed string literal.
_IN_STRING_STATES = (
    State.STRING,
    State.STRING_ESCAPE,
    State.UNICODE_1,
    State.UNICODE_2,
    State.UNICODE_3,
    State.UNICODE_4,
)


def tokenize(stream):
    """Lazily split JSON text read from *stream* into tokens.

    The stream is consumed one character at a time via ``stream.read(1)``,
    which must return a one-character string, or an empty string at end of
    input.

    Args:
        stream: Text stream object providing ``read(1)``.

    Yields:
        ``(token_type, value)`` tuples, where ``token_type`` is one of the
        ``TokenType`` constants and ``value`` is the operator character, the
        decoded string, an ``int``/``float``, ``True``/``False`` or ``None``.

    Raises:
        ValueError: On malformed input, including input that ends inside a
            token. The message ends with `` at index N``, the zero-based
            position of the offending character (or the input length when the
            input ended too early).
    """

    def is_delimiter(char):
        # A number or string may only be followed by whitespace or an operator.
        return char.isspace() or char in _OPERATORS

    token = []  # characters of the token currently being assembled
    charcode = 0  # code point accumulated while reading a \uXXXX escape
    completed = False  # set by process_char when now_token is ready
    now_token = ""

    def process_char(char, charcode):
        """Feed one character to the state machine.

        Returns ``(advance, next_state, charcode)``. ``advance`` is False when
        the character terminated a token and must be processed again in the
        new state.
        """
        nonlocal completed, now_token
        advance = True
        add_char = False
        next_state = state

        if state == State.WHITESPACE:
            if char in _OPERATORS:
                completed = True
                now_token = (TokenType.OPERATOR, char)
            elif char == "\"":
                next_state = State.STRING
            elif char in _NONZERO_DIGITS:
                next_state = State.INTEGER
                add_char = True
            elif char == "0":
                next_state = State.INTEGER_0
                add_char = True
            elif char == "-":
                next_state = State.INTEGER_SIGN
                add_char = True
            elif char == "f":
                next_state = State.FALSE_1
            elif char == "t":
                next_state = State.TRUE_1
            elif char == "n":
                next_state = State.NULL_1
            elif not char.isspace():
                raise ValueError("Invalid JSON character: '{0}'".format(char))

        elif state == State.INTEGER:
            if char in _DIGITS:
                add_char = True
            elif char == ".":
                next_state = State.FLOATING_POINT_0
                add_char = True
            elif char == "e" or char == "E":
                next_state = State.INTEGER_EXP_0
                add_char = True
            elif is_delimiter(char):
                next_state = State.WHITESPACE
                completed = True
                now_token = (TokenType.NUMBER, int("".join(token)))
                advance = False
            else:
                raise ValueError("A number must contain only digits.  Got '{}'".format(char))

        elif state == State.INTEGER_0:
            if char == ".":
                next_state = State.FLOATING_POINT_0
                add_char = True
            elif char == "e" or char == "E":
                next_state = State.INTEGER_EXP_0
                add_char = True
            elif is_delimiter(char):
                next_state = State.WHITESPACE
                completed = True
                now_token = (TokenType.NUMBER, 0)
                advance = False
            else:
                raise ValueError("A 0 must be followed by a '.' or a 'e'.  Got '{0}'".format(char))

        elif state == State.INTEGER_SIGN:
            if char == "0":
                next_state = State.INTEGER_0
                add_char = True
            elif char in _NONZERO_DIGITS:
                next_state = State.INTEGER
                add_char = True
            else:
                raise ValueError("A - must be followed by a digit.  Got '{0}'".format(char))

        elif state == State.INTEGER_EXP_0:
            if char == "+" or char == "-" or char in _DIGITS:
                next_state = State.INTEGER_EXP
                add_char = True
            else:
                raise ValueError(
                    "An e in a number must be followed by a '+', '-' or digit.  Got '{0}'".format(char)
                )

        elif state == State.INTEGER_EXP:
            if char in _DIGITS:
                add_char = True
            elif is_delimiter(char):
                completed = True
                # float() raises ValueError for a sign with no digits ("1e+").
                now_token = (TokenType.NUMBER, float("".join(token)))
                next_state = State.WHITESPACE
                advance = False
            else:
                raise ValueError("A number exponent must consist only of digits.  Got '{}'".format(char))

        elif state == State.FLOATING_POINT:
            if char in _DIGITS:
                add_char = True
            elif char == "e" or char == "E":
                next_state = State.INTEGER_EXP_0
                add_char = True
            elif is_delimiter(char):
                completed = True
                now_token = (TokenType.NUMBER, float("".join(token)))
                next_state = State.WHITESPACE
                advance = False
            else:
                raise ValueError("A number must include only digits")

        elif state == State.FLOATING_POINT_0:
            if char in _DIGITS:
                next_state = State.FLOATING_POINT
                add_char = True
            else:
                raise ValueError("A number with a decimal point must be followed by a fractional part")

        elif state in _LITERAL_STEPS:
            # Remaining letters of true / false / null.
            expected, following_state, finished = _LITERAL_STEPS[state]
            if char != expected:
                raise ValueError("Invalid JSON character: '{0}'".format(char))
            next_state = following_state
            if finished is not None:
                completed = True
                now_token = finished

        elif state == State.STRING:
            if char == "\"":
                completed = True
                now_token = (TokenType.STRING, "".join(token))
                next_state = State.STRING_END
            elif char == "\\":
                next_state = State.STRING_ESCAPE
            else:
                add_char = True

        elif state == State.STRING_END:
            if is_delimiter(char):
                advance = False
                next_state = State.WHITESPACE
            else:
                raise ValueError("Expected whitespace or an operator after string.  Got '{}'".format(char))

        elif state == State.STRING_ESCAPE:
            next_state = State.STRING
            if char in _SIMPLE_ESCAPES:
                char = _SIMPLE_ESCAPES[char]
                add_char = True
            elif char == "u":
                next_state = State.UNICODE_1
                charcode = 0
            else:
                raise ValueError("Invalid string escape: {}".format(char))

        elif state in _UNICODE_STEPS:
            place_value, following_state = _UNICODE_STEPS[state]
            digit = _HEX_VALUES.get(char)
            if digit is None:
                raise ValueError("Invalid character code: {}".format(char))
            charcode += digit * place_value
            next_state = following_state
            if next_state == State.STRING:
                # Fourth hex digit read: the escape is complete.
                if (
                    0xDC00 <= charcode <= 0xDFFF
                    and token
                    and 0xD800 <= ord(token[-1]) <= 0xDBFF
                ):
                    # A low surrogate directly after a high surrogate encodes
                    # one code point above U+FFFF; merge the pair.
                    high = ord(token.pop())
                    charcode = 0x10000 + ((high - 0xD800) << 10) + (charcode - 0xDC00)
                char = chr(charcode)
                add_char = True

        if add_char:
            token.append(char)

        return advance, next_state, charcode

    state = State.WHITESPACE
    char = stream.read(1)
    index = 0
    while char:
        try:
            advance, state, charcode = process_char(char, charcode)
        except ValueError as e:
            raise ValueError("{} at index {}".format(e.args[0], index)) from e
        if completed:
            completed = False
            token = []
            yield now_token
        if advance:
            char = stream.read(1)
            index += 1

    # End of input. A string that was never closed must not be silently
    # dropped, so report it like every other truncated token.
    if state in _IN_STRING_STATES:
        raise ValueError("Unterminated string at index {}".format(index))

    # Feed a synthetic delimiter so a trailing number is flushed, and so a
    # truncated literal or number is reported.
    try:
        process_char(" ", charcode)
    except ValueError as e:
        raise ValueError("{} at index {}".format(e.args[0], index)) from e
    if completed:
        yield now_token