/ src / json_stream / tokenizer.py
tokenizer.py
  1  """
  2  Taken from the NAYA project
  3  
  4  https://github.com/danielyule/naya
  5  
  6  Copyright (c) 2019 Daniel Yule
  7  """
  8  
  9  
 10  class TokenType:
 11      OPERATOR = 0
 12      STRING = 1
 13      NUMBER = 2
 14      BOOLEAN = 3
 15      NULL = 4
 16  
 17  
 18  class State:
 19      WHITESPACE = 0
 20      INTEGER_0 = 1
 21      INTEGER_SIGN = 2
 22      INTEGER = 3
 23      INTEGER_EXP = 4
 24      INTEGER_EXP_0 = 5
 25      FLOATING_POINT_0 = 6
 26      FLOATING_POINT = 8
 27      STRING = 9
 28      STRING_ESCAPE = 10
 29      STRING_END = 11
 30      TRUE_1 = 12
 31      TRUE_2 = 13
 32      TRUE_3 = 14
 33      FALSE_1 = 15
 34      FALSE_2 = 16
 35      FALSE_3 = 17
 36      FALSE_4 = 18
 37      NULL_1 = 19
 38      NULL_2 = 20
 39      NULL_3 = 21
 40      UNICODE_1 = 22
 41      UNICODE_2 = 23
 42      UNICODE_3 = 24
 43      UNICODE_4 = 25
 44  
 45  
 46  def tokenize(stream):
 47      def is_delimiter(char):
 48          return char.isspace() or char in "{}[]:,"
 49  
 50      token = []
 51      charcode = 0
 52      completed = False
 53      now_token = ""
 54  
 55      def process_char(char, charcode):
 56          nonlocal token, completed, now_token
 57          advance = True
 58          add_char = False
 59          next_state = state
 60          if state == State.WHITESPACE:
 61              if char == "{":
 62                  completed = True
 63                  now_token = (TokenType.OPERATOR, "{")
 64              elif char == "}":
 65                  completed = True
 66                  now_token = (TokenType.OPERATOR, "}")
 67              elif char == "[":
 68                  completed = True
 69                  now_token = (TokenType.OPERATOR, "[")
 70              elif char == "]":
 71                  completed = True
 72                  now_token = (TokenType.OPERATOR, "]")
 73              elif char == ",":
 74                  completed = True
 75                  now_token = (TokenType.OPERATOR, ",")
 76              elif char == ":":
 77                  completed = True
 78                  now_token = (TokenType.OPERATOR, ":")
 79              elif char == "\"":
 80                  next_state = State.STRING
 81              elif char in "123456789":
 82                  next_state = State.INTEGER
 83                  add_char = True
 84              elif char == "0":
 85                  next_state = State.INTEGER_0
 86                  add_char = True
 87              elif char == "-":
 88                  next_state = State.INTEGER_SIGN
 89                  add_char = True
 90              elif char == "f":
 91                  next_state = State.FALSE_1
 92              elif char == "t":
 93                  next_state = State.TRUE_1
 94              elif char == "n":
 95                  next_state = State.NULL_1
 96              elif not char.isspace():
 97                  raise ValueError("Invalid JSON character: '{0}'".format(char))
 98          elif state == State.INTEGER:
 99              if char in "0123456789":
100                  add_char = True
101              elif char == ".":
102                  next_state = State.FLOATING_POINT_0
103                  add_char = True
104              elif char == "e" or char == 'E':
105                  next_state = State.INTEGER_EXP_0
106                  add_char = True
107              elif is_delimiter(char):
108                  next_state = State.WHITESPACE
109                  completed = True
110                  now_token = (TokenType.NUMBER, int("".join(token)))
111                  advance = False
112              else:
113                  raise ValueError("A number must contain only digits.  Got '{}'".format(char))
114          elif state == State.INTEGER_0:
115              if char == ".":
116                  next_state = State.FLOATING_POINT_0
117                  add_char = True
118              elif char == "e" or char == 'E':
119                  next_state = State.INTEGER_EXP_0
120                  add_char = True
121              elif is_delimiter(char):
122                  next_state = State.WHITESPACE
123                  completed = True
124                  now_token = (TokenType.NUMBER, 0)
125                  advance = False
126              else:
127                  raise ValueError("A 0 must be followed by a '.' or a 'e'.  Got '{0}'".format(char))
128          elif state == State.INTEGER_SIGN:
129              if char == "0":
130                  next_state = State.INTEGER_0
131                  add_char = True
132              elif char in "123456789":
133                  next_state = State.INTEGER
134                  add_char = True
135              else:
136                  raise ValueError("A - must be followed by a digit.  Got '{0}'".format(char))
137          elif state == State.INTEGER_EXP_0:
138              if char == "+" or char == "-" or char in "0123456789":
139                  next_state = State.INTEGER_EXP
140                  add_char = True
141              else:
142                  raise ValueError("An e in a number must be followed by a '+', '-' or digit.  Got '{0}'".format(char))
143          elif state == State.INTEGER_EXP:
144              if char in "0123456789":
145                  add_char = True
146              elif is_delimiter(char):
147                  completed = True
148                  now_token = (TokenType.NUMBER, float("".join(token)))
149                  next_state = State.WHITESPACE
150                  advance = False
151              else:
152                  raise ValueError("A number exponent must consist only of digits.  Got '{}'".format(char))
153          elif state == State.FLOATING_POINT:
154              if char in "0123456789":
155                  add_char = True
156              elif char == "e" or char == "E":
157                  next_state = State.INTEGER_EXP_0
158                  add_char = True
159              elif is_delimiter(char):
160                  completed = True
161                  now_token = (TokenType.NUMBER, float("".join(token)))
162                  next_state = State.WHITESPACE
163                  advance = False
164              else:
165                  raise ValueError("A number must include only digits")
166          elif state == State.FLOATING_POINT_0:
167              if char in "0123456789":
168                  next_state = State.FLOATING_POINT
169                  add_char = True
170              else:
171                  raise ValueError("A number with a decimal point must be followed by a fractional part")
172          elif state == State.FALSE_1:
173              if char == "a":
174                  next_state = State.FALSE_2
175              else:
176                  raise ValueError("Invalid JSON character: '{0}'".format(char))
177          elif state == State.FALSE_2:
178              if char == "l":
179                  next_state = State.FALSE_3
180              else:
181                  raise ValueError("Invalid JSON character: '{0}'".format(char))
182          elif state == State.FALSE_3:
183              if char == "s":
184                  next_state = State.FALSE_4
185              else:
186                  raise ValueError("Invalid JSON character: '{0}'".format(char))
187          elif state == State.FALSE_4:
188              if char == "e":
189                  next_state = State.WHITESPACE
190                  completed = True
191                  now_token = (TokenType.BOOLEAN, False)
192              else:
193                  raise ValueError("Invalid JSON character: '{0}'".format(char))
194          elif state == State.TRUE_1:
195              if char == "r":
196                  next_state = State.TRUE_2
197              else:
198                  raise ValueError("Invalid JSON character: '{0}'".format(char))
199          elif state == State.TRUE_2:
200              if char == "u":
201                  next_state = State.TRUE_3
202              else:
203                  raise ValueError("Invalid JSON character: '{0}'".format(char))
204          elif state == State.TRUE_3:
205              if char == "e":
206                  next_state = State.WHITESPACE
207                  completed = True
208                  now_token = (TokenType.BOOLEAN, True)
209              else:
210                  raise ValueError("Invalid JSON character: '{0}'".format(char))
211          elif state == State.NULL_1:
212              if char == "u":
213                  next_state = State.NULL_2
214              else:
215                  raise ValueError("Invalid JSON character: '{0}'".format(char))
216          elif state == State.NULL_2:
217              if char == "l":
218                  next_state = State.NULL_3
219              else:
220                  raise ValueError("Invalid JSON character: '{0}'".format(char))
221          elif state == State.NULL_3:
222              if char == "l":
223                  next_state = State.WHITESPACE
224                  completed = True
225                  now_token = (TokenType.NULL, None)
226              else:
227                  raise ValueError("Invalid JSON character: '{0}'".format(char))
228          elif state == State.STRING:
229              if char == "\"":
230                  completed = True
231                  now_token = (TokenType.STRING, "".join(token))
232                  next_state = State.STRING_END
233              elif char == "\\":
234                  next_state = State.STRING_ESCAPE
235              else:
236                  add_char = True
237          elif state == State.STRING_END:
238              if is_delimiter(char):
239                  advance = False
240                  next_state = State.WHITESPACE
241              else:
242                  raise ValueError("Expected whitespace or an operator after strin.  Got '{}'".format(char))
243          elif state == State.STRING_ESCAPE:
244              next_state = State.STRING
245              if char == "\\" or char == "\"":
246                  add_char = True
247              elif char == "b":
248                  char = "\b"
249                  add_char = True
250              elif char == "f":
251                  char = "\f"
252                  add_char = True
253              elif char == "n":
254                  char = "\n"
255                  add_char = True
256              elif char == "t":
257                  char = "\t"
258                  add_char = True
259              elif char == "r":
260                  char = "\r"
261                  add_char = True
262              elif char == "/":
263                  char = "/"
264                  add_char = True
265              elif char == "u":
266                  next_state = State.UNICODE_1
267                  charcode = 0
268              else:
269                  raise ValueError("Invalid string escape: {}".format(char))
270          elif state == State.UNICODE_1:
271              if char in "0123456789":
272                  charcode = (ord(char) - 48) * 4096
273              elif char in "abcdef":
274                  charcode = (ord(char) - 87) * 4096
275              elif char in "ABCDEF":
276                  charcode = (ord(char) - 55) * 4096
277              else:
278                  raise ValueError("Invalid character code: {}".format(char))
279              next_state = State.UNICODE_2
280              char = ""
281          elif state == State.UNICODE_2:
282              if char in "0123456789":
283                  charcode += (ord(char) - 48) * 256
284              elif char in "abcdef":
285                  charcode += (ord(char) - 87) * 256
286              elif char in "ABCDEF":
287                  charcode += (ord(char) - 55) * 256
288              else:
289                  raise ValueError("Invalid character code: {}".format(char))
290              next_state = State.UNICODE_3
291              char = ""
292          elif state == State.UNICODE_3:
293              if char in "0123456789":
294                  charcode += (ord(char) - 48) * 16
295              elif char in "abcdef":
296                  charcode += (ord(char) - 87) * 16
297              elif char in "ABCDEF":
298                  charcode += (ord(char) - 55) * 16
299              else:
300                  raise ValueError("Invalid character code: {}".format(char))
301              next_state = State.UNICODE_4
302              char = ""
303          elif state == State.UNICODE_4:
304              if char in "0123456789":
305                  charcode += ord(char) - 48
306              elif char in "abcdef":
307                  charcode += ord(char) - 87
308              elif char in "ABCDEF":
309                  charcode += ord(char) - 55
310              else:
311                  raise ValueError("Invalid character code: {}".format(char))
312              next_state = State.STRING
313              char = chr(charcode)
314              add_char = True
315  
316          if add_char:
317              token.append(char)
318  
319          return advance, next_state, charcode
320      state = State.WHITESPACE
321      char = stream.read(1)
322      index = 0
323      while char:
324          try:
325              advance, state, charcode = process_char(char, charcode)
326          except ValueError as e:
327              raise ValueError("".join([e.args[0], " at index {}".format(index)]))
328          if completed:
329              completed = False
330              token = []
331              yield now_token
332          if advance:
333              char = stream.read(1)
334              index += 1
335      process_char(" ", charcode)
336      if completed:
337          yield now_token