# src/python/txtai/database/sql/token.py
  1  """
  2  Token module
  3  """
  4  
  5  
  6  class Token:
  7      """
  8      Methods to check for token type.
  9      """
 10  
 11      # Similar token replacement
 12      SIMILAR_TOKEN = "__SIMILAR__"
 13  
 14      # Default distinct token
 15      DISTINCT = ["distinct"]
 16  
 17      # Default alias token
 18      ALIAS = ["as"]
 19  
 20      # Default list of comparison operators
 21      OPERATORS = ["=", "!=", "<>", ">", ">=", "<", "<=", "+", "-", "*", "/", "%", "||", "not", "between", "like", "is", "null"]
 22  
 23      # Default list of logic separators
 24      LOGIC_SEPARATORS = ["and", "or"]
 25  
 26      # Default list of sort order operators
 27      SORT_ORDER = ["asc", "desc"]
 28  
 29      @staticmethod
 30      def get(tokens, x):
 31          """
 32          Gets token at position x. This method will validate position is valid within tokens.
 33  
 34          Args:
 35              tokens: input tokens
 36              x: position to retrieve
 37  
 38          Returns:
 39              tokens[x] if x is a valid position, None otherwise
 40          """
 41  
 42          if 0 <= x < len(tokens):
 43              return tokens[x]
 44  
 45          return None
 46  
 47      @staticmethod
 48      def isalias(tokens, x, alias):
 49          """
 50          Checks if tokens[x] is an alias keyword.
 51  
 52          Args:
 53              tokens: input tokens
 54              x: current position
 55              alias: if column alias processing is enabled
 56  
 57          Returns:
 58              True if tokens[x] is an alias token, False otherwise
 59          """
 60  
 61          prior = Token.get(tokens, x - 1)
 62          token = tokens[x]
 63  
 64          # True if prior token is not a separator, grouping token or distinct token and current token is either a column token or quoted token
 65          return (
 66              alias
 67              and x > 0
 68              and not Token.isseparator(prior)
 69              and not Token.isgroupstart(prior)
 70              and not Token.isdistinct(prior)
 71              and (Token.iscolumn(token) or Token.isquoted(token))
 72          )
 73  
 74      @staticmethod
 75      def isattribute(tokens, x):
 76          """
 77          Checks if tokens[x] is an attribute.
 78  
 79          Args:
 80              tokens: input tokens
 81              x: current position
 82  
 83          Returns:
 84              True if tokens[x] is an attribute, False otherwise
 85          """
 86  
 87          # True if token is a column and next token is not an operator
 88          return Token.iscolumn(tokens[x]) and not Token.isoperator(Token.get(tokens, x + 1))
 89  
 90      @staticmethod
 91      def isbracket(token):
 92          """
 93          Checks if token is an open bracket.
 94  
 95          Args:
 96              token: token to test
 97  
 98          Returns:
 99              True if token is an open bracket, False otherwise
100          """
101  
102          # Token is a bracket
103          return token == "["
104  
105      @staticmethod
106      def iscolumn(token):
107          """
108          Checks if token is a column name.
109  
110          Args:
111              token: token to test
112  
113          Returns:
114              True if this token is a column name token, False otherwise
115          """
116  
117          # Columns are not operators, logic separators, literals or sort order tokens
118          return (
119              token
120              and not Token.isoperator(token)
121              and not Token.islogicseparator(token)
122              and not Token.isliteral(token)
123              and not Token.issortorder(token)
124          )
125  
126      @staticmethod
127      def iscompound(tokens, x):
128          """
129          Checks if tokens[x] is a compound expression.
130  
131          Args:
132              tokens: input tokens
133              x: current position
134  
135          Returns:
136              True if tokens[x] is a compound expression, False otherwise
137          """
138  
139          # Compound expression is defined as: <column> <operator(s)> <column>
140          return Token.isoperator(tokens[x]) and (Token.iscolumn(Token.get(tokens, x - 1)) or Token.iscolumn(Token.get(tokens, x + 1)))
141  
142      @staticmethod
143      def isdistinct(token):
144          """
145          Checks if token is the distinct keyword.
146  
147          Args:
148              token: token to test
149  
150          Returns:
151              True if this token is a distinct keyword, False otherwise
152          """
153  
154          # Token is the distinct keyword
155          return token and token.lower() in Token.DISTINCT
156  
157      @staticmethod
158      def isfunction(tokens, x):
159          """
160          Checks if tokens[x] is a function.
161  
162          Args:
163              tokens: input tokens
164              x: current position
165  
166          Returns:
167              True if tokens[x] is a function, False otherwise
168          """
169  
170          # True if a column token is followed by an open paren
171          return Token.iscolumn(tokens[x]) and Token.get(tokens, x + 1) == "("
172  
173      @staticmethod
174      def isgroupstart(token):
175          """
176          Checks if token is a group start token.
177  
178          Args:
179              token: token to test
180  
181          Returns:
182              True if token is a group start token, False otherwise
183          """
184  
185          # Token is a paren
186          return token == "("
187  
188      @staticmethod
189      def isliteral(token):
190          """
191          Checks if token is a literal.
192  
193          Args:
194              token: token to test
195  
196          Returns:
197              True if this token is a literal, False otherwise
198          """
199  
200          # Literals are wrapped in quotes, parens, wildcards or numeric.
201          return token and (token.startswith(("'", '"', ",", "(", ")", "*")) or token.replace(".", "", 1).isdigit())
202  
203      @staticmethod
204      def islogicseparator(token):
205          """
206          Checks if token is a logic separator token.
207  
208          Args:
209              token: token to test
210  
211          Returns:
212              True if this token is a logic separator, False otherwise
213          """
214  
215          # Token is a logic separator
216          return token and token.lower() in Token.LOGIC_SEPARATORS
217  
218      @staticmethod
219      def isoperator(token):
220          """
221          Checks if token is an operator token.
222  
223          Args:
224              token: token to test
225  
226          Returns:
227              True if this token is an operator, False otherwise
228          """
229  
230          # Token is an operator
231          return token and token.lower() in Token.OPERATORS
232  
233      @staticmethod
234      def isquoted(token):
235          """
236          Checks if token is quoted.
237  
238          Args:
239              token: token to test
240  
241          Returns:
242              True if this token is quoted, False otherwise
243          """
244  
245          # Token is quoted
246          return token.startswith(("'", '"')) and token.endswith(("'", '"'))
247  
248      @staticmethod
249      def isseparator(token):
250          """
251          Checks if token is a separator token.
252  
253          Args:
254              token to test
255  
256          Returns:
257              True if this token is a separator, False otherwise
258          """
259  
260          # Token is a comma
261          return token == ","
262  
263      @staticmethod
264      def issimilar(tokens, x, similar):
265          """
266          Checks if tokens[x] is a similar() function.
267  
268          Args:
269              tokens: input tokens
270              x: current position
271              similar: list where similar function call parameters are stored, can be None in which case similar processing is skipped
272  
273          Returns:
274              True if tokens[x] is a similar clause
275          """
276  
277          # True if a "similar" token is followed by an open paren
278          return similar is not None and tokens[x].lower() == "similar" and Token.get(tokens, x + 1) == "("
279  
280      @staticmethod
281      def issortorder(token):
282          """
283          Checks if token is a sort order token.
284  
285          Args:
286              token: token to test
287  
288          Returns:
289              True if this token is a sort order operator, False otherwise
290          """
291  
292          # Token is a sort order operator
293          return token and token.lower() in Token.SORT_ORDER
294  
295      @staticmethod
296      def normalize(token):
297          """
298          Applies a normalization algorithm to the input token as follows:
299              - Strip single and double quotes
300              - Make lowercase
301  
302          Args:
303              token: input token
304  
305          Returns:
306              normalized token
307          """
308  
309          # Lowercase, replace and return
310          return token.lower().replace("'", "").replace('"', "")
311  
312      @staticmethod
313      def wrapspace(text, token):
314          """
315          Applies whitespace wrapping rules to token.
316  
317          Args:
318              text: current text buffer
319              token: token to add
320  
321          Returns:
322              token with whitespace rules applied
323          """
324  
325          # Wildcards have no whitespace. Need special case since * is also multiply which does have whitespace.
326          if token in ["*"] and (not text or text.endswith((" ", "("))):
327              return token
328  
329          # Operator whitespace
330          if Token.isoperator(token) or Token.islogicseparator(token) or token.lower() in ["in"]:
331              return f" {token} " if not text.endswith(" ") else f"{token} "
332  
333          # Comma whitespace
334          if Token.isseparator(token):
335              return f"{token} "
336  
337          # No whitespace if any of the following is True
338          if not text or text.endswith((" ", "(", "[")) or token in ["(", "[", ")", "]"] or token.startswith("."):
339              return token
340  
341          # Default is to add leading whitespace
342          return f" {token}"