# src/python/txtai/database/sql/expression.py
  1  """
  2  Expression module
  3  """
  4  
  5  from .token import Token
  6  
  7  
  8  class Expression:
  9      """
 10      Parses expression statements and runs a set of substitution/formatting rules.
 11      """
 12  
 13      def __init__(self, resolver, tolist):
 14          """
 15          Creates a new expression parser.
 16  
 17          Args:
 18              resolver: function to call to resolve query column names with database column names
 19              tolist: outputs expression lists if True, text if False
 20          """
 21  
 22          self.resolver = resolver
 23          self.tolist = tolist
 24  
 25      def __call__(self, tokens, alias=False, aliases=None, similar=None):
 26          """
 27          Parses and formats a list of tokens as follows:
 28              - Replaces query column names with database column names
 29              - Adds similar query placeholders and extracts similar function parameters
 30              - Rewrites expression and returns
 31  
 32          Args:
 33              tokens: input expression
 34              alias: if True, column aliases should be generated and added to aliases dict
 35              aliases: dict of generated aliases, if present these tokens should NOT be resolved
 36              similar: list of similar queries, if present new similar queries are appended to this list
 37  
 38          Returns:
 39              rewritten clause
 40          """
 41  
 42          # Processes token expressions and applies a set of transformation rules
 43          transformed = self.process(list(tokens), alias, aliases, similar)
 44  
 45          # Re-write alias expression and return
 46          if alias and not self.tolist:
 47              return self.buildalias(transformed, tokens, aliases)
 48  
 49          # Re-write input expression and return
 50          return self.buildlist(transformed) if self.tolist is True else self.buildtext(transformed)
 51  
 52      def process(self, tokens, alias, aliases, similar):
 53          """
 54          Replaces query column names with database column names, adds similar query placeholders and
 55          extracts similar function parameters.
 56  
 57          Args:
 58              tokens: input expression
 59              alias: if True, column aliases should be generated and added to aliases dict
 60              aliases: dict of generated aliases, if present these tokens should NOT be resolved
 61              similar: list of similar queries, if present new similar queries are appended to this list
 62  
 63          Returns:
 64              transformed tokens
 65          """
 66  
 67          # Create clause index and token iterator. Iterator skips distinct tokens.
 68          index, iterator = 0, ((x, token) for x, token in enumerate(tokens) if not Token.isdistinct(token))
 69          for x, token in iterator:
 70              # Check if separator, increment clause index
 71              if Token.isseparator(token):
 72                  index += 1
 73  
 74              # Check if token is a square bracket
 75              elif Token.isbracket(token):
 76                  # Resolve bracket expression
 77                  self.bracket(iterator, tokens, x)
 78  
 79              # Check if token is a similar function
 80              elif Token.issimilar(tokens, x, similar):
 81                  # Resolve similar expression
 82                  self.similar(iterator, tokens, x, similar)
 83  
 84              # Check if token is a function
 85              elif Token.isfunction(tokens, x):
 86                  # Resolve function expression
 87                  self.function(iterator, tokens, token, aliases, similar)
 88  
 89              # Check for alias expression
 90              elif Token.isalias(tokens, x, alias):
 91                  # Process alias expression
 92                  self.alias(iterator, tokens, x, aliases, index)
 93  
 94              # Check for attribute expression
 95              elif Token.isattribute(tokens, x):
 96                  # Resolve attribute expression
 97                  self.attribute(tokens, x, aliases)
 98  
 99              # Check for compound expression
100              elif Token.iscompound(tokens, x):
101                  # Resolve compound expression
102                  self.compound(iterator, tokens, x, aliases, similar)
103  
104          # Remove replaced tokens
105          return [token for token in tokens if token]
106  
107      def buildtext(self, tokens):
108          """
109          Builds a new expression from tokens. This method applies a set of rules to generate whitespace between tokens.
110  
111          Args:
112              tokens: input expression
113  
114          Returns:
115              expression text
116          """
117  
118          # Rebuild expression
119          text = ""
120          for token in tokens:
121              # Write token with whitespace rules applied
122              text += Token.wrapspace(text, token)
123  
124          # Remove any leading/trailing whitespace and return
125          return text.strip()
126  
127      def buildlist(self, tokens):
128          """
129          Builds a new expression from tokens. This method returns a list of expression components. These components can be joined together
130          on commas to form a text expression.
131  
132          Args:
133              tokens: input expression
134  
135          Returns:
136              expression list
137          """
138  
139          parts, current, parens, brackets = [], [], 0, 0
140  
141          for token in tokens:
142              # Create new part
143              if token == "," and not parens and not brackets:
144                  parts.append(self.buildtext(current))
145                  current = []
146              else:
147                  # Accumulate tokens
148                  if token == "(":
149                      parens += 1
150                  elif token == ")":
151                      parens -= 1
152                  elif token == "[":
153                      brackets += 1
154                  elif token == "]":
155                      brackets -= 1
156                  elif Token.issortorder(token):
157                      token = f" {token}"
158                  current.append(token)
159  
160          # Add last part
161          if current:
162              parts.append(self.buildtext(current))
163  
164          return parts
165  
166      def buildalias(self, transformed, tokens, aliases):
167          """
168          Builds new alias text expression from transformed and input tokens.
169  
170          Args:
171              transformed: transformed tokens
172              tokens: original input tokens
173              aliases: dict of column aliases
174  
175          Returns:
176              alias text expression
177          """
178  
179          # Convert tokens to expressions
180          transformed = self.buildlist(transformed)
181          tokens = self.buildlist(tokens)
182  
183          expression = []
184          for x, token in enumerate(transformed):
185              if x not in aliases.values():
186                  alias = tokens[x]
187  
188                  # Strip leading/trailing brackets from alias name that doesn't have operators
189                  if not any(Token.isoperator(t) for t in alias) and alias[0] in ("[", "(") and alias[-1] in ("]", ")"):
190                      alias = alias[1:-1]
191  
192                  # Strip leading distinct keyword
193                  values = alias.split()
194                  if len(values) > 0 and Token.isdistinct(values[0]):
195                      alias = " ".join(values[1:])
196  
197                  # Resolve alias
198                  token = self.resolver(token, alias)
199  
200              expression.append(token)
201  
202          # Build alias text expression
203          return ", ".join(expression)
204  
205      def bracket(self, iterator, tokens, x):
206          """
207          Consumes a [bracket] expression.
208  
209          Args:
210              iterator: tokens iterator
211              tokens: input tokens
212              x: current position
213          """
214  
215          # Function parameters
216          params = []
217  
218          # Clear token from stream
219          token = tokens[x]
220          tokens[x] = None
221  
222          # Bracket counter (current token is an open bracket)
223          brackets = 1
224  
225          # Read until token is a end bracket
226          while token and (token != "]" or brackets > 0):
227              x, token = next(iterator, (None, None))
228  
229              # Increase/decrease bracket counter
230              if token == "[":
231                  brackets += 1
232              elif token == "]":
233                  brackets -= 1
234  
235              # Accumulate tokens
236              if token != "]" or brackets > 0:
237                  params.append(token)
238  
239              # Clear token from stream
240              tokens[x] = None
241  
242          # Set last token to resolved bracket expression
243          tokens[x] = self.resolve(self.buildtext(params).replace("'", "''"), None)
244  
245      def similar(self, iterator, tokens, x, similar):
246          """
247          Substitutes a similar() function call with a placeholder that can later be used to add
248          embeddings query results as a filter.
249  
250          Args:
251              iterator: tokens iterator
252              tokens: input tokens
253              x: current position
254              similar: list where similar function call parameters are stored
255          """
256  
257          # Function parameters
258          params = []
259  
260          # Clear token from stream
261          token = tokens[x]
262          tokens[x] = None
263  
264          # Read until token is a closing paren
265          while token and token != ")":
266              x, token = next(iterator, (None, None))
267              if token and token not in ["(", ",", ")"]:
268                  # Strip quotes and accumulate tokens
269                  params.append(token.replace("'", "").replace('"', ""))
270  
271              # Clear token from stream
272              tokens[x] = None
273  
274          # Add placeholder for embedding similarity results
275          tokens[x] = f"{Token.SIMILAR_TOKEN}{len(similar)}"
276  
277          # Save parameters
278          similar.append(params)
279  
280      def function(self, iterator, tokens, token, aliases, similar):
281          """
282          Resolves column names within the function's parameters.
283  
284          Args:
285              iterator: tokens iterator
286              tokens: input tokens
287              token: current token
288              aliases: dict of generated aliases, if present these tokens should NOT be resolved
289              similar: list where similar function call parameters are stored
290          """
291  
292          # Consume function parameters
293          while token and token != ")":
294              x, token = next(iterator, (None, None))
295  
296              # Check if token is a square bracket
297              if Token.isbracket(token):
298                  # Resolve bracket expression
299                  self.bracket(iterator, tokens, x)
300  
301              # Check if token is a similar function
302              elif Token.issimilar(tokens, x, similar):
303                  # Resolve similar expression
304                  self.similar(iterator, tokens, x, similar)
305  
306              # Check if token is a function
307              elif Token.isfunction(tokens, x):
308                  # Resolve function parameters that are functions
309                  self.function(iterator, tokens, token, aliases, similar)
310  
311              # Check for attribute expression
312              elif Token.isattribute(tokens, x):
313                  # Resolve attributes
314                  self.attribute(tokens, x, aliases)
315  
316              # Check for compound expression
317              elif Token.iscompound(tokens, x):
318                  # Resolve compound expressions
319                  self.compound(iterator, tokens, x, aliases, similar)
320  
321      def alias(self, iterator, tokens, x, aliases, index):
322          """
323          Reads an alias clause and stores it in aliases.
324  
325          Args:
326              iterator: tokens iterator
327              tokens: input tokens
328              x: current position
329              aliases: dict where aliases are stored - stores {alias: clause index}
330              index: clause index, used to match aliases with columns
331          """
332  
333          token = tokens[x]
334  
335          # If this is an alias token, get next token
336          if token in Token.ALIAS:
337              x, token = next(iterator, (None, None))
338  
339          # Consume tokens until end of stream or a separator is found. Evaluate next token to prevent consuming here.
340          while x + 1 < len(tokens) and not Token.isseparator(Token.get(tokens, x + 1)):
341              x, token = next(iterator, (None, None))
342  
343          # Add normalized alias and clause index
344          aliases[Token.normalize(token)] = index
345  
346      def attribute(self, tokens, x, aliases):
347          """
348          Resolves an attribute column name.
349  
350          Args:
351              tokens: input tokens
352              x: current token position
353              aliases: dict of generated aliases, if present these tokens should NOT be resolved
354          """
355  
356          # Resolve attribute expression
357          tokens[x] = self.resolve(tokens[x], aliases)
358  
359      def compound(self, iterator, tokens, x, aliases, similar):
360          """
361          Resolves column names in a compound expression (left side <operator(s)> right side).
362  
363          Args:
364              iterator: tokens iterator
365              tokens: input tokens
366              x: current token position
367              aliases: dict of generated aliases, if present these tokens should NOT be resolved
368              similar: list where similar function call parameters are stored
369          """
370  
371          # Resolve left side (left side already had function processing applied through standard loop)
372          if Token.iscolumn(tokens[x - 1]):
373              tokens[x - 1] = self.resolve(tokens[x - 1], aliases)
374  
375          # Consume operator(s), handle both single and compound operators, i.e. column NOT LIKE 1
376          token = tokens[x]
377          while token and Token.isoperator(token):
378              x, token = next(iterator, (None, None))
379  
380          # Resolve right side
381          if token and Token.iscolumn(token):
382              # Need to process functions since it hasn't went through the standard loop yet
383              if Token.isfunction(tokens, x):
384                  self.function(iterator, tokens, token, aliases, similar)
385              else:
386                  tokens[x] = self.resolve(token, aliases)
387  
388      def resolve(self, token, aliases):
389          """
390          Resolves this token's value if it is not an alias or a bind parameter.
391  
392          Args:
393              token: token to resolve
394              aliases: dict of generated aliases, if present these tokens should NOT be resolved
395  
396          Returns:
397              resolved token value
398          """
399  
400          # Check for alias or bind parameter
401          if (aliases and Token.normalize(token) in aliases) or (token.startswith(":")):
402              return token
403  
404          return self.resolver(token)