Cradicle Explorer

/ src / python / txtai / scoring / normalize.py
normalize.py
  1  """
  2  Normalize module
  3  """
  4  
  5  import numpy as np
  6  
  7  
  8  class Normalize:
  9      """
 10      Applies score normalization methods.
 11  
 12      Bayesian mode supports BB25-style score calibration aliases ("bayes", "bb25", "bayesian-bm25").
 13      Reference implementations:
 14        - https://github.com/instructkr/bb25
 15        - https://github.com/cognica-io/bayesian-bm25
 16      """
 17  
 18      def __init__(self, config):
 19          """
 20          Creates a new Normalize instance.
 21  
 22          Args:
 23              config: normalize configuration
 24          """
 25  
 26          # Normalize settings
 27          self.config = config if isinstance(config, dict) else {}
 28          method = self.config.get("method", config if isinstance(config, str) else "default")
 29          self.method = str(method).lower()
 30  
 31          # Bayesian settings
 32          self.alpha = float(self.config.get("alpha", 1.0))
 33          self.beta = self.config.get("beta")
 34          self.beta = float(self.beta) if self.beta is not None else self.beta
 35  
 36      def isbayes(self):
 37          """
 38          Checks if Bayesian normalization mode is active.
 39  
 40          Returns:
 41              True if using BB25/Bayesian normalization
 42          """
 43  
 44          return self.method in ("bayes", "bayesian", "bayesian-bm25", "bb25")
 45  
 46      def __call__(self, scores, avgscore):
 47          """
 48          Normalizes scores.
 49  
 50          Args:
 51              scores: list of (id, score)
 52              avgscore: average score across index
 53  
 54          Returns:
 55              normalized scores
 56          """
 57  
 58          return self.bayes(scores) if self.isbayes() else self.default(scores, avgscore)
 59  
 60      def default(self, scores, avgscore):
 61          """
 62          Default normalization implementation.
 63  
 64          Args:
 65              scores: list of (id, score)
 66              avgscore: average score across index
 67  
 68          Returns:
 69              normalized scores
 70          """
 71  
 72          # Use average index score in max score calculation
 73          maxscore = min(scores[0][1] + avgscore, 6 * avgscore)
 74  
 75          # Normalize scores between 0 - 1 using maxscore
 76          return [(uid, min(score / maxscore, 1.0)) for uid, score in scores]
 77  
 78      def bayes(self, scores):
 79          """
 80          BB25/Bayesian normalization implementation.
 81  
 82          Args:
 83              scores: list of (id, score)
 84  
 85          Returns:
 86              normalized scores
 87          """
 88  
 89          # Convert scores to numpy array
 90          values = np.array([score for _, score in scores], dtype=np.float32)
 91          probabilities = np.zeros(values.shape[0], dtype=np.float32)
 92  
 93          # Follow BB25 candidate-set behavior:
 94          #   - estimate statistics on positive-score candidates only
 95          #   - assign zero-score candidates a final score of 0.0
 96          positive = values > 0.0
 97          if not np.any(positive):
 98              return [(uid, 0.0) for uid, _ in scores]
 99  
100          candidates = values[positive]
101  
102          # Dynamically derive beta using candidate score distribution, if not configured
103          beta = self.beta if self.beta is not None else float(np.median(candidates))
104  
105          # Scale alpha by standard deviation for score-range invariance
106          std = float(np.std(candidates))
107          alpha = abs(self.alpha / std if std > 0 else self.alpha)
108  
109          # Compute sigmoid likelihood for positive-score candidates.
110          # In this generic normalize flow, term-structure priors (tf/doc length) are not available,
111          # so we use the likelihood-only BB25 transform (equivalent to a flat prior of 0.5).
112          logits = np.clip(alpha * (candidates - beta), -500, 500)
113          probabilities[positive] = 1.0 / (1.0 + np.exp(-logits))
114          probabilities = np.clip(probabilities, 0.0, 1.0)
115  
116          # Convert back to score tuples
117          return [(uid, float(probabilities[x])) for x, (uid, _) in enumerate(scores)]