normalize.py
"""
Normalize module
"""

import numpy as np


class Normalize:
    """
    Applies score normalization methods.

    Two modes are supported:
      - default: scales scores into 0 - 1 using a max score derived from the first score and the average index score
      - bayes: BB25-style sigmoid score calibration, enabled with any of the method aliases
        "bayes", "bayesian", "bayesian-bm25" or "bb25"

    Reference implementations:
      - https://github.com/instructkr/bb25
      - https://github.com/cognica-io/bayesian-bm25
    """

    def __init__(self, config):
        """
        Creates a new Normalize instance.

        Args:
            config: normalize configuration. A dict is read for the "method", "alpha" and "beta" settings,
                    a string is used directly as the method name, any other value selects the default method.
        """

        # Normalize settings
        self.config = config if isinstance(config, dict) else {}
        method = self.config.get("method", config if isinstance(config, str) else "default")
        self.method = str(method).lower()

        # Bayesian settings
        # alpha: sigmoid steepness, an explicit None falls back to 1.0 (same tolerance as beta)
        alpha = self.config.get("alpha")
        self.alpha = float(alpha) if alpha is not None else 1.0

        # beta: sigmoid midpoint, None means derive it from the candidate scores at call time
        beta = self.config.get("beta")
        self.beta = float(beta) if beta is not None else None

    def isbayes(self):
        """
        Checks if Bayesian normalization mode is active.

        Returns:
            True if using BB25/Bayesian normalization
        """

        return self.method in ("bayes", "bayesian", "bayesian-bm25", "bb25")

    def __call__(self, scores, avgscore):
        """
        Normalizes scores.

        Args:
            scores: list of (id, score)
            avgscore: average score across index, only used by the default method

        Returns:
            normalized scores as a list of (id, score)
        """

        return self.bayes(scores) if self.isbayes() else self.default(scores, avgscore)

    def default(self, scores, avgscore):
        """
        Default normalization implementation.

        Divides each score by a max score of min(first score + avgscore, 6 * avgscore) and caps the result at 1.0.

        Args:
            scores: list of (id, score), the first entry's score is used to derive the max score
            avgscore: average score across index

        Returns:
            normalized scores, an empty list when scores is empty
        """

        # Nothing to normalize
        if not scores:
            return []

        # Use average index score in max score calculation
        maxscore = min(scores[0][1] + avgscore, 6 * avgscore)

        # A zero max score can't be used as a divisor, report 0.0 for each result instead of raising
        if maxscore == 0:
            return [(uid, 0.0) for uid, _ in scores]

        # Normalize scores between 0 - 1 using maxscore
        return [(uid, min(score / maxscore, 1.0)) for uid, score in scores]

    def bayes(self, scores):
        """
        BB25/Bayesian normalization implementation.

        Maps each positive score through sigmoid(alpha * (score - beta)). Scores that are not positive
        are assigned 0.0.

        Args:
            scores: list of (id, score)

        Returns:
            normalized scores
        """

        # Convert scores to numpy array
        values = np.array([score for _, score in scores], dtype=np.float32)
        probabilities = np.zeros(values.shape[0], dtype=np.float32)

        # Follow BB25 candidate-set behavior:
        # - estimate statistics on positive-score candidates only
        # - assign zero-score candidates a final score of 0.0
        positive = values > 0.0
        if not np.any(positive):
            return [(uid, 0.0) for uid, _ in scores]

        candidates = values[positive]

        # Dynamically derive beta using candidate score distribution, if not configured
        beta = self.beta if self.beta is not None else float(np.median(candidates))

        # Scale alpha by standard deviation for score-range invariance
        std = float(np.std(candidates))
        alpha = abs(self.alpha / std if std > 0 else self.alpha)

        # Compute sigmoid likelihood for positive-score candidates.
        # In this generic normalize flow, term-structure priors (tf/doc length) are not available,
        # so we use the likelihood-only BB25 transform (equivalent to a flat prior of 0.5).
        # Logits are computed in float64: the +/-500 clip keeps exp() finite in float64 but
        # overflows in float32, which raised overflow warnings for saturated scores.
        logits = np.clip(alpha * (candidates.astype(np.float64) - beta), -500, 500)
        probabilities[positive] = 1.0 / (1.0 + np.exp(-logits))
        probabilities = np.clip(probabilities, 0.0, 1.0)

        # Convert back to score tuples
        return [(uid, float(probabilities[x])) for x, (uid, _) in enumerate(scores)]