sif.py
"""
SIF scoring module
"""

import numpy as np

from .tfidf import TFIDF


class SIF(TFIDF):
    """
    Smooth Inverse Frequency (SIF) scoring, implemented as a TFIDF subclass.
    """

    def __init__(self, config=None):
        """
        Creates a new SIF scoring instance.

        Args:
            config: scoring configuration, forwarded unchanged to the TFIDF constructor.
                    The optional "a" entry sets the SIF smoothing parameter (default 1e-3).
        """

        super().__init__(config)

        # Smoothing parameter used by score()
        # NOTE(review): self.config is presumably populated by the parent constructor - confirm
        self.a = self.config.get("a", 1e-3)

    def computefreq(self, tokens):
        """
        Looks up the stored word frequency for each token.

        Args:
            tokens: iterable of tokens

        Returns:
            dict of {token: self.wordfreq[token]}; a repeated token produces a single entry
        """

        # Frequencies are read from self.wordfreq rather than counted within tokens
        # NOTE(review): self.wordfreq is not set in this class, presumably built by the parent - confirm
        frequencies = {}
        for token in tokens:
            frequencies[token] = self.wordfreq[token]

        return frequencies

    def score(self, freq, idf, length):
        """
        Computes the SIF weight a / (a + freq / tokens).

        Args:
            freq: frequency value or numpy array of frequencies. When this is a numpy array whose
                  shape differs from idf's shape, it is overwritten IN PLACE with its own sum
                  before scoring.
            idf: only used for the shape comparison against freq
            length: not used by this scoring method

        Returns:
            SIF score(s), same shape as freq when freq is a numpy array
        """

        # Shape mismatch between freq and idf: collapse freq to its total
        # This mutates the caller's array
        if isinstance(freq, np.ndarray):
            if freq.shape != np.array(idf).shape:
                total = freq.sum()
                freq.fill(total)

        # SIF weight: a / (a + p), with p = freq / self.tokens
        # NOTE(review): self.tokens is not set in this class, presumably a total token count from the parent - confirm
        smoothing = self.a
        probability = freq / self.tokens

        return smoothing / (smoothing + probability)