"""
Topics module
"""

from ..pipeline import Tokenizer
from ..scoring import ScoringFactory

  9  class Topics:
 10      """
 11      Topic modeling using community detection.
 12      """
 13  
 14      def __init__(self, config):
 15          """
 16          Creates a new Topics instance.
 17  
 18          Args:
 19              config: topic configuration
 20          """
 21  
 22          self.config = config if config else {}
 23          self.tokenizer = Tokenizer(stopwords=True)
 24  
 25          # Additional stopwords to ignore when building topic names
 26          self.stopwords = set()
 27          if "stopwords" in self.config:
 28              self.stopwords.update(self.config["stopwords"])
 29  
 30      def __call__(self, graph):
 31          """
 32          Runs topic modeling for input graph.
 33  
 34          Args:
 35              graph: Graph instance
 36  
 37          Returns:
 38              dictionary of {topic name: [ids]}
 39          """
 40  
 41          # Detect communities
 42          communities = graph.communities(self.config)
 43  
 44          # Sort by community size, largest to smallest
 45          communities = sorted(communities, key=len, reverse=True)
 46  
 47          # Calculate centrality of graph
 48          centrality = graph.centrality()
 49  
 50          # Score communities and generate topn terms
 51          topics = [self.score(graph, x, community, centrality) for x, community in enumerate(communities)]
 52  
 53          # Merge duplicate topics and return
 54          return self.merge(topics)
 55  
 56      def score(self, graph, index, community, centrality):
 57          """
 58          Scores a community of nodes and generates the topn terms in the community.
 59  
 60          Args:
 61              graph: Graph instance
 62              index: community index
 63              community: community of nodes
 64              centrality: node centrality scores
 65  
 66          Returns:
 67              (topn topic terms, topic ids sorted by score descending)
 68          """
 69  
 70          # Tokenize input and build scoring index
 71          scoring = ScoringFactory.create({"method": self.config.get("labels", "bm25"), "terms": True})
 72          scoring.index(((node, self.tokenize(graph, node), None) for node in community))
 73  
 74          # Check if scoring index has data
 75          if scoring.idf:
 76              # Sort by most commonly occurring terms (i.e. lowest score)
 77              idf = sorted(scoring.idf, key=scoring.idf.get)
 78  
 79              # Term count for generating topic labels
 80              topn = self.config.get("terms", 4)
 81  
 82              # Get topn terms
 83              terms = self.topn(idf, topn)
 84  
 85              # Sort community by score descending
 86              community = [uid for uid, _ in scoring.search(terms, len(community))]
 87          else:
 88              # No text found for topic, generate topic name
 89              terms = ["topic", str(index)]
 90  
 91              # Sort community by centrality scores
 92              community = sorted(community, key=lambda x: centrality[x], reverse=True)
 93  
 94          return (terms, community)
 95  
 96      def tokenize(self, graph, node):
 97          """
 98          Tokenizes node text.
 99  
100          Args:
101              graph: Graph instance
102              node: node id
103  
104          Returns:
105              list of node tokens
106          """
107  
108          text = graph.attribute(node, "text")
109          return self.tokenizer(text) if text else []
110  
111      def topn(self, terms, n):
112          """
113          Gets topn terms.
114  
115          Args:
116              terms: list of terms
117              n: topn
118  
119          Returns:
120              topn terms
121          """
122  
123          topn = []
124  
125          for term in terms:
126              # Add terms that pass tokenization rules
127              if self.tokenizer(term) and term not in self.stopwords:
128                  topn.append(term)
129  
130              # Break once topn terms collected
131              if len(topn) == n:
132                  break
133  
134          return topn
135  
136      def merge(self, topics):
137          """
138          Merges duplicate topics
139  
140          Args:
141              topics: list of (topn terms, topic ids)
142  
143          Returns:
144              dictionary of {topic name:[ids]}
145          """
146  
147          merge, termslist = {}, {}
148  
149          for terms, uids in topics:
150              # Use topic terms as key
151              key = frozenset(terms)
152  
153              # Add key to merged topics, if necessary
154              if key not in merge:
155                  merge[key], termslist[key] = [], terms
156  
157              # Merge communities
158              merge[key].extend(uids)
159  
160          # Sort communities largest to smallest since the order could have changed with merges
161          results = {}
162          for k, v in sorted(merge.items(), key=lambda x: len(x[1]), reverse=True):
163              # Create composite string key using topic terms and store ids
164              results["_".join(termslist[k])] = v
165  
166          return results