/ src / python / txtai / data / labels.py
labels.py
 1  """
 2  Labels module
 3  """
 4  
 5  from .base import Data
 6  
 7  
 8  class Labels(Data):
 9      """
10      Tokenizes text-classification datasets as input for training text-classification models.
11      """
12  
13      def __init__(self, tokenizer, columns, maxlength):
14          """
15          Creates a new instance for tokenizing Labels training data.
16  
17          Args:
18              tokenizer: model tokenizer
19              columns: tuple of columns to use for text/label
20              maxlength: maximum sequence length
21          """
22  
23          super().__init__(tokenizer, columns, maxlength)
24  
25          # Standardize columns
26          if not self.columns:
27              self.columns = ("text", None, "label")
28          elif len(columns) < 3:
29              self.columns = (self.columns[0], None, self.columns[-1])
30  
31      def process(self, data):
32          # Column keys
33          text1, text2, label = self.columns
34  
35          # Tokenizer inputs can be single string or string pair, depending on task
36          text = (data[text1], data[text2]) if text2 else (data[text1],)
37  
38          # Tokenize text and add label
39          inputs = self.tokenizer(*text, max_length=self.maxlength, padding=True, truncation=True)
40          inputs[label] = data[label]
41  
42          return inputs