labels.py
"""
Labels module
"""

from .base import Data


class Labels(Data):
    """
    Prepares text-classification training data by tokenizing the text column(s)
    and attaching the label value to the tokenizer output.
    """

    def __init__(self, tokenizer, columns, maxlength):
        """
        Creates a new Labels instance.

        Columns are normalized to a 3-element tuple of (text1, text2, label),
        where text2 is None when only a single text column is used.

        Args:
            tokenizer: model tokenizer, called by process() with the text value(s)
            columns: tuple of columns to use for text/label, falsy to use the defaults
            maxlength: maximum sequence length
        """

        super().__init__(tokenizer, columns, maxlength)

        # Normalize to (text1, text2, label)
        configured = self.columns
        if configured:
            # NOTE(review): the length check reads the constructor argument while the
            # elements come from self.columns - confirm the base class stores columns as-is
            if len(columns) < 3:
                # First entry is the text column, last entry is the label column
                self.columns = (configured[0], None, configured[-1])
        else:
            # No columns set, fall back to default names
            self.columns = ("text", None, "label")

    def process(self, data):
        """
        Tokenizes the text column(s) of data and adds the label value.

        Args:
            data: object indexed by column name, holding the text and label values

        Returns:
            tokenizer output with the label value stored under the label column name
        """

        first, second, target = self.columns

        # Pass one text value or a text pair, depending on whether a second column is set
        if second:
            texts = (data[first], data[second])
        else:
            texts = (data[first],)

        # Run tokenizer, then store label alongside the tokenizer output
        encoded = self.tokenizer(*texts, max_length=self.maxlength, padding=True, truncation=True)
        encoded[target] = data[target]

        return encoded