# src/python/txtai/pipeline/data/segmentation.py
  1  """
  2  Segmentation module
  3  """
  4  
  5  import re
  6  
  7  # Conditional import
  8  try:
  9      from nltk import sent_tokenize
 10  
 11      NLTK = True
 12  except ImportError:
 13      NLTK = False
 14  
 15  # Conditional import
 16  try:
 17      import chonkie
 18  
 19      CHONKIE = True
 20  except ImportError:
 21      CHONKIE = False
 22  
 23  from ..base import Pipeline
 24  
 25  
 26  class Segmentation(Pipeline):
 27      """
 28      Segments text into logical units.
 29      """
 30  
 31      def __init__(
 32          self,
 33          sentences=False,
 34          lines=False,
 35          paragraphs=False,
 36          minlength=None,
 37          join=False,
 38          sections=False,
 39          cleantext=True,
 40          chunker=None,
 41          tuples=False,
 42          **kwargs,
 43      ):
 44          """
 45          Creates a new Segmentation pipeline.
 46  
 47          Args:
 48              sentences: tokenize text into sentences if True, defaults to False
 49              lines: tokenizes text into lines if True, defaults to False
 50              paragraphs: tokenizes text into paragraphs if True, defaults to False
 51              minlength: require at least minlength characters per text element, defaults to None
 52              join: joins tokenized sections back together if True, defaults to False
 53              sections: tokenizes text into sections if True, defaults to False. Splits using section or page breaks, depending on what's available
 54              cleantext: apply text cleaning rules, defaults to True
 55              chunker: creates a third-party chunker to tokenize text if set, defaults to None
 56              tuples: return (input, output) tuples, defaults to False
 57              kwargs: additional keyword arguments
 58          """
 59  
 60          if not NLTK and sentences:
 61              raise ImportError('NLTK is not available - install "pipeline" extra to enable')
 62  
 63          if not CHONKIE and chunker:
 64              raise ImportError('Chonkie is not available - install "pipeline" extra to enable')
 65  
 66          self.sentences = sentences
 67          self.lines = lines
 68          self.paragraphs = paragraphs
 69          self.sections = sections
 70          self.minlength = minlength
 71          self.join = join
 72          self.cleantext = cleantext
 73  
 74          # Create a third-party chunker, if applicable
 75          self.chunker = self.createchunker(chunker, **kwargs) if chunker else None
 76  
 77          # Return (input, output) tuples as output
 78          self.tuples = tuples
 79  
 80      def __call__(self, text):
 81          """
 82          Segments text into semantic units.
 83  
 84          This method supports text as a string or a list. If the input is a string, the return
 85          type is text|list. If text is a list, a list of returned, this could be a
 86          list of text or a list of lists depending on the tokenization strategy.
 87  
 88          Args:
 89              text: text|list
 90  
 91          Returns:
 92              segmented text
 93          """
 94  
 95          # Get inputs
 96          texts = [text] if not isinstance(text, list) else text
 97  
 98          # Extract text for each input file
 99          results = []
100          for value in texts:
101              # Get text
102              result = self.text(value)
103  
104              # Parse and add extracted results
105              result = self.parse(result)
106  
107              # Wrap as tuple
108              if self.tuples:
109                  result = [(value, x) for x in result] if isinstance(result, list) else (value, result)
110  
111              results.append(result)
112  
113          return results[0] if isinstance(text, str) else results
114  
115      def text(self, text):
116          """
117          Hook to allow extracting text out of input text object.
118  
119          Args:
120              text: object to extract text from
121          """
122  
123          return text
124  
125      def parse(self, text):
126          """
127          Splits and cleans text based on the current parameters.
128  
129          Args:
130              text: input text
131  
132          Returns:
133              parsed and clean content
134          """
135  
136          content = None
137  
138          if self.chunker:
139              # pylint: disable=E1102
140              content = [self.clean(x.text) for x in self.chunker(text)]
141          elif self.sentences:
142              content = [self.clean(x) for x in sent_tokenize(text)]
143          elif self.lines:
144              content = [self.clean(x) for x in re.split(r"\n{1,}", text)]
145          elif self.paragraphs:
146              content = [self.clean(x) for x in re.split(r"\n{2,}", text)]
147          elif self.sections:
148              split = r"\f" if "\f" in text else r"\n{3,}"
149              content = [self.clean(x) for x in re.split(split, text)]
150          else:
151              content = self.clean(text)
152  
153          # Text tokenization enabled
154          if isinstance(content, list):
155              # Remove empty strings
156              content = [x for x in content if x]
157              return " ".join(content) if self.join else content
158  
159          # Default method that returns clean text
160          return content
161  
162      def clean(self, text):
163          """
164          Applies a series of rules to clean text.
165  
166          Args:
167              text: input text
168  
169          Returns:
170              clean text
171          """
172  
173          # Text cleaning disabled, return original text
174          if not self.cleantext:
175              return text
176  
177          # Collapse and remove excess whitespace
178          text = re.sub(r" +", " ", text)
179          text = text.strip()
180  
181          # If minlength enabled, require at least minlength chars
182          return text if not self.minlength or len(text) >= self.minlength else None
183  
184      def createchunker(self, chunker, **kwargs):
185          """
186          Creates a new third-party chunker
187  
188          Args:
189              chunker: name of chunker to create
190              kwargs: additional keyword arguments
191  
192          Returns:
193              new chunker
194          """
195  
196          # Resolve and create a third-party chunker
197          chunker = f"{chunker[0].upper() + chunker[1:]}Chunker"
198          return getattr(chonkie, chunker)(**kwargs)