"""
Segmentation module
"""

import re

# Conditional import - NLTK is only required for sentence tokenization
try:
    from nltk import sent_tokenize

    NLTK = True
except ImportError:
    NLTK = False

# Conditional import - Chonkie is only required for third-party chunkers
try:
    import chonkie

    CHONKIE = True
except ImportError:
    CHONKIE = False

from ..base import Pipeline


class Segmentation(Pipeline):
    """
    Segments text into logical units.
    """

    def __init__(
        self,
        sentences=False,
        lines=False,
        paragraphs=False,
        minlength=None,
        join=False,
        sections=False,
        cleantext=True,
        chunker=None,
        tuples=False,
        **kwargs,
    ):
        """
        Creates a new Segmentation pipeline.

        Args:
            sentences: tokenize text into sentences if True, defaults to False
            lines: tokenizes text into lines if True, defaults to False
            paragraphs: tokenizes text into paragraphs if True, defaults to False
            minlength: require at least minlength characters per text element, defaults to None
            join: joins tokenized sections back together if True, defaults to False
            sections: tokenizes text into sections if True, defaults to False.
                      Splits using section or page breaks, depending on what's available
            cleantext: apply text cleaning rules, defaults to True
            chunker: creates a third-party chunker to tokenize text if set, defaults to None
            tuples: return (input, output) tuples, defaults to False
            kwargs: additional keyword arguments

        Raises:
            ImportError: if a tokenization mode is requested whose optional dependency is missing
        """

        # Fail fast when the requested mode needs an optional dependency that isn't installed
        if not NLTK and sentences:
            raise ImportError('NLTK is not available - install "pipeline" extra to enable')

        if not CHONKIE and chunker:
            raise ImportError('Chonkie is not available - install "pipeline" extra to enable')

        self.sentences = sentences
        self.lines = lines
        self.paragraphs = paragraphs
        self.sections = sections
        self.minlength = minlength
        self.join = join
        self.cleantext = cleantext

        # Create a third-party chunker, if applicable
        self.chunker = self.createchunker(chunker, **kwargs) if chunker else None

        # Return (input, output) tuples as output
        self.tuples = tuples

    def __call__(self, text):
        """
        Segments text into semantic units.

        This method supports text as a string or a list. If the input is a string, the return
        type is text|list. If text is a list, a list is returned; this could be a
        list of text or a list of lists depending on the tokenization strategy.

        Args:
            text: text|list

        Returns:
            segmented text
        """

        # Normalize to a list so a single code path handles both inputs
        texts = [text] if not isinstance(text, list) else text

        # Extract text for each input file
        results = []
        for value in texts:
            # Get text
            result = self.text(value)

            # Parse and add extracted results
            result = self.parse(result)

            # Wrap as tuple
            if self.tuples:
                result = [(value, x) for x in result] if isinstance(result, list) else (value, result)

            results.append(result)

        # Unwrap single-string inputs back to a scalar result
        return results[0] if isinstance(text, str) else results

    def text(self, text):
        """
        Hook to allow extracting text out of input text object.

        Args:
            text: object to extract text from
        """

        return text

    def parse(self, text):
        """
        Splits and cleans text based on the current parameters.

        Args:
            text: input text

        Returns:
            parsed and clean content
        """

        content = None

        if self.chunker:
            # pylint: disable=E1102
            content = [self.clean(x.text) for x in self.chunker(text)]
        elif self.sentences:
            content = [self.clean(x) for x in sent_tokenize(text)]
        elif self.lines:
            content = [self.clean(x) for x in re.split(r"\n{1,}", text)]
        elif self.paragraphs:
            content = [self.clean(x) for x in re.split(r"\n{2,}", text)]
        elif self.sections:
            # Prefer form feed (page break) when present, otherwise fall back to 3+ newlines
            split = r"\f" if "\f" in text else r"\n{3,}"
            content = [self.clean(x) for x in re.split(split, text)]
        else:
            content = self.clean(text)

        # Text tokenization enabled
        if isinstance(content, list):
            # Remove empty strings (clean returns None for segments below minlength)
            content = [x for x in content if x]
            return " ".join(content) if self.join else content

        # Default method that returns clean text
        return content

    def clean(self, text):
        """
        Applies a series of rules to clean text.

        Args:
            text: input text

        Returns:
            clean text, or None if minlength is enabled and text is too short
        """

        # Text cleaning disabled, return original text
        if not self.cleantext:
            return text

        # Collapse and remove excess whitespace
        # NOTE: only spaces are collapsed, newlines are preserved since they may
        # still carry structure for downstream consumers
        text = re.sub(r" +", " ", text)
        text = text.strip()

        # If minlength enabled, require at least minlength chars
        return text if not self.minlength or len(text) >= self.minlength else None

    def createchunker(self, chunker, **kwargs):
        """
        Creates a new third-party chunker

        Args:
            chunker: name of chunker to create
            kwargs: additional keyword arguments

        Returns:
            new chunker
        """

        # Resolve and create a third-party chunker - uppercase only the first letter
        # (str.capitalize would lowercase the rest and break names like "semDouble")
        chunker = f"{chunker[0].upper() + chunker[1:]}Chunker"
        return getattr(chonkie, chunker)(**kwargs)