# src/preprocessing/protocol.py
"""Protocol for preprocessor implementations."""

from __future__ import annotations

from typing import Protocol, runtime_checkable
 6  
 7  
 8  class Preprocessor(Protocol):
 9      """Protocol for preprocessor implementations.
10  
11      Any class implementing this method can preprocess documents to remove
12      repeated content, clean text, etc. This allows swapping preprocessing
13      strategies without changing the rest of the code.
14      """
15  
16      def preprocess(self, text: str) -> str:
17          """Preprocess text to remove repeated content.
18  
19          Parameters
20          ----------
21          text
22              The text to preprocess.
23  
24          Returns
25          -------
26          Preprocessed text with repeated content removed.
27          """
28          ...
29