/ src / pipeline / steps / preprocess_step.py
preprocess_step.py
 1  from __future__ import annotations
 2  
 3  """Step that preprocesses markdown text to remove repeated content."""
 4  
 5  import logging
 6  
 7  from ...constants import DEFAULT_ENCODING
 8  from ...preprocessing.protocol import Preprocessor
 9  from ..contexts.ingestion_context import IngestionContext
10  
11  
class PreprocessStep:
    """Pipeline step that runs a preprocessor over the ingested markdown.

    The step is intended to strip repeated headers, footers, and page
    numbers, which would otherwise pollute chunks and hurt retrieval
    precision. The cleaning itself is delegated entirely to the injected
    preprocessor; this class only wires it to the ingestion context.
    """

    def __init__(self, preprocessor: Preprocessor):
        """Keep a reference to the preprocessor used by :meth:`run`.

        Parameters
        ----------
        preprocessor
            Object whose ``preprocess(text)`` method returns the cleaned
            text (normally built by PreprocessorFactory).
        """
        self.preprocessor = preprocessor

    def run(self, context: IngestionContext) -> None:
        """Clean ``context.raw_text`` and mirror the result to disk.

        When ``context.raw_text`` is falsy the context is marked as failed
        and nothing else happens. Otherwise the cleaned text replaces
        ``context.raw_text`` and, if ``context.markdown_path`` is truthy,
        is also written to that path.

        Parameters
        ----------
        context
            Ingestion context providing ``raw_text``, ``markdown_path``
            and ``mark_failed``.
        """
        log = logging.getLogger(__name__)

        # Guard: there is nothing to clean unless an earlier step loaded text.
        if not context.raw_text:
            context.mark_failed("Raw text not set. Load step must run first.")
            return

        log.info("Preprocessing markdown to remove repeated content...")

        deduplicated = self.preprocessor.preprocess(context.raw_text)
        context.raw_text = deduplicated

        self._write_back(context, deduplicated, log)

        log.info("Preprocessing complete")

    @staticmethod
    def _write_back(context, text, log) -> None:
        """Overwrite the markdown file with *text* when a path is set."""
        if context.markdown_path:
            context.markdown_path.write_text(
                text, encoding=DEFAULT_ENCODING
            )
            log.info(f"Updated markdown file: {context.markdown_path}")
55