# preprocess_step.py
"""Step that preprocesses markdown text to remove repeated content."""

from __future__ import annotations

import logging

from ...constants import DEFAULT_ENCODING
from ...preprocessing.protocol import Preprocessor
from ..contexts.ingestion_context import IngestionContext

logger = logging.getLogger(__name__)


class PreprocessStep:
    """Step that preprocesses markdown text to remove repeated content.

    The step delegates to the injected preprocessor, which is intended to
    detect and remove repeated headers, footers, and page numbers that can
    pollute chunks and reduce retrieval precision.
    """

    def __init__(self, preprocessor: Preprocessor) -> None:
        """Initialize the preprocessing step.

        Parameters
        ----------
        preprocessor
            Preprocessor instance created by PreprocessorFactory.
        """
        self.preprocessor = preprocessor

    def run(self, context: IngestionContext) -> None:
        """Preprocess the markdown text to remove repeated content.

        The cleaned text replaces ``context.raw_text``. When
        ``context.markdown_path`` is set, the cleaned text is also written
        back to that file using ``DEFAULT_ENCODING``.

        If ``context.raw_text`` is missing or empty, the context is marked
        as failed and nothing else is done.

        Parameters
        ----------
        context
            Ingestion context with raw_text and markdown_path set.
        """
        # Guard clause: there is nothing to clean until raw_text is populated.
        if not context.raw_text:
            context.mark_failed("Raw text not set. Load step must run first.")
            return

        logger.info("Preprocessing markdown to remove repeated content...")

        cleaned_text = self.preprocessor.preprocess(context.raw_text)

        # Update the in-memory text first; the file write below is optional.
        context.raw_text = cleaned_text

        if context.markdown_path:
            context.markdown_path.write_text(
                cleaned_text, encoding=DEFAULT_ENCODING
            )
            # Lazy %-formatting: the message is only built if INFO is enabled.
            logger.info("Updated markdown file: %s", context.markdown_path)

        logger.info("Preprocessing complete")