regex_text_extractor.py
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import re
from typing import Any

from haystack import component, logging
from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.dataclasses import ChatMessage

logger = logging.getLogger(__name__)


@component
class RegexTextExtractor:
    """
    Extracts text from chat message or string input using a regex pattern.

    RegexTextExtractor parses input text or ChatMessages using a provided regular expression pattern.
    When given a list of ChatMessages, only the last message in the list is searched.

    ### Usage example

    ```python
    from haystack.components.extractors import RegexTextExtractor
    from haystack.dataclasses import ChatMessage

    # Using with a string
    parser = RegexTextExtractor(regex_pattern='<issue url=\"(.+)\">')
    result = parser.run(text_or_messages='<issue url="github.com/hahahaha">hahahah</issue>')
    # result: {"captured_text": "github.com/hahahaha"}

    # Using with ChatMessages
    messages = [ChatMessage.from_user('<issue url="github.com/hahahaha">hahahah</issue>')]
    result = parser.run(text_or_messages=messages)
    # result: {"captured_text": "github.com/hahahaha"}
    ```
    """

    def __init__(self, regex_pattern: str) -> None:
        """
        Creates an instance of the RegexTextExtractor component.

        :param regex_pattern:
            The regular expression pattern used to extract text.
            The pattern should include a capture group to extract the desired text.
            Example: `'<issue url="(.+)">'` captures `'github.com/hahahaha'` from `'<issue url="github.com/hahahaha">'`.

        :raises re.error: If `regex_pattern` is not a valid regular expression.
        """
        self.regex_pattern = regex_pattern

        # Compiling here surfaces an invalid pattern at construction time and lets us
        # check whether the pattern has at least one capture group.
        num_groups = re.compile(regex_pattern).groups
        if num_groups < 1:
            logger.warning(
                "The provided regex pattern {regex_pattern} doesn't contain any capture groups. "
                "The entire match will be returned instead.",
                regex_pattern=regex_pattern,
            )

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(self, regex_pattern=self.regex_pattern)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "RegexTextExtractor":
        """
        Deserializes the component from a dictionary.

        The legacy `return_empty_on_no_match` init parameter is dropped (with a warning) if present.
        The dictionary passed in by the caller is not modified.

        :param data:
            The dictionary to deserialize from.
        :returns:
            The deserialized component.
        """
        # return_empty_on_no_match is an old parameter. We'd like to avoid that pipelines break if it's still present.
        init_parameters = data.get("init_parameters", {})
        if "return_empty_on_no_match" in init_parameters:
            logger.warning("The `return_empty_on_no_match` init parameter has been removed and will be ignored.")
            # Build a cleaned copy instead of popping from the caller's dictionary.
            cleaned_parameters = {
                key: value for key, value in init_parameters.items() if key != "return_empty_on_no_match"
            }
            data = {**data, "init_parameters": cleaned_parameters}

        return default_from_dict(cls, data)

    @component.output_types(captured_text=str)
    def run(self, text_or_messages: str | list[ChatMessage]) -> dict[str, str]:
        """
        Extracts text from input using the configured regex pattern.

        :param text_or_messages:
            Either a string or a list of ChatMessage objects. For a list, only the last message is searched.

        :returns:
            - `{"captured_text": "matched text"}` if a match is found
            - `{"captured_text": ""}` if no match is found, the list is empty, or the last message has no text

        :raises TypeError: If a list is received and its last element is not a ChatMessage instance.
        """
        if isinstance(text_or_messages, str):
            return self._build_result(self._extract_from_text(text_or_messages))
        if not text_or_messages:
            logger.warning("Received empty list of messages")
            return {"captured_text": ""}
        return self._process_last_message(text_or_messages)

    def _build_result(self, result: str | list[str]) -> dict[str, Any]:
        """
        Wrap an extraction result in the component's output dictionary.

        :param result:
            The extracted value.
        :returns:
            `{"captured_text": result}`, or `{"captured_text": ""}` when `result` is empty.
        """
        if not result:
            return {"captured_text": ""}
        return {"captured_text": result}

    def _process_last_message(self, messages: list[ChatMessage]) -> dict:
        """
        Process only the last message and build the result.

        :param messages:
            A non-empty list of messages; only `messages[-1]` is inspected.
        :returns:
            The output dictionary for the last message, with an empty `captured_text`
            when that message has no text content.

        :raises TypeError: If the last element of the list is not a ChatMessage instance.
        """
        last_message = messages[-1]
        if not isinstance(last_message, ChatMessage):
            raise TypeError(f"Expected ChatMessage object, got {type(last_message)}")
        if last_message.text is None:
            logger.warning("Last message has no text content")
            return {"captured_text": ""}
        return self._build_result(self._extract_from_text(last_message.text))

    def _extract_from_text(self, text: str) -> str:
        """
        Extract text using the regex pattern.

        :param text:
            The text to search through.

        :returns:
            The text captured by the first capturing group in the regex pattern.
            If the pattern has no capture groups, returns the entire match.
            If no match is found, or the first group did not take part in the match,
            returns an empty string.
        """
        match = re.search(self.regex_pattern, text)
        if match is None:
            return ""
        if match.groups():
            # group(1) is None when the first group is optional and did not participate
            # in the match (e.g. `(foo)?bar` on "bar"); keep the output a string.
            return match.group(1) or ""
        return match.group(0)