/ haystack / components / extractors / regex_text_extractor.py
regex_text_extractor.py
  1  # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
  2  #
  3  # SPDX-License-Identifier: Apache-2.0
  4  
  5  import re
  6  from typing import Any
  7  
  8  from haystack import component, logging
  9  from haystack.core.serialization import default_from_dict, default_to_dict
 10  from haystack.dataclasses import ChatMessage
 11  
 12  logger = logging.getLogger(__name__)
 13  
 14  
 15  @component
 16  class RegexTextExtractor:
 17      """
 18      Extracts text from chat message or string input using a regex pattern.
 19  
 20      RegexTextExtractor parses input text or ChatMessages using a provided regular expression pattern.
 21      It can be configured to search through all messages or only the last message in a list of ChatMessages.
 22  
 23      ### Usage example
 24  
 25      ```python
 26      from haystack.components.extractors import RegexTextExtractor
 27      from haystack.dataclasses import ChatMessage
 28  
 29      # Using with a string
 30      parser = RegexTextExtractor(regex_pattern='<issue url=\"(.+)\">')
 31      result = parser.run(text_or_messages='<issue url="github.com/hahahaha">hahahah</issue>')
 32      # result: {"captured_text": "github.com/hahahaha"}
 33  
 34      # Using with ChatMessages
 35      messages = [ChatMessage.from_user('<issue url="github.com/hahahaha">hahahah</issue>')]
 36      result = parser.run(text_or_messages=messages)
 37      # result: {"captured_text": "github.com/hahahaha"}
 38      ```
 39      """
 40  
 41      def __init__(self, regex_pattern: str) -> None:
 42          """
 43          Creates an instance of the RegexTextExtractor component.
 44  
 45          :param regex_pattern:
 46              The regular expression pattern used to extract text.
 47              The pattern should include a capture group to extract the desired text.
 48              Example: `'<issue url="(.+)">'` captures `'github.com/hahahaha'` from `'<issue url="github.com/hahahaha">'`.
 49          """
 50          self.regex_pattern = regex_pattern
 51  
 52          # Check if the pattern has at least one capture group
 53          num_groups = re.compile(regex_pattern).groups
 54          if num_groups < 1:
 55              logger.warning(
 56                  "The provided regex pattern {regex_pattern} doesn't contain any capture groups. "
 57                  "The entire match will be returned instead.",
 58                  regex_pattern=regex_pattern,
 59              )
 60  
 61      def to_dict(self) -> dict[str, Any]:
 62          """
 63          Serializes the component to a dictionary.
 64  
 65          :returns:
 66              Dictionary with serialized data.
 67          """
 68          return default_to_dict(self, regex_pattern=self.regex_pattern)
 69  
 70      @classmethod
 71      def from_dict(cls, data: dict[str, Any]) -> "RegexTextExtractor":
 72          """
 73          Deserializes the component from a dictionary.
 74  
 75          :param data:
 76              The dictionary to deserialize from.
 77          :returns:
 78              The deserialized component.
 79          """
 80          # return_empty_on_no_match is an old parameter. We'd like to avoid that pipelines break if it's still present.
 81          if "return_empty_on_no_match" in data["init_parameters"]:
 82              logger.warning("The `return_empty_on_no_match` init parameter has been removed and will be ignored.")
 83              data["init_parameters"].pop("return_empty_on_no_match")
 84  
 85          return default_from_dict(cls, data)
 86  
 87      @component.output_types(captured_text=str)
 88      def run(self, text_or_messages: str | list[ChatMessage]) -> dict[str, str]:
 89          """
 90          Extracts text from input using the configured regex pattern.
 91  
 92          :param text_or_messages:
 93              Either a string or a list of ChatMessage objects to search through.
 94  
 95          :returns:
 96            - `{"captured_text": "matched text"}` if a match is found
 97            - `{"captured_text": ""}` if no match is found
 98  
 99          :raises TypeError: if receiving a list the last element is not a ChatMessage instance.
100          """
101          if isinstance(text_or_messages, str):
102              return self._build_result(self._extract_from_text(text_or_messages))
103          if not text_or_messages:
104              logger.warning("Received empty list of messages")
105              return {"captured_text": ""}
106          return self._process_last_message(text_or_messages)
107  
108      def _build_result(self, result: str | list[str]) -> dict:
109          """Helper method to build the return dictionary based on configuration."""
110          if (isinstance(result, str) and result == "") or (isinstance(result, list) and not result):
111              return {"captured_text": ""}
112          return {"captured_text": result}
113  
114      def _process_last_message(self, messages: list[ChatMessage]) -> dict:
115          """
116          Process only the last message and build the result.
117  
118          :raises TypeError: If the last element of the list is not a ChatMessage instance.
119          """
120          last_message = messages[-1]
121          if not isinstance(last_message, ChatMessage):
122              raise TypeError(f"Expected ChatMessage object, got {type(last_message)}")
123          if last_message.text is None:
124              logger.warning("Last message has no text content")
125              return {"captured_text": ""}
126          result = self._extract_from_text(last_message.text)
127          return self._build_result(result)
128  
129      def _extract_from_text(self, text: str) -> str | list[str]:
130          """
131          Extract text using the regex pattern.
132  
133          :param text:
134              The text to search through.
135  
136          :returns:
137              The text captured by the first capturing group in the regex pattern.
138              If the pattern has no capture groups, returns the entire match.
139              If no match is found, returns an empty string.
140          """
141          match = re.search(self.regex_pattern, text)
142          if not match:
143              return ""
144          if match.groups():
145              return match.group(1)
146          return match.group(0)