# .github/utils/docs_search_sync.py
  1  """
  2  This script syncs the Haystack docs HTML files to the deepset workspace for search indexing.
  3  
  4  It is used in the docs_search_sync.yml workflow.
  5  
  6  1. Collects all HTML files from the docs and reference directories for the stable Haystack version.
  7  2. Uploads the HTML files to the deepset workspace.
  8      - A timestamp-based metadata field is used to track document versions in the workspace.
  9  3. Deletes the old HTML files from the deepset workspace.
 10      - Since most files are overwritten during upload, only a small number of deletions is expected.
 11      - In case MAX_DELETIONS_SAFETY_LIMIT is exceeded, we block the deletion.
 12  """
 13  
 14  import os
 15  import sys
 16  import time
 17  from pathlib import Path
 18  
 19  import requests
 20  from deepset_cloud_sdk.workflows.sync_client.files import DeepsetCloudFile, WriteMode, list_files, upload_texts
 21  
# Workspace name and API key are injected by the CI workflow (docs_search_sync.yml).
# Direct indexing (no .get) is deliberate: fail fast with KeyError if a secret is missing.
DEEPSET_WORKSPACE_DOCS_SEARCH = os.environ["DEEPSET_WORKSPACE_DOCS_SEARCH"]
DEEPSET_API_KEY_DOCS_SEARCH = os.environ["DEEPSET_API_KEY_DOCS_SEARCH"]

# If there are more files to delete than this limit, it's likely that something went wrong in the upload process.
# Normal runs overwrite most files in place, so only a handful of deletions is expected.
MAX_DELETIONS_SAFETY_LIMIT = 20
 27  
 28  
 29  def collect_docs_files(version: int) -> list[DeepsetCloudFile]:
 30      """
 31      Collect all HTML files from the docs and reference directories.
 32  
 33      Returns a list of DeepsetCloudFile objects.
 34      """
 35      repo_root = Path(__file__).parent.parent.parent
 36      build_dir = repo_root / "docs-website" / "build"
 37      # we want to exclude previous and temporarily unstable versions (2.x) and next version (next)
 38      exclude = ("2.", "next")
 39  
 40      files = []
 41      for section in ("docs", "reference"):
 42          for subfolder in (build_dir / section).iterdir():
 43              if subfolder.is_dir() and not any(x in subfolder.name for x in exclude):
 44                  for html_file in subfolder.rglob("*.html"):
 45                      files.append(
 46                          DeepsetCloudFile(
 47                              # The build produces files like docs/agents/index.html or reference/agents-api/index.html.
 48                              # For file names, we want to use the parent directory name (agents.html or agents-api.html)
 49                              name=f"{html_file.parent.name}.html",
 50                              text=html_file.read_text(),
 51                              meta={
 52                                  "type": "api-reference" if section == "reference" else "documentation",
 53                                  "version": version,
 54                              },
 55                          )
 56                      )
 57      return files
 58  
 59  
 60  def delete_files(file_names: list[str]) -> None:
 61      """
 62      Delete files from the deepset workspace.
 63      """
 64      url = f"https://api.cloud.deepset.ai/api/v1/workspaces/{DEEPSET_WORKSPACE_DOCS_SEARCH}/files"
 65      payload = {"names": file_names}
 66      headers = {"Accept": "application/json", "Authorization": f"Bearer {DEEPSET_API_KEY_DOCS_SEARCH}"}
 67      response = requests.delete(url, json=payload, headers=headers, timeout=300)
 68      response.raise_for_status()
 69  
 70  
 71  if __name__ == "__main__":
 72      version = time.time_ns()
 73      print(f"Docs version: {version}")
 74  
 75      print("Collecting docs files from build directory")
 76      dc_files = collect_docs_files(version)
 77      print(f"Collected {len(dc_files)} docs files")
 78  
 79      if len(dc_files) == 0:
 80          print("No docs files found. Something is wrong. Exiting.")
 81          sys.exit(1)
 82  
 83      print("Uploading docs files to deepset")
 84      summary = upload_texts(
 85          workspace_name=DEEPSET_WORKSPACE_DOCS_SEARCH,
 86          files=dc_files,
 87          api_key=DEEPSET_API_KEY_DOCS_SEARCH,
 88          blocking=True,  # Very important to ensure that DC is up to date when we query for deletion
 89          timeout_s=300,
 90          show_progress=True,
 91          write_mode=WriteMode.OVERWRITE,
 92          enable_parallel_processing=True,
 93      )
 94      print(f"Uploaded docs files to deepset\n{summary}")
 95      if summary.failed_upload_count > 0:
 96          print("Failed to upload some docs files. Stopping to prevent risky deletion of old files.")
 97          sys.exit(1)
 98  
 99      print("Listing old docs files from deepset")
100      odata_filter = f"version lt '{version}'"
101      old_files_names = [
102          f.name
103          for batch in list_files(
104              workspace_name=DEEPSET_WORKSPACE_DOCS_SEARCH, api_key=DEEPSET_API_KEY_DOCS_SEARCH, odata_filter=odata_filter
105          )
106          for f in batch
107      ]
108  
109      print(f"Found {len(old_files_names)} old files to delete")
110      if len(old_files_names) > MAX_DELETIONS_SAFETY_LIMIT:
111          print(
112              f"Found >{MAX_DELETIONS_SAFETY_LIMIT} old files to delete. "
113              "Stopping because something could have gone wrong in the upload process."
114          )
115          sys.exit(1)
116  
117      if len(old_files_names) > 0:
118          print("Deleting old docs files from deepset")
119          delete_files(old_files_names)
120          print("Deleted old docs files from deepset")