docs_search_sync.py
"""
This script syncs the Haystack docs HTML files to the deepset workspace for search indexing.

It is used in the docs_search_sync.yml workflow.

1. Collects all HTML files from the docs and reference directories for the stable Haystack version.
2. Uploads the HTML files to the deepset workspace.
   - A timestamp-based metadata field is used to track document versions in the workspace.
3. Deletes the old HTML files from the deepset workspace.
   - Since most files are overwritten during upload, only a small number of deletions is expected.
   - In case MAX_DELETIONS_SAFETY_LIMIT is exceeded, we block the deletion.
"""

import os
import sys
import time
from pathlib import Path

import requests
from deepset_cloud_sdk.workflows.sync_client.files import DeepsetCloudFile, WriteMode, list_files, upload_texts

# Both variables are required: a missing one raises KeyError as soon as the module is loaded.
DEEPSET_WORKSPACE_DOCS_SEARCH = os.environ["DEEPSET_WORKSPACE_DOCS_SEARCH"]
DEEPSET_API_KEY_DOCS_SEARCH = os.environ["DEEPSET_API_KEY_DOCS_SEARCH"]

# If there are more files to delete than this limit, it's likely that something went wrong in the upload process.
MAX_DELETIONS_SAFETY_LIMIT = 20


def collect_docs_files(version: int) -> list[DeepsetCloudFile]:
    """
    Collect all HTML files from the docs and reference directories.

    Only sub-folders of ``docs-website/build/docs`` and ``docs-website/build/reference`` are searched
    (recursively). Sub-folders whose name contains "2." or "next" are skipped.

    :param version: Value stored in the "version" metadata field of every collected file.
    :returns: A list of DeepsetCloudFile objects, one per HTML file found.
    :raises FileNotFoundError: If one of the section directories does not exist in the build output.
    """
    repo_root = Path(__file__).parent.parent.parent
    build_dir = repo_root / "docs-website" / "build"
    # we want to exclude previous and temporarily unstable versions (2.x) and next version (next)
    exclude = ("2.", "next")

    files = []
    for section in ("docs", "reference"):
        doc_type = "api-reference" if section == "reference" else "documentation"
        for subfolder in (build_dir / section).iterdir():
            if not subfolder.is_dir() or any(x in subfolder.name for x in exclude):
                continue
            for html_file in subfolder.rglob("*.html"):
                files.append(
                    DeepsetCloudFile(
                        # The build produces files like docs/agents/index.html or reference/agents-api/index.html.
                        # For file names, we want to use the parent directory name (agents.html or agents-api.html)
                        name=f"{html_file.parent.name}.html",
                        # Decode explicitly as UTF-8 so the result does not depend on the runner's locale.
                        text=html_file.read_text(encoding="utf-8"),
                        meta={"type": doc_type, "version": version},
                    )
                )
    return files


def delete_files(file_names: list[str]) -> None:
    """
    Delete files from the deepset workspace.

    All names are sent in a single DELETE request. Nothing is sent when ``file_names`` is empty.

    :param file_names: Names of the files to delete from the workspace.
    :raises requests.HTTPError: If the API responds with an error status code.
    """
    if not file_names:
        # Nothing to delete; avoid sending a request with an empty "names" payload.
        return

    url = f"https://api.cloud.deepset.ai/api/v1/workspaces/{DEEPSET_WORKSPACE_DOCS_SEARCH}/files"
    payload = {"names": file_names}
    headers = {"Accept": "application/json", "Authorization": f"Bearer {DEEPSET_API_KEY_DOCS_SEARCH}"}
    response = requests.delete(url, json=payload, headers=headers, timeout=300)
    response.raise_for_status()


def main() -> None:
    """
    Run the full sync: collect the built docs, upload them, then delete outdated files.

    Exits with status 1 when no docs files are found, when any upload fails, or when the number of
    outdated files exceeds MAX_DELETIONS_SAFETY_LIMIT.
    """
    # The nanosecond timestamp is attached to every uploaded file; files left with a smaller
    # value after the upload are treated as outdated.
    version = time.time_ns()
    print(f"Docs version: {version}")

    print("Collecting docs files from build directory")
    dc_files = collect_docs_files(version)
    print(f"Collected {len(dc_files)} docs files")

    if not dc_files:
        print("No docs files found. Something is wrong. Exiting.")
        sys.exit(1)

    print("Uploading docs files to deepset")
    summary = upload_texts(
        workspace_name=DEEPSET_WORKSPACE_DOCS_SEARCH,
        files=dc_files,
        api_key=DEEPSET_API_KEY_DOCS_SEARCH,
        blocking=True,  # Very important to ensure that DC is up to date when we query for deletion
        timeout_s=300,
        show_progress=True,
        write_mode=WriteMode.OVERWRITE,
        enable_parallel_processing=True,
    )
    print(f"Uploaded docs files to deepset\n{summary}")
    if summary.failed_upload_count > 0:
        print("Failed to upload some docs files. Stopping to prevent risky deletion of old files.")
        sys.exit(1)

    print("Listing old docs files from deepset")
    odata_filter = f"version lt '{version}'"
    # list_files yields batches of files; flatten them into a single list of names.
    old_files_names = [
        f.name
        for batch in list_files(
            workspace_name=DEEPSET_WORKSPACE_DOCS_SEARCH, api_key=DEEPSET_API_KEY_DOCS_SEARCH, odata_filter=odata_filter
        )
        for f in batch
    ]

    print(f"Found {len(old_files_names)} old files to delete")
    if len(old_files_names) > MAX_DELETIONS_SAFETY_LIMIT:
        print(
            f"Found >{MAX_DELETIONS_SAFETY_LIMIT} old files to delete. "
            "Stopping because something could have gone wrong in the upload process."
        )
        sys.exit(1)

    if old_files_names:
        print("Deleting old docs files from deepset")
        delete_files(old_files_names)
        print("Deleted old docs files from deepset")


if __name__ == "__main__":
    main()