process_videos.py
1 #!/usr/bin/env python3 2 """ 3 process_videos.py — Transcribe new videos from a Google Drive folder, 4 deduplicate by content, and generate captions + YT/FB titles via Claude API. 5 6 Usage: 7 python3 process_videos.py --folder-id <DRIVE_FOLDER_ID> [--processed-log <path>] 8 9 Output: prints formatted results to stdout 10 """ 11 12 import argparse 13 import hashlib 14 import json 15 import os 16 import subprocess 17 import sys 18 import tempfile 19 from pathlib import Path 20 21 # --- Configuration: update these for your environment --- 22 # Path to your Google Drive CLI tool (e.g., gws-gateway.sh, gdrive, rclone wrapper) 23 GWS_GATEWAY = os.environ.get("GWS_GATEWAY", "gws-gateway.sh") 24 # Path to your Whisper binary 25 WHISPER_BIN = os.environ.get("WHISPER_BIN", "whisper") 26 # Working directory for subprocess calls 27 WORKSPACE = os.environ.get("WORKSPACE", str(Path.cwd())) 28 29 30 def gws(args: list[str]) -> dict: 31 """Call the Google Drive CLI and return parsed JSON output.""" 32 cmd = [GWS_GATEWAY] + args 33 result = subprocess.run(cmd, capture_output=True, text=True, cwd=WORKSPACE) 34 try: 35 return json.loads(result.stdout) 36 except Exception: 37 return {"error": result.stderr or result.stdout} 38 39 40 def list_videos(folder_id: str) -> list[dict]: 41 """List video files in a Google Drive folder.""" 42 data = gws([ 43 "drive", "files", "list", 44 "--params", json.dumps({ 45 "q": f'"{folder_id}" in parents and mimeType contains "video/"', 46 "fields": "files(id,name,mimeType,createdTime)", 47 "orderBy": "createdTime desc" 48 }) 49 ]) 50 return data.get("files", []) 51 52 53 def download_video(file_id: str, dest: str) -> bool: 54 """Download a video file from Google Drive.""" 55 data = gws([ 56 "drive", "files", "get", 57 "--params", json.dumps({"fileId": file_id, "alt": "media"}), 58 "-o", dest 59 ]) 60 return data.get("status") == "success" 61 62 63 def transcribe(video_path: str, out_dir: str) -> str | None: 64 """Transcribe a video file using Whisper.""" 65 result = subprocess.run( 66 [WHISPER_BIN, video_path, 67 "--model", "turbo", 68 "--output_format", "txt", 69 "--output_dir", out_dir, 70 "--language", "en"], 71 capture_output=True, text=True 72 ) 73 txt_path = Path(out_dir) / (Path(video_path).stem + ".txt") 74 if txt_path.exists(): 75 return txt_path.read_text().strip() 76 return None 77 78 79 def content_hash(text: str) -> str: 80 """Generate a content hash for deduplication.""" 81 return hashlib.md5(text.lower().split().__str__().encode()).hexdigest() 82 83 84 def generate_caption_and_title(transcript: str, api_key: str) -> tuple[str, str]: 85 """Use Claude API to generate a social caption and video title.""" 86 import urllib.request 87 prompt = f"""Given this video transcript, write: 88 1. A punchy social media caption (2-4 sentences, first person, no hashtags, conversational) 89 2. A YouTube/Facebook title (under 60 chars, curiosity-driven, no clickbait) 90 91 Transcript: 92 {transcript} 93 94 Respond in this exact format: 95 CAPTION: <caption here> 96 TITLE: <title here>""" 97 98 payload = json.dumps({ 99 "model": "claude-sonnet-4-6", 100 "max_tokens": 300, 101 "messages": [{"role": "user", "content": prompt}] 102 }).encode() 103 104 req = urllib.request.Request( 105 "https://api.anthropic.com/v1/messages", 106 data=payload, 107 headers={ 108 "x-api-key": api_key, 109 "anthropic-version": "2023-06-01", 110 "content-type": "application/json" 111 } 112 ) 113 with urllib.request.urlopen(req) as resp: 114 data = json.loads(resp.read()) 115 text = data["content"][0]["text"] 116 caption = "" 117 title = "" 118 for line in text.splitlines(): 119 if line.startswith("CAPTION:"): 120 caption = line[len("CAPTION:"):].strip() 121 elif line.startswith("TITLE:"): 122 title = line[len("TITLE:"):].strip() 123 return caption, title 124 125 126 def load_processed(log_path: str) -> set[str]: 127 """Load set of already-processed video IDs.""" 128 if not Path(log_path).exists(): 129 return set() 130 with open(log_path) as f: 131 return set(json.load(f)) 132 133 134 def save_processed(log_path: str, processed: set[str]): 135 """Save updated set of processed video IDs.""" 136 with open(log_path, "w") as f: 137 json.dump(list(processed), f, indent=2) 138 139 140 def main(): 141 parser = argparse.ArgumentParser( 142 description="Transcribe Drive videos and generate social captions + titles" 143 ) 144 parser.add_argument("--folder-id", required=True, 145 help="Google Drive folder ID to scan for videos") 146 parser.add_argument("--processed-log", default="processed_ids.json", 147 help="Path to JSON file tracking processed video IDs") 148 args = parser.parse_args() 149 150 # Load Anthropic API key from environment 151 api_key = os.environ.get("ANTHROPIC_API_KEY", "").strip() 152 if not api_key: 153 print("ERROR: Set ANTHROPIC_API_KEY environment variable", file=sys.stderr) 154 sys.exit(1) 155 156 processed_ids = load_processed(args.processed_log) 157 videos = list_videos(args.folder_id) 158 159 new_videos = [v for v in videos if v["id"] not in processed_ids] 160 if not new_videos: 161 print("No new videos found.") 162 return 163 164 seen_hashes: dict[str, str] = {} # hash -> first video name (within this batch) 165 results = [] 166 167 with tempfile.TemporaryDirectory() as tmpdir: 168 for video in new_videos: 169 vid_id = video["id"] 170 vid_name = video["name"] 171 dest = str(Path(tmpdir) / (vid_name.replace(" ", "_"))) 172 173 print(f"Downloading {vid_name}...", file=sys.stderr) 174 ok = download_video(vid_id, dest) 175 if not ok: 176 print(f" SKIP (download failed)", file=sys.stderr) 177 processed_ids.add(vid_id) 178 continue 179 180 print(f"Transcribing {vid_name}...", file=sys.stderr) 181 transcript = transcribe(dest, tmpdir) 182 if not transcript: 183 print(f" SKIP (transcription failed)", file=sys.stderr) 184 processed_ids.add(vid_id) 185 continue 186 187 h = content_hash(transcript) 188 is_ab_variant = h in seen_hashes 189 if is_ab_variant: 190 print(f" A/B variant of {seen_hashes[h]} — processing anyway", file=sys.stderr) 191 else: 192 seen_hashes[h] = vid_name 193 194 print(f"Generating caption + title for {vid_name}...", file=sys.stderr) 195 caption, title = generate_caption_and_title(transcript, api_key) 196 197 result_entry = { 198 "name": vid_name, 199 "transcript": transcript, 200 "caption": caption, 201 "title": title 202 } 203 if is_ab_variant: 204 result_entry["ab_variant_of"] = seen_hashes[h] 205 results.append(result_entry) 206 processed_ids.add(vid_id) 207 208 save_processed(args.processed_log, processed_ids) 209 210 # Print formatted output 211 for r in results: 212 ab_tag = f" (A/B variant of {r['ab_variant_of']})" if r.get('ab_variant_of') else "" 213 print(f"\n*{r['name']}*{ab_tag}") 214 print(f"📝 *Transcript:* {r['transcript']}") 215 print(f"🎬 *Caption:* {r['caption']}") 216 print(f"📺 *YT/FB Title:* {r['title']}") 217 218 if not results: 219 print("All new videos were duplicates — nothing new to report.") 220 221 222 if __name__ == "__main__": 223 main()