# video-caption-generator/scripts/process_videos.py
  1  #!/usr/bin/env python3
  2  """
  3  process_videos.py — Transcribe new videos from a Google Drive folder,
  4  deduplicate by content, and generate captions + YT/FB titles via Claude API.
  5  
  6  Usage:
  7      python3 process_videos.py --folder-id <DRIVE_FOLDER_ID> [--processed-log <path>]
  8  
  9  Output: prints formatted results to stdout
 10  """
 11  
 12  import argparse
 13  import hashlib
 14  import json
 15  import os
 16  import subprocess
 17  import sys
 18  import tempfile
 19  from pathlib import Path
 20  
# --- Configuration: update these for your environment ---
# Each value can be overridden via the environment variable of the same name.
# Path to your Google Drive CLI tool (e.g., gws-gateway.sh, gdrive, rclone wrapper)
GWS_GATEWAY = os.environ.get("GWS_GATEWAY", "gws-gateway.sh")
# Path to your Whisper binary
WHISPER_BIN = os.environ.get("WHISPER_BIN", "whisper")
# Working directory for subprocess calls (defaults to the current directory)
WORKSPACE = os.environ.get("WORKSPACE", str(Path.cwd()))
 28  
 29  
 30  def gws(args: list[str]) -> dict:
 31      """Call the Google Drive CLI and return parsed JSON output."""
 32      cmd = [GWS_GATEWAY] + args
 33      result = subprocess.run(cmd, capture_output=True, text=True, cwd=WORKSPACE)
 34      try:
 35          return json.loads(result.stdout)
 36      except Exception:
 37          return {"error": result.stderr or result.stdout}
 38  
 39  
 40  def list_videos(folder_id: str) -> list[dict]:
 41      """List video files in a Google Drive folder."""
 42      data = gws([
 43          "drive", "files", "list",
 44          "--params", json.dumps({
 45              "q": f'"{folder_id}" in parents and mimeType contains "video/"',
 46              "fields": "files(id,name,mimeType,createdTime)",
 47              "orderBy": "createdTime desc"
 48          })
 49      ])
 50      return data.get("files", [])
 51  
 52  
 53  def download_video(file_id: str, dest: str) -> bool:
 54      """Download a video file from Google Drive."""
 55      data = gws([
 56          "drive", "files", "get",
 57          "--params", json.dumps({"fileId": file_id, "alt": "media"}),
 58          "-o", dest
 59      ])
 60      return data.get("status") == "success"
 61  
 62  
 63  def transcribe(video_path: str, out_dir: str) -> str | None:
 64      """Transcribe a video file using Whisper."""
 65      result = subprocess.run(
 66          [WHISPER_BIN, video_path,
 67           "--model", "turbo",
 68           "--output_format", "txt",
 69           "--output_dir", out_dir,
 70           "--language", "en"],
 71          capture_output=True, text=True
 72      )
 73      txt_path = Path(out_dir) / (Path(video_path).stem + ".txt")
 74      if txt_path.exists():
 75          return txt_path.read_text().strip()
 76      return None
 77  
 78  
 79  def content_hash(text: str) -> str:
 80      """Generate a content hash for deduplication."""
 81      return hashlib.md5(text.lower().split().__str__().encode()).hexdigest()
 82  
 83  
 84  def generate_caption_and_title(transcript: str, api_key: str) -> tuple[str, str]:
 85      """Use Claude API to generate a social caption and video title."""
 86      import urllib.request
 87      prompt = f"""Given this video transcript, write:
 88  1. A punchy social media caption (2-4 sentences, first person, no hashtags, conversational)
 89  2. A YouTube/Facebook title (under 60 chars, curiosity-driven, no clickbait)
 90  
 91  Transcript:
 92  {transcript}
 93  
 94  Respond in this exact format:
 95  CAPTION: <caption here>
 96  TITLE: <title here>"""
 97  
 98      payload = json.dumps({
 99          "model": "claude-sonnet-4-6",
100          "max_tokens": 300,
101          "messages": [{"role": "user", "content": prompt}]
102      }).encode()
103  
104      req = urllib.request.Request(
105          "https://api.anthropic.com/v1/messages",
106          data=payload,
107          headers={
108              "x-api-key": api_key,
109              "anthropic-version": "2023-06-01",
110              "content-type": "application/json"
111          }
112      )
113      with urllib.request.urlopen(req) as resp:
114          data = json.loads(resp.read())
115      text = data["content"][0]["text"]
116      caption = ""
117      title = ""
118      for line in text.splitlines():
119          if line.startswith("CAPTION:"):
120              caption = line[len("CAPTION:"):].strip()
121          elif line.startswith("TITLE:"):
122              title = line[len("TITLE:"):].strip()
123      return caption, title
124  
125  
def load_processed(log_path: str) -> set[str]:
    """Load the set of already-processed video IDs from a JSON array file.

    Returns an empty set when the log does not exist yet (first run).
    A corrupt log still raises deliberately: silently resetting it would
    re-download and re-caption every past video.
    """
    # EAFP instead of an exists() pre-check: avoids the check/open race
    # and is the idiomatic form.
    try:
        with open(log_path, encoding="utf-8") as f:
            return set(json.load(f))
    except FileNotFoundError:
        return set()
132  
133  
def save_processed(log_path: str, processed: set[str]):
    """Persist the processed-ID set as a JSON array.

    IDs are sorted so the log file is deterministic across runs and
    diffs cleanly (set iteration order is arbitrary).
    """
    with open(log_path, "w", encoding="utf-8") as f:
        json.dump(sorted(processed), f, indent=2)
138  
139  
def main():
    """CLI entry point.

    Scans a Drive folder for videos not yet in the processed log,
    transcribes each, tags within-batch duplicate transcripts as A/B
    variants, generates a caption + title via the Claude API, and prints
    a formatted report to stdout (progress messages go to stderr).
    """
    parser = argparse.ArgumentParser(
        description="Transcribe Drive videos and generate social captions + titles"
    )
    parser.add_argument("--folder-id", required=True,
                        help="Google Drive folder ID to scan for videos")
    parser.add_argument("--processed-log", default="processed_ids.json",
                        help="Path to JSON file tracking processed video IDs")
    args = parser.parse_args()

    # The Claude call is mandatory, so fail fast if the key is missing.
    api_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
    if not api_key:
        print("ERROR: Set ANTHROPIC_API_KEY environment variable", file=sys.stderr)
        sys.exit(1)

    processed_ids = load_processed(args.processed_log)
    videos = list_videos(args.folder_id)

    new_videos = [v for v in videos if v["id"] not in processed_ids]
    if not new_videos:
        print("No new videos found.")
        return

    seen_hashes: dict[str, str] = {}  # hash -> first video name (within this batch)
    results = []

    with tempfile.TemporaryDirectory() as tmpdir:
        for video in new_videos:
            vid_id = video["id"]
            vid_name = video["name"]
            dest = str(Path(tmpdir) / (vid_name.replace(" ", "_")))

            print(f"Downloading {vid_name}...", file=sys.stderr)
            ok = download_video(vid_id, dest)
            if not ok:
                # NOTE(review): failures are still marked processed, so they
                # are never retried on a later run — confirm this is intended.
                print("  SKIP (download failed)", file=sys.stderr)
                processed_ids.add(vid_id)
                continue

            print(f"Transcribing {vid_name}...", file=sys.stderr)
            transcript = transcribe(dest, tmpdir)
            if not transcript:
                print("  SKIP (transcription failed)", file=sys.stderr)
                processed_ids.add(vid_id)
                continue

            # An identical transcript hash within this batch means the same
            # content was uploaded twice (e.g. an A/B test): still process
            # it, but tag it with its sibling's name.
            h = content_hash(transcript)
            is_ab_variant = h in seen_hashes
            if is_ab_variant:
                print(f"  A/B variant of {seen_hashes[h]} — processing anyway", file=sys.stderr)
            else:
                seen_hashes[h] = vid_name

            print(f"Generating caption + title for {vid_name}...", file=sys.stderr)
            caption, title = generate_caption_and_title(transcript, api_key)

            result_entry = {
                "name": vid_name,
                "transcript": transcript,
                "caption": caption,
                "title": title
            }
            if is_ab_variant:
                result_entry["ab_variant_of"] = seen_hashes[h]
            results.append(result_entry)
            processed_ids.add(vid_id)

    save_processed(args.processed_log, processed_ids)

    # Print formatted output
    for r in results:
        ab_tag = f" (A/B variant of {r['ab_variant_of']})" if r.get('ab_variant_of') else ""
        print(f"\n*{r['name']}*{ab_tag}")
        print(f"📝 *Transcript:* {r['transcript']}")
        print(f"🎬 *Caption:* {r['caption']}")
        print(f"📺 *YT/FB Title:* {r['title']}")

    if not results:
        print("All new videos were duplicates — nothing new to report.")
220  
221  
# Allow importing this module (e.g. for testing) without running the CLI.
if __name__ == "__main__":
    main()