# video-clip-pipeline / clip_segmenter.py
  1  #!/usr/bin/env python3
  2  """
  3  Clip Segmenter - Find the best clip-worthy segments from Whisper transcripts
  4  Uses Claude API to analyze transcripts and identify standalone clips.
  5  
  6  Usage:
  7      python3 clip_segmenter.py --transcript path/to/transcript.json --output path/to/segments.json
  8      python3 clip_segmenter.py --transcript-dir transcripts/ --output-dir segments/
  9  """
 10  
 11  import argparse
 12  import json
 13  import os
 14  import sys
 15  from pathlib import Path
 16  import anthropic
 17  import re
 18  from datetime import datetime
 19  
 20  
 21  def load_transcript(json_file):
 22      """Load Whisper JSON transcript"""
 23      with open(json_file, 'r') as f:
 24          data = json.load(f)
 25      return data
 26  
 27  
 28  def format_timestamp(seconds):
 29      """Convert seconds to HH:MM:SS format"""
 30      hours = int(seconds // 3600)
 31      minutes = int((seconds % 3600) // 60)
 32      secs = int(seconds % 60)
 33      return f"{hours:02d}:{minutes:02d}:{secs:02d}"
 34  
 35  
 36  def create_full_transcript_text(whisper_data):
 37      """Create a readable transcript with timestamps for Claude"""
 38      segments = whisper_data.get('segments', [])
 39  
 40      transcript_lines = []
 41      for segment in segments:
 42          start = segment['start']
 43          text = segment['text'].strip()
 44          timestamp = format_timestamp(start)
 45          transcript_lines.append(f"[{timestamp}] {text}")
 46  
 47      return '\n'.join(transcript_lines)
 48  
 49  
 50  def analyze_with_claude(transcript_text, episode_title, anthropic_client,
 51                          model="claude-haiku-4-5-20250514", max_segments=5, min_hook_strength=6):
 52      """Send transcript to Claude for segment analysis"""
 53  
 54      prompt = f"""You are analyzing a podcast transcript to identify the best clip-worthy segments.
 55  
 56  Episode: {episode_title}
 57  
 58  TRANSCRIPT:
 59  {transcript_text}
 60  
 61  Please identify {max_segments} standalone segments that would work as viral clips. Each segment should:
 62  - Be 3-15 minutes long
 63  - Have a clear hook/opening that grabs attention
 64  - Contain a complete thought, story, or framework
 65  - Feel satisfying as a standalone watch
 66  - Have viral potential (contrarian takes, practical advice, compelling stories)
 67  
 68  For each segment, provide:
 69  - start_time: timestamp in seconds (not HH:MM:SS)
 70  - end_time: timestamp in seconds
 71  - suggested_title: Catchy, clickbait-worthy title (50 chars max)
 72  - one_line_description: What the clip is about
 73  - hook_strength: 1-10 rating for how compelling the opening is (only include if >= {min_hook_strength})
 74  - key_topics: 2-3 main topics covered
 75  
 76  Return ONLY a valid JSON array like this:
 77  [
 78      {{
 79          "start_time": 65,
 80          "end_time": 420,
 81          "suggested_title": "Why 99% of Founders Fail at AI",
 82          "one_line_description": "A breakdown of the critical mistake most entrepreneurs make when implementing AI",
 83          "hook_strength": 9,
 84          "key_topics": ["AI implementation", "founder mistakes", "business strategy"]
 85      }}
 86  ]
 87  
 88  Focus on segments with strong hooks, practical advice, and contrarian or surprising insights."""
 89  
 90      try:
 91          response = anthropic_client.messages.create(
 92              model=model,
 93              max_tokens=4000,
 94              messages=[{"role": "user", "content": prompt}]
 95          )
 96  
 97          response_text = response.content[0].text.strip()
 98  
 99          # Try to extract JSON from response
100          json_match = re.search(r'\[.*\]', response_text, re.DOTALL)
101          if json_match:
102              json_str = json_match.group(0)
103              segments = json.loads(json_str)
104              return segments
105          else:
106              print(f"Warning: No JSON found in Claude response for {episode_title}")
107              print("Response:", response_text[:200])
108              return []
109  
110      except Exception as e:
111          print(f"Error calling Claude API: {e}")
112          return []
113  
114  
def process_single(transcript_file, output_file, episode_title, client,
                    model="claude-haiku-4-5-20250514", max_segments=5, min_hook_strength=6):
    """Run the full pipeline for one transcript: load, analyze, annotate, save.

    Args:
        transcript_file: path to the Whisper JSON transcript.
        output_file: where to write the resulting segments JSON.
        episode_title: optional title; falls back to the transcript filename stem.
        client: an anthropic.Anthropic client instance.

    Returns:
        The list of segment dicts (possibly empty) that was written to disk.
    """
    print(f"\nProcessing: {transcript_file}")

    whisper_data = load_transcript(transcript_file)
    # Fall back to the filename stem when no explicit title was supplied.
    if not episode_title:
        episode_title = Path(transcript_file).stem

    transcript_text = create_full_transcript_text(whisper_data)
    print(f"Transcript length: {len(transcript_text)} chars")

    segments = analyze_with_claude(
        transcript_text, episode_title, client,
        model=model, max_segments=max_segments, min_hook_strength=min_hook_strength
    )
    print(f"Found {len(segments)} segments")

    # Attach provenance so downstream cutting tools know which video to clip.
    stem = Path(transcript_file).stem
    for segment in segments:
        segment['episode_file'] = stem
        # Assumes the source video shares the transcript's stem with an .mp4
        # extension — TODO confirm against the upstream pipeline layout.
        segment['video_file'] = stem + '.mp4'

    # Save segments (create parent dirs as needed).
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(segments, f, indent=2)

    print(f"Saved to: {output_path}")

    # Print summary. Use .get with defaults: the model may omit fields
    # (hook_strength in particular is optional per the prompt), and a bare
    # seg['hook_strength'] previously raised KeyError after the API spend.
    for i, seg in enumerate(segments, 1):
        duration = seg.get('end_time', 0) - seg.get('start_time', 0)
        title = seg.get('suggested_title', '(untitled)')
        hook = seg.get('hook_strength', 0)
        print(f"  {i}. {title} ({duration:.0f}s, hook: {hook}/10)")

    return segments
152  
153  
def main():
    """CLI entry point: segment one transcript or a whole directory of them."""
    parser = argparse.ArgumentParser(description="Find clip-worthy segments from Whisper transcripts")
    parser.add_argument("--transcript", help="Path to a single Whisper JSON transcript")
    parser.add_argument("--transcript-dir", help="Directory of Whisper JSON transcripts")
    parser.add_argument("--output", help="Output path for segment JSON (single file mode)")
    parser.add_argument("--output-dir", help="Output directory for segment JSONs (batch mode)")
    parser.add_argument("--episode-title", help="Episode title (optional)")
    parser.add_argument("--model", default="claude-haiku-4-5-20250514", help="Claude model to use")
    parser.add_argument("--max-segments", type=int, default=5, help="Max clips per episode (default: 5)")
    parser.add_argument("--min-hook-strength", type=int, default=6, help="Min hook score to include (default: 6)")
    args = parser.parse_args()

    if not args.transcript and not args.transcript_dir:
        parser.error("Provide --transcript or --transcript-dir")

    # Get Anthropic client (ANTHROPIC_KEY accepted as a legacy fallback).
    api_key = os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_KEY")
    if not api_key:
        print("Error: Set ANTHROPIC_API_KEY environment variable")
        sys.exit(1)
    client = anthropic.Anthropic(api_key=api_key)

    if args.transcript:
        # Single file mode: default output sits next to the transcript.
        output = args.output or str(Path(args.transcript).parent / f"{Path(args.transcript).stem}_segments.json")
        process_single(
            args.transcript, output, args.episode_title, client,
            model=args.model, max_segments=args.max_segments, min_hook_strength=args.min_hook_strength
        )
    else:
        # Batch mode
        transcript_dir = Path(args.transcript_dir)
        output_dir = Path(args.output_dir or "segments")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Skip this tool's own outputs: a re-run (or an output dir that
        # overlaps the transcript dir) would otherwise feed *_segments.json
        # and all_segments.json back into the pipeline as "transcripts".
        transcript_files = sorted(
            f for f in transcript_dir.glob("*.json")
            if not f.name.endswith("_segments.json")
        )
        print(f"Found {len(transcript_files)} transcripts to process")

        all_segments = []
        for transcript_file in transcript_files:
            output_file = output_dir / f"{transcript_file.stem}_segments.json"
            segments = process_single(
                str(transcript_file), str(output_file), None, client,
                model=args.model, max_segments=args.max_segments, min_hook_strength=args.min_hook_strength
            )
            all_segments.extend(segments)

        # Save combined segments across all episodes.
        combined_file = output_dir / "all_segments.json"
        with open(combined_file, 'w', encoding='utf-8') as f:
            json.dump(all_segments, f, indent=2)

        print(f"\n✅ Segmentation complete!")
        print(f"Total segments found: {len(all_segments)}")
        print(f"Combined segments saved to: {combined_file}")

        # Show top segments by hook strength. Use .get defaults throughout:
        # the model may omit optional fields, and a bare KeyError here would
        # discard an otherwise-successful batch run at the final print.
        top_segments = sorted(all_segments, key=lambda x: x.get('hook_strength', 0), reverse=True)[:5]
        print("\n🔥 Top 5 segments by hook strength:")
        for i, seg in enumerate(top_segments, 1):
            duration = seg.get('end_time', 0) - seg.get('start_time', 0)
            title = seg.get('suggested_title', '(untitled)')
            print(f"{i}. {title} (hook: {seg.get('hook_strength', 0)}/10, {duration:.0f}s)")
            print(f"   Episode: {seg.get('episode_file', '?')}")
216              print(f"   Episode: {seg['episode_file']}")
217  
218  
219  if __name__ == "__main__":
220      main()