#!/usr/bin/env python3
"""
Clip Segmenter - Find the best clip-worthy segments from Whisper transcripts.

Sends a timestamped rendering of each transcript to the Claude API and asks it
to identify standalone, clip-worthy segments, which are saved as JSON.

Usage:
    python3 clip_segmenter.py --transcript path/to/transcript.json --output path/to/segments.json
    python3 clip_segmenter.py --transcript-dir transcripts/ --output-dir segments/
"""

import argparse
import json
import os
import re
import sys
from datetime import datetime  # NOTE(review): unused here; kept in case other tooling imports it
from pathlib import Path

# The Anthropic SDK is only required when actually calling the API (main()).
# Guarding the import keeps the pure helpers importable/testable without it.
try:
    import anthropic
except ImportError:
    anthropic = None

# Single source of truth for the default model (was repeated in three signatures).
DEFAULT_MODEL = "claude-haiku-4-5-20250514"


def load_transcript(json_file):
    """Load a Whisper JSON transcript and return the parsed dict.

    Args:
        json_file: Path to a Whisper JSON output file.

    Returns:
        The parsed JSON as a dict.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        return json.load(f)


def format_timestamp(seconds):
    """Convert a duration in seconds to an HH:MM:SS string (fractions truncated)."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"


def create_full_transcript_text(whisper_data):
    """Render Whisper segments as '[HH:MM:SS] text' lines for the Claude prompt.

    Args:
        whisper_data: Parsed Whisper JSON; expects a 'segments' list whose items
            carry 'start' (seconds, numeric) and 'text' (str) keys.

    Returns:
        A newline-joined transcript string; empty string if there are no segments.
    """
    transcript_lines = []
    for segment in whisper_data.get('segments', []):
        timestamp = format_timestamp(segment['start'])
        transcript_lines.append(f"[{timestamp}] {segment['text'].strip()}")
    return '\n'.join(transcript_lines)


def analyze_with_claude(transcript_text, episode_title, anthropic_client,
                        model=DEFAULT_MODEL, max_segments=5, min_hook_strength=6):
    """Ask Claude to identify clip-worthy segments in a transcript.

    Args:
        transcript_text: Timestamped transcript (see create_full_transcript_text).
        episode_title: Title used in the prompt and in warning messages.
        anthropic_client: Client exposing .messages.create(...) (anthropic.Anthropic
            or any compatible stub).
        model: Claude model identifier.
        max_segments: How many segments to request.
        min_hook_strength: Minimum hook score a segment must have to be returned.

    Returns:
        A list of segment dicts, or [] if the API call fails or the response
        contains no parseable JSON array. Never raises — a failed episode must
        not abort a batch run.
    """
    prompt = f"""You are analyzing a podcast transcript to identify the best clip-worthy segments.

Episode: {episode_title}

TRANSCRIPT:
{transcript_text}

Please identify {max_segments} standalone segments that would work as viral clips. Each segment should:
- Be 3-15 minutes long
- Have a clear hook/opening that grabs attention
- Contain a complete thought, story, or framework
- Feel satisfying as a standalone watch
- Have viral potential (contrarian takes, practical advice, compelling stories)

For each segment, provide:
- start_time: timestamp in seconds (not HH:MM:SS)
- end_time: timestamp in seconds
- suggested_title: Catchy, clickbait-worthy title (50 chars max)
- one_line_description: What the clip is about
- hook_strength: 1-10 rating for how compelling the opening is (always include this field; only return segments rated >= {min_hook_strength})
- key_topics: 2-3 main topics covered

Return ONLY a valid JSON array like this:
[
  {{
    "start_time": 65,
    "end_time": 420,
    "suggested_title": "Why 99% of Founders Fail at AI",
    "one_line_description": "A breakdown of the critical mistake most entrepreneurs make when implementing AI",
    "hook_strength": 9,
    "key_topics": ["AI implementation", "founder mistakes", "business strategy"]
  }}
]

Focus on segments with strong hooks, practical advice, and contrarian or surprising insights."""

    try:
        response = anthropic_client.messages.create(
            model=model,
            max_tokens=4000,
            messages=[{"role": "user", "content": prompt}],
        )
        response_text = response.content[0].text.strip()
    except Exception as e:
        # Best-effort by design: log and continue with the next episode.
        print(f"Error calling Claude API: {e}")
        return []

    # The model may wrap the JSON in prose; grab the outermost [...] span.
    json_match = re.search(r'\[.*\]', response_text, re.DOTALL)
    if not json_match:
        print(f"Warning: No JSON found in Claude response for {episode_title}")
        print("Response:", response_text[:200])
        return []

    try:
        segments = json.loads(json_match.group(0))
    except json.JSONDecodeError as e:
        print(f"Warning: Malformed JSON in Claude response for {episode_title}: {e}")
        return []

    # Guard against a top-level JSON object (or scalar) sneaking through.
    if not isinstance(segments, list):
        print(f"Warning: Expected a JSON array for {episode_title}, got {type(segments).__name__}")
        return []
    return segments


def process_single(transcript_file, output_file, episode_title, client,
                   model=DEFAULT_MODEL, max_segments=5, min_hook_strength=6):
    """Segment one transcript file and write the result JSON to output_file.

    Args:
        transcript_file: Path to a Whisper JSON transcript.
        output_file: Path where the segment JSON array is written.
        episode_title: Optional title; defaults to the transcript filename stem.
        client: Anthropic client (or compatible stub) passed to analyze_with_claude.
        model / max_segments / min_hook_strength: Forwarded to analyze_with_claude.

    Returns:
        The list of segment dicts (possibly empty), each annotated with
        'episode_file' and 'video_file' provenance keys.
    """
    print(f"\nProcessing: {transcript_file}")

    whisper_data = load_transcript(transcript_file)
    if not episode_title:
        episode_title = Path(transcript_file).stem

    transcript_text = create_full_transcript_text(whisper_data)
    print(f"Transcript length: {len(transcript_text)} chars")

    segments = analyze_with_claude(
        transcript_text, episode_title, client,
        model=model, max_segments=max_segments, min_hook_strength=min_hook_strength
    )
    print(f"Found {len(segments)} segments")

    # Attach provenance so each clip can be traced back to its source video.
    stem = Path(transcript_file).stem
    for segment in segments:
        segment['episode_file'] = stem
        segment['video_file'] = stem + '.mp4'

    # Save segments (ensure_ascii=False keeps emoji/diacritics readable on disk).
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(segments, f, indent=2, ensure_ascii=False)

    print(f"Saved to: {output_path}")

    # Summary. Use .get() throughout — the model occasionally omits fields.
    for i, seg in enumerate(segments, 1):
        duration = seg.get('end_time', 0) - seg.get('start_time', 0)
        hook = seg.get('hook_strength', 0)
        print(f"  {i}. {seg.get('suggested_title', '(untitled)')} ({duration:.0f}s, hook: {hook}/10)")

    return segments


def main():
    """CLI entry point: segment a single transcript or a whole directory."""
    parser = argparse.ArgumentParser(description="Find clip-worthy segments from Whisper transcripts")
    parser.add_argument("--transcript", help="Path to a single Whisper JSON transcript")
    parser.add_argument("--transcript-dir", help="Directory of Whisper JSON transcripts")
    parser.add_argument("--output", help="Output path for segment JSON (single file mode)")
    parser.add_argument("--output-dir", help="Output directory for segment JSONs (batch mode)")
    parser.add_argument("--episode-title", help="Episode title (optional)")
    parser.add_argument("--model", default=DEFAULT_MODEL, help="Claude model to use")
    parser.add_argument("--max-segments", type=int, default=5, help="Max clips per episode (default: 5)")
    parser.add_argument("--min-hook-strength", type=int, default=6, help="Min hook score to include (default: 6)")
    args = parser.parse_args()

    if not args.transcript and not args.transcript_dir:
        parser.error("Provide --transcript or --transcript-dir")

    # Fail with a clear message if the SDK was not importable at module load.
    if anthropic is None:
        print("Error: The 'anthropic' package is not installed (pip install anthropic)")
        sys.exit(1)

    # Get Anthropic client (ANTHROPIC_KEY is a legacy fallback name).
    api_key = os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_KEY")
    if not api_key:
        print("Error: Set ANTHROPIC_API_KEY environment variable")
        sys.exit(1)
    client = anthropic.Anthropic(api_key=api_key)

    if args.transcript:
        # Single file mode
        output = args.output or str(Path(args.transcript).parent / f"{Path(args.transcript).stem}_segments.json")
        process_single(
            args.transcript, output, args.episode_title, client,
            model=args.model, max_segments=args.max_segments, min_hook_strength=args.min_hook_strength
        )
        return

    # Batch mode
    transcript_dir = Path(args.transcript_dir)
    output_dir = Path(args.output_dir or "segments")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Skip files this tool itself produces, in case output_dir overlaps
    # transcript_dir; sort for a deterministic processing order.
    transcript_files = sorted(
        p for p in transcript_dir.glob("*.json")
        if not p.name.endswith("_segments.json") and p.name != "all_segments.json"
    )
    print(f"Found {len(transcript_files)} transcripts to process")

    all_segments = []
    for transcript_file in transcript_files:
        output_file = output_dir / f"{transcript_file.stem}_segments.json"
        all_segments.extend(process_single(
            str(transcript_file), str(output_file), None, client,
            model=args.model, max_segments=args.max_segments, min_hook_strength=args.min_hook_strength
        ))

    # Save combined segments
    combined_file = output_dir / "all_segments.json"
    with open(combined_file, 'w', encoding='utf-8') as f:
        json.dump(all_segments, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Segmentation complete!")
    print(f"Total segments found: {len(all_segments)}")
    print(f"Combined segments saved to: {combined_file}")

    # Show top segments by hook strength (defensive .get: field may be absent).
    top_segments = sorted(all_segments, key=lambda s: s.get('hook_strength', 0), reverse=True)[:5]
    print("\n🔥 Top 5 segments by hook strength:")
    for i, seg in enumerate(top_segments, 1):
        duration = seg.get('end_time', 0) - seg.get('start_time', 0)
        print(f"{i}. {seg.get('suggested_title', '(untitled)')} (hook: {seg.get('hook_strength', 0)}/10, {duration:.0f}s)")
        print(f"   Episode: {seg.get('episode_file', '?')}")


if __name__ == "__main__":
    main()