# video_models/video_utils.py
"""
Module for video analysis and non-verbal information extraction
This unified module contains functions for:
1. Video content extraction
2. Video content analysis
3. Non-verbal cues extraction
4. Non-verbal elements analysis
"""

import os
import gc
import time
import torch
import numpy as np
import traceback
from tempfile import NamedTemporaryFile
import logging
from typing import Tuple, List, Optional, Dict, Any, Callable

# Logging configuration
logger = logging.getLogger("video_analyzer")

# Global variables to track model states
internvideo_model_loaded = False
deepseek_model_loaded = False

# Cached model instances.
# NOTE: the previous revision only tracked the booleans above, so once a model
# was flagged as loaded there was no instance to hand back: the load_* helpers
# returned None and the analyze_* functions crashed with a NameError on their
# second call.  Holding the objects here fixes both problems.
_internvideo_model = None
_internvideo_tokenizer = None
_deepseek_model = None

# Shared constants
INTERNVIDEO_MODEL_PATH = "OpenGVLab/InternVideo2_5_Chat_8B"
DEEPSEEK_MODEL_PATH = "huihui-ai/DeepSeek-R1-Distill-Qwen-14B-abliterated-v2"
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


# Shared utility functions
def build_transform(input_size: int = 448):
    """Creates the torchvision transform pipeline for input images.

    Converts to RGB, resizes to a square of ``input_size`` pixels (bicubic),
    converts to a tensor and normalizes with ImageNet statistics.
    """
    from torchvision import transforms as T
    from torchvision.transforms.functional import InterpolationMode

    return T.Compose([
        T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])


def get_dynamic_segments(video_path: str) -> int:
    """Determines the optimal number of segments based on video duration."""
    from decord import VideoReader, cpu

    vr = VideoReader(video_path, ctx=cpu(0))
    fps = float(vr.get_avg_fps())
    duration = len(vr) / fps

    if duration < 10:      # Very short video (< 10 sec)
        num_segments = 16
    elif duration < 60:    # Short video (10s - 1 min)
        num_segments = 60
    elif duration < 140:   # Medium-short video
        num_segments = 140
    elif duration < 300:   # Medium video (1 min - 5 min)
        num_segments = 300
    else:                  # Long video (> 5 min)
        num_segments = 400

    return min(num_segments, 400)  # Limit to 400 segments


def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    """Calculates the frame indices to extract.

    Args:
        bound: optional ``(start_sec, end_sec)`` tuple; when ``None`` the whole
            clip (``first_idx`` .. ``max_frame``) is sampled.
        fps: average frames per second of the clip.
        max_frame: index of the last frame.
        first_idx: first eligible frame index.
        num_segments: how many evenly-spaced samples to take.

    Returns:
        ``np.ndarray`` of ``num_segments`` frame indices, each taken at the
        midpoint of its segment.
    """
    start_idx = max(first_idx, round(bound[0] * fps)) if bound else first_idx
    end_idx = min(round(bound[1] * fps), max_frame) if bound else max_frame
    seg_size = float(end_idx - start_idx) / num_segments
    return np.array([
        int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
        for idx in range(num_segments)
    ])


def load_video(video_path: str, num_segments: int = 128, input_size: int = 448,
               progress: Optional[Callable] = None) -> Tuple[torch.Tensor, List[int]]:
    """Loads and preprocesses video frames.

    Returns a ``(pixel_values, num_patches_list)`` pair where ``pixel_values``
    is a ``(num_segments, 3, input_size, input_size)`` tensor and
    ``num_patches_list`` holds one patch count (always 1) per sampled frame.
    """
    from decord import VideoReader, cpu
    from PIL import Image

    vr = VideoReader(video_path, ctx=cpu(0))
    max_frame = len(vr) - 1
    fps = float(vr.get_avg_fps())

    pixel_values_list = []
    num_patches_list = []
    transform = build_transform(input_size=input_size)

    frame_indices = get_index(None, fps, max_frame, num_segments=num_segments)

    # Image processing with progress updates
    for i, frame_index in enumerate(frame_indices):
        if i % 10 == 0 and progress:  # Update every 10 images
            progress_val = 0.1 + 0.3 * (i / len(frame_indices))
            progress(progress_val, desc=f"Processing images ({i}/{len(frame_indices)})...")

        img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
        pixel_values = transform(img).unsqueeze(0)
        num_patches_list.append(1)
        pixel_values_list.append(pixel_values)

    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list


def unload_internvideo_model() -> bool:
    """Frees memory of the InternVideo model.

    Drops the cached model/tokenizer references (previously nothing was
    released before ``empty_cache``, so GPU memory was never reclaimed),
    then collects garbage and empties the CUDA cache.

    Returns:
        ``True`` when a loaded model was freed, ``False`` otherwise.
    """
    global internvideo_model_loaded, _internvideo_model, _internvideo_tokenizer
    if internvideo_model_loaded:
        try:
            # Release references first so gc/empty_cache can actually reclaim memory.
            _internvideo_model = None
            _internvideo_tokenizer = None
            gc.collect()
            torch.cuda.empty_cache()
            internvideo_model_loaded = False
            return True
        except Exception as e:
            logger.error(f"Error while freeing InternVideo model: {str(e)}")
            return False
    return False


def load_internvideo_model():
    """Loads (or returns the cached) InternVideo model and tokenizer.

    Returns:
        ``(model, tokenizer)`` on success, ``(None, None)`` on failure.
        Repeated calls return the cached instances instead of reloading
        from disk (the previous revision returned ``(None, None)`` once the
        loaded flag was set, making the cache unusable).
    """
    global internvideo_model_loaded, _internvideo_model, _internvideo_tokenizer

    if internvideo_model_loaded:
        return _internvideo_model, _internvideo_tokenizer

    try:
        from transformers import AutoModel, AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained(INTERNVIDEO_MODEL_PATH, trust_remote_code=True)
        model = AutoModel.from_pretrained(INTERNVIDEO_MODEL_PATH, trust_remote_code=True).half().cuda()
        model = model.to(torch.bfloat16)
        _internvideo_model = model
        _internvideo_tokenizer = tokenizer
        internvideo_model_loaded = True
        return model, tokenizer
    except Exception as e:
        logger.error(f"Error while loading InternVideo model: {str(e)}")
        return None, None


def load_deepseek_model():
    """Loads (or returns the cached) DeepSeek vLLM model.

    Returns:
        The ``vllm.LLM`` instance on success, ``None`` on failure.  Repeated
        calls return the cached instance (previously ``None`` was returned
        once the loaded flag was set, which made callers crash).
    """
    global deepseek_model_loaded, _deepseek_model

    if deepseek_model_loaded:
        return _deepseek_model

    try:
        from vllm import LLM
        os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

        model = LLM(
            model=DEEPSEEK_MODEL_PATH,
            dtype="half",
            tensor_parallel_size=torch.cuda.device_count(),
            gpu_memory_utilization=0.85,
            max_model_len=19760,
            trust_remote_code=True,
            enforce_eager=False,
        )
        _deepseek_model = model
        deepseek_model_loaded = True
        return model
    except Exception as e:
        logger.error(f"Error while loading DeepSeek model: {str(e)}")
        return None


# Prompts for different analyses
VIDEO_CONTENT_PROMPT = """# Video Content Extraction Prompt System

## SYSTEM INSTRUCTIONS

You are a specialized system designed to extract and document all observable components from video content. Your purpose is to create a comprehensive, objective record of visual elements without analyzing or interpreting their meaning. This detailed extraction will serve as input for subsequent analytical systems.

## EXTRACTION METHODOLOGY

For each video, proceed through these extraction phases:

1. **Initial Overview**: Identify basic video parameters
2. **Visual Component Extraction**: Document all visual elements sequentially
3. **Temporal Sequencing**: Map the progression of components through time
4. **Component Relationship Mapping**: Note spatial and temporal relationships

## COMPONENT EXTRACTION GUIDELINES

Extract and document each of the following component categories in detail:

### 1. Technical Parameters
- **Video Quality**: Resolution, frame rate, aspect ratio
- **Duration**: Total length, timestamp format
- **Format**: Video codec, container format
- **Technical Issues**: Visible compression artifacts, frame drops, or other technical anomalies

### 2. Visual Composition
- **Shot Types**: Close-up, medium shot, wide shot, extreme close-up, etc.
- **Camera Angles**: High angle, low angle, eye level, bird's eye view, etc.
- **Camera Movements**: Pan, tilt, tracking, zoom, static, handheld, stabilized, etc.
- **Framing**: Rule of thirds positioning, headroom, lead room, symmetry/asymmetry
- **Depth of Field**: Shallow, deep, rack focus events
- **Composition**: Foreground, midground, background elements and their arrangement

### 3. Lighting and Color
- **Lighting Setup**: High-key, low-key, natural, artificial, direction of light
- **Lighting Quality**: Hard, soft, diffused, direct
- **Colorimetry**: Color palette, saturation levels, temperature (warm/cool)
- **Contrast Levels**: High contrast, low contrast
- **Color Grading**: Visible filters, stylistic color treatments
- **Time of Day**: Daytime, nighttime, golden hour, etc.

### 4. Environment and Setting
- **Location Type**: Indoor, outdoor, studio, natural environment
- **Setting Details**: Urban, rural, domestic, public, private
- **Set Design Elements**: Furniture, decorations, props, architectural features
- **Weather Conditions**: If outdoors - clear, cloudy, rainy, snowy, etc.
- **Time Period Indicators**: Modern, historical, futuristic elements

### 5. People and Characters
- **Number of People**: Total count, entries/exits during footage
- **Physical Characteristics**: Age range, gender presentation, ethnicity, clothing, distinctive features
- **Positioning**: Standing, sitting, walking, relative positions between people
- **Facial Expressions**: Detailed documentation of visible expressions (smiling, frowning, neutral, etc.)
- **Body Language**: Posture, gestures, proxemics (physical distance between people)
- **Eye Direction**: Where subjects are looking
- **Physical Actions**: What subjects are physically doing

### 6. Text Elements
- **On-screen Text**: Titles, subtitles, captions, credits, watermarks
- **Text in Scene**: Signs, books, screens, clothing with text
- **Text Style**: Font, size, color, animation
- **Text Positioning**: Where text appears on screen
- **Duration**: How long text remains visible
- **Language**: What language(s) appears in text

### 7. Graphics and Visual Effects
- **Graphic Elements**: Logos, icons, illustrations, diagrams
- **Animation**: Moving graphics, style of animation
- **Visual Effects**: CGI elements, compositing, filters
- **Transitions**: Cuts, dissolves, wipes, fades
- **Screen Graphics**: User interfaces, screens within the video
- **Overlays**: Information graphics, lower thirds, watermarks

### 8. Temporal Elements
- **Editing Pace**: Shot length, cutting patterns
- **Time Manipulation**: Slow motion, time-lapse, freeze frames
- **Sequence of Events**: Chronological documentation of what happens
- **Scene Changes**: Transitions between different locations or settings
- **Timestamp References**: Noting when specific elements appear and disappear

### 9. Production Context (if evident)
- **Production Type**: Professional, amateur, social media, broadcast, film
- **Visible Equipment**: Microphones, lights, reflectors in frame
- **Production Credits**: Visible information about creators

## OUTPUT FORMAT

Structure your extraction in this format:
VIDEO EXTRACTION REPORT
Basic Parameters
* Title (if known): [title]
* Duration: [time]
* Resolution: [resolution]
* Aspect Ratio: [ratio]
Visual Component Timeline
[00:00-00:00] [Detailed description of visual elements during this timeframe] [00:00-00:00] [Next segment description] ...
People and Characters
* Person 1: [Detailed description]
* Visible at: [Timestamp ranges]
* Actions: [Description of what they do]
* Expressions: [Description of notable expressions]
* Person 2: [...]
Text Elements
* [00:00-00:00] [Description of text content, style, position]
* [00:00-00:00] [...]
Graphics and Effects
* [00:00-00:00] [Description of graphics or effects]
* [00:00-00:00] [...]
Technical Elements
* Camera Angles: [List all observed camera angles with timestamps]
* Shot Types: [List all observed shot types with timestamps]
* Camera Movements: [List all observed movements with timestamps]
* Lighting Conditions: [List all observed lighting conditions with timestamps]
* Color Palette: [Description of dominant colors and changes]
* Editing Techniques: [Description of evident editing choices]
Component Relationships
* [Description of notable spatial relationships between elements]
* [Description of notable temporal relationships between elements]


## IMPORTANT GUIDELINES

1. **Record ONLY what is directly observable** in the video
2. **DO NOT analyze, interpret, or evaluate** the content
3. **Avoid subjective judgments** about quality, intent, or meaning
4. **Do not speculate** about anything not visible in the video
5. **Be precise and comprehensive** in documenting all components
6. **Maintain objective, neutral language** throughout
7. **If uncertain about any element**, note the uncertainty rather than guessing
8. **Document timestamps** as accurately as possible
9. **Prioritize completeness** - capture all relevant visual elements
10. **Focus on EXTRACTION ONLY** - leave all analysis to subsequent systems

REMEMBER: Your role is solely to extract and document components, not to analyze them. Provide a comprehensive extraction that will serve as a foundation for later analytical systems.
"""

NONVERBAL_EXTRACTION_PROMPT = """# Enhanced Non-Verbal and Expression Video Extraction System

## SYSTEM INSTRUCTIONS

You are a specialized system designed to extract and document all non-verbal communication, facial expressions, and body language elements from video content with extreme granularity and precision. Your purpose is to create a comprehensive, objective record of these human behavioral components without analyzing or interpreting their meaning.

## EXTRACTION METHODOLOGY

For each video, employ this hyper-granular extraction process:
1. **Frame-by-Frame Subject Identification**: Track all visible people
2. **Micro-Level Facial Analysis**: Document all facial movements
3. **Comprehensive Body Language Extraction**: Document all posture, gestures, and movements
4. **Multi-dimensional Proxemics Extraction**: Document spatial relationships
5. **Temporal Micro-Tracking**: Map the progression of non-verbal cues

## OUTPUT FORMAT

Structure your extraction in a detailed, systematic format capturing all observable non-verbal elements.

ANALYSIS START:
"""

NONVERBAL_ANALYSIS_PROMPT_TEMPLATE = """
Non-Verbal Communication Analysis System
SYSTEM INSTRUCTIONS
You are a specialized system designed to analyze and interpret the non-verbal communication, facial expressions, and body language documented in video extraction reports. Your purpose is to provide insightful analysis of these behavioral components, identifying patterns, potential meanings, and psychological implications.

ANALYSIS METHODOLOGY
For each extraction report, proceed through these analytical phases:
1. Emotional State Analysis: Interpret facial expressions and body language to identify emotional states
2. Congruence Assessment: Evaluate alignment between different non-verbal channels
3. Interpersonal Dynamic Analysis: Interpret relationship indicators and status displays
4. Pattern Recognition: Identify recurring behaviors and their potential significance
5. Contextual Integration: Consider how setting and situation inform behavioral interpretation

OUTPUT FORMAT
Structure your analysis in this format:
## NON-VERBAL COMMUNICATION ANALYSIS REPORT

### Executive Summary
[Brief overview of key findings and significant patterns]

### Emotional State Analysis
[Analysis of emotional states, changes, and potential causes]

### Communication Intent Assessment
[Analysis of what subject appears to be communicating non-verbally]

### Interpersonal Dynamic Analysis
[Analysis of relationship indicators, power dynamics, rapport, and group dynamics]

### Credibility and Congruence Assessment
[Analysis of alignment between different non-verbal channels and overall authenticity]

### Psychological State Indicators
[Analysis of comfort, stress, cognitive load, and attitudinal indicators]

### Key Behavioral Patterns
[Analysis of significant recurring behaviors and their potential meanings]

ANALYTICAL PRINCIPLES
1. Balance confidence with uncertainty - Acknowledge the probabilistic nature of non-verbal interpretation
2. Consider cultural and contextual factors in all interpretations
3. Identify multiple potential interpretations where appropriate
4. Distinguish between observation and inference - Clearly separate what was observed from what it might mean

Here is the text: "{extraction_text}"
ANALYSIS START:
"""

# Hoisted to module level for consistency with NONVERBAL_ANALYSIS_PROMPT_TEMPLATE
# (this was previously an inline f-string in analyze_manipulation_strategies).
MANIPULATION_ANALYSIS_PROMPT_TEMPLATE = """
Video Manipulation Strategies Analysis System
SYSTEM INSTRUCTIONS
You are a specialized system designed to analyze video content extractions and identify potential persuasion, manipulation, and influence strategies employed in the video. Your purpose is to objectively identify and explain these strategies without making any political judgments.

ANALYSIS METHODOLOGY
For each video extraction report, proceed through these analytical phases:
1. Narrative Structure Analysis: Identify how the story is constructed
2. Visual and Production Technique Analysis: Examine camera work, editing, lighting, etc.
3. Emotional Appeal Analysis: Identify emotional triggers and psychological techniques
4. Rhetorical Strategy Analysis: Identify persuasion and argument techniques
5. Information Presentation Analysis: Examine how facts, evidence, and claims are presented

OUTPUT FORMAT
Structure your analysis in this format:
## VIDEO MANIPULATION STRATEGIES ANALYSIS REPORT

### Executive Summary
[Brief overview of key findings and significant manipulation strategies detected]

### Narrative Structure
[Analysis of storytelling approach, framing techniques, perspective control]

### Visual and Production Techniques
[Analysis of camera angles, editing choices, visual symbolism, color psychology]

### Emotional Appeal Strategies
[Analysis of emotional triggers, psychological techniques, identity appeals]

### Rhetorical and Linguistic Strategies
[Analysis of language patterns, argument structures, rhetorical devices]

### Information Management Techniques
[Analysis of evidence presentation, information selection/omission, source handling]

### Audience Targeting
[Analysis of how content targets specific audiences or demographics]

### Manipulation Risk Assessment
[Assessment of overall manipulation potential and ethical considerations]

PRINCIPLES FOR ANALYSIS
1. Maintain political neutrality - Focus on techniques, not ideological positions
2. Distinguish between persuasion and manipulation
3. Consider context and audience expectations
4. Document evidence for each identified strategy
5. Acknowledge normal vs. problematic uses of influence techniques

Here is the video extraction text: "{extraction_text}"
ANALYSIS START:
"""


# Internal helpers
def _save_text_to_tempfile(text: str) -> str:
    """Writes ``text`` to a persistent temporary .txt file and returns its path."""
    # delete=False: the caller hands the path to the UI layer for download.
    with NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as tmp:
        tmp.write(text)
        return tmp.name


def _run_internvideo_extraction(video_path: str, prompt_text: str,
                                progress: Optional[Callable],
                                run_desc: str, save_desc: str) -> Tuple[str, str]:
    """Shared InternVideo pipeline: load model, sample frames, run chat, save result.

    Raises on any failure; callers wrap this in try/except and convert the
    exception into an error-message return value.
    """
    if progress:
        progress(0, desc="Loading InternVideo2.5 model...")

    model, tokenizer = load_internvideo_model()
    if model is None:
        raise RuntimeError("Failed to load InternVideo model")

    if progress:
        progress(0.5, desc="Determining optimal number of frames...")

    # Get the optimal number of segments
    num_segments = get_dynamic_segments(video_path)

    if progress:
        progress(0.6, desc="Processing video frames...")

    # Loading and processing video frames
    pixel_values, num_patches_list = load_video(video_path, num_segments=num_segments, progress=progress)
    pixel_values = pixel_values.to(torch.bfloat16).to(model.device)

    if progress:
        progress(0.7, desc="Building prompt...")

    # Building prompt with one <image> placeholder per sampled frame
    video_prefix = "".join(f"Frame {i+1}: <image>\n" for i in range(len(num_patches_list)))
    full_prompt = video_prefix + prompt_text

    if progress:
        progress(0.8, desc=run_desc)

    # Running the model
    with torch.no_grad():
        result = model.chat(
            tokenizer, pixel_values, full_prompt,
            dict(
                do_sample=True,
                temperature=0.53,
                max_new_tokens=8500,
                top_p=0.93,
                top_k=30,
            ),
            num_patches_list=num_patches_list,
            history=None, return_history=False,
        )

    if progress:
        progress(0.9, desc=save_desc)

    temp_path = _save_text_to_tempfile(result)

    if progress:
        progress(1.0, desc="Extraction completed!")

    return result, temp_path


def _run_deepseek_analysis(prompt: str, progress: Optional[Callable]) -> str:
    """Shared DeepSeek pipeline: swap out InternVideo, generate, return text.

    Raises on any failure; callers convert the exception into an error string.
    """
    if progress:
        progress(0, desc="Preparing DeepSeek model...")

    # Free memory of the vision model if it was loaded (both models don't fit together)
    if internvideo_model_loaded:
        if progress:
            progress(0.1, desc="Freeing InternVideo model memory...")
        unload_internvideo_model()

    if progress:
        progress(0.2, desc="Loading DeepSeek model...")

    from vllm import SamplingParams

    # load_deepseek_model returns the cached instance on subsequent calls,
    # so this is safe to call unconditionally (the previous revision left
    # `model` unbound when the loaded flag was already set -> NameError).
    model = load_deepseek_model()
    if model is None:
        raise RuntimeError("Failed to load DeepSeek model")

    if progress:
        progress(0.6, desc="Configuring inference parameters...")

    sampling_params = SamplingParams(
        temperature=0.53,
        top_p=0.93,
        top_k=30,
        max_tokens=8500,
        frequency_penalty=0.2,
    )

    if progress:
        progress(0.7, desc="Running analysis (may take a while)...")

    outputs = model.generate([prompt], sampling_params)
    analysis = outputs[0].outputs[0].text.strip()

    if progress:
        progress(1.0, desc="Analysis completed!")

    return analysis


# Main functions
def extract_video_content(video_path: str, progress: Optional[Callable] = None) -> Tuple[str, Optional[str]]:
    """Extracts video content using InternVideo2.5.

    Returns:
        ``(result_text, temp_file_path)`` on success, or
        ``(error_message, None)`` on failure.
    """
    try:
        return _run_internvideo_extraction(
            video_path, VIDEO_CONTENT_PROMPT, progress,
            run_desc="Running extraction (may take a while)...",
            save_desc="Saving extraction results...",
        )
    except Exception as e:
        error_msg = f"Error in extraction phase: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        return error_msg, None


def extract_nonverbal(video_path: str, progress: Optional[Callable] = None) -> Tuple[str, Optional[str]]:
    """Extracts non-verbal cues using InternVideo2.5.

    Returns:
        ``(result_text, temp_file_path)`` on success, or
        ``(error_message, None)`` on failure.
    """
    try:
        return _run_internvideo_extraction(
            video_path, NONVERBAL_EXTRACTION_PROMPT, progress,
            run_desc="Running inference (may take a while)...",
            save_desc="Saving results...",
        )
    except Exception as e:
        error_msg = f"Error in extraction phase: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        return error_msg, None


def analyze_nonverbal(extraction_text: str, extraction_path: Optional[str] = None,
                      progress: Optional[Callable] = None) -> str:
    """Analyzes non-verbal cues using the DeepSeek model.

    ``extraction_path`` is accepted for interface compatibility but unused.
    Returns the analysis text, or an error message on failure.
    """
    try:
        prompt = NONVERBAL_ANALYSIS_PROMPT_TEMPLATE.format(extraction_text=extraction_text)
        return _run_deepseek_analysis(prompt, progress)
    except Exception as e:
        error_msg = f"Error in analysis phase: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        return error_msg


def analyze_manipulation_strategies(extraction_text: str, extraction_path: Optional[str] = None,
                                    progress: Optional[Callable] = None) -> str:
    """Analyzes video manipulation strategies using the DeepSeek model.

    ``extraction_path`` is accepted for interface compatibility but unused.
    Returns the analysis text, or an error message on failure.
    """
    try:
        prompt = MANIPULATION_ANALYSIS_PROMPT_TEMPLATE.format(extraction_text=extraction_text)
        return _run_deepseek_analysis(prompt, progress)
    except Exception as e:
        error_msg = f"Error in analysis phase: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        return error_msg