  1  """
  2  Module for video analysis and non-verbal information extraction
  3  This unified module contains functions for:
  4  1. Video content extraction
  5  2. Video content analysis
  6  3. Non-verbal cues extraction
  7  4. Non-verbal elements analysis
  8  """
  9  
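# A minimal usage sketch of the intended pipeline (paths are hypothetical):
#
#     content, content_path = extract_video_content("/data/clip.mp4")
#     cues, cues_path = extract_nonverbal("/data/clip.mp4")
#     report = analyze_nonverbal(cues, cues_path)
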
import os
import gc
import torch
import numpy as np
import traceback
from tempfile import NamedTemporaryFile
import logging
from typing import Tuple, List, Optional, Callable

# Logging configuration
logger = logging.getLogger("video_analyzer")

# Global variables to track model states
internvideo_model_loaded = False
deepseek_model_loaded = False

# Cached model instances (populated by the load_* helpers below)
_internvideo_model = None
_internvideo_tokenizer = None
_deepseek_llm = None

# Shared constants
INTERNVIDEO_MODEL_PATH = "OpenGVLab/InternVideo2_5_Chat_8B"
DEEPSEEK_MODEL_PATH = "huihui-ai/DeepSeek-R1-Distill-Qwen-14B-abliterated-v2"
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# Shared utility functions
def build_transform(input_size=448):
    """Creates transformations for input images"""
    from torchvision import transforms as T
    from torchvision.transforms.functional import InterpolationMode

    return T.Compose([
        T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    ])
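
# Example usage (a sketch; the image path is hypothetical and PIL.Image is assumed imported):
#     transform = build_transform(448)
#     tensor = transform(Image.open("frame.jpg"))  # torch.FloatTensor of shape (3, 448, 448)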

def get_dynamic_segments(video_path: str) -> int:
    """Determines the optimal number of segments based on video duration"""
    from decord import VideoReader, cpu

    vr = VideoReader(video_path, ctx=cpu(0))
    fps = float(vr.get_avg_fps())
    duration = len(vr) / fps

    if duration < 10:      # Very short video (< 10 s)
        num_segments = 16
    elif duration < 60:    # Short video (10 s - 1 min)
        num_segments = 60
    elif duration < 140:   # Medium-short video (1 min - 2 min 20 s)
        num_segments = 140
    elif duration < 300:   # Medium video (2 min 20 s - 5 min)
        num_segments = 300
    else:                  # Long video (> 5 min)
        num_segments = 400

    return min(num_segments, 400)  # Limit to 400 segments
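
# Worked example: a 90-second clip falls in the "duration < 140" branch and samples
# 140 frames; anything over 5 minutes is capped at 400 segments.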

def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    """Calculates the indices of images to extract"""
    start_idx = max(first_idx, round(bound[0] * fps)) if bound else 0
    end_idx = min(round(bound[1] * fps), max_frame) if bound else max_frame
    seg_size = float(end_idx - start_idx) / num_segments
    return np.array([int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)])
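
# Worked example (bound=None): for a 300-frame clip, max_frame=299 and num_segments=4
# give seg_size=74.75 and indices [37, 112, 187, 261] - roughly the midpoint of each
# of the four equal segments.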

def load_video(video_path: str, num_segments: int = 128, input_size: int = 448,
               progress: Optional[Callable] = None) -> Tuple[torch.Tensor, List[int]]:
    """Loads and preprocesses video frames"""
    from decord import VideoReader, cpu
    from PIL import Image

    vr = VideoReader(video_path, ctx=cpu(0))
    max_frame = len(vr) - 1
    fps = float(vr.get_avg_fps())

    pixel_values_list = []
    num_patches_list = []
    transform = build_transform(input_size=input_size)

    frame_indices = get_index(None, fps, max_frame, num_segments=num_segments)

    # Frame processing with progress updates
    for i, frame_index in enumerate(frame_indices):
        if i % 10 == 0 and progress:  # Update every 10 frames
            progress_val = 0.1 + 0.3 * (i / len(frame_indices))
            progress(progress_val, desc=f"Processing frames ({i}/{len(frame_indices)})...")

        img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
        pixel_values = transform(img).unsqueeze(0)
        num_patches_list.append(1)
        pixel_values_list.append(pixel_values)

    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list
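
# Example (a sketch; the path is hypothetical):
#     pixel_values, num_patches = load_video("/data/clip.mp4", num_segments=32)
#     # pixel_values has shape (32, 3, 448, 448); num_patches == [1] * 32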

def unload_internvideo_model():
    """Frees memory used by the InternVideo model"""
    global internvideo_model_loaded, _internvideo_model, _internvideo_tokenizer
    if internvideo_model_loaded:
        try:
            # Drop cached references so the GPU memory can actually be reclaimed
            _internvideo_model = None
            _internvideo_tokenizer = None
            gc.collect()
            torch.cuda.empty_cache()
            internvideo_model_loaded = False
            return True
        except Exception as e:
            logger.error(f"Error while freeing InternVideo model: {str(e)}")
            return False
    return False

def load_internvideo_model():
    """Loads the InternVideo model if necessary; returns the cached instances on later calls"""
    global internvideo_model_loaded, _internvideo_model, _internvideo_tokenizer

    if internvideo_model_loaded:
        return _internvideo_model, _internvideo_tokenizer
    try:
        from transformers import AutoModel, AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained(INTERNVIDEO_MODEL_PATH, trust_remote_code=True)
        # Load once and move to GPU in bfloat16 (the intermediate .half() cast was redundant)
        model = AutoModel.from_pretrained(INTERNVIDEO_MODEL_PATH, trust_remote_code=True).to(torch.bfloat16).cuda()
        _internvideo_model, _internvideo_tokenizer = model, tokenizer
        internvideo_model_loaded = True
        return model, tokenizer
    except Exception as e:
        logger.error(f"Error while loading InternVideo model: {str(e)}")
        return None, None

def load_deepseek_model():
    """Loads the DeepSeek model if necessary; returns the cached instance on later calls"""
    global deepseek_model_loaded, _deepseek_llm

    if deepseek_model_loaded:
        return _deepseek_llm
    try:
        from vllm import LLM

        os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

        model = LLM(
            model=DEEPSEEK_MODEL_PATH,
            dtype="half",
            tensor_parallel_size=torch.cuda.device_count(),
            gpu_memory_utilization=0.85,
            max_model_len=19760,
            trust_remote_code=True,
            enforce_eager=False,
        )
        _deepseek_llm = model
        deepseek_model_loaded = True
        return model
    except Exception as e:
        logger.error(f"Error while loading DeepSeek model: {str(e)}")
        return None

# Prompts for different analyses
VIDEO_CONTENT_PROMPT = """# Video Content Extraction Prompt System

## SYSTEM INSTRUCTIONS

You are a specialized system designed to extract and document all observable components from video content. Your purpose is to create a comprehensive, objective record of visual elements without analyzing or interpreting their meaning. This detailed extraction will serve as input for subsequent analytical systems.

## EXTRACTION METHODOLOGY

For each video, proceed through these extraction phases:

1. **Initial Overview**: Identify basic video parameters
2. **Visual Component Extraction**: Document all visual elements sequentially
3. **Temporal Sequencing**: Map the progression of components through time
4. **Component Relationship Mapping**: Note spatial and temporal relationships

## COMPONENT EXTRACTION GUIDELINES

Extract and document each of the following component categories in detail:

### 1. Technical Parameters
- **Video Quality**: Resolution, frame rate, aspect ratio
- **Duration**: Total length, timestamp format
- **Format**: Video codec, container format
- **Technical Issues**: Visible compression artifacts, frame drops, or other technical anomalies

### 2. Visual Composition
- **Shot Types**: Close-up, medium shot, wide shot, extreme close-up, etc.
- **Camera Angles**: High angle, low angle, eye level, bird's eye view, etc.
- **Camera Movements**: Pan, tilt, tracking, zoom, static, handheld, stabilized, etc.
- **Framing**: Rule of thirds positioning, headroom, lead room, symmetry/asymmetry
- **Depth of Field**: Shallow, deep, rack focus events
- **Composition**: Foreground, midground, background elements and their arrangement

### 3. Lighting and Color
- **Lighting Setup**: High-key, low-key, natural, artificial, direction of light
- **Lighting Quality**: Hard, soft, diffused, direct
- **Colorimetry**: Color palette, saturation levels, temperature (warm/cool)
- **Contrast Levels**: High contrast, low contrast
- **Color Grading**: Visible filters, stylistic color treatments
- **Time of Day**: Daytime, nighttime, golden hour, etc.

### 4. Environment and Setting
- **Location Type**: Indoor, outdoor, studio, natural environment
- **Setting Details**: Urban, rural, domestic, public, private
- **Set Design Elements**: Furniture, decorations, props, architectural features
- **Weather Conditions**: If outdoors - clear, cloudy, rainy, snowy, etc.
- **Time Period Indicators**: Modern, historical, futuristic elements

### 5. People and Characters
- **Number of People**: Total count, entries/exits during footage
- **Physical Characteristics**: Age range, gender presentation, ethnicity, clothing, distinctive features
- **Positioning**: Standing, sitting, walking, relative positions between people
- **Facial Expressions**: Detailed documentation of visible expressions (smiling, frowning, neutral, etc.)
- **Body Language**: Posture, gestures, proxemics (physical distance between people)
- **Eye Direction**: Where subjects are looking
- **Physical Actions**: What subjects are physically doing

### 6. Text Elements
- **On-screen Text**: Titles, subtitles, captions, credits, watermarks
- **Text in Scene**: Signs, books, screens, clothing with text
- **Text Style**: Font, size, color, animation
- **Text Positioning**: Where text appears on screen
- **Duration**: How long text remains visible
- **Language**: What language(s) appear in text

### 7. Graphics and Visual Effects
- **Graphic Elements**: Logos, icons, illustrations, diagrams
- **Animation**: Moving graphics, style of animation
- **Visual Effects**: CGI elements, compositing, filters
- **Transitions**: Cuts, dissolves, wipes, fades
- **Screen Graphics**: User interfaces, screens within the video
- **Overlays**: Information graphics, lower thirds, watermarks

### 8. Temporal Elements
- **Editing Pace**: Shot length, cutting patterns
- **Time Manipulation**: Slow motion, time-lapse, freeze frames
- **Sequence of Events**: Chronological documentation of what happens
- **Scene Changes**: Transitions between different locations or settings
- **Timestamp References**: Noting when specific elements appear and disappear

### 9. Production Context (if evident)
- **Production Type**: Professional, amateur, social media, broadcast, film
- **Visible Equipment**: Microphones, lights, reflectors in frame
- **Production Credits**: Visible information about creators

## OUTPUT FORMAT

Structure your extraction in this format:

VIDEO EXTRACTION REPORT

Basic Parameters
* Title (if known): [title]
* Duration: [time]
* Resolution: [resolution]
* Aspect Ratio: [ratio]

Visual Component Timeline
[00:00-00:00] [Detailed description of visual elements during this timeframe]
[00:00-00:00] [Next segment description]
...

People and Characters
* Person 1: [Detailed description]
    * Visible at: [Timestamp ranges]
    * Actions: [Description of what they do]
    * Expressions: [Description of notable expressions]
* Person 2: [...]

Text Elements
* [00:00-00:00] [Description of text content, style, position]
* [00:00-00:00] [...]

Graphics and Effects
* [00:00-00:00] [Description of graphics or effects]
* [00:00-00:00] [...]

Technical Elements
* Camera Angles: [List all observed camera angles with timestamps]
* Shot Types: [List all observed shot types with timestamps]
* Camera Movements: [List all observed movements with timestamps]
* Lighting Conditions: [List all observed lighting conditions with timestamps]
* Color Palette: [Description of dominant colors and changes]
* Editing Techniques: [Description of evident editing choices]

Component Relationships
* [Description of notable spatial relationships between elements]
* [Description of notable temporal relationships between elements]

## IMPORTANT GUIDELINES

1. **Record ONLY what is directly observable** in the video
2. **DO NOT analyze, interpret, or evaluate** the content
3. **Avoid subjective judgments** about quality, intent, or meaning
4. **Do not speculate** about anything not visible in the video
5. **Be precise and comprehensive** in documenting all components
6. **Maintain objective, neutral language** throughout
7. **If uncertain about any element**, note the uncertainty rather than guessing
8. **Document timestamps** as accurately as possible
9. **Prioritize completeness** - capture all relevant visual elements
10. **Focus on EXTRACTION ONLY** - leave all analysis to subsequent systems

REMEMBER: Your role is solely to extract and document components, not to analyze them. Provide a comprehensive extraction that will serve as a foundation for later analytical systems.
"""

NONVERBAL_EXTRACTION_PROMPT = """# Enhanced Non-Verbal and Expression Video Extraction System

## SYSTEM INSTRUCTIONS

You are a specialized system designed to extract and document all non-verbal communication, facial expressions, and body language elements from video content with extreme granularity and precision. Your purpose is to create a comprehensive, objective record of these human behavioral components without analyzing or interpreting their meaning.

## EXTRACTION METHODOLOGY

For each video, employ this hyper-granular extraction process:
1. **Frame-by-Frame Subject Identification**: Track all visible people
2. **Micro-Level Facial Analysis**: Document all facial movements
3. **Comprehensive Body Language Extraction**: Document all posture, gestures, and movements
4. **Multi-dimensional Proxemics Extraction**: Document spatial relationships
5. **Temporal Micro-Tracking**: Map the progression of non-verbal cues

## OUTPUT FORMAT

Structure your extraction in a detailed, systematic format capturing all observable non-verbal elements.

ANALYSIS START:
"""

NONVERBAL_ANALYSIS_PROMPT_TEMPLATE = """
Non-Verbal Communication Analysis System
SYSTEM INSTRUCTIONS
You are a specialized system designed to analyze and interpret the non-verbal communication, facial expressions, and body language documented in video extraction reports. Your purpose is to provide insightful analysis of these behavioral components, identifying patterns, potential meanings, and psychological implications.

ANALYSIS METHODOLOGY
For each extraction report, proceed through these analytical phases:
1. Emotional State Analysis: Interpret facial expressions and body language to identify emotional states
2. Congruence Assessment: Evaluate alignment between different non-verbal channels
3. Interpersonal Dynamic Analysis: Interpret relationship indicators and status displays
4. Pattern Recognition: Identify recurring behaviors and their potential significance
5. Contextual Integration: Consider how setting and situation inform behavioral interpretation

OUTPUT FORMAT
Structure your analysis in this format:
## NON-VERBAL COMMUNICATION ANALYSIS REPORT

### Executive Summary
[Brief overview of key findings and significant patterns]

### Emotional State Analysis
[Analysis of emotional states, changes, and potential causes]

### Communication Intent Assessment
[Analysis of what the subject appears to be communicating non-verbally]

### Interpersonal Dynamic Analysis
[Analysis of relationship indicators, power dynamics, rapport, and group dynamics]

### Credibility and Congruence Assessment
[Analysis of alignment between different non-verbal channels and overall authenticity]

### Psychological State Indicators
[Analysis of comfort, stress, cognitive load, and attitudinal indicators]

### Key Behavioral Patterns
[Analysis of significant recurring behaviors and their potential meanings]

ANALYTICAL PRINCIPLES
1. Balance confidence with uncertainty - Acknowledge the probabilistic nature of non-verbal interpretation
2. Consider cultural and contextual factors in all interpretations
3. Identify multiple potential interpretations where appropriate
4. Distinguish between observation and inference - Clearly separate what was observed from what it might mean

Here is the text: "{extraction_text}"
ANALYSIS START:
"""

# Main functions
def extract_video_content(video_path: str, progress: Optional[Callable] = None) -> Tuple[str, Optional[str]]:
    """Extracts video content using InternVideo2.5"""
    try:
        if progress:
            progress(0, desc="Loading InternVideo2.5 model...")

        # Model loading (cached after the first call)
        model, tokenizer = load_internvideo_model()
        if model is None:
            raise RuntimeError("Failed to load InternVideo model")

        if progress:
            progress(0.5, desc="Determining optimal number of frames...")

        # Get the optimal number of segments
        num_segments = get_dynamic_segments(video_path)

        if progress:
            progress(0.6, desc="Processing video frames...")

        # Loading and processing video frames
        pixel_values, num_patches_list = load_video(video_path, num_segments=num_segments, progress=progress)
        pixel_values = pixel_values.to(torch.bfloat16).to(model.device)

        if progress:
            progress(0.7, desc="Building prompt...")

        # Building prompt with images
        video_prefix = "".join([f"Frame {i+1}: <image>\n" for i in range(len(num_patches_list))])
        full_prompt = video_prefix + VIDEO_CONTENT_PROMPT

        if progress:
            progress(0.8, desc="Running extraction (may take a while)...")

        # Running the model
        with torch.no_grad():
            result = model.chat(
                tokenizer, pixel_values, full_prompt,
                dict(
                    do_sample=True,
                    temperature=0.53,
                    max_new_tokens=8500,
                    top_p=0.93,
                    top_k=30,
                ),
                num_patches_list=num_patches_list,
                history=None, return_history=False
            )

        if progress:
            progress(0.9, desc="Saving extraction results...")

        # Save to a temporary file
        temp_file = NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8")
        temp_file.write(result)
        temp_path = temp_file.name
        temp_file.close()

        if progress:
            progress(1.0, desc="Extraction completed!")

        return result, temp_path

    except Exception as e:
        error_msg = f"Error in extraction phase: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        return error_msg, None

def extract_nonverbal(video_path: str, progress: Optional[Callable] = None) -> Tuple[str, Optional[str]]:
    """Extracts non-verbal cues using InternVideo2.5"""
    try:
        if progress:
            progress(0, desc="Loading InternVideo2.5 model...")

        # Model loading (cached after the first call)
        model, tokenizer = load_internvideo_model()
        if model is None:
            raise RuntimeError("Failed to load InternVideo model")

        if progress:
            progress(0.5, desc="Determining optimal number of frames...")

        # Get the optimal number of segments
        num_segments = get_dynamic_segments(video_path)

        if progress:
            progress(0.6, desc="Processing video frames...")

        # Loading and processing video frames
        pixel_values, num_patches_list = load_video(video_path, num_segments=num_segments, progress=progress)
        pixel_values = pixel_values.to(torch.bfloat16).to(model.device)

        if progress:
            progress(0.7, desc="Building prompt...")

        # Building prompt with images
        video_prefix = "".join([f"Frame {i+1}: <image>\n" for i in range(len(num_patches_list))])
        full_prompt = video_prefix + NONVERBAL_EXTRACTION_PROMPT

        if progress:
            progress(0.8, desc="Running inference (may take a while)...")

        # Running the model
        with torch.no_grad():
            result = model.chat(
                tokenizer, pixel_values, full_prompt,
                dict(
                    do_sample=True,
                    temperature=0.53,
                    max_new_tokens=8500,
                    top_p=0.93,
                    top_k=30,
                ),
                num_patches_list=num_patches_list,
                history=None, return_history=False
            )

        if progress:
            progress(0.9, desc="Saving results...")

        # Save to a temporary file
        temp_file = NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8")
        temp_file.write(result)
        temp_path = temp_file.name
        temp_file.close()

        if progress:
            progress(1.0, desc="Extraction completed!")

        return result, temp_path

    except Exception as e:
        error_msg = f"Error in extraction phase: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        return error_msg, None

def analyze_nonverbal(extraction_text: str, extraction_path: Optional[str] = None,
                      progress: Optional[Callable] = None) -> str:
    """Analyzes non-verbal cues using the DeepSeek model"""
    try:
        if progress:
            progress(0, desc="Preparing DeepSeek model...")

        # Free memory of the previous model if it was loaded
        if internvideo_model_loaded:
            if progress:
                progress(0.1, desc="Freeing InternVideo model memory...")
            unload_internvideo_model()

        if progress:
            progress(0.2, desc="Loading DeepSeek model...")

        # Import necessary libraries
        from vllm import SamplingParams

        # Load (or retrieve the cached) model
        model = load_deepseek_model()
        if model is None:
            raise RuntimeError("Failed to load DeepSeek model")

        if progress:
            progress(0.6, desc="Configuring inference parameters...")

        # Configure sampling parameters
        sampling_params = SamplingParams(
            temperature=0.53,
            top_p=0.93,
            top_k=30,
            max_tokens=8500,
            frequency_penalty=0.2,
        )

        if progress:
            progress(0.7, desc="Running analysis (may take a while)...")

        # Prepare analysis prompt
        prompt = NONVERBAL_ANALYSIS_PROMPT_TEMPLATE.format(extraction_text=extraction_text)

        # Generate analysis
        outputs = model.generate([prompt], sampling_params)
        analysis = outputs[0].outputs[0].text.strip()

        if progress:
            progress(1.0, desc="Analysis completed!")

        return analysis

    except Exception as e:
        error_msg = f"Error in analysis phase: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        return error_msg

def analyze_manipulation_strategies(extraction_text: str, extraction_path: Optional[str] = None,
                                    progress: Optional[Callable] = None) -> str:
    """Analyzes video manipulation strategies using the DeepSeek model"""
    try:
        if progress:
            progress(0, desc="Preparing DeepSeek model...")

        # Free memory of the previous model if it was loaded
        if internvideo_model_loaded:
            if progress:
                progress(0.1, desc="Freeing InternVideo model memory...")
            unload_internvideo_model()

        if progress:
            progress(0.2, desc="Loading DeepSeek model...")

        # Import necessary libraries
        from vllm import SamplingParams

        # Load (or retrieve the cached) model
        model = load_deepseek_model()
        if model is None:
            raise RuntimeError("Failed to load DeepSeek model")

        if progress:
            progress(0.6, desc="Configuring inference parameters...")

        # Configure sampling parameters
        sampling_params = SamplingParams(
            temperature=0.53,
            top_p=0.93,
            top_k=30,
            max_tokens=8500,
            frequency_penalty=0.2,
        )

        if progress:
            progress(0.7, desc="Running analysis (may take a while)...")

        # Prepare analysis prompt
        prompt = f"""
Video Manipulation Strategies Analysis System
SYSTEM INSTRUCTIONS
You are a specialized system designed to analyze video content extractions and identify potential persuasion, manipulation, and influence strategies employed in the video. Your purpose is to objectively identify and explain these strategies without making any political judgments.

ANALYSIS METHODOLOGY
For each video extraction report, proceed through these analytical phases:
1. Narrative Structure Analysis: Identify how the story is constructed
2. Visual and Production Technique Analysis: Examine camera work, editing, lighting, etc.
3. Emotional Appeal Analysis: Identify emotional triggers and psychological techniques
4. Rhetorical Strategy Analysis: Identify persuasion and argument techniques
5. Information Presentation Analysis: Examine how facts, evidence, and claims are presented

OUTPUT FORMAT
Structure your analysis in this format:
## VIDEO MANIPULATION STRATEGIES ANALYSIS REPORT

### Executive Summary
[Brief overview of key findings and significant manipulation strategies detected]

### Narrative Structure
[Analysis of storytelling approach, framing techniques, perspective control]

### Visual and Production Techniques
[Analysis of camera angles, editing choices, visual symbolism, color psychology]

### Emotional Appeal Strategies
[Analysis of emotional triggers, psychological techniques, identity appeals]

### Rhetorical and Linguistic Strategies
[Analysis of language patterns, argument structures, rhetorical devices]

### Information Management Techniques
[Analysis of evidence presentation, information selection/omission, source handling]

### Audience Targeting
[Analysis of how content targets specific audiences or demographics]

### Manipulation Risk Assessment
[Assessment of overall manipulation potential and ethical considerations]

PRINCIPLES FOR ANALYSIS
1. Maintain political neutrality - Focus on techniques, not ideological positions
2. Distinguish between persuasion and manipulation
3. Consider context and audience expectations
4. Document evidence for each identified strategy
5. Acknowledge normal vs. problematic uses of influence techniques

Here is the video extraction text: "{extraction_text}"
ANALYSIS START:
"""

        # Generate analysis
        outputs = model.generate([prompt], sampling_params)
        analysis = outputs[0].outputs[0].text.strip()

        if progress:
            progress(1.0, desc="Analysis completed!")

        return analysis

    except Exception as e:
        error_msg = f"Error in analysis phase: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        return error_msg