# video_models/video_manipulation_analyzer.py
# DEPRECATED: This file is kept for compatibility but will be removed in a future version.
# Please use video_models.video_utils instead.
# Compatibility import
  4  import os
  5  import gc
  6  import time
  7  import torch
  8  import numpy as np
  9  import traceback
 10  from tempfile import NamedTemporaryFile
 11  import logging
 12  
# Logging
# Module-level logger shared by the extraction functions in this file.
logger = logging.getLogger("video_analyzer")

# Global variables to track model loading status
# NOTE(review): these flags only record that a first load happened; the loaded
# objects themselves must be cached elsewhere for the flags to be useful —
# verify against extract_video_content's load path.
internvideo_model_loaded = False
deepseek_model_loaded = False
 19  
 20  # Step 1: Video content extraction with InternVideo2.5
 21  def extract_video_content(video_path, progress=None):
 22      """Extract video content using InternVideo2.5"""
 23      global internvideo_model_loaded
 24      
 25      try:
 26          if progress:
 27              progress(0, desc="Loading InternVideo2.5 model...")
 28          
 29          # Import required libraries dynamically to avoid memory issues
 30          from transformers import AutoModel, AutoTokenizer
 31          from decord import VideoReader, cpu
 32          from PIL import Image
 33          import torchvision.transforms as T
 34          from torchvision.transforms.functional import InterpolationMode
 35          
 36          # Constants
 37          MODEL_PATH = "OpenGVLab/InternVideo2_5_Chat_8B"
 38          IMAGENET_MEAN = (0.485, 0.456, 0.406)
 39          IMAGENET_STD = (0.229, 0.224, 0.225)
 40          
 41          # Analysis prompt (video content extraction)
 42          ANALYSIS_PROMPT = """# Video Content Extraction Prompt System
 43  
 44  ## SYSTEM INSTRUCTIONS
 45  
 46  You are a specialized system designed to extract and document all observable components from video content. Your purpose is to create a comprehensive, objective record of visual elements without analyzing or interpreting their meaning. This detailed extraction will serve as input for subsequent analytical systems.
 47  
 48  ## EXTRACTION METHODOLOGY
 49  
 50  For each video, proceed through these extraction phases:
 51  
 52  1. **Initial Overview**: Identify basic video parameters
 53  2. **Visual Component Extraction**: Document all visual elements sequentially
 54  3. **Temporal Sequencing**: Map the progression of components through time
 55  4. **Component Relationship Mapping**: Note spatial and temporal relationships
 56  
 57  ## COMPONENT EXTRACTION GUIDELINES
 58  
 59  Extract and document each of the following component categories in detail:
 60  
 61  ### 1. Technical Parameters
 62  - **Video Quality**: Resolution, frame rate, aspect ratio
 63  - **Duration**: Total length, timestamp format
 64  - **Format**: Video codec, container format
 65  - **Technical Issues**: Visible compression artifacts, frame drops, or other technical anomalies
 66  
 67  ### 2. Visual Composition
 68  - **Shot Types**: Close-up, medium shot, wide shot, extreme close-up, etc.
 69  - **Camera Angles**: High angle, low angle, eye level, bird's eye view, etc.
 70  - **Camera Movements**: Pan, tilt, tracking, zoom, static, handheld, stabilized, etc.
 71  - **Framing**: Rule of thirds positioning, headroom, lead room, symmetry/asymmetry
 72  - **Depth of Field**: Shallow, deep, rack focus events
 73  - **Composition**: Foreground, midground, background elements and their arrangement
 74  
 75  ### 3. Lighting and Color
 76  - **Lighting Setup**: High-key, low-key, natural, artificial, direction of light
 77  - **Lighting Quality**: Hard, soft, diffused, direct
 78  - **Colorimetry**: Color palette, saturation levels, temperature (warm/cool)
 79  - **Contrast Levels**: High contrast, low contrast
 80  - **Color Grading**: Visible filters, stylistic color treatments
 81  - **Time of Day**: Daytime, nighttime, golden hour, etc.
 82  
 83  ### 4. Environment and Setting
 84  - **Location Type**: Indoor, outdoor, studio, natural environment
 85  - **Setting Details**: Urban, rural, domestic, public, private
 86  - **Set Design Elements**: Furniture, decorations, props, architectural features
 87  - **Weather Conditions**: If outdoors - clear, cloudy, rainy, snowy, etc.
 88  - **Time Period Indicators**: Modern, historical, futuristic elements
 89  
 90  ### 5. People and Characters
 91  - **Number of People**: Total count, entries/exits during footage
 92  - **Physical Characteristics**: Age range, gender presentation, ethnicity, clothing, distinctive features
 93  - **Positioning**: Standing, sitting, walking, relative positions between people
 94  - **Facial Expressions**: Detailed documentation of visible expressions (smiling, frowning, neutral, etc.)
 95  - **Body Language**: Posture, gestures, proxemics (physical distance between people)
 96  - **Eye Direction**: Where subjects are looking
 97  - **Physical Actions**: What subjects are physically doing
 98  
 99  ### 6. Text Elements
100  - **On-screen Text**: Titles, subtitles, captions, credits, watermarks
101  - **Text in Scene**: Signs, books, screens, clothing with text
102  - **Text Style**: Font, size, color, animation
103  - **Text Positioning**: Where text appears on screen
104  - **Duration**: How long text remains visible
105  - **Language**: What language(s) appears in text
106  
107  ### 7. Graphics and Visual Effects
108  - **Graphic Elements**: Logos, icons, illustrations, diagrams
109  - **Animation**: Moving graphics, style of animation
110  - **Visual Effects**: CGI elements, compositing, filters
111  - **Transitions**: Cuts, dissolves, wipes, fades
112  - **Screen Graphics**: User interfaces, screens within the video
113  - **Overlays**: Information graphics, lower thirds, watermarks
114  
115  ### 8. Temporal Elements
116  - **Editing Pace**: Shot length, cutting patterns
117  - **Time Manipulation**: Slow motion, time-lapse, freeze frames
118  - **Sequence of Events**: Chronological documentation of what happens
119  - **Scene Changes**: Transitions between different locations or settings
120  - **Timestamp References**: Noting when specific elements appear and disappear
121  
122  ### 9. Production Context (if evident)
123  - **Production Type**: Professional, amateur, social media, broadcast, film
124  - **Visible Equipment**: Microphones, lights, reflectors in frame
125  - **Production Credits**: Visible information about creators
126  
127  ## OUTPUT FORMAT
128  
129  Structure your extraction in this format:
130  
131  ```
132  ## VIDEO EXTRACTION REPORT
133  
134  ### Basic Parameters
135  - Title (if known): [title]
136  - Duration: [time]
137  - Resolution: [resolution]
138  - Aspect Ratio: [ratio]
139  
140  ### Visual Component Timeline
141  [00:00-00:00] [Detailed description of visual elements during this timeframe]
142  [00:00-00:00] [Next segment description]
143  ...
144  
145  ### People and Characters
146  - Person 1: [Detailed description]
147    - Visible at: [Timestamp ranges]
148    - Actions: [Description of what they do]
149    - Expressions: [Description of notable expressions]
150  - Person 2: [...]
151  
152  ### Text Elements
153  - [00:00-00:00] [Description of text content, style, position]
154  - [00:00-00:00] [...]
155  
156  ### Graphics and Effects
157  - [00:00-00:00] [Description of graphics or effects]
158  - [00:00-00:00] [...]
159  
160  ### Technical Elements
161  - Camera Angles: [List all observed camera angles with timestamps]
162  - Shot Types: [List all observed shot types with timestamps]
163  - Camera Movements: [List all observed movements with timestamps]
164  - Lighting Conditions: [List all observed lighting conditions with timestamps]
165  - Color Palette: [Description of dominant colors and changes]
166  - Editing Techniques: [Description of evident editing choices]
167  
168  ### Component Relationships
169  - [Description of notable spatial relationships between elements]
170  - [Description of notable temporal relationships between elements]
171  ```
172  
173  ## IMPORTANT GUIDELINES
174  
175  1. **Record ONLY what is directly observable** in the video
176  2. **DO NOT analyze, interpret, or evaluate** the content
177  3. **Avoid subjective judgments** about quality, intent, or meaning
178  4. **Do not speculate** about anything not visible in the video
179  5. **Be precise and comprehensive** in documenting all components
180  6. **Maintain objective, neutral language** throughout
181  7. **If uncertain about any element**, note the uncertainty rather than guessing
182  8. **Document timestamps** as accurately as possible
183  9. **Prioritize completeness** - capture all relevant visual elements
184  10. **Focus on EXTRACTION ONLY** - leave all analysis to subsequent systems
185  
186  REMEMBER: Your role is solely to extract and document components, not to analyze them. Provide a comprehensive extraction that will serve as a foundation for later analytical systems.
187  """
188          
189          if progress:
190              progress(0.1, desc="Building transforms...")
191          
192          # Define functions
193          def build_transform(input_size=448):
194              return T.Compose([
195                  T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
196                  T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
197                  T.ToTensor(),
198                  T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
199              ])
200          
201          def get_dynamic_segments(video_path):
202              """Determine optimal number of frames based on video duration"""
203              vr = VideoReader(video_path, ctx=cpu(0))
204              fps = float(vr.get_avg_fps())
205              duration = len(vr) / fps
206              
207              if duration < 10:      # Very short video (< 10 sec)
208                  num_segments = 16
209              elif duration < 60:    # Short video (10s - 1 min)
210                  num_segments = 60
211              elif duration < 300:   # Medium video (1 min - 5 min)
212                  num_segments = 300
213              else:                  # Long video (> 5 min)
214                  num_segments = 400
215                  
216              return min(num_segments, 400)  # Cap at 400 frames
217              
218          def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
219              start_idx = max(first_idx, round(bound[0] * fps)) if bound else 0
220              end_idx = min(round(bound[1] * fps), max_frame) if bound else max_frame
221              seg_size = float(end_idx - start_idx) / num_segments
222              return np.array([int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)])
223              
224          def load_video(video_path, num_segments=128, input_size=448):
225              """Extract frames for processing"""
226              vr = VideoReader(video_path, ctx=cpu(0))
227              max_frame = len(vr) - 1
228              fps = float(vr.get_avg_fps())
229              
230              pixel_values_list = []
231              num_patches_list = []
232              transform = build_transform(input_size=input_size)
233              
234              frame_indices = get_index(None, fps, max_frame, num_segments=num_segments)
235              
236              # Process frames with progress updates
237              for i, frame_index in enumerate(frame_indices):
238                  if i % 10 == 0 and progress:  # Update progress every 10 frames
239                      progress_val = 0.1 + 0.3 * (i / len(frame_indices))
240                      progress(progress_val, desc=f"Processing frames ({i}/{len(frame_indices)})...")
241                      
242                  img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
243                  pixel_values = transform(img).unsqueeze(0)
244                  num_patches_list.append(1)
245                  pixel_values_list.append(pixel_values)
246                  
247              pixel_values = torch.cat(pixel_values_list)
248              return pixel_values, num_patches_list
249          
250          if progress:
251              progress(0.4, desc="Loading tokenizer and model...")
252          
253          # Load tokenizer and model (with memory optimization)
254          if not internvideo_model_loaded:
255              tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
256              model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).half().cuda()
257              model = model.to(torch.bfloat16)
258              internvideo_model_loaded = True
259          
260          if progress:
261              progress(0.5, desc="Determining optimal frame count...")
262          
263          # Get optimal number of frames
264          num_segments = get_dynamic_segments(video_path)
265          
266          if progress:
267              progress(0.6, desc="Processing video frames...")
268          
269          # Load and process video frames
270          pixel_values, num_patches_list = load_video(video_path, num_segments=num_segments)
271          pixel_values = pixel_values.to(torch.bfloat16).to(model.device)
272          
273          if progress:
274              progress(0.7, desc="Constructing prompt...")
275          
276          # Construct prompt with frames
277          video_prefix = "".join([f"Frame {i+1}: <image>\n" for i in range(len(num_patches_list))])
278          full_prompt = video_prefix + ANALYSIS_PROMPT
279          
280          if progress:
281              progress(0.8, desc="Running extraction (this may take a while)...")
282          
283          # Run the model
284          with torch.no_grad():
285              result = model.chat(
286                  tokenizer, pixel_values, full_prompt,
287                  dict(
288                      do_sample=True,
289                      temperature=0.53,
290                      max_new_tokens=8500,
291                      top_p=0.93,
292                      top_k=30,
293                  ),
294                  num_patches_list=num_patches_list,
295                  history=None, return_history=False
296              )
297          
298          if progress:
299              progress(0.9, desc="Saving extraction results...")
300          
301          # Save extraction to a temporary file
302          temp_file = NamedTemporaryFile(delete=False, suffix=".txt", mode="w")
303          temp_file.write(result)
304          temp_path = temp_file.name
305          temp_file.close()
306          
307          if progress:
308              progress(1.0, desc="Extraction complete!")
309          
310          return result, temp_path
311          
312      except Exception as e:
313          error_msg = f"Error in extraction phase: {str(e)}\n{traceback.format_exc()}"
314          logger.error(error_msg)
315          return error_msg, None