# video_models/video_manipulation_analyzer.py
# DEPRECATED: This file is kept for backward compatibility and will be removed
# in a future version. Please use video_models.video_utils instead.
# Compatibility import
import os
import gc
import time
import torch
import numpy as np
import traceback
from tempfile import NamedTemporaryFile
import logging

# Logging
logger = logging.getLogger("video_analyzer")

# Global variables to track model loading status.
# FIX: the original code tracked only the boolean flag while keeping the
# loaded model/tokenizer in function locals, so every call after the first
# raised NameError (flag True, locals never assigned).  The loaded objects
# are now cached in module globals alongside the flag.
internvideo_model_loaded = False
deepseek_model_loaded = False
_internvideo_model = None       # cached InternVideo2.5 model, set on first load
_internvideo_tokenizer = None   # cached tokenizer, set on first load

# Step 1: Video content extraction with InternVideo2.5
def extract_video_content(video_path, progress=None):
    """Extract video content using InternVideo2.5.

    Args:
        video_path: Path to the video file to analyze.
        progress: Optional Gradio-style callback ``progress(fraction, desc=...)``
            used to report intermediate status.  May be ``None``.

    Returns:
        Tuple ``(result_text, temp_file_path)`` on success, where
        ``temp_file_path`` points to a UTF-8 text file holding the extraction,
        or ``(error_message, None)`` if any step fails.
    """
    global internvideo_model_loaded, _internvideo_model, _internvideo_tokenizer

    try:
        if progress:
            progress(0, desc="Loading InternVideo2.5 model...")

        # Import heavy libraries lazily so their memory cost is only paid
        # when this code path is actually exercised.
        from transformers import AutoModel, AutoTokenizer
        from decord import VideoReader, cpu
        from PIL import Image
        import torchvision.transforms as T
        from torchvision.transforms.functional import InterpolationMode

        # Constants
        MODEL_PATH = "OpenGVLab/InternVideo2_5_Chat_8B"
        IMAGENET_MEAN = (0.485, 0.456, 0.406)
        IMAGENET_STD = (0.229, 0.224, 0.225)

        # Analysis prompt (video content extraction)
        ANALYSIS_PROMPT = """# Video Content Extraction Prompt System

## SYSTEM INSTRUCTIONS

You are a specialized system designed to extract and document all observable components from video content. Your purpose is to create a comprehensive, objective record of visual elements without analyzing or interpreting their meaning. This detailed extraction will serve as input for subsequent analytical systems.

## EXTRACTION METHODOLOGY

For each video, proceed through these extraction phases:

1. **Initial Overview**: Identify basic video parameters
2. **Visual Component Extraction**: Document all visual elements sequentially
3. **Temporal Sequencing**: Map the progression of components through time
4. **Component Relationship Mapping**: Note spatial and temporal relationships

## COMPONENT EXTRACTION GUIDELINES

Extract and document each of the following component categories in detail:

### 1. Technical Parameters
- **Video Quality**: Resolution, frame rate, aspect ratio
- **Duration**: Total length, timestamp format
- **Format**: Video codec, container format
- **Technical Issues**: Visible compression artifacts, frame drops, or other technical anomalies

### 2. Visual Composition
- **Shot Types**: Close-up, medium shot, wide shot, extreme close-up, etc.
- **Camera Angles**: High angle, low angle, eye level, bird's eye view, etc.
- **Camera Movements**: Pan, tilt, tracking, zoom, static, handheld, stabilized, etc.
- **Framing**: Rule of thirds positioning, headroom, lead room, symmetry/asymmetry
- **Depth of Field**: Shallow, deep, rack focus events
- **Composition**: Foreground, midground, background elements and their arrangement

### 3. Lighting and Color
- **Lighting Setup**: High-key, low-key, natural, artificial, direction of light
- **Lighting Quality**: Hard, soft, diffused, direct
- **Colorimetry**: Color palette, saturation levels, temperature (warm/cool)
- **Contrast Levels**: High contrast, low contrast
- **Color Grading**: Visible filters, stylistic color treatments
- **Time of Day**: Daytime, nighttime, golden hour, etc.

### 4. Environment and Setting
- **Location Type**: Indoor, outdoor, studio, natural environment
- **Setting Details**: Urban, rural, domestic, public, private
- **Set Design Elements**: Furniture, decorations, props, architectural features
- **Weather Conditions**: If outdoors - clear, cloudy, rainy, snowy, etc.
- **Time Period Indicators**: Modern, historical, futuristic elements

### 5. People and Characters
- **Number of People**: Total count, entries/exits during footage
- **Physical Characteristics**: Age range, gender presentation, ethnicity, clothing, distinctive features
- **Positioning**: Standing, sitting, walking, relative positions between people
- **Facial Expressions**: Detailed documentation of visible expressions (smiling, frowning, neutral, etc.)
- **Body Language**: Posture, gestures, proxemics (physical distance between people)
- **Eye Direction**: Where subjects are looking
- **Physical Actions**: What subjects are physically doing

### 6. Text Elements
- **On-screen Text**: Titles, subtitles, captions, credits, watermarks
- **Text in Scene**: Signs, books, screens, clothing with text
- **Text Style**: Font, size, color, animation
- **Text Positioning**: Where text appears on screen
- **Duration**: How long text remains visible
- **Language**: What language(s) appears in text

### 7. Graphics and Visual Effects
- **Graphic Elements**: Logos, icons, illustrations, diagrams
- **Animation**: Moving graphics, style of animation
- **Visual Effects**: CGI elements, compositing, filters
- **Transitions**: Cuts, dissolves, wipes, fades
- **Screen Graphics**: User interfaces, screens within the video
- **Overlays**: Information graphics, lower thirds, watermarks

### 8. Temporal Elements
- **Editing Pace**: Shot length, cutting patterns
- **Time Manipulation**: Slow motion, time-lapse, freeze frames
- **Sequence of Events**: Chronological documentation of what happens
- **Scene Changes**: Transitions between different locations or settings
- **Timestamp References**: Noting when specific elements appear and disappear

### 9. Production Context (if evident)
- **Production Type**: Professional, amateur, social media, broadcast, film
- **Visible Equipment**: Microphones, lights, reflectors in frame
- **Production Credits**: Visible information about creators

## OUTPUT FORMAT

Structure your extraction in this format:

```
## VIDEO EXTRACTION REPORT

### Basic Parameters
- Title (if known): [title]
- Duration: [time]
- Resolution: [resolution]
- Aspect Ratio: [ratio]

### Visual Component Timeline
[00:00-00:00] [Detailed description of visual elements during this timeframe]
[00:00-00:00] [Next segment description]
...

### People and Characters
- Person 1: [Detailed description]
- Visible at: [Timestamp ranges]
- Actions: [Description of what they do]
- Expressions: [Description of notable expressions]
- Person 2: [...]

### Text Elements
- [00:00-00:00] [Description of text content, style, position]
- [00:00-00:00] [...]

### Graphics and Effects
- [00:00-00:00] [Description of graphics or effects]
- [00:00-00:00] [...]

### Technical Elements
- Camera Angles: [List all observed camera angles with timestamps]
- Shot Types: [List all observed shot types with timestamps]
- Camera Movements: [List all observed movements with timestamps]
- Lighting Conditions: [List all observed lighting conditions with timestamps]
- Color Palette: [Description of dominant colors and changes]
- Editing Techniques: [Description of evident editing choices]

### Component Relationships
- [Description of notable spatial relationships between elements]
- [Description of notable temporal relationships between elements]
```

## IMPORTANT GUIDELINES

1. **Record ONLY what is directly observable** in the video
2. **DO NOT analyze, interpret, or evaluate** the content
3. **Avoid subjective judgments** about quality, intent, or meaning
4. **Do not speculate** about anything not visible in the video
5. **Be precise and comprehensive** in documenting all components
6. **Maintain objective, neutral language** throughout
7. **If uncertain about any element**, note the uncertainty rather than guessing
8. **Document timestamps** as accurately as possible
9. **Prioritize completeness** - capture all relevant visual elements
10. **Focus on EXTRACTION ONLY** - leave all analysis to subsequent systems

REMEMBER: Your role is solely to extract and document components, not to analyze them. Provide a comprehensive extraction that will serve as a foundation for later analytical systems.
"""

        if progress:
            progress(0.1, desc="Building transforms...")

        # Define helper functions (closures over the lazily imported libraries)
        def build_transform(input_size=448):
            """Return the per-frame preprocessing pipeline (RGB, resize, normalize)."""
            return T.Compose([
                T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
                T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
                T.ToTensor(),
                T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
            ])

        def get_dynamic_segments(video_path):
            """Determine optimal number of frames based on video duration."""
            vr = VideoReader(video_path, ctx=cpu(0))
            fps = float(vr.get_avg_fps())
            duration = len(vr) / fps

            if duration < 10:  # Very short video (< 10 sec)
                num_segments = 16
            elif duration < 60:  # Short video (10s - 1 min)
                num_segments = 60
            elif duration < 300:  # Medium video (1 min - 5 min)
                num_segments = 300
            else:  # Long video (> 5 min)
                num_segments = 400

            return min(num_segments, 400)  # Cap at 400 frames

        def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
            """Return evenly spaced frame indices, centered within each segment."""
            start_idx = max(first_idx, round(bound[0] * fps)) if bound else 0
            end_idx = min(round(bound[1] * fps), max_frame) if bound else max_frame
            seg_size = float(end_idx - start_idx) / num_segments
            return np.array([int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)])

        def load_video(video_path, num_segments=128, input_size=448):
            """Extract frames for processing.

            Returns a ``(pixel_values, num_patches_list)`` pair where
            ``pixel_values`` stacks one preprocessed tensor per sampled frame.
            """
            vr = VideoReader(video_path, ctx=cpu(0))
            max_frame = len(vr) - 1
            fps = float(vr.get_avg_fps())

            pixel_values_list = []
            num_patches_list = []
            transform = build_transform(input_size=input_size)

            frame_indices = get_index(None, fps, max_frame, num_segments=num_segments)

            # Process frames with progress updates
            for i, frame_index in enumerate(frame_indices):
                if i % 10 == 0 and progress:  # Update progress every 10 frames
                    progress_val = 0.1 + 0.3 * (i / len(frame_indices))
                    progress(progress_val, desc=f"Processing frames ({i}/{len(frame_indices)})...")

                img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
                pixel_values = transform(img).unsqueeze(0)
                num_patches_list.append(1)
                pixel_values_list.append(pixel_values)

            pixel_values = torch.cat(pixel_values_list)
            return pixel_values, num_patches_list

        if progress:
            progress(0.4, desc="Loading tokenizer and model...")

        # Load tokenizer and model once per process and cache them globally.
        # FIX: the original assigned `tokenizer`/`model` only inside this guard,
        # so the second call in the same process hit NameError below.
        if not internvideo_model_loaded:
            _internvideo_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
            # FIX: load directly in bfloat16 instead of converting via fp16
            # first (.half() then .to(bfloat16)), which truncated precision twice.
            _internvideo_model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).to(torch.bfloat16).cuda()
            internvideo_model_loaded = True
        tokenizer = _internvideo_tokenizer
        model = _internvideo_model

        if progress:
            progress(0.5, desc="Determining optimal frame count...")

        # Get optimal number of frames
        num_segments = get_dynamic_segments(video_path)

        if progress:
            progress(0.6, desc="Processing video frames...")

        # Load and process video frames, matching the model's dtype/device
        pixel_values, num_patches_list = load_video(video_path, num_segments=num_segments)
        pixel_values = pixel_values.to(torch.bfloat16).to(model.device)

        if progress:
            progress(0.7, desc="Constructing prompt...")

        # Construct prompt with one <image> placeholder per sampled frame
        video_prefix = "".join([f"Frame {i+1}: <image>\n" for i in range(len(num_patches_list))])
        full_prompt = video_prefix + ANALYSIS_PROMPT

        if progress:
            progress(0.8, desc="Running extraction (this may take a while)...")

        # Run the model
        with torch.no_grad():
            result = model.chat(
                tokenizer, pixel_values, full_prompt,
                dict(
                    do_sample=True,
                    temperature=0.53,
                    max_new_tokens=8500,
                    top_p=0.93,
                    top_k=30,
                ),
                num_patches_list=num_patches_list,
                history=None, return_history=False
            )

        if progress:
            progress(0.9, desc="Saving extraction results...")

        # Save extraction to a temporary file.
        # FIX: force UTF-8 so non-ASCII model output cannot crash on platforms
        # whose default locale encoding is narrower.
        temp_file = NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8")
        temp_file.write(result)
        temp_path = temp_file.name
        temp_file.close()

        if progress:
            progress(1.0, desc="Extraction complete!")

        return result, temp_path

    except Exception as e:
        error_msg = f"Error in extraction phase: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        return error_msg, None