Cradicle Explorer

/ hash_advanced.py
hash_advanced.py
   1  #!/usr/bin/env python3
   2  
   3  import base64
   4  import os
   5  import sys
   6  import time
   7  import random
   8  import re
   9  import json
  10  from dataclasses import dataclass, asdict
  11  from io import BytesIO
  12  from typing import List, Dict, Any, Tuple, Optional, Set, Generator
  13  from urllib.parse import urljoin, urlparse, urldefrag, quote
  14  from concurrent.futures import ThreadPoolExecutor, as_completed, Future
  15  
  16  import requests
  17  from PIL import Image, UnidentifiedImageError
  18  import numpy as np
  19  import face_recognition
  20  from bs4 import BeautifulSoup
  21  import tldextract
  22  from fake_useragent import UserAgent
  23  
  24  
  25  # ================== CONFIGURATION ==================
  26  
  27  class CrawlerConfig:
  28      """Configuration for web crawling."""
  29      MAX_PAGES_PER_USERNAME = 50
  30      MAX_DEPTH = 1
  31      TIMEOUT = 15
  32      MAX_WORKERS = 10
  33      DELAY = (1.0, 3.0)
  34      USER_AGENT_ROTATION = True
  35      FOLLOW_SAME_DOMAIN = False
  36      EXCLUDE_EXTENSIONS = {'.pdf', '.zip', '.tar', '.gz', '.exe', '.dmg', '.iso'}
  37      VALID_IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg'}
  38      MAX_IMAGE_SIZE_MB = 5
  39      MAX_RETRIES = 2
  40      RATE_LIMIT_DELAY = 1.0
  41      VERBOSE = True
  42      PROFILE_TEMPLATES_FILE = "profile_templates.json"
  43  
  44  
  45  # ================== LOAD PROFILE TEMPLATES FROM JSON ==================
  46  
  47  def load_profile_templates(filename: str = "profile_templates.json") -> Dict[str, Any]:
  48      """
  49      Load profile templates from a JSON file.
  50      If file doesn't exist, return empty dict.
  51      """
  52      try:
  53          if os.path.exists(filename):
  54              with open(filename, 'r') as f:
  55                  templates = json.load(f)
  56                  print(f"✅ Loaded {len(templates)} profile templates from {filename}")
  57                  return templates
  58          else:
  59              print(f"❌ Profile templates file not found: {filename}")
  60              print("   Create a JSON file with platform configurations.")
  61              return {}
  62      except Exception as e:
  63          print(f"❌ Error loading profile templates from {filename}: {e}")
  64          return {}
  65  
  66  
  67  def save_profile_templates(templates: Dict[str, Any], filename: str = "profile_templates.json"):
  68      """Save profile templates to JSON file."""
  69      try:
  70          with open(filename, 'w') as f:
  71              json.dump(templates, f, indent=2)
  72          print(f"💾 Saved {len(templates)} profile templates to {filename}")
  73      except Exception as e:
  74          print(f"❌ Error saving profile templates: {e}")
  75  
  76  
  77  def get_enabled_platforms(templates: Dict[str, Any]) -> List[str]:
  78      """Get list of enabled platforms from templates."""
  79      enabled = []
  80      for platform_name, config in templates.items():
  81          if config.get("enabled", True):
  82              enabled.append(platform_name)
  83      return enabled
  84  
  85  
  86  def get_platforms_by_category(templates: Dict[str, Any]) -> Dict[str, List[str]]:
  87      """Organize platforms by category."""
  88      categories = {}
  89      for platform_name, config in templates.items():
  90          if config.get("enabled", True):
  91              category = config.get("category", "Uncategorized")
  92              if category not in categories:
  93                  categories[category] = []
  94              categories[category].append(platform_name)
  95      return categories
  96  
  97  
  98  # Load templates at module level
  99  PROFILE_TEMPLATES = load_profile_templates()
 100  
 101  
 102  # ================== SITE-SPECIFIC CHECKERS ==================
 103  
 104  class SiteCheckers:
 105      """Site-specific profile existence checkers."""
 106      
 107      @staticmethod
 108      def github_check(response: requests.Response, username: str) -> bool:
 109          """Check if GitHub profile exists."""
 110          if response.status_code != 200:
 111              return False
 112          
 113          html = response.text.lower()
 114          
 115          # More accurate GitHub existence check
 116          not_found_indicators = [
 117              'this is not the web page you are looking for',
 118              'page not found',
 119              'github could not find that page',
 120              'there isn\'t a github pages site here',
 121          ]
 122          
 123          # Check for absence of profile elements
 124          if any(indicator in html for indicator in not_found_indicators):
 125              return False
 126          
 127          # Positive indicators
 128          profile_indicators = [
 129              f'itemprop="name"',
 130              'vcard-names-container',
 131              'js-profile-editable-area',
 132              'p-nickname vcard-username',
 133              'user-profile-frame',
 134          ]
 135          
 136          # Check for username in page
 137          if username.lower() in html:
 138              for indicator in profile_indicators:
 139                  if indicator in html:
 140                      return True
 141          
 142          # Alternative: check for common GitHub profile elements
 143          soup = BeautifulSoup(response.text, 'html.parser')
 144          
 145          # Check for profile-specific elements
 146          if soup.find('div', {'class': 'user-profile-frame'}):
 147              return True
 148          
 149          if soup.find('span', {'itemprop': 'name'}):
 150              return True
 151          
 152          # Check for the username in the page title
 153          title = soup.find('title')
 154          if title and username.lower() in title.text.lower():
 155              return True
 156          
 157          # Check for avatar image (GitHub avatars have specific URLs)
 158          for img in soup.find_all('img'):
 159              src = img.get('src', '')
 160              if 'avatars.githubusercontent.com' in src and 'identicon' not in src:
 161                  return True
 162          
 163          return False
 164      
 165      @staticmethod
 166      def stackoverflow_check(response: requests.Response, username: str) -> bool:
 167          """Check if Stack Overflow profile exists."""
 168          if response.status_code != 200:
 169              return False
 170          
 171          html = response.text.lower()
 172          
 173          # Stack Overflow shows "Page Not Found" for non-existent users
 174          if 'page not found' in html or '404 - page not found' in html:
 175              return False
 176          
 177          # Check for user profile elements
 178          profile_indicators = [
 179              'user-card',
 180              'user-avatar',
 181              'user-details',
 182          ]
 183          
 184          for indicator in profile_indicators:
 185              if indicator in html:
 186                  return True
 187          
 188          return False
 189      
 190      @staticmethod
 191      def twitter_check(response: requests.Response, username: str) -> bool:
 192          """Check if Twitter profile exists."""
 193          # Twitter often redirects or shows different pages
 194          final_url = response.url.lower()
 195          
 196          # Check if we got redirected to twitter.com/home (logged out view)
 197          if 'twitter.com/home' in final_url:
 198              return False
 199          
 200          # Check if we're on the user's profile
 201          if f'twitter.com/{username.lower()}' in final_url:
 202              return True
 203          
 204          # Check response content
 205          html = response.text.lower()
 206          
 207          # Twitter shows "This account doesn't exist" for non-existent users
 208          if 'this account doesn\'t exist' in html or 'account suspended' in html:
 209              return False
 210          
 211          # Check for profile elements
 212          if 'profile-header' in html or 'user-actions' in html:
 213              return True
 214          
 215          return response.status_code == 200
 216      
 217      @staticmethod
 218      def instagram_check(response: requests.Response, username: str) -> bool:
 219          """Check if Instagram profile exists."""
 220          if response.status_code != 200:
 221              return False
 222          
 223          html = response.text.lower()
 224          
 225          # Instagram shows "Sorry, this page isn't available."
 226          if 'sorry, this page isn\'t available' in html:
 227              return False
 228          
 229          # Check for profile elements
 230          if 'profile-page' in html or 'vcard' in html:
 231              return True
 232          
 233          return True
 234      
 235      @staticmethod
 236      def reddit_check(response: requests.Response, username: str) -> bool:
 237          """Check if Reddit profile exists."""
 238          if response.status_code != 200:
 239              return False
 240          
 241          html = response.text.lower()
 242          
 243          # Reddit shows "page not found" or "this user has deleted their account"
 244          if 'page not found' in html or 'this user has deleted' in html:
 245              return False
 246          
 247          # Check for user profile elements
 248          if 'user-profile' in html or f'user/{username.lower()}' in html:
 249              return True
 250          
 251          return True
 252      
 253      @staticmethod
 254      def artstation_check(response: requests.Response, username: str) -> bool:
 255          """Check if ArtStation profile exists."""
 256          if response.status_code != 200:
 257              return False
 258          
 259          html = response.text.lower()
 260          
 261          # ArtStation shows "The page you were looking for doesn't exist"
 262          if 'doesn\'t exist' in html or 'page not found' in html:
 263              return False
 264          
 265          # Check for profile elements
 266          if 'artist-header' in html or 'user-profile' in html:
 267              return True
 268          
 269          return True
 270      
 271      @staticmethod
 272      def deviantart_check(response: requests.Response, username: str) -> bool:
 273          """Check if DeviantArt profile exists."""
 274          if response.status_code != 200:
 275              return False
 276          
 277          html = response.text.lower()
 278          
 279          # DeviantArt shows "The deviation you are looking for appears to be missing"
 280          if 'deviation you are looking for' in html or 'does not exist' in html:
 281              return False
 282          
 283          return True
 284      
 285      @staticmethod
 286      def flickr_check(response: requests.Response, username: str) -> bool:
 287          """Check if Flickr profile exists."""
 288          if response.status_code != 200:
 289              return False
 290          
 291          html = response.text.lower()
 292          
 293          # Flickr shows "This member is no longer active on Flickr"
 294          if 'no longer active' in html or 'does not exist' in html:
 295              return False
 296          
 297          return True
 298      
 299      @staticmethod
 300      def _500px_check(response: requests.Response, username: str) -> bool:
 301          """Check if 500px profile exists."""
 302          if response.status_code != 200:
 303              return False
 304          
 305          html = response.text.lower()
 306          
 307          # 500px shows "The page you requested could not be found"
 308          if 'could not be found' in html:
 309              return False
 310          
 311          return True
 312      
 313      @staticmethod
 314      def bandcamp_check(response: requests.Response, username: str) -> bool:
 315          """Check if Bandcamp profile exists."""
 316          if response.status_code != 200:
 317              return False
 318          
 319          html = response.text.lower()
 320          
 321          # Bandcamp shows "Couldn't find that one"
 322          if 'couldn\'t find that one' in html:
 323              return False
 324          
 325          return True
 326      
 327      @staticmethod
 328      def keybase_check(response: requests.Response, username: str) -> bool:
 329          """Check if Keybase profile exists."""
 330          if response.status_code != 200:
 331              return False
 332          
 333          html = response.text.lower()
 334          
 335          # Keybase shows "User not found"
 336          if 'user not found' in html:
 337              return False
 338          
 339          return True
 340      
 341      @staticmethod
 342      def gitlab_check(response: requests.Response, username: str) -> bool:
 343          """Check if GitLab profile exists."""
 344          if response.status_code == 404:
 345              return False
 346          
 347          if response.status_code != 200:
 348              return True  # GitLab might redirect or show other pages
 349          
 350          html = response.text.lower()
 351          
 352          # GitLab shows "The page could not be found" for 404s
 353          if 'page could not be found' in html:
 354              return False
 355          
 356          return True
 357      
 358      @staticmethod
 359      def universal_check(response: requests.Response, username: str) -> bool:
 360          """Universal profile checker for any site."""
 361          if response.status_code != 200:
 362              return False
 363          
 364          html = response.text.lower()
 365          
 366          # Common "not found" patterns across many sites
 367          not_found_indicators = [
 368              'page not found',
 369              '404',
 370              'not found',
 371              'doesn\'t exist',
 372              'does not exist',
 373              'couldn\'t be found',
 374              'no longer available',
 375              'user not found',
 376              'profile not found',
 377              'account not found',
 378              'this page could not be found',
 379              'sorry, this page isn\'t available',
 380              'the page you were looking for',
 381              'we couldn\'t find that page',
 382          ]
 383          
 384          for indicator in not_found_indicators:
 385              if indicator in html:
 386                  return False
 387          
 388          # Check for username in page (good indicator of profile page)
 389          soup = BeautifulSoup(response.text, 'html.parser')
 390          
 391          # Check title
 392          title = soup.find('title')
 393          if title and username.lower() in title.text.lower():
 394              return True
 395          
 396          # Check meta tags
 397          for meta in soup.find_all('meta'):
 398              content = meta.get('content', '').lower()
 399              if username.lower() in content:
 400                  return True
 401          
 402          # Check for common profile elements
 403          profile_keywords = ['profile', 'user', 'member', 'account', 'avatar']
 404          page_text = soup.get_text().lower()
 405          
 406          for keyword in profile_keywords:
 407              if keyword in page_text:
 408                  # Also check if username appears near profile keyword
 409                  text_lower = page_text.replace('\n', ' ').replace('\r', ' ')
 410                  if username.lower() in text_lower:
 411                      return True
 412          
 413          # Default to True if we got a 200 and no "not found" indicators
 414          return True
 415  
 416      @staticmethod
 417      def fansfinder_check(response: requests.Response, username: str) -> bool:
 418          """Check if OnlyFans profile exists via FansFinder."""
 419          html = response.text.lower()
 420          
 421          # Check for specific indicators that the profile exists on FansFinder
 422          existence_indicators = [
 423              f'data-username="{username.lower()}"',  # Username in data attribute
 424              f'onlyfans.com/{username.lower()}',  # Profile URL in page
 425              "media.onlyfinder.com",  # OnlyFans content URLs via FansFinder
 426              "og:title",  # Open Graph tags
 427              "og:description",  # Open Graph tags
 428              "user-profile profile-container",  # Profile container
 429              "avatar-container",  # Avatar container
 430              "about-profile",  # User about section
 431              "profile-icon",  # Profile icons
 432              "img-responsive",  # Responsive images
 433          ]
 434          
 435          # Check for "profile not found" indicators
 436          not_found_indicators = [
 437              "page not found",
 438              "profile not found",
 439              "doesn't exist",
 440              "does not exist",
 441              "no longer active",
 442              "user not found",
 443              "couldn't find that profile",
 444              "this profile is not available",
 445              "no results found",
 446              "no profiles found",
 447              "0 results",
 448          ]
 449          
 450          # If we see "not found" indicators, profile doesn't exist
 451          for indicator in not_found_indicators:
 452              if indicator in html:
 453                  return False
 454          
 455          # Check for existence indicators
 456          for indicator in existence_indicators:
 457              if indicator in html:
 458                  return True
 459          
 460          # Also check for specific patterns in the HTML structure
 461          soup = BeautifulSoup(response.text, 'html.parser')
 462          
 463          # Check for the specific FansFinder profile container
 464          profile_containers = soup.find_all('div', {'class': re.compile(r'user-profile.*profile-container')})
 465          if profile_containers:
 466              for container in profile_containers:
 467                  # Check if username is in container's data attributes
 468                  data_username = container.get('data-username', '')
 469                  if username.lower() == data_username.lower():
 470                      return True
 471                  
 472                  # Check for OnlyFans link in the container
 473                  onlyfans_links = container.find_all('a', href=True)
 474                  for link in onlyfans_links:
 475                      if f'onlyfans.com/{username.lower()}' in link.get('href', '').lower():
 476                          return True
 477          
 478          # Check for avatar images
 479          avatar_images = soup.find_all('img', {'class': 'img-responsive'})
 480          if avatar_images:
 481              # Check if any image has the username in alt or title
 482              for img in avatar_images:
 483                  alt_text = img.get('alt', '').lower()
 484                  title_text = img.get('title', '').lower()
 485                  if username.lower() in alt_text or username.lower() in title_text:
 486                      return True
 487          
 488          # Check for profile headers with the username
 489          profile_headers = soup.find_all(['h1', 'h2', 'h3', 'h4'])
 490          for header in profile_headers:
 491              if username.lower() in header.get_text().lower():
 492                  return True
 493          
 494          # Check response status
 495          if response.status_code == 200:
 496              # Additional check: look for FansFinder specific elements
 497              if 'fansfinder' in response.url.lower():
 498                  # Check if we have meaningful content (not just a search page)
 499                  if len(response.text) > 5000:  # Profile pages tend to be larger
 500                      # Look for profile-specific data
 501                      if 'profile-container' in html or 'avatar-container' in html:
 502                          return True
 503          
 504          return False
 505  
 506  
 507  # ================== ENHANCED PROFILE CRAWLER ==================
 508  
 509  class EnhancedProfileCrawler:
 510      """Enhanced crawler with site-specific checks and universal image extraction."""
 511      
 512      def __init__(self, config: CrawlerConfig = None):
 513          self.config = config or CrawlerConfig()
 514          self.ua = UserAgent() if self.config.USER_AGENT_ROTATION else None
 515          self.session = requests.Session()
 516          self.session.headers.update({
 517              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
 518              'Accept-Language': 'en-US,en;q=0.5',
 519              'Accept-Encoding': 'gzip, deflate',
 520              'Connection': 'keep-alive',
 521              'Upgrade-Insecure-Requests': '1',
 522              'Cache-Control': 'no-cache',
 523              'DNT': '1',
 524          })
 525          self.checkers = SiteCheckers()
 526          self.rate_limit_cache = {}
 527          self.profile_templates = load_profile_templates(self.config.PROFILE_TEMPLATES_FILE)
 528      
 529      def get_random_user_agent(self) -> str:
 530          """Get a random user agent."""
 531          if self.ua:
 532              return self.ua.random
 533          return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
 534      
 535      def get_browser_like_headers(self) -> Dict[str, str]:
 536          """Get headers that look like a real browser."""
 537          return {
 538              'User-Agent': self.get_random_user_agent(),
 539              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
 540              'Accept-Language': 'en-US,en;q=0.5',
 541              'Accept-Encoding': 'gzip, deflate, br',
 542              'DNT': '1',
 543              'Connection': 'keep-alive',
 544              'Upgrade-Insecure-Requests': '1',
 545              'Sec-Fetch-Dest': 'document',
 546              'Sec-Fetch-Mode': 'navigate',
 547              'Sec-Fetch-Site': 'none',
 548              'Sec-Fetch-User': '?1',
 549              'Cache-Control': 'max-age=0',
 550              'TE': 'trailers',
 551          }
 552      
 553      def check_rate_limit(self, domain: str):
 554          """Rate limiting by domain."""
 555          current_time = time.time()
 556          if domain in self.rate_limit_cache:
 557              last_request = self.rate_limit_cache[domain]
 558              elapsed = current_time - last_request
 559              if elapsed < self.config.RATE_LIMIT_DELAY:
 560                  sleep_time = self.config.RATE_LIMIT_DELAY - elapsed
 561                  if self.config.VERBOSE:
 562                      print(f"  ⏳ Rate limiting: waiting {sleep_time:.1f}s for {domain}")
 563                  time.sleep(sleep_time)
 564          
 565          self.rate_limit_cache[domain] = current_time
 566      
 567      def is_valid_avatar(self, url: str, img_element) -> bool:
 568          """Universal check if an image is likely a valid avatar."""
 569          url_lower = url.lower()
 570          
 571          # Skip known placeholder/blank avatars
 572          placeholder_keywords = [
 573              'default', 'placeholder', 'anonymous', 'unknown', 
 574              'ghost', 'blank', 'null', 'empty', 'none',
 575              'no-avatar', 'no-avatar.jpg', 'no-photo', 'no-image',
 576              'default_avatar', 'default-profile', 'default-user',
 577              'gravatar.com/avatar/?',  # Empty gravatar
 578              'identicon', 'monsterid', 'wavatar', 'retro',  # GitHub defaults
 579              '0.jpg', '0.png', '0.gif',  # Zero filenames
 580          ]
 581          
 582          for keyword in placeholder_keywords:
 583              if keyword in url_lower:
 584                  return False
 585          
 586          # Check image element attributes
 587          alt_text = (img_element.get('alt') or '').lower()
 588          for keyword in placeholder_keywords:
 589              if keyword in alt_text:
 590                  return False
 591          
 592          title_text = (img_element.get('title') or '').lower()
 593          for keyword in placeholder_keywords:
 594              if keyword in title_text:
 595                  return False
 596          
 597          # Check for common placeholder dimensions (very small images)
 598          try:
 599              width_attr = img_element.get('width', '')
 600              height_attr = img_element.get('height', '')
 601              
 602              if width_attr and height_attr:
 603                  width = int(''.join(filter(str.isdigit, width_attr)) or '0')
 604                  height = int(''.join(filter(str.isdigit, height_attr)) or '0')
 605                  
 606                  # Skip very small images (likely icons, not avatars)
 607                  if width < 32 or height < 32:
 608                      return False
 609          except:
 610              pass
 611          
 612          # Check for common placeholder class names
 613          img_class = ' '.join(img_element.get('class', [])).lower()
 614          placeholder_classes = [
 615              'placeholder', 'default', 'empty', 'blank',
 616              'no-avatar', 'no-image', 'avatar-placeholder'
 617          ]
 618          
 619          for p_class in placeholder_classes:
 620              if p_class in img_class:
 621                  return False
 622          
 623          # Platform-specific checks
 624          # GitHub
 625          if 'github' in url_lower:
 626              if any(x in url_lower for x in ['identicon', 'monsterid', 'retro', 'wavatar']):
 627                  return False
 628          
 629          # Gravatar
 630          if 'gravatar.com/avatar/' in url_lower:
 631              # Check for MD5 hash length (32 chars) - empty gravatars have short or no hash
 632              import re
 633              match = re.search(r'gravatar\.com/avatar/([a-fA-F0-9]+)', url_lower)
 634              if match:
 635                  hash_value = match.group(1)
 636                  if len(hash_value) < 32:  # Not a proper MD5 hash
 637                      return False
 638          
 639          return True
 640      
 641      def get_image_src(self, img_element) -> Optional[str]:
 642          """Get image source from img element with priority order."""
 643          # List of attributes to check in order of priority
 644          src_attrs = ['src', 'data-src', 'data-original', 'data-lazy-src', 'data-lazyload', 'srcset']
 645          
 646          for attr in src_attrs:
 647              src = img_element.get(attr)
 648              if not src:
 649                  continue
 650              
 651              # Clean and return the URL
 652              if attr == 'srcset':
 653                  # Handle srcset - take the first (usually highest quality) image
 654                  srcset_parts = src.split(',')
 655                  if srcset_parts:
 656                      first_part = srcset_parts[0].strip()
 657                      url = first_part.split(' ')[0].strip()
 658                      return url if url else None
 659              else:
 660                  # For other attributes, return directly
 661                  return src.strip()
 662          
 663          return None
 664      
 665      def extract_fansfinder_avatar(self, html: str, base_url: str, username: str) -> List[str]:
 666          """Extract avatar from FansFinder profile page for specific username."""
 667          soup = BeautifulSoup(html, 'html.parser')
 668          image_urls = set()
 669          
 670          # Look for the specific avatar container structure
 671          avatar_containers = soup.find_all('div', {'class': 'avatar-container'})
 672          
 673          for container in avatar_containers:
 674              # Check if this container belongs to our username
 675              # Look for data-username attribute in parent containers
 676              parent = container.find_parent('div', {'data-username': username.lower()})
 677              if parent:
 678                  # This container belongs to our username
 679                  images = container.find_all('img')
 680                  for img in images:
 681                      src = self.get_image_src(img)
 682                      if src:
 683                          try:
 684                              full_url = urljoin(base_url, src)
 685                              if self.is_valid_avatar(full_url, img):
 686                                  image_urls.add(full_url)
 687                          except Exception as e:
 688                              if self.config.VERBOSE:
 689                                  print(f"    [!] URL join error: {e}")
 690          
 691          # Also look for images with username patterns in URLs and attributes
 692          for img in soup.find_all('img'):
 693              src = self.get_image_src(img)
 694              if not src:
 695                  continue
 696              
 697              src_lower = src.lower()
 698              username_lower = username.lower()
 699              
 700              # Check if image URL contains the username
 701              if username_lower in src_lower:
 702                  # Check for specific patterns
 703                  patterns = [
 704                      f'{username_lower}-onlyfans.',
 705                      f'{username_lower}_onlyfans.',
 706                      f'/{username_lower}/',
 707                      f'/{username_lower}-',
 708                  ]
 709                  
 710                  for pattern in patterns:
 711                      if pattern in src_lower:
 712                          try:
 713                              full_url = urljoin(base_url, src)
 714                              if self.is_valid_avatar(full_url, img):
 715                                  image_urls.add(full_url)
 716                                  break
 717                          except Exception as e:
 718                              if self.config.VERBOSE:
 719                                  print(f"    [!] URL join error: {e}")
 720              
 721              # Check alt and title attributes
 722              alt = img.get('alt', '').lower()
 723              title = img.get('title', '').lower()
 724              
 725              # If alt or title contains the username and "onlyfans"
 726              if username_lower in alt and 'onlyfans' in alt:
 727                  try:
 728                      full_url = urljoin(base_url, src)
 729                      if self.is_valid_avatar(full_url, img):
 730                          image_urls.add(full_url)
 731                  except Exception as e:
 732                      if self.config.VERBOSE:
 733                          print(f"    [!] URL join error: {e}")
 734              
 735              if username_lower in title and 'onlyfans' in title:
 736                  try:
 737                      full_url = urljoin(base_url, src)
 738                      if self.is_valid_avatar(full_url, img):
 739                          image_urls.add(full_url)
 740                  except Exception as e:
 741                      if self.config.VERBOSE:
 742                          print(f"    [!] URL join error: {e}")
 743          
 744          # If we still don't have images, look for the most likely profile image
 745          if not image_urls:
 746              # Look for images with img-responsive class
 747              for img in soup.find_all('img', {'class': 'img-responsive'}):
 748                  src = self.get_image_src(img)
 749                  if src:
 750                      try:
 751                          full_url = urljoin(base_url, src)
 752                          if self.is_valid_avatar(full_url, img):
 753                              image_urls.add(full_url)
 754                      except Exception as e:
 755                          if self.config.VERBOSE:
 756                              print(f"    [!] URL join error: {e}")
 757          
 758          return list(image_urls)
 759      
 760      def check_profile_with_cf_bypass(self, url: str, platform: str, username: str) -> Dict[str, Any]:
 761          """Check profile with Cloudflare bypass attempts."""
 762          domain = urlparse(url).netloc
 763          self.check_rate_limit(domain)
 764          
 765          time.sleep(random.uniform(*self.config.DELAY))
 766          
 767          try:
 768              headers = self.get_browser_like_headers()
 769              
 770              response = self.session.get(
 771                  url,
 772                  headers=headers,
 773                  timeout=self.config.TIMEOUT,
 774                  allow_redirects=True,
 775                  stream=False
 776              )
 777              
 778              # For OnlyFans specifically, use fansfinder_check
 779              exists = False
 780              if platform == "onlyfans":
 781                  exists = self.checkers.fansfinder_check(response, username)
 782              else:
 783                  # Use standard check for other platforms
 784                  platform_config = self.profile_templates.get(platform, {})
 785                  check_method = platform_config.get("check_method", "status_code")
 786                  
 787                  if check_method == "status_code":
 788                      exists = response.status_code == 200
 789                  else:
 790                      # Use the appropriate checker method
 791                      check_method_name = f"{check_method}"
 792                      if hasattr(self.checkers, check_method_name):
 793                          checker_func = getattr(self.checkers, check_method_name)
 794                          exists = checker_func(response, username)
 795              
 796              # Extract images if profile exists
 797              image_urls = []
 798              if exists:
 799                  platform_config = self.profile_templates.get(platform, {})
 800                  if platform == "onlyfans":
 801                      # Use the updated method that takes username
 802                      image_urls = self.extract_fansfinder_avatar(response.text, url, username)
 803                  else:
 804                      image_urls = self.extract_images(response.text, url, platform_config)
 805              
 806              result = {
 807                  "exists": exists,
 808                  "status_code": response.status_code,
 809                  "url": response.url,
 810                  "image_urls": image_urls,
 811                  "error": None,
 812                  "platform": platform,
 813                  "username": username,
 814                  "final_url": response.url,
 815                  "content_length": len(response.text),
 816                  "cf_protected": "cf-ray" in response.headers  # Indicate if Cloudflare was detected
 817              }
 818              
 819              return result
 820              
 821          except Exception as e:
 822              return {
 823                  "exists": False,
 824                  "status_code": 0,
 825                  "url": url,
 826                  "image_urls": [],
 827                  "error": str(e),
 828                  "platform": platform,
 829                  "username": username
 830              }
 831      
 832      def check_profile(self, url: str, platform: str, username: str) -> Dict[str, Any]:
 833          """Check if a profile exists with site-specific logic."""
 834          # Use special handling for OnlyFans (using FansFinder)
 835          if platform == "onlyfans":
 836              return self.check_profile_with_cf_bypass(url, platform, username)
 837          
 838          domain = urlparse(url).netloc
 839          self.check_rate_limit(domain)
 840          
 841          # Random delay to avoid detection
 842          time.sleep(random.uniform(*self.config.DELAY))
 843          
 844          try:
 845              # Update headers for this request
 846              headers = {'User-Agent': self.get_random_user_agent()}
 847              
 848              response = self.session.get(
 849                  url,
 850                  headers=headers,
 851                  timeout=self.config.TIMEOUT,
 852                  allow_redirects=True,
 853                  stream=False
 854              )
 855              
 856              # Get platform configuration
 857              platform_config = self.profile_templates.get(platform, {})
 858              if isinstance(platform_config, str):
 859                  platform_config = {"url": platform_config, "check_method": "status_code"}
 860              
 861              check_method = platform_config.get("check_method", "status_code")
 862              exists = False
 863              
 864              # Use appropriate check method
 865              if check_method == "status_code":
 866                  exists = response.status_code == 200
 867              elif check_method == "github_check":
 868                  exists = self.checkers.github_check(response, username)
 869              elif check_method == "twitter_check":
 870                  exists = self.checkers.twitter_check(response, username)
 871              elif check_method == "instagram_check":
 872                  exists = self.checkers.instagram_check(response, username)
 873              elif check_method == "reddit_check":
 874                  exists = self.checkers.reddit_check(response, username)
 875              elif check_method == "stackoverflow_check":
 876                  exists = self.checkers.stackoverflow_check(response, username)
 877              elif check_method == "artstation_check":
 878                  exists = self.checkers.artstation_check(response, username)
 879              elif check_method == "deviantart_check":
 880                  exists = self.checkers.deviantart_check(response, username)
 881              elif check_method == "flickr_check":
 882                  exists = self.checkers.flickr_check(response, username)
 883              elif check_method == "500px_check":
 884                  exists = self.checkers._500px_check(response, username)
 885              elif check_method == "bandcamp_check":
 886                  exists = self.checkers.bandcamp_check(response, username)
 887              elif check_method == "keybase_check":
 888                  exists = self.checkers.keybase_check(response, username)
 889              elif check_method == "gitlab_check":
 890                  exists = self.checkers.gitlab_check(response, username)
 891              elif check_method == "universal_check":
 892                  exists = self.checkers.universal_check(response, username)
 893              elif check_method == "fansfinder_check":
 894                  exists = self.checkers.fansfinder_check(response, username)
 895              else:
 896                  # Default: status code 200
 897                  exists = response.status_code == 200
 898              
 899              # Extract images if profile exists
 900              image_urls = []
 901              if exists:
 902                  image_urls = self.extract_images(response.text, url, platform_config, username)  # Added username parameter
 903              
 904              result = {
 905                  "exists": exists,
 906                  "status_code": response.status_code,
 907                  "url": response.url,  # Use final URL after redirects
 908                  "image_urls": image_urls,
 909                  "error": None,
 910                  "platform": platform,
 911                  "username": username,
 912                  "final_url": response.url,
 913                  "content_length": len(response.text)
 914              }
 915              
 916              return result
 917              
 918          except requests.exceptions.Timeout:
 919              return {
 920                  "exists": False,
 921                  "status_code": 408,
 922                  "url": url,
 923                  "image_urls": [],
 924                  "error": "Timeout",
 925                  "platform": platform,
 926                  "username": username
 927              }
 928          except requests.exceptions.ConnectionError:
 929              return {
 930                  "exists": False,
 931                  "status_code": 0,
 932                  "url": url,
 933                  "image_urls": [],
 934                  "error": "Connection error",
 935                  "platform": platform,
 936                  "username": username
 937              }
 938          except Exception as e:
 939              return {
 940                  "exists": False,
 941                  "status_code": 0,
 942                  "url": url,
 943                  "image_urls": [],
 944                  "error": str(e),
 945                  "platform": platform,
 946                  "username": username
 947              }
 948      
 949      def extract_images(self, html: str, base_url: str, platform_config: Dict, username: str = None) -> List[str]:
 950          """Universal image extraction that works with any site."""
 951          soup = BeautifulSoup(html, 'html.parser')
 952          image_urls = set()
 953          
 954          # Get platform name for specific handling if needed
 955          platform_name = platform_config.get("platform", "")
 956          platform_url = platform_config.get("url", "")
 957          
 958          # Special handling for OnlyFans/FansFinder
 959          if platform_name == "onlyfans" and username:
 960              fansfinder_images = self.extract_fansfinder_avatar(html, base_url, username)
 961              image_urls.update(fansfinder_images)
 962          
 963          # Phase 1: Try platform-specific selector first
 964          avatar_selector = platform_config.get("avatar_selector", "")
 965          if avatar_selector:
 966              try:
 967                  for img in soup.select(avatar_selector):
 968                      src = self.get_image_src(img)
 969                      if src:
 970                          try:
 971                              full_url = urljoin(base_url, src)
 972                              if self.is_valid_avatar(full_url, img):
 973                                  image_urls.add(full_url)
 974                          except Exception as e:
 975                              if self.config.VERBOSE:
 976                                  print(f"    [!] URL join error: {e}")
 977              except Exception as e:
 978                  if self.config.VERBOSE:
 979                      print(f"    [!] Error with selector {avatar_selector}: {e}")
 980          
 981          # Phase 2: Universal avatar detection patterns
 982          if not image_urls:
 983              # Pattern 1: Look for common avatar class patterns
 984              avatar_patterns = [
 985                  r'.*avatar.*',  # Contains 'avatar'
 986                  r'.*profile.*',  # Contains 'profile'
 987                  r'.*user.*',  # Contains 'user'
 988                  r'.*photo.*',  # Contains 'photo'
 989                  r'.*pic.*',  # Contains 'pic'
 990                  r'.*image.*',  # Contains 'image'
 991              ]
 992              
 993              for img in soup.find_all('img'):
 994                  # Check class attribute
 995                  img_class = ' '.join(img.get('class', []))
 996                  if any(re.search(pattern, img_class, re.IGNORECASE) for pattern in avatar_patterns):
 997                      src = self.get_image_src(img)
 998                      if src:
 999                          try:
1000                              full_url = urljoin(base_url, src)
1001                              if self.is_valid_avatar(full_url, img):
1002                                  image_urls.add(full_url)
1003                          except:
1004                              pass
1005                  
1006                  # Check id attribute
1007                  img_id = img.get('id', '')
1008                  if any(re.search(pattern, img_id, re.IGNORECASE) for pattern in avatar_patterns):
1009                      src = self.get_image_src(img)
1010                      if src:
1011                          try:
1012                              full_url = urljoin(base_url, src)
1013                              if self.is_valid_avatar(full_url, img):
1014                                  image_urls.add(full_url)
1015                          except:
1016                              pass
1017                  
1018                  # Check alt attribute
1019                  img_alt = img.get('alt', '').lower()
1020                  if any(keyword in img_alt for keyword in ['profile', 'avatar', 'user', 'photo', 'picture']):
1021                      src = self.get_image_src(img)
1022                      if src:
1023                          try:
1024                              full_url = urljoin(base_url, src)
1025                              if self.is_valid_avatar(full_url, img):
1026                                  image_urls.add(full_url)
1027                          except:
1028                              pass
1029          
1030          # Phase 3: Look for meta tags (social sharing images)
1031          meta_selectors = [
1032              'meta[property="og:image"]',
1033              'meta[name="og:image"]',
1034              'meta[property="twitter:image"]',
1035              'meta[name="twitter:image"]',
1036              'meta[itemprop="image"]',
1037              'meta[name="image"]',
1038          ]
1039          
1040          for selector in meta_selectors:
1041              for meta in soup.select(selector):
1042                  content = meta.get('content')
1043                  if content:
1044                      try:
1045                          full_url = urljoin(base_url, content)
1046                          # Check if it looks like a profile image
1047                          if self.config.VERBOSE:
1048                              print(f"    📱 Found meta image: {full_url}")
1049                          image_urls.add(full_url)
1050                      except:
1051                          pass
1052          
1053          # Phase 4: Check all images with common avatar filename patterns
1054          if not image_urls:
1055              avatar_filename_patterns = [
1056                  r'.*avatar.*\.(jpg|jpeg|png|gif|webp)$',
1057                  r'.*profile.*\.(jpg|jpeg|png|gif|webp)$',
1058                  r'.*user.*\.(jpg|jpeg|png|gif|webp)$',
1059                  r'.*pic.*\.(jpg|jpeg|png|gif|webp)$',
1060                  r'.*photo.*\.(jpg|jpeg|png|gif|webp)$',
1061                  r'.*pfp.*\.(jpg|jpeg|png|gif|webp)$',  # pfp = profile picture
1062                  r'.*me.*\.(jpg|jpeg|png|gif|webp)$',
1063              ]
1064              
1065              for img in soup.find_all('img'):
1066                  src = self.get_image_src(img)
1067                  if not src:
1068                      continue
1069                  
1070                  # Extract filename from URL
1071                  filename = src.split('/')[-1].split('?')[0].lower()
1072                  
1073                  # Check if filename matches avatar patterns
1074                  if any(re.search(pattern, filename, re.IGNORECASE) for pattern in avatar_filename_patterns):
1075                      try:
1076                          full_url = urljoin(base_url, src)
1077                          if self.is_valid_avatar(full_url, img):
1078                              image_urls.add(full_url)
1079                      except:
1080                          pass
1081          
1082          # Phase 5: Check all images with common avatar URL patterns
1083          if not image_urls:
1084              avatar_url_patterns = [
1085                  r'.*\/avatar\/.*',
1086                  r'.*\/profile\/.*',
1087                  r'.*\/user\/.*',
1088                  r'.*\/photo\/.*',
1089                  r'gravatar\.com\/avatar\/.*',
1090                  r'avatars\..*\.com\/.*',
1091                  r'cdn\.discordapp\.com\/avatars\/.*',
1092                  r'ugc\.production\.linktr\.ee\/.*',  # Linktree CDN
1093                  r'pbs\.twimg\.com\/profile_images\/.*',  # Twitter
1094                  r'instagram\.fbom.*\.fna\.fbcdn\.net\/.*',  # Instagram
1095                  r'i\.redd\.it\/.*',  # Reddit
1096                  r'i\.imgur\.com\/.*',  # Imgur
1097                  r'media\.onlyfinder\.com\/.*',  # FansFinder/OnlyFans CDN
1098              ]
1099              
1100              for img in soup.find_all('img'):
1101                  src = self.get_image_src(img)
1102                  if not src:
1103                      continue
1104                  
1105                  # Check if URL matches avatar patterns
1106                  if any(re.search(pattern, src, re.IGNORECASE) for pattern in avatar_url_patterns):
1107                      try:
1108                          full_url = urljoin(base_url, src)
1109                          if self.is_valid_avatar(full_url, img):
1110                              image_urls.add(full_url)
1111                      except:
1112                          pass
1113          
1114          # Phase 6: Fallback - take first few images that look reasonable
1115          if not image_urls:
1116              for img in soup.find_all('img')[:10]:  # Limit to first 10 images
1117                  src = self.get_image_src(img)
1118                  if not src:
1119                      continue
1120                  
1121                  try:
1122                      full_url = urljoin(base_url, src)
1123                      
1124                      # Check if it's a reasonable size (not an icon)
1125                      img_width = img.get('width')
1126                      img_height = img.get('height')
1127                      
1128                      # If dimensions are specified, check if it's avatar-sized
1129                      if img_width and img_height:
1130                          width = int(img_width) if img_width.isdigit() else 0
1131                          height = int(img_height) if img_height.isdigit() else 0
1132                          
1133                          # Avatars are usually square-ish and not tiny
1134                          if width > 50 and height > 50:
1135                              if self.is_valid_avatar(full_url, img):
1136                                  image_urls.add(full_url)
1137                      else:
1138                          # No dimensions, just add it
1139                          if self.is_valid_avatar(full_url, img):
1140                              image_urls.add(full_url)
1141                              
1142                  except:
1143                      pass
1144          
1145          # Filter and clean URLs
1146          filtered_urls = []
1147          for url in image_urls:
1148              # Skip data URIs and javascript
1149              if url.startswith(('data:', 'javascript:')):
1150                  continue
1151              
1152              # Remove query parameters that might cause issues
1153              try:
1154                  parsed = urlparse(url)
1155                  clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
1156                  
1157                  # Check if it's likely an image
1158                  path_lower = parsed.path.lower()
1159                  has_image_ext = any(path_lower.endswith(ext) for ext in self.config.VALID_IMAGE_EXTENSIONS)
1160                  
1161                  # Expanded list with more mainstream and frequently used hosts
1162                  is_known_avatar_host = any(
1163                      host in clean_url.lower() 
1164                      for host in [
1165                          'avatars.githubusercontent.com',      # GitHub
1166                          'gravatar.com',                      # Gravatar
1167                          'avatar.trakt.tv',                   # Trakt
1168                          'ugc.production.linktr.ee',          # Linktree
1169                          'cdn.discordapp.com',                # Discord
1170                          'pbs.twimg.com/profile_images',       # Twitter/X
1171                          'instagram.fbom1-2.fna.fbcdn.net',   # Instagram (Facebook CDN)
1172                          'scontent.cdninstagram.com',         # Instagram
1173                          'i.redd.it',                         # Reddit
1174                          'i.imgur.com',                       # Imgur
1175                          'public.onlyfans.com/files',         # OnlyFans
1176                          'media.onlyfinder.com',              # FansFinder/OnlyFans CDN
1177                          # Additional mainstream hosts
1178                          'platform.twitter.com',              # Twitter CDN variant
1179                          'abs.twimg.com',                      # Twitter avatars
1180                          'lh3.googleusercontent.com',          # Google/YouTube
1181                          'yt3.ggpht.com',                     # YouTube
1182                          'a0.muscdn.com',                     # SoundCloud
1183                          'i1.sndcdn.com',                     # SoundCloud
1184                          'a.pomf.lol',                        # Pomf.cat (meme culture)
1185                          'pbs.twimg.com/media',               # Twitter media (profile pics often here)
1186                          'via.placeholder.com',               # Common placeholder service
1187                          'ui-avatars.com',                    # Generated avatars
1188                          'robohash.org',                      # Robot avatars
1189                          'identicons.github.com',             # GitHub identicons
1190                          'secure.gravatar.com/avatar',        # Gravatar secure
1191                          'steamcdn-a.akamaihd.net',           # Steam
1192                          'steamuserimages-a.akamaihd.net',    # Steam
1193                          'avatar-management--avatars.us-west-2',  # Twitch
1194                          'static-cdn.jtvnw.net',              # Twitch
1195                          'tiktokcdn.com',                     # TikTok
1196                          'byteimg.com',                       # TikTok CDN
1197                          'ssl-profile-images-cdn.viago.co',   # LinkedIn variant
1198                          'media.licdn.com/dms/image',          # LinkedIn
1199                          'https://media.licdn.com/dms/image/v2/'
1200                      ]
1201                  )
1202                  
1203                  if has_image_ext or is_known_avatar_host:
1204                      filtered_urls.append(clean_url)
1205              except:
1206                  continue
1207          
1208          # Return unique URLs, limited to reasonable number
1209          return list(set(filtered_urls))[:10]  # Return up to 10 unique images
1210      
1211      def crawl_usernames(self, usernames: List[str], platforms: List[str] = None) -> Dict[str, List[Dict]]:
1212          """Crawl multiple usernames across platforms."""
1213          if platforms is None:
1214              platforms = get_enabled_platforms(self.profile_templates)
1215          
1216          results = {username: [] for username in usernames}
1217          
1218          with ThreadPoolExecutor(max_workers=min(self.config.MAX_WORKERS, len(platforms))) as executor:
1219              futures = []
1220              
1221              for username in usernames:
1222                  username = username.strip()
1223                  if not username:
1224                      continue
1225                  
1226                  for platform in platforms:
1227                      if platform not in self.profile_templates:
1228                          continue
1229                      
1230                      platform_config = self.profile_templates[platform]
1231                      if isinstance(platform_config, str):
1232                          url = platform_config.format(username)
1233                      else:
1234                          url = platform_config.get("url", "").format(username)
1235                      
1236                      if not url:
1237                          continue
1238                      
1239                      future = executor.submit(
1240                          self.check_profile,
1241                          url, platform, username
1242                      )
1243                      futures.append((future, username, platform, url))
1244              
1245              # Process results
1246              completed = 0
1247              total = len(futures)
1248              
1249              for future, username, platform, url in futures:
1250                  try:
1251                      result = future.result(timeout=self.config.TIMEOUT + 10)
1252                      results[username].append(result)
1253                      completed += 1
1254                      
1255                      if self.config.VERBOSE:
1256                          status = "✅" if result["exists"] else "❌"
1257                          images = f" ({len(result['image_urls'])} img)" if result["image_urls"] else ""
1258                          error = f" - {result['error']}" if result["error"] else ""
1259                          print(f"  {status} [{completed}/{total}] {platform}: {result['exists']}{images}{error}")
1260                          
1261                  except Exception as e:
1262                      if self.config.VERBOSE:
1263                          print(f"  ❌ {platform}: Error - {e}")
1264          
1265          return results
1266  
1267  
1268  # ================== IMAGE PROCESSING ==================
1269  
1270  def get_image_bytes(source: str, max_size_mb: int = 5, timeout: int = 10) -> Optional[bytes]:
1271      """Download image with error handling."""
1272      try:
1273          if source.startswith("data:"):
1274              b64_data = source.split(",", 1)[1]
1275              return base64.b64decode(b64_data)
1276          elif source.startswith("http://") or source.startswith("https://"):
1277              headers = {
1278                  'User-Agent': UserAgent().random,
1279                  'Accept': 'image/*,*/*;q=0.8',
1280              }
1281              
1282              response = requests.get(
1283                  source, 
1284                  headers=headers, 
1285                  timeout=timeout, 
1286                  stream=True,
1287                  allow_redirects=True
1288              )
1289              response.raise_for_status()
1290              
1291              # Check content type
1292              content_type = response.headers.get('Content-Type', '').lower()
1293              if content_type and not any(x in content_type for x in ['image/', 'octet-stream', 'binary']):
1294                  return None
1295              
1296              # Read in chunks
1297              content = b''
1298              max_bytes = max_size_mb * 1024 * 1024
1299              
1300              for chunk in response.iter_content(chunk_size=8192):
1301                  content += chunk
1302                  if len(content) > max_bytes:
1303                      return None
1304              
1305              return content
1306          else:
1307              if not os.path.exists(source):
1308                  return None
1309              
1310              with open(source, "rb") as f:
1311                  return f.read()
1312      except Exception:
1313          return None
1314  
1315  
1316  def compute_face_encoding(image_bytes: bytes) -> Optional[np.ndarray]:
1317      """Extract face encoding from image."""
1318      try:
1319          image = Image.open(BytesIO(image_bytes))
1320          if image.mode != 'RGB':
1321              image = image.convert('RGB')
1322          
1323          rgb_image = np.array(image)
1324          
1325          # Try face detection
1326          face_locations = face_recognition.face_locations(rgb_image, model="hog")
1327          if not face_locations:
1328              return None
1329          
1330          encodings = face_recognition.face_encodings(rgb_image, face_locations)
1331          return encodings[0] if encodings else None
1332          
1333      except Exception:
1334          return None
1335  
1336  
1337  # ================== FACE INDEX SYSTEM ==================
1338  
1339  class FaceIndexSystem:
1340      """Face indexing system."""
1341      
1342      def __init__(self):
1343          self.faces = []
1344          self.config = CrawlerConfig()
1345      
1346      def index_from_results(self, crawl_results: Dict[str, List[Dict]]) -> List[Dict]:
1347          """Index faces from crawl results."""
1348          new_faces = []
1349          
1350          for username, results in crawl_results.items():
1351              for result in results:
1352                  if not result["exists"]:
1353                      continue
1354                  
1355                  for image_url in result["image_urls"][:2]:  # Try first 2 images
1356                      try:
1357                          img_bytes = get_image_bytes(image_url)
1358                          if not img_bytes:
1359                              continue
1360                          
1361                          encoding = compute_face_encoding(img_bytes)
1362                          if encoding is None:
1363                              continue
1364                          
1365                          face_record = {
1366                              "username": username,
1367                              "platform": result["platform"],
1368                              "page_url": result["url"],
1369                              "image_url": image_url,
1370                              "encoding": encoding.tolist(),
1371                              "timestamp": time.time()
1372                          }
1373                          
1374                          self.faces.append(face_record)
1375                          new_faces.append(face_record)
1376                          
1377                          if self.config.VERBOSE:
1378                              print(f"    👤 Face indexed: {username}@{result['platform']}")
1379                          
1380                      except Exception as e:
1381                          if self.config.VERBOSE:
1382                              print(f"    [!] Error: {e}")
1383          
1384          return new_faces
1385      
1386      def search_faces(self, target_encoding: np.ndarray, threshold: float = 0.6, top_k: int = 10) -> List[Dict]:
1387          """Search for similar faces."""
1388          results = []
1389          
1390          for face in self.faces:
1391              try:
1392                  face_encoding = np.array(face["encoding"])
1393                  distance = float(face_recognition.face_distance([target_encoding], face_encoding)[0])
1394                  similarity = max(0.0, 1.0 - min(distance, 1.0))
1395                  
1396                  results.append({
1397                      "username": face["username"],
1398                      "platform": face["platform"],
1399                      "similarity": similarity,
1400                      "distance": distance,
1401                      "match": distance < threshold,
1402                      "page_url": face["page_url"],
1403                      "image_url": face["image_url"]
1404                  })
1405              except Exception:
1406                  continue
1407          
1408          results.sort(key=lambda x: x["similarity"], reverse=True)
1409          return results[:top_k]
1410      
1411      def save_index(self, filename: str = "face_index.json"):
1412          """Save index to file."""
1413          data = {
1414              "faces": self.faces,
1415              "metadata": {
1416                  "total": len(self.faces),
1417                  "timestamp": time.time()
1418              }
1419          }
1420          
1421          with open(filename, 'w') as f:
1422              json.dump(data, f, indent=2)
1423          
1424          print(f"💾 Saved {len(self.faces)} faces to {filename}")
1425      
1426      def load_index(self, filename: str = "face_index.json"):
1427          """Load index from file."""
1428          try:
1429              with open(filename, 'r') as f:
1430                  data = json.load(f)
1431              
1432              self.faces = data.get("faces", [])
1433              print(f"📂 Loaded {len(self.faces)} faces from {filename}")
1434              return True
1435          except Exception as e:
1436              print(f"❌ Error loading: {e}")
1437              return False
1438  
1439  
1440  # ================== NEW FUNCTIONS FOR URI FACE COMPARISON ==================
1441  
1442  def compare_face_from_uri(face_system, uri: str, username: str = None, save_to_db: bool = False):
1443      """Compare a face from a URI (URL or local path) with indexed faces."""
1444      print(f"\n🔍 Comparing face from URI: {uri}")
1445      
1446      # Load target image
1447      target_bytes = get_image_bytes(uri)
1448      if not target_bytes:
1449          print("❌ Could not load image from URI")
1450          return
1451      
1452      # Check if it's a valid image
1453      try:
1454          Image.open(BytesIO(target_bytes)).verify()
1455      except Exception:
1456          print("❌ Invalid image file")
1457          return
1458      
1459      # Extract face encoding
1460      print("🧬 Extracting face encoding...")
1461      target_encoding = compute_face_encoding(target_bytes)
1462      if target_encoding is None:
1463          print("❌ No face detected in the image")
1464          return
1465      
1466      print("✅ Face encoding extracted successfully")
1467      
1468      # Get comparison parameters
1469      try:
1470          threshold = float(input("Match threshold (0.1-1.0, default 0.6): ") or "0.6")
1471          top_k = int(input("Number of results to show (default 20): ") or "20")
1472      except ValueError:
1473          threshold = 0.6
1474          top_k = 20
1475      
1476      # Search for matches
1477      print(f"\n🔍 Searching {len(face_system.faces)} indexed faces...")
1478      matches = face_system.search_faces(target_encoding, threshold, top_k)
1479      
1480      if not matches:
1481          print("❌ No matches found")
1482          return matches
1483      
1484      # Display results
1485      print(f"\n🏆 Top {len(matches)} matches:")
1486      for i, match in enumerate(matches, 1):
1487          symbol = "✅" if match["match"] else "⚠️"
1488          similarity_percent = match['similarity'] * 100
1489          
1490          # Color code based on similarity
1491          if similarity_percent >= 80:
1492              similarity_str = f"🎯 {similarity_percent:.1f}%"
1493          elif similarity_percent >= 60:
1494              similarity_str = f"🔍 {similarity_percent:.1f}%"
1495          else:
1496              similarity_str = f"📊 {similarity_percent:.1f}%"
1497          
1498          print(f"\n  {i}. {symbol} {similarity_str}")
1499          print(f"     User: {match['username']}")
1500          print(f"     Platform: {match['platform']}")
1501          print(f"     Distance: {match['distance']:.4f}")
1502          
1503          if match['image_url']:
1504              print(f"     Image: {match['image_url'][:80]}...")
1505      
1506      # Option to save the face to database
1507      if save_to_db and username:
1508          save_face_to_db(face_system, target_encoding, uri, username, uri)
1509          print(f"✅ Face saved to database with username: {username}")
1510      
1511      return matches
1512  
1513  
1514  def save_face_to_db(face_system, encoding: np.ndarray, image_url: str, username: str, page_url: str = None, platform: str = "direct_uri"):
1515      """Save a face to the database directly."""
1516      face_record = {
1517          "username": username,
1518          "platform": platform,
1519          "page_url": page_url or image_url,
1520          "image_url": image_url,
1521          "encoding": encoding.tolist(),
1522          "timestamp": time.time(),
1523          "source": "direct_uri"
1524      }
1525      
1526      face_system.faces.append(face_record)
1527      return face_record
1528  
1529  
1530  def batch_compare_from_file(face_system, filename: str):
1531      """Compare faces from a file containing URIs and usernames."""
1532      if not os.path.exists(filename):
1533          print(f"❌ File '{filename}' not found")
1534          return
1535      
1536      try:
1537          with open(filename, 'r') as f:
1538              lines = f.readlines()
1539      except Exception as e:
1540          print(f"❌ Error reading file: {e}")
1541          return
1542      
1543      print(f"\n📄 Processing {len(lines)} entries from {filename}...")
1544      
1545      results = []
1546      for line_num, line in enumerate(lines, 1):
1547          line = line.strip()
1548          if not line or line.startswith('#'):
1549              continue
1550          
1551          parts = line.split(',')
1552          if len(parts) >= 2:
1553              uri = parts[0].strip()
1554              username = parts[1].strip()
1555              
1556              print(f"\n[{line_num}] Processing {username} - {uri}")
1557              
1558              # Get the face
1559              target_bytes = get_image_bytes(uri)
1560              if not target_bytes:
1561                  print(f"  ❌ Could not load image")
1562                  continue
1563              
1564              target_encoding = compute_face_encoding(target_bytes)
1565              if target_encoding is None:
1566                  print(f"  ❌ No face detected")
1567                  continue
1568              
1569              # Search for matches
1570              matches = face_system.search_faces(target_encoding, threshold=0.6, top_k=5)
1571              
1572              if matches:
1573                  best_match = matches[0]
1574                  results.append({
1575                      'uri': uri,
1576                      'username': username,
1577                      'best_match': best_match['username'],
1578                      'similarity': best_match['similarity'],
1579                      'platform': best_match['platform']
1580                  })
1581                  
1582                  print(f"  🔍 Best match: {best_match['username']} ({best_match['similarity']:.3f})")
1583              else:
1584                  print(f"  ⚠️ No matches found")
1585      
1586      # Save results
1587      if results:
1588          output_file = f"comparison_results_{int(time.time())}.json"
1589          with open(output_file, 'w') as f:
1590              json.dump(results, f, indent=2)
1591          print(f"\n💾 Results saved to {output_file}")
1592      
1593      return results
1594  
1595  
1596  def extract_faces_from_webpage(url: str, username: str = None):
1597      """Extract faces from a webpage URL."""
1598      print(f"\n🌐 Extracting faces from webpage: {url}")
1599      
1600      crawler = EnhancedProfileCrawler()
1601      
1602      # Get the page
1603      try:
1604          response = requests.get(
1605              url,
1606              headers={'User-Agent': UserAgent().random},
1607              timeout=15
1608          )
1609          response.raise_for_status()
1610      except Exception as e:
1611          print(f"❌ Error fetching webpage: {e}")
1612          return []
1613      
1614      # Extract images
1615      image_urls = crawler.extract_images(response.text, url, {})
1616      
1617      if not image_urls:
1618          print("❌ No images found on page")
1619          return []
1620      
1621      print(f"📸 Found {len(image_urls)} images")
1622      
1623      faces = []
1624      for i, img_url in enumerate(image_urls[:10], 1):  # Limit to first 10 images
1625          print(f"  [{i}] Processing: {img_url[:80]}...")
1626          
1627          img_bytes = get_image_bytes(img_url)
1628          if not img_bytes:
1629              continue
1630          
1631          encoding = compute_face_encoding(img_bytes)
1632          if encoding is not None:
1633              faces.append({
1634                  'image_url': img_url,
1635                  'encoding': encoding,
1636                  'page_url': url
1637              })
1638              print(f"    ✅ Face detected")
1639      
1640      print(f"\n✅ Found {len(faces)} faces on the webpage")
1641      return faces
1642  
1643  
1644  def create_uri_batch_file():
1645      """Create a template batch file for URI comparisons."""
1646      template = """# URI comparison batch file
1647  # Format: image_url_or_path,username,optional_platform
1648  # 
1649  # Examples:
1650  https://example.com/face1.jpg,john_doe,facebook
1651  https://example.com/face2.jpg,jane_smith,instagram
1652  /path/to/local/image.jpg,anonymous,direct
1653  """
1654      
1655      filename = f"uri_batch_{int(time.time())}.txt"
1656      with open(filename, 'w') as f:
1657          f.write(template)
1658      
1659      print(f"📝 Created batch template file: {filename}")
1660      print("Edit this file with your URIs and usernames, then use option 7.")
1661  
1662  
1663  # ================== NEW FUNCTION: UPLOAD IMAGE AND SEARCH PLATFORMS ==================
1664  
1665  def search_platforms_by_face(face_system, crawler):
1666      """Search selected platforms using a face image."""
1667      print("\n📸 Upload Image and Search Platforms")
1668      print("-" * 40)
1669      
1670      # Get image
1671      uri = input("Enter image path or URL: ").strip()
1672      if not uri:
1673          print("❌ No image provided")
1674          return
1675      
1676      # Extract face from image
1677      print("🔍 Extracting face from image...")
1678      target_bytes = get_image_bytes(uri)
1679      if not target_bytes:
1680          print("❌ Could not load image")
1681          return
1682      
1683      target_encoding = compute_face_encoding(target_bytes)
1684      if target_encoding is None:
1685          print("❌ No face detected in image")
1686          return
1687      
1688      print("✅ Face encoding extracted")
1689      
1690      # Ask if they want to search with a specific username
1691      use_username = input("\nDo you want to search with a specific username? (y/N): ").strip().lower()
1692      username_to_search = None
1693      
1694      if use_username == 'y':
1695          username_to_search = input("Enter username to search: ").strip()
1696          if username_to_search:
1697              print(f"🔍 Will search for username: {username_to_search}")
1698      
1699      # Show available platforms
1700      templates = load_profile_templates()
1701      enabled_platforms = get_enabled_platforms(templates)
1702      categories = get_platforms_by_category(templates)
1703      
1704      if not enabled_platforms:
1705          print("❌ No platforms enabled in profile_templates.json")
1706          return
1707      
1708      print(f"\n📋 Available platforms ({len(enabled_platforms)} enabled):")
1709      for category, platforms in categories.items():
1710          print(f"\n  {category}:")
1711          for platform in sorted(platforms):
1712              config = templates[platform]
1713              url_template = config.get("url", "No URL")
1714              print(f"    {platform:20} - {url_template}")
1715      
1716      # Select platforms
1717      platform_input = input("\nEnter platforms to search (comma-separated, or 'all'): ").strip().lower()
1718      
1719      if platform_input == 'all':
1720          selected_platforms = enabled_platforms
1721      else:
1722          selected_platforms = []
1723          for item in platform_input.split(','):
1724              item = item.strip()
1725              if item in enabled_platforms:
1726                  selected_platforms.append(item)
1727      
1728      if not selected_platforms:
1729          print("⚠️  No platforms selected")
1730          return
1731      
1732      print(f"\n🔍 Will search {len(selected_platforms)} platform(s)")
1733      
1734      # Search logic
1735      results = []
1736      
1737      if username_to_search:
1738          # Search specific username on selected platforms
1739          print(f"\n🔎 Searching username '{username_to_search}' on {len(selected_platforms)} platforms...")
1740          
1741          crawl_results = crawler.crawl_usernames([username_to_search], selected_platforms)
1742          
1743          # Check if any profiles exist
1744          user_results = crawl_results.get(username_to_search, [])
1745          existing_profiles = [r for r in user_results if r["exists"]]
1746          
1747          if existing_profiles:
1748              print(f"\n✅ Found {len(existing_profiles)} profile(s) for '{username_to_search}':")
1749              
1750              for result in existing_profiles:
1751                  print(f"\n  Platform: {result['platform']}")
1752                  print(f"  URL: {result['url']}")
1753                  print(f"  Images found: {len(result['image_urls'])}")
1754                  
1755                  # Compare face with images from profile
1756                  if result["image_urls"]:
1757                      print("  Comparing faces from profile images...")
1758                      
1759                      best_similarity = 0
1760                      best_match_url = None
1761                      
1762                      for img_url in result["image_urls"][:3]:  # Check first 3 images
1763                          try:
1764                              img_bytes = get_image_bytes(img_url)
1765                              if img_bytes:
1766                                  img_encoding = compute_face_encoding(img_bytes)
1767                                  if img_encoding is not None:
1768                                      distance = float(face_recognition.face_distance([target_encoding], img_encoding)[0])
1769                                      similarity = max(0.0, 1.0 - min(distance, 1.0))
1770                                      
1771                                      if similarity > best_similarity:
1772                                          best_similarity = similarity
1773                                          best_match_url = img_url
1774                          except Exception:
1775                              continue
1776                      
1777                      if best_similarity > 0:
1778                          print(f"  🎯 Best face match: {best_similarity:.3f}")
1779                          if best_similarity > 0.6:
1780                              print(f"  ✅ LIKELY SAME PERSON!")
1781                          
1782                          results.append({
1783                              "platform": result["platform"],
1784                              "url": result["url"],
1785                              "username": username_to_search,
1786                              "similarity": best_similarity,
1787                              "match_url": best_match_url,
1788                              "type": "username_search"
1789                          })
1790                  else:
1791                      print("  ⚠️ No images to compare")
1792          else:
1793              print(f"❌ No profiles found for '{username_to_search}' on selected platforms")
1794      
1795      else:
1796          # First check existing face database
1797          print("\n🔍 Checking existing face database...")
1798          matches = face_system.search_faces(target_encoding, threshold=0.6, top_k=5)
1799          
1800          if matches:
1801              print(f"🎯 Found {len([m for m in matches if m['match']])} potential matches in database:")
1802              for match in matches[:3]:  # Show top 3
1803                  if match['match']:
1804                      print(f"  👤 {match['username']} on {match['platform']} - similarity: {match['similarity']:.3f}")
1805              
1806              # Ask if they want to search for these usernames
1807              search_existing = input("\nSearch platforms for these usernames? (y/N): ").strip().lower()
1808              
1809              if search_existing == 'y':
1810                  usernames_to_search = list(set([m['username'] for m in matches if m['match']]))[:5]
1811                  print(f"🔍 Searching for {len(usernames_to_search)} username(s): {', '.join(usernames_to_search)}")
1812                  
1813                  crawl_results = crawler.crawl_usernames(usernames_to_search, selected_platforms)
1814                  
1815                  for username in usernames_to_search:
1816                      user_results = crawl_results.get(username, [])
1817                      for result in user_results:
1818                          if result["exists"]:
1819                              results.append({
1820                                  "platform": result["platform"],
1821                                  "url": result["url"],
1822                                  "username": username,
1823                                  "type": "database_match_search"
1824                              })
1825          
1826          # Option to do a reverse image search style lookup
1827          print("\n🔄 Alternative: Check popular platforms for matching faces")
1828          print("   (This will crawl and compare faces from profiles)")
1829          
1830          do_crawl = input("\nCrawl and compare faces from profiles? (y/N): ").strip().lower()
1831          
1832          if do_crawl == 'y':
1833              # For demo, we'd need to implement a broader search
1834              # For now, we'll show a message
1835              print("\n⚠️  Advanced face-based platform crawling would require:")
1836              print("   1. Generating common usernames from face similarity")
1837              print("   2. Searching those usernames on platforms")
1838              print("   3. Comparing faces from found profiles")
1839              print("\n💡 Tip: Use option 1 with suspected usernames first")
1840      
1841      # Display results
1842      if results:
1843          print(f"\n📊 Search Results ({len(results)}):")
1844          print("=" * 60)
1845          
1846          for i, result in enumerate(results, 1):
1847              print(f"\n{i}. Platform: {result['platform']}")
1848              print(f"   Username: {result['username']}")
1849              print(f"   URL: {result['url']}")
1850              print(f"   Type: {result['type']}")
1851              
1852              if 'similarity' in result:
1853                  print(f"   Face Similarity: {result['similarity']:.3f}")
1854                  if result['similarity'] > 0.7:
1855                      print("   🎯 HIGH CONFIDENCE MATCH")
1856                  elif result['similarity'] > 0.5:
1857                      print("   🔍 Possible match")
1858              
1859              if 'match_url' in result:
1860                  print(f"   Match Image: {result['match_url'][:80]}...")
1861      else:
1862          print("\n❌ No results found")
1863      
1864      # Offer to save the face to database
1865      if target_encoding is not None:
1866          save_face = input("\n💾 Save this face to database for future searches? (y/N): ").strip().lower()
1867          if save_face == 'y':
1868              username = input("Enter username for this face (or leave blank for 'unknown'): ").strip() or "unknown"
1869              platform = input("Enter source platform (or leave blank for 'upload'): ").strip() or "upload"
1870              
1871              face_record = {
1872                  "username": username,
1873                  "platform": platform,
1874                  "page_url": uri if uri.startswith('http') else f"file://{os.path.abspath(uri)}",
1875                  "image_url": uri,
1876                  "encoding": target_encoding.tolist(),
1877                  "timestamp": time.time(),
1878                  "source": "image_upload_search"
1879              }
1880              
1881              face_system.faces.append(face_record)
1882              print(f"✅ Face saved to database as '{username}'")
1883  
1884  
1885  # ================== TEMPLATE MANAGEMENT FUNCTIONS ==================
1886  
1887  def manage_templates_menu():
1888      """Menu for managing profile templates."""
1889      while True:
1890          print("\n" + "=" * 60)
1891          print("Profile Templates Management")
1892          print("=" * 60)
1893          print("1. List all platforms")
1894          print("2. List by category")
1895          print("3. Add new platform")
1896          print("4. Edit existing platform")
1897          print("5. Enable/Disable platform")
1898          print("6. Export templates to JSON")
1899          print("7. Import templates from JSON")
1900          print("8. Back to main menu")
1901          
1902          choice = input("\nSelect option (1-8): ").strip()
1903          
1904          if choice == "1":
1905              # List all platforms
1906              templates = load_profile_templates()
1907              print(f"\n📋 All Platforms ({len(templates)} total):")
1908              for i, (platform_name, config) in enumerate(templates.items(), 1):
1909                  enabled = "✅" if config.get("enabled", True) else "❌"
1910                  category = config.get("category", "Uncategorized")
1911                  print(f"  {i:2d}. {enabled} {platform_name:20} - {category}")
1912          
1913          elif choice == "2":
1914              # List by category
1915              templates = load_profile_templates()
1916              categories = get_platforms_by_category(templates)
1917              
1918              print("\n📋 Platforms by Category:")
1919              for category, platforms in categories.items():
1920                  print(f"\n  {category}:")
1921                  for platform in sorted(platforms):
1922                      config = templates[platform]
1923                      enabled = "✅" if config.get("enabled", True) else "❌"
1924                      url_template = config.get("url", "No URL")
1925                      print(f"      {enabled} {platform:20} - {url_template}")
1926          
1927          elif choice == "3":
1928              # Add new platform
1929              print("\n➕ Add New Platform")
1930              
1931              platform_name = input("Platform name (lowercase, no spaces): ").strip().lower()
1932              if not platform_name:
1933                  print("❌ Platform name required")
1934                  continue
1935              
1936              templates = load_profile_templates()
1937              if platform_name in templates:
1938                  print(f"❌ Platform '{platform_name}' already exists")
1939                  continue
1940              
1941              url_template = input("URL template (use {} for username): ").strip()
1942              if not url_template or "{}" not in url_template:
1943                  print("❌ URL template must contain {} placeholder for username")
1944                  continue
1945              
1946              category = input("Category (e.g., Social Media, Tech, etc): ").strip() or "Other"
1947              
1948              print("\nAvailable check methods:")
1949              print("  status_code - Simple 200 OK check")
1950              print("  universal_check - Universal profile check (recommended)")
1951              print("  github_check - GitHub specific check")
1952              print("  twitter_check - Twitter specific check")
1953              print("  instagram_check - Instagram specific check")
1954              print("  fansfinder_check - OnlyFans via FansFinder check")
1955              
1956              check_method = input("Check method (default: universal_check): ").strip() or "universal_check"
1957              
1958              avatar_selector = input("Avatar CSS selector (optional): ").strip()
1959              
1960              requires_js = input("Requires JavaScript? (y/N): ").strip().lower() == 'y'
1961              
1962              new_template = {
1963                  "url": url_template,
1964                  "check_method": check_method,
1965                  "avatar_selector": avatar_selector,
1966                  "requires_javascript": requires_js,
1967                  "platform": platform_name,
1968                  "category": category,
1969                  "enabled": True,
1970                  "priority": 3
1971              }
1972              
1973              templates[platform_name] = new_template
1974              save_profile_templates(templates)
1975              print(f"✅ Platform '{platform_name}' added successfully")
1976          
1977          elif choice == "4":
1978              # Edit existing platform
1979              templates = load_profile_templates()
1980              
1981              print("\n✏️  Edit Platform")
1982              platforms = list(templates.keys())
1983              
1984              for i, platform in enumerate(platforms, 1):
1985                  print(f"  {i:2d}. {platform}")
1986              
1987              try:
1988                  selection = int(input("\nSelect platform number: ").strip())
1989                  if 1 <= selection <= len(platforms):
1990                      platform_name = platforms[selection - 1]
1991                      config = templates[platform_name]
1992                      
1993                      print(f"\nEditing: {platform_name}")
1994                      print(f"Current URL: {config.get('url')}")
1995                      new_url = input(f"New URL (Enter to keep current): ").strip()
1996                      if new_url:
1997                          if "{}" not in new_url:
1998                              print("❌ URL must contain {} placeholder")
1999                              continue
2000                          config["url"] = new_url
2001                      
2002                      print(f"Current category: {config.get('category')}")
2003                      new_category = input(f"New category: ").strip()
2004                      if new_category:
2005                          config["category"] = new_category
2006                      
2007                      print(f"Current check method: {config.get('check_method')}")
2008                      new_check = input(f"New check method: ").strip()
2009                      if new_check:
2010                          config["check_method"] = new_check
2011                      
2012                      print(f"Current avatar selector: {config.get('avatar_selector')}")
2013                      new_selector = input(f"New avatar selector: ").strip()
2014                      if new_selector:
2015                          config["avatar_selector"] = new_selector
2016                      
2017                      save_profile_templates(templates)
2018                      print(f"✅ Platform '{platform_name}' updated")
2019                  else:
2020                      print("❌ Invalid selection")
2021              except (ValueError, IndexError):
2022                  print("❌ Invalid input")
2023          
2024          elif choice == "5":
2025              # Enable/Disable platform
2026              templates = load_profile_templates()
2027              
2028              print("\n⚙️  Enable/Disable Platform")
2029              platforms = list(templates.keys())
2030              
2031              for i, platform in enumerate(platforms, 1):
2032                  enabled = "✅" if templates[platform].get("enabled", True) else "❌"
2033                  print(f"  {i:2d}. {enabled} {platform}")
2034              
2035              try:
2036                  selection = int(input("\nSelect platform number: ").strip())
2037                  if 1 <= selection <= len(platforms):
2038                      platform_name = platforms[selection - 1]
2039                      current = templates[platform_name].get("enabled", True)
2040                      templates[platform_name]["enabled"] = not current
2041                      
2042                      status = "enabled" if not current else "disabled"
2043                      save_profile_templates(templates)
2044                      print(f"✅ Platform '{platform_name}' {status}")
2045                  else:
2046                      print("❌ Invalid selection")
2047              except (ValueError, IndexError):
2048                  print("❌ Invalid input")
2049          
2050          elif choice == "6":
2051              # Export templates
2052              filename = input("Export filename (default: profile_templates_export.json): ").strip() or "profile_templates_export.json"
2053              templates = load_profile_templates()
2054              save_profile_templates(templates, filename)
2055              print(f"✅ Templates exported to {filename}")
2056          
2057          elif choice == "7":
2058              # Import templates
2059              filename = input("Import filename: ").strip()
2060              if not filename:
2061                  print("❌ Filename required")
2062                  continue
2063              
2064              if not os.path.exists(filename):
2065                  print(f"❌ File '{filename}' not found")
2066                  continue
2067              
2068              try:
2069                  with open(filename, 'r') as f:
2070                      imported = json.load(f)
2071                  
2072                  # Merge or replace?
2073                  print("\nImport options:")
2074                  print("  1. Merge with existing (keep both)")
2075                  print("  2. Replace existing (overwrite)")
2076                  print("  3. Cancel")
2077                  
2078                  option = input("Select option (1-3): ").strip()
2079                  
2080                  if option == "1":
2081                      templates = load_profile_templates()
2082                      templates.update(imported)
2083                      save_profile_templates(templates)
2084                      print(f"✅ Merged {len(imported)} templates")
2085                  elif option == "2":
2086                      save_profile_templates(imported)
2087                      print(f"✅ Replaced with {len(imported)} templates")
2088                  else:
2089                      print("❌ Import cancelled")
2090              
2091              except Exception as e:
2092                  print(f"❌ Error importing: {e}")
2093          
2094          elif choice == "8":
2095              # Back to main menu
2096              break
2097  
2098  
2099  # ================== TESTING ==================
2100  
2101  def test_known_profiles():
2102      """Test with known profiles."""
2103      print("🧪 Testing with known profiles...")
2104      
2105      test_cases = [
2106          ("torvalds", "github", True, "Linus Torvalds"),
2107          ("jack", "twitter", True, "Jack Dorsey"),
2108          ("nasdaily", "instagram", True, "Nas Daily"),
2109          ("spez", "reddit", True, "Reddit CEO"),
2110          ("beeple", "artstation", True, "Digital artist"),
2111          ("nonexistent1234567890", "github", False, "Non-existent"),
2112      ]
2113      
2114      crawler = EnhancedProfileCrawler(CrawlerConfig())
2115      crawler.config.VERBOSE = False
2116      
2117      passed = 0
2118      failed = 0
2119      
2120      for username, platform, should_exist, description in test_cases:
2121          if platform not in PROFILE_TEMPLATES:
2122              print(f"  ⚠️  Skipping {platform} (not configured)")
2123              continue
2124          
2125          platform_config = PROFILE_TEMPLATES[platform]
2126          if isinstance(platform_config, str):
2127              url = platform_config.format(username)
2128          else:
2129              url = platform_config.get("url", "").format(username)
2130          
2131          print(f"\n🔍 {username} on {platform} ({description}):")
2132          print(f"  URL: {url}")
2133          
2134          result = crawler.check_profile(url, platform, username)
2135          
2136          status = "✅ PASS" if result["exists"] == should_exist else "❌ FAIL"
2137          if result["exists"] == should_exist:
2138              passed += 1
2139          else:
2140              failed += 1
2141          
2142          print(f"  {status} - Expected: {should_exist}, Got: {result['exists']}")
2143          print(f"  Status: {result['status_code']}, Images: {len(result['image_urls'])}")
2144          
2145          if result["error"]:
2146              print(f"  Error: {result['error']}")
2147          
2148          # Show first image if exists
2149          if result["image_urls"]:
2150              print(f"  First image: {result['image_urls'][0][:80]}...")
2151      
2152      print(f"\n📊 Test Results: {passed} passed, {failed} failed")
2153  
2154  
2155  def test_specific_profile():
2156      """Test a specific profile."""
2157      username = input("Username: ").strip()
2158      platform = input("Platform: ").strip()
2159      
2160      if not username or not platform:
2161          print("❌ Need username and platform")
2162          return
2163      
2164      if platform not in PROFILE_TEMPLATES:
2165          print(f"❌ Unknown platform. Available: {', '.join(list(PROFILE_TEMPLATES.keys())[:10])}...")
2166          return
2167      
2168      crawler = EnhancedProfileCrawler()
2169      
2170      platform_config = PROFILE_TEMPLATES[platform]
2171      if isinstance(platform_config, str):
2172          url = platform_config.format(username)
2173      else:
2174          url = platform_config.get("url", "").format(username)
2175      
2176      print(f"\n🔍 Testing {username} on {platform}...")
2177      print(f"  URL: {url}")
2178      
2179      result = crawler.check_profile(url, platform, username)
2180      
2181      print(f"\n📊 Results:")
2182      print(f"  Exists: {result['exists']}")
2183      print(f"  Status Code: {result['status_code']}")
2184      print(f"  Final URL: {result['final_url']}")
2185      print(f"  Content Length: {result['content_length']} chars")
2186      print(f"  Images Found: {len(result['image_urls'])}")
2187      
2188      if result["error"]:
2189          print(f"  Error: {result['error']}")
2190      
2191      # Show images
2192      for i, img_url in enumerate(result["image_urls"][:3], 1):
2193          print(f"\n  Image {i}:")
2194          print(f"    URL: {img_url}")
2195          
2196          # Try to download and check
2197          img_bytes = get_image_bytes(img_url)
2198          if img_bytes:
2199              print(f"    Size: {len(img_bytes)} bytes")
2200              encoding = compute_face_encoding(img_bytes)
2201              if encoding is not None:
2202                  print(f"    ✅ Face detected")
2203              else:
2204                  print(f"    ❌ No face detected")
2205          else:
2206              print(f"    ❌ Could not download")
2207  
2208  
2209  # ================== MAIN INTERFACE ==================
2210  
2211  def main():
2212      """Main interface."""
2213      print("🔍 Enhanced Cross-Platform Face Search")
2214      print("=" * 60)
2215      
2216      # Initialize
2217      crawler = EnhancedProfileCrawler()
2218      face_system = FaceIndexSystem()
2219      
2220      # Load existing index
2221      if os.path.exists("face_index.json"):
2222          face_system.load_index()
2223      
2224      while True:
2225          print("\n" + "=" * 60)
2226          print("1. Search for usernames")
2227          print("2. Upload image and search selected platforms (NEW)")
2228          print("3. Test specific profile")
2229          print("4. Run known profile tests")
2230          print("5. Compare target face (from local image)")
2231          print("6. Compare face from URL/URI")
2232          print("7. Extract faces from webpage")
2233          print("8. Batch compare from file")
2234          print("9. Create batch template")
2235          print("10. Show statistics")
2236          print("11. Manage profile templates")
2237          print("12. Save face index")
2238          print("13. Load face index")
2239          print("14. Clear face index")
2240          print("15. Exit")
2241          
2242          choice = input("\nSelect option (1-15): ").strip()
2243          
2244          if choice == "1":
2245              # Search usernames (existing code)
2246              usernames_input = input("Enter usernames (comma-separated): ").strip()
2247              if not usernames_input:
2248                  continue
2249              
2250              usernames = [u.strip() for u in usernames_input.split(',')]
2251              
2252              # Platform selection
2253              templates = load_profile_templates()
2254              enabled_platforms = get_enabled_platforms(templates)
2255              categories = get_platforms_by_category(templates)
2256              
2257              print(f"\n📋 Available platforms ({len(enabled_platforms)} enabled):")
2258              for category, platforms in categories.items():
2259                  print(f"\n  {category}:")
2260                  for platform in sorted(platforms):
2261                      config = templates[platform]
2262                      url_template = config.get("url", "No URL")
2263                      print(f"    {platform:20} - {url_template}")
2264              
2265              platform_input = input("\nEnter platforms (comma-separated, or 'all'): ").strip().lower()
2266              
2267              if platform_input == 'all':
2268                  selected_platforms = enabled_platforms
2269              else:
2270                  selected_platforms = []
2271                  for item in platform_input.split(','):
2272                      item = item.strip()
2273                      if item in enabled_platforms:
2274                          selected_platforms.append(item)
2275              
2276              if not selected_platforms:
2277                  # Default to all enabled platforms from the JSON file
2278                  selected_platforms = enabled_platforms
2279                  print(f"⚠️  No platforms specified, using all {len(selected_platforms)} enabled platforms")
2280              
2281              print(f"\n🔍 Searching {len(usernames)} user(s) on {len(selected_platforms)} platform(s)...")
2282              
2283              # Crawl
2284              results = crawler.crawl_usernames(usernames, selected_platforms)
2285              
2286              # Index faces
2287              print("\n📸 Indexing faces...")
2288              new_faces = face_system.index_from_results(results)
2289              
2290              # Summary
2291              print(f"\n📊 Summary:")
2292              total_found = 0
2293              total_faces = 0
2294              
2295              for username in usernames:
2296                  user_results = results.get(username, [])
2297                  found = [r for r in user_results if r["exists"]]
2298                  user_faces = len([f for f in new_faces if f["username"] == username])
2299                  
2300                  total_found += len(found)
2301                  total_faces += user_faces
2302                  
2303                  print(f"  {username}: {len(found)}/{len(user_results)} profiles, {user_faces} faces")
2304              
2305              print(f"\n  Total: {total_found} profiles found, {total_faces} faces indexed")
2306              
2307              # Offer to save
2308              if new_faces:
2309                  save = input("\n💾 Save results to face index? (y/N): ").strip().lower()
2310                  if save == 'y':
2311                      face_system.save_index()
2312          
2313          elif choice == "2":
2314              # NEW: Upload image and search selected platforms
2315              search_platforms_by_face(face_system, crawler)
2316          
2317          elif choice == "3":
2318              test_specific_profile()
2319          
2320          elif choice == "4":
2321              test_known_profiles()
2322          
2323          elif choice == "5":
2324              if not face_system.faces:
2325                  print("❌ No faces in index")
2326                  continue
2327              
2328              target_source = input("Target image path or URL: ").strip()
2329              if not target_source:
2330                  continue
2331              
2332              # Load target image
2333              target_bytes = get_image_bytes(target_source)
2334              if not target_bytes:
2335                  print("❌ Could not load image")
2336                  continue
2337              
2338              target_encoding = compute_face_encoding(target_bytes)
2339              if target_encoding is None:
2340                  print("❌ No face detected in target")
2341                  continue
2342              
2343              # Get threshold
2344              try:
2345                  threshold = float(input("Match threshold (0.1-1.0, default 0.6): ") or "0.6")
2346                  top_k = int(input("Number of results (default 10): ") or "10")
2347              except ValueError:
2348                  threshold = 0.6
2349                  top_k = 10
2350              
2351              print(f"\n🔍 Searching {len(face_system.faces)} indexed faces...")
2352              matches = face_system.search_faces(target_encoding, threshold, top_k)
2353              
2354              if not matches:
2355                  print("❌ No matches found")
2356                  continue
2357              
2358              print(f"\n🏆 Top {len(matches)} matches:")
2359              for i, match in enumerate(matches, 1):
2360                  symbol = "✅" if match["match"] else "⚠️"
2361                  print(f"\n  {i}. {symbol} Similarity: {match['similarity']:.3f}")
2362                  print(f"     User: {match['username']}")
2363                  print(f"     Platform: {match['platform']}")
2364                  if match['similarity'] > 0.7:
2365                      print(f"     🎯 Strong match!")
2366          
2367          elif choice == "6":
2368              # Compare face from URL/URI
2369              uri = input("Enter image URL or local file path: ").strip()
2370              if not uri:
2371                  continue
2372              
2373              print("\nOptions:")
2374              print("1. Just compare with existing faces")
2375              print("2. Compare and save to database")
2376              
2377              sub_choice = input("Select (1-2): ").strip()
2378              
2379              if sub_choice == "2":
2380                  username = input("Enter username for this face: ").strip()
2381                  if username:
2382                      compare_face_from_uri(face_system, uri, username, save_to_db=True)
2383                  else:
2384                      print("❌ Username required to save to database")
2385              else:
2386                  compare_face_from_uri(face_system, uri)
2387          
2388          elif choice == "7":
2389              # Extract faces from webpage
2390              url = input("Enter webpage URL: ").strip()
2391              if not url:
2392                  continue
2393              
2394              faces = extract_faces_from_webpage(url)
2395              
2396              if faces:
2397                  print("\nOptions:")
2398                  print("1. Compare each face with database")
2399                  print("2. Save all faces to database")
2400                  
2401                  sub_choice = input("Select (1-2): ").strip()
2402                  
2403                  if sub_choice == "1":
2404                      for i, face in enumerate(faces, 1):
2405                          print(f"\n[{i}] Comparing face from image...")
2406                          temp_uri = face['image_url']
2407                          matches = compare_face_from_uri(face_system, temp_uri)
2408                          
2409                          if matches and len(matches) > 0:
2410                              best = matches[0]
2411                              if best['similarity'] > 0.7:
2412                                  save = input(f"  Save as match to {best['username']}? (y/N): ").strip().lower()
2413                                  if save == 'y':
2414                                      username = input(f"  Username (default: {best['username']}): ").strip() or best['username']
2415                                      save_face_to_db(face_system, face['encoding'], face['image_url'], username, face['page_url'], "webpage_extraction")
2416                                      print(f"  ✅ Saved to database")
2417                  
2418                  elif sub_choice == "2":
2419                      username = input("Base username (faces will be saved as username_1, username_2, etc): ").strip()
2420                      if username:
2421                          for i, face in enumerate(faces, 1):
2422                              user_id = f"{username}_{i}"
2423                              save_face_to_db(face_system, face['encoding'], face['image_url'], user_id, face['page_url'], "webpage_extraction")
2424                          print(f"✅ Saved {len(faces)} faces to database")
2425                      else:
2426                          print("❌ Username required")
2427          
2428          elif choice == "8":
2429              # Batch compare from file
2430              filename = input("Enter filename with URIs and usernames (CSV format): ").strip()
2431              if filename and os.path.exists(filename):
2432                  batch_compare_from_file(face_system, filename)
2433              else:
2434                  print("❌ File not found")
2435          
2436          elif choice == "9":
2437              # Create batch template
2438              create_uri_batch_file()
2439          
2440          elif choice == "10":
2441              print(f"\n📊 Statistics:")
2442              print(f"  Total faces: {len(face_system.faces)}")
2443              
2444              if face_system.faces:
2445                  # Count by platform
2446                  platforms = {}
2447                  for face in face_system.faces:
2448                      platform = face.get("platform", "unknown")
2449                      platforms[platform] = platforms.get(platform, 0) + 1
2450                  
2451                  print(f"  By platform:")
2452                  for platform, count in sorted(platforms.items(), key=lambda x: x[1], reverse=True):
2453                      print(f"    {platform}: {count}")
2454                  
2455                  # Count by source
2456                  sources = {}
2457                  for face in face_system.faces:
2458                      source = face.get("source", "unknown")
2459                      sources[source] = sources.get(source, 0) + 1
2460                  
2461                  print(f"  By source:")
2462                  for source, count in sorted(sources.items(), key=lambda x: x[1], reverse=True):
2463                      print(f"    {source}: {count}")
2464          
2465          elif choice == "11":
2466              manage_templates_menu()
2467          
2468          elif choice == "12":
2469              filename = input("Filename (default: face_index.json): ").strip() or "face_index.json"
2470              face_system.save_index(filename)
2471          
2472          elif choice == "13":
2473              filename = input("Filename (default: face_index.json): ").strip() or "face_index.json"
2474              face_system.load_index(filename)
2475          
2476          elif choice == "14":
2477              confirm = input("Clear all indexed faces? (y/N): ").strip().lower()
2478              if confirm == 'y':
2479                  face_system.faces = []
2480                  print("✅ Face index cleared")
2481          
2482          elif choice == "15":
2483              print("👋 Goodbye!")
2484              break
2485  
2486  
2487  if __name__ == "__main__":
2488      main()