/ hash_advanced.py
hash_advanced.py
1 #!/usr/bin/env python3 2 3 import base64 4 import os 5 import sys 6 import time 7 import random 8 import re 9 import json 10 from dataclasses import dataclass, asdict 11 from io import BytesIO 12 from typing import List, Dict, Any, Tuple, Optional, Set, Generator 13 from urllib.parse import urljoin, urlparse, urldefrag, quote 14 from concurrent.futures import ThreadPoolExecutor, as_completed, Future 15 16 import requests 17 from PIL import Image, UnidentifiedImageError 18 import numpy as np 19 import face_recognition 20 from bs4 import BeautifulSoup 21 import tldextract 22 from fake_useragent import UserAgent 23 24 25 # ================== CONFIGURATION ================== 26 27 class CrawlerConfig: 28 """Configuration for web crawling.""" 29 MAX_PAGES_PER_USERNAME = 50 30 MAX_DEPTH = 1 31 TIMEOUT = 15 32 MAX_WORKERS = 10 33 DELAY = (1.0, 3.0) 34 USER_AGENT_ROTATION = True 35 FOLLOW_SAME_DOMAIN = False 36 EXCLUDE_EXTENSIONS = {'.pdf', '.zip', '.tar', '.gz', '.exe', '.dmg', '.iso'} 37 VALID_IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg'} 38 MAX_IMAGE_SIZE_MB = 5 39 MAX_RETRIES = 2 40 RATE_LIMIT_DELAY = 1.0 41 VERBOSE = True 42 PROFILE_TEMPLATES_FILE = "profile_templates.json" 43 44 45 # ================== LOAD PROFILE TEMPLATES FROM JSON ================== 46 47 def load_profile_templates(filename: str = "profile_templates.json") -> Dict[str, Any]: 48 """ 49 Load profile templates from a JSON file. 50 If file doesn't exist, return empty dict. 51 """ 52 try: 53 if os.path.exists(filename): 54 with open(filename, 'r') as f: 55 templates = json.load(f) 56 print(f"โ Loaded {len(templates)} profile templates from {filename}") 57 return templates 58 else: 59 print(f"โ Profile templates file not found: {filename}") 60 print(" Create a JSON file with platform configurations.") 61 return {} 62 except Exception as e: 63 print(f"โ Error loading profile templates from {filename}: {e}") 64 return {} 65 66 67 def save_profile_templates(templates: Dict[str, Any], filename: str = "profile_templates.json"): 68 """Save profile templates to JSON file.""" 69 try: 70 with open(filename, 'w') as f: 71 json.dump(templates, f, indent=2) 72 print(f"๐พ Saved {len(templates)} profile templates to {filename}") 73 except Exception as e: 74 print(f"โ Error saving profile templates: {e}") 75 76 77 def get_enabled_platforms(templates: Dict[str, Any]) -> List[str]: 78 """Get list of enabled platforms from templates.""" 79 enabled = [] 80 for platform_name, config in templates.items(): 81 if config.get("enabled", True): 82 enabled.append(platform_name) 83 return enabled 84 85 86 def get_platforms_by_category(templates: Dict[str, Any]) -> Dict[str, List[str]]: 87 """Organize platforms by category.""" 88 categories = {} 89 for platform_name, config in templates.items(): 90 if config.get("enabled", True): 91 category = config.get("category", "Uncategorized") 92 if category not in categories: 93 categories[category] = [] 94 categories[category].append(platform_name) 95 return categories 96 97 98 # Load templates at module level 99 PROFILE_TEMPLATES = load_profile_templates() 100 101 102 # ================== SITE-SPECIFIC CHECKERS ================== 103 104 class SiteCheckers: 105 """Site-specific profile existence checkers.""" 106 107 @staticmethod 108 def github_check(response: requests.Response, username: str) -> bool: 109 """Check if GitHub profile exists.""" 110 if response.status_code != 200: 111 return False 112 113 html = response.text.lower() 114 115 # More accurate GitHub existence check 116 not_found_indicators = [ 117 'this is not the web page you are looking for', 118 'page not found', 119 'github could not find that page', 120 'there isn\'t a github pages site here', 121 ] 122 123 # Check for absence of profile elements 124 if any(indicator in html for indicator in not_found_indicators): 125 return False 126 127 # Positive indicators 128 profile_indicators = [ 129 f'itemprop="name"', 130 'vcard-names-container', 131 'js-profile-editable-area', 132 'p-nickname vcard-username', 133 'user-profile-frame', 134 ] 135 136 # Check for username in page 137 if username.lower() in html: 138 for indicator in profile_indicators: 139 if indicator in html: 140 return True 141 142 # Alternative: check for common GitHub profile elements 143 soup = BeautifulSoup(response.text, 'html.parser') 144 145 # Check for profile-specific elements 146 if soup.find('div', {'class': 'user-profile-frame'}): 147 return True 148 149 if soup.find('span', {'itemprop': 'name'}): 150 return True 151 152 # Check for the username in the page title 153 title = soup.find('title') 154 if title and username.lower() in title.text.lower(): 155 return True 156 157 # Check for avatar image (GitHub avatars have specific URLs) 158 for img in soup.find_all('img'): 159 src = img.get('src', '') 160 if 'avatars.githubusercontent.com' in src and 'identicon' not in src: 161 return True 162 163 return False 164 165 @staticmethod 166 def stackoverflow_check(response: requests.Response, username: str) -> bool: 167 """Check if Stack Overflow profile exists.""" 168 if response.status_code != 200: 169 return False 170 171 html = response.text.lower() 172 173 # Stack Overflow shows "Page Not Found" for non-existent users 174 if 'page not found' in html or '404 - page not found' in html: 175 return False 176 177 # Check for user profile elements 178 profile_indicators = [ 179 'user-card', 180 'user-avatar', 181 'user-details', 182 ] 183 184 for indicator in profile_indicators: 185 if indicator in html: 186 return True 187 188 return False 189 190 @staticmethod 191 def twitter_check(response: requests.Response, username: str) -> bool: 192 """Check if Twitter profile exists.""" 193 # Twitter often redirects or shows different pages 194 final_url = response.url.lower() 195 196 # Check if we got redirected to twitter.com/home (logged out view) 197 if 'twitter.com/home' in final_url: 198 return False 199 200 # Check if we're on the user's profile 201 if f'twitter.com/{username.lower()}' in final_url: 202 return True 203 204 # Check response content 205 html = response.text.lower() 206 207 # Twitter shows "This account doesn't exist" for non-existent users 208 if 'this account doesn\'t exist' in html or 'account suspended' in html: 209 return False 210 211 # Check for profile elements 212 if 'profile-header' in html or 'user-actions' in html: 213 return True 214 215 return response.status_code == 200 216 217 @staticmethod 218 def instagram_check(response: requests.Response, username: str) -> bool: 219 """Check if Instagram profile exists.""" 220 if response.status_code != 200: 221 return False 222 223 html = response.text.lower() 224 225 # Instagram shows "Sorry, this page isn't available." 226 if 'sorry, this page isn\'t available' in html: 227 return False 228 229 # Check for profile elements 230 if 'profile-page' in html or 'vcard' in html: 231 return True 232 233 return True 234 235 @staticmethod 236 def reddit_check(response: requests.Response, username: str) -> bool: 237 """Check if Reddit profile exists.""" 238 if response.status_code != 200: 239 return False 240 241 html = response.text.lower() 242 243 # Reddit shows "page not found" or "this user has deleted their account" 244 if 'page not found' in html or 'this user has deleted' in html: 245 return False 246 247 # Check for user profile elements 248 if 'user-profile' in html or f'user/{username.lower()}' in html: 249 return True 250 251 return True 252 253 @staticmethod 254 def artstation_check(response: requests.Response, username: str) -> bool: 255 """Check if ArtStation profile exists.""" 256 if response.status_code != 200: 257 return False 258 259 html = response.text.lower() 260 261 # ArtStation shows "The page you were looking for doesn't exist" 262 if 'doesn\'t exist' in html or 'page not found' in html: 263 return False 264 265 # Check for profile elements 266 if 'artist-header' in html or 'user-profile' in html: 267 return True 268 269 return True 270 271 @staticmethod 272 def deviantart_check(response: requests.Response, username: str) -> bool: 273 """Check if DeviantArt profile exists.""" 274 if response.status_code != 200: 275 return False 276 277 html = response.text.lower() 278 279 # DeviantArt shows "The deviation you are looking for appears to be missing" 280 if 'deviation you are looking for' in html or 'does not exist' in html: 281 return False 282 283 return True 284 285 @staticmethod 286 def flickr_check(response: requests.Response, username: str) -> bool: 287 """Check if Flickr profile exists.""" 288 if response.status_code != 200: 289 return False 290 291 html = response.text.lower() 292 293 # Flickr shows "This member is no longer active on Flickr" 294 if 'no longer active' in html or 'does not exist' in html: 295 return False 296 297 return True 298 299 @staticmethod 300 def _500px_check(response: requests.Response, username: str) -> bool: 301 """Check if 500px profile exists.""" 302 if response.status_code != 200: 303 return False 304 305 html = response.text.lower() 306 307 # 500px shows "The page you requested could not be found" 308 if 'could not be found' in html: 309 return False 310 311 return True 312 313 @staticmethod 314 def bandcamp_check(response: requests.Response, username: str) -> bool: 315 """Check if Bandcamp profile exists.""" 316 if response.status_code != 200: 317 return False 318 319 html = response.text.lower() 320 321 # Bandcamp shows "Couldn't find that one" 322 if 'couldn\'t find that one' in html: 323 return False 324 325 return True 326 327 @staticmethod 328 def keybase_check(response: requests.Response, username: str) -> bool: 329 """Check if Keybase profile exists.""" 330 if response.status_code != 200: 331 return False 332 333 html = response.text.lower() 334 335 # Keybase shows "User not found" 336 if 'user not found' in html: 337 return False 338 339 return True 340 341 @staticmethod 342 def gitlab_check(response: requests.Response, username: str) -> bool: 343 """Check if GitLab profile exists.""" 344 if response.status_code == 404: 345 return False 346 347 if response.status_code != 200: 348 return True # GitLab might redirect or show other pages 349 350 html = response.text.lower() 351 352 # GitLab shows "The page could not be found" for 404s 353 if 'page could not be found' in html: 354 return False 355 356 return True 357 358 @staticmethod 359 def universal_check(response: requests.Response, username: str) -> bool: 360 """Universal profile checker for any site.""" 361 if response.status_code != 200: 362 return False 363 364 html = response.text.lower() 365 366 # Common "not found" patterns across many sites 367 not_found_indicators = [ 368 'page not found', 369 '404', 370 'not found', 371 'doesn\'t exist', 372 'does not exist', 373 'couldn\'t be found', 374 'no longer available', 375 'user not found', 376 'profile not found', 377 'account not found', 378 'this page could not be found', 379 'sorry, this page isn\'t available', 380 'the page you were looking for', 381 'we couldn\'t find that page', 382 ] 383 384 for indicator in not_found_indicators: 385 if indicator in html: 386 return False 387 388 # Check for username in page (good indicator of profile page) 389 soup = BeautifulSoup(response.text, 'html.parser') 390 391 # Check title 392 title = soup.find('title') 393 if title and username.lower() in title.text.lower(): 394 return True 395 396 # Check meta tags 397 for meta in soup.find_all('meta'): 398 content = meta.get('content', '').lower() 399 if username.lower() in content: 400 return True 401 402 # Check for common profile elements 403 profile_keywords = ['profile', 'user', 'member', 'account', 'avatar'] 404 page_text = soup.get_text().lower() 405 406 for keyword in profile_keywords: 407 if keyword in page_text: 408 # Also check if username appears near profile keyword 409 text_lower = page_text.replace('\n', ' ').replace('\r', ' ') 410 if username.lower() in text_lower: 411 return True 412 413 # Default to True if we got a 200 and no "not found" indicators 414 return True 415 416 @staticmethod 417 def fansfinder_check(response: requests.Response, username: str) -> bool: 418 """Check if OnlyFans profile exists via FansFinder.""" 419 html = response.text.lower() 420 421 # Check for specific indicators that the profile exists on FansFinder 422 existence_indicators = [ 423 f'data-username="{username.lower()}"', # Username in data attribute 424 f'onlyfans.com/{username.lower()}', # Profile URL in page 425 "media.onlyfinder.com", # OnlyFans content URLs via FansFinder 426 "og:title", # Open Graph tags 427 "og:description", # Open Graph tags 428 "user-profile profile-container", # Profile container 429 "avatar-container", # Avatar container 430 "about-profile", # User about section 431 "profile-icon", # Profile icons 432 "img-responsive", # Responsive images 433 ] 434 435 # Check for "profile not found" indicators 436 not_found_indicators = [ 437 "page not found", 438 "profile not found", 439 "doesn't exist", 440 "does not exist", 441 "no longer active", 442 "user not found", 443 "couldn't find that profile", 444 "this profile is not available", 445 "no results found", 446 "no profiles found", 447 "0 results", 448 ] 449 450 # If we see "not found" indicators, profile doesn't exist 451 for indicator in not_found_indicators: 452 if indicator in html: 453 return False 454 455 # Check for existence indicators 456 for indicator in existence_indicators: 457 if indicator in html: 458 return True 459 460 # Also check for specific patterns in the HTML structure 461 soup = BeautifulSoup(response.text, 'html.parser') 462 463 # Check for the specific FansFinder profile container 464 profile_containers = soup.find_all('div', {'class': re.compile(r'user-profile.*profile-container')}) 465 if profile_containers: 466 for container in profile_containers: 467 # Check if username is in container's data attributes 468 data_username = container.get('data-username', '') 469 if username.lower() == data_username.lower(): 470 return True 471 472 # Check for OnlyFans link in the container 473 onlyfans_links = container.find_all('a', href=True) 474 for link in onlyfans_links: 475 if f'onlyfans.com/{username.lower()}' in link.get('href', '').lower(): 476 return True 477 478 # Check for avatar images 479 avatar_images = soup.find_all('img', {'class': 'img-responsive'}) 480 if avatar_images: 481 # Check if any image has the username in alt or title 482 for img in avatar_images: 483 alt_text = img.get('alt', '').lower() 484 title_text = img.get('title', '').lower() 485 if username.lower() in alt_text or username.lower() in title_text: 486 return True 487 488 # Check for profile headers with the username 489 profile_headers = soup.find_all(['h1', 'h2', 'h3', 'h4']) 490 for header in profile_headers: 491 if username.lower() in header.get_text().lower(): 492 return True 493 494 # Check response status 495 if response.status_code == 200: 496 # Additional check: look for FansFinder specific elements 497 if 'fansfinder' in response.url.lower(): 498 # Check if we have meaningful content (not just a search page) 499 if len(response.text) > 5000: # Profile pages tend to be larger 500 # Look for profile-specific data 501 if 'profile-container' in html or 'avatar-container' in html: 502 return True 503 504 return False 505 506 507 # ================== ENHANCED PROFILE CRAWLER ================== 508 509 class EnhancedProfileCrawler: 510 """Enhanced crawler with site-specific checks and universal image extraction.""" 511 512 def __init__(self, config: CrawlerConfig = None): 513 self.config = config or CrawlerConfig() 514 self.ua = UserAgent() if self.config.USER_AGENT_ROTATION else None 515 self.session = requests.Session() 516 self.session.headers.update({ 517 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 518 'Accept-Language': 'en-US,en;q=0.5', 519 'Accept-Encoding': 'gzip, deflate', 520 'Connection': 'keep-alive', 521 'Upgrade-Insecure-Requests': '1', 522 'Cache-Control': 'no-cache', 523 'DNT': '1', 524 }) 525 self.checkers = SiteCheckers() 526 self.rate_limit_cache = {} 527 self.profile_templates = load_profile_templates(self.config.PROFILE_TEMPLATES_FILE) 528 529 def get_random_user_agent(self) -> str: 530 """Get a random user agent.""" 531 if self.ua: 532 return self.ua.random 533 return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' 534 535 def get_browser_like_headers(self) -> Dict[str, str]: 536 """Get headers that look like a real browser.""" 537 return { 538 'User-Agent': self.get_random_user_agent(), 539 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 540 'Accept-Language': 'en-US,en;q=0.5', 541 'Accept-Encoding': 'gzip, deflate, br', 542 'DNT': '1', 543 'Connection': 'keep-alive', 544 'Upgrade-Insecure-Requests': '1', 545 'Sec-Fetch-Dest': 'document', 546 'Sec-Fetch-Mode': 'navigate', 547 'Sec-Fetch-Site': 'none', 548 'Sec-Fetch-User': '?1', 549 'Cache-Control': 'max-age=0', 550 'TE': 'trailers', 551 } 552 553 def check_rate_limit(self, domain: str): 554 """Rate limiting by domain.""" 555 current_time = time.time() 556 if domain in self.rate_limit_cache: 557 last_request = self.rate_limit_cache[domain] 558 elapsed = current_time - last_request 559 if elapsed < self.config.RATE_LIMIT_DELAY: 560 sleep_time = self.config.RATE_LIMIT_DELAY - elapsed 561 if self.config.VERBOSE: 562 print(f" โณ Rate limiting: waiting {sleep_time:.1f}s for {domain}") 563 time.sleep(sleep_time) 564 565 self.rate_limit_cache[domain] = current_time 566 567 def is_valid_avatar(self, url: str, img_element) -> bool: 568 """Universal check if an image is likely a valid avatar.""" 569 url_lower = url.lower() 570 571 # Skip known placeholder/blank avatars 572 placeholder_keywords = [ 573 'default', 'placeholder', 'anonymous', 'unknown', 574 'ghost', 'blank', 'null', 'empty', 'none', 575 'no-avatar', 'no-avatar.jpg', 'no-photo', 'no-image', 576 'default_avatar', 'default-profile', 'default-user', 577 'gravatar.com/avatar/?', # Empty gravatar 578 'identicon', 'monsterid', 'wavatar', 'retro', # GitHub defaults 579 '0.jpg', '0.png', '0.gif', # Zero filenames 580 ] 581 582 for keyword in placeholder_keywords: 583 if keyword in url_lower: 584 return False 585 586 # Check image element attributes 587 alt_text = (img_element.get('alt') or '').lower() 588 for keyword in placeholder_keywords: 589 if keyword in alt_text: 590 return False 591 592 title_text = (img_element.get('title') or '').lower() 593 for keyword in placeholder_keywords: 594 if keyword in title_text: 595 return False 596 597 # Check for common placeholder dimensions (very small images) 598 try: 599 width_attr = img_element.get('width', '') 600 height_attr = img_element.get('height', '') 601 602 if width_attr and height_attr: 603 width = int(''.join(filter(str.isdigit, width_attr)) or '0') 604 height = int(''.join(filter(str.isdigit, height_attr)) or '0') 605 606 # Skip very small images (likely icons, not avatars) 607 if width < 32 or height < 32: 608 return False 609 except: 610 pass 611 612 # Check for common placeholder class names 613 img_class = ' '.join(img_element.get('class', [])).lower() 614 placeholder_classes = [ 615 'placeholder', 'default', 'empty', 'blank', 616 'no-avatar', 'no-image', 'avatar-placeholder' 617 ] 618 619 for p_class in placeholder_classes: 620 if p_class in img_class: 621 return False 622 623 # Platform-specific checks 624 # GitHub 625 if 'github' in url_lower: 626 if any(x in url_lower for x in ['identicon', 'monsterid', 'retro', 'wavatar']): 627 return False 628 629 # Gravatar 630 if 'gravatar.com/avatar/' in url_lower: 631 # Check for MD5 hash length (32 chars) - empty gravatars have short or no hash 632 import re 633 match = re.search(r'gravatar\.com/avatar/([a-fA-F0-9]+)', url_lower) 634 if match: 635 hash_value = match.group(1) 636 if len(hash_value) < 32: # Not a proper MD5 hash 637 return False 638 639 return True 640 641 def get_image_src(self, img_element) -> Optional[str]: 642 """Get image source from img element with priority order.""" 643 # List of attributes to check in order of priority 644 src_attrs = ['src', 'data-src', 'data-original', 'data-lazy-src', 'data-lazyload', 'srcset'] 645 646 for attr in src_attrs: 647 src = img_element.get(attr) 648 if not src: 649 continue 650 651 # Clean and return the URL 652 if attr == 'srcset': 653 # Handle srcset - take the first (usually highest quality) image 654 srcset_parts = src.split(',') 655 if srcset_parts: 656 first_part = srcset_parts[0].strip() 657 url = first_part.split(' ')[0].strip() 658 return url if url else None 659 else: 660 # For other attributes, return directly 661 return src.strip() 662 663 return None 664 665 def extract_fansfinder_avatar(self, html: str, base_url: str, username: str) -> List[str]: 666 """Extract avatar from FansFinder profile page for specific username.""" 667 soup = BeautifulSoup(html, 'html.parser') 668 image_urls = set() 669 670 # Look for the specific avatar container structure 671 avatar_containers = soup.find_all('div', {'class': 'avatar-container'}) 672 673 for container in avatar_containers: 674 # Check if this container belongs to our username 675 # Look for data-username attribute in parent containers 676 parent = container.find_parent('div', {'data-username': username.lower()}) 677 if parent: 678 # This container belongs to our username 679 images = container.find_all('img') 680 for img in images: 681 src = self.get_image_src(img) 682 if src: 683 try: 684 full_url = urljoin(base_url, src) 685 if self.is_valid_avatar(full_url, img): 686 image_urls.add(full_url) 687 except Exception as e: 688 if self.config.VERBOSE: 689 print(f" [!] URL join error: {e}") 690 691 # Also look for images with username patterns in URLs and attributes 692 for img in soup.find_all('img'): 693 src = self.get_image_src(img) 694 if not src: 695 continue 696 697 src_lower = src.lower() 698 username_lower = username.lower() 699 700 # Check if image URL contains the username 701 if username_lower in src_lower: 702 # Check for specific patterns 703 patterns = [ 704 f'{username_lower}-onlyfans.', 705 f'{username_lower}_onlyfans.', 706 f'/{username_lower}/', 707 f'/{username_lower}-', 708 ] 709 710 for pattern in patterns: 711 if pattern in src_lower: 712 try: 713 full_url = urljoin(base_url, src) 714 if self.is_valid_avatar(full_url, img): 715 image_urls.add(full_url) 716 break 717 except Exception as e: 718 if self.config.VERBOSE: 719 print(f" [!] URL join error: {e}") 720 721 # Check alt and title attributes 722 alt = img.get('alt', '').lower() 723 title = img.get('title', '').lower() 724 725 # If alt or title contains the username and "onlyfans" 726 if username_lower in alt and 'onlyfans' in alt: 727 try: 728 full_url = urljoin(base_url, src) 729 if self.is_valid_avatar(full_url, img): 730 image_urls.add(full_url) 731 except Exception as e: 732 if self.config.VERBOSE: 733 print(f" [!] URL join error: {e}") 734 735 if username_lower in title and 'onlyfans' in title: 736 try: 737 full_url = urljoin(base_url, src) 738 if self.is_valid_avatar(full_url, img): 739 image_urls.add(full_url) 740 except Exception as e: 741 if self.config.VERBOSE: 742 print(f" [!] URL join error: {e}") 743 744 # If we still don't have images, look for the most likely profile image 745 if not image_urls: 746 # Look for images with img-responsive class 747 for img in soup.find_all('img', {'class': 'img-responsive'}): 748 src = self.get_image_src(img) 749 if src: 750 try: 751 full_url = urljoin(base_url, src) 752 if self.is_valid_avatar(full_url, img): 753 image_urls.add(full_url) 754 except Exception as e: 755 if self.config.VERBOSE: 756 print(f" [!] URL join error: {e}") 757 758 return list(image_urls) 759 760 def check_profile_with_cf_bypass(self, url: str, platform: str, username: str) -> Dict[str, Any]: 761 """Check profile with Cloudflare bypass attempts.""" 762 domain = urlparse(url).netloc 763 self.check_rate_limit(domain) 764 765 time.sleep(random.uniform(*self.config.DELAY)) 766 767 try: 768 headers = self.get_browser_like_headers() 769 770 response = self.session.get( 771 url, 772 headers=headers, 773 timeout=self.config.TIMEOUT, 774 allow_redirects=True, 775 stream=False 776 ) 777 778 # For OnlyFans specifically, use fansfinder_check 779 exists = False 780 if platform == "onlyfans": 781 exists = self.checkers.fansfinder_check(response, username) 782 else: 783 # Use standard check for other platforms 784 platform_config = self.profile_templates.get(platform, {}) 785 check_method = platform_config.get("check_method", "status_code") 786 787 if check_method == "status_code": 788 exists = response.status_code == 200 789 else: 790 # Use the appropriate checker method 791 check_method_name = f"{check_method}" 792 if hasattr(self.checkers, check_method_name): 793 checker_func = getattr(self.checkers, check_method_name) 794 exists = checker_func(response, username) 795 796 # Extract images if profile exists 797 image_urls = [] 798 if exists: 799 platform_config = self.profile_templates.get(platform, {}) 800 if platform == "onlyfans": 801 # Use the updated method that takes username 802 image_urls = self.extract_fansfinder_avatar(response.text, url, username) 803 else: 804 image_urls = self.extract_images(response.text, url, platform_config) 805 806 result = { 807 "exists": exists, 808 "status_code": response.status_code, 809 "url": response.url, 810 "image_urls": image_urls, 811 "error": None, 812 "platform": platform, 813 "username": username, 814 "final_url": response.url, 815 "content_length": len(response.text), 816 "cf_protected": "cf-ray" in response.headers # Indicate if Cloudflare was detected 817 } 818 819 return result 820 821 except Exception as e: 822 return { 823 "exists": False, 824 "status_code": 0, 825 "url": url, 826 "image_urls": [], 827 "error": str(e), 828 "platform": platform, 829 "username": username 830 } 831 832 def check_profile(self, url: str, platform: str, username: str) -> Dict[str, Any]: 833 """Check if a profile exists with site-specific logic.""" 834 # Use special handling for OnlyFans (using FansFinder) 835 if platform == "onlyfans": 836 return self.check_profile_with_cf_bypass(url, platform, username) 837 838 domain = urlparse(url).netloc 839 self.check_rate_limit(domain) 840 841 # Random delay to avoid detection 842 time.sleep(random.uniform(*self.config.DELAY)) 843 844 try: 845 # Update headers for this request 846 headers = {'User-Agent': self.get_random_user_agent()} 847 848 response = self.session.get( 849 url, 850 headers=headers, 851 timeout=self.config.TIMEOUT, 852 allow_redirects=True, 853 stream=False 854 ) 855 856 # Get platform configuration 857 platform_config = self.profile_templates.get(platform, {}) 858 if isinstance(platform_config, str): 859 platform_config = {"url": platform_config, "check_method": "status_code"} 860 861 check_method = platform_config.get("check_method", "status_code") 862 exists = False 863 864 # Use appropriate check method 865 if check_method == "status_code": 866 exists = response.status_code == 200 867 elif check_method == "github_check": 868 exists = self.checkers.github_check(response, username) 869 elif check_method == "twitter_check": 870 exists = self.checkers.twitter_check(response, username) 871 elif check_method == "instagram_check": 872 exists = self.checkers.instagram_check(response, username) 873 elif check_method == "reddit_check": 874 exists = self.checkers.reddit_check(response, username) 875 elif check_method == "stackoverflow_check": 876 exists = self.checkers.stackoverflow_check(response, username) 877 elif check_method == "artstation_check": 878 exists = self.checkers.artstation_check(response, username) 879 elif check_method == "deviantart_check": 880 exists = self.checkers.deviantart_check(response, username) 881 elif check_method == "flickr_check": 882 exists = self.checkers.flickr_check(response, username) 883 elif check_method == "500px_check": 884 exists = self.checkers._500px_check(response, username) 885 elif check_method == "bandcamp_check": 886 exists = self.checkers.bandcamp_check(response, username) 887 elif check_method == "keybase_check": 888 exists = self.checkers.keybase_check(response, username) 889 elif check_method == "gitlab_check": 890 exists = self.checkers.gitlab_check(response, username) 891 elif check_method == "universal_check": 892 exists = self.checkers.universal_check(response, username) 893 elif check_method == "fansfinder_check": 894 exists = self.checkers.fansfinder_check(response, username) 895 else: 896 # Default: status code 200 897 exists = response.status_code == 200 898 899 # Extract images if profile exists 900 image_urls = [] 901 if exists: 902 image_urls = self.extract_images(response.text, url, platform_config, username) # Added username parameter 903 904 result = { 905 "exists": exists, 906 "status_code": response.status_code, 907 "url": response.url, # Use final URL after redirects 908 "image_urls": image_urls, 909 "error": None, 910 "platform": platform, 911 "username": username, 912 "final_url": response.url, 913 "content_length": len(response.text) 914 } 915 916 return result 917 918 except requests.exceptions.Timeout: 919 return { 920 "exists": False, 921 "status_code": 408, 922 "url": url, 923 "image_urls": [], 924 "error": "Timeout", 925 "platform": platform, 926 "username": username 927 } 928 except requests.exceptions.ConnectionError: 929 return { 930 "exists": False, 931 "status_code": 0, 932 "url": url, 933 "image_urls": [], 934 "error": "Connection error", 935 "platform": platform, 936 "username": username 937 } 938 except Exception as e: 939 return { 940 "exists": False, 941 "status_code": 0, 942 "url": url, 943 "image_urls": [], 944 "error": str(e), 945 "platform": platform, 946 "username": username 947 } 948 949 def extract_images(self, html: str, base_url: str, platform_config: Dict, username: str = None) -> List[str]: 950 """Universal image extraction that works with any site.""" 951 soup = BeautifulSoup(html, 'html.parser') 952 image_urls = set() 953 954 # Get platform name for specific handling if needed 955 platform_name = platform_config.get("platform", "") 956 platform_url = platform_config.get("url", "") 957 958 # Special handling for OnlyFans/FansFinder 959 if platform_name == "onlyfans" and username: 960 fansfinder_images = self.extract_fansfinder_avatar(html, base_url, username) 961 image_urls.update(fansfinder_images) 962 963 # Phase 1: Try platform-specific selector first 964 avatar_selector = platform_config.get("avatar_selector", "") 965 if avatar_selector: 966 try: 967 for img in soup.select(avatar_selector): 968 src = self.get_image_src(img) 969 if src: 970 try: 971 full_url = urljoin(base_url, src) 972 if self.is_valid_avatar(full_url, img): 973 image_urls.add(full_url) 974 except Exception as e: 975 if self.config.VERBOSE: 976 print(f" [!] URL join error: {e}") 977 except Exception as e: 978 if self.config.VERBOSE: 979 print(f" [!] Error with selector {avatar_selector}: {e}") 980 981 # Phase 2: Universal avatar detection patterns 982 if not image_urls: 983 # Pattern 1: Look for common avatar class patterns 984 avatar_patterns = [ 985 r'.*avatar.*', # Contains 'avatar' 986 r'.*profile.*', # Contains 'profile' 987 r'.*user.*', # Contains 'user' 988 r'.*photo.*', # Contains 'photo' 989 r'.*pic.*', # Contains 'pic' 990 r'.*image.*', # Contains 'image' 991 ] 992 993 for img in soup.find_all('img'): 994 # Check class attribute 995 img_class = ' '.join(img.get('class', [])) 996 if any(re.search(pattern, img_class, re.IGNORECASE) for pattern in avatar_patterns): 997 src = self.get_image_src(img) 998 if src: 999 try: 1000 full_url = urljoin(base_url, src) 1001 if self.is_valid_avatar(full_url, img): 1002 image_urls.add(full_url) 1003 except: 1004 pass 1005 1006 # Check id attribute 1007 img_id = img.get('id', '') 1008 if any(re.search(pattern, img_id, re.IGNORECASE) for pattern in avatar_patterns): 1009 src = self.get_image_src(img) 1010 if src: 1011 try: 1012 full_url = urljoin(base_url, src) 1013 if self.is_valid_avatar(full_url, img): 1014 image_urls.add(full_url) 1015 except: 1016 pass 1017 1018 # Check alt attribute 1019 img_alt = img.get('alt', '').lower() 1020 if any(keyword in img_alt for keyword in ['profile', 'avatar', 'user', 'photo', 'picture']): 1021 src = self.get_image_src(img) 1022 if src: 1023 try: 1024 full_url = urljoin(base_url, src) 1025 if self.is_valid_avatar(full_url, img): 1026 image_urls.add(full_url) 1027 except: 1028 pass 1029 1030 # Phase 3: Look for meta tags (social sharing images) 1031 meta_selectors = [ 1032 'meta[property="og:image"]', 1033 'meta[name="og:image"]', 1034 'meta[property="twitter:image"]', 1035 'meta[name="twitter:image"]', 1036 'meta[itemprop="image"]', 1037 'meta[name="image"]', 1038 ] 1039 1040 for selector in meta_selectors: 1041 for meta in soup.select(selector): 1042 content = meta.get('content') 1043 if content: 1044 try: 1045 full_url = urljoin(base_url, content) 1046 # Check if it looks like a profile image 1047 if self.config.VERBOSE: 1048 print(f" ๐ฑ Found meta image: {full_url}") 1049 image_urls.add(full_url) 1050 except: 1051 pass 1052 1053 # Phase 4: Check all images with common avatar filename patterns 1054 if not image_urls: 1055 avatar_filename_patterns = [ 1056 r'.*avatar.*\.(jpg|jpeg|png|gif|webp)$', 1057 r'.*profile.*\.(jpg|jpeg|png|gif|webp)$', 1058 r'.*user.*\.(jpg|jpeg|png|gif|webp)$', 1059 r'.*pic.*\.(jpg|jpeg|png|gif|webp)$', 1060 r'.*photo.*\.(jpg|jpeg|png|gif|webp)$', 1061 r'.*pfp.*\.(jpg|jpeg|png|gif|webp)$', # pfp = profile picture 1062 r'.*me.*\.(jpg|jpeg|png|gif|webp)$', 1063 ] 1064 1065 for img in soup.find_all('img'): 1066 src = self.get_image_src(img) 1067 if not src: 1068 continue 1069 1070 # Extract filename from URL 1071 filename = src.split('/')[-1].split('?')[0].lower() 1072 1073 # Check if filename matches avatar patterns 1074 if any(re.search(pattern, filename, re.IGNORECASE) for pattern in avatar_filename_patterns): 1075 try: 1076 full_url = urljoin(base_url, src) 1077 if self.is_valid_avatar(full_url, img): 1078 image_urls.add(full_url) 1079 except: 1080 pass 1081 1082 # Phase 5: Check all images with common avatar URL patterns 1083 if not image_urls: 1084 avatar_url_patterns = [ 1085 r'.*\/avatar\/.*', 1086 r'.*\/profile\/.*', 1087 r'.*\/user\/.*', 1088 r'.*\/photo\/.*', 1089 r'gravatar\.com\/avatar\/.*', 1090 r'avatars\..*\.com\/.*', 1091 r'cdn\.discordapp\.com\/avatars\/.*', 1092 r'ugc\.production\.linktr\.ee\/.*', # Linktree CDN 1093 r'pbs\.twimg\.com\/profile_images\/.*', # Twitter 1094 r'instagram\.fbom.*\.fna\.fbcdn\.net\/.*', # Instagram 1095 r'i\.redd\.it\/.*', # Reddit 1096 r'i\.imgur\.com\/.*', # Imgur 1097 r'media\.onlyfinder\.com\/.*', # FansFinder/OnlyFans CDN 1098 ] 1099 1100 for img in soup.find_all('img'): 1101 src = self.get_image_src(img) 1102 if not src: 1103 continue 1104 1105 # Check if URL matches avatar patterns 1106 if any(re.search(pattern, src, re.IGNORECASE) for pattern in avatar_url_patterns): 1107 try: 1108 full_url = urljoin(base_url, src) 1109 if self.is_valid_avatar(full_url, img): 1110 image_urls.add(full_url) 1111 except: 1112 pass 1113 1114 # Phase 6: Fallback - take first few images that look reasonable 1115 if not image_urls: 1116 for img in soup.find_all('img')[:10]: # Limit to first 10 images 1117 src = self.get_image_src(img) 1118 if not src: 1119 continue 1120 1121 try: 1122 full_url = urljoin(base_url, src) 1123 1124 # Check if it's a reasonable size (not an icon) 1125 img_width = img.get('width') 1126 img_height = img.get('height') 1127 1128 # If dimensions are specified, check if it's avatar-sized 1129 if img_width and img_height: 1130 width = int(img_width) if img_width.isdigit() else 0 1131 height = int(img_height) if img_height.isdigit() else 0 1132 1133 # Avatars are usually square-ish and not tiny 1134 if width > 50 and height > 50: 1135 if self.is_valid_avatar(full_url, img): 1136 image_urls.add(full_url) 1137 else: 1138 # No dimensions, just add it 1139 if self.is_valid_avatar(full_url, img): 1140 image_urls.add(full_url) 1141 1142 except: 1143 pass 1144 1145 # Filter and clean URLs 1146 filtered_urls = [] 1147 for url in image_urls: 1148 # Skip data URIs and javascript 1149 if url.startswith(('data:', 'javascript:')): 1150 continue 1151 1152 # Remove query parameters that might cause issues 1153 try: 1154 parsed = urlparse(url) 1155 clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}" 1156 1157 # Check if it's likely an image 1158 path_lower = parsed.path.lower() 1159 has_image_ext = any(path_lower.endswith(ext) for ext in self.config.VALID_IMAGE_EXTENSIONS) 1160 1161 # Expanded list with more mainstream and frequently used hosts 1162 is_known_avatar_host = any( 1163 host in clean_url.lower() 1164 for host in [ 1165 'avatars.githubusercontent.com', # GitHub 1166 'gravatar.com', # Gravatar 1167 'avatar.trakt.tv', # Trakt 1168 'ugc.production.linktr.ee', # Linktree 1169 'cdn.discordapp.com', # Discord 1170 'pbs.twimg.com/profile_images', # Twitter/X 1171 'instagram.fbom1-2.fna.fbcdn.net', # Instagram (Facebook CDN) 1172 'scontent.cdninstagram.com', # Instagram 1173 'i.redd.it', # Reddit 1174 'i.imgur.com', # Imgur 1175 'public.onlyfans.com/files', # OnlyFans 1176 'media.onlyfinder.com', # FansFinder/OnlyFans CDN 1177 # Additional mainstream hosts 1178 'platform.twitter.com', # Twitter CDN variant 1179 'abs.twimg.com', # Twitter avatars 1180 'lh3.googleusercontent.com', # Google/YouTube 1181 'yt3.ggpht.com', # YouTube 1182 'a0.muscdn.com', # SoundCloud 1183 'i1.sndcdn.com', # SoundCloud 1184 'a.pomf.lol', # Pomf.cat (meme culture) 1185 'pbs.twimg.com/media', # Twitter media (profile pics often here) 1186 'via.placeholder.com', # Common placeholder service 1187 'ui-avatars.com', # Generated avatars 1188 'robohash.org', # Robot avatars 1189 'identicons.github.com', # GitHub identicons 1190 'secure.gravatar.com/avatar', # Gravatar secure 1191 'steamcdn-a.akamaihd.net', # Steam 1192 'steamuserimages-a.akamaihd.net', # Steam 1193 'avatar-management--avatars.us-west-2', # Twitch 1194 'static-cdn.jtvnw.net', # Twitch 1195 'tiktokcdn.com', # TikTok 1196 'byteimg.com', # TikTok CDN 1197 'ssl-profile-images-cdn.viago.co', # LinkedIn variant 1198 'media.licdn.com/dms/image', # LinkedIn 1199 'https://media.licdn.com/dms/image/v2/' 1200 ] 1201 ) 1202 1203 if has_image_ext or is_known_avatar_host: 1204 filtered_urls.append(clean_url) 1205 except: 1206 continue 1207 1208 # Return unique URLs, limited to reasonable number 1209 return list(set(filtered_urls))[:10] # Return up to 10 unique images 1210 1211 def crawl_usernames(self, usernames: List[str], platforms: List[str] = None) -> Dict[str, List[Dict]]: 1212 """Crawl multiple usernames across platforms.""" 1213 if platforms is None: 1214 platforms = get_enabled_platforms(self.profile_templates) 1215 1216 results = {username: [] for username in usernames} 1217 1218 with ThreadPoolExecutor(max_workers=min(self.config.MAX_WORKERS, len(platforms))) as executor: 1219 futures = [] 1220 1221 for username in usernames: 1222 username = username.strip() 1223 if not username: 1224 continue 1225 1226 for platform in platforms: 1227 if platform not in self.profile_templates: 1228 continue 1229 1230 platform_config = self.profile_templates[platform] 1231 if isinstance(platform_config, str): 1232 url = platform_config.format(username) 1233 else: 1234 url = platform_config.get("url", "").format(username) 1235 1236 if not url: 1237 continue 1238 1239 future = executor.submit( 1240 self.check_profile, 1241 url, platform, username 1242 ) 1243 futures.append((future, username, platform, url)) 1244 1245 # Process results 1246 completed = 0 1247 total = len(futures) 1248 1249 for future, username, platform, url in futures: 1250 try: 1251 result = future.result(timeout=self.config.TIMEOUT + 10) 1252 results[username].append(result) 1253 completed += 1 1254 1255 if self.config.VERBOSE: 1256 status = "โ " if result["exists"] else "โ" 1257 images = f" ({len(result['image_urls'])} img)" if result["image_urls"] else "" 1258 error = f" - {result['error']}" if result["error"] else "" 1259 print(f" {status} [{completed}/{total}] {platform}: {result['exists']}{images}{error}") 1260 1261 except Exception as e: 1262 if self.config.VERBOSE: 1263 print(f" โ {platform}: Error - {e}") 1264 1265 return results 1266 1267 1268 # ================== IMAGE PROCESSING ================== 1269 1270 def get_image_bytes(source: str, max_size_mb: int = 5, timeout: int = 10) -> Optional[bytes]: 1271 """Download image with error handling.""" 1272 try: 1273 if source.startswith("data:"): 1274 b64_data = source.split(",", 1)[1] 1275 return base64.b64decode(b64_data) 1276 elif source.startswith("http://") or source.startswith("https://"): 1277 headers = { 1278 'User-Agent': UserAgent().random, 1279 'Accept': 'image/*,*/*;q=0.8', 1280 } 1281 1282 response = requests.get( 1283 source, 1284 headers=headers, 1285 timeout=timeout, 1286 stream=True, 1287 allow_redirects=True 1288 ) 1289 response.raise_for_status() 1290 1291 # Check content type 1292 content_type = response.headers.get('Content-Type', '').lower() 1293 if content_type and not any(x in content_type for x in ['image/', 'octet-stream', 'binary']): 1294 return None 1295 1296 # Read in chunks 1297 content = b'' 1298 max_bytes = max_size_mb * 1024 * 1024 1299 1300 for chunk in response.iter_content(chunk_size=8192): 1301 content += chunk 1302 if len(content) > max_bytes: 1303 return None 1304 1305 return content 1306 else: 1307 if not os.path.exists(source): 1308 return None 1309 1310 with open(source, "rb") as f: 1311 return f.read() 1312 except Exception: 1313 return None 1314 1315 1316 def compute_face_encoding(image_bytes: bytes) -> Optional[np.ndarray]: 1317 """Extract face encoding from image.""" 1318 try: 1319 image = Image.open(BytesIO(image_bytes)) 1320 if image.mode != 'RGB': 1321 image = image.convert('RGB') 1322 1323 rgb_image = np.array(image) 1324 1325 # Try face detection 1326 face_locations = face_recognition.face_locations(rgb_image, model="hog") 1327 if not face_locations: 1328 return None 1329 1330 encodings = face_recognition.face_encodings(rgb_image, face_locations) 1331 return encodings[0] if encodings else None 1332 1333 except Exception: 1334 return None 1335 1336 1337 # ================== FACE INDEX SYSTEM ================== 1338 1339 class FaceIndexSystem: 1340 """Face indexing system.""" 1341 1342 def __init__(self): 1343 self.faces = [] 1344 self.config = CrawlerConfig() 1345 1346 def index_from_results(self, crawl_results: Dict[str, List[Dict]]) -> List[Dict]: 1347 """Index faces from crawl results.""" 1348 new_faces = [] 1349 1350 for username, results in crawl_results.items(): 1351 for result in results: 1352 if not result["exists"]: 1353 continue 1354 1355 for image_url in result["image_urls"][:2]: # Try first 2 images 1356 try: 1357 img_bytes = get_image_bytes(image_url) 1358 if not img_bytes: 1359 continue 1360 1361 encoding = compute_face_encoding(img_bytes) 1362 if encoding is None: 1363 continue 1364 1365 face_record = { 1366 "username": username, 1367 "platform": result["platform"], 1368 "page_url": result["url"], 1369 "image_url": image_url, 1370 "encoding": encoding.tolist(), 1371 "timestamp": time.time() 1372 } 1373 1374 self.faces.append(face_record) 1375 new_faces.append(face_record) 1376 1377 if self.config.VERBOSE: 1378 print(f" ๐ค Face indexed: {username}@{result['platform']}") 1379 1380 except Exception as e: 1381 if self.config.VERBOSE: 1382 print(f" [!] Error: {e}") 1383 1384 return new_faces 1385 1386 def search_faces(self, target_encoding: np.ndarray, threshold: float = 0.6, top_k: int = 10) -> List[Dict]: 1387 """Search for similar faces.""" 1388 results = [] 1389 1390 for face in self.faces: 1391 try: 1392 face_encoding = np.array(face["encoding"]) 1393 distance = float(face_recognition.face_distance([target_encoding], face_encoding)[0]) 1394 similarity = max(0.0, 1.0 - min(distance, 1.0)) 1395 1396 results.append({ 1397 "username": face["username"], 1398 "platform": face["platform"], 1399 "similarity": similarity, 1400 "distance": distance, 1401 "match": distance < threshold, 1402 "page_url": face["page_url"], 1403 "image_url": face["image_url"] 1404 }) 1405 except Exception: 1406 continue 1407 1408 results.sort(key=lambda x: x["similarity"], reverse=True) 1409 return results[:top_k] 1410 1411 def save_index(self, filename: str = "face_index.json"): 1412 """Save index to file.""" 1413 data = { 1414 "faces": self.faces, 1415 "metadata": { 1416 "total": len(self.faces), 1417 "timestamp": time.time() 1418 } 1419 } 1420 1421 with open(filename, 'w') as f: 1422 json.dump(data, f, indent=2) 1423 1424 print(f"๐พ Saved {len(self.faces)} faces to {filename}") 1425 1426 def load_index(self, filename: str = "face_index.json"): 1427 """Load index from file.""" 1428 try: 1429 with open(filename, 'r') as f: 1430 data = json.load(f) 1431 1432 self.faces = data.get("faces", []) 1433 print(f"๐ Loaded {len(self.faces)} faces from {filename}") 1434 return True 1435 except Exception as e: 1436 print(f"โ Error loading: {e}") 1437 return False 1438 1439 1440 # ================== NEW FUNCTIONS FOR URI FACE COMPARISON ================== 1441 1442 def compare_face_from_uri(face_system, uri: str, username: str = None, save_to_db: bool = False): 1443 """Compare a face from a URI (URL or local path) with indexed faces.""" 1444 print(f"\n๐ Comparing face from URI: {uri}") 1445 1446 # Load target image 1447 target_bytes = get_image_bytes(uri) 1448 if not target_bytes: 1449 print("โ Could not load image from URI") 1450 return 1451 1452 # Check if it's a valid image 1453 try: 1454 Image.open(BytesIO(target_bytes)).verify() 1455 except Exception: 1456 print("โ Invalid image file") 1457 return 1458 1459 # Extract face encoding 1460 print("๐งฌ Extracting face encoding...") 1461 target_encoding = compute_face_encoding(target_bytes) 1462 if target_encoding is None: 1463 print("โ No face detected in the image") 1464 return 1465 1466 print("โ Face encoding extracted successfully") 1467 1468 # Get comparison parameters 1469 try: 1470 threshold = float(input("Match threshold (0.1-1.0, default 0.6): ") or "0.6") 1471 top_k = int(input("Number of results to show (default 20): ") or "20") 1472 except ValueError: 1473 threshold = 0.6 1474 top_k = 20 1475 1476 # Search for matches 1477 print(f"\n๐ Searching {len(face_system.faces)} indexed faces...") 1478 matches = face_system.search_faces(target_encoding, threshold, top_k) 1479 1480 if not matches: 1481 print("โ No matches found") 1482 return matches 1483 1484 # Display results 1485 print(f"\n๐ Top {len(matches)} matches:") 1486 for i, match in enumerate(matches, 1): 1487 symbol = "โ " if match["match"] else "โ ๏ธ" 1488 similarity_percent = match['similarity'] * 100 1489 1490 # Color code based on similarity 1491 if similarity_percent >= 80: 1492 similarity_str = f"๐ฏ {similarity_percent:.1f}%" 1493 elif similarity_percent >= 60: 1494 similarity_str = f"๐ {similarity_percent:.1f}%" 1495 else: 1496 similarity_str = f"๐ {similarity_percent:.1f}%" 1497 1498 print(f"\n {i}. {symbol} {similarity_str}") 1499 print(f" User: {match['username']}") 1500 print(f" Platform: {match['platform']}") 1501 print(f" Distance: {match['distance']:.4f}") 1502 1503 if match['image_url']: 1504 print(f" Image: {match['image_url'][:80]}...") 1505 1506 # Option to save the face to database 1507 if save_to_db and username: 1508 save_face_to_db(face_system, target_encoding, uri, username, uri) 1509 print(f"โ Face saved to database with username: {username}") 1510 1511 return matches 1512 1513 1514 def save_face_to_db(face_system, encoding: np.ndarray, image_url: str, username: str, page_url: str = None, platform: str = "direct_uri"): 1515 """Save a face to the database directly.""" 1516 face_record = { 1517 "username": username, 1518 "platform": platform, 1519 "page_url": page_url or image_url, 1520 "image_url": image_url, 1521 "encoding": encoding.tolist(), 1522 "timestamp": time.time(), 1523 "source": "direct_uri" 1524 } 1525 1526 face_system.faces.append(face_record) 1527 return face_record 1528 1529 1530 def batch_compare_from_file(face_system, filename: str): 1531 """Compare faces from a file containing URIs and usernames.""" 1532 if not os.path.exists(filename): 1533 print(f"โ File '{filename}' not found") 1534 return 1535 1536 try: 1537 with open(filename, 'r') as f: 1538 lines = f.readlines() 1539 except Exception as e: 1540 print(f"โ Error reading file: {e}") 1541 return 1542 1543 print(f"\n๐ Processing {len(lines)} entries from {filename}...") 1544 1545 results = [] 1546 for line_num, line in enumerate(lines, 1): 1547 line = line.strip() 1548 if not line or line.startswith('#'): 1549 continue 1550 1551 parts = line.split(',') 1552 if len(parts) >= 2: 1553 uri = parts[0].strip() 1554 username = parts[1].strip() 1555 1556 print(f"\n[{line_num}] Processing {username} - {uri}") 1557 1558 # Get the face 1559 target_bytes = get_image_bytes(uri) 1560 if not target_bytes: 1561 print(f" โ Could not load image") 1562 continue 1563 1564 target_encoding = compute_face_encoding(target_bytes) 1565 if target_encoding is None: 1566 print(f" โ No face detected") 1567 continue 1568 1569 # Search for matches 1570 matches = face_system.search_faces(target_encoding, threshold=0.6, top_k=5) 1571 1572 if matches: 1573 best_match = matches[0] 1574 results.append({ 1575 'uri': uri, 1576 'username': username, 1577 'best_match': best_match['username'], 1578 'similarity': best_match['similarity'], 1579 'platform': best_match['platform'] 1580 }) 1581 1582 print(f" ๐ Best match: {best_match['username']} ({best_match['similarity']:.3f})") 1583 else: 1584 print(f" โ ๏ธ No matches found") 1585 1586 # Save results 1587 if results: 1588 output_file = f"comparison_results_{int(time.time())}.json" 1589 with open(output_file, 'w') as f: 1590 json.dump(results, f, indent=2) 1591 print(f"\n๐พ Results saved to {output_file}") 1592 1593 return results 1594 1595 1596 def extract_faces_from_webpage(url: str, username: str = None): 1597 """Extract faces from a webpage URL.""" 1598 print(f"\n๐ Extracting faces from webpage: {url}") 1599 1600 crawler = EnhancedProfileCrawler() 1601 1602 # Get the page 1603 try: 1604 response = requests.get( 1605 url, 1606 headers={'User-Agent': UserAgent().random}, 1607 timeout=15 1608 ) 1609 response.raise_for_status() 1610 except Exception as e: 1611 print(f"โ Error fetching webpage: {e}") 1612 return [] 1613 1614 # Extract images 1615 image_urls = crawler.extract_images(response.text, url, {}) 1616 1617 if not image_urls: 1618 print("โ No images found on page") 1619 return [] 1620 1621 print(f"๐ธ Found {len(image_urls)} images") 1622 1623 faces = [] 1624 for i, img_url in enumerate(image_urls[:10], 1): # Limit to first 10 images 1625 print(f" [{i}] Processing: {img_url[:80]}...") 1626 1627 img_bytes = get_image_bytes(img_url) 1628 if not img_bytes: 1629 continue 1630 1631 encoding = compute_face_encoding(img_bytes) 1632 if encoding is not None: 1633 faces.append({ 1634 'image_url': img_url, 1635 'encoding': encoding, 1636 'page_url': url 1637 }) 1638 print(f" โ Face detected") 1639 1640 print(f"\nโ Found {len(faces)} faces on the webpage") 1641 return faces 1642 1643 1644 def create_uri_batch_file(): 1645 """Create a template batch file for URI comparisons.""" 1646 template = """# URI comparison batch file 1647 # Format: image_url_or_path,username,optional_platform 1648 # 1649 # Examples: 1650 https://example.com/face1.jpg,john_doe,facebook 1651 https://example.com/face2.jpg,jane_smith,instagram 1652 /path/to/local/image.jpg,anonymous,direct 1653 """ 1654 1655 filename = f"uri_batch_{int(time.time())}.txt" 1656 with open(filename, 'w') as f: 1657 f.write(template) 1658 1659 print(f"๐ Created batch template file: {filename}") 1660 print("Edit this file with your URIs and usernames, then use option 7.") 1661 1662 1663 # ================== NEW FUNCTION: UPLOAD IMAGE AND SEARCH PLATFORMS ================== 1664 1665 def search_platforms_by_face(face_system, crawler): 1666 """Search selected platforms using a face image.""" 1667 print("\n๐ธ Upload Image and Search Platforms") 1668 print("-" * 40) 1669 1670 # Get image 1671 uri = input("Enter image path or URL: ").strip() 1672 if not uri: 1673 print("โ No image provided") 1674 return 1675 1676 # Extract face from image 1677 print("๐ Extracting face from image...") 1678 target_bytes = get_image_bytes(uri) 1679 if not target_bytes: 1680 print("โ Could not load image") 1681 return 1682 1683 target_encoding = compute_face_encoding(target_bytes) 1684 if target_encoding is None: 1685 print("โ No face detected in image") 1686 return 1687 1688 print("โ Face encoding extracted") 1689 1690 # Ask if they want to search with a specific username 1691 use_username = input("\nDo you want to search with a specific username? (y/N): ").strip().lower() 1692 username_to_search = None 1693 1694 if use_username == 'y': 1695 username_to_search = input("Enter username to search: ").strip() 1696 if username_to_search: 1697 print(f"๐ Will search for username: {username_to_search}") 1698 1699 # Show available platforms 1700 templates = load_profile_templates() 1701 enabled_platforms = get_enabled_platforms(templates) 1702 categories = get_platforms_by_category(templates) 1703 1704 if not enabled_platforms: 1705 print("โ No platforms enabled in profile_templates.json") 1706 return 1707 1708 print(f"\n๐ Available platforms ({len(enabled_platforms)} enabled):") 1709 for category, platforms in categories.items(): 1710 print(f"\n {category}:") 1711 for platform in sorted(platforms): 1712 config = templates[platform] 1713 url_template = config.get("url", "No URL") 1714 print(f" {platform:20} - {url_template}") 1715 1716 # Select platforms 1717 platform_input = input("\nEnter platforms to search (comma-separated, or 'all'): ").strip().lower() 1718 1719 if platform_input == 'all': 1720 selected_platforms = enabled_platforms 1721 else: 1722 selected_platforms = [] 1723 for item in platform_input.split(','): 1724 item = item.strip() 1725 if item in enabled_platforms: 1726 selected_platforms.append(item) 1727 1728 if not selected_platforms: 1729 print("โ ๏ธ No platforms selected") 1730 return 1731 1732 print(f"\n๐ Will search {len(selected_platforms)} platform(s)") 1733 1734 # Search logic 1735 results = [] 1736 1737 if username_to_search: 1738 # Search specific username on selected platforms 1739 print(f"\n๐ Searching username '{username_to_search}' on {len(selected_platforms)} platforms...") 1740 1741 crawl_results = crawler.crawl_usernames([username_to_search], selected_platforms) 1742 1743 # Check if any profiles exist 1744 user_results = crawl_results.get(username_to_search, []) 1745 existing_profiles = [r for r in user_results if r["exists"]] 1746 1747 if existing_profiles: 1748 print(f"\nโ Found {len(existing_profiles)} profile(s) for '{username_to_search}':") 1749 1750 for result in existing_profiles: 1751 print(f"\n Platform: {result['platform']}") 1752 print(f" URL: {result['url']}") 1753 print(f" Images found: {len(result['image_urls'])}") 1754 1755 # Compare face with images from profile 1756 if result["image_urls"]: 1757 print(" Comparing faces from profile images...") 1758 1759 best_similarity = 0 1760 best_match_url = None 1761 1762 for img_url in result["image_urls"][:3]: # Check first 3 images 1763 try: 1764 img_bytes = get_image_bytes(img_url) 1765 if img_bytes: 1766 img_encoding = compute_face_encoding(img_bytes) 1767 if img_encoding is not None: 1768 distance = float(face_recognition.face_distance([target_encoding], img_encoding)[0]) 1769 similarity = max(0.0, 1.0 - min(distance, 1.0)) 1770 1771 if similarity > best_similarity: 1772 best_similarity = similarity 1773 best_match_url = img_url 1774 except Exception: 1775 continue 1776 1777 if best_similarity > 0: 1778 print(f" ๐ฏ Best face match: {best_similarity:.3f}") 1779 if best_similarity > 0.6: 1780 print(f" โ LIKELY SAME PERSON!") 1781 1782 results.append({ 1783 "platform": result["platform"], 1784 "url": result["url"], 1785 "username": username_to_search, 1786 "similarity": best_similarity, 1787 "match_url": best_match_url, 1788 "type": "username_search" 1789 }) 1790 else: 1791 print(" โ ๏ธ No images to compare") 1792 else: 1793 print(f"โ No profiles found for '{username_to_search}' on selected platforms") 1794 1795 else: 1796 # First check existing face database 1797 print("\n๐ Checking existing face database...") 1798 matches = face_system.search_faces(target_encoding, threshold=0.6, top_k=5) 1799 1800 if matches: 1801 print(f"๐ฏ Found {len([m for m in matches if m['match']])} potential matches in database:") 1802 for match in matches[:3]: # Show top 3 1803 if match['match']: 1804 print(f" ๐ค {match['username']} on {match['platform']} - similarity: {match['similarity']:.3f}") 1805 1806 # Ask if they want to search for these usernames 1807 search_existing = input("\nSearch platforms for these usernames? (y/N): ").strip().lower() 1808 1809 if search_existing == 'y': 1810 usernames_to_search = list(set([m['username'] for m in matches if m['match']]))[:5] 1811 print(f"๐ Searching for {len(usernames_to_search)} username(s): {', '.join(usernames_to_search)}") 1812 1813 crawl_results = crawler.crawl_usernames(usernames_to_search, selected_platforms) 1814 1815 for username in usernames_to_search: 1816 user_results = crawl_results.get(username, []) 1817 for result in user_results: 1818 if result["exists"]: 1819 results.append({ 1820 "platform": result["platform"], 1821 "url": result["url"], 1822 "username": username, 1823 "type": "database_match_search" 1824 }) 1825 1826 # Option to do a reverse image search style lookup 1827 print("\n๐ Alternative: Check popular platforms for matching faces") 1828 print(" (This will crawl and compare faces from profiles)") 1829 1830 do_crawl = input("\nCrawl and compare faces from profiles? (y/N): ").strip().lower() 1831 1832 if do_crawl == 'y': 1833 # For demo, we'd need to implement a broader search 1834 # For now, we'll show a message 1835 print("\nโ ๏ธ Advanced face-based platform crawling would require:") 1836 print(" 1. Generating common usernames from face similarity") 1837 print(" 2. Searching those usernames on platforms") 1838 print(" 3. Comparing faces from found profiles") 1839 print("\n๐ก Tip: Use option 1 with suspected usernames first") 1840 1841 # Display results 1842 if results: 1843 print(f"\n๐ Search Results ({len(results)}):") 1844 print("=" * 60) 1845 1846 for i, result in enumerate(results, 1): 1847 print(f"\n{i}. Platform: {result['platform']}") 1848 print(f" Username: {result['username']}") 1849 print(f" URL: {result['url']}") 1850 print(f" Type: {result['type']}") 1851 1852 if 'similarity' in result: 1853 print(f" Face Similarity: {result['similarity']:.3f}") 1854 if result['similarity'] > 0.7: 1855 print(" ๐ฏ HIGH CONFIDENCE MATCH") 1856 elif result['similarity'] > 0.5: 1857 print(" ๐ Possible match") 1858 1859 if 'match_url' in result: 1860 print(f" Match Image: {result['match_url'][:80]}...") 1861 else: 1862 print("\nโ No results found") 1863 1864 # Offer to save the face to database 1865 if target_encoding is not None: 1866 save_face = input("\n๐พ Save this face to database for future searches? (y/N): ").strip().lower() 1867 if save_face == 'y': 1868 username = input("Enter username for this face (or leave blank for 'unknown'): ").strip() or "unknown" 1869 platform = input("Enter source platform (or leave blank for 'upload'): ").strip() or "upload" 1870 1871 face_record = { 1872 "username": username, 1873 "platform": platform, 1874 "page_url": uri if uri.startswith('http') else f"file://{os.path.abspath(uri)}", 1875 "image_url": uri, 1876 "encoding": target_encoding.tolist(), 1877 "timestamp": time.time(), 1878 "source": "image_upload_search" 1879 } 1880 1881 face_system.faces.append(face_record) 1882 print(f"โ Face saved to database as '{username}'") 1883 1884 1885 # ================== TEMPLATE MANAGEMENT FUNCTIONS ================== 1886 1887 def manage_templates_menu(): 1888 """Menu for managing profile templates.""" 1889 while True: 1890 print("\n" + "=" * 60) 1891 print("Profile Templates Management") 1892 print("=" * 60) 1893 print("1. List all platforms") 1894 print("2. List by category") 1895 print("3. Add new platform") 1896 print("4. Edit existing platform") 1897 print("5. Enable/Disable platform") 1898 print("6. Export templates to JSON") 1899 print("7. Import templates from JSON") 1900 print("8. Back to main menu") 1901 1902 choice = input("\nSelect option (1-8): ").strip() 1903 1904 if choice == "1": 1905 # List all platforms 1906 templates = load_profile_templates() 1907 print(f"\n๐ All Platforms ({len(templates)} total):") 1908 for i, (platform_name, config) in enumerate(templates.items(), 1): 1909 enabled = "โ " if config.get("enabled", True) else "โ" 1910 category = config.get("category", "Uncategorized") 1911 print(f" {i:2d}. {enabled} {platform_name:20} - {category}") 1912 1913 elif choice == "2": 1914 # List by category 1915 templates = load_profile_templates() 1916 categories = get_platforms_by_category(templates) 1917 1918 print("\n๐ Platforms by Category:") 1919 for category, platforms in categories.items(): 1920 print(f"\n {category}:") 1921 for platform in sorted(platforms): 1922 config = templates[platform] 1923 enabled = "โ " if config.get("enabled", True) else "โ" 1924 url_template = config.get("url", "No URL") 1925 print(f" {enabled} {platform:20} - {url_template}") 1926 1927 elif choice == "3": 1928 # Add new platform 1929 print("\nโ Add New Platform") 1930 1931 platform_name = input("Platform name (lowercase, no spaces): ").strip().lower() 1932 if not platform_name: 1933 print("โ Platform name required") 1934 continue 1935 1936 templates = load_profile_templates() 1937 if platform_name in templates: 1938 print(f"โ Platform '{platform_name}' already exists") 1939 continue 1940 1941 url_template = input("URL template (use {} for username): ").strip() 1942 if not url_template or "{}" not in url_template: 1943 print("โ URL template must contain {} placeholder for username") 1944 continue 1945 1946 category = input("Category (e.g., Social Media, Tech, etc): ").strip() or "Other" 1947 1948 print("\nAvailable check methods:") 1949 print(" status_code - Simple 200 OK check") 1950 print(" universal_check - Universal profile check (recommended)") 1951 print(" github_check - GitHub specific check") 1952 print(" twitter_check - Twitter specific check") 1953 print(" instagram_check - Instagram specific check") 1954 print(" fansfinder_check - OnlyFans via FansFinder check") 1955 1956 check_method = input("Check method (default: universal_check): ").strip() or "universal_check" 1957 1958 avatar_selector = input("Avatar CSS selector (optional): ").strip() 1959 1960 requires_js = input("Requires JavaScript? (y/N): ").strip().lower() == 'y' 1961 1962 new_template = { 1963 "url": url_template, 1964 "check_method": check_method, 1965 "avatar_selector": avatar_selector, 1966 "requires_javascript": requires_js, 1967 "platform": platform_name, 1968 "category": category, 1969 "enabled": True, 1970 "priority": 3 1971 } 1972 1973 templates[platform_name] = new_template 1974 save_profile_templates(templates) 1975 print(f"โ Platform '{platform_name}' added successfully") 1976 1977 elif choice == "4": 1978 # Edit existing platform 1979 templates = load_profile_templates() 1980 1981 print("\nโ๏ธ Edit Platform") 1982 platforms = list(templates.keys()) 1983 1984 for i, platform in enumerate(platforms, 1): 1985 print(f" {i:2d}. {platform}") 1986 1987 try: 1988 selection = int(input("\nSelect platform number: ").strip()) 1989 if 1 <= selection <= len(platforms): 1990 platform_name = platforms[selection - 1] 1991 config = templates[platform_name] 1992 1993 print(f"\nEditing: {platform_name}") 1994 print(f"Current URL: {config.get('url')}") 1995 new_url = input(f"New URL (Enter to keep current): ").strip() 1996 if new_url: 1997 if "{}" not in new_url: 1998 print("โ URL must contain {} placeholder") 1999 continue 2000 config["url"] = new_url 2001 2002 print(f"Current category: {config.get('category')}") 2003 new_category = input(f"New category: ").strip() 2004 if new_category: 2005 config["category"] = new_category 2006 2007 print(f"Current check method: {config.get('check_method')}") 2008 new_check = input(f"New check method: ").strip() 2009 if new_check: 2010 config["check_method"] = new_check 2011 2012 print(f"Current avatar selector: {config.get('avatar_selector')}") 2013 new_selector = input(f"New avatar selector: ").strip() 2014 if new_selector: 2015 config["avatar_selector"] = new_selector 2016 2017 save_profile_templates(templates) 2018 print(f"โ Platform '{platform_name}' updated") 2019 else: 2020 print("โ Invalid selection") 2021 except (ValueError, IndexError): 2022 print("โ Invalid input") 2023 2024 elif choice == "5": 2025 # Enable/Disable platform 2026 templates = load_profile_templates() 2027 2028 print("\nโ๏ธ Enable/Disable Platform") 2029 platforms = list(templates.keys()) 2030 2031 for i, platform in enumerate(platforms, 1): 2032 enabled = "โ " if templates[platform].get("enabled", True) else "โ" 2033 print(f" {i:2d}. {enabled} {platform}") 2034 2035 try: 2036 selection = int(input("\nSelect platform number: ").strip()) 2037 if 1 <= selection <= len(platforms): 2038 platform_name = platforms[selection - 1] 2039 current = templates[platform_name].get("enabled", True) 2040 templates[platform_name]["enabled"] = not current 2041 2042 status = "enabled" if not current else "disabled" 2043 save_profile_templates(templates) 2044 print(f"โ Platform '{platform_name}' {status}") 2045 else: 2046 print("โ Invalid selection") 2047 except (ValueError, IndexError): 2048 print("โ Invalid input") 2049 2050 elif choice == "6": 2051 # Export templates 2052 filename = input("Export filename (default: profile_templates_export.json): ").strip() or "profile_templates_export.json" 2053 templates = load_profile_templates() 2054 save_profile_templates(templates, filename) 2055 print(f"โ Templates exported to {filename}") 2056 2057 elif choice == "7": 2058 # Import templates 2059 filename = input("Import filename: ").strip() 2060 if not filename: 2061 print("โ Filename required") 2062 continue 2063 2064 if not os.path.exists(filename): 2065 print(f"โ File '{filename}' not found") 2066 continue 2067 2068 try: 2069 with open(filename, 'r') as f: 2070 imported = json.load(f) 2071 2072 # Merge or replace? 2073 print("\nImport options:") 2074 print(" 1. Merge with existing (keep both)") 2075 print(" 2. Replace existing (overwrite)") 2076 print(" 3. Cancel") 2077 2078 option = input("Select option (1-3): ").strip() 2079 2080 if option == "1": 2081 templates = load_profile_templates() 2082 templates.update(imported) 2083 save_profile_templates(templates) 2084 print(f"โ Merged {len(imported)} templates") 2085 elif option == "2": 2086 save_profile_templates(imported) 2087 print(f"โ Replaced with {len(imported)} templates") 2088 else: 2089 print("โ Import cancelled") 2090 2091 except Exception as e: 2092 print(f"โ Error importing: {e}") 2093 2094 elif choice == "8": 2095 # Back to main menu 2096 break 2097 2098 2099 # ================== TESTING ================== 2100 2101 def test_known_profiles(): 2102 """Test with known profiles.""" 2103 print("๐งช Testing with known profiles...") 2104 2105 test_cases = [ 2106 ("torvalds", "github", True, "Linus Torvalds"), 2107 ("jack", "twitter", True, "Jack Dorsey"), 2108 ("nasdaily", "instagram", True, "Nas Daily"), 2109 ("spez", "reddit", True, "Reddit CEO"), 2110 ("beeple", "artstation", True, "Digital artist"), 2111 ("nonexistent1234567890", "github", False, "Non-existent"), 2112 ] 2113 2114 crawler = EnhancedProfileCrawler(CrawlerConfig()) 2115 crawler.config.VERBOSE = False 2116 2117 passed = 0 2118 failed = 0 2119 2120 for username, platform, should_exist, description in test_cases: 2121 if platform not in PROFILE_TEMPLATES: 2122 print(f" โ ๏ธ Skipping {platform} (not configured)") 2123 continue 2124 2125 platform_config = PROFILE_TEMPLATES[platform] 2126 if isinstance(platform_config, str): 2127 url = platform_config.format(username) 2128 else: 2129 url = platform_config.get("url", "").format(username) 2130 2131 print(f"\n๐ {username} on {platform} ({description}):") 2132 print(f" URL: {url}") 2133 2134 result = crawler.check_profile(url, platform, username) 2135 2136 status = "โ PASS" if result["exists"] == should_exist else "โ FAIL" 2137 if result["exists"] == should_exist: 2138 passed += 1 2139 else: 2140 failed += 1 2141 2142 print(f" {status} - Expected: {should_exist}, Got: {result['exists']}") 2143 print(f" Status: {result['status_code']}, Images: {len(result['image_urls'])}") 2144 2145 if result["error"]: 2146 print(f" Error: {result['error']}") 2147 2148 # Show first image if exists 2149 if result["image_urls"]: 2150 print(f" First image: {result['image_urls'][0][:80]}...") 2151 2152 print(f"\n๐ Test Results: {passed} passed, {failed} failed") 2153 2154 2155 def test_specific_profile(): 2156 """Test a specific profile.""" 2157 username = input("Username: ").strip() 2158 platform = input("Platform: ").strip() 2159 2160 if not username or not platform: 2161 print("โ Need username and platform") 2162 return 2163 2164 if platform not in PROFILE_TEMPLATES: 2165 print(f"โ Unknown platform. Available: {', '.join(list(PROFILE_TEMPLATES.keys())[:10])}...") 2166 return 2167 2168 crawler = EnhancedProfileCrawler() 2169 2170 platform_config = PROFILE_TEMPLATES[platform] 2171 if isinstance(platform_config, str): 2172 url = platform_config.format(username) 2173 else: 2174 url = platform_config.get("url", "").format(username) 2175 2176 print(f"\n๐ Testing {username} on {platform}...") 2177 print(f" URL: {url}") 2178 2179 result = crawler.check_profile(url, platform, username) 2180 2181 print(f"\n๐ Results:") 2182 print(f" Exists: {result['exists']}") 2183 print(f" Status Code: {result['status_code']}") 2184 print(f" Final URL: {result['final_url']}") 2185 print(f" Content Length: {result['content_length']} chars") 2186 print(f" Images Found: {len(result['image_urls'])}") 2187 2188 if result["error"]: 2189 print(f" Error: {result['error']}") 2190 2191 # Show images 2192 for i, img_url in enumerate(result["image_urls"][:3], 1): 2193 print(f"\n Image {i}:") 2194 print(f" URL: {img_url}") 2195 2196 # Try to download and check 2197 img_bytes = get_image_bytes(img_url) 2198 if img_bytes: 2199 print(f" Size: {len(img_bytes)} bytes") 2200 encoding = compute_face_encoding(img_bytes) 2201 if encoding is not None: 2202 print(f" โ Face detected") 2203 else: 2204 print(f" โ No face detected") 2205 else: 2206 print(f" โ Could not download") 2207 2208 2209 # ================== MAIN INTERFACE ================== 2210 2211 def main(): 2212 """Main interface.""" 2213 print("๐ Enhanced Cross-Platform Face Search") 2214 print("=" * 60) 2215 2216 # Initialize 2217 crawler = EnhancedProfileCrawler() 2218 face_system = FaceIndexSystem() 2219 2220 # Load existing index 2221 if os.path.exists("face_index.json"): 2222 face_system.load_index() 2223 2224 while True: 2225 print("\n" + "=" * 60) 2226 print("1. Search for usernames") 2227 print("2. Upload image and search selected platforms (NEW)") 2228 print("3. Test specific profile") 2229 print("4. Run known profile tests") 2230 print("5. Compare target face (from local image)") 2231 print("6. Compare face from URL/URI") 2232 print("7. Extract faces from webpage") 2233 print("8. Batch compare from file") 2234 print("9. Create batch template") 2235 print("10. Show statistics") 2236 print("11. Manage profile templates") 2237 print("12. Save face index") 2238 print("13. Load face index") 2239 print("14. Clear face index") 2240 print("15. Exit") 2241 2242 choice = input("\nSelect option (1-15): ").strip() 2243 2244 if choice == "1": 2245 # Search usernames (existing code) 2246 usernames_input = input("Enter usernames (comma-separated): ").strip() 2247 if not usernames_input: 2248 continue 2249 2250 usernames = [u.strip() for u in usernames_input.split(',')] 2251 2252 # Platform selection 2253 templates = load_profile_templates() 2254 enabled_platforms = get_enabled_platforms(templates) 2255 categories = get_platforms_by_category(templates) 2256 2257 print(f"\n๐ Available platforms ({len(enabled_platforms)} enabled):") 2258 for category, platforms in categories.items(): 2259 print(f"\n {category}:") 2260 for platform in sorted(platforms): 2261 config = templates[platform] 2262 url_template = config.get("url", "No URL") 2263 print(f" {platform:20} - {url_template}") 2264 2265 platform_input = input("\nEnter platforms (comma-separated, or 'all'): ").strip().lower() 2266 2267 if platform_input == 'all': 2268 selected_platforms = enabled_platforms 2269 else: 2270 selected_platforms = [] 2271 for item in platform_input.split(','): 2272 item = item.strip() 2273 if item in enabled_platforms: 2274 selected_platforms.append(item) 2275 2276 if not selected_platforms: 2277 # Default to all enabled platforms from the JSON file 2278 selected_platforms = enabled_platforms 2279 print(f"โ ๏ธ No platforms specified, using all {len(selected_platforms)} enabled platforms") 2280 2281 print(f"\n๐ Searching {len(usernames)} user(s) on {len(selected_platforms)} platform(s)...") 2282 2283 # Crawl 2284 results = crawler.crawl_usernames(usernames, selected_platforms) 2285 2286 # Index faces 2287 print("\n๐ธ Indexing faces...") 2288 new_faces = face_system.index_from_results(results) 2289 2290 # Summary 2291 print(f"\n๐ Summary:") 2292 total_found = 0 2293 total_faces = 0 2294 2295 for username in usernames: 2296 user_results = results.get(username, []) 2297 found = [r for r in user_results if r["exists"]] 2298 user_faces = len([f for f in new_faces if f["username"] == username]) 2299 2300 total_found += len(found) 2301 total_faces += user_faces 2302 2303 print(f" {username}: {len(found)}/{len(user_results)} profiles, {user_faces} faces") 2304 2305 print(f"\n Total: {total_found} profiles found, {total_faces} faces indexed") 2306 2307 # Offer to save 2308 if new_faces: 2309 save = input("\n๐พ Save results to face index? (y/N): ").strip().lower() 2310 if save == 'y': 2311 face_system.save_index() 2312 2313 elif choice == "2": 2314 # NEW: Upload image and search selected platforms 2315 search_platforms_by_face(face_system, crawler) 2316 2317 elif choice == "3": 2318 test_specific_profile() 2319 2320 elif choice == "4": 2321 test_known_profiles() 2322 2323 elif choice == "5": 2324 if not face_system.faces: 2325 print("โ No faces in index") 2326 continue 2327 2328 target_source = input("Target image path or URL: ").strip() 2329 if not target_source: 2330 continue 2331 2332 # Load target image 2333 target_bytes = get_image_bytes(target_source) 2334 if not target_bytes: 2335 print("โ Could not load image") 2336 continue 2337 2338 target_encoding = compute_face_encoding(target_bytes) 2339 if target_encoding is None: 2340 print("โ No face detected in target") 2341 continue 2342 2343 # Get threshold 2344 try: 2345 threshold = float(input("Match threshold (0.1-1.0, default 0.6): ") or "0.6") 2346 top_k = int(input("Number of results (default 10): ") or "10") 2347 except ValueError: 2348 threshold = 0.6 2349 top_k = 10 2350 2351 print(f"\n๐ Searching {len(face_system.faces)} indexed faces...") 2352 matches = face_system.search_faces(target_encoding, threshold, top_k) 2353 2354 if not matches: 2355 print("โ No matches found") 2356 continue 2357 2358 print(f"\n๐ Top {len(matches)} matches:") 2359 for i, match in enumerate(matches, 1): 2360 symbol = "โ " if match["match"] else "โ ๏ธ" 2361 print(f"\n {i}. {symbol} Similarity: {match['similarity']:.3f}") 2362 print(f" User: {match['username']}") 2363 print(f" Platform: {match['platform']}") 2364 if match['similarity'] > 0.7: 2365 print(f" ๐ฏ Strong match!") 2366 2367 elif choice == "6": 2368 # Compare face from URL/URI 2369 uri = input("Enter image URL or local file path: ").strip() 2370 if not uri: 2371 continue 2372 2373 print("\nOptions:") 2374 print("1. Just compare with existing faces") 2375 print("2. Compare and save to database") 2376 2377 sub_choice = input("Select (1-2): ").strip() 2378 2379 if sub_choice == "2": 2380 username = input("Enter username for this face: ").strip() 2381 if username: 2382 compare_face_from_uri(face_system, uri, username, save_to_db=True) 2383 else: 2384 print("โ Username required to save to database") 2385 else: 2386 compare_face_from_uri(face_system, uri) 2387 2388 elif choice == "7": 2389 # Extract faces from webpage 2390 url = input("Enter webpage URL: ").strip() 2391 if not url: 2392 continue 2393 2394 faces = extract_faces_from_webpage(url) 2395 2396 if faces: 2397 print("\nOptions:") 2398 print("1. Compare each face with database") 2399 print("2. Save all faces to database") 2400 2401 sub_choice = input("Select (1-2): ").strip() 2402 2403 if sub_choice == "1": 2404 for i, face in enumerate(faces, 1): 2405 print(f"\n[{i}] Comparing face from image...") 2406 temp_uri = face['image_url'] 2407 matches = compare_face_from_uri(face_system, temp_uri) 2408 2409 if matches and len(matches) > 0: 2410 best = matches[0] 2411 if best['similarity'] > 0.7: 2412 save = input(f" Save as match to {best['username']}? (y/N): ").strip().lower() 2413 if save == 'y': 2414 username = input(f" Username (default: {best['username']}): ").strip() or best['username'] 2415 save_face_to_db(face_system, face['encoding'], face['image_url'], username, face['page_url'], "webpage_extraction") 2416 print(f" โ Saved to database") 2417 2418 elif sub_choice == "2": 2419 username = input("Base username (faces will be saved as username_1, username_2, etc): ").strip() 2420 if username: 2421 for i, face in enumerate(faces, 1): 2422 user_id = f"{username}_{i}" 2423 save_face_to_db(face_system, face['encoding'], face['image_url'], user_id, face['page_url'], "webpage_extraction") 2424 print(f"โ Saved {len(faces)} faces to database") 2425 else: 2426 print("โ Username required") 2427 2428 elif choice == "8": 2429 # Batch compare from file 2430 filename = input("Enter filename with URIs and usernames (CSV format): ").strip() 2431 if filename and os.path.exists(filename): 2432 batch_compare_from_file(face_system, filename) 2433 else: 2434 print("โ File not found") 2435 2436 elif choice == "9": 2437 # Create batch template 2438 create_uri_batch_file() 2439 2440 elif choice == "10": 2441 print(f"\n๐ Statistics:") 2442 print(f" Total faces: {len(face_system.faces)}") 2443 2444 if face_system.faces: 2445 # Count by platform 2446 platforms = {} 2447 for face in face_system.faces: 2448 platform = face.get("platform", "unknown") 2449 platforms[platform] = platforms.get(platform, 0) + 1 2450 2451 print(f" By platform:") 2452 for platform, count in sorted(platforms.items(), key=lambda x: x[1], reverse=True): 2453 print(f" {platform}: {count}") 2454 2455 # Count by source 2456 sources = {} 2457 for face in face_system.faces: 2458 source = face.get("source", "unknown") 2459 sources[source] = sources.get(source, 0) + 1 2460 2461 print(f" By source:") 2462 for source, count in sorted(sources.items(), key=lambda x: x[1], reverse=True): 2463 print(f" {source}: {count}") 2464 2465 elif choice == "11": 2466 manage_templates_menu() 2467 2468 elif choice == "12": 2469 filename = input("Filename (default: face_index.json): ").strip() or "face_index.json" 2470 face_system.save_index(filename) 2471 2472 elif choice == "13": 2473 filename = input("Filename (default: face_index.json): ").strip() or "face_index.json" 2474 face_system.load_index(filename) 2475 2476 elif choice == "14": 2477 confirm = input("Clear all indexed faces? (y/N): ").strip().lower() 2478 if confirm == 'y': 2479 face_system.faces = [] 2480 print("โ Face index cleared") 2481 2482 elif choice == "15": 2483 print("๐ Goodbye!") 2484 break 2485 2486 2487 if __name__ == "__main__": 2488 main()