/ clstrmaps.py
clstrmaps.py
1 from selenium import webdriver 2 from selenium.webdriver.chrome.service import Service 3 from selenium.webdriver.chrome.options import Options 4 from selenium.webdriver.common.by import By 5 from selenium.webdriver.support.ui import WebDriverWait 6 from selenium.webdriver.support import expected_conditions as EC 7 from bs4 import BeautifulSoup 8 import time 9 import random 10 import os 11 import csv 12 import re 13 import shutil 14 15 # Path to ChromeDriver 16 chrome_driver_path = '/usr/bin/chromedriver' 17 18 # State abbreviations to full names mapping 19 STATE_ABBREVIATIONS = { 20 'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 21 'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 22 'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho', 23 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas', 24 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland', 25 'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 26 'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada', 27 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York', 28 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma', 29 'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina', 30 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 31 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia', 32 'WI': 'Wisconsin', 'WY': 'Wyoming' 33 } 34 35 # Create results directory if it doesn't exist 36 if not os.path.exists('results'): 37 os.makedirs('results') 38 39 def expand_state_abbreviation(state): 40 """Convert state abbreviation to full name if needed""" 41 if not state: 42 return None 43 state = state.strip().upper() 44 return STATE_ABBREVIATIONS.get(state, state) 45 46 def human_delay(min=0.5, max=1.5): 47 time.sleep(random.uniform(min, max)) 48 49 def log_message(message): 50 with open('results/clustrmaps_log.txt', 'a') as log_file: 51 log_file.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {message}\n") 52 print(message) 53 54 def save_details_html(content, filename): 55 """Save or append to details HTML file with separator""" 56 try: 57 if os.path.exists(filename): 58 with open(filename, 'a', encoding='utf-8') as f: 59 f.write(f"\n<!-- ====== NEW DETAILS PAGE ====== -->\n") 60 f.write(content) 61 log_message(f"Appended to details HTML file: {filename}") 62 else: 63 with open(filename, 'w', encoding='utf-8') as f: 64 f.write(content) 65 log_message(f"Created new details HTML file: {filename}") 66 except Exception as e: 67 log_message(f"Error saving details HTML: {str(e)}") 68 69 def read_input_from_file(file_path): 70 data = [] 71 try: 72 with open(file_path, 'r') as file: 73 reader = csv.DictReader(file) 74 for row in reader: 75 if 'first_name' in row and 'last_name' in row: 76 record = { 77 'first_name': row['first_name'].strip(), 78 'last_name': row['last_name'].strip() 79 } 80 if 'state' in row and row['state'].strip(): 81 record['state'] = expand_state_abbreviation(row['state'].strip()) 82 data.append(record) 83 return data 84 except Exception as e: 85 log_message(f"Error reading input file: {str(e)}") 86 return [] 87 88 def extract_main_page_info(html_content): 89 soup = BeautifulSoup(html_content, 'html.parser') 90 results = [] 91 92 person_entries = soup.find_all('div', itemprop='Person') 93 94 for entry in person_entries: 95 person_info = {} 96 97 # Basic info 98 name_tag = entry.find('span', itemprop='name') 99 person_info['name'] = name_tag.text if name_tag else 'N/A' 100 101 age_tag = entry.find('span', class_='age') 102 person_info['age'] = age_tag.text.replace(',', '').strip() if age_tag else 'N/A' 103 104 # Address 105 street = entry.find('span', itemprop='streetAddress') 106 locality = entry.find('span', itemprop='addressLocality') 107 state = entry.find('span', itemprop='addressRegion') 108 person_info['address'] = f"{street.text}, {locality.text}" if street and locality else 'N/A' 109 person_info['state'] = expand_state_abbreviation(state.text) if state else 'N/A' 110 111 # Associated persons 112 associated = [el.find('span', itemprop='name').text for el in entry.find_all('span', itemprop='relatedTo') if el.find('span', itemprop='name')] 113 person_info['associated_persons'] = ' | '.join(associated) if associated else 'N/A' 114 115 # Phone 116 phone = entry.find('span', itemprop='telephone') 117 person_info['phone'] = phone.text if phone else 'N/A' 118 119 # Details page URL 120 details_link = entry.find('a', class_='btn-success') 121 if details_link and 'href' in details_link.attrs: 122 person_info['details_url'] = f"https://clustrmaps.com{details_link['href']}" 123 124 results.append(person_info) 125 126 return results 127 128 def extract_quick_facts(html_content, source_url): 129 soup = BeautifulSoup(html_content, 'html.parser') 130 quick_facts = { 131 'name': 'N/A', 132 'emails': 'N/A', 133 'phone_numbers': 'N/A', 134 'source_url': source_url 135 } 136 137 try: 138 # Extract name from the title or heading 139 name_tag = soup.find('h1', class_='person-name') 140 if name_tag: 141 quick_facts['name'] = name_tag.get_text(strip=True) 142 143 # Find the Quick Facts section 144 quick_facts_div = soup.find('div', id='intro') 145 if quick_facts_div: 146 full_text = quick_facts_div.get_text(' ', strip=True) 147 148 # Extract Emails 149 emails = set() 150 for mail_link in soup.find_all('a', href=lambda x: x and x.startswith('mailto:')): 151 email = mail_link.get_text(strip=True) 152 if '@' in email: 153 emails.add(email) 154 if emails: 155 quick_facts['emails'] = ' | '.join(sorted(emails)) 156 157 # Extract Phone Numbers 158 phones = set() 159 for tel_link in soup.find_all('a', href=lambda x: x and x.startswith('tel:')): 160 phone = tel_link.get_text(strip=True) 161 if phone: 162 phones.add(phone) 163 if phones: 164 quick_facts['phone_numbers'] = ' | '.join(sorted(phones)) 165 166 except Exception as e: 167 log_message(f"Error extracting quick facts: {str(e)}") 168 169 return quick_facts 170 171 def get_person_url(first_name, last_name, state=None): 172 first = '-'.join([part.capitalize() for part in first_name.split('-')]) 173 last = last_name.replace(' ', '_').capitalize() 174 if state: 175 # Convert state to proper format for URL (replace spaces with underscores) 176 state_url = state.replace(' ', '_') 177 return f"https://clustrmaps.com/persons/{first}-{last}/{state_url}" 178 return f"https://clustrmaps.com/persons/{first}-{last}" 179 180 def save_to_csv(data, filename, fieldnames): 181 try: 182 filename = f"results/{filename}" 183 if not os.path.exists(filename): 184 with open(filename, 'w', newline='', encoding='utf-8') as csvfile: 185 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 186 writer.writeheader() 187 writer.writerows(data) if isinstance(data, list) else writer.writerow(data) 188 log_message(f"Saved {filename}") 189 else: 190 log_message(f"File {filename} already exists - skipping") 191 except Exception as e: 192 log_message(f"Error saving {filename}: {str(e)}") 193 194 def create_driver(): 195 chrome_options = Options() 196 chrome_options.add_argument("--headless") 197 chrome_options.add_argument("--disable-gpu") 198 chrome_options.add_argument("--no-sandbox") 199 chrome_options.add_argument("--disable-dev-shm-usage") 200 chrome_options.add_argument("--window-size=1920,1080") 201 chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") 202 203 service = Service(chrome_driver_path) 204 return webdriver.Chrome(service=service, options=chrome_options) 205 206 def main(): 207 driver = None 208 209 try: 210 file_path = input("Enter CSV file path (columns: first_name, last_name, [state]): ").strip() 211 input_data = read_input_from_file(file_path) 212 213 if not input_data: 214 log_message("No valid input data. Exiting.") 215 return 216 217 driver = create_driver() 218 219 for i, person in enumerate(input_data): 220 first_name = person['first_name'] 221 last_name = person['last_name'] 222 state = person.get('state') 223 base_filename = f"{first_name}_{last_name}" 224 if state: 225 base_filename += f"_{state.replace(' ', '_')}" 226 227 log_message(f"\nProcessing {i+1}/{len(input_data)}: {first_name} {last_name}" + 228 (f" in {state}" if state else "")) 229 230 # Skip if all output files exist 231 if (os.path.exists(f"results/results_{base_filename}_main.csv") and 232 os.path.exists(f"results/results_{base_filename}_quickfacts.csv")): 233 log_message("All output files exist - skipping") 234 continue 235 236 # Get main profile page 237 url = get_person_url(first_name, last_name, state) 238 driver.get(url) 239 human_delay(2, 4) 240 241 # Check for 404 242 if "Page Not Found" in driver.title: 243 log_message(f"No main profile found at {url}") 244 continue 245 246 # # Save main debug HTML 247 # debug_filename = f'debug_{base_filename}.html' 248 # if not os.path.exists(debug_filename): 249 # with open(debug_filename, 'w', encoding='utf-8') as f: 250 # f.write(driver.page_source) 251 # log_message(f"Saved debug HTML to {debug_filename}") 252 # else: 253 # log_message(f"Debug file {debug_filename} already exists - skipping") 254 255 # Extract main page info 256 main_info = extract_main_page_info(driver.page_source) 257 save_to_csv( 258 main_info, 259 f"results_{base_filename}_main.csv", 260 ['name', 'age', 'address', 'state', 'associated_persons', 'phone', 'details_url'] 261 ) 262 263 # Process details pages 264 quick_facts_data = [] 265 for person_info in main_info: 266 if 'details_url' in person_info and person_info['details_url'] != 'N/A': 267 log_message(f"Fetching details page: {person_info['details_url']}") 268 driver.get(person_info['details_url']) 269 human_delay(2, 4) 270 271 # Save details HTML 272 details_filename = f"results/{base_filename}_details.html" 273 save_details_html(driver.page_source, details_filename) 274 275 # Extract quick facts 276 quick_facts = extract_quick_facts(driver.page_source, person_info['details_url']) 277 quick_facts['name'] = person_info['name'] 278 quick_facts_data.append(quick_facts) 279 280 if quick_facts_data: 281 save_to_csv( 282 quick_facts_data, 283 f"results_{base_filename}_quickfacts.csv", 284 ['name', 'emails', 'phone_numbers', 'source_url'] 285 ) 286 287 # Random delay between searches 288 delay = random.uniform(10, 20) 289 log_message(f"Waiting {delay:.1f} seconds...") 290 time.sleep(delay) 291 292 # Restart browser periodically 293 if i > 0 and i % 2 == 0: 294 log_message("Restarting browser...") 295 driver.quit() 296 time.sleep(random.uniform(5, 10)) 297 driver = create_driver() 298 299 except Exception as e: 300 log_message(f"Fatal error: {str(e)}") 301 if driver: 302 driver.save_screenshot('error.png') 303 finally: 304 if driver: 305 driver.quit() 306 307 if __name__ == "__main__": 308 main()