# clstrmaps.py
  1  from selenium import webdriver
  2  from selenium.webdriver.chrome.service import Service
  3  from selenium.webdriver.chrome.options import Options
  4  from selenium.webdriver.common.by import By
  5  from selenium.webdriver.support.ui import WebDriverWait
  6  from selenium.webdriver.support import expected_conditions as EC
  7  from bs4 import BeautifulSoup
  8  import time
  9  import random
 10  import os
 11  import csv
 12  import re
 13  import shutil
 14  
 15  # Path to ChromeDriver
 16  chrome_driver_path = '/usr/bin/chromedriver'
 17  
 18  # State abbreviations to full names mapping
 19  STATE_ABBREVIATIONS = {
 20      'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
 21      'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
 22      'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
 23      'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
 24      'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
 25      'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
 26      'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
 27      'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
 28      'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
 29      'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
 30      'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
 31      'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
 32      'WI': 'Wisconsin', 'WY': 'Wyoming'
 33  }
 34  
 35  # Create results directory if it doesn't exist
 36  if not os.path.exists('results'):
 37      os.makedirs('results')
 38  
 39  def expand_state_abbreviation(state):
 40      """Convert state abbreviation to full name if needed"""
 41      if not state:
 42          return None
 43      state = state.strip().upper()
 44      return STATE_ABBREVIATIONS.get(state, state)
 45  
 46  def human_delay(min=0.5, max=1.5):
 47      time.sleep(random.uniform(min, max))
 48  
 49  def log_message(message):
 50      with open('results/clustrmaps_log.txt', 'a') as log_file:
 51          log_file.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {message}\n")
 52      print(message)
 53  
 54  def save_details_html(content, filename):
 55      """Save or append to details HTML file with separator"""
 56      try:
 57          if os.path.exists(filename):
 58              with open(filename, 'a', encoding='utf-8') as f:
 59                  f.write(f"\n<!-- ====== NEW DETAILS PAGE ====== -->\n")
 60                  f.write(content)
 61              log_message(f"Appended to details HTML file: {filename}")
 62          else:
 63              with open(filename, 'w', encoding='utf-8') as f:
 64                  f.write(content)
 65              log_message(f"Created new details HTML file: {filename}")
 66      except Exception as e:
 67          log_message(f"Error saving details HTML: {str(e)}")
 68  
 69  def read_input_from_file(file_path):
 70      data = []
 71      try:
 72          with open(file_path, 'r') as file:
 73              reader = csv.DictReader(file)
 74              for row in reader:
 75                  if 'first_name' in row and 'last_name' in row:
 76                      record = {
 77                          'first_name': row['first_name'].strip(),
 78                          'last_name': row['last_name'].strip()
 79                      }
 80                      if 'state' in row and row['state'].strip():
 81                          record['state'] = expand_state_abbreviation(row['state'].strip())
 82                      data.append(record)
 83          return data
 84      except Exception as e:
 85          log_message(f"Error reading input file: {str(e)}")
 86          return []
 87  
 88  def extract_main_page_info(html_content):
 89      soup = BeautifulSoup(html_content, 'html.parser')
 90      results = []
 91      
 92      person_entries = soup.find_all('div', itemprop='Person')
 93      
 94      for entry in person_entries:
 95          person_info = {}
 96          
 97          # Basic info
 98          name_tag = entry.find('span', itemprop='name')
 99          person_info['name'] = name_tag.text if name_tag else 'N/A'
100          
101          age_tag = entry.find('span', class_='age')
102          person_info['age'] = age_tag.text.replace(',', '').strip() if age_tag else 'N/A'
103          
104          # Address
105          street = entry.find('span', itemprop='streetAddress')
106          locality = entry.find('span', itemprop='addressLocality')
107          state = entry.find('span', itemprop='addressRegion')
108          person_info['address'] = f"{street.text}, {locality.text}" if street and locality else 'N/A'
109          person_info['state'] = expand_state_abbreviation(state.text) if state else 'N/A'
110          
111          # Associated persons
112          associated = [el.find('span', itemprop='name').text for el in entry.find_all('span', itemprop='relatedTo') if el.find('span', itemprop='name')]
113          person_info['associated_persons'] = ' | '.join(associated) if associated else 'N/A'
114          
115          # Phone
116          phone = entry.find('span', itemprop='telephone')
117          person_info['phone'] = phone.text if phone else 'N/A'
118          
119          # Details page URL
120          details_link = entry.find('a', class_='btn-success')
121          if details_link and 'href' in details_link.attrs:
122              person_info['details_url'] = f"https://clustrmaps.com{details_link['href']}"
123          
124          results.append(person_info)
125      
126      return results
127  
def _link_texts(soup, scheme):
    """Return the unique non-empty link texts of anchors whose href starts with ``scheme``."""
    texts = set()
    for link in soup.find_all('a', href=lambda value: value and value.startswith(scheme)):
        text = link.get_text(strip=True)
        if text:
            texts.add(text)
    return texts


def extract_quick_facts(html_content, source_url):
    """Pull the name, e-mail addresses and phone numbers from a details page.

    The name is read from ``<h1 class="person-name">``. E-mails are the link
    texts of ``mailto:`` anchors that contain ``@``; phone numbers are the
    link texts of ``tel:`` anchors. Multiple values are de-duplicated, sorted
    and joined with ``' | '``. Fields that are not found stay ``'N/A'``.

    Args:
        html_content: HTML source of the details page.
        source_url: URL the page was fetched from; stored as ``source_url``.

    Returns:
        A dict with the keys ``name``, ``emails``, ``phone_numbers`` and
        ``source_url``. If parsing fails part-way, the error is logged and
        whatever was collected so far is returned.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    quick_facts = {
        'name': 'N/A',
        'emails': 'N/A',
        'phone_numbers': 'N/A',
        'source_url': source_url
    }

    try:
        name_tag = soup.find('h1', class_='person-name')
        if name_tag:
            quick_facts['name'] = name_tag.get_text(strip=True)

        emails = {text for text in _link_texts(soup, 'mailto:') if '@' in text}
        if emails:
            quick_facts['emails'] = ' | '.join(sorted(emails))

        phones = _link_texts(soup, 'tel:')
        if phones:
            quick_facts['phone_numbers'] = ' | '.join(sorted(phones))

    except Exception as e:
        log_message(f"Error extracting quick facts: {str(e)}")

    return quick_facts
170  
def get_person_url(first_name, last_name, state=None):
    """Build the clustrmaps.com person-search URL for a name and optional state.

    Each hyphen-separated part of the first name is capitalised; spaces in the
    last name become underscores and the result is capitalised as a whole.
    When ``state`` is given, it is appended as a path segment with spaces
    replaced by underscores.
    """
    given = '-'.join(map(str.capitalize, first_name.split('-')))
    family = last_name.replace(' ', '_').capitalize()
    url = f"https://clustrmaps.com/persons/{given}-{family}"
    if not state:
        return url
    return f"{url}/{state.replace(' ', '_')}"
179  
def save_to_csv(data, filename, fieldnames):
    """Write ``data`` to ``results/<filename>`` unless that file already exists.

    Args:
        data: A list of row dicts, or a single row dict.
        filename: File name inside the ``results`` directory.
        fieldnames: Column names, in output order.

    An existing file is never overwritten; the call is logged and skipped.
    Errors are logged rather than raised. If writing fails part-way, the
    partial file is removed so that a later run does not mistake it for
    finished output and skip it.
    """
    filename = f"results/{filename}"
    if os.path.exists(filename):
        log_message(f"File {filename} already exists - skipping")
        return

    rows = data if isinstance(data, list) else [data]
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)
    except Exception as e:
        log_message(f"Error saving {filename}: {str(e)}")
        # Best-effort cleanup; the file may not have been created at all.
        try:
            os.remove(filename)
        except OSError:
            pass
    else:
        log_message(f"Saved {filename}")
193  
def create_driver():
    """Create a headless Chrome WebDriver using the configured ChromeDriver path."""
    browser_flags = (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    )

    options = Options()
    for flag in browser_flags:
        options.add_argument(flag)

    return webdriver.Chrome(service=Service(chrome_driver_path), options=options)
205  
def _process_person(driver, person, position, total):
    """Scrape and save the results for one input record.

    Args:
        driver: WebDriver used to load the pages.
        person: Dict with ``first_name``, ``last_name`` and optionally ``state``.
        position: 1-based index of this record, used in the progress message.
        total: Total number of input records.

    Returns:
        True when the person's pages were scraped; False when the record was
        skipped because both output CSVs already exist or no profile page was
        found.
    """
    first_name = person['first_name']
    last_name = person['last_name']
    state = person.get('state')
    base_filename = f"{first_name}_{last_name}"
    if state:
        base_filename += f"_{state.replace(' ', '_')}"

    log_message(f"\nProcessing {position}/{total}: {first_name} {last_name}" +
                (f" in {state}" if state else ""))

    main_csv = f"results_{base_filename}_main.csv"
    quickfacts_csv = f"results_{base_filename}_quickfacts.csv"

    # Skip if all output files exist
    if (os.path.exists(f"results/{main_csv}") and
            os.path.exists(f"results/{quickfacts_csv}")):
        log_message("All output files exist - skipping")
        return False

    # Get main profile page
    url = get_person_url(first_name, last_name, state)
    driver.get(url)
    human_delay(2, 4)

    # Check for 404
    if "Page Not Found" in driver.title:
        log_message(f"No main profile found at {url}")
        return False

    # Extract main page info
    main_info = extract_main_page_info(driver.page_source)
    save_to_csv(
        main_info,
        main_csv,
        ['name', 'age', 'address', 'state', 'associated_persons', 'phone', 'details_url']
    )

    # Process details pages; all of them are collected in one HTML file.
    details_filename = f"results/{base_filename}_details.html"
    quick_facts_data = []
    for person_info in main_info:
        details_url = person_info.get('details_url', 'N/A')
        if details_url == 'N/A':
            continue

        log_message(f"Fetching details page: {details_url}")
        driver.get(details_url)
        human_delay(2, 4)

        page_html = driver.page_source
        save_details_html(page_html, details_filename)

        quick_facts = extract_quick_facts(page_html, details_url)
        # Use the name from the results page for this row.
        quick_facts['name'] = person_info['name']
        quick_facts_data.append(quick_facts)

    if quick_facts_data:
        save_to_csv(
            quick_facts_data,
            quickfacts_csv,
            ['name', 'emails', 'phone_numbers', 'source_url']
        )

    return True


def main():
    """Prompt for an input CSV and scrape every person listed in it.

    After each scraped record the script waits a random 10-20 seconds and
    restarts the browser when the record's index is a non-zero multiple of
    two. Records that are skipped, and the final record, trigger neither the
    wait nor the restart. Any exception is logged as fatal, a screenshot is
    attempted, and the browser is always closed on exit.
    """
    driver = None

    try:
        file_path = input("Enter CSV file path (columns: first_name, last_name, [state]): ").strip()
        input_data = read_input_from_file(file_path)

        if not input_data:
            log_message("No valid input data. Exiting.")
            return

        driver = create_driver()
        total = len(input_data)

        for i, person in enumerate(input_data):
            scraped = _process_person(driver, person, i + 1, total)

            # Nothing follows the last record, so waiting or restarting the
            # browser there would only delay exit.
            if not scraped or i == total - 1:
                continue

            # Random delay between searches
            delay = random.uniform(10, 20)
            log_message(f"Waiting {delay:.1f} seconds...")
            time.sleep(delay)

            # Restart browser periodically
            if i > 0 and i % 2 == 0:
                log_message("Restarting browser...")
                driver.quit()
                # Drop the stale handle so a failed restart does not lead to
                # a screenshot or quit on an already-closed browser.
                driver = None
                time.sleep(random.uniform(5, 10))
                driver = create_driver()

    except Exception as e:
        log_message(f"Fatal error: {str(e)}")
        if driver:
            # The browser may be the thing that failed; a screenshot error
            # must not escape the handler.
            try:
                driver.save_screenshot('error.png')
            except Exception as screenshot_error:
                log_message(f"Could not save error screenshot: {screenshot_error}")
    finally:
        if driver:
            driver.quit()


if __name__ == "__main__":
    main()