account-researcher.py
#!/usr/bin/env python3
"""
Account Research Engine
Gathers intel from multiple sources per prospect, caches for 7 days.

Usage:
    python3 account-researcher.py prospects.json
    cat prospects.json | python3 account-researcher.py -
    python3 account-researcher.py --domain example.com --company "Example Corp"
    python3 account-researcher.py prospects.json --dry-run
"""

import argparse
import json
import os
import sys
import time
from datetime import datetime, timedelta
from html.parser import HTMLParser
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

SCRIPT_DIR = Path(__file__).resolve().parent
DATA_DIR = SCRIPT_DIR.parent / "data"
CACHE_DIR = DATA_DIR / "account-research"
CACHE_DAYS = 7  # cache freshness window, in days


def _ensure_cache_dir():
    """Create the cache directory on demand.

    Done lazily (rather than at import time) so that importing this module
    never fails on a read-only or missing data directory.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)


# --- HTML helpers ---
class MetaExtractor(HTMLParser):
    """Extract <title>, the meta description, and a short body-text snippet.

    Body capture stops after roughly 500 characters of stripped text and
    when the </body> tag closes.
    """

    def __init__(self):
        super().__init__()
        self.title = ""
        self.description = ""
        self.body_text = []       # stripped text chunks found inside <body>
        self._in_title = False
        self._in_body = False
        self._body_chars = 0      # running total, caps body capture at ~500 chars

    def handle_starttag(self, tag, attrs):
        attrs_d = dict(attrs)
        if tag == "title":
            self._in_title = True
        elif tag == "meta" and attrs_d.get("name", "").lower() == "description":
            self.description = attrs_d.get("content", "")
        elif tag == "body":
            self._in_body = True

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False
        elif tag == "body":
            # Stop capturing once the body closes so trailing markup
            # (scripts, comments after </body>) is not collected.
            self._in_body = False

    def handle_data(self, data):
        if self._in_title:
            self.title += data
        if self._in_body and self._body_chars < 500:
            clean = data.strip()
            if clean:
                self.body_text.append(clean)
                self._body_chars += len(clean)


def fetch_url(url, timeout=10):
    """Fetch *url* and return up to 200 KB of decoded text, or None on failure."""
    try:
        req = Request(url, headers={"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0)"})
        with urlopen(req, timeout=timeout) as resp:
            # errors="replace": a truncated multi-byte sequence at the 200 KB
            # boundary must not raise.
            return resp.read(200_000).decode("utf-8", errors="replace")
    except (URLError, HTTPError, OSError, ValueError):
        return None


def is_cached(domain):
    """Return True if a cache file exists for *domain* and is fresher than CACHE_DAYS."""
    path = CACHE_DIR / f"{domain}.json"
    if not path.exists():
        return False
    mtime = datetime.fromtimestamp(path.stat().st_mtime)
    return datetime.now() - mtime < timedelta(days=CACHE_DAYS)


def load_cache(domain):
    """Load the cached result for *domain*; None if missing, unreadable, or corrupt."""
    path = CACHE_DIR / f"{domain}.json"
    if not path.exists():
        return None
    try:
        return json.loads(path.read_text())
    except (OSError, json.JSONDecodeError):
        # Treat a vanished or corrupt cache file as a cache miss.
        return None


# --- Source collectors ---

def collect_website(domain):
    """Scrape the homepage for title, description, body snippet, and gap signals.

    Tries https first, then http. Returns a dict with an "error" key when
    neither scheme is reachable.
    """
    info = {"source": "website", "title": "", "description": "", "body_snippet": "", "gaps": []}
    html = fetch_url(f"https://{domain}")
    if not html:
        html = fetch_url(f"http://{domain}")
    if not html:
        info["error"] = "Could not fetch homepage"
        return info

    ext = MetaExtractor()
    try:
        ext.feed(html)
    except Exception:
        # Best effort: malformed HTML should not abort research; keep
        # whatever was parsed before the failure.
        pass
    info["title"] = ext.title.strip()
    info["description"] = ext.description.strip()
    info["body_snippet"] = " ".join(ext.body_text)[:500]

    # Detect marketing gaps.
    html_lower = html.lower()
    # "blog" absent implies "/blog" absent too, so a single test suffices.
    if "blog" not in html_lower:
        info["gaps"].append("no blog detected")
    if "ga4" not in html_lower and "gtag" not in html_lower and "google-analytics" not in html_lower:
        info["gaps"].append("no Google Analytics detected")
    if len(ext.body_text) < 3:
        info["gaps"].append("thin homepage content")

    time.sleep(2)  # Rate limit
    return info


def collect_builtwith(domain):
    """Query the BuiltWith free API and bucket detected technologies.

    Buckets: CRM platforms, marketing tools, and enterprise-infrastructure
    signals. Returns a dict with an "error" key when the API is unreachable.
    """
    info = {"source": "builtwith", "crm": [], "marketing_tools": [], "enterprise_signals": []}

    api_key = os.environ.get("BUILTWITH_API_KEY", "free")
    url = f"https://api.builtwith.com/free1/api.json?KEY={api_key}&LOOKUP={domain}"
    try:
        req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urlopen(req, timeout=15) as resp:
            raw = resp.read().decode("utf-8", errors="replace")
        data = json.loads(raw)
    except Exception as e:
        info["error"] = f"BuiltWith unavailable: {str(e)[:80]}"
        return info

    crm_names = {"salesforce", "hubspot", "pipedrive", "zoho", "marketo", "pardot", "dynamics"}
    mktg_names = {"google analytics", "ga4", "google tag manager", "gtm", "semrush", "ahrefs",
                  "hotjar", "mixpanel", "segment", "amplitude", "heap", "optimizely", "mailchimp"}
    enterprise_names = {"cloudflare", "aws", "azure", "gcp", "fastly", "akamai", "datadog", "new relic"}

    try:
        # Each .get() pair below handles both lowercase and CamelCase
        # response shapes — presumably free-tier vs. paid-tier payloads
        # (TODO confirm against the BuiltWith API docs).
        for group in data.get("groups", data.get("Results", [{}])):
            categories = group.get("categories", group.get("Result", {}).get("Paths", []))
            if isinstance(categories, list):
                for cat in categories:
                    techs = cat.get("technologies", cat.get("Technologies", []))
                    if isinstance(techs, list):
                        for tech in techs:
                            name = tech.get("name", tech.get("Name", "")).strip()
                            name_lower = name.lower()
                            if any(c in name_lower for c in crm_names):
                                info["crm"].append(name)
                            elif any(m in name_lower for m in mktg_names):
                                info["marketing_tools"].append(name)
                            elif any(e in name_lower for e in enterprise_names):
                                info["enterprise_signals"].append(name)
    except Exception:
        # Unexpected payload shape: keep whatever was classified so far.
        pass

    time.sleep(1)
    return info


def collect_hiring(company_name):
    """Build the hiring-signal search stub for *company_name*.

    No live search is performed here; the returned dict carries the query
    for a downstream step to execute.
    """
    query = f"{company_name} hiring marketing OR sales OR engineering"
    return {
        "source": "hiring",
        "signals": [],
        "note": f"Search needed: '{query}'",
        "query": query,
    }


def collect_news(company_name):
    """Build the news/funding search stub for *company_name*.

    No live search is performed here; the returned dict carries the query
    for a downstream step to execute.
    """
    query = f"{company_name} funding OR acquisition OR partnership"
    return {
        "source": "news",
        "signals": [],
        "note": f"Search needed: '{query}'",
        "query": query,
    }


def build_brief(prospect, website, builtwith, hiring, news):
    """Combine raw collector output into a 3-5 sentence research brief string."""
    parts = []

    company = prospect.get("company_name", prospect.get("domain", "Unknown"))

    # Opening sentence: prefer the meta description, fall back to the title.
    if website.get("description"):
        parts.append(f"{company}: {website['description'][:150]}")
    elif website.get("title"):
        parts.append(f"{company} ({website['title'][:100]})")
    else:
        parts.append(f"{company} — homepage could not be analyzed.")

    tech_items = builtwith.get("crm", []) + builtwith.get("marketing_tools", [])
    if tech_items:
        parts.append(f"Tech stack includes: {', '.join(tech_items[:5])}.")
    elif not builtwith.get("error"):
        # Only claim "no tools" when BuiltWith actually answered.
        parts.append("No major CRM/marketing tools detected — potential greenfield opportunity.")

    gaps = website.get("gaps", [])
    if gaps:
        parts.append(f"Marketing gaps: {', '.join(gaps)}.")

    if builtwith.get("enterprise_signals"):
        parts.append(f"Enterprise infra: {', '.join(builtwith['enterprise_signals'][:3])}.")

    if hiring.get("signals"):
        parts.append(f"Hiring: {'; '.join(hiring['signals'][:2])}.")
    if news.get("signals"):
        parts.append(f"Recent: {'; '.join(news['signals'][:2])}.")

    return " ".join(parts[:5])


def research_prospect(prospect, dry_run=False):
    """Run full research for one prospect and return its result dict.

    Uses the on-disk cache when fresh; otherwise runs every collector,
    builds the brief, and writes the result back to the cache.
    """
    domain = prospect.get("domain", "").strip().lower()
    company = prospect.get("company_name", domain)

    if not domain:
        return {"error": "No domain provided", "prospect": prospect}

    if not dry_run and is_cached(domain):
        cached = load_cache(domain)
        # load_cache may still miss (file deleted/corrupted since the
        # is_cached check) — fall through to a fresh research run then.
        if cached is not None:
            print(f" ♻️ {domain} — cached (< {CACHE_DAYS}d old)")
            return cached

    if dry_run:
        print(f" 🔍 [DRY RUN] Would research: {domain} ({company})")
        return {"domain": domain, "company_name": company, "dry_run": True}

    print(f" 🔍 Researching {domain} ({company})...")

    website = collect_website(domain)
    builtwith = collect_builtwith(domain)
    hiring = collect_hiring(company)
    news = collect_news(company)

    # Fields passed straight through from the lead source (e.g. Apollo export).
    apollo = {
        "source": "lead_source",
        "employee_count": prospect.get("employee_count"),
        "industry": prospect.get("industry"),
        "hq_location": prospect.get("hq_location"),
        "growth_trend": prospect.get("growth_trend"),
    }

    brief = build_brief(prospect, website, builtwith, hiring, news)

    result = {
        "domain": domain,
        "company_name": company,
        "contact_name": prospect.get("contact_name"),
        "contact_title": prospect.get("contact_title"),
        "researched_at": datetime.now().isoformat(),
        "brief": brief,
        "sources": {
            "lead_source": apollo,
            "website": website,
            "builtwith": builtwith,
            "hiring": hiring,
            "news": news,
        }
    }

    cache_path = CACHE_DIR / f"{domain}.json"
    try:
        _ensure_cache_dir()
        cache_path.write_text(json.dumps(result, indent=2))
        print(f" ✅ {domain} — saved to {cache_path.name}")
    except OSError as e:
        # A cache-write failure should not abort the batch; the result is
        # still returned to the caller.
        print(f" ⚠️ {domain} — could not write cache: {e}")

    return result


def main():
    """CLI entry point: parse arguments, load prospects, research each one.

    Returns the list of per-prospect result dicts.
    """
    parser = argparse.ArgumentParser(description="Account Research Engine")
    parser.add_argument("input", nargs="?", default="-", help="JSON file path or '-' for stdin")
    parser.add_argument("--domain", help="Single domain to research")
    parser.add_argument("--company", help="Company name (with --domain)")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be researched")
    args = parser.parse_args()

    if args.domain:
        prospects = [{"domain": args.domain, "company_name": args.company or args.domain}]
    elif args.input == "-":
        prospects = json.load(sys.stdin)
    else:
        with open(args.input) as f:
            prospects = json.load(f)

    # Accept a single prospect object as well as a list.
    if not isinstance(prospects, list):
        prospects = [prospects]

    print(f"📊 Account Research Engine — {len(prospects)} prospect(s)")
    if args.dry_run:
        print(" [DRY RUN MODE]")
    print()

    results = [research_prospect(p, dry_run=args.dry_run) for p in prospects]

    print(f"\n{'🏁' if not args.dry_run else '🔍'} Done — {len(results)} prospect(s) processed.")
    return results


if __name__ == "__main__":
    results = main()
    for r in results:
        if r.get("brief"):
            print(f"\n📋 {r['domain']}: {r['brief']}")