/ lead-dossier / scripts / account-researcher.py
account-researcher.py
  1  #!/usr/bin/env python3
  2  """
  3  Account Research Engine
  4  Gathers intel from multiple sources per prospect, caches for 7 days.
  5  
  6  Usage:
  7      python3 account-researcher.py prospects.json
  8      cat prospects.json | python3 account-researcher.py -
  9      python3 account-researcher.py --domain example.com --company "Example Corp"
 10      python3 account-researcher.py prospects.json --dry-run
 11  """
 12  
 13  import json, os, sys, time, re, argparse
 14  from pathlib import Path
 15  from datetime import datetime, timedelta
 16  from urllib.request import urlopen, Request
 17  from urllib.error import URLError, HTTPError
 18  from html.parser import HTMLParser
 19  
 20  SCRIPT_DIR = Path(__file__).resolve().parent
 21  DATA_DIR = SCRIPT_DIR.parent / "data"
 22  CACHE_DIR = DATA_DIR / "account-research"
 23  CACHE_DIR.mkdir(parents=True, exist_ok=True)
 24  CACHE_DAYS = 7
 25  
 26  # --- HTML helpers ---
 27  class MetaExtractor(HTMLParser):
 28      def __init__(self):
 29          super().__init__()
 30          self.title = ""
 31          self.description = ""
 32          self.body_text = []
 33          self._in_title = False
 34          self._in_body = False
 35          self._body_chars = 0
 36  
 37      def handle_starttag(self, tag, attrs):
 38          attrs_d = dict(attrs)
 39          if tag == "title":
 40              self._in_title = True
 41          elif tag == "meta" and attrs_d.get("name", "").lower() == "description":
 42              self.description = attrs_d.get("content", "")
 43          elif tag == "body":
 44              self._in_body = True
 45  
 46      def handle_endtag(self, tag):
 47          if tag == "title":
 48              self._in_title = False
 49  
 50      def handle_data(self, data):
 51          if self._in_title:
 52              self.title += data
 53          if self._in_body and self._body_chars < 500:
 54              clean = data.strip()
 55              if clean:
 56                  self.body_text.append(clean)
 57                  self._body_chars += len(clean)
 58  
 59  
 60  def fetch_url(url, timeout=10):
 61      """Fetch URL, return text or None."""
 62      try:
 63          req = Request(url, headers={"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0)"})
 64          with urlopen(req, timeout=timeout) as resp:
 65              return resp.read(200_000).decode("utf-8", errors="replace")
 66      except (URLError, HTTPError, OSError, ValueError):
 67          return None
 68  
 69  
 70  def is_cached(domain):
 71      """Check if cache exists and is fresh (<7 days)."""
 72      path = CACHE_DIR / f"{domain}.json"
 73      if not path.exists():
 74          return False
 75      mtime = datetime.fromtimestamp(path.stat().st_mtime)
 76      return datetime.now() - mtime < timedelta(days=CACHE_DAYS)
 77  
 78  
 79  def load_cache(domain):
 80      path = CACHE_DIR / f"{domain}.json"
 81      return json.loads(path.read_text()) if path.exists() else None
 82  
 83  
 84  # --- Source collectors ---
 85  
 86  def collect_website(domain):
 87      """Scrape homepage for title, description, body snippet."""
 88      info = {"source": "website", "title": "", "description": "", "body_snippet": "", "gaps": []}
 89      html = fetch_url(f"https://{domain}")
 90      if not html:
 91          html = fetch_url(f"http://{domain}")
 92      if not html:
 93          info["error"] = "Could not fetch homepage"
 94          return info
 95  
 96      ext = MetaExtractor()
 97      try:
 98          ext.feed(html)
 99      except Exception:
100          pass
101      info["title"] = ext.title.strip()
102      info["description"] = ext.description.strip()
103      info["body_snippet"] = " ".join(ext.body_text)[:500]
104  
105      # Detect marketing gaps
106      html_lower = html.lower()
107      if "/blog" not in html_lower and "blog" not in html_lower:
108          info["gaps"].append("no blog detected")
109      if "ga4" not in html_lower and "gtag" not in html_lower and "google-analytics" not in html_lower:
110          info["gaps"].append("no Google Analytics detected")
111      if len(ext.body_text) < 3:
112          info["gaps"].append("thin homepage content")
113  
114      time.sleep(2)  # Rate limit
115      return info
116  
117  
def collect_builtwith(domain):
    """Query the BuiltWith free API and bucket the detected technologies.

    Buckets detected tech names into "crm", "marketing_tools", and
    "enterprise_signals" via case-insensitive substring matching. On any
    network or parse failure the dict carries an "error" key instead.
    Sleeps 1 second before returning as a courtesy rate limit.
    """
    info = {"source": "builtwith", "crm": [], "marketing_tools": [], "enterprise_signals": []}

    api_key = os.environ.get("BUILTWITH_API_KEY", "free")
    endpoint = f"https://api.builtwith.com/free1/api.json?KEY={api_key}&LOOKUP={domain}"
    try:
        request = Request(endpoint, headers={"User-Agent": "Mozilla/5.0"})
        with urlopen(request, timeout=15) as resp:
            raw = resp.read().decode("utf-8", errors="replace")
        data = json.loads(raw)
    except Exception as e:
        info["error"] = f"BuiltWith unavailable: {str(e)[:80]}"
        return info

    # (keyword set, target bucket) pairs, checked in priority order.
    buckets = [
        ({"salesforce", "hubspot", "pipedrive", "zoho", "marketo", "pardot", "dynamics"},
         "crm"),
        ({"google analytics", "ga4", "google tag manager", "gtm", "semrush", "ahrefs",
          "hotjar", "mixpanel", "segment", "amplitude", "heap", "optimizely", "mailchimp"},
         "marketing_tools"),
        ({"cloudflare", "aws", "azure", "gcp", "fastly", "akamai", "datadog", "new relic"},
         "enterprise_signals"),
    ]

    try:
        # The free API has shipped both lower- and upper-cased schemas; accept either.
        for group in data.get("groups", data.get("Results", [{}])):
            categories = group.get("categories", group.get("Result", {}).get("Paths", []))
            if not isinstance(categories, list):
                continue
            for cat in categories:
                techs = cat.get("technologies", cat.get("Technologies", []))
                if not isinstance(techs, list):
                    continue
                for tech in techs:
                    name = tech.get("name", tech.get("Name", "")).strip()
                    name_lower = name.lower()
                    for keywords, bucket in buckets:
                        if any(k in name_lower for k in keywords):
                            info[bucket].append(name)
                            break
    except Exception:
        pass  # Best effort: tolerate schema drift in the API response.

    time.sleep(1)
    return info
159  
160  
def collect_hiring(company_name):
    """Return a placeholder hiring-signal record for *company_name*.

    No live search is performed here; the dict carries the suggested query so
    a downstream step (or a human) can run it.
    """
    query = f"{company_name} hiring marketing OR sales OR engineering"
    return {
        "source": "hiring",
        "signals": [],
        "note": f"Search needed: '{query}'",
        "query": query,
    }
167  
168  
def collect_news(company_name):
    """Return a placeholder news/funding-signal record for *company_name*.

    No live search is performed here; the dict carries the suggested query so
    a downstream step (or a human) can run it.
    """
    query = f"{company_name} funding OR acquisition OR partnership"
    return {
        "source": "news",
        "signals": [],
        "note": f"Search needed: '{query}'",
        "query": query,
    }
175  
176  
def build_brief(prospect, website, builtwith, hiring, news):
    """Condense the collected source dicts into a short research brief.

    Emits at most five space-joined sentences: company positioning, tech
    stack (or greenfield note), marketing gaps, enterprise infrastructure,
    and hiring/news signals.
    """
    company = prospect.get("company_name", prospect.get("domain", "Unknown"))

    sentences = []
    description = website.get("description")
    title = website.get("title")
    if description:
        sentences.append(f"{company}: {description[:150]}")
    elif title:
        sentences.append(f"{company} ({title[:100]})")
    else:
        sentences.append(f"{company} — homepage could not be analyzed.")

    stack = builtwith.get("crm", []) + builtwith.get("marketing_tools", [])
    if stack:
        sentences.append(f"Tech stack includes: {', '.join(stack[:5])}.")
    elif not builtwith.get("error"):
        sentences.append("No major CRM/marketing tools detected — potential greenfield opportunity.")

    if website.get("gaps"):
        sentences.append(f"Marketing gaps: {', '.join(website['gaps'])}.")

    infra = builtwith.get("enterprise_signals")
    if infra:
        sentences.append(f"Enterprise infra: {', '.join(infra[:3])}.")

    if hiring.get("signals"):
        sentences.append(f"Hiring: {'; '.join(hiring['signals'][:2])}.")
    if news.get("signals"):
        sentences.append(f"Recent: {'; '.join(news['signals'][:2])}.")

    return " ".join(sentences[:5])
209  
210  
def research_prospect(prospect, dry_run=False):
    """Research a single prospect, cache the result, and return it.

    Serves a fresh-enough cached result when one exists; in dry-run mode
    returns a stub without touching the network or the cache.
    """
    domain = prospect.get("domain", "").strip().lower()
    if not domain:
        return {"error": "No domain provided", "prospect": prospect}

    company = prospect.get("company_name", domain)

    if not dry_run and is_cached(domain):
        print(f"  ♻️  {domain} — cached (< {CACHE_DAYS}d old)")
        return load_cache(domain)

    if dry_run:
        print(f"  🔍 [DRY RUN] Would research: {domain} ({company})")
        return {"domain": domain, "company_name": company, "dry_run": True}

    print(f"  🔍 Researching {domain} ({company})...")

    # Pull each source; collectors report their own failures in-band.
    website = collect_website(domain)
    builtwith = collect_builtwith(domain)
    hiring = collect_hiring(company)
    news = collect_news(company)

    # Fields carried straight over from the original lead record.
    lead_source = {
        "source": "lead_source",
        "employee_count": prospect.get("employee_count"),
        "industry": prospect.get("industry"),
        "hq_location": prospect.get("hq_location"),
        "growth_trend": prospect.get("growth_trend"),
    }

    result = {
        "domain": domain,
        "company_name": company,
        "contact_name": prospect.get("contact_name"),
        "contact_title": prospect.get("contact_title"),
        "researched_at": datetime.now().isoformat(),
        "brief": build_brief(prospect, website, builtwith, hiring, news),
        "sources": {
            "lead_source": lead_source,
            "website": website,
            "builtwith": builtwith,
            "hiring": hiring,
            "news": news,
        },
    }

    cache_path = CACHE_DIR / f"{domain}.json"
    cache_path.write_text(json.dumps(result, indent=2))
    print(f"  ✅ {domain} — saved to {cache_path.name}")

    return result
265  
266  
def main():
    """Parse CLI arguments, load the prospect list, and research each entry.

    Returns the list of per-prospect result dicts.
    """
    parser = argparse.ArgumentParser(description="Account Research Engine")
    parser.add_argument("input", nargs="?", default="-", help="JSON file path or '-' for stdin")
    parser.add_argument("--domain", help="Single domain to research")
    parser.add_argument("--company", help="Company name (with --domain)")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be researched")
    args = parser.parse_args()

    # Three input modes: a single --domain, stdin ("-"), or a JSON file path.
    if args.domain:
        prospects = [{"domain": args.domain, "company_name": args.company or args.domain}]
    elif args.input == "-":
        prospects = json.load(sys.stdin)
    else:
        with open(args.input) as f:
            prospects = json.load(f)

    # Accept a single prospect object as well as a list of them.
    if not isinstance(prospects, list):
        prospects = [prospects]

    print(f"📊 Account Research Engine — {len(prospects)} prospect(s)")
    if args.dry_run:
        print("   [DRY RUN MODE]")
    print()

    results = [research_prospect(p, dry_run=args.dry_run) for p in prospects]

    print(f"\n{'🏁' if not args.dry_run else '🔍'} Done — {len(results)} prospect(s) processed.")
    return results
298  
299  
if __name__ == "__main__":
    # After the run, echo each brief so the output is skimmable in a terminal.
    results = main()
    for entry in results:
        if entry.get("brief"):
            print(f"\n📋 {entry['domain']}: {entry['brief']}")