daemon.py
1 #!/usr/bin/env python3 2 """ 3 News Scraper Daemon for Hong Kong Fire Documentary 4 Runs 24/7 on a machine, syncs with upstream, scrapes URLs, creates PRs. 5 6 Environment Variables Required: 7 GITHUB_TOKEN - Personal Access Token with repo and PR permissions 8 FORK_REPO - Your fork's repo path (e.g., 'username/repo-name') 9 10 Optional Environment Variables: 11 UPSTREAM_REPO - Upstream repo (default: Hong-Kong-Emergency-Coordination-Hub/...) 12 PR_BRANCH - Branch for PRs (default: scraper-updates) 13 MAIN_BRANCH - Main branch name (default: main) 14 15 Usage: 16 python daemon.py # Run daemon (runs forever) 17 python daemon.py --once # Run one cycle and exit (for testing) 18 """ 19 20 import argparse 21 import json 22 import logging 23 import os 24 import subprocess 25 import sys 26 import time 27 from datetime import datetime, timedelta 28 from pathlib import Path 29 import requests 30 31 # Project paths 32 SCRIPT_DIR = Path(__file__).parent.resolve() 33 PROJECT_ROOT = SCRIPT_DIR.parent.parent 34 LOGS_DIR = PROJECT_ROOT / "logs" 35 LOG_FILE = LOGS_DIR / "scraper.log" 36 37 # GitHub configuration - set via environment variables or defaults 38 UPSTREAM_REPO = os.environ.get( 39 "UPSTREAM_REPO", 40 "Hong-Kong-Emergency-Coordination-Hub/Hong-Kong-Fire-Documentary" 41 ) 42 FORK_REPO = os.environ.get("FORK_REPO", "") # Required - no default 43 UPSTREAM_URL = f"https://github.com/{UPSTREAM_REPO}.git" 44 PR_BRANCH = os.environ.get("PR_BRANCH", "scraper-updates") 45 MAIN_BRANCH = os.environ.get("MAIN_BRANCH", "main") 46 47 # Timing configuration 48 SYNC_INTERVAL_MINUTES = 10 49 PR_INTERVAL_MINUTES = 60 50 51 52 def setup_logging(): 53 """Set up logging to both file and console""" 54 LOGS_DIR.mkdir(exist_ok=True) 55 56 # Create formatter 57 formatter = logging.Formatter( 58 '%(asctime)s | %(levelname)-8s | %(message)s', 59 datefmt='%Y-%m-%d %H:%M:%S' 60 ) 61 62 # File handler (append mode) 63 file_handler = logging.FileHandler(LOG_FILE, encoding='utf-8') 64 file_handler.setFormatter(formatter) 65 file_handler.setLevel(logging.INFO) 66 67 # Console handler 68 console_handler = logging.StreamHandler() 69 console_handler.setFormatter(formatter) 70 console_handler.setLevel(logging.INFO) 71 72 # Root logger 73 logger = logging.getLogger() 74 logger.setLevel(logging.INFO) 75 logger.addHandler(file_handler) 76 logger.addHandler(console_handler) 77 78 return logger 79 80 81 def run_cmd(cmd: list[str], cwd: Path = None, check: bool = True) -> subprocess.CompletedProcess: 82 """Run a shell command and return the result""" 83 try: 84 result = subprocess.run( 85 cmd, 86 cwd=cwd or PROJECT_ROOT, 87 capture_output=True, 88 text=True, 89 check=check 90 ) 91 return result 92 except subprocess.CalledProcessError as e: 93 logging.error(f"Command failed: {' '.join(cmd)}") 94 logging.error(f"stderr: {e.stderr}") 95 raise 96 97 98 def get_github_token() -> str: 99 """Get GitHub token from environment variable""" 100 token = os.environ.get("GITHUB_TOKEN") 101 if not token: 102 logging.error("GITHUB_TOKEN environment variable not set!") 103 logging.error("Please set it: export GITHUB_TOKEN='your_token_here'") 104 sys.exit(1) 105 return token 106 107 108 def get_fork_repo() -> str: 109 """Get fork repo from environment variable""" 110 if not FORK_REPO: 111 logging.error("FORK_REPO environment variable not set!") 112 logging.error("Please set it: export FORK_REPO='username/repo-name'") 113 sys.exit(1) 114 return FORK_REPO 115 116 117 def get_fork_owner() -> str: 118 """Get the owner/username from FORK_REPO""" 119 return get_fork_repo().split("/")[0] 120 121 122 def setup_git_remotes(): 123 """Ensure git remotes are configured correctly""" 124 logging.info("Setting up git remotes...") 125 126 # Check current remotes 127 result = run_cmd(["git", "remote", "-v"], check=False) 128 129 # Add upstream if not exists 130 if "upstream" not in result.stdout: 131 run_cmd(["git", "remote", "add", "upstream", UPSTREAM_URL]) 132 logging.info(f"Added upstream remote: {UPSTREAM_URL}") 133 134 # Ensure origin points to fork 135 token = get_github_token() 136 fork_repo = get_fork_repo() 137 fork_url_with_token = f"https://{token}@github.com/{fork_repo}.git" 138 run_cmd(["git", "remote", "set-url", "origin", fork_url_with_token]) 139 logging.info("Configured origin remote with authentication") 140 141 142 def sync_with_upstream() -> bool: 143 """ 144 Sync local repo with upstream. 145 Returns True if there were changes, False otherwise. 146 """ 147 logging.info("Syncing with upstream...") 148 149 try: 150 # Fetch upstream 151 run_cmd(["git", "fetch", "upstream", MAIN_BRANCH]) 152 153 # Check if we're behind upstream 154 result = run_cmd([ 155 "git", "rev-list", "--count", 156 f"HEAD..upstream/{MAIN_BRANCH}" 157 ]) 158 commits_behind = int(result.stdout.strip()) 159 160 if commits_behind > 0: 161 logging.info(f"Behind upstream by {commits_behind} commits, merging...") 162 163 # Stash any local changes 164 run_cmd(["git", "stash"], check=False) 165 166 # Checkout main and merge upstream 167 run_cmd(["git", "checkout", MAIN_BRANCH]) 168 run_cmd(["git", "merge", f"upstream/{MAIN_BRANCH}", "--no-edit"]) 169 170 # Pop stash if exists 171 run_cmd(["git", "stash", "pop"], check=False) 172 173 logging.info("Synced with upstream successfully") 174 return True 175 else: 176 logging.info("Already up to date with upstream") 177 return False 178 179 except Exception as e: 180 logging.error(f"Failed to sync with upstream: {e}") 181 return False 182 183 184 def run_scraper() -> tuple[int, int]: 185 """ 186 Run the scraper to detect and scrape new URLs. 187 Returns (success_count, fail_count) 188 """ 189 logging.info("Running scraper...") 190 191 try: 192 # Import and run scraper 193 sys.path.insert(0, str(SCRIPT_DIR)) 194 from scraper import run_scraper as scrape, load_registry, get_all_urls, filter_new_urls 195 196 # Check for new URLs first 197 registry = load_registry() 198 all_urls = get_all_urls() 199 new_urls = filter_new_urls(all_urls, registry) 200 201 if not new_urls: 202 logging.info("No new URLs to scrape") 203 return 0, 0 204 205 logging.info(f"Found {len(new_urls)} new URLs to scrape") 206 207 # Run the scraper (it handles everything internally) 208 scrape(dry_run=False, verbose=False) 209 210 # Count results by checking registry again 211 new_registry = load_registry() 212 scraped_count = len(new_registry.get("scraped_urls", {})) - len(registry.get("scraped_urls", {})) 213 214 return scraped_count, len(new_urls) - scraped_count 215 216 except Exception as e: 217 logging.error(f"Scraper error: {e}") 218 return 0, 0 219 220 221 def has_local_changes() -> bool: 222 """Check if there are uncommitted changes""" 223 result = run_cmd(["git", "status", "--porcelain"]) 224 return bool(result.stdout.strip()) 225 226 227 def commit_changes() -> bool: 228 """Commit any local changes""" 229 if not has_local_changes(): 230 return False 231 232 logging.info("Committing changes...") 233 234 try: 235 # Stage all changes 236 run_cmd(["git", "add", "-A"]) 237 238 # Create commit message with timestamp 239 timestamp = datetime.now().strftime("%Y-%m-%d %H:%M") 240 msg = f"chore(scraper): auto-scrape {timestamp}" 241 242 run_cmd(["git", "commit", "-m", msg]) 243 logging.info(f"Committed: {msg}") 244 return True 245 246 except Exception as e: 247 logging.error(f"Failed to commit: {e}") 248 return False 249 250 251 def get_open_pr() -> dict | None: 252 """Check if there's an existing open PR from the scraper branch""" 253 token = get_github_token() 254 fork_owner = get_fork_owner() 255 256 url = f"https://api.github.com/repos/{UPSTREAM_REPO}/pulls" 257 headers = { 258 "Authorization": f"token {token}", 259 "Accept": "application/vnd.github.v3+json" 260 } 261 params = { 262 "head": f"{fork_owner}:{PR_BRANCH}", 263 "state": "open" 264 } 265 266 try: 267 response = requests.get(url, headers=headers, params=params) 268 response.raise_for_status() 269 prs = response.json() 270 271 if prs: 272 return prs[0] # Return first open PR 273 return None 274 275 except Exception as e: 276 logging.error(f"Failed to check for open PRs: {e}") 277 return None 278 279 280 def close_pr(pr_number: int) -> bool: 281 """Close an existing PR""" 282 token = get_github_token() 283 284 url = f"https://api.github.com/repos/{UPSTREAM_REPO}/pulls/{pr_number}" 285 headers = { 286 "Authorization": f"token {token}", 287 "Accept": "application/vnd.github.v3+json" 288 } 289 data = {"state": "closed"} 290 291 try: 292 response = requests.patch(url, headers=headers, json=data) 293 response.raise_for_status() 294 logging.info(f"Closed PR #{pr_number}") 295 return True 296 297 except Exception as e: 298 logging.error(f"Failed to close PR #{pr_number}: {e}") 299 return False 300 301 302 def push_to_pr_branch() -> bool: 303 """Push changes to the PR branch (force push to keep clean history)""" 304 logging.info(f"Pushing to branch '{PR_BRANCH}'...") 305 306 try: 307 # Create or checkout the PR branch 308 result = run_cmd(["git", "branch", "--list", PR_BRANCH], check=False) 309 310 if PR_BRANCH in result.stdout: 311 # Branch exists, checkout and reset to main 312 run_cmd(["git", "checkout", PR_BRANCH]) 313 run_cmd(["git", "reset", "--hard", MAIN_BRANCH]) 314 else: 315 # Create new branch from main 316 run_cmd(["git", "checkout", "-b", PR_BRANCH, MAIN_BRANCH]) 317 318 # Force push to origin 319 run_cmd(["git", "push", "origin", PR_BRANCH, "--force"]) 320 321 # Go back to main 322 run_cmd(["git", "checkout", MAIN_BRANCH]) 323 324 logging.info(f"Pushed to {PR_BRANCH}") 325 return True 326 327 except Exception as e: 328 logging.error(f"Failed to push: {e}") 329 # Try to get back to main 330 run_cmd(["git", "checkout", MAIN_BRANCH], check=False) 331 return False 332 333 334 def create_pr() -> bool: 335 """Create a new PR to upstream""" 336 token = get_github_token() 337 fork_owner = get_fork_owner() 338 339 url = f"https://api.github.com/repos/{UPSTREAM_REPO}/pulls" 340 headers = { 341 "Authorization": f"token {token}", 342 "Accept": "application/vnd.github.v3+json" 343 } 344 345 # Get count of archives for PR description 346 archives_dir = PROJECT_ROOT / "content" / "news" 347 archive_count = 0 348 for source_dir in archives_dir.iterdir(): 349 if source_dir.is_dir(): 350 archive_dir = source_dir / "archive" 351 if archive_dir.exists(): 352 archive_count += len(list(archive_dir.iterdir())) 353 354 timestamp = datetime.now().strftime("%Y-%m-%d %H:%M") 355 356 data = { 357 "title": f"[Auto-Scraper] News archives update - {timestamp}", 358 "head": f"{fork_owner}:{PR_BRANCH}", 359 "base": MAIN_BRANCH, 360 "body": f"""## Automated News Archive Update 361 362 This PR was automatically generated by the news scraper daemon. 363 364 ### Summary 365 - **Timestamp**: {timestamp} 366 - **Total archived articles**: {archive_count} 367 368 ### What's included 369 - Scraped HTML archives of news articles 370 - Updated URL registry to prevent duplicates 371 - Scraper activity logs 372 373 --- 374 *This PR will be automatically replaced if not merged before the next hourly update.* 375 """ 376 } 377 378 try: 379 response = requests.post(url, headers=headers, json=data) 380 response.raise_for_status() 381 pr_data = response.json() 382 logging.info(f"Created PR #{pr_data['number']}: {pr_data['html_url']}") 383 return True 384 385 except requests.exceptions.HTTPError as e: 386 if e.response.status_code == 422: 387 # PR already exists or no changes 388 logging.info("PR already exists or no changes to submit") 389 else: 390 logging.error(f"Failed to create PR: {e}") 391 logging.error(f"Response: {e.response.text}") 392 return False 393 394 except Exception as e: 395 logging.error(f"Failed to create PR: {e}") 396 return False 397 398 399 def manage_pr(): 400 """Close old PR if exists, push changes, and create new PR""" 401 logging.info("Managing PR...") 402 403 # Check for existing open PR 404 existing_pr = get_open_pr() 405 if existing_pr: 406 pr_number = existing_pr["number"] 407 logging.info(f"Found existing open PR #{pr_number}, closing...") 408 close_pr(pr_number) 409 410 # Push to PR branch 411 if not push_to_pr_branch(): 412 logging.error("Failed to push to PR branch") 413 return 414 415 # Create new PR 416 create_pr() 417 418 419 def commit_logs(): 420 """Commit log file changes""" 421 if not LOG_FILE.exists(): 422 return 423 424 try: 425 run_cmd(["git", "add", str(LOG_FILE)]) 426 427 # Check if there are staged changes for the log file 428 result = run_cmd(["git", "diff", "--cached", "--name-only"]) 429 if "logs/scraper.log" in result.stdout: 430 run_cmd(["git", "commit", "-m", "chore(logs): update scraper logs"]) 431 logging.info("Committed log updates") 432 433 except Exception as e: 434 logging.debug(f"No log changes to commit: {e}") 435 436 437 def run_daemon(run_once: bool = False): 438 """Main daemon loop""" 439 logger = setup_logging() 440 441 logging.info("=" * 60) 442 logging.info("News Scraper Daemon Starting") 443 logging.info(f"Fork: {get_fork_repo()}") 444 logging.info(f"Upstream: {UPSTREAM_REPO}") 445 logging.info(f"Sync interval: {SYNC_INTERVAL_MINUTES} minutes") 446 logging.info(f"PR interval: {PR_INTERVAL_MINUTES} minutes") 447 logging.info("=" * 60) 448 449 # Verify token and fork repo 450 get_github_token() 451 get_fork_repo() 452 453 # Setup git remotes 454 setup_git_remotes() 455 456 last_sync = datetime.min 457 last_pr = datetime.min 458 459 try: 460 while True: 461 now = datetime.now() 462 463 # Check if it's time to sync (every 10 minutes) 464 if now - last_sync >= timedelta(minutes=SYNC_INTERVAL_MINUTES): 465 logging.info("-" * 40) 466 logging.info("Starting sync cycle...") 467 468 # Sync with upstream 469 sync_with_upstream() 470 471 # Run scraper 472 success, failed = run_scraper() 473 if success > 0 or failed > 0: 474 logging.info(f"Scraper results: {success} success, {failed} failed") 475 476 # Commit any changes 477 commit_changes() 478 479 # Commit logs 480 commit_logs() 481 482 last_sync = now 483 logging.info("Sync cycle complete") 484 485 # Check if it's time to create/update PR (every hour) 486 if now - last_pr >= timedelta(minutes=PR_INTERVAL_MINUTES): 487 logging.info("-" * 40) 488 logging.info("Starting PR cycle...") 489 490 # Check if we have any archives to submit 491 if has_local_changes() or True: # Always try to create PR 492 # Push to fork first 493 try: 494 run_cmd(["git", "push", "origin", MAIN_BRANCH]) 495 except: 496 pass 497 498 manage_pr() 499 500 last_pr = now 501 logging.info("PR cycle complete") 502 503 if run_once: 504 logging.info("Run once mode, exiting...") 505 break 506 507 # Sleep for 1 minute between checks 508 time.sleep(60) 509 510 except KeyboardInterrupt: 511 logging.info("Daemon stopped by user") 512 except Exception as e: 513 logging.error(f"Daemon error: {e}") 514 raise 515 516 517 def main(): 518 parser = argparse.ArgumentParser( 519 description="News Scraper Daemon - runs 24/7, syncs and scrapes" 520 ) 521 parser.add_argument( 522 "--once", 523 action="store_true", 524 help="Run one sync+scrape+PR cycle and exit" 525 ) 526 527 args = parser.parse_args() 528 run_daemon(run_once=args.once) 529 530 531 if __name__ == "__main__": 532 main() 533