# scripts/scraper/daemon.py
  1  #!/usr/bin/env python3
  2  """
  3  News Scraper Daemon for Hong Kong Fire Documentary
  4  Runs 24/7 on a machine, syncs with upstream, scrapes URLs, creates PRs.
  5  
  6  Environment Variables Required:
  7      GITHUB_TOKEN - Personal Access Token with repo and PR permissions
  8      FORK_REPO    - Your fork's repo path (e.g., 'username/repo-name')
  9  
 10  Optional Environment Variables:
 11      UPSTREAM_REPO - Upstream repo (default: Hong-Kong-Emergency-Coordination-Hub/...)
 12      PR_BRANCH     - Branch for PRs (default: scraper-updates)
 13      MAIN_BRANCH   - Main branch name (default: main)
 14      
 15  Usage:
 16      python daemon.py              # Run daemon (runs forever)
 17      python daemon.py --once       # Run one cycle and exit (for testing)
 18  """
 19  
 20  import argparse
 21  import json
 22  import logging
 23  import os
 24  import subprocess
 25  import sys
 26  import time
 27  from datetime import datetime, timedelta
 28  from pathlib import Path
 29  import requests
 30  
 31  # Project paths
# Project paths: this file lives in <root>/scripts/scraper/, so the project
# root is two directory levels up from this script.
SCRIPT_DIR = Path(__file__).parent.resolve()
PROJECT_ROOT = SCRIPT_DIR.parent.parent
LOGS_DIR = PROJECT_ROOT / "logs"
LOG_FILE = LOGS_DIR / "scraper.log"

# GitHub configuration - set via environment variables or defaults
UPSTREAM_REPO = os.environ.get(
    "UPSTREAM_REPO",
    "Hong-Kong-Emergency-Coordination-Hub/Hong-Kong-Fire-Documentary"
)
FORK_REPO = os.environ.get("FORK_REPO", "")  # Required - no default; validated by get_fork_repo()
UPSTREAM_URL = f"https://github.com/{UPSTREAM_REPO}.git"
PR_BRANCH = os.environ.get("PR_BRANCH", "scraper-updates")
MAIN_BRANCH = os.environ.get("MAIN_BRANCH", "main")

# Timing configuration (minutes): sync/scrape cadence and PR refresh cadence.
SYNC_INTERVAL_MINUTES = 10
PR_INTERVAL_MINUTES = 60
 50  
 51  
 52  def setup_logging():
 53      """Set up logging to both file and console"""
 54      LOGS_DIR.mkdir(exist_ok=True)
 55      
 56      # Create formatter
 57      formatter = logging.Formatter(
 58          '%(asctime)s | %(levelname)-8s | %(message)s',
 59          datefmt='%Y-%m-%d %H:%M:%S'
 60      )
 61      
 62      # File handler (append mode)
 63      file_handler = logging.FileHandler(LOG_FILE, encoding='utf-8')
 64      file_handler.setFormatter(formatter)
 65      file_handler.setLevel(logging.INFO)
 66      
 67      # Console handler
 68      console_handler = logging.StreamHandler()
 69      console_handler.setFormatter(formatter)
 70      console_handler.setLevel(logging.INFO)
 71      
 72      # Root logger
 73      logger = logging.getLogger()
 74      logger.setLevel(logging.INFO)
 75      logger.addHandler(file_handler)
 76      logger.addHandler(console_handler)
 77      
 78      return logger
 79  
 80  
 81  def run_cmd(cmd: list[str], cwd: Path = None, check: bool = True) -> subprocess.CompletedProcess:
 82      """Run a shell command and return the result"""
 83      try:
 84          result = subprocess.run(
 85              cmd,
 86              cwd=cwd or PROJECT_ROOT,
 87              capture_output=True,
 88              text=True,
 89              check=check
 90          )
 91          return result
 92      except subprocess.CalledProcessError as e:
 93          logging.error(f"Command failed: {' '.join(cmd)}")
 94          logging.error(f"stderr: {e.stderr}")
 95          raise
 96  
 97  
 98  def get_github_token() -> str:
 99      """Get GitHub token from environment variable"""
100      token = os.environ.get("GITHUB_TOKEN")
101      if not token:
102          logging.error("GITHUB_TOKEN environment variable not set!")
103          logging.error("Please set it: export GITHUB_TOKEN='your_token_here'")
104          sys.exit(1)
105      return token
106  
107  
def get_fork_repo() -> str:
    """Return the fork's 'owner/name' path from FORK_REPO, exiting if unset."""
    if FORK_REPO:
        return FORK_REPO
    logging.error("FORK_REPO environment variable not set!")
    logging.error("Please set it: export FORK_REPO='username/repo-name'")
    sys.exit(1)
115  
116  
def get_fork_owner() -> str:
    """Return the username portion (text before the first '/') of FORK_REPO."""
    owner, _, _ = get_fork_repo().partition("/")
    return owner
120  
121  
def setup_git_remotes():
    """Configure remotes: 'upstream' -> canonical repo, 'origin' -> the fork.

    The origin URL embeds the GitHub token so pushes work non-interactively.
    """
    logging.info("Setting up git remotes...")

    # Only add the upstream remote when it is not already configured.
    remotes = run_cmd(["git", "remote", "-v"], check=False)
    if "upstream" not in remotes.stdout:
        run_cmd(["git", "remote", "add", "upstream", UPSTREAM_URL])
        logging.info(f"Added upstream remote: {UPSTREAM_URL}")

    # Point origin at the fork, with the token embedded for authentication.
    authenticated_url = f"https://{get_github_token()}@github.com/{get_fork_repo()}.git"
    run_cmd(["git", "remote", "set-url", "origin", authenticated_url])
    logging.info("Configured origin remote with authentication")
140  
141  
def sync_with_upstream() -> bool:
    """
    Sync local repo with upstream.

    Fetches upstream/MAIN_BRANCH; if HEAD is behind, stashes local work,
    merges upstream into the main branch, then restores the stash. Any
    failure is caught and logged so the daemon loop keeps running.

    Returns True if there were changes (commits were merged), False
    otherwise (already up to date, or the sync failed).
    """
    logging.info("Syncing with upstream...")

    try:
        # Fetch upstream
        run_cmd(["git", "fetch", "upstream", MAIN_BRANCH])

        # Count commits present on upstream/MAIN_BRANCH but not on HEAD.
        result = run_cmd([
            "git", "rev-list", "--count",
            f"HEAD..upstream/{MAIN_BRANCH}"
        ])
        commits_behind = int(result.stdout.strip())

        if commits_behind > 0:
            logging.info(f"Behind upstream by {commits_behind} commits, merging...")

            # Stash any local changes (check=False: 'git stash' is harmless
            # when the working tree is already clean).
            run_cmd(["git", "stash"], check=False)

            # Checkout main and merge upstream
            run_cmd(["git", "checkout", MAIN_BRANCH])
            run_cmd(["git", "merge", f"upstream/{MAIN_BRANCH}", "--no-edit"])

            # Pop stash if exists (check=False: there may be no stash entry,
            # or the pop may conflict. NOTE(review): pop conflicts are
            # silently ignored here — confirm that is acceptable).
            run_cmd(["git", "stash", "pop"], check=False)

            logging.info("Synced with upstream successfully")
            return True
        else:
            logging.info("Already up to date with upstream")
            return False

    except Exception as e:
        logging.error(f"Failed to sync with upstream: {e}")
        return False
182  
183  
def run_scraper() -> tuple[int, int]:
    """
    Run the scraper to detect and scrape new URLs.

    Imports the sibling ``scraper`` module lazily so daemon startup does not
    require the scraper's dependencies.

    Returns:
        (success_count, fail_count). Returns (0, 0) when there is nothing
        new to scrape or when the scraper raises.
    """
    logging.info("Running scraper...")

    try:
        # Make the sibling scraper module importable. Guard so repeated
        # cycles do not keep prepending duplicate entries to sys.path
        # (the original inserted unconditionally on every call).
        script_dir = str(SCRIPT_DIR)
        if script_dir not in sys.path:
            sys.path.insert(0, script_dir)
        from scraper import run_scraper as scrape, load_registry, get_all_urls, filter_new_urls

        # Check for new URLs first
        registry = load_registry()
        all_urls = get_all_urls()
        new_urls = filter_new_urls(all_urls, registry)

        if not new_urls:
            logging.info("No new URLs to scrape")
            return 0, 0

        logging.info(f"Found {len(new_urls)} new URLs to scrape")

        # Run the scraper (it handles everything internally)
        scrape(dry_run=False, verbose=False)

        # Derive the success count by diffing the registry size before/after.
        # NOTE(review): assumes the registry only grows during a run.
        new_registry = load_registry()
        scraped_count = len(new_registry.get("scraped_urls", {})) - len(registry.get("scraped_urls", {}))

        return scraped_count, len(new_urls) - scraped_count

    except Exception:
        # Log the full traceback (plain error() lost it); a scraper failure
        # must not kill the daemon.
        logging.exception("Scraper error")
        return 0, 0
220  
def has_local_changes() -> bool:
    """Return True when `git status --porcelain` reports any change."""
    status = run_cmd(["git", "status", "--porcelain"])
    return status.stdout.strip() != ""
225  
226  
def commit_changes() -> bool:
    """Stage everything and commit with a timestamped message.

    Returns True when a commit was created; False when the working tree was
    clean or the commit failed.
    """
    if not has_local_changes():
        return False

    logging.info("Committing changes...")

    try:
        run_cmd(["git", "add", "-A"])

        stamp = datetime.now().strftime("%Y-%m-%d %H:%M")
        message = f"chore(scraper): auto-scrape {stamp}"
        run_cmd(["git", "commit", "-m", message])

        logging.info(f"Committed: {message}")
        return True
    except Exception as e:
        logging.error(f"Failed to commit: {e}")
        return False
249  
250  
def get_open_pr() -> dict | None:
    """Return the first open upstream PR from our scraper branch, or None.

    Queries the GitHub API for open PRs whose head is
    ``<fork_owner>:<PR_BRANCH>``. Returns None when no such PR exists or
    the API call fails.
    """
    token = get_github_token()
    fork_owner = get_fork_owner()

    url = f"https://api.github.com/repos/{UPSTREAM_REPO}/pulls"
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json"
    }
    params = {
        "head": f"{fork_owner}:{PR_BRANCH}",
        "state": "open"
    }

    try:
        # timeout: requests has no default timeout, so a stalled connection
        # would otherwise hang the daemon loop forever.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        prs = response.json()

        if prs:
            return prs[0]  # Return first open PR
        return None

    except Exception as e:
        logging.error(f"Failed to check for open PRs: {e}")
        return None
278  
279  
def close_pr(pr_number: int) -> bool:
    """Close the given upstream PR via the GitHub API.

    Args:
        pr_number: number of the pull request to close.

    Returns:
        True when the PR was closed, False on any API failure.
    """
    token = get_github_token()

    url = f"https://api.github.com/repos/{UPSTREAM_REPO}/pulls/{pr_number}"
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json"
    }
    data = {"state": "closed"}

    try:
        # timeout: requests has no default timeout, so a stalled connection
        # would otherwise hang the daemon loop forever.
        response = requests.patch(url, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        logging.info(f"Closed PR #{pr_number}")
        return True

    except Exception as e:
        logging.error(f"Failed to close PR #{pr_number}: {e}")
        return False
300  
301  
def push_to_pr_branch() -> bool:
    """Push changes to the PR branch (force push to keep clean history).

    Recreates PR_BRANCH as an exact mirror of MAIN_BRANCH (hard reset if it
    exists, fresh branch otherwise), force-pushes it to origin, then returns
    to MAIN_BRANCH.

    Returns True on success. On failure, logs the error, best-effort checks
    out MAIN_BRANCH so later cycles start from a known branch, and returns
    False.
    """
    logging.info(f"Pushing to branch '{PR_BRANCH}'...")

    try:
        # Create or checkout the PR branch
        result = run_cmd(["git", "branch", "--list", PR_BRANCH], check=False)

        # NOTE(review): substring test — any branch whose name merely
        # contains PR_BRANCH would also match; confirm branch naming.
        if PR_BRANCH in result.stdout:
            # Branch exists, checkout and reset to main
            run_cmd(["git", "checkout", PR_BRANCH])
            run_cmd(["git", "reset", "--hard", MAIN_BRANCH])
        else:
            # Create new branch from main
            run_cmd(["git", "checkout", "-b", PR_BRANCH, MAIN_BRANCH])

        # Force push to origin (history on this branch is intentionally
        # rewritten every cycle)
        run_cmd(["git", "push", "origin", PR_BRANCH, "--force"])

        # Go back to main
        run_cmd(["git", "checkout", MAIN_BRANCH])

        logging.info(f"Pushed to {PR_BRANCH}")
        return True

    except Exception as e:
        logging.error(f"Failed to push: {e}")
        # Try to get back to main
        run_cmd(["git", "checkout", MAIN_BRANCH], check=False)
        return False
332  
333  
def create_pr() -> bool:
    """Open a PR from <fork_owner>:<PR_BRANCH> against upstream MAIN_BRANCH.

    The PR body includes a count of archived articles found under
    ``content/news/*/archive``.

    Returns:
        True when the PR was created. False when the API reports HTTP 422
        (PR already exists / nothing to submit) or on any other failure.
    """
    token = get_github_token()
    fork_owner = get_fork_owner()

    url = f"https://api.github.com/repos/{UPSTREAM_REPO}/pulls"
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json"
    }

    # Get count of archives for PR description. Guard the directory walk:
    # iterdir() raises FileNotFoundError when content/news does not exist
    # yet (e.g. before the first successful scrape).
    archives_dir = PROJECT_ROOT / "content" / "news"
    archive_count = 0
    if archives_dir.is_dir():
        for source_dir in archives_dir.iterdir():
            if source_dir.is_dir():
                archive_dir = source_dir / "archive"
                if archive_dir.exists():
                    archive_count += len(list(archive_dir.iterdir()))

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")

    data = {
        "title": f"[Auto-Scraper] News archives update - {timestamp}",
        "head": f"{fork_owner}:{PR_BRANCH}",
        "base": MAIN_BRANCH,
        "body": f"""## Automated News Archive Update

This PR was automatically generated by the news scraper daemon.

### Summary
- **Timestamp**: {timestamp}
- **Total archived articles**: {archive_count}

### What's included
- Scraped HTML archives of news articles
- Updated URL registry to prevent duplicates
- Scraper activity logs

---
*This PR will be automatically replaced if not merged before the next hourly update.*
"""
    }

    try:
        # timeout: requests has no default timeout, so a stalled connection
        # would otherwise hang the daemon loop forever.
        response = requests.post(url, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        pr_data = response.json()
        logging.info(f"Created PR #{pr_data['number']}: {pr_data['html_url']}")
        return True

    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 422:
            # PR already exists or no changes
            logging.info("PR already exists or no changes to submit")
        else:
            logging.error(f"Failed to create PR: {e}")
            logging.error(f"Response: {e.response.text}")
        return False

    except Exception as e:
        logging.error(f"Failed to create PR: {e}")
        return False
397  
398  
def manage_pr():
    """Replace the scraper PR: close any open one, push, open a fresh PR."""
    logging.info("Managing PR...")

    # Close the previous scraper PR so the new one supersedes it.
    open_pr = get_open_pr()
    if open_pr:
        number = open_pr["number"]
        logging.info(f"Found existing open PR #{number}, closing...")
        close_pr(number)

    # Push to the PR branch; bail out if that fails.
    if not push_to_pr_branch():
        logging.error("Failed to push to PR branch")
        return

    create_pr()
417  
418  
def commit_logs():
    """Commit scraper-log updates, if the log file has staged changes."""
    if not LOG_FILE.exists():
        return

    try:
        run_cmd(["git", "add", str(LOG_FILE)])

        # Only commit when the log file actually shows up as staged.
        staged_files = run_cmd(["git", "diff", "--cached", "--name-only"]).stdout
        if "logs/scraper.log" not in staged_files:
            return

        run_cmd(["git", "commit", "-m", "chore(logs): update scraper logs"])
        logging.info("Committed log updates")
    except Exception as e:
        logging.debug(f"No log changes to commit: {e}")
435  
436  
def run_daemon(run_once: bool = False):
    """Main daemon loop.

    Every SYNC_INTERVAL_MINUTES: sync with upstream, run the scraper, and
    commit the results. Every PR_INTERVAL_MINUTES: push to the fork and
    refresh the upstream PR. Sleeps one minute between checks.

    Args:
        run_once: when True, perform a single pass of both cycles and exit
            (used for testing via ``--once``).

    Raises:
        Re-raises any unexpected exception after logging it; a
        KeyboardInterrupt exits cleanly.
    """
    setup_logging()  # return value unused; the original bound it to a dead variable

    logging.info("=" * 60)
    logging.info("News Scraper Daemon Starting")
    logging.info(f"Fork: {get_fork_repo()}")
    logging.info(f"Upstream: {UPSTREAM_REPO}")
    logging.info(f"Sync interval: {SYNC_INTERVAL_MINUTES} minutes")
    logging.info(f"PR interval: {PR_INTERVAL_MINUTES} minutes")
    logging.info("=" * 60)

    # Fail fast if credentials/config are missing (both sys.exit on error).
    get_github_token()
    get_fork_repo()

    # Setup git remotes
    setup_git_remotes()

    # datetime.min guarantees both cycles fire on the first iteration.
    last_sync = datetime.min
    last_pr = datetime.min

    try:
        while True:
            now = datetime.now()

            # Sync cycle (every SYNC_INTERVAL_MINUTES): upstream sync,
            # scrape, commit.
            if now - last_sync >= timedelta(minutes=SYNC_INTERVAL_MINUTES):
                logging.info("-" * 40)
                logging.info("Starting sync cycle...")

                sync_with_upstream()

                success, failed = run_scraper()
                if success > 0 or failed > 0:
                    logging.info(f"Scraper results: {success} success, {failed} failed")

                commit_changes()
                commit_logs()

                last_sync = now
                logging.info("Sync cycle complete")

            # PR cycle (every PR_INTERVAL_MINUTES). Always attempted: the
            # original guard `has_local_changes() or True` was a no-op by
            # design, so the pointless has_local_changes() call is dropped.
            if now - last_pr >= timedelta(minutes=PR_INTERVAL_MINUTES):
                logging.info("-" * 40)
                logging.info("Starting PR cycle...")

                # Best-effort push of main to the fork; the PR-branch push
                # inside manage_pr() is what actually matters. Narrowed from
                # a bare `except:` which also swallowed KeyboardInterrupt.
                try:
                    run_cmd(["git", "push", "origin", MAIN_BRANCH])
                except Exception as e:
                    logging.debug(f"Push of {MAIN_BRANCH} to fork failed: {e}")

                manage_pr()

                last_pr = now
                logging.info("PR cycle complete")

            if run_once:
                logging.info("Run once mode, exiting...")
                break

            # Sleep for 1 minute between checks
            time.sleep(60)

    except KeyboardInterrupt:
        logging.info("Daemon stopped by user")
    except Exception as e:
        logging.error(f"Daemon error: {e}")
        raise
515  
516  
def main():
    """Parse command-line arguments and start the daemon loop."""
    arg_parser = argparse.ArgumentParser(
        description="News Scraper Daemon - runs 24/7, syncs and scrapes"
    )
    arg_parser.add_argument(
        "--once",
        action="store_true",
        help="Run one sync+scrape+PR cycle and exit"
    )
    options = arg_parser.parse_args()
    run_daemon(run_once=options.once)


if __name__ == "__main__":
    main()
533