board_game_scraper/__main__.py
# -*- coding: utf-8 -*-

"""Command line entry point."""

import argparse
import logging
import os

from datetime import timezone
from pathlib import Path
from shutil import rmtree
from time import sleep

from pytility import normalize_space, parse_date
from scrapy.cmdline import execute
from scrapy.utils.job import job_dir as job_dir_from_settings
from scrapy.utils.log import configure_logging
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.project import get_project_settings
from scrapy.utils.python import garbage_collect

from .utils import date_from_file, now

LOGGER = logging.getLogger(__name__)
DATE_FORMAT = "%Y-%m-%dT%H-%M-%S"
RESUMABLE_STATES = frozenset(("shutdown", "closespider_timeout"))


def _find_states(
    path_dir,
    state_file=".state",
    delete="finished",
    delete_non_state=False,
):
    """Map each job dir inside <path_dir> to the state recorded in its state
    file; delete jobs whose state is in <delete> (and, if <delete_non_state>
    is set, jobs without a readable state)."""

    path_dir = Path(path_dir).resolve()
    delete = frozenset(arg_to_iter(delete))
    result = {}

    if not path_dir.is_dir():
        LOGGER.warning("<%s> is not an existing dir", path_dir)
        return result

    LOGGER.info("Finding jobs and their states in <%s>", path_dir)

    for sub_dir in path_dir.iterdir():
        state_path = sub_dir / state_file

        if not sub_dir.is_dir() or not state_path.is_file():
            continue

        try:
            # the state is expected on the first line of the state file
            with state_path.open() as file_obj:
                state = normalize_space(next(file_obj, None))
        except Exception:
            LOGGER.exception("Unable to read a state from <%s>", state_path)
            state = None

        if not state:
            LOGGER.warning("No valid state file in <%s>", sub_dir)

        if state in delete or (delete_non_state and not state):
            LOGGER.info("Deleting <%s> with state <%s>", sub_dir, state)
            rmtree(sub_dir, ignore_errors=True)
        elif state:
            result[sub_dir.name] = state

    return result


def _parse_args():
    parser = argparse.ArgumentParser(description="TODO")
    parser.add_argument("spider", help="TODO")
    parser.add_argument("--job-dir", "-j", help="TODO")
    parser.add_argument("--feeds-dir", "-f", help="TODO")
    parser.add_argument("--feeds-subdir", "-F", help="TODO")
    parser.add_argument(
        "--file-tag", "-t", default=os.getenv("SCRAPER_FILE_TAG"), help="TODO"
    )
    parser.add_argument("--dont-run-before", "-d", help="TODO")
    parser.add_argument(
        "--verbose",
        "-v",
        action="count",
        default=0,
        help="log level (repeat for more verbosity)",
    )

    # parse_known_args() so that unrecognised arguments can be forwarded
    # verbatim to the underlying `scrapy crawl` command
    return parser.parse_known_args()


def main():
    """Command line entry point."""

    settings = get_project_settings()
    configure_logging(settings)

    args, remainder = _parse_args()
    LOGGER.info(args)
    LOGGER.info(remainder)

    # resolve output locations, preferring CLI arguments over defaults
    base_dir = Path(settings["BASE_DIR"]).resolve()
    cache_dir = base_dir / ".scrapy" / "httpcache"
    feeds_dir = Path(args.feeds_dir) if args.feeds_dir else base_dir / "feeds"
    feeds_dir = feeds_dir.resolve()
    feeds_dir_scraper = (
        feeds_dir / args.feeds_subdir if args.feeds_subdir else feeds_dir / args.spider
    )
    file_tag = normalize_space(args.file_tag)
    out_file = feeds_dir_scraper / "%(class)s" / f"%(time)s{file_tag}.jl"

    LOGGER.info("Output file will be <%s>", out_file)

    # job dir: CLI argument, else Scrapy settings, else a per-spider default
    from_settings = job_dir_from_settings(settings)
    job_dir = (
        Path(args.job_dir)
        if args.job_dir
        else Path(from_settings)
        if from_settings
        else base_dir / "jobs" / args.spider
    )
    job_dir = job_dir.resolve()

    cache_dir.mkdir(parents=True, exist_ok=True)
    feeds_dir_scraper.mkdir(parents=True, exist_ok=True)
    job_dir.mkdir(parents=True, exist_ok=True)

    # wait until the earliest allowed start time, if one is configured
    dont_run_before_file = job_dir / ".dont_run_before"
    dont_run_before = parse_date(
        args.dont_run_before, tzinfo=timezone.utc
    ) or date_from_file(dont_run_before_file, tzinfo=timezone.utc)

    if dont_run_before:
        LOGGER.info("Don't run before %s", dont_run_before.isoformat())
        sleep_seconds = dont_run_before.timestamp() - now().timestamp()
        if sleep_seconds > 0:
            LOGGER.info("Going to sleep for %.1f seconds", sleep_seconds)
            sleep(sleep_seconds)

    states = _find_states(
        job_dir, state_file=settings.get("STATE_TAG_FILE") or ".state"
    )

    running = sorted(sub_dir for sub_dir, state in states.items() if state == "running")

    if len(running) > 1:
        LOGGER.warning(
            "Found %d running jobs %s, please check and fix!", len(running), running
        )
        return

    if running:
        LOGGER.info("Found a running job <%s>, skipping...", running[0])
        return

    resumable = sorted(
        sub_dir for sub_dir, state in states.items() if state in RESUMABLE_STATES
    )

    if len(resumable) > 1:
        LOGGER.warning(
            "Found %d resumable jobs %s, please check and fix!",
            len(resumable),
            resumable,
        )
        return

    if resumable:
        LOGGER.info("Resuming previous job <%s>", resumable[0])

    # resume the previous job if possible, else start a new time-stamped one
    job_tag = resumable[0] if resumable else now().strftime(DATE_FORMAT)
    curr_job = job_dir / job_tag

    command = [
        "scrapy",
        "crawl",
        args.spider,
        "--output",
        str(out_file),
        "--set",
        f"JOBDIR={curr_job}",
        "--set",
        f"DONT_RUN_BEFORE_FILE={dont_run_before_file}",
    ] + remainder

    LOGGER.info("Executing command %r", command)

    try:
        execute(argv=command)
    finally:
        garbage_collect()


if __name__ == "__main__":
    main()
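
# Example invocation (a sketch, not part of the module: the spider name "bgg"
# and the CLOSESPIDER_TIMEOUT setting below are illustrative assumptions):
#
#   python -m board_game_scraper bgg --feeds-dir feeds -vv --set CLOSESPIDER_TIMEOUT=3600
#
# Arguments that _parse_args() does not recognise (here the trailing --set)
# end up in `remainder` and are forwarded verbatim to `scrapy crawl`.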