board_game_scraper/__main__.py
# -*- coding: utf-8 -*-

"""Command line entry point."""

import argparse
import logging
import os

from datetime import timezone
from pathlib import Path
from shutil import rmtree
from time import sleep

from pytility import normalize_space, parse_date
from scrapy.cmdline import execute
from scrapy.utils.job import job_dir as job_dir_from_settings
from scrapy.utils.log import configure_logging
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.project import get_project_settings
from scrapy.utils.python import garbage_collect

from .utils import date_from_file, now

LOGGER = logging.getLogger(__name__)
DATE_FORMAT = "%Y-%m-%dT%H-%M-%S"
RESUMABLE_STATES = frozenset(("shutdown", "closespider_timeout"))


def _find_states(
    path_dir,
    state_file=".state",
    delete="finished",
    delete_non_state=False,
):
    """Map each job dir inside <path_dir> to the state recorded in its state
    file; delete jobs whose state is in <delete> (and, if <delete_non_state>
    is set, jobs without a readable state)."""

    path_dir = Path(path_dir).resolve()
    delete = frozenset(arg_to_iter(delete))
    result = {}

    if not path_dir.is_dir():
        LOGGER.warning("<%s> is not an existing dir", path_dir)
        return result

    LOGGER.info("Finding jobs and their states in <%s>", path_dir)

    for sub_dir in path_dir.iterdir():
        state_path = sub_dir / state_file

        if not sub_dir.is_dir() or not state_path.is_file():
            continue

        try:
            # the state is expected on the first line of the state file
            with state_path.open() as file_obj:
                state = normalize_space(next(file_obj, None))
        except Exception:
            LOGGER.exception("Unable to read a state from <%s>", state_path)
            state = None

        if not state:
            LOGGER.warning("No valid state file in <%s>", sub_dir)

        if state in delete or (delete_non_state and not state):
            LOGGER.info("Deleting <%s> with state <%s>", sub_dir, state)
            rmtree(sub_dir, ignore_errors=True)
        elif state:
            result[sub_dir.name] = state

    return result


def _parse_args():
    parser = argparse.ArgumentParser(description="TODO")
    parser.add_argument("spider", help="TODO")
    parser.add_argument("--job-dir", "-j", help="TODO")
    parser.add_argument("--feeds-dir", "-f", help="TODO")
    parser.add_argument("--feeds-subdir", "-F", help="TODO")
    parser.add_argument(
        "--file-tag", "-t", default=os.getenv("SCRAPER_FILE_TAG"), help="TODO"
    )
    parser.add_argument("--dont-run-before", "-d", help="TODO")
    parser.add_argument(
        "--verbose",
        "-v",
        action="count",
        default=0,
        help="log level (repeat for more verbosity)",
    )

    # parse_known_args() so that unrecognised arguments can be forwarded
    # verbatim to the underlying `scrapy crawl` command
    return parser.parse_known_args()


def main():
    """Command line entry point."""

    settings = get_project_settings()
    configure_logging(settings)

    args, remainder = _parse_args()
    LOGGER.info(args)
    LOGGER.info(remainder)

    # resolve output locations, preferring CLI arguments over defaults
    base_dir = Path(settings["BASE_DIR"]).resolve()
    cache_dir = base_dir / ".scrapy" / "httpcache"
    feeds_dir = Path(args.feeds_dir) if args.feeds_dir else base_dir / "feeds"
    feeds_dir = feeds_dir.resolve()
    feeds_dir_scraper = (
        feeds_dir / args.feeds_subdir if args.feeds_subdir else feeds_dir / args.spider
    )
    file_tag = normalize_space(args.file_tag)
    out_file = feeds_dir_scraper / "%(class)s" / f"%(time)s{file_tag}.jl"

    LOGGER.info("Output file will be <%s>", out_file)

    # job dir: CLI argument, else Scrapy settings, else a per-spider default
    from_settings = job_dir_from_settings(settings)
    job_dir = (
        Path(args.job_dir)
        if args.job_dir
        else Path(from_settings)
        if from_settings
        else base_dir / "jobs" / args.spider
    )
    job_dir = job_dir.resolve()

    cache_dir.mkdir(parents=True, exist_ok=True)
    feeds_dir_scraper.mkdir(parents=True, exist_ok=True)
    job_dir.mkdir(parents=True, exist_ok=True)

    # wait until the earliest allowed start time, if one is configured
    dont_run_before_file = job_dir / ".dont_run_before"
    dont_run_before = parse_date(
        args.dont_run_before, tzinfo=timezone.utc
    ) or date_from_file(dont_run_before_file, tzinfo=timezone.utc)

    if dont_run_before:
        LOGGER.info("Don't run before %s", dont_run_before.isoformat())
        sleep_seconds = dont_run_before.timestamp() - now().timestamp()
        if sleep_seconds > 0:
            LOGGER.info("Going to sleep for %.1f seconds", sleep_seconds)
            sleep(sleep_seconds)

    states = _find_states(
        job_dir, state_file=settings.get("STATE_TAG_FILE") or ".state"
    )

    running = sorted(sub_dir for sub_dir, state in states.items() if state == "running")

    if len(running) > 1:
        LOGGER.warning(
            "Found %d running jobs %s, please check and fix!", len(running), running
        )
        return

    if running:
        LOGGER.info("Found a running job <%s>, skipping...", running[0])
        return

    resumable = sorted(
        sub_dir for sub_dir, state in states.items() if state in RESUMABLE_STATES
    )

    if len(resumable) > 1:
        LOGGER.warning(
            "Found %d resumable jobs %s, please check and fix!",
            len(resumable),
            resumable,
        )
        return

    if resumable:
        LOGGER.info("Resuming previous job <%s>", resumable[0])

    # resume the previous job if possible, else start a new time-stamped one
    job_tag = resumable[0] if resumable else now().strftime(DATE_FORMAT)
    curr_job = job_dir / job_tag

    command = [
        "scrapy",
        "crawl",
        args.spider,
        "--output",
        str(out_file),
        "--set",
        f"JOBDIR={curr_job}",
        "--set",
        f"DONT_RUN_BEFORE_FILE={dont_run_before_file}",
    ] + remainder

    LOGGER.info("Executing command %r", command)

    try:
        execute(argv=command)
    finally:
        garbage_collect()


if __name__ == "__main__":
    main()
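
# Example invocation (a sketch, not part of the module: the spider name "bgg"
# and the CLOSESPIDER_TIMEOUT setting below are illustrative assumptions):
#
#   python -m board_game_scraper bgg --feeds-dir feeds -vv --set CLOSESPIDER_TIMEOUT=3600
#
# Arguments that _parse_args() does not recognise (here the trailing --set)
# end up in `remainder` and are forwarded verbatim to `scrapy crawl`.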