# archive.py
"""Archive each author's book-source metadata (HTTP, torrent, scraped) to JSON."""
import json

from loguru import logger

from src.setup.paths import ARCHIVE_DIR
from src.setup.types import HTTPArchive, TorrentArchive, ScrapedArchive, AuthorArchive
from src.data_preparation.sourcing import ViaHTTP, ViaTorrent, ViaScraper, Author, get_sources


class AuthorArchiver:
    """Builds JSON-serialisable archives of a single author's book sources."""

    def __init__(self, author: Author):
        self.author: Author = author

    def archive_http_downloads(self, books: list[ViaHTTP] | None) -> HTTPArchive:
        """Map the author's name to the details of every HTTP-downloadable book.

        Raises:
            ValueError: if ``books`` is None (previously an ``assert``, which
                is stripped under ``python -O``).
        """
        if books is None:
            raise ValueError("books must not be None")
        # Create the author's entry ONCE. The previous version re-assigned
        # book_archive[self.author.name] = [] inside the loop, discarding all
        # but the last book.
        book_archive: HTTPArchive = {self.author.name: []}
        for book in books:
            book_details: dict[str, str | bool | int | None] = {
                "title": book.title,
                "url": book.url,
                "format": book.format,
                "needs_ocr": book.needs_ocr,
                "start_page": book.start_page,
                "end_page": book.end_page,
            }
            book_archive[self.author.name].append(book_details)

        return book_archive

    def archive_torrent_downloads(self, books: list[ViaTorrent] | None) -> TorrentArchive:
        """Return one entry per torrent batch, labelled by its position.

        Raises:
            ValueError: if ``books`` is None.
        """
        if books is None:
            raise ValueError("books must not be None")
        # enumerate replaces books.index(batch), which was O(n^2) and would
        # mislabel batches whose magnet links are duplicated.
        return [
            {
                f"magnet #{index}": batch.magnet,
                "biographers_and_compilers": self.author.biographers_and_compilers,
            }
            for index, batch in enumerate(books)
        ]

    def archive_scraped_details(self, books: list[ViaScraper] | None) -> ScrapedArchive:
        """Map the author's name to the title/url of every scraped book.

        Raises:
            ValueError: if ``books`` is None.
        """
        if books is None:
            raise ValueError("books must not be None")
        # As in archive_http_downloads: initialise once, outside the loop,
        # so earlier books are not thrown away.
        archive: ScrapedArchive = {self.author.name: []}
        for book in books:
            archive[self.author.name].append({"title": book.title, "url": book.url})

        return archive

    def construct_archive(
        self,
        books_from_http: list[ViaHTTP] | None,
        books_from_torrent: list[ViaTorrent] | None,
        books_from_scraper: list[ViaScraper] | None,
    ) -> AuthorArchive:
        """Combine whichever source archives are available for this author.

        Returns a single archive when exactly one source is present, a tuple
        of archives (HTTP, torrent, scraper order) when several are, and
        ``None`` (with a warning) when no source is present.

        The previous 8-way boolean ``match`` contained a bug: the
        (no-HTTP, torrent, scraper) case called ``archive_http_downloads``
        with ``None``, which always raised. Guard clauses make every
        combination correct by construction.
        """
        archives: list[HTTPArchive | TorrentArchive | ScrapedArchive] = []
        if books_from_http is not None:
            archives.append(self.archive_http_downloads(books=books_from_http))
        if books_from_torrent is not None:
            archives.append(self.archive_torrent_downloads(books=books_from_torrent))
        if books_from_scraper is not None:
            archives.append(self.archive_scraped_details(books=books_from_scraper))

        if not archives:
            logger.warning(f"No book metadata have been provided for {self.author.name} (regardless of source)")
            return None
        if len(archives) == 1:
            # Preserve the original single-source return shape (bare archive,
            # not a 1-tuple).
            return archives[0]
        return tuple(archives)


def make_final_archive(authors: list[Author]):
    """Build every author's archive and dump the collection to ARCHIVE_DIR as JSON."""
    final_archive: list[AuthorArchive] = []
    for author in authors:
        archiver = AuthorArchiver(author=author)
        author_archive: AuthorArchive = archiver.construct_archive(
            books_from_http=author.books_via_http,
            books_from_torrent=author.books_via_torrent,
            books_from_scraper=author.books_via_scraper,
        )
        # NOTE(review): authors with no sources contribute a JSON null, as in
        # the original behaviour — confirm downstream readers expect this.
        final_archive.append(author_archive)

    # NOTE(review): ARCHIVE_DIR is opened as a file path despite the name —
    # presumably it points at a .json file; verify in src.setup.paths.
    with open(ARCHIVE_DIR, mode="w") as file:
        json.dump(final_archive, file)

    logger.success("Sources Archived")


if __name__ == "__main__":
    authors: list[Author] = get_sources()
    make_final_archive(authors=authors)