/ src / data_preparation / archive.py
archive.py
  1  import json 
  2  from loguru import logger
  3  
  4  from src.setup.paths import ARCHIVE_DIR 
  5  from src.setup.types import HTTPArchive, TorrentArchive, ScrapedArchive, AuthorArchive 
  6  from src.data_preparation.sourcing import ViaHTTP, ViaTorrent, ViaScraper, Author, get_sources
  7  
  8  
  9  class AuthorArchiver:
 10      def __init__(self, author: Author):
 11          self.author: Author = author
 12  
 13      def archive_http_downloads(self, books: list[ViaHTTP] | None) -> HTTPArchive:
 14          assert books != None
 15          book_archive: HTTPArchive = {}
 16          for book in books:
 17              book_archive[self.author.name] =  []
 18              book_details: dict[str, str | bool | int | None] = {
 19                  "title": book.title,
 20                  "url": book.url,
 21                  "format": book.format,
 22                  "needs_ocr": book.needs_ocr,
 23                  "start_page": book.start_page, 
 24                  "end_page": book.end_page, 
 25              }
 26  
 27              book_archive[self.author.name].append(book_details)
 28  
 29          return book_archive
 30  
 31      def archive_torrent_downloads(self, books: list[ViaTorrent] | None) -> TorrentArchive: 
 32          assert books != None
 33          torrent_archive: TorrentArchive = [] 
 34          for batch in books:
 35              torrent_archive.append({
 36                  f"magnet #{books.index(batch)}": batch.magnet,
 37                  "biographers_and_compilers": self.author.biographers_and_compilers
 38              })
 39  
 40          return torrent_archive
 41  
 42      def archive_scraped_details(self, books: list[ViaScraper] | None) -> ScrapedArchive: 
 43          assert books != None
 44          archive: ScrapedArchive = {} 
 45          for book in books:
 46              archive[self.author.name] =  []
 47              book_details: dict[str, str] = {"title": book.title, "url": book.url}
 48              archive[self.author.name].append(book_details)
 49  
 50          return archive
 51  
 52      def construct_archive(
 53          self, 
 54          books_from_http: list[ViaHTTP] | None, 
 55          books_from_torrent: list[ViaTorrent] | None, 
 56          books_from_scraper: list[ViaScraper] | None 
 57          ) -> AuthorArchive:
 58  
 59          match (books_from_http!= None, books_from_torrent != None, books_from_scraper != None):
 60  
 61              case (True, True, True):
 62                  return (
 63                          self.archive_http_downloads(books=books_from_http), 
 64                          self.archive_torrent_downloads(books=books_from_torrent), 
 65                          self.archive_scraped_details(books=books_from_scraper)
 66                  )
 67  
 68              case (True, False, False): 
 69                  return self.archive_http_downloads(books=books_from_http)
 70              case (False, True, False):
 71                  return self.archive_torrent_downloads(books=books_from_torrent)
 72              case(False, False, True):
 73                  return self.archive_scraped_details(books=books_from_scraper)
 74              case(False, True, True):
 75                  return self.archive_http_downloads(books=books_from_http), self.archive_scraped_details(books=books_from_scraper)
 76              case(True, False, True):
 77                  return self.archive_http_downloads(books=books_from_http), self.archive_scraped_details(books=books_from_scraper)
 78              case (True, True, False): 
 79                  return self.archive_http_downloads(books=books_from_http), self.archive_torrent_downloads(books=books_from_torrent)
 80              case (False, False, False):
 81                  logger.warning(f"No book metadata have been provided for {self.author.name} (regardless of source)")
 82  
 83  
 84  def make_final_archive(authors: list[Author]):
 85      final_archive: list[AuthorArchive] = [] 
 86      for author in authors:
 87          books_from_http: list[ViaHTTP] | None = author.books_via_http 
 88          books_from_torrent: list[ViaTorrent] | None = author.books_via_torrent
 89          books_from_scraper: list[ViaScraper] | None = author.books_via_scraper
 90  
 91          archiver = AuthorArchiver(author=author)
 92          author_archive: AuthorArchive = archiver.construct_archive(
 93              books_from_http=books_from_http, 
 94              books_from_torrent=books_from_torrent, 
 95              books_from_scraper=books_from_scraper
 96          ) 
 97  
 98          final_archive.append(author_archive)
 99  
100      with open(ARCHIVE_DIR, mode="w") as file:
101          json.dump(final_archive, file)
102          
103      logger.success("Sources Archived") 
104  
105  
if __name__ == "__main__":
    # Script entry point: fetch the author list from get_sources() and archive it.
    make_final_archive(authors=get_sources())
109