"""sourcing.py

Catalogue of authors and their books, plus the machinery that fetches each
book by direct HTTP download, by scraping a web page, or by torrent.

Running this module as a script downloads every book of every author
returned by ``get_sources()``.
"""
import asyncio
import json
import os
import shutil
from glob import glob
from pathlib import Path

import requests
from loguru import logger
from torrentp import TorrentDownloader
from tqdm import tqdm

from src.data_preparation.scraping import scrape
from src.setup.paths import CHROMA_DIR, DATA_DIR, OCR_IMAGES, IMAGES_IN_DOWNLOADS, make_fundamental_paths


# (connect, read) timeouts in seconds for HTTP downloads. Without a timeout,
# requests.get() can block forever on an unresponsive server.
HTTP_TIMEOUT_SECONDS: tuple[float, float] = (10.0, 120.0)


def find_raw_data_for_author(author_name: str) -> Path:
    """Return the raw-data directory of the first author whose name matches.

    Note that this calls ``get_sources()``, which constructs every ``Author``
    (and therefore creates their directories) on each call.

    Raises:
        IndexError: if no author with that exact name is registered.
    """
    for author in get_sources():
        if author.name == author_name:
            return author.path_to_raw_data

    # Same exception type as the previous `[...][0]` lookup, with a usable message.
    raise IndexError(f'No author named "{author_name}" is registered in get_sources()')


class ViaScraper:
    """A text obtained by scraping a web page between two optional markers."""

    def __init__(
        self,
        title: str,
        url: str,
        is_interview: bool = False,
        initial_marker: str | None = None,
        terminal_marker: str | None = None
    ) -> None:
        self.url: str = url
        self.title: str = title
        self.is_interview: bool = is_interview
        self.file_name: str = f"{self.title}.txt"
        self.initial_marker: str | None = initial_marker
        self.terminal_marker: str | None = terminal_marker

    def download(self, author_name: str) -> None:
        """Scrape the text into the author's raw-data directory unless it is already there.

        Raises:
            RuntimeError: if ``scrape`` does not return a string.
            IndexError: if ``author_name`` is not a registered author.
        """
        destination_path: Path = find_raw_data_for_author(author_name=author_name)
        file_path: Path = destination_path.joinpath(self.file_name)

        if file_path.exists():
            return

        logger.warning(f'Attempting to scrape "{self.title}"')

        text: str | None = scrape(
            url=self.url,
            initial_marker=self.initial_marker,
            terminal_marker=self.terminal_marker
        )

        if not isinstance(text, str):
            raise RuntimeError(f"Scraping of {self.title} failed")

        # Explicit UTF-8: the texts contain non-ASCII characters, and the
        # platform default encoding is not UTF-8 everywhere.
        _ = file_path.write_text(text, encoding="utf-8")
        logger.success("Scraped and saved text")


class ViaHTTP:
    """A book fetched with a plain HTTP GET request."""

    def __init__(
        self,
        title: str,
        url: str | None,
        format: str = "pdf",
        needs_ocr: bool = False,
        start_page: int | None = None,
        end_page: int | None = None
    ) -> None:

        self.title: str = title
        self.format: str = format
        self.url: str | None = url
        self.needs_ocr: bool = needs_ocr
        self.start_page: int | None = start_page
        self.end_page: int | None = end_page
        self.file_name: str = title.lower().replace(" ", "_")
        # NOTE(review): defined in src.setup.paths; presumably creates the base
        # directories that Author.make_paths builds on - confirm.
        make_fundamental_paths()

    def download(self, file_path: str) -> None:
        """Download the book to ``file_path`` unless a file already exists there.

        Failures (network errors, non-200 responses, write errors) are logged
        rather than raised, so one bad URL does not abort a batch.
        """
        assert self.url is not None, f'No URL was provided for "{self.title}"'

        if Path(file_path).exists():
            return

        logger.warning(f'Downloading "{self.title}"...')
        try:
            response = requests.get(url=self.url, timeout=HTTP_TIMEOUT_SECONDS)

            if response.status_code != 200:
                # Previously a non-200 response was skipped without any log entry.
                logger.error(
                    f"Unable to download {self.title}. Error: HTTP status {response.status_code}"
                )
                return

            with open(file_path, mode="wb") as file:
                _ = file.write(response.content)

        except (requests.RequestException, OSError) as error:
            logger.error(f"Unable to download {self.title}. Error: {error}")
        else:
            logger.success(f'Downloaded "{self.title}"')


class ViaTorrent:
    """A collection of files fetched from a magnet link."""

    def __init__(self, magnet: str) -> None:
        self.magnet: str = magnet

    def download(self, file_path: str) -> None:
        """Run the torrent download to completion, saving into ``file_path``."""
        torrent = TorrentDownloader(file_path=self.magnet, save_path=file_path)
        asyncio.run(torrent.start_download())

    def extract_files(self, download_path: str, author_name: str) -> None:
        """Flatten a finished torrent download.

        Text files are moved to the top of ``download_path``, images are moved
        to the author's image directory, both lists are logged as JSON, and
        every directory found under ``download_path`` is then deleted, together
        with any files of other types still inside it.
        """
        contents: list[str] = glob(download_path + "/**/*", recursive=True)
        files: list[str] = [entry for entry in contents if os.path.isfile(entry)]
        known_files: set[str] = set(files)  # O(1) membership instead of scanning the list
        directories: list[str] = [entry for entry in contents if entry not in known_files]

        # Leading dots so that only real extensions match.
        text_extensions: tuple[str, ...] = (".txt", ".pdf", ".epub", ".mobi", ".azw3")
        image_extensions: tuple[str, ...] = (".jpg", ".png")

        paths_of_downloaded_files: list[str] = []
        paths_of_downloaded_images: list[str] = []
        author_image_dir: Path = IMAGES_IN_DOWNLOADS.joinpath(author_name)
        # shutil.move() onto a missing directory would rename the file to that
        # path instead of moving it inside, so make sure the directory exists.
        author_image_dir.mkdir(parents=True, exist_ok=True)

        for file in tqdm(
            iterable=files,
            desc="Extracting files of interest..."
        ):
            file_base_name: str = os.path.basename(file)
            lowered_name: str = file.lower()

            if lowered_name.endswith(text_extensions):
                text_target: Path = Path(download_path).joinpath(file_base_name)
                if not text_target.exists():
                    shutil.move(file, download_path)

                paths_of_downloaded_files.append(str(text_target))

            elif lowered_name.endswith(image_extensions):
                image_target: Path = author_image_dir.joinpath(file_base_name)
                if not image_target.exists():
                    shutil.move(file, author_image_dir)

                paths_of_downloaded_images.append(str(image_target))

        self.log_downloaded_files(
            author_name=author_name,
            paths_of_downloaded_files=paths_of_downloaded_files,
            paths_of_downloaded_images=paths_of_downloaded_images
        )

        self.remove_book_directories(directories=directories)

    @staticmethod
    def remove_book_directories(directories: list[str]) -> None:
        """Recursively delete each listed directory that still exists."""
        for directory in tqdm(
            iterable=directories,
            desc="Deleting directories that contained the extracted files..."
        ):
            # A nested directory may already be gone because its parent was removed.
            if Path(directory).exists():
                shutil.rmtree(directory)

    def log_downloaded_files(
        self,
        author_name: str,
        paths_of_downloaded_files: list[str],
        paths_of_downloaded_images: list[str]
    ) -> None:
        """Write the extracted text and image paths to their JSON logs.

        ``downloaded_files.json`` goes in the author's data directory (the one
        ``Author.must_torrent`` reads); ``downloaded_images.json`` goes in the
        author's image directory. Existing logs are overwritten.
        """
        author_path: Path = find_raw_data_for_author(author_name=author_name).parent
        author_image_dir: Path = IMAGES_IN_DOWNLOADS.joinpath(author_name)

        logs_by_path: dict[Path, list[str]] = {
            author_path.joinpath("downloaded_files.json"): paths_of_downloaded_files,
            author_image_dir.joinpath("downloaded_images.json"): paths_of_downloaded_images
        }

        for path, logs in logs_by_path.items():
            # Mode "w" truncates an existing file; UTF-8 matches how must_torrent reads it.
            with open(path, mode="w", encoding="utf-8") as file:
                json.dump(logs, file)


class Author:
    """An author, their on-disk directories, and the sources of their books."""

    def __init__(
        self,
        name: str,
        books_via_http: list[ViaHTTP] | None = None,
        books_via_torrent: list[ViaTorrent] | None = None,
        books_via_scraper: list[ViaScraper] | None = None,
        biographers_and_compilers: list[str] | None = None
    ) -> None:

        self.name: str = name
        self.path_to_data: Path = DATA_DIR.joinpath(name)
        self.path_to_raw_data: Path = self.path_to_data.joinpath("raw")
        self.books_via_http: list[ViaHTTP] | None = books_via_http
        self.books_via_torrent: list[ViaTorrent] | None = books_via_torrent
        self.books_via_scraper: list[ViaScraper] | None = books_via_scraper
        self.biographers_and_compilers: list[str] | None = biographers_and_compilers

        self.make_paths()  # Placed here because the function requires self.path_to_data

        # Snapshot of the regular files in the raw-data directory at construction time.
        candidates = (self.path_to_raw_data.joinpath(entry) for entry in os.listdir(self.path_to_raw_data))
        self.file_paths: list[Path] = [candidate for candidate in candidates if candidate.is_file()]

    def download_books(self) -> None:
        """Run every configured download method, in the order http, torrents, scraper.

        Raises:
            ValueError: if no download method has been configured at all.
        """
        has_http: bool = self.books_via_http is not None
        has_torrents: bool = self.books_via_torrent is not None
        has_scraper: bool = self.books_via_scraper is not None

        if not (has_http or has_torrents or has_scraper):
            raise ValueError(
                f"Across download methods, no information on any books have been provided for {self.name}"
            )

        if has_http:
            self.download_via_http()
        if has_torrents:
            self.download_via_torrents()
        if has_scraper:
            self.download_via_scraper()

    def download_via_http(self) -> None:
        """Download each HTTP book into the raw-data directory."""
        assert self.books_via_http is not None

        for book in self.books_via_http:
            # Use the book's declared format instead of assuming every download is a PDF.
            file_path: Path = self.path_to_raw_data.joinpath(f"{book.file_name}.{book.format}")
            book.download(file_path=str(file_path))

    def download_via_torrents(self) -> None:
        """Download and extract each torrent that ``must_torrent`` says is still needed."""
        assert self.books_via_torrent is not None

        for book in self.books_via_torrent:
            if self.must_torrent():
                self.leech(book=book)

    def download_via_scraper(self) -> None:
        """Scrape each configured text into the raw-data directory."""
        assert self.books_via_scraper is not None

        for book in self.books_via_scraper:
            book.download(author_name=self.name)

    def must_torrent(self) -> bool:
        """Report whether the torrent still needs to be downloaded.

        Returns False only when ``downloaded_files.json`` exists, is non-empty,
        and lists as many paths as there are files under the raw-data directory.
        """
        log_path: Path = self.path_to_data.joinpath("downloaded_files.json")
        if not log_path.exists():
            return True

        with open(log_path, mode="r", encoding="utf-8") as file:
            logged_paths: list[str] = json.load(file)

        contents: list[str] = glob(str(self.path_to_raw_data) + "/**/*", recursive=True)
        files_only: list[str] = [entry for entry in contents if os.path.isfile(entry)]

        if logged_paths and len(files_only) == len(logged_paths):
            logger.success(f"All files associated with {self.name} are available")
            return False

        logger.warning(f"Some of {self.name}'s files are missing.")
        return True

    def leech(self, book: ViaTorrent) -> None:
        """Download one torrent into the raw-data directory, then extract its files."""
        book.download(file_path=str(self.path_to_raw_data))
        book.extract_files(download_path=str(self.path_to_raw_data), author_name=self.name)

    def make_paths(self) -> None:
        """Create this author's data, raw-data, OCR, Chroma and image directories."""
        paths_to_create: list[Path] = [
            self.path_to_data,
            self.path_to_raw_data,
            OCR_IMAGES.joinpath(self.name),
            CHROMA_DIR.joinpath(self.name),
            IMAGES_IN_DOWNLOADS.joinpath(self.name)
        ]

        for path in paths_to_create:
            # parents=True tolerates a missing base directory; exist_ok=True
            # avoids the race between an exists() check and the creation.
            Path(path).mkdir(parents=True, exist_ok=True)


def get_sources() -> list[Author]:
    """Build and return every known author with their book sources.

    Constructing an ``Author`` creates its directories, so calling this
    function has filesystem side effects.
    """
    marx = Author(
        name="Karl Marx",
        books_via_http=[
            ViaHTTP(
                title="Capital Vol I",
                url="https://www.marxists.org/archive/marx/works/download/pdf/Capital-Volume-I.pdf",
                start_page=None,
                end_page=None
            ),

            ViaHTTP(
                title="Capital Vol II",
                url="https://www.marxists.org/archive/marx/works/download/pdf/Capital-Volume-II.pdf",
                start_page=None,
                end_page=None
            ),

            ViaHTTP(
                title="Capital Vol III",
                url="https://www.marxists.org/archive/marx/works/download/pdf/Capital-Volume-III.pdf",
                start_page=None,
                end_page=None
            ),

            ViaHTTP(
                title="Value, Price & Profit",
                url="https://www.marxists.org/archive/marx/works/download/pdf/value-price-profit.pdf",
                start_page=None,
                end_page=None
            ),

            ViaHTTP(
                title="Wage, Labour & Capital",
                url="https://www.marxists.org/archive/marx/works/download/pdf/wage-labour-capital.pdf",
                start_page=None,
                end_page=None
            ),

            ViaHTTP(
                title="The Communist Manifesto",
                url="https://www.marxists.org/admin/books/manifesto/Manifesto.pdf",
                start_page=30,
                end_page=112
            ),
        ]
    )

    mao = Author(
        name="Mao Zedong",
        books_via_http=[
            ViaHTTP(
                title="Oppose Book Worship",
                url="https://www.marxists.org/ebooks/mao/Oppose_Book_Worship_-_Mao_Zedong.pdf",
                start_page=2,
                end_page=12
            ),

            ViaHTTP(
                title="Selected Works of Mao Tse-Tung Volume I",
                url="https://www.marxists.org/reference/archive/mao/selected-works/sw-in-pdf/sw-flp-1965-v1.pdf",
                start_page=20,
                end_page=353
            ),

            ViaHTTP(
                title="Selected Works of Mao Tse-Tung Volume II",
                url="https://www.marxists.org/reference/archive/mao/selected-works/sw-in-pdf/sw-flp-1965-v2.pdf",
                start_page=18,
                end_page=473
            ),

            ViaHTTP(
                title="Selected Works of Mao Tse-Tung Volume III",
                url="https://www.marxists.org/reference/archive/mao/selected-works/sw-in-pdf/sw-flp-1965-v3.pdf",
                start_page=16,
                end_page=345
            ),

            ViaHTTP(
                title="Selected Works of Mao Tse-Tung Volume IV",
                url="https://www.marxists.org/reference/archive/mao/selected-works/sw-in-pdf/sw-flp-1965-v4.pdf",
                start_page=17,
                end_page=463
            ),

            ViaHTTP(
                title="Selected Works of Mao Tse-Tung Volume V",
                url="https://www.marxists.org/reference/archive/mao/selected-works/sw-in-pdf/sw-flp-1971-v5.pdf",
                start_page=22,
                end_page=524
            )
        ],
        books_via_scraper=[
            ViaScraper(
                title="Combat Liberalism",
                url="https://www.marxists.org/reference/archive/mao/selected-works/volume-2/mswv2_03.htm",
                initial_marker="We stand for",
                terminal_marker="Transcription"
            )
        ]
    )

    garvey = Author(
        name="Marcus Garvey",
        books_via_http=[
            ViaHTTP(
                title="The Philosophy & Opinions of Marcus Garvey",
                url="https://ia801208.us.archive.org/5/items/ThePhilosophyOpinionsOfMarcusGarveyOrAfricaForTheAfricans/EbookPhilAndOpinions.pdf",
                start_page=3,
                end_page=62
            ),
        ]
    )

    vivekananda = Author(
        name="Swami Vivekananda",
        books_via_http=[
            ViaHTTP(
                title="The Complete Works of Swami Vivekananda",
                url="https://ia801608.us.archive.org/9/items/complete-works-of-swami-vivekananda-all-volumes-swami-vivekananda/Complete%20Works%20of%20Swami%20Vivekananda%20-%20%20All%20Volumes%20-%20Swami%20Vivekananda.pdf",
                start_page=81,
                end_page=5162
            )
        ]
    )

    # NOTE(review): "Pretrovna" looks like a misspelling of "Petrovna". The name
    # is also the on-disk directory name, so it is left unchanged here.
    blavatsky = Author(
        name="Helena Pretrovna Blavatsky",
        books_via_http=[
            ViaHTTP(
                title="The Secret Doctrine (Volume I)",
                url="https://www.gutenberg.org/files/54824/54824-pdf.pdf",
                start_page=12,
                end_page=971
            ),

            ViaHTTP(
                title="The Secret Doctrine (Volume II)",
                url="https://www.gutenberg.org/files/54488/54488-pdf.pdf",
                start_page=23,
                end_page=1156
            ),

            ViaHTTP(
                title="The Secret Doctrine (Volume III)",
                url="https://www.gutenberg.org/files/56880/56880-pdf.pdf",
                start_page=9,
                end_page=796
            ),
        ],

        books_via_scraper=[
            ViaScraper(
                title="The Secret Doctrine (Volume IV)",
                url="https://www.gutenberg.org/ebooks/61626.epub.noimages",
            )
        ]

    )

    # NOTE(review): "Ghandi" looks like a misspelling of "Gandhi". The name is
    # also the on-disk directory name, so it is left unchanged here.
    gandhi = Author(
        name="Mohandas Karamchand Ghandi",
        books_via_http=[
            ViaHTTP(
                title="An Autobiography: The Story of My Experiments with Truth",
                url="https://www.mkgandhi.org/ebks/An-Autobiography.pdf",
                start_page=16,
                end_page=556
            ),

            ViaHTTP(
                title="Hind Swaraj or Indian Home Rule",
                url="https://www.mkgandhi.org/ebks/hind_swaraj.pdf",
                start_page=12,
                end_page=89
            ),

            ViaHTTP(
                title="The Bhagavad Gita, According to Gandhi",
                url="https://ia800904.us.archive.org/10/items/InnerEngineeringAYogisGuideToJoy_20190116/Mahatma_gandhiThe_bhagavad_gita_according_to_gandhi.pdf",
                start_page=10,
                end_page=177
            ),

            ViaHTTP(
                title="Non-Violent Resistance",
                url="https://dn720701.ca.archive.org/0/items/nonviolentresist00mkga/nonviolentresist00mkga.pdf",
                start_page=16,
                end_page=388
            )
        ]
    )

    rai = Author(
        name="Lala Lajpat Rai",
        books_via_http=[
            ViaHTTP(
                title="The Story of My Deportation",
                url="https://ia601503.us.archive.org/21/items/in.ernet.dli.2015.19903/2015.19903.The--Story-Of-My-Deportation_text.pdf",
                start_page=8,
                end_page=274,
                needs_ocr=True
            ),

            ViaHTTP(
                title="Young India: An Interpretation and a History of the Nationalist Movement from Within",
                url="https://ia800802.us.archive.org/21/items/16RaiYoungindia/16-rai-youngindia.pdf",
                start_page=7,
                end_page=294
            ),

        ]
    )

    rizal = Author(
        name="Jose Rizal",
        books_via_http=[
            ViaHTTP(
                title="The Social Cancer",
                url="https://www.geocities.ws/qcpujoserizal/Rizal/pdf/Noli.pdf",
            ),
        ],

        books_via_scraper=[
            ViaScraper(
                title="The Reign of Greed",
                url="https://www.gutenberg.org/files/10676/10676-h/10676-h.htm",
                initial_marker="On the Upper Deck",
                terminal_marker="Colophon"
            )
        ]
    )

    lenin = Author(
        name="Vladimir Lenin",
        books_via_http=[
            ViaHTTP(
                title="What Is to Be Done?: Burning Questions of our Movements",
                url="https://www.marxists.org/ebooks/lenin/what-is-to-be-done.pdf",
                start_page=7,
                end_page=124
            ),

            ViaHTTP(
                title="The State and Revolution",
                url="https://www.marxists.org/ebooks/lenin/state-and-revolution.pdf",
                start_page=7,
                end_page=83
            ),

        ]
    )

    yat_sen = Author(
        name="Sun Yat-sen",
        books_via_http=[
            ViaHTTP(
                title="The Three Principles of the People",
                url="https://chinese.larouchepub.com/wp-content/uploads/2017/05/San-Min-Chu-I_ALL-en.pdf",
                start_page=3,
                end_page=74
            ),

            ViaHTTP(
                title="The International Development of China",
                url="https://chinese.larouchepub.com/wp-content/uploads/2017/05/sun_IDC-en.pdf",
                start_page=15,
                end_page=305
            ),
        ]
    )

    nietzsche = Author(
        name="Friedrich Nietzsche",
        books_via_scraper=[
            ViaScraper(
                title="Thus Spake Zarathustra",
                url="https://www.gutenberg.org/files/1998/1998-h/1998-h.htm",
                initial_marker="ZARATHUSTRA’S DISCOURSES.",
                terminal_marker="APPENDIX"
            ),

            ViaScraper(
                title="Beyond Good and Evil",
                url="https://www.gutenberg.org/cache/epub/4363/pg4363-images.html",
                initial_marker="CHAPTER I. ",
                terminal_marker="FROM THE HEIGHTS"
            ),

            ViaScraper(
                title="The Genealogy of Morals",
                url="https://www.gutenberg.org/cache/epub/52319/pg52319-images.html",
                initial_marker="FIRST ESSAY.",
                terminal_marker="betray us!"
            ),

            ViaScraper(
                title="The Antichrist",
                url="https://www.gutenberg.org/cache/epub/19322/pg19322-images.html",
                initial_marker="PREFACE",
                terminal_marker="THE END"
            ),

            ViaScraper(
                title="Human, All Too Human",
                url="https://www.gutenberg.org/cache/epub/38145/pg38145-images.html",
                initial_marker="PREFACE.",
                terminal_marker="sinfulness."
            ),

            ViaScraper(
                title="Ecce Homo",
                url="https://www.gutenberg.org/cache/epub/52190/pg52190-images.html",
                initial_marker="PREFACE.",
                terminal_marker="It really seems that we have a path."
            ),

        ]
    )

    kropotkin = Author(
        name="Pyotr Kropotkin",
        books_via_scraper=[
            ViaScraper(
                title="Mutual Aid: A Factor of Evolution",
                url="https://www.gutenberg.org/cache/epub/4341/pg4341-images.html",
                initial_marker="Two aspects",
                terminal_marker="our race"
            ),

            ViaScraper(
                title="The Conquest of Bread",
                url="https://www.gutenberg.org/cache/epub/23428/pg23428-images.html",
                initial_marker="OUR RICHES",
                terminal_marker="Social Revolution."
            )
        ]

    )

    bakunin = Author(
        name="Mikhail Bakunin",
        books_via_scraper=[
            ViaScraper(
                title="God and the State",
                url="https://www.gutenberg.org/cache/epub/36568/pg36568-images.html",
                initial_marker="Elisée Reclus.",
                terminal_marker="Genius of Christianity"
            )
        ]
    )

    proudhon = Author(
        name="Pierre-Joseph Proudhon",
        books_via_scraper=[
            ViaScraper(
                title="What is Property? An Inquiry into the Principle of Right and of Government",
                url="https://www.gutenberg.org/cache/epub/360/pg360-images.html",
                initial_marker="FIRST MEMOIR.",
                terminal_marker="and absurd."
            ),

            ViaScraper(
                title="System of Economical Contradictions; Or, The Philosophy of Misery",
                url="https://www.gutenberg.org/cache/epub/444/pg444-images.html",
                initial_marker="Before entering",
                terminal_marker="reason of our existence."
            )
        ]

    )

    berkman = Author(
        name="Alexander Berkman",
        books_via_scraper=[
            ViaScraper(
                title="Prison Memoirs of an Anarchist",
                url="https://www.gutenberg.org/cache/epub/34406/pg34406-images.html",
                initial_marker="The Call of Homestead",
                terminal_marker="I have found work to do."
            ),

            ViaScraper(
                title="Deportation - Its Meaning & Menace",
                url="https://www.gutenberg.org/cache/epub/68442/pg68442-images.html",
                initial_marker="DEPORTATION—Its Meaning and Menace",
                terminal_marker="but also of reward."
            )
        ]
    )

    sun_tzu = Author(
        name="Sun Tzu",
        books_via_http=[
            ViaHTTP(
                title="The Art of War",
                url="https://sites.ualberta.ca/~enoch/Readings/The_Art_Of_War.pdf",
                start_page=3,
                end_page=65
            )
        ]
    )

    return [
        marx, garvey, gandhi, yat_sen, lenin, kropotkin, rizal, nietzsche,
        mao, bakunin, proudhon, berkman, sun_tzu, vivekananda, rai, blavatsky
    ]


if __name__ == "__main__":
    for author in get_sources():
        author.download_books()
        logger.info("Next author...")