/ src / data_preparation / sourcing.py
sourcing.py
  1  import os
  2  import json
  3  import shutil
  4  import asyncio
  5  import requests
  6  from glob import glob
  7  from pathlib import Path
  8  
  9  from tqdm import tqdm
 10  from loguru import logger
 11  from torrentp import TorrentDownloader
 12  
 13  from src.data_preparation.scraping import scrape 
 14  from src.setup.paths import CHROMA_DIR, DATA_DIR, OCR_IMAGES, IMAGES_IN_DOWNLOADS, make_fundamental_paths
 15  
 16  
 17  def find_raw_data_for_author(author_name: str) -> Path:
 18      return [author.path_to_raw_data for author in get_sources() if author_name == author.name][0]
 19  
 20  
 21  class ViaScraper:
 22      def __init__(
 23          self, 
 24          title: str, 
 25          url: str, 
 26          is_interview: bool = False,
 27          initial_marker: str | None = None,
 28          terminal_marker: str | None = None
 29      ) -> None:
 30          self.url: str = url
 31          self.title: str = title
 32          self.is_interview: bool = is_interview
 33          self.file_name: str = f"{self.title}.txt"
 34          self.initial_marker: str | None = initial_marker
 35          self.terminal_marker: str | None = terminal_marker 
 36  
 37      def download(self, author_name: str) -> None:
 38  
 39          destination_path: Path = find_raw_data_for_author(author_name=author_name)
 40          file_path: Path = destination_path.joinpath(f"{self.file_name}")
 41          
 42          if not Path(file_path).exists():
 43              logger.warning(f'Attempting to scrape "{self.title}"')
 44              
 45              text: str | None = scrape(
 46                  url=self.url, 
 47                  initial_marker=self.initial_marker,
 48                  terminal_marker=self.terminal_marker
 49              ) 
 50  
 51              if isinstance(text, str):
 52                  _ = Path(file_path).write_text(text)
 53                  logger.success("Scraped and saved text")
 54              else:
 55                  raise Exception(f"Scraping of {self.title} failed")
 56  
 57  
 58  class ViaHTTP:
 59      def __init__(
 60          self, 
 61          title: str, 
 62          url: str | None, 
 63          format: str = "pdf",
 64          needs_ocr: bool = False, 
 65          start_page: int | None = None, 
 66          end_page: int | None = None
 67      ) -> None:
 68  
 69          self.title: str = title
 70          self.format: str = format
 71          self.url: str | None = url 
 72          self.needs_ocr: bool = needs_ocr
 73          self.start_page: int | None = start_page
 74          self.end_page: int | None = end_page
 75          self.file_name: str = title.lower().replace(" ", "_") 
 76          make_fundamental_paths()
 77           
 78      def download(self, file_path: str) -> None:
 79          assert self.url != None
 80          if not Path(file_path).exists():
 81              logger.warning(f'Downloading "{self.title}"...')
 82              try:
 83                  response = requests.get(url=self.url)
 84                  if response.status_code == 200:
 85                      with open(file_path, mode="wb") as file:
 86                          _ = file.write(response.content)
 87  
 88                      logger.success(f'Downloaded "{self.title}"')
 89                 
 90              except Exception as error:
 91                  logger.error(f"Unable to download {self.title}. Error: {error}")
 92      
 93  
 94  class ViaTorrent:
 95      def __init__(self, magnet: str) -> None:
 96          self.magnet: str = magnet 
 97  
 98      def download(self, file_path: str):
 99          torrent = TorrentDownloader(file_path=self.magnet, save_path=file_path)
100          asyncio.run(torrent.start_download())
101         
102      def extract_files(self, download_path: str, author_name: str) -> None:
103  
104          contents: list[str] = glob(download_path + "/**/*", recursive=True) 
105          files: list[str] = [object for object in contents if os.path.isfile(object)]
106          directories: list[str] = [object for object in contents if object not in files]
107          text_extensions: tuple[str, str, str, str, str] = ("txt", "pdf", "epub", "mobi", "azw3")
108          image_extensions: tuple[str, str] = ("jpg", "png")
109  
110          paths_of_downloaded_files: list[str] = []
111          paths_of_downloaded_images: list[str] = []
112          author_image_dir: Path = IMAGES_IN_DOWNLOADS.joinpath(author_name)
113  
114          for file in tqdm(
115              iterable=files,
116              desc="Extracting files of interest..."
117          ):
118              file_base_name: str = os.path.basename(file)
119              file_is_text: bool = file.lower().endswith(text_extensions) 
120              file_is_image: bool = file.lower().endswith(image_extensions) 
121  
122              if file_is_text: 
123                  if not Path(download_path).joinpath(file_base_name).exists():
124                      shutil.move(file, download_path)
125  
126                  paths_of_downloaded_files.append(
127                      str(Path(download_path + f"/{file_base_name}"))
128                  )
129  
130              elif file_is_image: 
131                  if not Path(author_image_dir.joinpath(f"{file_base_name}")).exists():
132                      shutil.move(file, author_image_dir)
133  
134                  paths_of_downloaded_images.append(
135                      str(author_image_dir.joinpath(f"{file_base_name}"))
136                  )
137          
138          self.log_downloaded_files(
139              author_name=author_name, 
140              paths_of_downloaded_files=paths_of_downloaded_files,
141              paths_of_downloaded_images=paths_of_downloaded_images
142          )
143  
144          self.remove_book_directories(directories=directories)
145  
146      @staticmethod
147      def remove_book_directories(directories: list[str]) -> None:
148  
149          for directory in tqdm(
150              iterable=directories,
151              desc="Deleting directories that contained the extracted files..."
152          ): 
153              if Path(directory).exists():
154                  shutil.rmtree(directory)
155  
156      def log_downloaded_files(
157          self, 
158          author_name: str, 
159          paths_of_downloaded_files: list[str],
160          paths_of_downloaded_images: list[str]
161      ) -> None:
162  
163          author_path: Path = find_raw_data_for_author(author_name=author_name).parent
164          author_image_dir: Path = IMAGES_IN_DOWNLOADS.joinpath(author_name)
165  
166          object_types_and_paths: dict[Path, list[str]] = {
167              author_path.joinpath("downloaded_files.json"): paths_of_downloaded_files,
168              author_image_dir.joinpath("downloaded_images.json"): paths_of_downloaded_images 
169          }
170  
171          for path, logs in object_types_and_paths.items(): 
172  
173              if Path(path).exists():
174                  os.remove(path)
175  
176              with open(path, mode="w") as file:
177                  json.dump(logs, file)
178  
179  
class Author:
    """An author, the on-disk locations of their data, and the sources of their books.

    Creating an instance touches the filesystem: ``make_paths`` creates the
    author's directories, after which the files already present in the
    raw-data directory are listed into ``file_paths``.
    """

    def __init__(
        self,
        name: str,
        books_via_http: list[ViaHTTP] | None = None,
        books_via_torrent: list[ViaTorrent] | None = None,
        books_via_scraper: list[ViaScraper] | None = None,
        biographers_and_compilers: list[str] | None = None
    ) -> None:
        """Set up the author's paths and remember where their books come from.

        Args:
            name: Name of the author; also the name of their directories.
            books_via_http: Books downloaded over HTTP, or ``None`` if there are none.
            books_via_torrent: Books downloaded via torrents, or ``None`` if there are none.
            books_via_scraper: Books scraped from web pages, or ``None`` if there are none.
            biographers_and_compilers: Stored on the instance; not used inside this class.
        """
        self.name: str = name
        self.path_to_data: Path = DATA_DIR.joinpath(name)
        self.path_to_raw_data: Path = self.path_to_data.joinpath("raw")
        self.books_via_http: list[ViaHTTP] | None = books_via_http
        self.books_via_torrent: list[ViaTorrent] | None = books_via_torrent
        self.books_via_scraper: list[ViaScraper] | None = books_via_scraper
        self.biographers_and_compilers: list[str] | None = biographers_and_compilers

        # Runs after the path attributes are set (it needs them) and before the
        # raw-data directory is listed below (it creates that directory).
        self.make_paths()

        self.file_paths: list[Path] = [
            entry for entry in self.path_to_raw_data.iterdir() if entry.is_file()
        ]

    def download_books(self) -> None:
        """Download the author's books with every method that was configured.

        The methods always run in the order HTTP, scraper, torrents. Torrents
        go last because extracting a torrent logs every text file found in the
        raw-data directory, which ``must_torrent`` later compares against.

        Raises:
            Exception: If no download method has any books configured.
        """
        steps = (
            (self.books_via_http, self.download_via_http),
            (self.books_via_scraper, self.download_via_scraper),
            (self.books_via_torrent, self.download_via_torrents),
        )
        pending = [run for books, run in steps if books is not None]

        if not pending:
            raise Exception(f"Across download methods, no information on any books have been provided for {self.name}")

        for run in pending:
            run()

    def download_via_http(self) -> None:
        """Download every HTTP book into the raw-data directory.

        Each file is named ``<book.file_name>.<book.format>``.

        Raises:
            ValueError: If no HTTP books were configured.
        """
        if self.books_via_http is None:
            raise ValueError(f"No HTTP books have been provided for {self.name}")

        for book in self.books_via_http:
            # Honour the book's declared format instead of assuming a PDF
            file_path: Path = self.path_to_raw_data.joinpath(f"{book.file_name}.{book.format}")
            book.download(file_path=str(file_path))

    def download_via_torrents(self) -> None:
        """Download and extract every torrent when the author's files are incomplete.

        Raises:
            ValueError: If no torrents were configured.
        """
        if self.books_via_torrent is None:
            raise ValueError(f"No torrents have been provided for {self.name}")

        if not self.books_via_torrent:
            return

        # Decide once for all torrents: extracting the first one rewrites the
        # log that must_torrent reads, so a per-torrent check could wrongly
        # skip the remaining ones.
        if not self.must_torrent():
            return

        for book in self.books_via_torrent:
            self.leech(book=book)

    def download_via_scraper(self) -> None:
        """Scrape every configured text into the raw-data directory.

        Raises:
            ValueError: If no texts to scrape were configured.
        """
        if self.books_via_scraper is None:
            raise ValueError(f"No texts to scrape have been provided for {self.name}")

        for book in self.books_via_scraper:
            book.download(author_name=self.name)

    def must_torrent(self) -> bool:
        """Report whether the author's torrents need to be downloaded.

        Returns:
            ``True`` when ``downloaded_files.json`` does not exist, or when the
            number of files under the raw-data directory differs from the
            number of logged paths, or when the log is empty; ``False``
            otherwise.
        """
        log_path: Path = self.path_to_data.joinpath("downloaded_files.json")

        if not log_path.exists():
            return True

        contents: list[str] = glob(str(self.path_to_raw_data) + "/**/*", recursive=True)
        files_only: list[str] = [entry for entry in contents if os.path.isfile(entry)]

        with open(log_path, mode="r", encoding="utf-8") as file:
            logged_paths: list[str] = json.load(file)

        # NOTE(review): only the counts are compared, not the paths themselves,
        # so any file added to the raw-data directory after the log was written
        # makes this report missing files — confirm that is intended.
        if logged_paths and len(files_only) == len(logged_paths):
            logger.success(f"All files associated with {self.name} are available")
            return False

        logger.warning(f"Some of {self.name}'s files are missing.")
        return True

    def leech(self, book: ViaTorrent) -> None:
        """Download one torrent into the raw-data directory and extract its files."""
        raw_directory: str = str(self.path_to_raw_data)
        book.download(file_path=raw_directory)
        book.extract_files(download_path=raw_directory, author_name=self.name)

    def make_paths(self) -> None:
        """Create the author's data, raw-data, OCR-image, Chroma and image directories.

        Missing parent directories are created as well, and directories that
        already exist are left untouched.
        """
        paths_to_create: list[Path] = [
            self.path_to_data,
            self.path_to_raw_data,
            OCR_IMAGES.joinpath(self.name),
            CHROMA_DIR.joinpath(self.name),
            IMAGES_IN_DOWNLOADS.joinpath(self.name)
        ]

        for path in paths_to_create:
            Path(path).mkdir(parents=True, exist_ok=True)
291  
292  
293  
294  
def get_sources() -> list[Author]:
    """Build and return the catalogue of authors and the sources of their books.

    Every call constructs all ``Author`` objects anew. Constructing an
    ``Author`` creates that author's directories (via ``Author.make_paths``)
    and constructing a ``ViaHTTP`` calls ``make_fundamental_paths()``, so this
    function has filesystem side effects.

    Each author's ``name`` is also the name of their directory under
    ``DATA_DIR``, so changing a name here changes where their data is stored.

    Returns:
        The authors, in the fixed order given by the list at the end of the
        function (which differs from the order in which they are defined).
    """
   
    marx = Author(
        name="Karl Marx",
        books_via_http=[
            ViaHTTP(
                title="Capital Vol I",
                url="https://www.marxists.org/archive/marx/works/download/pdf/Capital-Volume-I.pdf",
                start_page=None,
                end_page=None
            ),

            ViaHTTP(
                title="Capital Vol II",
                url="https://www.marxists.org/archive/marx/works/download/pdf/Capital-Volume-II.pdf",
                start_page=None,
                end_page=None
            ),

            ViaHTTP(
                title="Capital Vol III",
                url="https://www.marxists.org/archive/marx/works/download/pdf/Capital-Volume-III.pdf",
                start_page=None,
                end_page=None
            ),

            ViaHTTP(
                title="Value, Price & Profit",
                url="https://www.marxists.org/archive/marx/works/download/pdf/value-price-profit.pdf",
                start_page=None,
                end_page=None
            ),

            ViaHTTP(
                title="Wage, Labour & Capital",
                url="https://www.marxists.org/archive/marx/works/download/pdf/wage-labour-capital.pdf",
                start_page=None,
                end_page=None
            ),

            ViaHTTP(
                title="The Communist Manifesto",
                url="https://www.marxists.org/admin/books/manifesto/Manifesto.pdf",
                start_page=30,
                end_page=112
            ),
        ]
    )


    mao = Author(
        name="Mao Zedong",
        books_via_http=[
            ViaHTTP(
                title="Oppose Book Worship",
                url="https://www.marxists.org/ebooks/mao/Oppose_Book_Worship_-_Mao_Zedong.pdf",
                start_page=2,
                end_page=12
            ),

            ViaHTTP(
                title="Selected Works of Mao Tse-Tung Volume I",
                url="https://www.marxists.org/reference/archive/mao/selected-works/sw-in-pdf/sw-flp-1965-v1.pdf",
                start_page=20,
                end_page=353
            ),

            ViaHTTP(
                title="Selected Works of Mao Tse-Tung Volume II",
                url="https://www.marxists.org/reference/archive/mao/selected-works/sw-in-pdf/sw-flp-1965-v2.pdf",
                start_page=18,
                end_page=473
            ),

            ViaHTTP(
                title="Selected Works of Mao Tse-Tung Volume III",
                url="https://www.marxists.org/reference/archive/mao/selected-works/sw-in-pdf/sw-flp-1965-v3.pdf",
                start_page=16,
                end_page=345
            ),

            ViaHTTP(
                title="Selected Works of Mao Tse-Tung Volume IV",
                url="https://www.marxists.org/reference/archive/mao/selected-works/sw-in-pdf/sw-flp-1965-v4.pdf",
                start_page=17,
                end_page=463
            ),

            ViaHTTP(
                title="Selected Works of Mao Tse-Tung Volume V",
                url="https://www.marxists.org/reference/archive/mao/selected-works/sw-in-pdf/sw-flp-1971-v5.pdf",
                start_page=22,
                end_page=524
            )
        ],
        books_via_scraper=[
            ViaScraper(
                title="Combat Liberalism",
                url="https://www.marxists.org/reference/archive/mao/selected-works/volume-2/mswv2_03.htm",
                initial_marker="We stand for",
                terminal_marker="Transcription"
            )
        ]
    )


    garvey = Author(
        name="Marcus Garvey",
        books_via_http=[
            ViaHTTP(
                title="The Philosophy & Opinions of Marcus Garvey",
                url="https://ia801208.us.archive.org/5/items/ThePhilosophyOpinionsOfMarcusGarveyOrAfricaForTheAfricans/EbookPhilAndOpinions.pdf",
                start_page=3,
                end_page=62
            ),
        ]
    )


    vivekananda = Author(
        name="Swami Vivekananda",
        books_via_http=[
            ViaHTTP(
                title="The Complete Works of Swami Vivekananda",
                url="https://ia801608.us.archive.org/9/items/complete-works-of-swami-vivekananda-all-volumes-swami-vivekananda/Complete%20Works%20of%20Swami%20Vivekananda%20-%20%20All%20Volumes%20-%20Swami%20Vivekananda.pdf",
                start_page=81,
                end_page=5162
            )
        ]
    )


    # NOTE(review): the name is spelled "Pretrovna" (usually "Petrovna"). It is
    # also the directory name under DATA_DIR, so correcting it means moving
    # any data already stored under the current spelling.
    blavatsky = Author(
        name="Helena Pretrovna Blavatsky",
        books_via_http=[
            ViaHTTP(
                title="The Secret Doctrine (Volume I)",
                url="https://www.gutenberg.org/files/54824/54824-pdf.pdf",
                start_page=12,
                end_page=971
            ),
            
            ViaHTTP(
                title="The Secret Doctrine (Volume II)",
                url="https://www.gutenberg.org/files/54488/54488-pdf.pdf",
                start_page=23,
                end_page=1156
            ),

            ViaHTTP(
                title="The Secret Doctrine (Volume III)",
                url="https://www.gutenberg.org/files/56880/56880-pdf.pdf",
                start_page=9,
                end_page=796
            ),
        ],

        books_via_scraper=[
            # NOTE(review): this URL looks like an EPUB download rather than an
            # HTML page, and no markers are given — confirm scrape() copes with it.
            ViaScraper(
                title="The Secret Doctrine (Volume IV)",
                url="https://www.gutenberg.org/ebooks/61626.epub.noimages",
            )
        ]
    
    )


    # NOTE(review): the name is spelled "Ghandi" while the book titles and URLs
    # use "Gandhi". It is also the directory name under DATA_DIR, so correcting
    # it means moving any data already stored under the current spelling.
    gandhi = Author(
        name="Mohandas Karamchand Ghandi",
        books_via_http=[
            ViaHTTP(
                title="An Autobiography: The Story of My Experiments with Truth",
                url="https://www.mkgandhi.org/ebks/An-Autobiography.pdf",
                start_page=16,
                end_page=556
            ),

            ViaHTTP(
                title="Hind Swaraj or Indian Home Rule",
                url="https://www.mkgandhi.org/ebks/hind_swaraj.pdf",
                start_page=12,
                end_page=89
            ),

            ViaHTTP(
                title="The Bhagavad Gita, According to Gandhi",
                url="https://ia800904.us.archive.org/10/items/InnerEngineeringAYogisGuideToJoy_20190116/Mahatma_gandhiThe_bhagavad_gita_according_to_gandhi.pdf",
                start_page=10,
                end_page=177
            ),

            ViaHTTP(
                title="Non-Violent Resistance",
                url="https://dn720701.ca.archive.org/0/items/nonviolentresist00mkga/nonviolentresist00mkga.pdf",
                start_page=16,
                end_page=388
            )
        ]
    )


    rai = Author(
        name="Lala Lajpat Rai",
        books_via_http=[
            ViaHTTP(
                title="The Story of My Deportation",
                url="https://ia601503.us.archive.org/21/items/in.ernet.dli.2015.19903/2015.19903.The--Story-Of-My-Deportation_text.pdf",
                start_page=8,
                end_page=274,
                needs_ocr=True
            ),

            ViaHTTP(
                title="Young India: An Interpretation and a History of the Nationalist Movement from Within",
                url="https://ia800802.us.archive.org/21/items/16RaiYoungindia/16-rai-youngindia.pdf",
                start_page=7,
                end_page=294
            ),

        ]
    )


    rizal = Author(
        name="Jose Rizal",
        books_via_http=[
            ViaHTTP(
                title="The Social Cancer", 
                url="https://www.geocities.ws/qcpujoserizal/Rizal/pdf/Noli.pdf",
            ),
        ],

        books_via_scraper=[
            ViaScraper(
                title="The Reign of Greed", 
                url="https://www.gutenberg.org/files/10676/10676-h/10676-h.htm",
                initial_marker="On the Upper Deck",
                terminal_marker="Colophon"
            )
        ]
    )


    lenin = Author(
        name="Vladimir Lenin",
        books_via_http=[
            ViaHTTP(
                title="What Is to Be Done?: Burning Questions of our Movements",
                url="https://www.marxists.org/ebooks/lenin/what-is-to-be-done.pdf",
                start_page=7,
                end_page=124
            ),

            ViaHTTP(
                title="The State and Revolution",
                url="https://www.marxists.org/ebooks/lenin/state-and-revolution.pdf",
                start_page=7,
                end_page=83
            ),

        ]
    )


    yat_sen = Author(
        name="Sun Yat-sen",
        books_via_http=[
            ViaHTTP(
                title="The Three Principles of the People",
                url="https://chinese.larouchepub.com/wp-content/uploads/2017/05/San-Min-Chu-I_ALL-en.pdf",
                start_page=3,
                end_page=74
            ),

            ViaHTTP(
                title="The International Development of China",
                url="https://chinese.larouchepub.com/wp-content/uploads/2017/05/sun_IDC-en.pdf",
                start_page=15, 
                end_page=305
            ),
        ]
    )

    nietzsche = Author(
        name="Friedrich Nietzsche",
        books_via_scraper=[
            ViaScraper(
                title="Thus Spake Zarathustra",
                url="https://www.gutenberg.org/files/1998/1998-h/1998-h.htm",
                initial_marker="ZARATHUSTRA’S DISCOURSES.",
                terminal_marker="APPENDIX"
            ),

            ViaScraper(
                title="Beyond Good and Evil",
                url="https://www.gutenberg.org/cache/epub/4363/pg4363-images.html",
                initial_marker="CHAPTER I. ",
                terminal_marker="FROM THE HEIGHTS"
            ),

            ViaScraper(
                title="The Genealogy of Morals",
                url="https://www.gutenberg.org/cache/epub/52319/pg52319-images.html",
                initial_marker="FIRST ESSAY.",
                terminal_marker="betray us!"
            ),

            ViaScraper(
                title="The Antichrist",
                url="https://www.gutenberg.org/cache/epub/19322/pg19322-images.html",
                initial_marker="PREFACE",
                terminal_marker="THE END"
            ),

            ViaScraper(
                title="Human, All Too Human",
                url="https://www.gutenberg.org/cache/epub/38145/pg38145-images.html",
                initial_marker="PREFACE.",
                terminal_marker="sinfulness."
            ),

            ViaScraper(
                title="Ecce Homo",
                url="https://www.gutenberg.org/cache/epub/52190/pg52190-images.html",
                initial_marker="PREFACE.",
                terminal_marker="It really seems that we have a path."
            ),


        ]
    )

    
    kropotkin = Author(
        name="Pyotr Kropotkin",
        books_via_scraper=[
            ViaScraper(
                title="Mutual Aid: A Factor of Evolution",
                url="https://www.gutenberg.org/cache/epub/4341/pg4341-images.html",
                initial_marker="Two aspects",
                terminal_marker="our race"
            ),

            ViaScraper(
                title="The Conquest of Bread",
                url="https://www.gutenberg.org/cache/epub/23428/pg23428-images.html",
                initial_marker="OUR RICHES",
                terminal_marker="Social Revolution."
            )
        ]

    )


    bakunin = Author(
        name="Mikhail Bakunin",
        books_via_scraper=[
            ViaScraper(
                title="God and the State",
                url="https://www.gutenberg.org/cache/epub/36568/pg36568-images.html",
                initial_marker="Elisée Reclus.",
                terminal_marker="Genius of Christianity"
            )
        ]
    )


    proudhon = Author(
        name="Pierre-Joseph Proudhon",
        books_via_scraper=[
            ViaScraper(
                title="What is Property? An Inquiry into the Principle of Right and of Government",
                url="https://www.gutenberg.org/cache/epub/360/pg360-images.html",
                initial_marker="FIRST MEMOIR.",
                terminal_marker="and absurd."
            ),

            ViaScraper(
                title="System of Economical Contradictions; Or, The Philosophy of Misery",
                url="https://www.gutenberg.org/cache/epub/444/pg444-images.html",
                initial_marker="Before entering",
                terminal_marker="reason of our existence."
            )
        ]

    )
    

    berkman = Author(
        name="Alexander Berkman",
        books_via_scraper=[
            ViaScraper(
                title="Prison Memoirs of an Anarchist",
                url="https://www.gutenberg.org/cache/epub/34406/pg34406-images.html",
                initial_marker="The Call of Homestead",
                terminal_marker="I have found work to do."
            ),

            ViaScraper(
                title="Deportation - Its Meaning & Menace",
                url="https://www.gutenberg.org/cache/epub/68442/pg68442-images.html",
                initial_marker="DEPORTATION—Its Meaning and Menace",
                terminal_marker="but also of reward."
            )
        ]
    )


    sun_tzu = Author(
        name="Sun Tzu",
        books_via_http=[
            ViaHTTP(
                title="The Art of War",
                url="https://sites.ualberta.ca/~enoch/Readings/The_Art_Of_War.pdf",
                start_page=3,
                end_page=65
            )
        ]
    )

    # The returned order is independent of the definition order above
    return [
        marx, garvey, gandhi, yat_sen, lenin, kropotkin, rizal, nietzsche, 
        mao, bakunin, proudhon, berkman, sun_tzu, vivekananda, rai, blavatsky 
    ] 
719  
720  
if __name__ == "__main__":
    # Script entry point: fetch the books of every configured author, one
    # author at a time, in the order get_sources() returns them.
    for source_author in get_sources():
        source_author.download_books()
        logger.info("Next author...")
725