#!/usr/bin/env python3
# analyze_pdfs.py
1 import os 2 import subprocess 3 import json 4 5 # Navigate to research directory 6 os.chdir("/run/media/unknown/ADATA HD710 PRO/epstein_research") 7 8 print(f"š PDF DOCUMENT ANALYSIS") 9 print("="*60) 10 11 # Find all PDFs 12 pdfs = [] 13 for root, dirs, files in os.walk("."): 14 for file in files: 15 if file.lower().endswith('.pdf'): 16 full_path = os.path.join(root, file) 17 pdfs.append(full_path) 18 19 print(f"Found {len(pdfs)} PDF files") 20 21 if len(pdfs) == 0: 22 print("No PDFs found. Checking file types...") 23 os.system("find . -type f | head -20 | xargs file -b | head -10") 24 exit() 25 26 # Analyze first 5 PDFs 27 print(f"\nš Analyzing first 5 PDFs...") 28 29 results = [] 30 for i, pdf in enumerate(pdfs[:5], 1): 31 try: 32 print(f"\n[{i}] {pdf}") 33 34 # Get file info 35 size = os.path.getsize(pdf) 36 print(f" Size: {size:,} bytes") 37 38 # Try to extract text using pdftotext (if installed) 39 # First check if we have pdftotext 40 has_pdftotext = subprocess.run(["which", "pdftotext"], 41 capture_output=True).returncode == 0 42 43 extracted_text = "" 44 if has_pdftotext: 45 print(" Extracting text with pdftotext...") 46 temp_txt = f"temp_extract_{i}.txt" 47 cmd = f"pdftotext '{pdf}' {temp_txt} 2>/dev/null" 48 os.system(cmd) 49 50 if os.path.exists(temp_txt): 51 with open(temp_txt, 'r', encoding='utf-8', errors='ignore') as f: 52 extracted_text = f.read(2000) 53 os.remove(temp_txt) 54 print(f" Extracted {len(extracted_text)} characters") 55 else: 56 print(" Could not extract text") 57 else: 58 print(" pdftotext not available. 
Using file command for info.") 59 file_info = subprocess.run(["file", "-b", pdf], 60 capture_output=True, text=True).stdout 61 print(f" File type: {file_info[:100]}") 62 63 # Ask AI to analyze based on what we have 64 if extracted_text: 65 prompt = f"""Analyze this PDF document extract: 66 67 PDF: {os.path.basename(pdf)} 68 SIZE: {size:,} bytes 69 DIRECTORY: {os.path.dirname(pdf)} 70 71 EXTRACTED TEXT (first 2000 chars): 72 {extracted_text} 73 74 Provide analysis: 75 1. What type of document is this? (legal, financial, report, etc.) 76 2. Key topics or subjects mentioned 77 3. Any names, dates, locations, or patterns 78 4. Recommended next analysis steps""" 79 else: 80 prompt = f"""Analyze this PDF file: 81 82 PDF: {os.path.basename(pdf)} 83 SIZE: {size:,} bytes ({size/1024/1024:.1f} MB) 84 DIRECTORY: {os.path.dirname(pdf)} 85 86 We couldn't extract text content. Based on the filename and location, 87 provide investigative analysis: 88 1. What might this document contain based on filename/directory? 89 2. How should an investigator approach analyzing this PDF? 90 3. What tools would be needed for proper analysis? 91 4. 
Recommended forensic approach""" 92 93 print(" š¤ Asking AI for analysis...") 94 result = subprocess.run( 95 ['ollama', 'run', 'tinyllama', prompt], 96 capture_output=True, 97 text=True, 98 timeout=60 99 ) 100 101 if result.returncode == 0: 102 analysis = result.stdout.strip() 103 print(f" Analysis: {analysis[:200]}...") 104 105 results.append({ 106 'pdf': pdf, 107 'size': size, 108 'analysis': analysis[:1000] 109 }) 110 else: 111 print(f" ā AI analysis failed") 112 113 except Exception as e: 114 print(f" ā ļø Error: {str(e)[:50]}") 115 116 # Save results 117 if results: 118 with open("pdf_analysis_report.json", "w") as f: 119 json.dump({ 120 'total_pdfs': len(pdfs), 121 'analyzed': len(results), 122 'results': results 123 }, f, indent=2) 124 125 print(f"\nā Analysis complete!") 126 print(f"š Total PDFs in research directory: {len(pdfs)}") 127 print(f"š Analyzed: {len(results)} PDFs") 128 print(f"š¾ Report saved: pdf_analysis_report.json") 129 130 # Show summary 131 print("\nš QUICK SUMMARY:") 132 for r in results: 133 print(f" ⢠{r['pdf']} ({r['size']:,} bytes)") 134 else: 135 print("\nā Could not analyze any PDFs") 136 137 print(f"\nšÆ Next: Install pdftotext for better analysis: sudo apt install poppler-utils")