#!/usr/bin/env python3
# analyze_pdfs.py
1 import os 2 import subprocess 3 import json 4 5 # Navigate to research directory 6 os.chdir("/run/media/unknown/ADATA HD710 PRO/epstein_research") 7 8 print(f"š PDF DOCUMENT ANALYSIS") 9 print("="*60) 10 11 # Find all PDFs 12 pdfs = [] 13 for root, dirs, files in os.walk("."): 14 for file in files: 15 if file.lower().endswith('.pdf'): 16 full_path = os.path.join(root, file) 17 pdfs.append(full_path) 18 19 print(f"Found {len(pdfs)} PDF files") 20 21 if len(pdfs) == 0: 22 print("No PDFs found. Checking file types...") 23 os.system("find . -type f | head -20 | xargs file -b | head -10") 24 exit() 25 26 # Analyze first 5 PDFs 27 print(f"\nš Analyzing first 5 PDFs...") 28 29 results = [] 30 for i, pdf in enumerate(pdfs[:5], 1): 31 try: 32 print(f"\n[{i}] {pdf}") 33 34 # Get file info 35 size = os.path.getsize(pdf) 36 print(f" Size: {size:,} bytes") 37 38 # Try to extract text using pdftotext (if installed) 39 # First check if we have pdftotext 40 has_pdftotext = subprocess.run(["which", "pdftotext"], 41 capture_output=True).returncode == 0 42 43 extracted_text = "" 44 if has_pdftotext: 45 print(" Extracting text with pdftotext...") 46 temp_txt = f"temp_extract_{i}.txt" 47 cmd = f"pdftotext '{pdf}' {temp_txt} 2>/dev/null" 48 os.system(cmd) 49 50 if os.path.exists(temp_txt): 51 with open(temp_txt, 'r', encoding='utf-8', errors='ignore') as f: 52 extracted_text = f.read(2000) 53 os.remove(temp_txt) 54 print(f" Extracted {len(extracted_text)} characters") 55 else: 56 print(" Could not extract text") 57 else: 58 print(" pdftotext not available. 
Using file command for info.") 59 file_info = subprocess.run(["file", "-b", pdf], 60 capture_output=True, text=True).stdout 61 print(f" File type: {file_info[:100]}") 62 63 # Ask AI to analyze based on what we have 64 if extracted_text: 65 prompt = f"""Analyze this PDF document extract: 66 67 PDF: {os.path.basename(pdf)} 68 SIZE: {size:,} bytes 69 DIRECTORY: {os.path.dirname(pdf)} 70 71 EXTRACTED TEXT (first 2000 chars): 72 {extracted_text} 73 74 Provide analysis: 75 1. What type of document is this? (legal, financial, report, etc.) 76 2. Key topics or subjects mentioned 77 3. Any names, dates, locations, or patterns 78 4. Recommended next analysis steps""" 79 else: 80 prompt = f"""Analyze this PDF file: 81 82 PDF: {os.path.basename(pdf)} 83 SIZE: {size:,} bytes ({size/1024/1024:.1f} MB) 84 DIRECTORY: {os.path.dirname(pdf)} 85 86 We couldn't extract text content. Based on the filename and location, 87 provide investigative analysis: 88 1. What might this document contain based on filename/directory? 89 2. How should an investigator approach analyzing this PDF? 90 3. What tools would be needed for proper analysis? 91 4. 
Recommended forensic approach""" 92 93 print(" š¤ Asking AI for analysis...") 94 result = subprocess.run( 95 ['ollama', 'run', 'tinyllama', prompt], 96 capture_output=True, 97 text=True, 98 timeout=60 99 ) 100 101 if result.returncode == 0: 102 analysis = result.stdout.strip() 103 print(f" Analysis: {analysis[:200]}...") 104 105 results.append({ 106 'pdf': pdf, 107 'size': size, 108 'analysis': analysis[:1000] 109 }) 110 else: 111 print(f" ā AI analysis failed") 112 113 except Exception as e: 114 print(f" ā ļø Error: {str(e)[:50]}") 115 116 # Save results 117 if results: 118 with open("pdf_analysis_report.json", "w") as f: 119 json.dump({ 120 'total_pdfs': len(pdfs), 121 'analyzed': len(results), 122 'results': results 123 }, f, indent=2) 124 125 print(f"\nā Analysis complete!") 126 print(f"š Total PDFs in research directory: {len(pdfs)}") 127 print(f"š Analyzed: {len(results)} PDFs") 128 print(f"š¾ Report saved: pdf_analysis_report.json") 129 130 # Show summary 131 print("\nš QUICK SUMMARY:") 132 for r in results: 133 print(f" ⢠{r['pdf']} ({r['size']:,} bytes)") 134 else: 135 print("\nā Could not analyze any PDFs") 136 137 print(f"\nšÆ Next: Install pdftotext for better analysis: sudo apt install poppler-utils")