Cradicle Explorer

/ analyze_pdfs_no_tools.py
analyze_pdfs_no_tools.py
import json
import os
import subprocess
import time
from collections import Counter
  5  
  6  os.chdir("/run/media/unknown/ADATA HD710 PRO/epstein_research")
  7  
  8  print(f"📚 PDF ANALYSIS (No External Tools)")
  9  print("="*60)
 10  
 11  # Find PDFs
 12  pdfs = []
 13  for root, dirs, files in os.walk("."):
 14      for file in files:
 15          if file.lower().endswith('.pdf'):
 16              pdfs.append(os.path.join(root, file))
 17  
 18  print(f"📊 Found {len(pdfs)} PDF files")
 19  
 20  if len(pdfs) == 0:
 21      print("No PDFs found.")
 22      exit()
 23  
 24  # Quick analysis strategy
 25  print(f"\n🎯 Analysis Strategy for {len(pdfs)} PDFs:")
 26  print("1. Sample files from each directory")
 27  print("2. Use file command to identify type")
 28  print("3. Analyze filenames and structure")
 29  print("4. Use AI for investigative insights")
 30  
 31  # Analyze directory structure
 32  print(f"\n📁 Directory Structure Analysis:")
 33  directories = {}
 34  for pdf in pdfs[:50]:  # First 50
 35      dir_path = os.path.dirname(pdf)
 36      directories[dir_path] = directories.get(dir_path, 0) + 1
 37  
 38  print(f"PDFs distributed across {len(directories)} directories")
 39  print("\nTop directories:")
 40  for dir_path, count in sorted(directories.items(), key=lambda x: x[1], reverse=True)[:5]:
 41      print(f"  {dir_path}: {count} PDFs")
 42  
 43  # Sample analysis without pdftotext
 44  print(f"\n🔍 Analyzing Sample PDFs (Filename & Metadata Only)...")
 45  
 46  samples_to_analyze = 3
 47  results = []
 48  
 49  for i, pdf in enumerate(pdfs[:samples_to_analyze], 1):
 50      try:
 51          print(f"\n[{i}] {os.path.basename(pdf)}")
 52          print(f"   📍 Location: {pdf}")
 53          
 54          # Get file info
 55          size = os.path.getsize(pdf)
 56          size_kb = size / 1024
 57          
 58          # Use file command to get basic info
 59          file_cmd = f"file -b '{pdf}'"
 60          file_info = subprocess.run(file_cmd, shell=True, capture_output=True, text=True).stdout.strip()
 61          
 62          print(f"   📏 Size: {size_kb:.1f} KB ({size:,} bytes)")
 63          print(f"   📄 Type: {file_info[:100]}")
 64          
 65          # Analyze filename pattern
 66          filename = os.path.basename(pdf)
 67          dir_name = os.path.dirname(pdf)
 68          
 69          # Ask AI for analysis based on filename/structure
 70          prompt = f"""INVESTIGATIVE ANALYSIS OF PDF COLLECTION:
 71  
 72  PDF FILENAME: {filename}
 73  LOCATION: {pdf}
 74  DIRECTORY STRUCTURE: {dir_name}
 75  FILE SIZE: {size_kb:.1f} KB
 76  FILE TYPE: {file_info}
 77  
 78  ADDITIONAL CONTEXT:
 79  - Total collection: {len(pdfs)} PDF files
 80  - Directory pattern: Contains 'DataSet 1', 'VOL00001', 'IMAGES' folders
 81  - Filename pattern: 'EFTA00000983.pdf' (sequential numbering)
 82  
 83  ANALYSIS QUESTIONS:
 84  1. What type of documents are these likely to be?
 85  2. What does 'EFTA' prefix suggest? (European Free Trade Association?)
 86  3. Investigative significance of this collection?
 87  4. Best approach to analyze 4,000+ image-based PDFs?
 88  5. Recommended forensic tools/methods?
 89  
 90  Provide concise investigative analysis."""
 91          
 92          print(f"   🤖 AI Analysis...")
 93          start = time.time()
 94          
 95          ai_result = subprocess.run(
 96              ['ollama', 'run', 'tinyllama', prompt],
 97              capture_output=True,
 98              text=True,
 99              timeout=30
100          )
101          
102          elapsed = time.time() - start
103          
104          if ai_result.returncode == 0:
105              analysis = ai_result.stdout.strip()
106              print(f"   ✅ Analysis ({elapsed:.1f}s):")
107              print(f"   {analysis[:200]}...")
108              
109              results.append({
110                  'file': pdf,
111                  'size_kb': size_kb,
112                  'file_info': file_info,
113                  'analysis': analysis[:500]
114              })
115          else:
116              print(f"   ❌ AI failed after {elapsed:.1f}s")
117              
118      except Exception as e:
119          print(f"   ⚠️  Error: {str(e)[:50]}")
120  
# Collection-wide analysis: summarise the directory tally and the sampled
# files in one prompt and ask the local model for an overall assessment.
print("\n📈 COLLECTION-WIDE ANALYSIS")
print("=" * 60)

# Pre-render the two bulleted sections so the prompt template stays readable.
top_directories = sorted(
    directories.items(), key=lambda item: item[1], reverse=True
)[:5]
directory_lines = "\n".join(f"- {d}: {c} files" for d, c in top_directories)
sample_lines = "\n".join(
    f"- {r['file']} ({r['size_kb']:.1f} KB): {r['file_info'][:50]}"
    for r in results[:3]
)

collection_prompt = f"""ANALYZE THIS PDF COLLECTION FOR INVESTIGATIVE PURPOSES:

COLLECTION SIZE: {len(pdfs)} PDF files
LOCATION: /run/media/unknown/ADATA HD710 PRO/epstein_research/

DIRECTORY STRUCTURE:
{directory_lines}

SAMPLE FILES ANALYZED:
{sample_lines}

FILENAME PATTERN: EFTA00000983.pdf, EFTA00000984.pdf, etc.
(Sequential numbering, likely scanned documents)

INVESTIGATIVE QUESTIONS:
1. What is the likely origin/purpose of this collection?
2. Why would EFTA (European Free Trade Association) documents be in 'epstein_research'?
3. Best strategy to process 4,000+ potentially image-based PDFs?
4. What forensic tools are essential?
5. Recommended analysis workflow?

Provide strategic investigative analysis."""

# Always defined, so later code can read it whether or not the model ran.
collection_analysis = ""

print("🤖 Analyzing entire collection...")
try:
    ai_result = subprocess.run(
        ['ollama', 'run', 'tinyllama', collection_prompt],
        capture_output=True,
        text=True,
        timeout=45
    )
except (OSError, subprocess.SubprocessError) as exc:
    # A timeout or a missing `ollama` binary used to abort the script here,
    # discarding the per-sample results gathered so far. Report the failure
    # and carry on instead. Only the exception type is shown because the
    # message of a timeout embeds the entire prompt.
    print(f"❌ Collection analysis failed ({type(exc).__name__})")
else:
    if ai_result.returncode == 0:
        collection_analysis = ai_result.stdout.strip()
        print("\n📋 COLLECTION ANALYSIS:")
        print("-" * 40)
        print(collection_analysis)
    else:
        print("❌ Collection analysis failed")
163  
# Persist the findings as JSON, but only when at least one sample produced
# an analysis; otherwise nothing is written.
if results:
    report_file = "pdf_analysis_no_tools.json"

    # Directories ordered from most to fewest PDFs; the report keeps ten.
    ranked_dirs = sorted(
        directories.items(), key=lambda pair: pair[1], reverse=True
    )
    report = {
        'total_pdfs': len(pdfs),
        'directories_analyzed': len(directories),
        'samples_analyzed': len(results),
        'directory_distribution': dict(ranked_dirs[:10]),
        'sample_results': results,
        # Falls back to an empty string when no collection analysis exists.
        'collection_analysis': locals().get('collection_analysis', ""),
    }

    with open(report_file, "w") as f:
        json.dump(report, f, indent=2)

    print(f"\n✅ Analysis saved to: {report_file}")
    print(f"📊 Summary: {len(pdfs)} PDFs across {len(directories)} directories")
179      
# Closing guidance: suggested follow-up tooling, printed unconditionally.
print("\n🎯 NEXT STEPS:")
print(
    "1. Install poppler-utils for text extraction: sudo apt --fix-broken install poppler-utils",
    "2. Use OCR if PDFs are image-based: sudo apt install tesseract-ocr",
    "3. Bulk search: pdfgrep -r 'pattern' .",
    "4. Consider sampling strategy for 4,000+ files",
    sep="\n",
)