#!/usr/bin/env python3
# analyze_pdfs_no_tools.py
"""Survey a large PDF collection without pdftotext/OCR installed.

Walks the collection, summarizes its directory layout, samples a few files
(size + `file -b` type only), and asks a local ollama/tinyllama model for
investigative commentary. Results are written to pdf_analysis_no_tools.json.
"""

import json
import os
import subprocess
import sys
import time

# Root of the external drive holding the collection.
BASE_DIR = "/run/media/unknown/ADATA HD710 PRO/epstein_research"

# Only a handful of files get the (slow) per-file AI pass.
SAMPLES_TO_ANALYZE = 3


def find_pdfs(root="."):
    """Return paths of every *.pdf (case-insensitive) under *root*."""
    found = []
    for dirpath, _dirs, files in os.walk(root):
        for name in files:
            if name.lower().endswith(".pdf"):
                found.append(os.path.join(dirpath, name))
    return found


def count_by_directory(pdfs, limit=50):
    """Map directory path -> PDF count over the first *limit* entries of *pdfs*.

    Only a prefix is counted to keep the survey cheap on 4,000+ file trees.
    """
    counts = {}
    for pdf in pdfs[:limit]:
        dir_path = os.path.dirname(pdf)
        counts[dir_path] = counts.get(dir_path, 0) + 1
    return counts


def run_ollama(prompt, timeout):
    """Run tinyllama on *prompt*; return (succeeded, stripped stdout).

    Raises subprocess.TimeoutExpired if the model exceeds *timeout* seconds —
    callers decide whether that is fatal.
    """
    result = subprocess.run(
        ["ollama", "run", "tinyllama", prompt],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    return result.returncode == 0, result.stdout.strip()


def main():
    os.chdir(BASE_DIR)

    print("š PDF ANALYSIS (No External Tools)")
    print("=" * 60)

    pdfs = find_pdfs(".")
    print(f"š Found {len(pdfs)} PDF files")

    if not pdfs:
        print("No PDFs found.")
        sys.exit(0)

    print(f"\nšÆ Analysis Strategy for {len(pdfs)} PDFs:")
    print("1. Sample files from each directory")
    print("2. Use file command to identify type")
    print("3. Analyze filenames and structure")
    print("4. Use AI for investigative insights")

    # --- Directory structure survey (first 50 files only) ---
    print("\nš Directory Structure Analysis:")
    directories = count_by_directory(pdfs)
    top_dirs = sorted(directories.items(), key=lambda x: x[1], reverse=True)

    print(f"PDFs distributed across {len(directories)} directories")
    print("\nTop directories:")
    for dir_path, count in top_dirs[:5]:
        print(f"  {dir_path}: {count} PDFs")

    # --- Per-file sampling: size + `file` output + AI commentary ---
    print("\nš Analyzing Sample PDFs (Filename & Metadata Only)...")

    results = []
    for i, pdf in enumerate(pdfs[:SAMPLES_TO_ANALYZE], 1):
        try:
            print(f"\n[{i}] {os.path.basename(pdf)}")
            print(f"   š Location: {pdf}")

            size = os.path.getsize(pdf)
            size_kb = size / 1024

            # List-form argv (no shell=True): paths with quotes/spaces are
            # passed verbatim instead of being re-parsed by a shell.
            file_info = subprocess.run(
                ["file", "-b", pdf], capture_output=True, text=True
            ).stdout.strip()

            print(f"   š Size: {size_kb:.1f} KB ({size:,} bytes)")
            print(f"   š Type: {file_info[:100]}")

            filename = os.path.basename(pdf)
            dir_name = os.path.dirname(pdf)

            # Fix: interpolate the actual filename (was a literal placeholder
            # while `filename` went unused).
            prompt = f"""INVESTIGATIVE ANALYSIS OF PDF COLLECTION:

PDF FILENAME: {filename}
LOCATION: {pdf}
DIRECTORY STRUCTURE: {dir_name}
FILE SIZE: {size_kb:.1f} KB
FILE TYPE: {file_info}

ADDITIONAL CONTEXT:
- Total collection: {len(pdfs)} PDF files
- Directory pattern: Contains 'DataSet 1', 'VOL00001', 'IMAGES' folders
- Filename pattern: 'EFTA00000983.pdf' (sequential numbering)

ANALYSIS QUESTIONS:
1. What type of documents are these likely to be?
2. What does 'EFTA' prefix suggest? (European Free Trade Association?)
3. Investigative significance of this collection?
4. Best approach to analyze 4,000+ image-based PDFs?
5. Recommended forensic tools/methods?

Provide concise investigative analysis."""

            print("   š¤ AI Analysis...")
            start = time.time()
            ok, analysis = run_ollama(prompt, timeout=30)
            elapsed = time.time() - start

            if ok:
                print(f"   ā Analysis ({elapsed:.1f}s):")
                print(f"   {analysis[:200]}...")
                results.append({
                    "file": pdf,
                    "size_kb": size_kb,
                    "file_info": file_info,
                    "analysis": analysis[:500],
                })
            else:
                print(f"   ā AI failed after {elapsed:.1f}s")

        except Exception as e:
            # Best-effort per file: a bad file or model timeout must not
            # abort the whole survey.
            print(f"   ā ļø Error: {str(e)[:50]}")

    # --- Collection-wide AI pass ---
    print("\nš COLLECTION-WIDE ANALYSIS")
    print("=" * 60)

    dir_lines = "\n".join(f"- {d}: {c} files" for d, c in top_dirs[:5])
    sample_lines = "\n".join(
        f"- {r['file']} ({r['size_kb']:.1f} KB): {r['file_info'][:50]}"
        for r in results[:3]
    )

    collection_prompt = f"""ANALYZE THIS PDF COLLECTION FOR INVESTIGATIVE PURPOSES:

COLLECTION SIZE: {len(pdfs)} PDF files
LOCATION: {BASE_DIR}/

DIRECTORY STRUCTURE:
{dir_lines}

SAMPLE FILES ANALYZED:
{sample_lines}

FILENAME PATTERN: EFTA00000983.pdf, EFTA00000984.pdf, etc.
(Sequential numbering, likely scanned documents)

INVESTIGATIVE QUESTIONS:
1. What is the likely origin/purpose of this collection?
2. Why would EFTA (European Free Trade Association) documents be in 'epstein_research'?
3. Best strategy to process 4,000+ potentially image-based PDFs?
4. What forensic tools are essential?
5. Recommended analysis workflow?

Provide strategic investigative analysis."""

    print("š¤ Analyzing entire collection...")
    # Initialized up front so the JSON report never depends on whether the
    # model call succeeded (was a fragile `locals()` check).
    collection_analysis = ""
    try:
        ok, text = run_ollama(collection_prompt, timeout=45)
        if ok:
            collection_analysis = text
    except subprocess.TimeoutExpired:
        # Unlike the per-sample loop, this call had no handler and would
        # have crashed the script on a slow model.
        pass

    if collection_analysis:
        print("\nš COLLECTION ANALYSIS:")
        print("-" * 40)
        print(collection_analysis)
    else:
        print("ā Collection analysis failed")

    # --- Persist the survey ---
    if results:
        report_file = "pdf_analysis_no_tools.json"
        with open(report_file, "w") as f:
            json.dump({
                "total_pdfs": len(pdfs),
                "directories_analyzed": len(directories),
                "samples_analyzed": len(results),
                "directory_distribution": dict(top_dirs[:10]),
                "sample_results": results,
                "collection_analysis": collection_analysis,
            }, f, indent=2)

        print(f"\nā Analysis saved to: {report_file}")
        print(f"š Summary: {len(pdfs)} PDFs across {len(directories)} directories")

    print("\nšÆ NEXT STEPS:")
    print("1. Install poppler-utils for text extraction: sudo apt --fix-broken install poppler-utils")
    print("2. Use OCR if PDFs are image-based: sudo apt install tesseract-ocr")
    print("3. Bulk search: pdfgrep -r 'pattern' .")
    print("4. Consider sampling strategy for 4,000+ files")


if __name__ == "__main__":
    main()