/ pyproject.toml
pyproject.toml
  1  [project]
  2  name = "rag-ingestion"
  3  version = "0.1.0"
  4  description = "Utilities for ingesting PDFs into a RAG pipeline."
  5  readme = "README.md"
  6  requires-python = ">=3.10"
  7  dependencies = [
  8    "langchain-core>=0.3.0",
  9    "langchain-text-splitters>=0.3.0",
 10    "langchain-community>=0.3.0",
 11    "langchain-openai>=0.1.0",
 12    "pymupdf>=1.24.9",
 13    "pymupdf4llm>=0.0.17",
 14    "pydantic>=2.0.0",
 15    "pydantic-settings>=2.0.0",
 16    "pyyaml>=6.0",
 17    "chromadb>=0.4.0",
 18    "qdrant-client>=1.7.0",
 19    "langchain-qdrant>=0.1.0",
 20    "sentence-transformers>=2.2.0",
 21    "numpy>=1.24.0,<2.0",
 22    "langchain-huggingface>=0.0.1",
 23    "google-genai>=1.0.0",
 24    "langchain-google-genai>=2.1.10",
 25    "docling==2.64.0",
 26    "fastapi>=0.104.0",
 27    "openai-whisper>=20231117",
 28    "uvicorn[standard]>=0.24.0",
 29    "rapidfuzz>=3.0.0",
 30    "boto3>=1.42.24",
 31  ]
 32  
 33  [project.optional-dependencies]
 34  dev = [
 35      "ipykernel>=6.29.0",
 36      "jupyter>=1.0.0",
 37      "ruff>=0.1.0",
 38      "pre-commit>=3.0.0",
 39  ]
 40  openai = [
 41    "openai>=1.0.0",  # For OpenAI embeddings
 42    "tiktoken>=0.5.0",  # Required by OpenAIEmbeddings for token counting
 43  ]
 44  
 45  [build-system]
 46  requires = ["setuptools>=68.0"]
 47  build-backend = "setuptools.build_meta"
 48  
 49  [tool.setuptools.packages.find]
 50  where = ["src"]
 51  
 52  [tool.ruff]
 53  # Line length matching the project style
 54  line-length = 100
 55  target-version = "py312"
 56  
 57  # Exclude common directories
 58  exclude = [
 59      ".git",
 60      "__pycache__",
 61      ".venv",
 62      "venv",
 63      "env",
 64      ".eggs",
 65      "*.egg",
 66      "build",
 67      "dist",
 68      ".pytest_cache",
 69      ".mypy_cache",
 70      ".ruff_cache",
 71  ]
 72  
 73  [tool.ruff.lint]
 74  # Ignore specific rules (must be before select)
 75  ignore = [
 76      "E501",  # Line too long (handled by formatter)
 77      "E402",  # Module level import not at top of file (docstrings should come first per PEP 257)
 78      "ARG002", # Unused method arguments (part of protocol interfaces, some implementations don't use them)
 79      "PLR0913", # Too many arguments
 80      "PLR2004", # Magic value used in comparison
 81      "TRY003", # Avoid specifying long messages outside exception class
 82      "TRY301", # Abstract raise to inner function (prefer direct raise for clarity)
 83      "TRY300", # Consider moving statement to else block (prefer early return pattern)
 84      "PLW0603", # Using global statement (needed for singleton pattern)
 85      "PLW0602", # Using global for lock (needed for singleton pattern)
 86  ]
 87  
 88  # Enable specific rule sets
 89  select = [
 90      "E",   # pycodestyle errors
 91      "W",   # pycodestyle warnings
 92      "F",   # pyflakes
 93      "I",   # isort
 94      "B",   # flake8-bugbear
 95      "C4",  # flake8-comprehensions
 96      "UP",  # pyupgrade
 97      "ARG", # flake8-unused-arguments
 98      "SIM", # flake8-simplify
 99      "PTH", # flake8-use-pathlib
100      "ERA", # eradicate (commented-out code)
101      "PD",  # pandas-vet
102      "PL",  # Pylint
103      "TRY", # tryceratops
104      "RUF", # Ruff-specific rules
105  ]
106  
# Rules disabled for specific files only.
[tool.ruff.lint.per-file-ignores]
# Allow an unsorted __all__ (RUF022) so the comments inside it are preserved.
"src/__init__.py" = ["RUF022"]
110  
# Import sorting: modules named `src` are grouped as first-party.
# NOTE(review): setuptools discovers packages under src/ (where = ["src"]);
# confirm that imports are actually written as `src.<module>`, otherwise the
# package names found inside src/ belong here instead.
[tool.ruff.lint.isort]
known-first-party = ["src"]
113  
# Thresholds for the Pylint-derived complexity rules.
[tool.ruff.lint.pylint]
# NOTE(review): PLR0913 (too many arguments) is in the lint `ignore` list, so
# this limit only takes effect if that rule is re-enabled.
max-args = 10
max-branches = 20
max-returns = 10
max-statements = 50
119