/ pyod / utils / knowledge / algorithms.json
algorithms.json
   1  {
   2    "ECOD": {
   3      "class_path": "pyod.models.ecod.ECOD",
   4    "full_name": "Empirical Cumulative Distribution Functions for Outlier Detection",
   5      "status": "shipped",
   6      "data_types": ["tabular"],
   7      "category": "probabilistic",
   8      "complexity": {"time": "O(n * d * log(n))", "space": "O(n * d)"},
   9      "strengths": [
  10        "Parameter-free and highly interpretable",
  11        "Fast computation with parallelization support",
  12        "Strong benchmark performance across diverse datasets",
  13        "No assumption on data distribution"
  14      ],
  15      "weaknesses": [
  16        "May struggle with strongly correlated features",
  17        "Assumes outliers deviate in at least one marginal dimension"
  18      ],
  19      "best_for": "General-purpose outlier detection when speed and interpretability are priorities",
  20      "avoid_when": "Features are heavily correlated and outliers only manifest in joint distributions",
  21      "benchmark_refs": ["ADBench"],
  22      "benchmark_rank": {"ADBench_overall": 5},
  23      "paper": {"id": "ecod", "short": "Li et al., TKDE 2022"},
  24      "default_params": {"contamination": 0.1, "n_jobs": 1},
  25      "preprocessing_mode": "external",
  26      "requires": [],
  27      "version_added": "0.9.0"
  28    },
  29    "ABOD": {
  30      "class_path": "pyod.models.abod.ABOD",
  31      "full_name": "Angle-Based Outlier Detection",
  32      "status": "shipped",
  33      "data_types": ["tabular"],
  34      "category": "probabilistic",
  35      "complexity": {"time": "O(n^2 * d) for fast, O(n^3 * d) for default", "space": "O(n^2)"},
  36      "strengths": [
  37        "Effective in high-dimensional spaces",
  38        "Not affected by distance concentration in high dimensions",
  39        "Good theoretical foundation"
  40      ],
  41      "weaknesses": [
  42        "Computationally expensive, especially in default mode",
  43        "Sensitive to noise in low-dimensional data",
  44        "Requires sufficient neighbors for angle variance estimation"
  45      ],
  46      "best_for": "High-dimensional datasets where distance-based methods suffer from curse of dimensionality",
  47      "avoid_when": "Dataset is very large or low-dimensional where distance-based methods work well",
  48      "benchmark_refs": ["ADBench"],
  49      "benchmark_rank": {},
  50      "paper": {"id": "abod", "short": "Kriegel et al., KDD 2008"},
  51      "default_params": {"contamination": 0.1, "n_neighbors": 5, "method": "fast"},
  52      "preprocessing_mode": "external",
  53      "requires": [],
  54      "version_added": "0.5.0"
  55    },
  56    "COPOD": {
  57      "class_path": "pyod.models.copod.COPOD",
  58      "full_name": "Copula-Based Outlier Detection",
  59      "status": "shipped",
  60      "data_types": ["tabular"],
  61      "category": "probabilistic",
  62      "complexity": {"time": "O(n * d * log(n))", "space": "O(n * d)"},
  63      "strengths": [
  64        "Parameter-free and highly interpretable",
  65        "Fast computation with parallelization support",
  66        "Models tail probabilities via empirical copulas"
  67      ],
  68      "weaknesses": [
  69        "Assumes feature-wise independence for tail modeling",
  70        "May miss complex multivariate interactions"
  71      ],
  72      "best_for": "Large-scale datasets where speed and interpretability matter and features are roughly independent",
  73      "avoid_when": "Outliers only appear in joint distributions with strong feature dependencies",
  74      "benchmark_refs": ["ADBench"],
  75      "benchmark_rank": {"ADBench_overall": 6},
  76      "paper": {"id": "copod", "short": "Li et al., ICDM 2020"},
  77      "default_params": {"contamination": 0.1, "n_jobs": 1},
  78      "preprocessing_mode": "external",
  79      "requires": [],
  80      "version_added": "0.7.0"
  81    },
  82    "MAD": {
  83      "class_path": "pyod.models.mad.MAD",
  84      "full_name": "Median Absolute Deviation",
  85      "status": "shipped",
  86      "data_types": ["tabular"],
  87      "category": "probabilistic",
  88      "complexity": {"time": "O(n * log(n))", "space": "O(n)"},
  89      "strengths": [
  90        "Extremely simple and fast",
  91        "Robust to outliers in the training data",
  92        "Well-suited for univariate data"
  93      ],
  94      "weaknesses": [
  95        "Only works on univariate data",
  96        "Cannot capture multivariate relationships"
  97      ],
  98      "best_for": "Univariate outlier detection with a robust central tendency measure",
  99      "avoid_when": "Data is multivariate or relationships between features are important",
 100      "benchmark_refs": [],
 101      "benchmark_rank": {},
 102      "paper": {"id": "mad", "short": "Iglewicz and Hoaglin, 1993"},
 103      "default_params": {"threshold": 3.5, "contamination": 0.1},
 104      "preprocessing_mode": "external",
 105      "requires": [],
 106      "version_added": "0.7.5"
 107    },
 108    "SOS": {
 109      "class_path": "pyod.models.sos.SOS",
 110      "full_name": "Stochastic Outlier Selection",
 111      "status": "shipped",
 112      "data_types": ["tabular"],
 113      "category": "probabilistic",
 114      "complexity": {"time": "O(n^2 * d)", "space": "O(n^2)"},
 115      "strengths": [
 116        "Based on affinity and binding probability concepts",
 117        "Produces meaningful probability-like outlier scores",
 118        "Perplexity parameter controls local vs global sensitivity"
 119      ],
 120      "weaknesses": [
 121        "Quadratic complexity limits scalability",
 122        "Sensitive to perplexity parameter choice"
 123      ],
 124      "best_for": "Medium-sized datasets where probability-calibrated outlier scores are valuable",
 125      "avoid_when": "Dataset is very large (>10K samples) due to quadratic complexity",
 126      "benchmark_refs": [],
 127      "benchmark_rank": {},
 128      "paper": {"id": "sos", "short": "Janssens et al., 2012"},
 129      "default_params": {"contamination": 0.1, "perplexity": 4.5, "metric": "euclidean"},
 130      "preprocessing_mode": "external",
 131      "requires": [],
 132      "version_added": "0.6.0"
 133    },
 134    "QMCD": {
 135      "class_path": "pyod.models.qmcd.QMCD",
 136      "full_name": "Quasi-Monte Carlo Discrepancy",
 137      "status": "shipped",
 138      "data_types": ["tabular"],
 139      "category": "probabilistic",
 140      "complexity": {"time": "O(n^2 * d)", "space": "O(n * d)"},
 141      "strengths": [
 142        "Based on uniformity criterion from quasi-Monte Carlo theory",
 143        "Parameter-free (only contamination needed)",
 144        "Theoretically grounded in discrepancy measures"
 145      ],
 146      "weaknesses": [
 147        "Quadratic time complexity",
 148        "Assumes outliers deviate from uniform space-filling"
 149      ],
 150      "best_for": "Detecting anomalies as deviations from uniform space-filling in moderate-dimensional data",
 151      "avoid_when": "Dataset is very large or high-dimensional",
 152      "benchmark_refs": [],
 153      "benchmark_rank": {},
 154      "paper": {"id": "qmcd", "short": "Fang et al., 2001"},
 155      "default_params": {"contamination": 0.1},
 156      "preprocessing_mode": "external",
 157      "requires": [],
 158      "version_added": "0.9.0"
 159    },
 160    "KDE": {
 161      "class_path": "pyod.models.kde.KDE",
 162      "full_name": "Kernel Density Estimation",
 163      "status": "shipped",
 164      "data_types": ["tabular"],
 165      "category": "probabilistic",
 166      "complexity": {"time": "O(n^2 * d)", "space": "O(n * d)"},
 167      "strengths": [
 168        "Non-parametric density estimation",
 169        "Multiple kernel choices available",
 170        "Scores directly reflect density"
 171      ],
 172      "weaknesses": [
 173        "Quadratic prediction time",
 174        "Sensitive to bandwidth and kernel selection",
 175        "Suffers from curse of dimensionality"
 176      ],
 177      "best_for": "Low-to-moderate dimensional data where non-parametric density estimation is desired",
 178      "avoid_when": "Data is high-dimensional or dataset is very large",
 179      "benchmark_refs": [],
 180      "benchmark_rank": {},
  181      "paper": {"id": "kde", "short": "Latecki et al., MLDM 2007"},
 182      "default_params": {"contamination": 0.1},
 183      "preprocessing_mode": "external",
 184      "requires": [],
 185      "version_added": "0.6.5"
 186    },
 187    "Sampling": {
 188      "class_path": "pyod.models.sampling.Sampling",
 189      "full_name": "Rapid Distance-Based Outlier Detection via Sampling",
 190      "status": "shipped",
 191      "data_types": ["tabular"],
 192      "category": "probabilistic",
 193      "complexity": {"time": "O(n * s * d) where s is subset size", "space": "O(n * d)"},
 194      "strengths": [
 195        "Linear time complexity through sampling",
 196        "Scalable to large datasets",
 197        "Simple and effective approach"
 198      ],
 199      "weaknesses": [
 200        "Randomness from sampling can affect reproducibility",
 201        "May miss local outliers with small subset sizes"
 202      ],
 203      "best_for": "Large-scale datasets requiring fast distance-based outlier detection",
 204      "avoid_when": "Precise and deterministic results are required or dataset is small enough for exact methods",
 205      "benchmark_refs": [],
 206      "benchmark_rank": {},
 207      "paper": {"id": "sampling", "short": "Sugiyama and Borgwardt, 2013"},
 208      "default_params": {"contamination": 0.1},
 209      "preprocessing_mode": "external",
 210      "requires": [],
 211      "version_added": "0.7.5"
 212    },
 213    "GMM": {
 214      "class_path": "pyod.models.gmm.GMM",
 215      "full_name": "Gaussian Mixture Model",
 216      "status": "shipped",
 217      "data_types": ["tabular"],
 218      "category": "probabilistic",
 219      "complexity": {"time": "O(n * k * d^2) per EM iteration", "space": "O(k * d^2)"},
 220      "strengths": [
 221        "Models complex multi-modal distributions",
 222        "Soft clustering provides probabilistic scores",
 223        "Flexible covariance structures"
 224      ],
 225      "weaknesses": [
 226        "Sensitive to initialization and number of components",
 227        "Assumes Gaussian component distributions",
 228        "EM convergence not guaranteed to global optimum"
 229      ],
 230      "best_for": "Data with multi-modal distributions that can be approximated by Gaussian mixtures",
 231      "avoid_when": "Data does not follow mixture-of-Gaussians assumption or is very high-dimensional",
 232      "benchmark_refs": [],
 233      "benchmark_rank": {},
 234      "paper": {"id": "gmm", "short": "Aggarwal, 2017"},
 235      "default_params": {"contamination": 0.1},
 236      "preprocessing_mode": "external",
 237      "requires": [],
 238      "version_added": "0.9.8"
 239    },
 240    "PCA": {
 241      "class_path": "pyod.models.pca.PCA",
 242      "full_name": "Principal Component Analysis",
 243      "status": "shipped",
 244      "data_types": ["tabular"],
 245      "category": "linear",
 246      "complexity": {"time": "O(n * d^2)", "space": "O(d^2)"},
 247      "strengths": [
 248        "Fast training and prediction",
 249        "Well-understood linear model",
 250        "Effective when anomalies lie in minor principal components"
 251      ],
 252      "weaknesses": [
 253        "Assumes linear relationships",
 254        "Sensitive to the number of components selected",
 255        "Cannot capture nonlinear anomalies"
 256      ],
 257      "best_for": "Datasets with linear structure where outliers deviate from main variance directions",
 258      "avoid_when": "Data has strong nonlinear structure or outliers align with principal components",
 259      "benchmark_refs": [],
 260      "benchmark_rank": {},
 261      "paper": {"id": "pca", "short": "Shyu et al., 2003"},
 262      "default_params": {"contamination": 0.1},
 263      "preprocessing_mode": "external",
 264      "requires": [],
 265      "version_added": "0.5.0"
 266    },
 267    "KPCA": {
 268      "class_path": "pyod.models.kpca.KPCA",
 269      "full_name": "Kernel Principal Component Analysis",
 270      "status": "shipped",
 271      "data_types": ["tabular"],
 272      "category": "linear",
 273      "complexity": {"time": "O(n^2 * d)", "space": "O(n^2)"},
 274      "strengths": [
 275        "Captures nonlinear relationships via kernel trick",
 276        "Extension of PCA to nonlinear manifolds",
 277        "Flexible kernel choices"
 278      ],
 279      "weaknesses": [
 280        "Quadratic memory and time complexity",
 281        "Kernel and hyperparameter selection required",
 282        "Slower than linear PCA"
 283      ],
 284      "best_for": "Moderately sized datasets with nonlinear structure",
 285      "avoid_when": "Dataset is very large due to quadratic kernel matrix or a linear model suffices",
 286      "benchmark_refs": [],
 287      "benchmark_rank": {},
 288      "paper": {"id": "kpca", "short": "Hoffmann, 2007"},
 289      "default_params": {"contamination": 0.1},
 290      "preprocessing_mode": "external",
 291      "requires": [],
 292      "version_added": "0.8.0"
 293    },
 294    "MCD": {
 295      "class_path": "pyod.models.mcd.MCD",
 296      "full_name": "Minimum Covariance Determinant",
 297      "status": "shipped",
 298      "data_types": ["tabular"],
 299      "category": "linear",
 300      "complexity": {"time": "O(n * d^2)", "space": "O(d^2)"},
 301      "strengths": [
 302        "Robust estimation of covariance matrix",
 303        "Well-established statistical method",
 304        "Resistant to masking effects from outliers"
 305      ],
 306      "weaknesses": [
 307        "Assumes data follows a Gaussian distribution",
 308        "May fail in high-dimensional settings (d > n)",
 309        "Computationally intensive for large d"
 310      ],
 311      "best_for": "Multivariate Gaussian-like data requiring robust covariance estimation",
 312      "avoid_when": "Data is non-Gaussian, very high-dimensional, or strongly nonlinear",
 313      "benchmark_refs": [],
 314      "benchmark_rank": {},
 315      "paper": {"id": "mcd", "short": "Rousseeuw and Driessen, 1999"},
 316      "default_params": {"contamination": 0.1},
 317      "preprocessing_mode": "external",
 318      "requires": [],
 319      "version_added": "0.5.0"
 320    },
 321    "CD": {
 322      "class_path": "pyod.models.cd.CD",
 323      "full_name": "Cook's Distance",
 324      "status": "shipped",
 325      "data_types": ["tabular"],
 326      "category": "linear",
 327      "complexity": {"time": "O(n * d^2)", "space": "O(n * d)"},
 328      "strengths": [
 329        "Based on classical influence measures from regression",
 330        "Interpretable as influence on regression fit",
 331        "Works well when a regression model is appropriate"
 332      ],
 333      "weaknesses": [
 334        "Requires a target variable (semi-supervised framing)",
 335        "Assumes linear regression model",
 336        "Not suitable for purely unsupervised settings without a model"
 337      ],
 338      "best_for": "Identifying influential observations in regression settings",
 339      "avoid_when": "No natural target variable exists or data is purely unsupervised",
 340      "benchmark_refs": [],
 341      "benchmark_rank": {},
 342      "paper": {"id": "cd", "short": "Cook, 1977"},
 343      "default_params": {"contamination": 0.1},
 344      "preprocessing_mode": "external",
 345      "requires": [],
 346      "version_added": "0.8.5"
 347    },
 348    "OCSVM": {
 349      "class_path": "pyod.models.ocsvm.OCSVM",
 350      "full_name": "One-Class Support Vector Machine",
 351      "status": "shipped",
 352      "data_types": ["tabular"],
 353      "category": "linear",
 354      "complexity": {"time": "O(n^2 * d) to O(n^3)", "space": "O(n * d)"},
 355      "strengths": [
 356        "Flexible boundary via kernel functions",
 357        "Strong theoretical foundation from SVM literature",
 358        "Effective in high-dimensional feature spaces"
 359      ],
 360      "weaknesses": [
 361        "Cubic training time for large datasets",
 362        "Sensitive to kernel and nu parameter choice",
 363        "Does not scale well beyond ~10K samples"
 364      ],
 365      "best_for": "Medium-sized datasets where a flexible decision boundary is needed",
 366      "avoid_when": "Dataset is very large or real-time training is required",
 367      "benchmark_refs": ["ADBench"],
 368      "benchmark_rank": {},
 369      "paper": {"id": "ocsvm", "short": "Scholkopf et al., 2001"},
 370      "default_params": {"contamination": 0.1, "kernel": "rbf"},
 371      "preprocessing_mode": "external",
 372      "requires": [],
 373      "version_added": "0.5.0"
 374    },
 375    "LMDD": {
 376      "class_path": "pyod.models.lmdd.LMDD",
 377      "full_name": "Linear Model Deviation-based Detection",
 378      "status": "shipped",
 379      "data_types": ["tabular"],
 380      "category": "linear",
 381      "complexity": {"time": "O(n_iter * n * d)", "space": "O(n * d)"},
 382      "strengths": [
 383        "Multiple deviation measures available (AAD, IQR, MAD)",
 384        "Random subspace approach provides diversity",
 385        "Interpretable linear projections"
 386      ],
 387      "weaknesses": [
 388        "Results can vary due to random projections",
 389        "Requires tuning of number of iterations and measure",
 390        "May underperform with nonlinear anomaly patterns"
 391      ],
 392      "best_for": "Multivariate data where anomalies are detectable through linear projections",
 393      "avoid_when": "Anomalies require nonlinear feature combinations to detect",
 394      "benchmark_refs": [],
 395      "benchmark_rank": {},
 396      "paper": {"id": "lmdd", "short": "Arning et al., KDD 1996"},
 397      "default_params": {"contamination": 0.1, "n_iter": 50, "dis_measure": "aad"},
 398      "preprocessing_mode": "external",
 399      "requires": [],
 400      "version_added": "0.8.0"
 401    },
 402    "LOF": {
 403      "class_path": "pyod.models.lof.LOF",
 404      "full_name": "Local Outlier Factor",
 405      "status": "shipped",
 406      "data_types": ["tabular"],
 407      "category": "proximity",
 408      "complexity": {"time": "O(n^2 * d)", "space": "O(n * d)"},
 409      "strengths": [
 410        "Detects local density-based outliers",
 411        "Adapts to varying densities across the dataset",
 412        "Well-established and widely used"
 413      ],
 414      "weaknesses": [
 415        "Quadratic complexity limits scalability",
 416        "Sensitive to n_neighbors parameter",
 417        "Difficulty with uniformly dense regions"
 418      ],
 419      "best_for": "Datasets with clusters of varying densities where local anomalies are of interest",
 420      "avoid_when": "Dataset is very large (>50K) or data is uniformly distributed",
 421      "benchmark_refs": ["ADBench"],
 422      "benchmark_rank": {},
 423      "paper": {"id": "lof", "short": "Breunig et al., SIGMOD 2000"},
 424      "default_params": {"contamination": 0.1, "n_neighbors": 20, "algorithm": "auto"},
 425      "preprocessing_mode": "external",
 426      "requires": [],
 427      "version_added": "0.5.0"
 428    },
 429    "COF": {
 430      "class_path": "pyod.models.cof.COF",
 431      "full_name": "Connectivity-Based Outlier Factor",
 432      "status": "shipped",
 433      "data_types": ["tabular"],
 434      "category": "proximity",
 435      "complexity": {"time": "O(n^2 * d)", "space": "O(n^2)"},
 436      "strengths": [
 437        "Considers connectivity-based distances",
 438        "Better than LOF for certain pattern structures",
 439        "Handles low-density patterns in neighborhood chains"
 440      ],
 441      "weaknesses": [
 442        "Quadratic time and space complexity",
 443        "Slow for large datasets",
 444        "Marginal improvement over LOF in many cases"
 445      ],
 446      "best_for": "Datasets where outliers lie along sparse connectivity paths between clusters",
 447      "avoid_when": "Dataset is large or simpler methods like LOF already perform well",
 448      "benchmark_refs": [],
 449      "benchmark_rank": {},
 450      "paper": {"id": "cof", "short": "Tang et al., 2002"},
 451      "default_params": {"contamination": 0.1, "n_neighbors": 20, "method": "fast"},
 452      "preprocessing_mode": "external",
 453      "requires": [],
 454      "version_added": "0.6.5"
 455    },
 456    "CBLOF": {
 457      "class_path": "pyod.models.cblof.CBLOF",
 458      "full_name": "Cluster-Based Local Outlier Factor",
 459      "status": "shipped",
 460      "data_types": ["tabular"],
 461      "category": "proximity",
 462      "complexity": {"time": "O(n * k * d)", "space": "O(n * d)"},
 463      "strengths": [
 464        "Combines clustering with local outlier scoring",
 465        "Efficient through cluster-based summarization",
 466        "Effective when data has clear cluster structure"
 467      ],
 468      "weaknesses": [
 469        "Depends on quality of underlying clustering",
 470        "Sensitive to number of clusters parameter",
 471        "May fail if data does not cluster well"
 472      ],
 473      "best_for": "Data with well-separated clusters where outliers deviate from cluster structure",
 474      "avoid_when": "Data has no meaningful cluster structure or clusters are heavily overlapping",
 475      "benchmark_refs": ["ADBench"],
 476      "benchmark_rank": {},
 477      "paper": {"id": "cblof", "short": "He et al., 2003"},
 478      "default_params": {"contamination": 0.1, "n_clusters": 8},
 479      "preprocessing_mode": "external",
 480      "requires": [],
 481      "version_added": "0.5.0"
 482    },
 483    "LOCI": {
 484      "class_path": "pyod.models.loci.LOCI",
 485      "full_name": "Local Correlation Integral",
 486      "status": "shipped",
 487      "data_types": ["tabular"],
 488      "category": "proximity",
 489      "complexity": {"time": "O(n^2 * d)", "space": "O(n^2)"},
 490      "strengths": [
 491        "Automatic determination of outlier threshold",
 492        "Multi-granularity outlier detection via alpha parameter",
 493        "Does not require explicit k-neighbor parameter"
 494      ],
 495      "weaknesses": [
 496        "Quadratic complexity makes it slow",
 497        "Difficult to scale to large datasets",
 498        "Parameter alpha requires tuning"
 499      ],
 500      "best_for": "Small to medium datasets where automatic threshold selection is valued",
 501      "avoid_when": "Dataset is large or faster LOF-based methods are sufficient",
 502      "benchmark_refs": [],
 503      "benchmark_rank": {},
 504      "paper": {"id": "loci", "short": "Papadimitriou et al., ICDE 2003"},
 505      "default_params": {"contamination": 0.1, "alpha": 0.5, "k": 3},
 506      "preprocessing_mode": "external",
 507      "requires": [],
 508      "version_added": "0.6.0"
 509    },
 510    "HBOS": {
 511      "class_path": "pyod.models.hbos.HBOS",
 512      "full_name": "Histogram-Based Outlier Score",
 513      "status": "shipped",
 514      "data_types": ["tabular"],
 515      "category": "proximity",
 516      "complexity": {"time": "O(n * d)", "space": "O(n_bins * d)"},
 517      "strengths": [
 518        "Extremely fast - linear time complexity",
 519        "Simple histogram-based approach",
 520        "Scales well to large datasets"
 521      ],
 522      "weaknesses": [
 523        "Assumes feature independence",
 524        "Cannot capture multivariate interactions",
 525        "Sensitive to number of bins"
 526      ],
 527      "best_for": "Large-scale datasets where speed is critical and features are roughly independent",
 528      "avoid_when": "Outliers only manifest through feature interactions or correlations",
 529      "benchmark_refs": ["ADBench"],
 530      "benchmark_rank": {"ADBench_overall": 7},
 531      "paper": {"id": "hbos", "short": "Goldstein and Dengel, KI 2012"},
 532      "default_params": {"contamination": 0.1, "n_bins": 10, "alpha": 0.1, "tol": 0.5},
 533      "preprocessing_mode": "external",
 534      "requires": [],
 535      "version_added": "0.5.0"
 536    },
 537    "HDBSCAN": {
 538      "class_path": "pyod.models.hdbscan.HDBSCAN",
 539      "full_name": "Hierarchical Density-Based Spatial Clustering of Applications with Noise",
 540      "status": "shipped",
 541      "data_types": ["tabular"],
 542      "category": "proximity",
 543      "complexity": {"time": "O(n * log(n)) to O(n^2)", "space": "O(n)"},
 544      "strengths": [
 545        "Automatically finds clusters of varying density",
 546        "Identifies noise points as potential outliers",
 547        "Minimal parameter tuning required"
 548      ],
 549      "weaknesses": [
 550        "Performance depends on min_cluster_size choice",
 551        "May be slow for very large datasets",
 552        "Outlier scores derived from cluster membership"
 553      ],
 554      "best_for": "Datasets with variable-density clusters where noise points are outliers",
 555      "avoid_when": "Data does not have cluster structure or very high-dimensional",
 556      "benchmark_refs": [],
 557      "benchmark_rank": {},
 558      "paper": {"id": "hdbscan", "short": "Campello et al., PAKDD 2013"},
 559      "default_params": {"min_cluster_size": 5},
 560      "preprocessing_mode": "external",
 561      "requires": [],
 562      "version_added": "0.9.8"
 563    },
 564    "KNN": {
 565      "class_path": "pyod.models.knn.KNN",
 566      "full_name": "K-Nearest Neighbors Outlier Detection",
 567      "status": "shipped",
 568      "data_types": ["tabular"],
 569      "category": "proximity",
 570      "complexity": {"time": "O(n^2 * d)", "space": "O(n * d)"},
 571      "strengths": [
 572        "Simple and intuitive distance-based method",
 573        "Multiple scoring methods (largest, mean, median)",
 574        "Strong benchmark performance"
 575      ],
 576      "weaknesses": [
 577        "Quadratic complexity limits scalability",
 578        "Sensitive to distance metric and k choice",
 579        "Struggles with varying density clusters"
 580      ],
 581      "best_for": "General-purpose distance-based outlier detection on moderate-sized datasets",
 582      "avoid_when": "Dataset is very large or has highly variable local densities",
 583      "benchmark_refs": ["ADBench"],
 584      "benchmark_rank": {"ADBench_overall": 4},
 585      "paper": {"id": "knn", "short": "Ramaswamy et al., SIGMOD 2000"},
 586      "default_params": {"contamination": 0.1, "n_neighbors": 5, "method": "largest"},
 587      "preprocessing_mode": "external",
 588      "requires": [],
 589      "version_added": "0.5.0"
 590    },
 591    "SOD": {
 592      "class_path": "pyod.models.sod.SOD",
 593      "full_name": "Subspace Outlier Detection",
 594      "status": "shipped",
 595      "data_types": ["tabular"],
 596      "category": "proximity",
 597      "complexity": {"time": "O(n^2 * d)", "space": "O(n * d)"},
 598      "strengths": [
 599        "Detects outliers in axis-parallel subspaces",
 600        "Effective when anomalies hide in subspaces",
 601        "Reference set approach captures local structure"
 602      ],
 603      "weaknesses": [
 604        "Quadratic complexity",
 605        "Requires tuning of ref_set and n_neighbors",
 606        "May miss anomalies not aligned with axis-parallel subspaces"
 607      ],
 608      "best_for": "High-dimensional data where outliers deviate in axis-parallel subspaces",
 609      "avoid_when": "Data is low-dimensional or anomalies require oblique subspaces to detect",
 610      "benchmark_refs": [],
 611      "benchmark_rank": {},
 612      "paper": {"id": "sod", "short": "Kriegel et al., PAKDD 2009"},
 613      "default_params": {"contamination": 0.1, "n_neighbors": 20, "ref_set": 10},
 614      "preprocessing_mode": "external",
 615      "requires": [],
 616      "version_added": "0.6.0"
 617    },
 618    "ROD": {
 619      "class_path": "pyod.models.rod.ROD",
 620      "full_name": "Rotation-Based Outlier Detection",
 621      "status": "shipped",
 622      "data_types": ["tabular"],
 623      "category": "proximity",
 624      "complexity": {"time": "O(n * d^2)", "space": "O(n * d)"},
 625      "strengths": [
 626        "Considers rotations to detect outliers across projections",
 627        "Parameter-free (only contamination needed)",
 628        "Supports parallel execution"
 629      ],
 630      "weaknesses": [
 631        "Designed primarily for 3D data, may not generalize well to very high dimensions",
 632        "Limited theoretical analysis compared to classical methods"
 633      ],
 634      "best_for": "Low-dimensional data where rotation-invariant outlier detection is desired",
 635      "avoid_when": "Data is high-dimensional or a well-tuned alternative is available",
 636      "benchmark_refs": [],
 637      "benchmark_rank": {},
 638      "paper": {"id": "rod", "short": "Almardeny et al., 2020"},
 639      "default_params": {"contamination": 0.1, "parallel_execution": false},
 640      "preprocessing_mode": "external",
 641      "requires": [],
 642      "version_added": "0.8.5"
 643    },
 644    "IForest": {
 645      "class_path": "pyod.models.iforest.IForest",
 646      "full_name": "Isolation Forest",
 647      "status": "shipped",
 648      "data_types": ["tabular"],
 649      "category": "ensemble",
 650      "complexity": {"time": "O(n * t * log(n)) where t is n_estimators", "space": "O(t * n)"},
 651      "strengths": [
 652        "Excellent overall benchmark performance",
 653        "Linear time complexity with efficient implementation",
 654        "Handles high-dimensional data well",
 655        "Does not require distance or density computation"
 656      ],
 657      "weaknesses": [
 658        "May struggle with local anomalies in dense regions",
 659        "Axis-aligned splits can miss anomalies in correlated features"
 660      ],
 661      "best_for": "General-purpose anomaly detection especially on large or high-dimensional datasets",
 662      "avoid_when": "Anomalies are local density deviations or features are strongly correlated",
 663      "benchmark_refs": ["ADBench"],
 664      "benchmark_rank": {"ADBench_overall": 3},
 665      "paper": {"id": "iforest", "short": "Liu et al., ICDM 2008"},
 666      "default_params": {"contamination": 0.1, "n_estimators": 100},
 667      "preprocessing_mode": "external",
 668      "requires": [],
 669      "version_added": "0.5.0"
 670    },
 671    "INNE": {
 672      "class_path": "pyod.models.inne.INNE",
 673      "full_name": "Isolation-based Anomaly Detection Using Nearest-Neighbor Ensembles",
 674      "status": "shipped",
 675      "data_types": ["tabular"],
 676      "category": "ensemble",
 677      "complexity": {"time": "O(n * t * s) where t is n_estimators, s is sample size", "space": "O(t * s)"},
 678      "strengths": [
 679        "Combines isolation and nearest-neighbor concepts",
 680        "Handles local anomalies better than Isolation Forest",
 681        "Efficient sampling-based approach"
 682      ],
 683      "weaknesses": [
 684        "Requires tuning of sample size and number of estimators",
 685        "Less established than Isolation Forest"
 686      ],
 687      "best_for": "Datasets where local density variations matter and Isolation Forest underperforms",
 688      "avoid_when": "A simpler method like IForest already works well",
 689      "benchmark_refs": [],
 690      "benchmark_rank": {},
  691      "paper": {"id": "inne", "short": "Bandaragoda et al., Computational Intelligence 2018"},
 692      "default_params": {"contamination": 0.1},
 693      "preprocessing_mode": "external",
 694      "requires": [],
 695      "version_added": "0.9.5"
 696    },
 697    "DIF": {
 698      "class_path": "pyod.models.dif.DIF",
 699      "full_name": "Deep Isolation Forest",
 700      "status": "shipped",
 701      "data_types": ["tabular"],
 702      "category": "ensemble",
 703      "complexity": {"time": "O(n * t * d * log(n))", "space": "O(t * n)"},
 704      "strengths": [
 705        "Uses deep random representations for better isolation",
 706        "Handles complex data distributions",
 707        "Extends isolation forest to representation space"
 708      ],
 709      "weaknesses": [
 710        "More computationally expensive than standard IForest",
 711        "Requires tuning of representation parameters"
 712      ],
 713      "best_for": "Complex datasets where standard Isolation Forest misses anomalies due to axis-aligned splits",
 714      "avoid_when": "Standard Isolation Forest performs well or dataset is small and simple",
 715      "benchmark_refs": [],
 716      "benchmark_rank": {},
 717      "paper": {"id": "dif", "short": "Xu et al., TKDE 2023"},
 718      "default_params": {"contamination": 0.1},
 719      "preprocessing_mode": "external",
 720      "requires": [],
 721      "version_added": "0.9.8"
 722    },
 723    "FeatureBagging": {
 724      "class_path": "pyod.models.feature_bagging.FeatureBagging",
 725      "full_name": "Feature Bagging Outlier Detection",
 726      "status": "shipped",
 727      "data_types": ["tabular"],
 728      "category": "ensemble",
 729      "complexity": {"time": "O(n_estimators * base_detector_time)", "space": "O(n_estimators * base_detector_space)"},
 730      "strengths": [
 731        "Reduces variance through feature subsampling",
 732        "Flexible with any base detector",
 733        "Robust against irrelevant features"
 734      ],
 735      "weaknesses": [
 736        "Performance depends on choice of base detector",
 737        "Slower than single detector due to ensemble overhead",
 738        "May not improve over base if features are all relevant"
 739      ],
 740      "best_for": "High-dimensional data with potentially irrelevant features",
 741      "avoid_when": "All features are relevant or a single strong detector suffices",
 742      "benchmark_refs": [],
 743      "benchmark_rank": {},
 744      "paper": {"id": "feature_bagging", "short": "Lazarevic and Kumar, KDD 2005"},
 745      "default_params": {"contamination": 0.1, "n_estimators": 10},
 746      "preprocessing_mode": "external",
 747      "requires": ["combo"],
 748      "version_added": "0.5.0"
 749    },
 750    "LSCP": {
 751      "class_path": "pyod.models.lscp.LSCP",
 752      "full_name": "Locally Selective Combination of Parallel Outlier Ensembles",
 753      "status": "shipped",
 754      "data_types": ["tabular"],
 755      "category": "ensemble",
 756      "complexity": {"time": "O(n * n_detectors * base_cost)", "space": "O(n * n_detectors)"},
 757      "strengths": [
 758        "Locally selects the best detector for each region",
 759        "Leverages diversity among base detectors",
 760        "Adaptive combination strategy"
 761      ],
 762      "weaknesses": [
 763        "Requires a list of pre-instantiated base detectors",
 764        "Slower due to training multiple detectors",
 765        "Complex internal selection mechanism"
 766      ],
 767      "best_for": "Scenarios where diverse base detectors are available and local performance varies",
 768      "avoid_when": "Only one detector type is appropriate or computational budget is limited",
 769      "benchmark_refs": [],
 770      "benchmark_rank": {},
 771      "paper": {"id": "lscp", "short": "Zhao et al., SDM 2019"},
 772      "default_params": {"local_region_size": 30},
 773      "preprocessing_mode": "external",
 774      "requires": [],
 775      "version_added": "0.6.5"
 776    },
 777    "LODA": {
 778      "class_path": "pyod.models.loda.LODA",
 779      "full_name": "Lightweight Online Detector of Anomalies",
 780      "status": "shipped",
 781      "data_types": ["tabular"],
 782      "category": "ensemble",
 783      "complexity": {"time": "O(n * n_cuts * d)", "space": "O(n_bins * n_cuts)"},
 784      "strengths": [
 785        "Lightweight and fast",
 786        "Supports online/streaming updates",
 787        "Uses ensemble of random projections"
 788      ],
 789      "weaknesses": [
 790        "Random projections may miss certain anomaly patterns",
 791        "Histogram-based scoring can be coarse",
 792        "Sensitive to number of bins and cuts"
 793      ],
 794      "best_for": "Streaming or online anomaly detection with limited computational resources",
 795      "avoid_when": "Batch setting with enough time for more powerful methods",
 796      "benchmark_refs": [],
 797      "benchmark_rank": {},
 798      "paper": {"id": "loda", "short": "Pevny, 2016"},
 799      "default_params": {"contamination": 0.1, "n_bins": 10, "n_random_cuts": 100},
 800      "preprocessing_mode": "external",
 801      "requires": [],
 802      "version_added": "0.6.0"
 803    },
 804    "SUOD": {
 805      "class_path": "pyod.models.suod.SUOD",
 806      "full_name": "Scalable Unsupervised Outlier Detection",
 807      "status": "shipped",
 808      "data_types": ["tabular"],
 809      "category": "ensemble",
 810      "complexity": {"time": "varies (depends on base estimators)", "space": "varies"},
 811      "strengths": [
 812        "Accelerates large-scale outlier detection via approximation",
 813        "Supports parallel execution of base detectors",
 814        "Modular framework for combining multiple detectors"
 815      ],
 816      "weaknesses": [
 817        "Approximation may reduce accuracy",
 818        "Overhead from the acceleration framework",
 819        "Requires choosing and configuring base estimators"
 820      ],
 821      "best_for": "Large-scale datasets where running multiple detectors is desired but time is limited",
 822      "avoid_when": "Exact results from a single well-chosen detector are preferred",
 823      "benchmark_refs": [],
 824      "benchmark_rank": {},
 825      "paper": {"id": "suod", "short": "Zhao et al., MLSys 2021"},
 826      "default_params": {"contamination": 0.1},
 827      "preprocessing_mode": "external",
 828      "requires": ["suod"],
 829      "version_added": "0.8.0"
 830    },
 831    "XGBOD": {
 832      "class_path": "pyod.models.xgbod.XGBOD",
 833      "full_name": "Extreme Gradient Boosting Outlier Detection",
 834      "status": "shipped",
 835      "data_types": ["tabular"],
 836      "category": "ensemble",
 837      "complexity": {"time": "O(n * d * n_estimators * log(n))", "space": "O(n * d)"},
 838      "strengths": [
 839        "Combines unsupervised representations with supervised XGBoost",
 840        "High accuracy when labels are available",
 841        "Leverages powerful gradient boosting framework"
 842      ],
 843      "weaknesses": [
 844        "Requires labeled data (semi-supervised)",
 845        "Depends on XGBoost library",
 846        "Training is more expensive than unsupervised methods"
 847      ],
 848      "best_for": "Semi-supervised settings where some labeled anomalies are available",
 849      "avoid_when": "No labeled data is available or a purely unsupervised approach is needed",
 850      "benchmark_refs": [],
 851      "benchmark_rank": {},
 852      "paper": {"id": "xgbod", "short": "Zhao and Hryniewicki, IJCNN 2018"},
 853      "default_params": {},
 854      "preprocessing_mode": "external",
 855      "requires": ["xgboost"],
 856      "version_added": "0.5.0"
 857    },
 858    "AutoEncoder": {
 859      "class_path": "pyod.models.auto_encoder.AutoEncoder",
 860      "full_name": "Fully Connected AutoEncoder",
 861      "status": "shipped",
 862      "data_types": ["tabular"],
 863      "category": "deep_learning",
 864      "complexity": {"time": "O(n * d * h * epochs) where h is hidden size", "space": "O(d * h)"},
 865      "strengths": [
 866        "Learns nonlinear feature representations",
 867        "Reconstruction error is an intuitive anomaly score",
 868        "Flexible architecture"
 869      ],
 870      "weaknesses": [
 871        "Requires tuning of architecture and hyperparameters",
 872        "May overfit on small datasets",
 873        "Training can be unstable"
 874      ],
 875      "best_for": "Datasets with complex nonlinear structure where reconstruction-based scoring is appropriate",
 876      "avoid_when": "Dataset is small, tabular and simple methods suffice, or training time is limited",
 877      "benchmark_refs": [],
 878      "benchmark_rank": {},
 879      "paper": {"id": "autoencoder", "short": "Aggarwal, 2017"},
 880      "default_params": {"contamination": 0.1},
 881      "preprocessing_mode": "external",
 882      "requires": ["torch"],
 883      "version_added": "0.6.0"
 884    },
 885    "VAE": {
 886      "class_path": "pyod.models.vae.VAE",
 887      "full_name": "Variational AutoEncoder",
 888      "status": "shipped",
 889      "data_types": ["tabular"],
 890      "category": "deep_learning",
 891      "complexity": {"time": "O(n * d * h * epochs)", "space": "O(d * h)"},
 892      "strengths": [
 893        "Probabilistic latent space with regularization",
 894        "Generates calibrated reconstruction probabilities",
 895        "Learns smooth latent representations"
 896      ],
 897      "weaknesses": [
 898        "More complex to train than standard autoencoder",
 899        "KL divergence term may dominate loss",
 900        "Requires careful balancing of loss components"
 901      ],
 902      "best_for": "Datasets where probabilistic reconstruction scoring and smooth latent spaces are beneficial",
 903      "avoid_when": "Simpler autoencoder or non-deep methods work well, or dataset is very small",
 904      "benchmark_refs": [],
 905      "benchmark_rank": {},
 906      "paper": {"id": "vae", "short": "Kingma and Welling, 2014"},
 907      "default_params": {"contamination": 0.1},
 908      "preprocessing_mode": "external",
 909      "requires": ["torch"],
 910      "version_added": "0.6.0"
 911    },
 912    "SO_GAAL": {
 913      "class_path": "pyod.models.so_gaal.SO_GAAL",
 914      "full_name": "Single-Objective Generative Adversarial Active Learning",
 915      "status": "shipped",
 916      "data_types": ["tabular"],
 917      "category": "deep_learning",
 918      "complexity": {"time": "O(n * d * h * epochs)", "space": "O(d * h)"},
 919      "strengths": [
 920        "GAN-based approach generates informative outlier samples",
 921        "Does not require labeled anomalies",
 922        "Novel adversarial training framework for OD"
 923      ],
 924      "weaknesses": [
 925        "GAN training instability",
 926        "Sensitive to hyperparameters and architecture",
 927        "Slow training compared to non-deep methods"
 928      ],
 929      "best_for": "Exploratory anomaly detection with GAN-generated reference outliers",
 930      "avoid_when": "Stable and fast results are required, or dataset is small",
 931      "benchmark_refs": [],
 932      "benchmark_rank": {},
 933      "paper": {"id": "so_gaal", "short": "Liu et al., 2019"},
 934      "default_params": {"contamination": 0.1},
 935      "preprocessing_mode": "external",
 936      "requires": ["torch"],
 937      "version_added": "0.6.0"
 938    },
 939    "MO_GAAL": {
 940      "class_path": "pyod.models.mo_gaal.MO_GAAL",
 941      "full_name": "Multiple-Objective Generative Adversarial Active Learning",
 942      "status": "shipped",
 943      "data_types": ["tabular"],
 944      "category": "deep_learning",
 945      "complexity": {"time": "O(k * n * d * h * epochs) where k is number of generators", "space": "O(k * d * h)"},
 946      "strengths": [
 947        "Multiple generators provide diverse outlier references",
 948        "Better coverage of outlier space than SO_GAAL",
 949        "Multi-objective formulation improves robustness"
 950      ],
 951      "weaknesses": [
 952        "Even more complex training than SO_GAAL",
 953        "Multiple generators increase computational cost",
 954        "Difficult to tune"
 955      ],
 956      "best_for": "Complex datasets where diverse generated outlier references improve detection",
 957      "avoid_when": "Computational resources are limited or simpler GAN approaches suffice",
 958      "benchmark_refs": [],
 959      "benchmark_rank": {},
 960      "paper": {"id": "mo_gaal", "short": "Liu et al., 2019"},
 961      "default_params": {"k": 10, "stop_epochs": 20, "contamination": 0.1},
 962      "preprocessing_mode": "external",
 963      "requires": ["torch"],
 964      "version_added": "0.6.0"
 965    },
 966    "DeepSVDD": {
 967      "class_path": "pyod.models.deep_svdd.DeepSVDD",
 968      "full_name": "Deep Support Vector Data Description",
 969      "status": "shipped",
 970      "data_types": ["tabular"],
 971      "category": "deep_learning",
 972      "complexity": {"time": "O(n * d * h * epochs)", "space": "O(d * h)"},
 973      "strengths": [
 974        "Learns a compact hypersphere around normal data",
 975        "Combines deep learning with SVDD objective",
 976        "Effective for one-class classification"
 977      ],
 978      "weaknesses": [
 979        "Sensitive to network architecture choices",
 980        "Risk of hypersphere collapse",
 981        "Requires careful initialization"
 982      ],
 983      "best_for": "One-class anomaly detection where a compact normal data description is desired",
 984      "avoid_when": "Normal data is multi-modal or simpler one-class methods are sufficient",
 985      "benchmark_refs": [],
 986      "benchmark_rank": {},
 987      "paper": {"id": "deep_svdd", "short": "Ruff et al., ICML 2018"},
 988      "default_params": {"contamination": 0.1},
 989      "preprocessing_mode": "external",
 990      "requires": ["torch"],
 991      "version_added": "0.7.5"
 992    },
 993    "AnoGAN": {
 994      "class_path": "pyod.models.anogan.AnoGAN",
 995      "full_name": "Anomaly Detection with Generative Adversarial Networks",
 996      "status": "shipped",
 997      "data_types": ["tabular"],
 998      "category": "deep_learning",
 999      "complexity": {"time": "O(n * d * h * epochs) + O(n * iterations) for inference", "space": "O(d * h)"},
1000      "strengths": [
1001        "GAN learns the normal data distribution",
1002        "Can detect complex non-linear anomalies",
1003        "Anomaly scoring via reconstruction in latent space"
1004      ],
1005      "weaknesses": [
1006        "GAN training instability",
1007        "Slow inference due to iterative latent optimization",
1008        "Requires significant tuning"
1009      ],
1010      "best_for": "Complex data distributions where GAN-based generation quality is high",
1011      "avoid_when": "Fast inference is needed or training instability is a concern",
1012      "benchmark_refs": [],
1013      "benchmark_rank": {},
1014      "paper": {"id": "anogan", "short": "Schlegl et al., IPMI 2017"},
1015      "default_params": {"contamination": 0.1},
1016      "preprocessing_mode": "external",
1017      "requires": ["torch"],
1018      "version_added": "0.7.5"
1019    },
1020    "ALAD": {
1021      "class_path": "pyod.models.alad.ALAD",
1022      "full_name": "Adversarially Learned Anomaly Detection",
1023      "status": "shipped",
1024      "data_types": ["tabular"],
1025      "category": "deep_learning",
1026      "complexity": {"time": "O(n * d * h * epochs)", "space": "O(d * h)"},
1027      "strengths": [
1028        "Bi-directional GAN avoids slow iterative inference of AnoGAN",
1029        "Multiple discriminators stabilize training",
1030        "Fast inference after training"
1031      ],
1032      "weaknesses": [
1033        "Complex architecture with multiple networks",
1034        "Still subject to GAN training challenges",
1035        "Many hyperparameters to tune"
1036      ],
1037      "best_for": "Scenarios where GAN-based detection is desired but fast inference is needed",
1038      "avoid_when": "Simpler reconstruction-based deep methods suffice or dataset is small",
1039      "benchmark_refs": [],
1040      "benchmark_rank": {},
1041      "paper": {"id": "alad", "short": "Zenati et al., ICDM 2018"},
1042      "default_params": {"contamination": 0.1},
1043      "preprocessing_mode": "external",
1044      "requires": ["torch"],
1045      "version_added": "0.7.5"
1046    },
1047    "AE1SVM": {
1048      "class_path": "pyod.models.ae1svm.AE1SVM",
1049      "full_name": "AutoEncoder with One-Class SVM",
1050      "status": "shipped",
1051      "data_types": ["tabular"],
1052      "category": "deep_learning",
1053      "complexity": {"time": "O(n * d * h * epochs)", "space": "O(d * h + n_sv * h)"},
1054      "strengths": [
1055        "Jointly optimizes autoencoder and one-class SVM",
1056        "Combines representation learning with boundary detection",
1057        "End-to-end training"
1058      ],
1059      "weaknesses": [
1060        "Complex joint optimization",
1061        "Requires tuning of both AE and SVM hyperparameters",
1062        "Training can be unstable"
1063      ],
1064      "best_for": "Datasets benefiting from joint representation learning and one-class classification",
1065      "avoid_when": "Simpler pipeline of separate AE + SVM works well, or dataset is small",
1066      "benchmark_refs": [],
1067      "benchmark_rank": {},
1068      "paper": {"id": "ae1svm", "short": "Nguyen and Vien, ECML-PKDD 2019"},
1069      "default_params": {"contamination": 0.1},
1070      "preprocessing_mode": "external",
1071      "requires": ["torch"],
1072      "version_added": "0.9.0"
1073    },
1074    "DevNet": {
1075      "class_path": "pyod.models.devnet.DevNet",
1076      "full_name": "Deep Anomaly Detection with Deviation Networks",
1077      "status": "shipped",
1078      "data_types": ["tabular"],
1079      "category": "deep_learning",
1080      "complexity": {"time": "O(n * d * h * epochs)", "space": "O(d * h)"},
1081      "strengths": [
1082        "End-to-end deep anomaly scoring network",
1083        "Can leverage a few labeled anomalies (semi-supervised)",
1084        "Deviation loss directly optimizes anomaly scores"
1085      ],
1086      "weaknesses": [
1087        "Requires at least a few labeled anomalies for best results",
1088        "Deep network training overhead",
1089        "Sensitive to architecture and loss parameters"
1090      ],
1091      "best_for": "Semi-supervised anomaly detection where a small number of labeled anomalies are available",
1092      "avoid_when": "No labeled anomalies are available or dataset is too small for deep learning",
1093      "benchmark_refs": [],
1094      "benchmark_rank": {},
1095      "paper": {"id": "devnet", "short": "Pang et al., KDD 2019"},
1096      "default_params": {"contamination": 0.1},
1097      "preprocessing_mode": "external",
1098      "requires": ["torch"],
1099      "version_added": "0.9.5"
1100    },
1101    "RGraph": {
1102      "class_path": "pyod.models.rgraph.RGraph",
1103      "full_name": "R-Graph Outlier Detection",
1104      "status": "shipped",
1105      "data_types": ["tabular"],
1106      "category": "graph",
1107      "complexity": {"time": "O(n^2 * d + transition_steps * n^2)", "space": "O(n^2)"},
1108      "strengths": [
1109        "Graph-based approach captures relational structure",
1110        "Random walk on graph reveals connectivity anomalies",
1111        "Tunable transition steps for multi-scale detection"
1112      ],
1113      "weaknesses": [
1114        "Quadratic memory for adjacency/transition matrix",
1115        "Slow for large datasets",
1116        "Requires tuning of graph construction parameters"
1117      ],
1118      "best_for": "Datasets where graph connectivity and neighborhood structure reveal anomalies",
1119      "avoid_when": "Dataset is very large or a simpler proximity method suffices",
1120      "benchmark_refs": [],
1121      "benchmark_rank": {},
1122      "paper": {"id": "rgraph", "short": "You et al., AAAI 2017"},
1123      "default_params": {"contamination": 0.1, "transition_steps": 10, "n_nonzero": 10, "gamma": 50.0},
1124      "preprocessing_mode": "external",
1125      "requires": [],
1126      "version_added": "0.8.5"
1127    },
1128    "LUNAR": {
1129      "class_path": "pyod.models.lunar.LUNAR",
1130      "full_name": "Learnable Unified Neighborhood-based Anomaly Ranking",
1131      "status": "shipped",
1132      "data_types": ["tabular"],
1133      "category": "graph",
1134      "complexity": {"time": "O(n * k * d + n * h * epochs)", "space": "O(n * k + d * h)"},
1135      "strengths": [
1136        "Learns to score anomalies from neighbor graphs via GNN",
1137        "Combines neighborhood structure with learned representations",
1138        "Supports both score and weight model types"
1139      ],
1140      "weaknesses": [
1141        "Requires PyTorch for GNN training",
1142        "More complex setup than classical methods",
1143        "Training overhead from neural network"
1144      ],
1145      "best_for": "Datasets where learned neighborhood-based scoring outperforms handcrafted rules",
1146      "avoid_when": "PyTorch is not available or simpler KNN/LOF methods work well",
1147      "benchmark_refs": [],
1148      "benchmark_rank": {},
1149      "paper": {"id": "lunar", "short": "Goodge et al., AAAI 2022"},
1150      "default_params": {"contamination": 0.1, "model_type": "WEIGHT", "n_neighbours": 5},
1151      "preprocessing_mode": "external",
1152      "requires": ["torch"],
1153      "version_added": "0.9.5"
1154    },
1155    "EmbeddingOD": {
1156      "class_path": "pyod.models.embedding.EmbeddingOD",
1157      "full_name": "Embedding-Based Outlier Detection",
1158      "status": "shipped",
1159      "data_types": ["text", "image"],
1160      "category": "embedding",
1161      "complexity": {"time": "O(n * embedding_cost + detector_cost)", "space": "O(n * embedding_dim)"},
1162      "strengths": [
1163        "Leverages foundation model embeddings for anomaly detection",
1164        "Supports text and image data natively",
1165        "Flexible choice of downstream detector"
1166      ],
1167      "weaknesses": [
1168        "Requires a pre-trained encoder model",
1169        "Embedding quality depends on the foundation model",
1170        "Higher memory usage for large embedding dimensions"
1171      ],
1172      "best_for": "Anomaly detection on unstructured data (text, images) via foundation model representations",
1173      "avoid_when": "Data is already tabular or a suitable encoder is not available",
1174      "benchmark_refs": [],
1175      "benchmark_rank": {},
1176      "paper": {"id": "embedding_od", "short": "Zhao et al., 2025"},
1177      "default_params": {"contamination": 0.1, "detector": "LUNAR"},
1178      "preprocessing_mode": "internal",
1179      "requires": ["torch"],
1180      "version_added": "2.0.5"
1181    },
1182    "MultiModalOD": {
1183      "class_path": "pyod.models.embedding.MultiModalOD",
1184      "full_name": "Multi-Modal Outlier Detection",
1185      "status": "shipped",
1186      "data_types": ["text", "image", "multimodal"],
1187      "category": "embedding",
1188      "complexity": {"time": "O(n * n_modalities * embedding_cost + detector_cost)", "space": "O(n * n_modalities * embedding_dim)"},
1189      "strengths": [
1190        "Combines multiple data modalities for anomaly detection",
1191        "Supports flexible modality combination strategies",
1192        "Leverages foundation model embeddings per modality"
1193      ],
1194      "weaknesses": [
1195        "Requires encoder for each modality",
1196        "Higher computational cost with multiple modalities",
1197        "Combination strategy choice affects performance"
1198      ],
1199      "best_for": "Anomaly detection on multi-modal data combining text, image, or other modalities",
1200      "avoid_when": "Only a single modality is available or data is purely tabular",
1201      "benchmark_refs": [],
1202      "benchmark_rank": {},
1203      "paper": {"id": "multimodal_od", "short": "Zhao et al., 2025"},
1204      "default_params": {"contamination": 0.1, "combination": "average"},
1205      "preprocessing_mode": "internal",
1206      "requires": ["torch"],
1207      "version_added": "2.1.0"
1208    },
1209    "LLMAD": {
1210      "class_path": "pyod.models.llmad.LLMAD",
1211      "full_name": "LLM-Based Anomaly Detection",
1212      "status": "planned",
1213      "data_types": ["tabular", "text"],
1214      "category": "embedding",
1215      "complexity": {"time": "varies", "space": "varies"},
1216      "strengths": [
1217        "Zero-shot anomaly detection via LLM reasoning",
1218        "No training data required",
1219        "Handles diverse data types through natural language understanding"
1220      ],
1221      "weaknesses": [
1222        "Not yet implemented",
1223        "Requires LLM API access with associated costs",
1224        "Inference latency from LLM calls"
1225      ],
1226      "best_for": "Zero-shot or few-shot anomaly detection leveraging LLM world knowledge",
1227      "avoid_when": "Feature is needed before release or LLM API costs are prohibitive",
1228      "benchmark_refs": [],
1229      "benchmark_rank": {},
1230      "paper": {"id": "llmad", "short": "TBD"},
1231      "default_params": {"contamination": 0.1},
1232      "preprocessing_mode": "internal",
1233      "requires": [],
1234      "version_added": "TBD"
1235    },
1236    "TimeSeriesOD": {
1237      "class_path": "pyod.models.ts_od.TimeSeriesOD",
1238      "full_name": "Time Series Outlier Detection",
1239      "status": "shipped",
1240      "data_types": ["time_series"],
1241      "category": "time_series",
1242      "complexity": {"time": "varies by detector", "space": "varies by detector"},
1243      "strengths": [
1244        "Unified interface for time-series anomaly detection",
1245        "Bridges PyOD detectors with sliding-window preprocessing",
1246        "Supports any PyOD base detector"
1247      ],
1248      "weaknesses": [
1249        "Inherits limitations of chosen base detector",
1250        "Window size must be tuned per dataset"
1251      ],
1252      "best_for": "General-purpose time series anomaly detection with any PyOD detector",
1253      "avoid_when": "Specialized temporal methods (LSTM, Transformer) are more appropriate",
1254      "benchmark_refs": ["TSB_AD"],
1255      "benchmark_rank": {"TSB_AD_overall_iforest": 16, "TSB_AD_point_iforest": 8},
1256      "paper": {"id": "tsod", "short": "Zhao et al., 2024"},
1257      "default_params": {"detector": "IForest", "window_size": 50, "contamination": 0.1},
1258      "preprocessing_mode": "external",
1259      "requires": [],
1260      "version_added": "2.2.0"
1261    },
1262    "MatrixProfile": {
1263      "class_path": "pyod.models.ts_matrix_profile.MatrixProfile",
1264      "full_name": "Matrix Profile (STOMP)",
1265      "status": "shipped",
1266      "data_types": ["time_series"],
1267      "category": "time_series",
1268      "complexity": {"time": "O(n^2)", "space": "O(n)"},
1269      "strengths": ["No parameters beyond window size", "Exact nearest-neighbor distances", "Well-studied theoretically"],
1270      "weaknesses": ["Transductive only (no out-of-sample prediction)", "O(n^2) may be slow on long series", "Single-threaded in v1"],
1271      "best_for": "Subsequence anomaly detection where exact distances matter",
1272      "avoid_when": "Out-of-sample prediction is needed",
1273      "benchmark_refs": ["TSB_AD"],
1274      "benchmark_rank": {"TSB_AD_overall": 10, "TSB_AD_short": 4},
1275      "paper": {"id": "yeh2016matrix", "short": "Yeh et al., ICDM 2016"},
1276      "default_params": {"window_size": 50, "contamination": 0.1},
1277      "preprocessing_mode": "external",
1278      "requires": [],
1279      "version_added": "2.2.0"
1280    },
1281    "SpectralResidual": {
1282      "class_path": "pyod.models.ts_spectral_residual.SpectralResidual",
1283      "full_name": "Spectral Residual Anomaly Detection",
1284      "status": "shipped",
1285      "data_types": ["time_series"],
1286      "category": "time_series",
1287      "complexity": {"time": "O(n log n)", "space": "O(n)"},
1288      "strengths": ["Very fast (FFT-based)", "No training needed", "Works well on periodic data", "#3 for point anomalies in TSB-AD"],
1289      "weaknesses": ["Assumes frequency-domain structure", "Less effective on non-periodic data"],
1290      "best_for": "Fast detection of point/spike anomalies in periodic or seasonal time series",
1291      "avoid_when": "Data has no frequency structure",
1292      "benchmark_refs": ["TSB_AD"],
1293      "benchmark_rank": {"TSB_AD_overall": 14, "TSB_AD_point": 3, "TSB_AD_short": 8},
1294      "paper": {"id": "ren2019time", "short": "Ren et al., KDD 2019"},
1295      "default_params": {"score_window": 3, "contamination": 0.1},
1296      "preprocessing_mode": "external",
1297      "requires": [],
1298      "version_added": "2.2.0"
1299    },
1300    "KShape": {
1301      "class_path": "pyod.models.ts_kshape.KShape",
1302      "full_name": "k-Shape Clustering Anomaly Detection",
1303      "status": "shipped",
1304      "data_types": ["time_series"],
1305      "category": "time_series",
1306      "complexity": {"time": "O(n * k * max_iter)", "space": "O(n * m)"},
1307      "strengths": ["Shape-aware clustering", "Handles shifted patterns", "Top-2 overall in TSB-AD benchmark"],
1308      "weaknesses": ["Sensitive to n_clusters choice", "Degrades on long time series"],
1309      "best_for": "Detecting shape-based anomalies in short-to-medium time series subsequences",
1310      "avoid_when": "Time series is very long (performance degrades)",
1311      "benchmark_refs": ["TSB_AD"],
1312      "benchmark_rank": {"TSB_AD_overall": 2, "TSB_AD_short": 2, "TSB_AD_point": 6, "TSB_AD_long": 9},
1313      "paper": {"id": "paparrizos2015kshape", "short": "Paparrizos & Gravano, SIGMOD 2015"},
1314      "default_params": {"n_clusters": 3, "window_size": 50, "contamination": 0.1},
1315      "preprocessing_mode": "external",
1316      "requires": [],
1317      "version_added": "2.2.0"
1318    },
1319    "SAND": {
1320      "class_path": "pyod.models.ts_sand.SAND",
1321      "full_name": "Streaming Anomaly Detection",
1322      "status": "experimental",
1323      "data_types": ["time_series"],
1324      "category": "time_series",
1325      "complexity": {"time": "O(n * k)", "space": "O(k * m)"},
1326      "strengths": ["Handles concept drift", "Streaming-compatible", "Adapts to changing patterns", "#3 on short time series in TSB-AD"],
1327      "weaknesses": ["Experimental implementation", "Simplified from original paper"],
1328      "best_for": "Non-stationary time series with evolving normal patterns, especially short series",
1329      "avoid_when": "Production reliability is critical",
1330      "benchmark_refs": ["TSB_AD"],
1331      "benchmark_rank": {"TSB_AD_overall": 11, "TSB_AD_short": 3},
1332      "paper": {"id": "boniol2021sand", "short": "Boniol et al., VLDB 2021"},
1333      "default_params": {"n_clusters": 5, "window_size": 50, "contamination": 0.1},
1334      "preprocessing_mode": "external",
1335      "requires": [],
1336      "version_added": "2.2.0"
1337    },
1338    "LSTMAD": {
1339      "class_path": "pyod.models.ts_lstm.LSTMAD",
1340      "full_name": "LSTM-based Anomaly Detection",
1341      "status": "shipped",
1342      "data_types": ["time_series"],
1343      "category": "time_series",
1344      "complexity": {"time": "O(n * epochs)", "space": "O(model_params)"},
1345      "strengths": ["Captures temporal dependencies", "Native multivariate support", "Mahalanobis distance scoring"],
1346      "weaknesses": ["Requires PyTorch", "Slower training than classical methods", "Needs sufficient data"],
1347      "best_for": "Multivariate and long time series with complex temporal patterns",
1348      "avoid_when": "Fast inference is critical or data is very short",
1349      "benchmark_refs": ["TSB_AD"],
1350      "benchmark_rank": {"TSB_AD_overall": 13, "TSB_AD_multivariate": 4, "TSB_AD_long": 8},
1351      "paper": {"id": "malhotra2015long", "short": "Malhotra et al., ESANN 2015"},
1352      "default_params": {"window_size": 50, "epochs": 50, "contamination": 0.1},
1353      "preprocessing_mode": "external",
1354      "requires": ["torch"],
1355      "version_added": "2.2.0"
1356    },
1357    "AnomalyTransformer": {
1358      "class_path": "pyod.models.ts_anomaly_transformer.AnomalyTransformer",
1359      "full_name": "Anomaly Transformer",
1360      "status": "experimental",
1361      "data_types": ["time_series"],
1362      "category": "time_series",
1363      "complexity": {"time": "O(n * L * d^2)", "space": "O(model_params)"},
1364      "strengths": ["Association discrepancy is theoretically motivated", "Native multivariate"],
1365      "weaknesses": ["Requires PyTorch", "Complex architecture", "High memory usage", "Last place in TSB-AD benchmark (32/32)"],
1366      "best_for": "Research use only; simpler methods outperform on standard benchmarks",
1367      "avoid_when": "Accuracy matters -- underperforms simpler methods like MatrixProfile and IForest on all TSB-AD scenarios",
1368      "benchmark_refs": ["TSB_AD"],
1369      "benchmark_rank": {"TSB_AD_overall": 32},
1370      "paper": {"id": "xu2022anomaly", "short": "Xu et al., ICLR 2022"},
1371      "default_params": {"window_size": 100, "d_model": 512, "epochs": 10, "contamination": 0.1},
1372      "preprocessing_mode": "external",
1373      "requires": ["torch"],
1374      "version_added": "2.2.0"
1375    },
1376    "DOMINANT": {
1377      "class_path": "pyod.models.pyg_dominant.DOMINANT",
1378      "full_name": "Deep Anomaly Detection on Attributed Networks",
1379      "status": "shipped",
1380      "data_types": ["graph"],
1381      "category": "graph",
1382      "complexity": {"time": "O(epochs * (n * d_h + n^2))", "space": "O(n^2)"},
1383      "strengths": ["Joint structure+attribute reconstruction", "Strong BOND benchmark performance", "Standard GCN architecture"],
1384      "weaknesses": ["O(n^2) memory for adjacency reconstruction", "Requires node features"],
1385      "best_for": "Attributed graphs where anomalies manifest in both structure and attributes",
1386      "avoid_when": "Graph is very large (>10k nodes) or has no node features",
1387      "benchmark_refs": ["BOND"],
1388      "benchmark_rank": {"BOND_deep": 1},
1389      "paper": {"id": "ding2019dominant", "short": "Ding et al., SDM 2019"},
1390      "default_params": {"hidden_dim": 64, "num_layers": 2, "epochs": 100, "contamination": 0.1},
1391      "preprocessing_mode": "external",
1392      "requires": ["torch_geometric"],
1393      "version_added": "2.2.0"
1394    },
1395    "CoLA": {
1396      "class_path": "pyod.models.pyg_cola.CoLA",
1397      "full_name": "Contrastive Self-Supervised Anomaly Detection",
1398      "status": "shipped",
1399      "data_types": ["graph"],
1400      "category": "graph",
1401      "complexity": {"time": "O(epochs * n * d_h)", "space": "O(n * d_h + m)"},
1402      "strengths": ["Contrastive learning captures local-context discrepancy", "Strong BOND performance", "Sparse neighbor aggregation"],
1403      "weaknesses": ["Sensitive to graph connectivity", "Requires node features"],
1404      "best_for": "Attributed graphs where anomalies have unusual local neighborhoods",
1405      "avoid_when": "Graph is disconnected or has no node features",
1406      "benchmark_refs": ["BOND"],
1407      "benchmark_rank": {"BOND_deep": 2},
1408      "paper": {"id": "liu2022cola", "short": "Liu et al., TNNLS 2022"},
1409      "default_params": {"hidden_dim": 64, "num_layers": 2, "epochs": 100, "contamination": 0.1},
1410      "preprocessing_mode": "external",
1411      "requires": ["torch_geometric"],
1412      "version_added": "2.2.0"
1413    },
1414    "CONAD": {
1415      "class_path": "pyod.models.pyg_conad.CONAD",
1416      "full_name": "Contrastive Attributed Network Anomaly Detection",
1417      "status": "shipped",
1418      "data_types": ["graph"],
1419      "category": "graph",
1420      "complexity": {"time": "O(epochs * n * d_h)", "space": "O(n * d_h)"},
1421      "strengths": ["Data augmentation improves robustness", "Dual objective (contrastive + reconstruction)"],
1422      "weaknesses": ["Augmentation ratio is a sensitive hyperparameter", "Requires node features"],
1423      "best_for": "Attributed graphs where robustness to noise is important",
1424      "avoid_when": "Graph structure is too sparse for meaningful augmentation",
1425      "benchmark_refs": ["BOND"],
1426      "benchmark_rank": {},
1427      "paper": {"id": "xu2022conad", "short": "Xu et al., PAKDD 2022"},
1428      "default_params": {"hidden_dim": 64, "epochs": 100, "aug_ratio": 0.2, "contamination": 0.1},
1429      "preprocessing_mode": "external",
1430      "requires": ["torch_geometric"],
1431      "version_added": "2.2.0"
1432    },
1433    "AnomalyDAE": {
1434      "class_path": "pyod.models.pyg_anomalydae.AnomalyDAE",
1435      "full_name": "Dual Autoencoder for Anomaly Detection",
1436      "status": "shipped",
1437      "data_types": ["graph"],
1438      "category": "graph",
1439      "complexity": {"time": "O(epochs * (n * d_h + n^2))", "space": "O(n^2)"},
1440      "strengths": ["Attention-based structure encoding (GAT)", "Separate structure and attribute autoencoders"],
1441      "weaknesses": ["O(n^2) memory for adjacency reconstruction", "Requires node features"],
1442      "best_for": "Attributed graphs where attention over neighbors reveals anomaly patterns",
1443      "avoid_when": "Graph is very large or structure is unimportant",
1444      "benchmark_refs": ["BOND"],
1445      "benchmark_rank": {},
1446      "paper": {"id": "fan2020anomalydae", "short": "Fan et al., ICASSP 2020"},
1447      "default_params": {"embed_dim": 64, "num_heads": 4, "epochs": 100, "contamination": 0.1},
1448      "preprocessing_mode": "external",
1449      "requires": ["torch_geometric"],
1450      "version_added": "2.2.0"
1451    },
1452    "GUIDE": {
1453      "class_path": "pyod.models.pyg_guide.GUIDE",
1454      "full_name": "Higher-order Structure Based Anomaly Detection",
1455      "status": "shipped",
1456      "data_types": ["graph"],
1457      "category": "graph",
1458      "complexity": {"time": "O(epochs * n * d_h + m * d_avg)", "space": "O(n^2)"},
1459      "strengths": ["Exploits higher-order motifs (triangles)", "Dual-view captures different structural signals"],
1460      "weaknesses": ["Motif construction adds overhead", "Sparse graphs may have few triangles"],
1461      "best_for": "Dense attributed graphs with meaningful higher-order structures",
1462      "avoid_when": "Graph is tree-like (no triangles) or very large",
1463      "benchmark_refs": ["BOND"],
1464      "benchmark_rank": {},
1465      "paper": {"id": "yuan2021guide", "short": "Yuan et al., BigData 2021"},
1466      "default_params": {"hidden_dim": 64, "epochs": 100, "contamination": 0.1},
1467      "preprocessing_mode": "external",
1468      "requires": ["torch_geometric"],
1469      "version_added": "2.2.0"
1470    },
1471    "Radar": {
1472      "class_path": "pyod.models.pyg_radar.Radar",
1473      "full_name": "Residual Analysis for Anomaly Detection",
1474      "status": "shipped",
1475      "data_types": ["graph"],
1476      "category": "graph",
1477      "complexity": {"time": "O(max_iter * n^2 * d)", "space": "O(n^2)"},
1478      "strengths": ["No neural network training", "Interpretable residuals", "Lightweight baseline"],
1479      "weaknesses": ["O(n^2) dense matrix operations", "Linear model may miss complex patterns"],
1480      "best_for": "Small-to-medium attributed graphs as a fast baseline",
1481      "avoid_when": "Graph is very large or anomalies are structural-only",
1482      "benchmark_refs": ["BOND"],
1483      "benchmark_rank": {},
1484      "paper": {"id": "li2017radar", "short": "Li et al., IJCAI 2017"},
1485      "default_params": {"alpha": 1.0, "gamma": 0.01, "max_iter": 100, "contamination": 0.1},
1486      "preprocessing_mode": "external",
1487      "requires": ["torch_geometric"],
1488      "version_added": "2.2.0"
1489    },
1490    "ANOMALOUS": {
1491      "class_path": "pyod.models.pyg_anomalous.ANOMALOUS",
1492      "full_name": "Joint Modeling Approach for Anomaly Detection",
1493      "status": "shipped",
1494      "data_types": ["graph"],
1495      "category": "graph",
1496      "complexity": {"time": "O(max_iter * n^2 * d)", "space": "O(n^2)"},
1497      "strengths": ["Laplacian regularization for smooth predictions", "No neural network training", "Extends Radar with graph structure"],
1498      "weaknesses": ["O(n^2) dense matrix operations", "Linear model"],
1499      "best_for": "Small-to-medium attributed graphs where smoothness matters",
1500      "avoid_when": "Graph is very large or anomalies are purely structural",
1501      "benchmark_refs": ["BOND"],
1502      "benchmark_rank": {},
1503      "paper": {"id": "peng2018anomalous", "short": "Peng et al., IJCAI 2018"},
1504      "default_params": {"alpha": 1.0, "gamma": 1.0, "lambda_r": 0.01, "max_iter": 100, "contamination": 0.1},
1505      "preprocessing_mode": "external",
1506      "requires": ["torch_geometric"],
1507      "version_added": "2.2.0"
1508    },
1509    "SCAN_Graph": {
1510      "class_path": "pyod.models.pyg_scan.SCAN",
1511      "full_name": "Structural Clustering Algorithm for Networks",
1512      "status": "shipped",
1513      "data_types": ["graph"],
1514      "category": "graph",
1515      "complexity": {"time": "O(m * d_avg)", "space": "O(n + m)"},
1516      "strengths": ["Structure-only (no features needed)", "No training or hyperparameter tuning", "Fast and lightweight"],
1517      "weaknesses": ["Ignores node attributes", "Only detects structural anomalies"],
1518      "best_for": "Structure-only graphs or as a fast structural baseline",
1519      "avoid_when": "Node attributes are available and important for anomaly detection",
1520      "benchmark_refs": [],
1521      "benchmark_rank": {},
1522      "paper": {"id": "xu2007scan", "short": "Xu et al., KDD 2007"},
1523      "default_params": {"epsilon": 0.5, "mu": 2, "contamination": 0.1},
1524      "preprocessing_mode": "external",
1525      "requires": ["torch_geometric"],
1526      "version_added": "2.2.0"
1527    }
1528  }