algorithms.json
1 { 2 "ECOD": { 3 "class_path": "pyod.models.ecod.ECOD", 4 "full_name": "Empirical Cumulative Distribution Functions", 5 "status": "shipped", 6 "data_types": ["tabular"], 7 "category": "probabilistic", 8 "complexity": {"time": "O(n * d * log(n))", "space": "O(n * d)"}, 9 "strengths": [ 10 "Parameter-free and highly interpretable", 11 "Fast computation with parallelization support", 12 "Strong benchmark performance across diverse datasets", 13 "No assumption on data distribution" 14 ], 15 "weaknesses": [ 16 "May struggle with strongly correlated features", 17 "Assumes outliers deviate in at least one marginal dimension" 18 ], 19 "best_for": "General-purpose outlier detection when speed and interpretability are priorities", 20 "avoid_when": "Features are heavily correlated and outliers only manifest in joint distributions", 21 "benchmark_refs": ["ADBench"], 22 "benchmark_rank": {"ADBench_overall": 5}, 23 "paper": {"id": "ecod", "short": "Li et al., TKDE 2022"}, 24 "default_params": {"contamination": 0.1, "n_jobs": 1}, 25 "preprocessing_mode": "external", 26 "requires": [], 27 "version_added": "0.9.0" 28 }, 29 "ABOD": { 30 "class_path": "pyod.models.abod.ABOD", 31 "full_name": "Angle-Based Outlier Detection", 32 "status": "shipped", 33 "data_types": ["tabular"], 34 "category": "probabilistic", 35 "complexity": {"time": "O(n^2 * d) for method='fast', O(n^3 * d) for method='default'", "space": "O(n^2)"}, 36 "strengths": [ 37 "Effective in high-dimensional spaces", 38 "Not affected by distance concentration in high dimensions", 39 "Good theoretical foundation" 40 ], 41 "weaknesses": [ 42 "Computationally expensive, especially with method='default' (original ABOD using all training points)", 43 "Sensitive to noise in low-dimensional data", 44 "Requires sufficient neighbors for angle variance estimation" 45 ], 46 "best_for": "High-dimensional datasets where distance-based methods suffer from curse of dimensionality", 47 "avoid_when": "Dataset is very large or low-dimensional where distance-based methods work well", 48 "benchmark_refs": 
["ADBench"], 49 "benchmark_rank": {}, 50 "paper": {"id": "abod", "short": "Kriegel et al., KDD 2008"}, 51 "default_params": {"contamination": 0.1, "n_neighbors": 5, "method": "fast"}, 52 "preprocessing_mode": "external", 53 "requires": [], 54 "version_added": "0.5.0" 55 }, 56 "COPOD": { 57 "class_path": "pyod.models.copod.COPOD", 58 "full_name": "Copula-Based Outlier Detection", 59 "status": "shipped", 60 "data_types": ["tabular"], 61 "category": "probabilistic", 62 "complexity": {"time": "O(n * d * log(n))", "space": "O(n * d)"}, 63 "strengths": [ 64 "Parameter-free and highly interpretable", 65 "Fast computation with parallelization support", 66 "Models tail probabilities via empirical copulas" 67 ], 68 "weaknesses": [ 69 "Assumes feature-wise independence for tail modeling", 70 "May miss complex multivariate interactions" 71 ], 72 "best_for": "Large-scale datasets where speed and interpretability matter and features are roughly independent", 73 "avoid_when": "Outliers only appear in joint distributions with strong feature dependencies", 74 "benchmark_refs": ["ADBench"], 75 "benchmark_rank": {"ADBench_overall": 6}, 76 "paper": {"id": "copod", "short": "Li et al., ICDM 2020"}, 77 "default_params": {"contamination": 0.1, "n_jobs": 1}, 78 "preprocessing_mode": "external", 79 "requires": [], 80 "version_added": "0.7.0" 81 }, 82 "MAD": { 83 "class_path": "pyod.models.mad.MAD", 84 "full_name": "Median Absolute Deviation", 85 "status": "shipped", 86 "data_types": ["tabular"], 87 "category": "probabilistic", 88 "complexity": {"time": "O(n * log(n))", "space": "O(n)"}, 89 "strengths": [ 90 "Extremely simple and fast", 91 "Robust to outliers in the training data", 92 "Well-suited for univariate data" 93 ], 94 "weaknesses": [ 95 "Only works on univariate data", 96 "Cannot capture multivariate relationships" 97 ], 98 "best_for": "Univariate outlier detection with a robust central tendency measure", 99 "avoid_when": "Data is multivariate or relationships between features are 
important", 100 "benchmark_refs": [], 101 "benchmark_rank": {}, 102 "paper": {"id": "mad", "short": "Iglewicz and Hoaglin, 1993"}, 103 "default_params": {"threshold": 3.5, "contamination": 0.1}, 104 "preprocessing_mode": "external", 105 "requires": [], 106 "version_added": "0.7.5" 107 }, 108 "SOS": { 109 "class_path": "pyod.models.sos.SOS", 110 "full_name": "Stochastic Outlier Selection", 111 "status": "shipped", 112 "data_types": ["tabular"], 113 "category": "probabilistic", 114 "complexity": {"time": "O(n^2 * d)", "space": "O(n^2)"}, 115 "strengths": [ 116 "Based on affinity and binding probability concepts", 117 "Produces meaningful probability-like outlier scores", 118 "Perplexity parameter controls local vs global sensitivity" 119 ], 120 "weaknesses": [ 121 "Quadratic complexity limits scalability", 122 "Sensitive to perplexity parameter choice" 123 ], 124 "best_for": "Medium-sized datasets where probability-calibrated outlier scores are valuable", 125 "avoid_when": "Dataset is very large (>10K samples) due to quadratic complexity", 126 "benchmark_refs": [], 127 "benchmark_rank": {}, 128 "paper": {"id": "sos", "short": "Janssens et al., 2012"}, 129 "default_params": {"contamination": 0.1, "perplexity": 4.5, "metric": "euclidean"}, 130 "preprocessing_mode": "external", 131 "requires": [], 132 "version_added": "0.6.0" 133 }, 134 "QMCD": { 135 "class_path": "pyod.models.qmcd.QMCD", 136 "full_name": "Quasi-Monte Carlo Discrepancy", 137 "status": "shipped", 138 "data_types": ["tabular"], 139 "category": "probabilistic", 140 "complexity": {"time": "O(n^2 * d)", "space": "O(n * d)"}, 141 "strengths": [ 142 "Based on uniformity criterion from quasi-Monte Carlo theory", 143 "Parameter-free (only contamination needed)", 144 "Theoretically grounded in discrepancy measures" 145 ], 146 "weaknesses": [ 147 "Quadratic time complexity", 148 "Assumes outliers deviate from uniform space-filling" 149 ], 150 "best_for": "Detecting anomalies as deviations from uniform 
space-filling in moderate-dimensional data", 151 "avoid_when": "Dataset is very large or high-dimensional", 152 "benchmark_refs": [], 153 "benchmark_rank": {}, 154 "paper": {"id": "qmcd", "short": "Fang et al., 2001"}, 155 "default_params": {"contamination": 0.1}, 156 "preprocessing_mode": "external", 157 "requires": [], 158 "version_added": "0.9.0" 159 }, 160 "KDE": { 161 "class_path": "pyod.models.kde.KDE", 162 "full_name": "Kernel Density Estimation", 163 "status": "shipped", 164 "data_types": ["tabular"], 165 "category": "probabilistic", 166 "complexity": {"time": "O(n^2 * d)", "space": "O(n * d)"}, 167 "strengths": [ 168 "Non-parametric density estimation", 169 "Multiple kernel choices available", 170 "Scores directly reflect density" 171 ], 172 "weaknesses": [ 173 "Quadratic prediction time", 174 "Sensitive to bandwidth and kernel selection", 175 "Suffers from curse of dimensionality" 176 ], 177 "best_for": "Low-to-moderate dimensional data where non-parametric density estimation is desired", 178 "avoid_when": "Data is high-dimensional or dataset is very large", 179 "benchmark_refs": [], 180 "benchmark_rank": {}, 181 "paper": {"id": "kde", "short": "Latecki et al., MLDM 2007"}, 182 "default_params": {"contamination": 0.1}, 183 "preprocessing_mode": "external", 184 "requires": [], 185 "version_added": "0.6.5" 186 }, 187 "Sampling": { 188 "class_path": "pyod.models.sampling.Sampling", 189 "full_name": "Rapid Distance-Based Outlier Detection via Sampling", 190 "status": "shipped", 191 "data_types": ["tabular"], 192 "category": "probabilistic", 193 "complexity": {"time": "O(n * s * d) where s is subset size", "space": "O(n * d)"}, 194 "strengths": [ 195 "Linear time complexity through sampling", 196 "Scalable to large datasets", 197 "Simple and effective approach" 198 ], 199 "weaknesses": [ 200 "Randomness from sampling can affect reproducibility", 201 "May miss local outliers with small subset sizes" 202 ], 203 "best_for": "Large-scale datasets requiring fast 
distance-based outlier detection", 204 "avoid_when": "Precise and deterministic results are required or dataset is small enough for exact methods", 205 "benchmark_refs": [], 206 "benchmark_rank": {}, 207 "paper": {"id": "sampling", "short": "Sugiyama and Borgwardt, 2013"}, 208 "default_params": {"contamination": 0.1}, 209 "preprocessing_mode": "external", 210 "requires": [], 211 "version_added": "0.7.5" 212 }, 213 "GMM": { 214 "class_path": "pyod.models.gmm.GMM", 215 "full_name": "Gaussian Mixture Model", 216 "status": "shipped", 217 "data_types": ["tabular"], 218 "category": "probabilistic", 219 "complexity": {"time": "O(n * k * d^2) per EM iteration", "space": "O(k * d^2)"}, 220 "strengths": [ 221 "Models complex multi-modal distributions", 222 "Soft clustering provides probabilistic scores", 223 "Flexible covariance structures" 224 ], 225 "weaknesses": [ 226 "Sensitive to initialization and number of components", 227 "Assumes Gaussian component distributions", 228 "EM convergence not guaranteed to global optimum" 229 ], 230 "best_for": "Data with multi-modal distributions that can be approximated by Gaussian mixtures", 231 "avoid_when": "Data does not follow mixture-of-Gaussians assumption or is very high-dimensional", 232 "benchmark_refs": [], 233 "benchmark_rank": {}, 234 "paper": {"id": "gmm", "short": "Aggarwal, 2017"}, 235 "default_params": {"contamination": 0.1}, 236 "preprocessing_mode": "external", 237 "requires": [], 238 "version_added": "0.9.8" 239 }, 240 "PCA": { 241 "class_path": "pyod.models.pca.PCA", 242 "full_name": "Principal Component Analysis", 243 "status": "shipped", 244 "data_types": ["tabular"], 245 "category": "linear", 246 "complexity": {"time": "O(n * d^2)", "space": "O(d^2)"}, 247 "strengths": [ 248 "Fast training and prediction", 249 "Well-understood linear model", 250 "Effective when anomalies lie in minor principal components" 251 ], 252 "weaknesses": [ 253 "Assumes linear relationships", 254 "Sensitive to the number of components 
selected", 255 "Cannot capture nonlinear anomalies" 256 ], 257 "best_for": "Datasets with linear structure where outliers deviate from main variance directions", 258 "avoid_when": "Data has strong nonlinear structure or outliers align with principal components", 259 "benchmark_refs": [], 260 "benchmark_rank": {}, 261 "paper": {"id": "pca", "short": "Shyu et al., 2003"}, 262 "default_params": {"contamination": 0.1}, 263 "preprocessing_mode": "external", 264 "requires": [], 265 "version_added": "0.5.0" 266 }, 267 "KPCA": { 268 "class_path": "pyod.models.kpca.KPCA", 269 "full_name": "Kernel Principal Component Analysis", 270 "status": "shipped", 271 "data_types": ["tabular"], 272 "category": "linear", 273 "complexity": {"time": "O(n^2 * d)", "space": "O(n^2)"}, 274 "strengths": [ 275 "Captures nonlinear relationships via kernel trick", 276 "Extension of PCA to nonlinear manifolds", 277 "Flexible kernel choices" 278 ], 279 "weaknesses": [ 280 "Quadratic memory and time complexity", 281 "Kernel and hyperparameter selection required", 282 "Slower than linear PCA" 283 ], 284 "best_for": "Moderately sized datasets with nonlinear structure", 285 "avoid_when": "Dataset is very large due to quadratic kernel matrix or a linear model suffices", 286 "benchmark_refs": [], 287 "benchmark_rank": {}, 288 "paper": {"id": "kpca", "short": "Hoffmann, 2007"}, 289 "default_params": {"contamination": 0.1}, 290 "preprocessing_mode": "external", 291 "requires": [], 292 "version_added": "0.8.0" 293 }, 294 "MCD": { 295 "class_path": "pyod.models.mcd.MCD", 296 "full_name": "Minimum Covariance Determinant", 297 "status": "shipped", 298 "data_types": ["tabular"], 299 "category": "linear", 300 "complexity": {"time": "O(n * d^2)", "space": "O(d^2)"}, 301 "strengths": [ 302 "Robust estimation of covariance matrix", 303 "Well-established statistical method", 304 "Resistant to masking effects from outliers" 305 ], 306 "weaknesses": [ 307 "Assumes data follows a Gaussian distribution", 308 "May fail 
in high-dimensional settings (d > n)", 309 "Computationally intensive for large d" 310 ], 311 "best_for": "Multivariate Gaussian-like data requiring robust covariance estimation", 312 "avoid_when": "Data is non-Gaussian, very high-dimensional, or strongly nonlinear", 313 "benchmark_refs": [], 314 "benchmark_rank": {}, 315 "paper": {"id": "mcd", "short": "Rousseeuw and Driessen, 1999"}, 316 "default_params": {"contamination": 0.1}, 317 "preprocessing_mode": "external", 318 "requires": [], 319 "version_added": "0.5.0" 320 }, 321 "CD": { 322 "class_path": "pyod.models.cd.CD", 323 "full_name": "Cook's Distance", 324 "status": "shipped", 325 "data_types": ["tabular"], 326 "category": "linear", 327 "complexity": {"time": "O(n * d^2)", "space": "O(n * d)"}, 328 "strengths": [ 329 "Based on classical influence measures from regression", 330 "Interpretable as influence on regression fit", 331 "Works well when a regression model is appropriate" 332 ], 333 "weaknesses": [ 334 "Requires a target variable (semi-supervised framing)", 335 "Assumes linear regression model", 336 "Not suitable for purely unsupervised settings without a model" 337 ], 338 "best_for": "Identifying influential observations in regression settings", 339 "avoid_when": "No natural target variable exists or data is purely unsupervised", 340 "benchmark_refs": [], 341 "benchmark_rank": {}, 342 "paper": {"id": "cd", "short": "Cook, 1977"}, 343 "default_params": {"contamination": 0.1}, 344 "preprocessing_mode": "external", 345 "requires": [], 346 "version_added": "0.8.5" 347 }, 348 "OCSVM": { 349 "class_path": "pyod.models.ocsvm.OCSVM", 350 "full_name": "One-Class Support Vector Machine", 351 "status": "shipped", 352 "data_types": ["tabular"], 353 "category": "linear", 354 "complexity": {"time": "O(n^2 * d) to O(n^3)", "space": "O(n * d)"}, 355 "strengths": [ 356 "Flexible boundary via kernel functions", 357 "Strong theoretical foundation from SVM literature", 358 "Effective in high-dimensional feature spaces" 
359 ], 360 "weaknesses": [ 361 "Cubic training time for large datasets", 362 "Sensitive to kernel and nu parameter choice", 363 "Does not scale well beyond ~10K samples" 364 ], 365 "best_for": "Medium-sized datasets where a flexible decision boundary is needed", 366 "avoid_when": "Dataset is very large or real-time training is required", 367 "benchmark_refs": ["ADBench"], 368 "benchmark_rank": {}, 369 "paper": {"id": "ocsvm", "short": "Scholkopf et al., 2001"}, 370 "default_params": {"contamination": 0.1, "kernel": "rbf"}, 371 "preprocessing_mode": "external", 372 "requires": [], 373 "version_added": "0.5.0" 374 }, 375 "LMDD": { 376 "class_path": "pyod.models.lmdd.LMDD", 377 "full_name": "Linear Model Deviation-based Detection", 378 "status": "shipped", 379 "data_types": ["tabular"], 380 "category": "linear", 381 "complexity": {"time": "O(n_iter * n * d)", "space": "O(n * d)"}, 382 "strengths": [ 383 "Multiple deviation measures available (AAD, variance, IQR)", 384 "Random subspace approach provides diversity", 385 "Interpretable linear projections" 386 ], 387 "weaknesses": [ 388 "Results can vary due to random projections", 389 "Requires tuning of number of iterations and measure", 390 "May underperform with nonlinear anomaly patterns" 391 ], 392 "best_for": "Multivariate data where anomalies are detectable through linear projections", 393 "avoid_when": "Anomalies require nonlinear feature combinations to detect", 394 "benchmark_refs": [], 395 "benchmark_rank": {}, 396 "paper": {"id": "lmdd", "short": "Arning et al., KDD 1996"}, 397 "default_params": {"contamination": 0.1, "n_iter": 50, "dis_measure": "aad"}, 398 "preprocessing_mode": "external", 399 "requires": [], 400 "version_added": "0.8.0" 401 }, 402 "LOF": { 403 "class_path": "pyod.models.lof.LOF", 404 "full_name": "Local Outlier Factor", 405 "status": "shipped", 406 "data_types": ["tabular"], 407 "category": "proximity", 408 "complexity": {"time": "O(n^2 * d)", "space": "O(n * d)"}, 409 "strengths": [ 410 
"Detects local density-based outliers", 411 "Adapts to varying densities across the dataset", 412 "Well-established and widely used" 413 ], 414 "weaknesses": [ 415 "Quadratic complexity limits scalability", 416 "Sensitive to n_neighbors parameter", 417 "Difficulty with uniformly dense regions" 418 ], 419 "best_for": "Datasets with clusters of varying densities where local anomalies are of interest", 420 "avoid_when": "Dataset is very large (>50K) or data is uniformly distributed", 421 "benchmark_refs": ["ADBench"], 422 "benchmark_rank": {}, 423 "paper": {"id": "lof", "short": "Breunig et al., SIGMOD 2000"}, 424 "default_params": {"contamination": 0.1, "n_neighbors": 20, "algorithm": "auto"}, 425 "preprocessing_mode": "external", 426 "requires": [], 427 "version_added": "0.5.0" 428 }, 429 "COF": { 430 "class_path": "pyod.models.cof.COF", 431 "full_name": "Connectivity-Based Outlier Factor", 432 "status": "shipped", 433 "data_types": ["tabular"], 434 "category": "proximity", 435 "complexity": {"time": "O(n^2 * d)", "space": "O(n^2)"}, 436 "strengths": [ 437 "Considers connectivity-based distances", 438 "Better than LOF for certain pattern structures", 439 "Handles low-density patterns in neighborhood chains" 440 ], 441 "weaknesses": [ 442 "Quadratic time and space complexity", 443 "Slow for large datasets", 444 "Marginal improvement over LOF in many cases" 445 ], 446 "best_for": "Datasets where outliers lie along sparse connectivity paths between clusters", 447 "avoid_when": "Dataset is large or simpler methods like LOF already perform well", 448 "benchmark_refs": [], 449 "benchmark_rank": {}, 450 "paper": {"id": "cof", "short": "Tang et al., 2002"}, 451 "default_params": {"contamination": 0.1, "n_neighbors": 20, "method": "fast"}, 452 "preprocessing_mode": "external", 453 "requires": [], 454 "version_added": "0.6.5" 455 }, 456 "CBLOF": { 457 "class_path": "pyod.models.cblof.CBLOF", 458 "full_name": "Cluster-Based Local Outlier Factor", 459 "status": "shipped", 460 
"data_types": ["tabular"], 461 "category": "proximity", 462 "complexity": {"time": "O(n * k * d)", "space": "O(n * d)"}, 463 "strengths": [ 464 "Combines clustering with local outlier scoring", 465 "Efficient through cluster-based summarization", 466 "Effective when data has clear cluster structure" 467 ], 468 "weaknesses": [ 469 "Depends on quality of underlying clustering", 470 "Sensitive to number of clusters parameter", 471 "May fail if data does not cluster well" 472 ], 473 "best_for": "Data with well-separated clusters where outliers deviate from cluster structure", 474 "avoid_when": "Data has no meaningful cluster structure or clusters are heavily overlapping", 475 "benchmark_refs": ["ADBench"], 476 "benchmark_rank": {}, 477 "paper": {"id": "cblof", "short": "He et al., 2003"}, 478 "default_params": {"contamination": 0.1, "n_clusters": 8}, 479 "preprocessing_mode": "external", 480 "requires": [], 481 "version_added": "0.5.0" 482 }, 483 "LOCI": { 484 "class_path": "pyod.models.loci.LOCI", 485 "full_name": "Local Correlation Integral", 486 "status": "shipped", 487 "data_types": ["tabular"], 488 "category": "proximity", 489 "complexity": {"time": "O(n^2 * d)", "space": "O(n^2)"}, 490 "strengths": [ 491 "Automatic determination of outlier threshold", 492 "Multi-granularity outlier detection via alpha parameter", 493 "Does not require explicit k-neighbor parameter" 494 ], 495 "weaknesses": [ 496 "Quadratic complexity makes it slow", 497 "Difficult to scale to large datasets", 498 "Parameter alpha requires tuning" 499 ], 500 "best_for": "Small to medium datasets where automatic threshold selection is valued", 501 "avoid_when": "Dataset is large or faster LOF-based methods are sufficient", 502 "benchmark_refs": [], 503 "benchmark_rank": {}, 504 "paper": {"id": "loci", "short": "Papadimitriou et al., ICDE 2003"}, 505 "default_params": {"contamination": 0.1, "alpha": 0.5, "k": 3}, 506 "preprocessing_mode": "external", 507 "requires": [], 508 "version_added": "0.6.0" 
509 }, 510 "HBOS": { 511 "class_path": "pyod.models.hbos.HBOS", 512 "full_name": "Histogram-Based Outlier Score", 513 "status": "shipped", 514 "data_types": ["tabular"], 515 "category": "proximity", 516 "complexity": {"time": "O(n * d)", "space": "O(n_bins * d)"}, 517 "strengths": [ 518 "Extremely fast - linear time complexity", 519 "Simple histogram-based approach", 520 "Scales well to large datasets" 521 ], 522 "weaknesses": [ 523 "Assumes feature independence", 524 "Cannot capture multivariate interactions", 525 "Sensitive to number of bins" 526 ], 527 "best_for": "Large-scale datasets where speed is critical and features are roughly independent", 528 "avoid_when": "Outliers only manifest through feature interactions or correlations", 529 "benchmark_refs": ["ADBench"], 530 "benchmark_rank": {"ADBench_overall": 7}, 531 "paper": {"id": "hbos", "short": "Goldstein and Dengel, KI 2012"}, 532 "default_params": {"contamination": 0.1, "n_bins": 10, "alpha": 0.1, "tol": 0.5}, 533 "preprocessing_mode": "external", 534 "requires": [], 535 "version_added": "0.5.0" 536 }, 537 "HDBSCAN": { 538 "class_path": "pyod.models.hdbscan.HDBSCAN", 539 "full_name": "Hierarchical Density-Based Spatial Clustering of Applications with Noise", 540 "status": "shipped", 541 "data_types": ["tabular"], 542 "category": "proximity", 543 "complexity": {"time": "O(n * log(n)) to O(n^2)", "space": "O(n)"}, 544 "strengths": [ 545 "Automatically finds clusters of varying density", 546 "Identifies noise points as potential outliers", 547 "Minimal parameter tuning required" 548 ], 549 "weaknesses": [ 550 "Performance depends on min_cluster_size choice", 551 "May be slow for very large datasets", 552 "Outlier scores derived from cluster membership" 553 ], 554 "best_for": "Datasets with variable-density clusters where noise points are outliers", 555 "avoid_when": "Data does not have cluster structure or very high-dimensional", 556 "benchmark_refs": [], 557 "benchmark_rank": {}, 558 "paper": {"id": 
"hdbscan", "short": "Campello et al., PAKDD 2013"}, 559 "default_params": {"min_cluster_size": 5}, 560 "preprocessing_mode": "external", 561 "requires": [], 562 "version_added": "0.9.8" 563 }, 564 "KNN": { 565 "class_path": "pyod.models.knn.KNN", 566 "full_name": "K-Nearest Neighbors Outlier Detection", 567 "status": "shipped", 568 "data_types": ["tabular"], 569 "category": "proximity", 570 "complexity": {"time": "O(n^2 * d)", "space": "O(n * d)"}, 571 "strengths": [ 572 "Simple and intuitive distance-based method", 573 "Multiple scoring methods (largest, mean, median)", 574 "Strong benchmark performance" 575 ], 576 "weaknesses": [ 577 "Quadratic complexity limits scalability", 578 "Sensitive to distance metric and k choice", 579 "Struggles with varying density clusters" 580 ], 581 "best_for": "General-purpose distance-based outlier detection on moderate-sized datasets", 582 "avoid_when": "Dataset is very large or has highly variable local densities", 583 "benchmark_refs": ["ADBench"], 584 "benchmark_rank": {"ADBench_overall": 4}, 585 "paper": {"id": "knn", "short": "Ramaswamy et al., SIGMOD 2000"}, 586 "default_params": {"contamination": 0.1, "n_neighbors": 5, "method": "largest"}, 587 "preprocessing_mode": "external", 588 "requires": [], 589 "version_added": "0.5.0" 590 }, 591 "SOD": { 592 "class_path": "pyod.models.sod.SOD", 593 "full_name": "Subspace Outlier Detection", 594 "status": "shipped", 595 "data_types": ["tabular"], 596 "category": "proximity", 597 "complexity": {"time": "O(n^2 * d)", "space": "O(n * d)"}, 598 "strengths": [ 599 "Detects outliers in axis-parallel subspaces", 600 "Effective when anomalies hide in subspaces", 601 "Reference set approach captures local structure" 602 ], 603 "weaknesses": [ 604 "Quadratic complexity", 605 "Requires tuning of ref_set and n_neighbors", 606 "May miss anomalies not aligned with axis-parallel subspaces" 607 ], 608 "best_for": "High-dimensional data where outliers deviate in axis-parallel subspaces", 609 
"avoid_when": "Data is low-dimensional or anomalies require oblique subspaces to detect", 610 "benchmark_refs": [], 611 "benchmark_rank": {}, 612 "paper": {"id": "sod", "short": "Kriegel et al., PAKDD 2009"}, 613 "default_params": {"contamination": 0.1, "n_neighbors": 20, "ref_set": 10}, 614 "preprocessing_mode": "external", 615 "requires": [], 616 "version_added": "0.6.0" 617 }, 618 "ROD": { 619 "class_path": "pyod.models.rod.ROD", 620 "full_name": "Rotation-Based Outlier Detection", 621 "status": "shipped", 622 "data_types": ["tabular"], 623 "category": "proximity", 624 "complexity": {"time": "O(n * d^2)", "space": "O(n * d)"}, 625 "strengths": [ 626 "Considers rotations to detect outliers across projections", 627 "Parameter-free (only contamination needed)", 628 "Supports parallel execution" 629 ], 630 "weaknesses": [ 631 "Designed primarily for 3D data, may not generalize well to very high dimensions", 632 "Limited theoretical analysis compared to classical methods" 633 ], 634 "best_for": "Low-dimensional data where rotation-invariant outlier detection is desired", 635 "avoid_when": "Data is high-dimensional or a well-tuned alternative is available", 636 "benchmark_refs": [], 637 "benchmark_rank": {}, 638 "paper": {"id": "rod", "short": "Almardeny et al., 2020"}, 639 "default_params": {"contamination": 0.1, "parallel_execution": false}, 640 "preprocessing_mode": "external", 641 "requires": [], 642 "version_added": "0.8.5" 643 }, 644 "IForest": { 645 "class_path": "pyod.models.iforest.IForest", 646 "full_name": "Isolation Forest", 647 "status": "shipped", 648 "data_types": ["tabular"], 649 "category": "ensemble", 650 "complexity": {"time": "O(n * t * log(n)) where t is n_estimators", "space": "O(t * n)"}, 651 "strengths": [ 652 "Excellent overall benchmark performance", 653 "Linear time complexity with efficient implementation", 654 "Handles high-dimensional data well", 655 "Does not require distance or density computation" 656 ], 657 "weaknesses": [ 658 "May 
struggle with local anomalies in dense regions", 659 "Axis-aligned splits can miss anomalies in correlated features" 660 ], 661 "best_for": "General-purpose anomaly detection especially on large or high-dimensional datasets", 662 "avoid_when": "Anomalies are local density deviations or features are strongly correlated", 663 "benchmark_refs": ["ADBench"], 664 "benchmark_rank": {"ADBench_overall": 3}, 665 "paper": {"id": "iforest", "short": "Liu et al., ICDM 2008"}, 666 "default_params": {"contamination": 0.1, "n_estimators": 100}, 667 "preprocessing_mode": "external", 668 "requires": [], 669 "version_added": "0.5.0" 670 }, 671 "INNE": { 672 "class_path": "pyod.models.inne.INNE", 673 "full_name": "Isolation-based Anomaly Detection Using Nearest-Neighbor Ensembles", 674 "status": "shipped", 675 "data_types": ["tabular"], 676 "category": "ensemble", 677 "complexity": {"time": "O(n * t * s) where t is n_estimators, s is sample size", "space": "O(t * s)"}, 678 "strengths": [ 679 "Combines isolation and nearest-neighbor concepts", 680 "Handles local anomalies better than Isolation Forest", 681 "Efficient sampling-based approach" 682 ], 683 "weaknesses": [ 684 "Requires tuning of sample size and number of estimators", 685 "Less established than Isolation Forest" 686 ], 687 "best_for": "Datasets where local density variations matter and Isolation Forest underperforms", 688 "avoid_when": "A simpler method like IForest already works well", 689 "benchmark_refs": [], 690 "benchmark_rank": {}, 691 "paper": {"id": "inne", "short": "Bandaragoda et al., Computational Intelligence 2018"}, 692 "default_params": {"contamination": 0.1}, 693 "preprocessing_mode": "external", 694 "requires": [], 695 "version_added": "0.9.5" 696 }, 697 "DIF": { 698 "class_path": "pyod.models.dif.DIF", 699 "full_name": "Deep Isolation Forest", 700 "status": "shipped", 701 "data_types": ["tabular"], 702 "category": "ensemble", 703 "complexity": {"time": "O(n * t * d * log(n))", "space": "O(t * n)"}, 704 "strengths": [ 705 "Uses 
deep random representations for better isolation", 706 "Handles complex data distributions", 707 "Extends isolation forest to representation space" 708 ], 709 "weaknesses": [ 710 "More computationally expensive than standard IForest", 711 "Requires tuning of representation parameters" 712 ], 713 "best_for": "Complex datasets where standard Isolation Forest misses anomalies due to axis-aligned splits", 714 "avoid_when": "Standard Isolation Forest performs well or dataset is small and simple", 715 "benchmark_refs": [], 716 "benchmark_rank": {}, 717 "paper": {"id": "dif", "short": "Xu et al., TKDE 2023"}, 718 "default_params": {"contamination": 0.1}, 719 "preprocessing_mode": "external", 720 "requires": [], 721 "version_added": "0.9.8" 722 }, 723 "FeatureBagging": { 724 "class_path": "pyod.models.feature_bagging.FeatureBagging", 725 "full_name": "Feature Bagging Outlier Detection", 726 "status": "shipped", 727 "data_types": ["tabular"], 728 "category": "ensemble", 729 "complexity": {"time": "O(n_estimators * base_detector_time)", "space": "O(n_estimators * base_detector_space)"}, 730 "strengths": [ 731 "Reduces variance through feature subsampling", 732 "Flexible with any base detector", 733 "Robust against irrelevant features" 734 ], 735 "weaknesses": [ 736 "Performance depends on choice of base detector", 737 "Slower than single detector due to ensemble overhead", 738 "May not improve over base if features are all relevant" 739 ], 740 "best_for": "High-dimensional data with potentially irrelevant features", 741 "avoid_when": "All features are relevant or a single strong detector suffices", 742 "benchmark_refs": [], 743 "benchmark_rank": {}, 744 "paper": {"id": "feature_bagging", "short": "Lazarevic and Kumar, KDD 2005"}, 745 "default_params": {"contamination": 0.1, "n_estimators": 10}, 746 "preprocessing_mode": "external", 747 "requires": ["combo"], 748 "version_added": "0.5.0" 749 }, 750 "LSCP": { 751 "class_path": "pyod.models.lscp.LSCP", 752 "full_name": "Locally 
Selective Combination of Parallel Outlier Ensembles", 753 "status": "shipped", 754 "data_types": ["tabular"], 755 "category": "ensemble", 756 "complexity": {"time": "O(n * n_detectors * base_cost)", "space": "O(n * n_detectors)"}, 757 "strengths": [ 758 "Locally selects the best detector for each region", 759 "Leverages diversity among base detectors", 760 "Adaptive combination strategy" 761 ], 762 "weaknesses": [ 763 "Requires a list of pre-instantiated base detectors", 764 "Slower due to training multiple detectors", 765 "Complex internal selection mechanism" 766 ], 767 "best_for": "Scenarios where diverse base detectors are available and local performance varies", 768 "avoid_when": "Only one detector type is appropriate or computational budget is limited", 769 "benchmark_refs": [], 770 "benchmark_rank": {}, 771 "paper": {"id": "lscp", "short": "Zhao et al., SDM 2019"}, 772 "default_params": {"local_region_size": 30}, 773 "preprocessing_mode": "external", 774 "requires": [], 775 "version_added": "0.6.5" 776 }, 777 "LODA": { 778 "class_path": "pyod.models.loda.LODA", 779 "full_name": "Lightweight Online Detector of Anomalies", 780 "status": "shipped", 781 "data_types": ["tabular"], 782 "category": "ensemble", 783 "complexity": {"time": "O(n * n_cuts * d)", "space": "O(n_bins * n_cuts)"}, 784 "strengths": [ 785 "Lightweight and fast", 786 "Supports online/streaming updates", 787 "Uses ensemble of random projections" 788 ], 789 "weaknesses": [ 790 "Random projections may miss certain anomaly patterns", 791 "Histogram-based scoring can be coarse", 792 "Sensitive to number of bins and cuts" 793 ], 794 "best_for": "Streaming or online anomaly detection with limited computational resources", 795 "avoid_when": "Batch setting with enough time for more powerful methods", 796 "benchmark_refs": [], 797 "benchmark_rank": {}, 798 "paper": {"id": "loda", "short": "Pevny, 2016"}, 799 "default_params": {"contamination": 0.1, "n_bins": 10, "n_random_cuts": 100}, 800 
"preprocessing_mode": "external", 801 "requires": [], 802 "version_added": "0.6.0" 803 }, 804 "SUOD": { 805 "class_path": "pyod.models.suod.SUOD", 806 "full_name": "Scalable Unsupervised Outlier Detection", 807 "status": "shipped", 808 "data_types": ["tabular"], 809 "category": "ensemble", 810 "complexity": {"time": "varies (depends on base estimators)", "space": "varies"}, 811 "strengths": [ 812 "Accelerates large-scale outlier detection via approximation", 813 "Supports parallel execution of base detectors", 814 "Modular framework for combining multiple detectors" 815 ], 816 "weaknesses": [ 817 "Approximation may reduce accuracy", 818 "Overhead from the acceleration framework", 819 "Requires choosing and configuring base estimators" 820 ], 821 "best_for": "Large-scale datasets where running multiple detectors is desired but time is limited", 822 "avoid_when": "Exact results from a single well-chosen detector are preferred", 823 "benchmark_refs": [], 824 "benchmark_rank": {}, 825 "paper": {"id": "suod", "short": "Zhao et al., MLSys 2021"}, 826 "default_params": {"contamination": 0.1}, 827 "preprocessing_mode": "external", 828 "requires": ["suod"], 829 "version_added": "0.8.0" 830 }, 831 "XGBOD": { 832 "class_path": "pyod.models.xgbod.XGBOD", 833 "full_name": "Extreme Gradient Boosting Outlier Detection", 834 "status": "shipped", 835 "data_types": ["tabular"], 836 "category": "ensemble", 837 "complexity": {"time": "O(n * d * n_estimators * log(n))", "space": "O(n * d)"}, 838 "strengths": [ 839 "Combines unsupervised representations with supervised XGBoost", 840 "High accuracy when labels are available", 841 "Leverages powerful gradient boosting framework" 842 ], 843 "weaknesses": [ 844 "Requires labeled data (semi-supervised)", 845 "Depends on XGBoost library", 846 "Training is more expensive than unsupervised methods" 847 ], 848 "best_for": "Semi-supervised settings where some labeled anomalies are available", 849 "avoid_when": "No labeled data is available or a 
purely unsupervised approach is needed", 850 "benchmark_refs": [], 851 "benchmark_rank": {}, 852 "paper": {"id": "xgbod", "short": "Zhao and Hryniewicki, IJCNN 2018"}, 853 "default_params": {}, 854 "preprocessing_mode": "external", 855 "requires": ["xgboost"], 856 "version_added": "0.5.0" 857 }, 858 "AutoEncoder": { 859 "class_path": "pyod.models.auto_encoder.AutoEncoder", 860 "full_name": "Fully Connected AutoEncoder", 861 "status": "shipped", 862 "data_types": ["tabular"], 863 "category": "deep_learning", 864 "complexity": {"time": "O(n * d * h * epochs) where h is hidden size", "space": "O(d * h)"}, 865 "strengths": [ 866 "Learns nonlinear feature representations", 867 "Reconstruction error is an intuitive anomaly score", 868 "Flexible architecture" 869 ], 870 "weaknesses": [ 871 "Requires tuning of architecture and hyperparameters", 872 "May overfit on small datasets", 873 "Training can be unstable" 874 ], 875 "best_for": "Datasets with complex nonlinear structure where reconstruction-based scoring is appropriate", 876 "avoid_when": "Dataset is small, tabular and simple methods suffice, or training time is limited", 877 "benchmark_refs": [], 878 "benchmark_rank": {}, 879 "paper": {"id": "autoencoder", "short": "Aggarwal, 2017"}, 880 "default_params": {"contamination": 0.1}, 881 "preprocessing_mode": "external", 882 "requires": ["torch"], 883 "version_added": "0.6.0" 884 }, 885 "VAE": { 886 "class_path": "pyod.models.vae.VAE", 887 "full_name": "Variational AutoEncoder", 888 "status": "shipped", 889 "data_types": ["tabular"], 890 "category": "deep_learning", 891 "complexity": {"time": "O(n * d * h * epochs)", "space": "O(d * h)"}, 892 "strengths": [ 893 "Probabilistic latent space with regularization", 894 "Generates calibrated reconstruction probabilities", 895 "Learns smooth latent representations" 896 ], 897 "weaknesses": [ 898 "More complex to train than standard autoencoder", 899 "KL divergence term may dominate loss", 900 "Requires careful balancing of loss 
components" 901 ], 902 "best_for": "Datasets where probabilistic reconstruction scoring and smooth latent spaces are beneficial", 903 "avoid_when": "Simpler autoencoder or non-deep methods work well, or dataset is very small", 904 "benchmark_refs": [], 905 "benchmark_rank": {}, 906 "paper": {"id": "vae", "short": "Kingma and Welling, 2014"}, 907 "default_params": {"contamination": 0.1}, 908 "preprocessing_mode": "external", 909 "requires": ["torch"], 910 "version_added": "0.6.0" 911 }, 912 "SO_GAAL": { 913 "class_path": "pyod.models.so_gaal.SO_GAAL", 914 "full_name": "Single-Objective Generative Adversarial Active Learning", 915 "status": "shipped", 916 "data_types": ["tabular"], 917 "category": "deep_learning", 918 "complexity": {"time": "O(n * d * h * epochs)", "space": "O(d * h)"}, 919 "strengths": [ 920 "GAN-based approach generates informative outlier samples", 921 "Does not require labeled anomalies", 922 "Novel adversarial training framework for OD" 923 ], 924 "weaknesses": [ 925 "GAN training instability", 926 "Sensitive to hyperparameters and architecture", 927 "Slow training compared to non-deep methods" 928 ], 929 "best_for": "Exploratory anomaly detection with GAN-generated reference outliers", 930 "avoid_when": "Stable and fast results are required, or dataset is small", 931 "benchmark_refs": [], 932 "benchmark_rank": {}, 933 "paper": {"id": "so_gaal", "short": "Liu et al., 2019"}, 934 "default_params": {"contamination": 0.1}, 935 "preprocessing_mode": "external", 936 "requires": ["torch"], 937 "version_added": "0.6.0" 938 }, 939 "MO_GAAL": { 940 "class_path": "pyod.models.mo_gaal.MO_GAAL", 941 "full_name": "Multiple-Objective Generative Adversarial Active Learning", 942 "status": "shipped", 943 "data_types": ["tabular"], 944 "category": "deep_learning", 945 "complexity": {"time": "O(k * n * d * h * epochs) where k is number of generators", "space": "O(k * d * h)"}, 946 "strengths": [ 947 "Multiple generators provide diverse outlier references", 948 
"Better coverage of outlier space than SO_GAAL", 949 "Multi-objective formulation improves robustness" 950 ], 951 "weaknesses": [ 952 "Even more complex training than SO_GAAL", 953 "Multiple generators increase computational cost", 954 "Difficult to tune" 955 ], 956 "best_for": "Complex datasets where diverse generated outlier references improve detection", 957 "avoid_when": "Computational resources are limited or simpler GAN approaches suffice", 958 "benchmark_refs": [], 959 "benchmark_rank": {}, 960 "paper": {"id": "mo_gaal", "short": "Liu et al., 2019"}, 961 "default_params": {"k": 10, "stop_epochs": 20, "contamination": 0.1}, 962 "preprocessing_mode": "external", 963 "requires": ["torch"], 964 "version_added": "0.6.0" 965 }, 966 "DeepSVDD": { 967 "class_path": "pyod.models.deep_svdd.DeepSVDD", 968 "full_name": "Deep Support Vector Data Description", 969 "status": "shipped", 970 "data_types": ["tabular"], 971 "category": "deep_learning", 972 "complexity": {"time": "O(n * d * h * epochs)", "space": "O(d * h)"}, 973 "strengths": [ 974 "Learns a compact hypersphere around normal data", 975 "Combines deep learning with SVDD objective", 976 "Effective for one-class classification" 977 ], 978 "weaknesses": [ 979 "Sensitive to network architecture choices", 980 "Risk of hypersphere collapse", 981 "Requires careful initialization" 982 ], 983 "best_for": "One-class anomaly detection where a compact normal data description is desired", 984 "avoid_when": "Normal data is multi-modal or simpler one-class methods are sufficient", 985 "benchmark_refs": [], 986 "benchmark_rank": {}, 987 "paper": {"id": "deep_svdd", "short": "Ruff et al., ICML 2018"}, 988 "default_params": {"contamination": 0.1}, 989 "preprocessing_mode": "external", 990 "requires": ["torch"], 991 "version_added": "0.7.5" 992 }, 993 "AnoGAN": { 994 "class_path": "pyod.models.anogan.AnoGAN", 995 "full_name": "Anomaly Detection with Generative Adversarial Networks", 996 "status": "shipped", 997 "data_types": 
["tabular"], 998 "category": "deep_learning", 999 "complexity": {"time": "O(n * d * h * epochs) + O(n * iterations) for inference", "space": "O(d * h)"}, 1000 "strengths": [ 1001 "GAN learns the normal data distribution", 1002 "Can detect complex non-linear anomalies", 1003 "Anomaly scoring via reconstruction in latent space" 1004 ], 1005 "weaknesses": [ 1006 "GAN training instability", 1007 "Slow inference due to iterative latent optimization", 1008 "Requires significant tuning" 1009 ], 1010 "best_for": "Complex data distributions where GAN-based generation quality is high", 1011 "avoid_when": "Fast inference is needed or training instability is a concern", 1012 "benchmark_refs": [], 1013 "benchmark_rank": {}, 1014 "paper": {"id": "anogan", "short": "Schlegl et al., IPMI 2017"}, 1015 "default_params": {"contamination": 0.1}, 1016 "preprocessing_mode": "external", 1017 "requires": ["torch"], 1018 "version_added": "0.7.5" 1019 }, 1020 "ALAD": { 1021 "class_path": "pyod.models.alad.ALAD", 1022 "full_name": "Adversarially Learned Anomaly Detection", 1023 "status": "shipped", 1024 "data_types": ["tabular"], 1025 "category": "deep_learning", 1026 "complexity": {"time": "O(n * d * h * epochs)", "space": "O(d * h)"}, 1027 "strengths": [ 1028 "Bi-directional GAN avoids slow iterative inference of AnoGAN", 1029 "Multiple discriminators stabilize training", 1030 "Fast inference after training" 1031 ], 1032 "weaknesses": [ 1033 "Complex architecture with multiple networks", 1034 "Still subject to GAN training challenges", 1035 "Many hyperparameters to tune" 1036 ], 1037 "best_for": "Scenarios where GAN-based detection is desired but fast inference is needed", 1038 "avoid_when": "Simpler reconstruction-based deep methods suffice or dataset is small", 1039 "benchmark_refs": [], 1040 "benchmark_rank": {}, 1041 "paper": {"id": "alad", "short": "Zenati et al., ICDM 2018"}, 1042 "default_params": {"contamination": 0.1}, 1043 "preprocessing_mode": "external", 1044 "requires": 
["torch"], 1045 "version_added": "0.7.5" 1046 }, 1047 "AE1SVM": { 1048 "class_path": "pyod.models.ae1svm.AE1SVM", 1049 "full_name": "AutoEncoder with One-Class SVM", 1050 "status": "shipped", 1051 "data_types": ["tabular"], 1052 "category": "deep_learning", 1053 "complexity": {"time": "O(n * d * h * epochs)", "space": "O(d * h + n_sv * h)"}, 1054 "strengths": [ 1055 "Jointly optimizes autoencoder and one-class SVM", 1056 "Combines representation learning with boundary detection", 1057 "End-to-end training" 1058 ], 1059 "weaknesses": [ 1060 "Complex joint optimization", 1061 "Requires tuning of both AE and SVM hyperparameters", 1062 "Training can be unstable" 1063 ], 1064 "best_for": "Datasets benefiting from joint representation learning and one-class classification", 1065 "avoid_when": "Simpler pipeline of separate AE + SVM works well, or dataset is small", 1066 "benchmark_refs": [], 1067 "benchmark_rank": {}, 1068 "paper": {"id": "ae1svm", "short": "Nguyen and Vien, ECML-PKDD 2019"}, 1069 "default_params": {"contamination": 0.1}, 1070 "preprocessing_mode": "external", 1071 "requires": ["torch"], 1072 "version_added": "0.9.0" 1073 }, 1074 "DevNet": { 1075 "class_path": "pyod.models.devnet.DevNet", 1076 "full_name": "Deep Anomaly Detection with Deviation Networks", 1077 "status": "shipped", 1078 "data_types": ["tabular"], 1079 "category": "deep_learning", 1080 "complexity": {"time": "O(n * d * h * epochs)", "space": "O(d * h)"}, 1081 "strengths": [ 1082 "End-to-end deep anomaly scoring network", 1083 "Can leverage a few labeled anomalies (semi-supervised)", 1084 "Deviation loss directly optimizes anomaly scores" 1085 ], 1086 "weaknesses": [ 1087 "Requires at least a few labeled anomalies for best results", 1088 "Deep network training overhead", 1089 "Sensitive to architecture and loss parameters" 1090 ], 1091 "best_for": "Semi-supervised anomaly detection where a small number of labeled anomalies are available", 1092 "avoid_when": "No labeled anomalies are 
available or dataset is too small for deep learning", 1093 "benchmark_refs": [], 1094 "benchmark_rank": {}, 1095 "paper": {"id": "devnet", "short": "Pang et al., KDD 2019"}, 1096 "default_params": {"contamination": 0.1}, 1097 "preprocessing_mode": "external", 1098 "requires": ["torch"], 1099 "version_added": "0.9.5" 1100 }, 1101 "RGraph": { 1102 "class_path": "pyod.models.rgraph.RGraph", 1103 "full_name": "R-Graph Outlier Detection", 1104 "status": "shipped", 1105 "data_types": ["tabular"], 1106 "category": "graph", 1107 "complexity": {"time": "O(n^2 * d + transition_steps * n^2)", "space": "O(n^2)"}, 1108 "strengths": [ 1109 "Graph-based approach captures relational structure", 1110 "Random walk on graph reveals connectivity anomalies", 1111 "Tunable transition steps for multi-scale detection" 1112 ], 1113 "weaknesses": [ 1114 "Quadratic memory for adjacency/transition matrix", 1115 "Slow for large datasets", 1116 "Requires tuning of graph construction parameters" 1117 ], 1118 "best_for": "Datasets where graph connectivity and neighborhood structure reveal anomalies", 1119 "avoid_when": "Dataset is very large or a simpler proximity method suffices", 1120 "benchmark_refs": [], 1121 "benchmark_rank": {}, 1122 "paper": {"id": "rgraph", "short": "You et al., CVPR 2017"}, 1123 "default_params": {"contamination": 0.1, "transition_steps": 10, "n_nonzero": 10, "gamma": 50.0}, 1124 "preprocessing_mode": "external", 1125 "requires": [], 1126 "version_added": "0.8.5" 1127 }, 1128 "LUNAR": { 1129 "class_path": "pyod.models.lunar.LUNAR", 1130 "full_name": "Learnable Unified Neighborhood-based Anomaly Ranking", 1131 "status": "shipped", 1132 "data_types": ["tabular"], 1133 "category": "graph", 1134 "complexity": {"time": "O(n * k * d + n * h * epochs)", "space": "O(n * k + d * h)"}, 1135 "strengths": [ 1136 "Learns to score anomalies from neighbor graphs via GNN", 1137 "Combines neighborhood structure with learned representations", 1138 "Supports both score and weight model 
types" 1139 ], 1140 "weaknesses": [ 1141 "Requires PyTorch for GNN training", 1142 "More complex setup than classical methods", 1143 "Training overhead from neural network" 1144 ], 1145 "best_for": "Datasets where learned neighborhood-based scoring outperforms handcrafted rules", 1146 "avoid_when": "PyTorch is not available or simpler KNN/LOF methods work well", 1147 "benchmark_refs": [], 1148 "benchmark_rank": {}, 1149 "paper": {"id": "lunar", "short": "Goodge et al., AAAI 2022"}, 1150 "default_params": {"contamination": 0.1, "model_type": "WEIGHT", "n_neighbours": 5}, 1151 "preprocessing_mode": "external", 1152 "requires": ["torch"], 1153 "version_added": "0.9.5" 1154 }, 1155 "EmbeddingOD": { 1156 "class_path": "pyod.models.embedding.EmbeddingOD", 1157 "full_name": "Embedding-Based Outlier Detection", 1158 "status": "shipped", 1159 "data_types": ["text", "image"], 1160 "category": "embedding", 1161 "complexity": {"time": "O(n * embedding_cost + detector_cost)", "space": "O(n * embedding_dim)"}, 1162 "strengths": [ 1163 "Leverages foundation model embeddings for anomaly detection", 1164 "Supports text and image data natively", 1165 "Flexible choice of downstream detector" 1166 ], 1167 "weaknesses": [ 1168 "Requires a pre-trained encoder model", 1169 "Embedding quality depends on the foundation model", 1170 "Higher memory usage for large embedding dimensions" 1171 ], 1172 "best_for": "Anomaly detection on unstructured data (text, images) via foundation model representations", 1173 "avoid_when": "Data is already tabular or a suitable encoder is not available", 1174 "benchmark_refs": [], 1175 "benchmark_rank": {}, 1176 "paper": {"id": "embedding_od", "short": "Zhao et al., 2025"}, 1177 "default_params": {"contamination": 0.1, "detector": "LUNAR"}, 1178 "preprocessing_mode": "internal", 1179 "requires": ["torch"], 1180 "version_added": "2.0.5" 1181 }, 1182 "MultiModalOD": { 1183 "class_path": "pyod.models.embedding.MultiModalOD", 1184 "full_name": "Multi-Modal Outlier 
Detection", 1185 "status": "shipped", 1186 "data_types": ["text", "image", "multimodal"], 1187 "category": "embedding", 1188 "complexity": {"time": "O(n * n_modalities * embedding_cost + detector_cost)", "space": "O(n * n_modalities * embedding_dim)"}, 1189 "strengths": [ 1190 "Combines multiple data modalities for anomaly detection", 1191 "Supports flexible modality combination strategies", 1192 "Leverages foundation model embeddings per modality" 1193 ], 1194 "weaknesses": [ 1195 "Requires encoder for each modality", 1196 "Higher computational cost with multiple modalities", 1197 "Combination strategy choice affects performance" 1198 ], 1199 "best_for": "Anomaly detection on multi-modal data combining text, image, or other modalities", 1200 "avoid_when": "Only a single modality is available or data is purely tabular", 1201 "benchmark_refs": [], 1202 "benchmark_rank": {}, 1203 "paper": {"id": "multimodal_od", "short": "Zhao et al., 2025"}, 1204 "default_params": {"contamination": 0.1, "combination": "average"}, 1205 "preprocessing_mode": "internal", 1206 "requires": ["torch"], 1207 "version_added": "2.1.0" 1208 }, 1209 "LLMAD": { 1210 "class_path": "pyod.models.llmad.LLMAD", 1211 "full_name": "LLM-Based Anomaly Detection", 1212 "status": "planned", 1213 "data_types": ["tabular", "text"], 1214 "category": "embedding", 1215 "complexity": {"time": "varies", "space": "varies"}, 1216 "strengths": [ 1217 "Zero-shot anomaly detection via LLM reasoning", 1218 "No training data required", 1219 "Handles diverse data types through natural language understanding" 1220 ], 1221 "weaknesses": [ 1222 "Not yet implemented", 1223 "Requires LLM API access with associated costs", 1224 "Inference latency from LLM calls" 1225 ], 1226 "best_for": "Zero-shot or few-shot anomaly detection leveraging LLM world knowledge", 1227 "avoid_when": "Feature is needed before release or LLM API costs are prohibitive", 1228 "benchmark_refs": [], 1229 "benchmark_rank": {}, 1230 "paper": {"id": 
"llmad", "short": "TBD"}, 1231 "default_params": {"contamination": 0.1}, 1232 "preprocessing_mode": "internal", 1233 "requires": [], 1234 "version_added": "TBD" 1235 }, 1236 "TimeSeriesOD": { 1237 "class_path": "pyod.models.ts_od.TimeSeriesOD", 1238 "full_name": "Time Series Outlier Detection", 1239 "status": "shipped", 1240 "data_types": ["time_series"], 1241 "category": "time_series", 1242 "complexity": {"time": "varies by detector", "space": "varies by detector"}, 1243 "strengths": [ 1244 "Unified interface for time-series anomaly detection", 1245 "Bridges PyOD detectors with sliding-window preprocessing", 1246 "Supports any PyOD base detector" 1247 ], 1248 "weaknesses": [ 1249 "Inherits limitations of chosen base detector", 1250 "Window size must be tuned per dataset" 1251 ], 1252 "best_for": "General-purpose time series anomaly detection with any PyOD detector", 1253 "avoid_when": "Specialized temporal methods (LSTM, Transformer) are more appropriate", 1254 "benchmark_refs": ["TSB_AD"], 1255 "benchmark_rank": {"TSB_AD_overall_iforest": 16, "TSB_AD_point_iforest": 8}, 1256 "paper": {"id": "tsod", "short": "Zhao et al., 2024"}, 1257 "default_params": {"detector": "IForest", "window_size": 50, "contamination": 0.1}, 1258 "preprocessing_mode": "external", 1259 "requires": [], 1260 "version_added": "2.2.0" 1261 }, 1262 "MatrixProfile": { 1263 "class_path": "pyod.models.ts_matrix_profile.MatrixProfile", 1264 "full_name": "Matrix Profile (STOMP)", 1265 "status": "shipped", 1266 "data_types": ["time_series"], 1267 "category": "time_series", 1268 "complexity": {"time": "O(n^2)", "space": "O(n)"}, 1269 "strengths": ["No parameters beyond window size", "Exact nearest-neighbor distances", "Well-studied theoretically"], 1270 "weaknesses": ["Transductive only (no out-of-sample prediction)", "O(n^2) may be slow on long series", "Single-threaded in v1"], 1271 "best_for": "Subsequence anomaly detection where exact distances matter", 1272 "avoid_when": "Out-of-sample prediction 
is needed", 1273 "benchmark_refs": ["TSB_AD"], 1274 "benchmark_rank": {"TSB_AD_overall": 10, "TSB_AD_short": 4}, 1275 "paper": {"id": "yeh2016matrix", "short": "Yeh et al., ICDM 2016"}, 1276 "default_params": {"window_size": 50, "contamination": 0.1}, 1277 "preprocessing_mode": "external", 1278 "requires": [], 1279 "version_added": "2.2.0" 1280 }, 1281 "SpectralResidual": { 1282 "class_path": "pyod.models.ts_spectral_residual.SpectralResidual", 1283 "full_name": "Spectral Residual Anomaly Detection", 1284 "status": "shipped", 1285 "data_types": ["time_series"], 1286 "category": "time_series", 1287 "complexity": {"time": "O(n log n)", "space": "O(n)"}, 1288 "strengths": ["Very fast (FFT-based)", "No training needed", "Works well on periodic data", "#3 for point anomalies in TSB-AD"], 1289 "weaknesses": ["Assumes frequency-domain structure", "Less effective on non-periodic data"], 1290 "best_for": "Fast detection of point/spike anomalies in periodic or seasonal time series", 1291 "avoid_when": "Data has no frequency structure", 1292 "benchmark_refs": ["TSB_AD"], 1293 "benchmark_rank": {"TSB_AD_overall": 14, "TSB_AD_point": 3, "TSB_AD_short": 8}, 1294 "paper": {"id": "ren2019time", "short": "Ren et al., KDD 2019"}, 1295 "default_params": {"score_window": 3, "contamination": 0.1}, 1296 "preprocessing_mode": "external", 1297 "requires": [], 1298 "version_added": "2.2.0" 1299 }, 1300 "KShape": { 1301 "class_path": "pyod.models.ts_kshape.KShape", 1302 "full_name": "k-Shape Clustering Anomaly Detection", 1303 "status": "shipped", 1304 "data_types": ["time_series"], 1305 "category": "time_series", 1306 "complexity": {"time": "O(n * k * max_iter)", "space": "O(n * m)"}, 1307 "strengths": ["Shape-aware clustering", "Handles shifted patterns", "Top-2 overall in TSB-AD benchmark"], 1308 "weaknesses": ["Sensitive to n_clusters choice", "Degrades on long time series"], 1309 "best_for": "Detecting shape-based anomalies in short-to-medium time series subsequences", 1310 
"avoid_when": "Time series is very long (performance degrades)", 1311 "benchmark_refs": ["TSB_AD"], 1312 "benchmark_rank": {"TSB_AD_overall": 2, "TSB_AD_short": 2, "TSB_AD_point": 6, "TSB_AD_long": 9}, 1313 "paper": {"id": "paparrizos2015kshape", "short": "Paparrizos & Gravano, SIGMOD 2015"}, 1314 "default_params": {"n_clusters": 3, "window_size": 50, "contamination": 0.1}, 1315 "preprocessing_mode": "external", 1316 "requires": [], 1317 "version_added": "2.2.0" 1318 }, 1319 "SAND": { 1320 "class_path": "pyod.models.ts_sand.SAND", 1321 "full_name": "Streaming Subsequence Anomaly Detection", 1322 "status": "experimental", 1323 "data_types": ["time_series"], 1324 "category": "time_series", 1325 "complexity": {"time": "O(n * k)", "space": "O(k * m)"}, 1326 "strengths": ["Handles concept drift", "Streaming-compatible", "Adapts to changing patterns", "#3 on short time series in TSB-AD"], 1327 "weaknesses": ["Experimental implementation", "Simplified from original paper"], 1328 "best_for": "Non-stationary time series with evolving normal patterns, especially short series", 1329 "avoid_when": "Production reliability is critical", 1330 "benchmark_refs": ["TSB_AD"], 1331 "benchmark_rank": {"TSB_AD_overall": 11, "TSB_AD_short": 3}, 1332 "paper": {"id": "boniol2021sand", "short": "Boniol et al., VLDB 2021"}, 1333 "default_params": {"n_clusters": 5, "window_size": 50, "contamination": 0.1}, 1334 "preprocessing_mode": "external", 1335 "requires": [], 1336 "version_added": "2.2.0" 1337 }, 1338 "LSTMAD": { 1339 "class_path": "pyod.models.ts_lstm.LSTMAD", 1340 "full_name": "LSTM-based Anomaly Detection", 1341 "status": "shipped", 1342 "data_types": ["time_series"], 1343 "category": "time_series", 1344 "complexity": {"time": "O(n * epochs)", "space": "O(model_params)"}, 1345 "strengths": ["Captures temporal dependencies", "Native multivariate support", "Mahalanobis distance scoring"], 1346 "weaknesses": ["Requires PyTorch", "Slower training than classical methods", "Needs sufficient data"], 
1347 "best_for": "Multivariate and long time series with complex temporal patterns", 1348 "avoid_when": "Fast inference is critical or data is very short", 1349 "benchmark_refs": ["TSB_AD"], 1350 "benchmark_rank": {"TSB_AD_overall": 13, "TSB_AD_multivariate": 4, "TSB_AD_long": 8}, 1351 "paper": {"id": "malhotra2015long", "short": "Malhotra et al., ESANN 2015"}, 1352 "default_params": {"window_size": 50, "epochs": 50, "contamination": 0.1}, 1353 "preprocessing_mode": "external", 1354 "requires": ["torch"], 1355 "version_added": "2.2.0" 1356 }, 1357 "AnomalyTransformer": { 1358 "class_path": "pyod.models.ts_anomaly_transformer.AnomalyTransformer", 1359 "full_name": "Anomaly Transformer", 1360 "status": "experimental", 1361 "data_types": ["time_series"], 1362 "category": "time_series", 1363 "complexity": {"time": "O(n * L * d^2)", "space": "O(model_params)"}, 1364 "strengths": ["Association discrepancy is theoretically motivated", "Native multivariate"], 1365 "weaknesses": ["Requires PyTorch", "Complex architecture", "High memory usage", "Last place in TSB-AD benchmark (32/32)"], 1366 "best_for": "Research use only; simpler methods outperform on standard benchmarks", 1367 "avoid_when": "Accuracy matters -- underperforms simpler methods like MatrixProfile and IForest on all TSB-AD scenarios", 1368 "benchmark_refs": ["TSB_AD"], 1369 "benchmark_rank": {"TSB_AD_overall": 32}, 1370 "paper": {"id": "xu2022anomaly", "short": "Xu et al., ICLR 2022"}, 1371 "default_params": {"window_size": 100, "d_model": 512, "epochs": 10, "contamination": 0.1}, 1372 "preprocessing_mode": "external", 1373 "requires": ["torch"], 1374 "version_added": "2.2.0" 1375 }, 1376 "DOMINANT": { 1377 "class_path": "pyod.models.pyg_dominant.DOMINANT", 1378 "full_name": "Deep Anomaly Detection on Attributed Networks", 1379 "status": "shipped", 1380 "data_types": ["graph"], 1381 "category": "graph", 1382 "complexity": {"time": "O(epochs * (n * d_h + n^2))", "space": "O(n^2)"}, 1383 "strengths": ["Joint 
structure+attribute reconstruction", "Strong BOND benchmark performance", "Standard GCN architecture"], 1384 "weaknesses": ["O(n^2) memory for adjacency reconstruction", "Requires node features"], 1385 "best_for": "Attributed graphs where anomalies manifest in both structure and attributes", 1386 "avoid_when": "Graph is very large (>10k nodes) or has no node features", 1387 "benchmark_refs": ["BOND"], 1388 "benchmark_rank": {"BOND_deep": 1}, 1389 "paper": {"id": "ding2019dominant", "short": "Ding et al., SDM 2019"}, 1390 "default_params": {"hidden_dim": 64, "num_layers": 2, "epochs": 100, "contamination": 0.1}, 1391 "preprocessing_mode": "external", 1392 "requires": ["torch_geometric"], 1393 "version_added": "2.2.0" 1394 }, 1395 "CoLA": { 1396 "class_path": "pyod.models.pyg_cola.CoLA", 1397 "full_name": "Contrastive Self-Supervised Anomaly Detection", 1398 "status": "shipped", 1399 "data_types": ["graph"], 1400 "category": "graph", 1401 "complexity": {"time": "O(epochs * n * d_h)", "space": "O(n * d_h + m)"}, 1402 "strengths": ["Contrastive learning captures local-context discrepancy", "Strong BOND performance", "Sparse neighbor aggregation"], 1403 "weaknesses": ["Sensitive to graph connectivity", "Requires node features"], 1404 "best_for": "Attributed graphs where anomalies have unusual local neighborhoods", 1405 "avoid_when": "Graph is disconnected or has no node features", 1406 "benchmark_refs": ["BOND"], 1407 "benchmark_rank": {"BOND_deep": 2}, 1408 "paper": {"id": "liu2022cola", "short": "Liu et al., TNNLS 2022"}, 1409 "default_params": {"hidden_dim": 64, "num_layers": 2, "epochs": 100, "contamination": 0.1}, 1410 "preprocessing_mode": "external", 1411 "requires": ["torch_geometric"], 1412 "version_added": "2.2.0" 1413 }, 1414 "CONAD": { 1415 "class_path": "pyod.models.pyg_conad.CONAD", 1416 "full_name": "Contrastive Attributed Network Anomaly Detection", 1417 "status": "shipped", 1418 "data_types": ["graph"], 1419 "category": "graph", 1420 "complexity": 
{"time": "O(epochs * n * d_h)", "space": "O(n * d_h)"}, 1421 "strengths": ["Data augmentation improves robustness", "Dual objective (contrastive + reconstruction)"], 1422 "weaknesses": ["Augmentation ratio is a sensitive hyperparameter", "Requires node features"], 1423 "best_for": "Attributed graphs where robustness to noise is important", 1424 "avoid_when": "Graph structure is too sparse for meaningful augmentation", 1425 "benchmark_refs": ["BOND"], 1426 "benchmark_rank": {}, 1427 "paper": {"id": "xu2022conad", "short": "Xu et al., PAKDD 2022"}, 1428 "default_params": {"hidden_dim": 64, "epochs": 100, "aug_ratio": 0.2, "contamination": 0.1}, 1429 "preprocessing_mode": "external", 1430 "requires": ["torch_geometric"], 1431 "version_added": "2.2.0" 1432 }, 1433 "AnomalyDAE": { 1434 "class_path": "pyod.models.pyg_anomalydae.AnomalyDAE", 1435 "full_name": "Dual Autoencoder for Anomaly Detection", 1436 "status": "shipped", 1437 "data_types": ["graph"], 1438 "category": "graph", 1439 "complexity": {"time": "O(epochs * (n * d_h + n^2))", "space": "O(n^2)"}, 1440 "strengths": ["Attention-based structure encoding (GAT)", "Separate structure and attribute autoencoders"], 1441 "weaknesses": ["O(n^2) memory for adjacency reconstruction", "Requires node features"], 1442 "best_for": "Attributed graphs where attention over neighbors reveals anomaly patterns", 1443 "avoid_when": "Graph is very large or structure is unimportant", 1444 "benchmark_refs": ["BOND"], 1445 "benchmark_rank": {}, 1446 "paper": {"id": "fan2020anomalydae", "short": "Fan et al., ICASSP 2020"}, 1447 "default_params": {"embed_dim": 64, "num_heads": 4, "epochs": 100, "contamination": 0.1}, 1448 "preprocessing_mode": "external", 1449 "requires": ["torch_geometric"], 1450 "version_added": "2.2.0" 1451 }, 1452 "GUIDE": { 1453 "class_path": "pyod.models.pyg_guide.GUIDE", 1454 "full_name": "Higher-order Structure Based Anomaly Detection", 1455 "status": "shipped", 1456 "data_types": ["graph"], 1457 "category": 
"graph", 1458 "complexity": {"time": "O(epochs * n * d_h + m * d_avg)", "space": "O(n^2)"}, 1459 "strengths": ["Exploits higher-order motifs (triangles)", "Dual-view captures different structural signals"], 1460 "weaknesses": ["Motif construction adds overhead", "Sparse graphs may have few triangles"], 1461 "best_for": "Dense attributed graphs with meaningful higher-order structures", 1462 "avoid_when": "Graph is tree-like (no triangles) or very large", 1463 "benchmark_refs": ["BOND"], 1464 "benchmark_rank": {}, 1465 "paper": {"id": "yuan2021guide", "short": "Yuan et al., BigData 2021"}, 1466 "default_params": {"hidden_dim": 64, "epochs": 100, "contamination": 0.1}, 1467 "preprocessing_mode": "external", 1468 "requires": ["torch_geometric"], 1469 "version_added": "2.2.0" 1470 }, 1471 "Radar": { 1472 "class_path": "pyod.models.pyg_radar.Radar", 1473 "full_name": "Residual Analysis for Anomaly Detection", 1474 "status": "shipped", 1475 "data_types": ["graph"], 1476 "category": "graph", 1477 "complexity": {"time": "O(max_iter * n^2 * d)", "space": "O(n^2)"}, 1478 "strengths": ["No neural network training", "Interpretable residuals", "Lightweight baseline"], 1479 "weaknesses": ["O(n^2) dense matrix operations", "Linear model may miss complex patterns"], 1480 "best_for": "Small-to-medium attributed graphs as a fast baseline", 1481 "avoid_when": "Graph is very large or anomalies are structural-only", 1482 "benchmark_refs": ["BOND"], 1483 "benchmark_rank": {}, 1484 "paper": {"id": "li2017radar", "short": "Li et al., IJCAI 2017"}, 1485 "default_params": {"alpha": 1.0, "gamma": 0.01, "max_iter": 100, "contamination": 0.1}, 1486 "preprocessing_mode": "external", 1487 "requires": ["torch_geometric"], 1488 "version_added": "2.2.0" 1489 }, 1490 "ANOMALOUS": { 1491 "class_path": "pyod.models.pyg_anomalous.ANOMALOUS", 1492 "full_name": "Joint Modeling Approach for Anomaly Detection", 1493 "status": "shipped", 1494 "data_types": ["graph"], 1495 "category": "graph", 1496 
"complexity": {"time": "O(max_iter * n^2 * d)", "space": "O(n^2)"}, 1497 "strengths": ["Laplacian regularization for smooth predictions", "No neural network training", "Extends Radar with graph structure"], 1498 "weaknesses": ["O(n^2) dense matrix operations", "Linear model"], 1499 "best_for": "Small-to-medium attributed graphs where smoothness matters", 1500 "avoid_when": "Graph is very large or anomalies are purely structural", 1501 "benchmark_refs": ["BOND"], 1502 "benchmark_rank": {}, 1503 "paper": {"id": "peng2018anomalous", "short": "Peng et al., IJCAI 2018"}, 1504 "default_params": {"alpha": 1.0, "gamma": 1.0, "lambda_r": 0.01, "max_iter": 100, "contamination": 0.1}, 1505 "preprocessing_mode": "external", 1506 "requires": ["torch_geometric"], 1507 "version_added": "2.2.0" 1508 }, 1509 "SCAN_Graph": { 1510 "class_path": "pyod.models.pyg_scan.SCAN", 1511 "full_name": "Structural Clustering Algorithm for Networks", 1512 "status": "shipped", 1513 "data_types": ["graph"], 1514 "category": "graph", 1515 "complexity": {"time": "O(m * d_avg)", "space": "O(n + m)"}, 1516 "strengths": ["Structure-only (no features needed)", "No training or hyperparameter tuning", "Fast and lightweight"], 1517 "weaknesses": ["Ignores node attributes", "Only detects structural anomalies"], 1518 "best_for": "Structure-only graphs or as a fast structural baseline", 1519 "avoid_when": "Node attributes are available and important for anomaly detection", 1520 "benchmark_refs": [], 1521 "benchmark_rank": {}, 1522 "paper": {"id": "xu2007scan", "short": "Xu et al., KDD 2007"}, 1523 "default_params": {"epsilon": 0.5, "mu": 2, "contamination": 0.1}, 1524 "preprocessing_mode": "external", 1525 "requires": ["torch_geometric"], 1526 "version_added": "2.2.0" 1527 } 1528 }