# embedding_od_example.py
# -*- coding: utf-8 -*-
"""Example of using EmbeddingOD for text anomaly detection.

EmbeddingOD chains a foundation model encoder with any PyOD detector,
enabling anomaly detection on text, image, and other non-tabular data.

This implements the two-step approach shown to outperform end-to-end
methods in NLP-ADBench (Li et al., EMNLP 2025).

Requirements:
    pip install pyod sentence-transformers
"""
# Author: Yue Zhao <yzhao062@gmail.com>
# License: BSD 2 clause

# All imports at the top of the file (PEP 8); numpy was previously
# imported mid-script, just before the custom-encoder demo.
import numpy as np

from pyod.models.embedding import EmbeddingOD

# Training data: normal samples (consistent topic)
train_texts = [
    "Quarterly revenue exceeded expectations by 12 percent",
    "The company announced a new product line for Q3",
    "Stock price remained stable after the earnings report",
    "Board of directors approved the annual dividend",
    "Operating costs decreased due to efficiency improvements",
    "Market analysts upgraded the company rating to buy",
    "New partnership expected to drive growth next quarter",
    "Employee headcount grew by 5 percent this year",
] * 20  # 160 training samples

# Test data: mix of normal and anomalous
test_texts = [
    "Annual report shows strong financial performance",  # normal
    "Cost reduction strategy yielded positive results",  # normal
    "The volcano erupted covering the island in ash",  # anomaly
    "Alien signals detected by deep space telescope",  # anomaly
    "Profit margins improved across all business units",  # normal
    "A rare species of deep-sea fish was discovered",  # anomaly
]

# ---- Method 1: Manual configuration ----
# Explicitly choose the sentence-transformer encoder and the PyOD
# detector; contamination is the expected anomaly fraction.
print("Method 1: Manual configuration")
clf = EmbeddingOD(encoder='all-MiniLM-L6-v2', detector='KNN',
                  contamination=0.1)
clf.fit(train_texts)

scores = clf.decision_function(test_texts)
labels = clf.predict(test_texts)
proba = clf.predict_proba(test_texts)

for i, text in enumerate(test_texts):
    print(f"  [{labels[i]}] score={scores[i]:.3f} "
          f"prob={proba[i, 1]:.3f} {text[:50]}")

# ---- Method 2: Use a preset ----
# `for_text` is a convenience constructor with quality/speed presets.
print("\nMethod 2: Preset (fast text)")
clf2 = EmbeddingOD.for_text(quality='fast')
clf2.fit(train_texts)

labels2 = clf2.predict(test_texts)
for i, text in enumerate(test_texts):
    tag = "ANOMALY" if labels2[i] == 1 else "normal "
    print(f"  {tag} {text[:50]}")

# ---- Method 3: Custom encoder function ----
# Any callable mapping a list of texts to a 2-D array works as encoder.
print("\nMethod 3: Custom encoder (random projection demo)")


def hash_encoder(texts):
    """Toy encoder: hash-based random projection.

    Each distinct word is assigned a fixed random vector (seeded RNG,
    so the mapping is deterministic); a text's embedding is the sum of
    its word vectors.

    Parameters
    ----------
    texts : list of str
        Input documents.

    Returns
    -------
    numpy.ndarray of shape (len(texts), 50)
        One embedding row per input text.
    """
    rng = np.random.RandomState(42)
    vocab = {}
    dim = 50
    result = np.zeros((len(texts), dim))
    for i, text in enumerate(texts):
        for word in text.lower().split():
            if word not in vocab:
                vocab[word] = rng.randn(dim)
            result[i] += vocab[word]
    return result


clf3 = EmbeddingOD(encoder=hash_encoder, detector='LOF')
clf3.fit(train_texts)
labels3 = clf3.predict(test_texts)
print(f"  Predictions: {labels3}")