# embedding_od_example.py
# -*- coding: utf-8 -*-
"""Example of using EmbeddingOD for text anomaly detection.

EmbeddingOD chains a foundation model encoder with any PyOD detector,
enabling anomaly detection on text, image, and other non-tabular data.

This implements the two-step approach shown to outperform end-to-end
methods in NLP-ADBench (Li et al., EMNLP 2025).

Requirements:
    pip install pyod sentence-transformers
"""
# Author: Yue Zhao <yzhao062@gmail.com>
# License: BSD 2 clause

# All imports at the top of the file (PEP 8); numpy was previously
# imported mid-script, just before the custom-encoder demo.
import numpy as np

from pyod.models.embedding import EmbeddingOD

# Training data: normal samples (consistent topic)
train_texts = [
    "Quarterly revenue exceeded expectations by 12 percent",
    "The company announced a new product line for Q3",
    "Stock price remained stable after the earnings report",
    "Board of directors approved the annual dividend",
    "Operating costs decreased due to efficiency improvements",
    "Market analysts upgraded the company rating to buy",
    "New partnership expected to drive growth next quarter",
    "Employee headcount grew by 5 percent this year",
] * 20  # 160 training samples

# Test data: mix of normal and anomalous
test_texts = [
    "Annual report shows strong financial performance",  # normal
    "Cost reduction strategy yielded positive results",  # normal
    "The volcano erupted covering the island in ash",  # anomaly
    "Alien signals detected by deep space telescope",  # anomaly
    "Profit margins improved across all business units",  # normal
    "A rare species of deep-sea fish was discovered",  # anomaly
]

# ---- Method 1: Manual configuration ----
# Explicitly choose the sentence-transformer encoder and the PyOD
# detector; contamination is the expected anomaly fraction.
print("Method 1: Manual configuration")
clf = EmbeddingOD(encoder='all-MiniLM-L6-v2', detector='KNN',
                  contamination=0.1)
clf.fit(train_texts)

scores = clf.decision_function(test_texts)
labels = clf.predict(test_texts)
proba = clf.predict_proba(test_texts)

for i, text in enumerate(test_texts):
    print(f"  [{labels[i]}] score={scores[i]:.3f} "
          f"prob={proba[i, 1]:.3f} {text[:50]}")

# ---- Method 2: Use a preset ----
# `for_text` is a convenience constructor with quality/speed presets.
print("\nMethod 2: Preset (fast text)")
clf2 = EmbeddingOD.for_text(quality='fast')
clf2.fit(train_texts)

labels2 = clf2.predict(test_texts)
for i, text in enumerate(test_texts):
    tag = "ANOMALY" if labels2[i] == 1 else "normal "
    print(f"  {tag} {text[:50]}")

# ---- Method 3: Custom encoder function ----
# Any callable mapping a list of texts to a 2-D array works as encoder.
print("\nMethod 3: Custom encoder (random projection demo)")


def hash_encoder(texts):
    """Toy encoder: hash-based random projection.

    Each distinct word is assigned a fixed random vector (seeded RNG,
    so the mapping is deterministic); a text's embedding is the sum of
    its word vectors.

    Parameters
    ----------
    texts : list of str
        Input documents.

    Returns
    -------
    numpy.ndarray of shape (len(texts), 50)
        One embedding row per input text.
    """
    rng = np.random.RandomState(42)
    vocab = {}
    dim = 50
    result = np.zeros((len(texts), dim))
    for i, text in enumerate(texts):
        for word in text.lower().split():
            if word not in vocab:
                vocab[word] = rng.randn(dim)
            result[i] += vocab[word]
    return result


clf3 = EmbeddingOD(encoder=hash_encoder, detector='LOF')
clf3.fit(train_texts)
labels3 = clf3.predict(test_texts)
print(f"  Predictions: {labels3}")