/ pyod / test / test_skill_kb_consistency.py
test_skill_kb_consistency.py
  1  # -*- coding: utf-8 -*-
  2  """KB consistency tests for packaged pyod skill content.
  3  
  4  Enforces three invariants:
  5  
  6  1. Every backtick-wrapped token in hand-written skill prose that looks like
  7     a detector name must exist in `pyod.utils.knowledge.algorithms`. Author
  8     prose must use backtick convention for detector names.
  9  
 10  2. Every KB-DERIVED block in every skill .md file must have a matching
 11     BEGIN / END marker pair with the same section name. Malformed or
 12     unmatched markers fail loudly.
 13  
 14  3. Every KB-DERIVED block's body is byte-identical to what
 15     scripts/regen_skill.py would produce (re-run the generator to fix).
 16  """
 17  import importlib.util
 18  import re
 19  from pathlib import Path
 20  
 21  REPO_ROOT = Path(__file__).resolve().parents[2]
 22  SKILLS_DIR = REPO_ROOT / "pyod" / "skills"
 23  SCRIPT_PATH = REPO_ROOT / "scripts" / "regen_skill.py"
 24  
 25  # Allowlist of backtick-wrapped tokens that look like detector names but
 26  # are actually legitimate non-detector references (Python symbols, env
 27  # vars, CLI commands, etc.). Add to this list when a false positive
 28  # surfaces, with a one-line reason.
 29  #
 30  # IMPORTANT: do NOT add any token here that is also a live KB key. The
 31  # allowlist must not shadow real detector names — that would let a typo
 32  # in `IForset` slip through if `IForest` were also allowlisted. The
 33  # `test_allowlist_does_not_shadow_kb_keys` test enforces this invariant.
 34  _BACKTICK_ALLOWLIST = {
 35      # Python stdlib / pyod module names that look detector-like but
 36      # are orchestration objects, not KB-registered detectors.
 37      "ADEngine", "ADEngine.start", "ADEngine.plan", "ADEngine.run",
 38      "ADEngine.analyze", "ADEngine.iterate", "ADEngine.report",
 39      "ADEngine.investigate", "ADEngine.profile_data",
 40      "ADEngine.list_detectors", "ADEngine.explain_detector",
 41      "ADEngine.compare_detectors", "ADEngine.get_benchmarks",
 42      "BaseDetector", "MultiModalEncoder",
 43      # CLI commands and flags
 44      "pyod", "pyod-install-skill", "pip", "pyod[graph]", "pyod[mcp]",
 45      "pyod info", "pyod install skill", "pyod install skill --project",
 46      "pyod mcp serve", "python -m pyod.mcp_server",
 47      # Benchmark and paper names — valid references that aren't KB keys
 48      "ADBench", "TSB-AD", "BOND", "NLP-ADBench",
 49      # Common Python state / attribute refs
 50      "state", "state.profile", "state.plan", "state.scores", "state.labels",
 51      "state.quality", "state.next_action", "state.best_detector",
 52      "state.top_cases", "state.feature_importance", "state.consensus_scores",
 53      "decision_scores_", "decision_function", "predict", "predict_proba",
 54      "predict_confidence", "predict_with_rejection", "labels_", "fit",
 55      "fit_predict",
 56      # Sklearn utilities commonly mentioned in prose
 57      # (NOTE: PCA is a live KB detector, NOT allowlisted — see comment above.)
 58      "StandardScaler", "RobustScaler",
 59      # Test runner / stdlib references
 60      "True", "False", "None", "HOME", "USERPROFILE", "PYTHONPATH",
 61      # Filesystem paths often wrapped in backticks
 62      "~/.claude", "~/.codex", "~/.claude/skills",
 63      "./skills/", "./skills/od-expert/",
 64      # Skill file references (capital-S start, look detector-like to the heuristic)
 65      "SKILL.md",
 66      # Vendor / encoder backend names referenced in text/image docs
 67      "HuggingFace", "OpenAI", "OPENAI_API_KEY", "PIL.Image",
 68  }
 69  
 70  # Backtick-wrapped token pattern. Captures the inside of a single-backtick
 71  # span. Example: `IForest` → captures "IForest". We deliberately do NOT
 72  # match triple-backtick code fences (which are whole-line delimiters) —
 73  # the regex only matches inline spans.
 74  _BACKTICK_RE = re.compile(r"(?<!`)`([^`\n]+?)`(?!`)")
 75  
 76  # Exact valid marker forms. A line that mentions a KB marker MUST
 77  # fullmatch one of these after strip(); anything else is an error.
 78  _BEGIN_MARKER_RE = re.compile(r"<!-- BEGIN KB-DERIVED: ([a-z0-9_\-]+) -->")
 79  _END_MARKER_RE = re.compile(r"<!-- END KB-DERIVED: ([a-z0-9_\-]+) -->")
 80  
 81  # "Suspicious marker" detector — any line containing an HTML comment
 82  # with BEGIN/END + KB- is a candidate for validation, case-insensitive.
 83  # This deliberately matches typo'd sentinels like `KB-DEREIVED`,
 84  # `KB-DERIVD`, or `Kb-derived` so that the validator can flag them.
 85  # A literal substring check for "KB-DERIVED" would silently miss those
 86  # typos — that was the hole flagged in the Round 3 review.
 87  _SUSPICIOUS_MARKER_RE = re.compile(
 88      r"<!--\s*(BEGIN|END)\s+KB-",
 89      re.IGNORECASE,
 90  )
 91  
 92  # Paired marker regex (matches well-formed blocks only — same as the
 93  # generator's _BLOCK_RE). Used to strip KB-DERIVED bodies before
 94  # backtick scanning, and to identify the section body for regen check.
 95  _PAIRED_BLOCK_RE = re.compile(
 96      r"<!-- BEGIN KB-DERIVED: ([a-z0-9_\-]+) -->\n(.*?)<!-- END KB-DERIVED: \1 -->",
 97      re.DOTALL,
 98  )
 99  
100  
101  def _import_regen_skill():
102      """Import scripts/regen_skill.py as a module by file path."""
103      spec = importlib.util.spec_from_file_location("regen_skill", SCRIPT_PATH)
104      mod = importlib.util.module_from_spec(spec)
105      spec.loader.exec_module(mod)
106      return mod
107  
108  
109  def _load_kb():
110      """Load pyod.utils.knowledge.algorithms."""
111      from pyod.utils.ad_engine import ADEngine
112      return ADEngine().kb.algorithms
113  
114  
115  def _all_skill_files():
116      """Return sorted list of all *.md files under pyod/skills/."""
117      return sorted(SKILLS_DIR.rglob("*.md"))
118  
119  
120  def _strip_paired_kb_derived_blocks(text):
121      """Remove content inside well-formed KB-DERIVED blocks.
122  
123      KB-DERIVED blocks already came from the KB by definition; we only
124      want to validate hand-written prose. Malformed blocks are left in
125      place so the marker-pairing test can still flag them.
126      """
127      return _PAIRED_BLOCK_RE.sub("", text)
128  
129  
130  def test_skill_files_exist():
131      """At least the od-expert SKILL.md must exist."""
132      files = _all_skill_files()
133      paths = {f.relative_to(REPO_ROOT) for f in files}
134      assert any("od_expert" in str(p) and p.name == "SKILL.md" for p in paths), (
135          f"od_expert/SKILL.md not found among {sorted(paths)}"
136      )
137  
138  
139  def test_kb_derived_markers_are_well_formed():
140      """Every line mentioning KB-DERIVED must be a valid BEGIN or END marker,
141      and every BEGIN must match a later END with the same section name.
142  
143      This is a line-oriented parser that catches everything the regex-based
144      approach missed in Round 1:
145        - Typo'd markers that don't match either valid form (e.g.
146          ``<!-- BEGIN KB-DERIVED foo -->`` — missing colon, or
147          ``<!-- BEGIN KB-DEREIVED: foo -->`` — typo)
148        - END markers that appear before any BEGIN (depth would go negative)
149        - Unclosed BEGIN markers at end-of-file
150        - Nested BEGIN-inside-BEGIN
151        - Section name mismatch between BEGIN and its corresponding END
152        - Stray lines containing the literal "KB-DERIVED" that are not
153          actual markers (e.g. a prose reference that would otherwise be
154          invisible to the generator but visible to a human reader)
155  
156      The generator's regex-based replacement is only safe if every
157      KB-DERIVED marker in the file matches one of the two exact forms.
158      This test enforces that invariant at build time.
159      """
160      files = _all_skill_files()
161      problems = []
162      for path in files:
163          text = path.read_text(encoding="utf-8")
164          lines = text.splitlines()
165          open_name = None  # None = no block open; str = name of open block
166          for lineno, line in enumerate(lines, start=1):
167              # Only inspect lines that look like a KB-DERIVED marker
168              # (BEGIN/END + KB-, case-insensitive). This is broader than
169              # the literal "KB-DERIVED" substring check so typo'd
170              # sentinels like KB-DEREIVED still land in the validator.
171              if not _SUSPICIOUS_MARKER_RE.search(line):
172                  continue
173              begin_m = _BEGIN_MARKER_RE.fullmatch(line.strip())
174              end_m = _END_MARKER_RE.fullmatch(line.strip())
175              if begin_m is None and end_m is None:
176                  problems.append(
177                      f"{path.relative_to(REPO_ROOT)}:{lineno}: line looks "
178                      f"like a KB-DERIVED marker but does not match the exact "
179                      f"syntax (typo in sentinel or section name, extra text, "
180                      f"wrong spacing, etc.). Line: {line!r}"
181                  )
182                  continue
183              if begin_m is not None and end_m is not None:
184                  # Shouldn't happen given distinct regexes, but guard anyway.
185                  problems.append(
186                      f"{path.relative_to(REPO_ROOT)}:{lineno}: line matches "
187                      f"both BEGIN and END patterns. Line: {line!r}"
188                  )
189                  continue
190              if begin_m is not None:
191                  name = begin_m.group(1)
192                  if open_name is not None:
193                      problems.append(
194                          f"{path.relative_to(REPO_ROOT)}:{lineno}: nested "
195                          f"BEGIN {name!r} while {open_name!r} still open"
196                      )
197                      # Keep parsing; treat the new BEGIN as the open block.
198                  open_name = name
199              else:  # end_m is not None
200                  name = end_m.group(1)
201                  if open_name is None:
202                      problems.append(
203                          f"{path.relative_to(REPO_ROOT)}:{lineno}: END "
204                          f"{name!r} without matching BEGIN"
205                      )
206                  elif open_name != name:
207                      problems.append(
208                          f"{path.relative_to(REPO_ROOT)}:{lineno}: END "
209                          f"{name!r} does not match open BEGIN {open_name!r}"
210                      )
211                      open_name = None
212                  else:
213                      open_name = None  # clean close
214          if open_name is not None:
215              problems.append(
216                  f"{path.relative_to(REPO_ROOT)}: unclosed BEGIN {open_name!r} "
217                  f"at end of file"
218              )
219      assert not problems, (
220          "Malformed KB-DERIVED markers in skill files:\n  "
221          + "\n  ".join(problems)
222          + "\n\nFix the marker pairing manually. Every KB-DERIVED line must "
223          + "match one of:\n"
224          + "  <!-- BEGIN KB-DERIVED: section-name -->\n"
225          + "  <!-- END KB-DERIVED: section-name -->\n"
226          + "with matching section names and no nesting."
227      )
228  
229  
230  def test_kb_derived_sections_are_up_to_date():
231      """Every KB-DERIVED block in every skill file must match the live KB.
232  
233      If this test fails, run `python scripts/regen_skill.py` and commit
234      the result.
235      """
236      regen = _import_regen_skill()
237      files = _all_skill_files()
238      rc = regen.check_files(files)
239      assert rc == 0, (
240          "KB-DERIVED sections in skill files are stale. "
241          "Run `python scripts/regen_skill.py` and commit the result."
242      )
243  
244  
245  def _extract_backtick_tokens(text):
246      """Return the set of inline backtick-wrapped tokens in `text`.
247  
248      Only matches single-backtick spans (inline code), not triple-backtick
249      fences. Splits on spaces and commas to handle things like
250      ``pyod install skill`` (multi-word) — those count as one token, not
251      three.
252      """
253      matches = _BACKTICK_RE.findall(text)
254      return set(matches)
255  
256  
257  def test_backtick_detector_names_exist_in_kb():
258      """Every backtick-wrapped detector-like token in prose must exist in the KB.
259  
260      The design enforces a convention: detector names in hand-written
261      skill prose must be wrapped in backticks (inline code style). For
262      example:
263  
264          Use `IForest` for large tabular data.   # validated
265          Use Isolation Forest for large data.    # NOT validated (free prose)
266  
267      This lets maintainers write expository prose freely while keeping
268      the canonical detector references type-checked against the KB.
269  
270      Failure modes this catches:
271        - Typo (`IForset` instead of `IForest`)
272        - Renamed detector (`Deep_SVDD` after KB rename to `DeepSVDD`)
273        - Stale reference to a removed detector
274  
275      Allowlist: tokens that are legitimate backtick references but not
276      KB keys (Python symbols, CLI commands, file paths, benchmark names)
277      live in `_BACKTICK_ALLOWLIST` at the top of this file. Add to that
278      list when a false positive surfaces, with a one-line reason.
279      """
280      kb = _load_kb()
281      known = set(kb.keys())
282      files = _all_skill_files()
283      unknown_per_file = {}
284      for path in files:
285          text = path.read_text(encoding="utf-8")
286          text = _strip_paired_kb_derived_blocks(text)
287          tokens = _extract_backtick_tokens(text)
288          # Heuristic: a token is "detector-like" if it starts with a
289          # capital letter and contains no spaces beyond method-call dots.
290          detector_like = {
291              t for t in tokens
292              if t and t[0].isupper() and " " not in t
293          }
294          unknown = detector_like - known - _BACKTICK_ALLOWLIST
295          if unknown:
296              unknown_per_file[path.relative_to(REPO_ROOT)] = sorted(unknown)
297      assert not unknown_per_file, (
298          "Unknown detector-like backtick tokens in skill hand-written content:\n"
299          + "\n".join(
300              f"  {path}: {names}"
301              for path, names in unknown_per_file.items()
302          )
303          + "\n\nEither:\n"
304          + "  1. Fix the skill — this detector is not in the KB.\n"
305          + "  2. Use free prose instead (remove the backticks).\n"
306          + "  3. Add the token to _BACKTICK_ALLOWLIST with a one-line reason."
307      )
308  
309  
310  def test_kb_loads_cleanly():
311      """Sanity check: the KB must load without errors."""
312      kb = _load_kb()
313      assert kb, "KB is empty"
314      assert isinstance(kb, dict)
315      # Spot-check that the KB has the expected schema
316      sample_name, sample_algo = next(iter(kb.items()))
317      assert "data_types" in sample_algo, (
318          f"KB algorithm {sample_name} missing 'data_types' field"
319      )
320  
321  
322  def test_allowlist_does_not_shadow_kb_keys():
323      """The backtick allowlist must not contain any live KB detector name.
324  
325      Allowing a real detector name here defeats the safety net's purpose:
326      if `IForest` is in the allowlist and the maintainer typos it as
327      `IForset`, the typo would fail the KB lookup but the allowlist
328      fallback would let it through silently. This test enforces that the
329      allowlist and the live KB are disjoint sets, so every detector name
330      in skill prose is validated against the KB on every CI run.
331      """
332      kb = _load_kb()
333      overlap = _BACKTICK_ALLOWLIST & set(kb.keys())
334      assert not overlap, (
335          f"_BACKTICK_ALLOWLIST shadows live KB detector names: {sorted(overlap)}.\n"
336          f"Remove these entries — they should be validated against the KB, "
337          f"not allowlisted. Allowlisting them would let typos slip past the "
338          f"safety net."
339      )