meet_bot.py
1 """Headless Google Meet bot — Playwright + live-caption scraping. 2 3 Runs as a standalone subprocess spawned by ``process_manager.py``. Reads config 4 from env vars, writes status + transcript to files under 5 ``$HERMES_HOME/workspace/meetings/<meeting-id>/``. The main hermes process 6 reads those files via the ``meet_*`` tools — no IPC beyond filesystem. 7 8 The scraping strategy mirrors OpenUtter (sumansid/openutter): we don't parse 9 WebRTC audio, we enable Google Meet's built-in live captions and observe the 10 captions container in the DOM via a MutationObserver. This is lossy and 11 English-biased but it is: 12 13 * deterministic (no API keys, no STT billing), 14 * works behind Meet's normal login / admission, 15 * survives Meet UI rewrites fairly well because the caption container has a 16 stable ARIA role. 17 18 Run standalone for debugging:: 19 20 HERMES_MEET_URL=https://meet.google.com/abc-defg-hij \\ 21 HERMES_MEET_OUT_DIR=/tmp/meet-debug \\ 22 HERMES_MEET_HEADED=1 \\ 23 python -m plugins.google_meet.meet_bot 24 25 No meet.google.com URL → exits non-zero. Any URL that doesn't start with 26 ``https://meet.google.com/`` is rejected (explicit-by-design). 27 """ 28 29 from __future__ import annotations 30 31 import json 32 import os 33 import re 34 import signal 35 import sys 36 import threading 37 import time 38 from pathlib import Path 39 from typing import Optional 40 41 # Match ``https://meet.google.com/abc-defg-hij`` or ``.../lookup/...`` — the 42 # short three-segment code or a lookup URL. Anything else is rejected. 43 MEET_URL_RE = re.compile( 44 r"^https://meet\.google\.com/(" 45 r"[a-z0-9]{3,}-[a-z0-9]{3,}-[a-z0-9]{3,}" 46 r"|lookup/[^/?#]+" 47 r"|new" 48 r")(?:[/?#].*)?$" 49 ) 50 51 52 # Filenames the bot reads/writes in ``HERMES_MEET_OUT_DIR``. 
53 SAY_QUEUE_FILENAME = "say_queue.jsonl" 54 SAY_PCM_FILENAME = "speaker.pcm" 55 56 57 def _is_safe_meet_url(url: str) -> bool: 58 """Return True if *url* is a Google Meet URL we're willing to navigate to.""" 59 if not isinstance(url, str): 60 return False 61 return bool(MEET_URL_RE.match(url.strip())) 62 63 64 def _meeting_id_from_url(url: str) -> str: 65 """Extract the 3-segment meeting code from a Meet URL. 66 67 For ``https://meet.google.com/abc-defg-hij`` → ``abc-defg-hij``. 68 For ``.../lookup/<id>`` or ``/new`` we fall back to a timestamped id — the 69 bot won't know the real code until after redirect, and callers pass this 70 through to filename anyway. 71 """ 72 m = re.search( 73 r"meet\.google\.com/([a-z0-9]{3,}-[a-z0-9]{3,}-[a-z0-9]{3,})", 74 url or "", 75 ) 76 if m: 77 return m.group(1) 78 return f"meet-{int(time.time())}" 79 80 81 # --------------------------------------------------------------------------- 82 # Status + transcript file writers 83 # --------------------------------------------------------------------------- 84 85 class _BotState: 86 """Single-process mutable state, flushed to ``status.json`` on each change.""" 87 88 def __init__(self, out_dir: Path, meeting_id: str, url: str): 89 self.out_dir = out_dir 90 self.meeting_id = meeting_id 91 self.url = url 92 self.in_call = False 93 self.captioning = False 94 self.captions_enabled_attempted = False 95 self.lobby_waiting = False 96 self.join_attempted_at: Optional[float] = None 97 self.joined_at: Optional[float] = None 98 self.last_caption_at: Optional[float] = None 99 self.transcript_lines = 0 100 self.error: Optional[str] = None 101 self.exited = False 102 # v2 realtime fields. 
103 self.realtime = False 104 self.realtime_ready = False 105 self.realtime_device: Optional[str] = None 106 self.audio_bytes_out: int = 0 107 self.last_audio_out_at: Optional[float] = None 108 self.last_barge_in_at: Optional[float] = None 109 self.leave_reason: Optional[str] = None 110 # Scraped captions, in order, deduped. Each entry is a dict of 111 # {"ts": <epoch>, "speaker": str, "text": str}. 112 self._seen: set = set() 113 out_dir.mkdir(parents=True, exist_ok=True) 114 self.transcript_path = out_dir / "transcript.txt" 115 self.status_path = out_dir / "status.json" 116 self._flush() 117 118 # -------- transcript ------------------------------------------------ 119 120 def record_caption(self, speaker: str, text: str) -> None: 121 """Append a caption line if we haven't seen this exact (speaker, text).""" 122 speaker = (speaker or "").strip() or "Unknown" 123 text = (text or "").strip() 124 if not text: 125 return 126 key = f"{speaker}|{text}" 127 if key in self._seen: 128 return 129 self._seen.add(key) 130 self.transcript_lines += 1 131 self.last_caption_at = time.time() 132 ts = time.strftime("%H:%M:%S", time.localtime(self.last_caption_at)) 133 line = f"[{ts}] {speaker}: {text}\n" 134 # Atomic-ish append — good enough for a single-writer. 
135 with self.transcript_path.open("a", encoding="utf-8") as f: 136 f.write(line) 137 self._flush() 138 139 # -------- status file ---------------------------------------------- 140 141 def _flush(self) -> None: 142 data = { 143 "meetingId": self.meeting_id, 144 "url": self.url, 145 "inCall": self.in_call, 146 "captioning": self.captioning, 147 "captionsEnabledAttempted": self.captions_enabled_attempted, 148 "lobbyWaiting": self.lobby_waiting, 149 "joinAttemptedAt": self.join_attempted_at, 150 "joinedAt": self.joined_at, 151 "lastCaptionAt": self.last_caption_at, 152 "transcriptLines": self.transcript_lines, 153 "transcriptPath": str(self.transcript_path), 154 "error": self.error, 155 "exited": self.exited, 156 "pid": os.getpid(), 157 # v2 realtime telemetry. 158 "realtime": self.realtime, 159 "realtimeReady": self.realtime_ready, 160 "realtimeDevice": self.realtime_device, 161 "audioBytesOut": self.audio_bytes_out, 162 "lastAudioOutAt": self.last_audio_out_at, 163 "lastBargeInAt": self.last_barge_in_at, 164 "leaveReason": self.leave_reason, 165 } 166 tmp = self.status_path.with_suffix(".json.tmp") 167 tmp.write_text(json.dumps(data, indent=2), encoding="utf-8") 168 tmp.replace(self.status_path) 169 170 def set(self, **kwargs) -> None: 171 for k, v in kwargs.items(): 172 setattr(self, k, v) 173 self._flush() 174 175 176 # --------------------------------------------------------------------------- 177 # Playwright bot entry point 178 # --------------------------------------------------------------------------- 179 180 # JavaScript injected into the Meet tab to observe captions. Captures 181 # {speaker, text} tuples via a MutationObserver on the caption container, 182 # and exposes ``window.__hermesMeetDrain()`` to pull new entries. This 183 # mirrors the OpenUtter caption scraping approach. 
184 _CAPTION_OBSERVER_JS = r""" 185 (() => { 186 if (window.__hermesMeetInstalled) return; 187 window.__hermesMeetInstalled = true; 188 window.__hermesMeetQueue = []; 189 190 const captionSelector = '[role="region"][aria-label*="aption" i], ' + 191 'div[jsname="YSxPC"], ' + // legacy 192 'div[jsname="tgaKEf"]'; // current (Apr 2026) 193 194 function pushEntry(speaker, text) { 195 if (!text || !text.trim()) return; 196 window.__hermesMeetQueue.push({ 197 ts: Date.now(), 198 speaker: (speaker || '').trim(), 199 text: text.trim(), 200 }); 201 } 202 203 function scan(root) { 204 // Meet captions render as a list of rows; each row contains a speaker 205 // label and a text block. Selectors vary across Meet rewrites; we try 206 // a few shapes and fall back to raw text. 207 const rows = root.querySelectorAll('div[jsname="dsyhDe"], div.CNusmb, div.TBMuR'); 208 if (rows.length) { 209 rows.forEach((row) => { 210 const spkEl = row.querySelector('div.KcIKyf, div.zs7s8d, span[jsname="YSxPC"]'); 211 const txtEl = row.querySelector('div.bh44bd, span[jsname="tgaKEf"], div.iTTPOb'); 212 const speaker = spkEl ? spkEl.innerText : ''; 213 const text = txtEl ? txtEl.innerText : row.innerText; 214 pushEntry(speaker, text); 215 }); 216 return; 217 } 218 // Fallback: treat the whole region's innerText as one anonymous line. 219 const text = (root.innerText || '').split('\n').filter(Boolean).pop(); 220 pushEntry('', text); 221 } 222 223 function attach() { 224 const el = document.querySelector(captionSelector); 225 if (!el) return false; 226 const obs = new MutationObserver(() => scan(el)); 227 obs.observe(el, { childList: true, subtree: true, characterData: true }); 228 scan(el); 229 return true; 230 } 231 232 // Try now and retry on interval — the caption region only appears after 233 // captions are enabled and someone speaks. 
234 if (!attach()) { 235 const iv = setInterval(() => { if (attach()) clearInterval(iv); }, 1500); 236 } 237 238 window.__hermesMeetDrain = () => { 239 const out = window.__hermesMeetQueue.slice(); 240 window.__hermesMeetQueue = []; 241 return out; 242 }; 243 })(); 244 """ 245 246 247 def _enable_captions_js() -> str: 248 """Return a small JS snippet that tries to click the 'Turn on captions' button. 249 250 Best-effort — Meet's caption toggle is keyboard-accessible via ``c``. We 251 dispatch that keystroke as a cheap fallback. Real click targeting is too 252 brittle to rely on. 253 """ 254 return r""" 255 (() => { 256 const ev = new KeyboardEvent('keydown', { 257 key: 'c', code: 'KeyC', keyCode: 67, which: 67, bubbles: true, 258 }); 259 document.body.dispatchEvent(ev); 260 return true; 261 })(); 262 """ 263 264 265 def _start_realtime_speaker( 266 *, 267 rt: dict, 268 out_dir: Path, 269 bridge_info: dict, 270 api_key: str, 271 model: str, 272 voice: str, 273 instructions: str, 274 stop_flag: dict, 275 state: "_BotState", 276 ) -> None: 277 """Wire up the OpenAI Realtime session + speaker thread + PCM pump. 278 279 The speaker thread reads text lines from ``say_queue.jsonl``, sends each 280 to OpenAI Realtime, and writes PCM audio into ``speaker.pcm``. A 281 separate *pump* thread forwards that PCM into the OS audio sink so 282 Chrome's fake mic picks it up. On Linux we pipe to ``paplay`` against 283 the null-sink; on macOS the caller is expected to have the BlackHole 284 device selected as default input. 285 """ 286 try: 287 from plugins.google_meet.realtime.openai_client import ( 288 RealtimeSession, 289 RealtimeSpeaker, 290 ) 291 except Exception as e: 292 state.set(error=f"realtime import failed: {e}") 293 return 294 295 pcm_path = out_dir / SAY_PCM_FILENAME 296 queue_path = out_dir / SAY_QUEUE_FILENAME 297 processed_path = out_dir / "say_processed.jsonl" 298 # Reset the sink file so we start clean each session. 
299 pcm_path.write_bytes(b"") 300 # Make sure the queue exists so the speaker poller doesn't error on 301 # first iteration. 302 queue_path.touch() 303 304 try: 305 session = RealtimeSession( 306 api_key=api_key, 307 model=model, 308 voice=voice, 309 instructions=instructions, 310 audio_sink_path=pcm_path, 311 sample_rate=24000, 312 ) 313 session.connect() 314 except Exception as e: 315 state.set(error=f"realtime connect failed: {e}") 316 return 317 318 rt["session"] = session 319 320 def _stop_fn(): 321 return stop_flag.get("stop", False) 322 323 rt["speaker_stop"] = lambda: stop_flag.__setitem__("stop", stop_flag.get("stop", False)) 324 325 speaker = RealtimeSpeaker( 326 session=session, 327 queue_path=queue_path, 328 processed_path=processed_path, 329 ) 330 331 def _speaker_loop(): 332 try: 333 speaker.run_until_stopped(_stop_fn) 334 except Exception as e: 335 state.set(error=f"realtime speaker crashed: {e}") 336 337 t_speaker = threading.Thread(target=_speaker_loop, name="meet-speaker", daemon=True) 338 t_speaker.start() 339 rt["speaker_thread"] = t_speaker 340 341 # PCM pump: feeds speaker.pcm (24kHz s16le mono) into the OS audio 342 # device that Chrome's fake mic reads from. Different tools per 343 # platform, but the contract is the same — block-read the growing 344 # PCM file and stream it to the device in near-real-time. 
345 platform_tag = (bridge_info or {}).get("platform") 346 if platform_tag == "linux": 347 import subprocess as _sp 348 349 sink = (bridge_info or {}).get("write_target") or "hermes_meet_sink" 350 try: 351 proc = _sp.Popen( 352 [ 353 "paplay", 354 "--raw", 355 "--rate=24000", 356 "--format=s16le", 357 "--channels=1", 358 f"--device={sink}", 359 str(pcm_path), 360 ], 361 stdin=_sp.DEVNULL, 362 stdout=_sp.DEVNULL, 363 stderr=_sp.DEVNULL, 364 ) 365 rt["pcm_pump"] = proc 366 except FileNotFoundError: 367 state.set(error="paplay not found — install pulseaudio-utils for realtime on Linux") 368 elif platform_tag == "darwin": 369 # macOS: use ffmpeg to tail-read speaker.pcm and write it to the 370 # BlackHole output device. The user must have BlackHole selected 371 # as the default input in System Settings → Sound for Chrome to 372 # pick it up. We prefer ffmpeg because it's scriptable and can 373 # target AVFoundation devices by name; fall back to afplay-ing 374 # the file in a tight loop if ffmpeg is absent. 375 import shutil as _shutil 376 import subprocess as _sp 377 378 device_name = (bridge_info or {}).get("write_target") or "BlackHole 2ch" 379 if _shutil.which("ffmpeg"): 380 try: 381 # -re: read input at native frame rate. 382 # -f avfoundation -i: speaker path as raw PCM. 383 # -f s16le -ar 24000 -ac 1 -i <pcm>: interpret the file. 384 # -f audiotoolbox -audio_device_index: write to BlackHole. 385 # Simpler: output as raw via coreaudio using "-f audiotoolbox". 386 # ffmpeg's audiotoolbox output picks the current default 387 # output device, which isn't what we want. Instead we use 388 # -f avfoundation with the named device as OUTPUT via 389 # -vn and the device name. 
390 proc = _sp.Popen( 391 [ 392 "ffmpeg", 393 "-nostdin", "-hide_banner", "-loglevel", "error", 394 "-re", 395 "-f", "s16le", "-ar", "24000", "-ac", "1", 396 "-i", str(pcm_path), 397 "-f", "audiotoolbox", 398 "-audio_device_index", _mac_audio_device_index(device_name), 399 "-", 400 ], 401 stdin=_sp.DEVNULL, 402 stdout=_sp.DEVNULL, 403 stderr=_sp.DEVNULL, 404 ) 405 rt["pcm_pump"] = proc 406 except FileNotFoundError: 407 state.set(error="ffmpeg not found — install via `brew install ffmpeg` for realtime on macOS") 408 except Exception as e: 409 state.set(error=f"macOS pcm pump failed to start: {e}") 410 else: 411 state.set(error="ffmpeg not found — install via `brew install ffmpeg` for realtime on macOS") 412 413 414 def _mac_audio_device_index(device_name: str) -> str: 415 """Return the ffmpeg ``-audio_device_index`` for *device_name*, as a string. 416 417 Probes ``ffmpeg -f avfoundation -list_devices true -i ''`` (which prints 418 the device table on stderr) and matches *device_name* case-insensitively. 419 Defaults to ``"0"`` if the device can't be found — caller will get a 420 misrouted stream but not a crash, and the error will be obvious. 421 """ 422 import subprocess as _sp 423 424 try: 425 out = _sp.run( 426 ["ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", ""], 427 capture_output=True, 428 text=True, 429 timeout=10, 430 ) 431 except Exception: 432 return "0" 433 # ffmpeg prints the table on stderr. Lines look like: 434 # [AVFoundation indev @ 0x...] 
def run_bot() -> int:  # noqa: C901 — orchestration, explicit branches
    """Join the Meet call configured via env vars; return a process exit code.

    Exit codes: 0 clean run, 1 unhandled exception, 2 bad/missing env config,
    3 playwright not installed, 4 navigation failed.
    """
    url = os.environ.get("HERMES_MEET_URL", "").strip()
    out_dir_env = os.environ.get("HERMES_MEET_OUT_DIR", "").strip()
    headed = os.environ.get("HERMES_MEET_HEADED", "").lower() in ("1", "true", "yes")
    auth_state = os.environ.get("HERMES_MEET_AUTH_STATE", "").strip()
    guest_name = os.environ.get("HERMES_MEET_GUEST_NAME", "Hermes Agent")
    duration_s = _parse_duration(os.environ.get("HERMES_MEET_DURATION", ""))
    # v2: optional realtime mode. Enabled when HERMES_MEET_MODE=realtime.
    mode = os.environ.get("HERMES_MEET_MODE", "transcribe").strip().lower()
    realtime_model = os.environ.get("HERMES_MEET_REALTIME_MODEL", "gpt-realtime")
    realtime_voice = os.environ.get("HERMES_MEET_REALTIME_VOICE", "alloy")
    realtime_instructions = os.environ.get("HERMES_MEET_REALTIME_INSTRUCTIONS", "")
    realtime_api_key = os.environ.get("HERMES_MEET_REALTIME_KEY") or os.environ.get("OPENAI_API_KEY", "")

    if not url or not _is_safe_meet_url(url):
        sys.stderr.write(
            "google_meet bot: refusing to launch — HERMES_MEET_URL must be a "
            "meet.google.com URL. got: %r\n" % url
        )
        return 2
    if not out_dir_env:
        sys.stderr.write("google_meet bot: HERMES_MEET_OUT_DIR is required\n")
        return 2

    out_dir = Path(out_dir_env)
    meeting_id = _meeting_id_from_url(url)
    state = _BotState(out_dir=out_dir, meeting_id=meeting_id, url=url)

    # SIGTERM → exit cleanly so the parent ``meet_leave`` gets a finalized
    # transcript. We set a flag instead of raising so the Playwright context
    # teardown runs in the finally block below.
    # NOTE(review): there is no actual ``finally`` — teardown runs on the
    # normal path only. The early ``return 4`` (navigate failure) and the
    # outer ``except`` path skip browser/context close and realtime/bridge
    # teardown. Confirm whether that leak is acceptable or add try/finally.
    stop_flag = {"stop": False}

    def _on_signal(_sig, _frame):
        stop_flag["stop"] = True

    signal.signal(signal.SIGTERM, _on_signal)
    signal.signal(signal.SIGINT, _on_signal)

    # v2 realtime: provision virtual audio device + start speaker thread.
    # We track these in a dict so the teardown code can tear them down
    # regardless of how we exit. If anything in the realtime setup fails we
    # fall back to transcribe mode with a status flag.
    rt = {
        "enabled": mode == "realtime",
        "bridge": None,         # AudioBridge | None
        "bridge_info": None,    # dict | None
        "session": None,        # RealtimeSession | None
        "speaker_thread": None, # threading.Thread | None
        "speaker_stop": None,   # callable | None
    }
    if rt["enabled"]:
        if not realtime_api_key:
            state.set(error="realtime mode requested but no API key in HERMES_MEET_REALTIME_KEY/OPENAI_API_KEY — falling back to transcribe")
            rt["enabled"] = False
        else:
            try:
                from plugins.google_meet.audio_bridge import AudioBridge
                bridge = AudioBridge()
                rt["bridge_info"] = bridge.setup()
                rt["bridge"] = bridge
                state.set(realtime=True, realtime_device=rt["bridge_info"].get("device_name"))
            except Exception as e:
                state.set(error=f"audio bridge setup failed: {e} — falling back to transcribe")
                rt["enabled"] = False

    try:
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        state.set(error=f"playwright not installed: {e}", exited=True)
        sys.stderr.write(
            "google_meet bot: playwright is not installed. Run "
            "`pip install playwright && python -m playwright install chromium`\n"
        )
        if rt["bridge"]:
            rt["bridge"].teardown()
        return 3

    # Chrome env: if realtime is live on Linux, point PULSE_SOURCE at the
    # virtual source so Chrome's fake mic reads the audio we generate.
    chrome_env = os.environ.copy()
    chrome_args = [
        "--use-fake-ui-for-media-stream",
        "--disable-blink-features=AutomationControlled",
    ]
    if not rt["enabled"]:
        # v1-style fake device (silence) — we don't care about mic content
        # when we're not speaking.
        chrome_args.insert(1, "--use-fake-device-for-media-stream")
    elif rt["bridge_info"] and rt["bridge_info"].get("platform") == "linux":
        chrome_env["PULSE_SOURCE"] = rt["bridge_info"].get("device_name", "")

    try:
        with sync_playwright() as pw:
            # Playwright's launch() doesn't take env; we set PULSE_SOURCE
            # via the process env before launch so the child Chrome inherits it.
            # NOTE(review): this mutates this process's own os.environ for the
            # rest of its lifetime, not just the child's — intentional?
            for k, v in chrome_env.items():
                os.environ[k] = v
            browser = pw.chromium.launch(
                headless=not headed,
                args=chrome_args,
            )
            context_args = {
                "viewport": {"width": 1280, "height": 800},
                "user_agent": (
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
                ),
                "permissions": ["microphone", "camera"],
            }
            if auth_state and Path(auth_state).is_file():
                context_args["storage_state"] = auth_state
            context = browser.new_context(**context_args)
            page = context.new_page()

            try:
                page.goto(url, wait_until="domcontentloaded", timeout=30_000)
            except Exception as e:
                state.set(error=f"navigate failed: {e}", exited=True)
                return 4

            # Guest-mode: Meet shows a name field before "Ask to join". When
            # we're authed, we instead see "Join now".
            _try_guest_name(page, guest_name)
            _click_join(page, state)

            # Install caption observer and attempt to enable captions.
            try:
                page.evaluate(_enable_captions_js())
                state.set(captions_enabled_attempted=True)
            except Exception:
                pass
            try:
                page.evaluate(_CAPTION_OBSERVER_JS)
            except Exception as e:
                state.set(error=f"caption observer install failed: {e}")

            # Note: in_call=False until admission is confirmed (we detect
            # either the Leave button or the caption region, signalling we
            # made it past the lobby).
            state.set(captioning=True, join_attempted_at=time.time())

            # v2 realtime: start the speaker thread reading from the
            # plugin-side say queue. The thread reads JSONL lines written by
            # meet_say, calls OpenAI Realtime, and streams the audio PCM to
            # the virtual sink that Chrome's fake-mic is pointed at.
            if rt["enabled"]:
                _start_realtime_speaker(
                    rt=rt,
                    out_dir=out_dir,
                    bridge_info=rt["bridge_info"],
                    api_key=realtime_api_key,
                    model=realtime_model,
                    voice=realtime_voice,
                    instructions=realtime_instructions,
                    stop_flag=stop_flag,
                    state=state,
                )
                if rt["session"] is not None:
                    state.set(realtime_ready=True)

            # Admission + drain loop. Runs until SIGTERM, duration expiry,
            # or the page detects "You were removed / you left the
            # meeting". Responsible for:
            #   * detecting admission (Leave button visible → in_call=True)
            #   * timing out stuck-in-lobby (default 5 minutes)
            #   * draining scraped captions into the transcript
            #   * triggering realtime barge-in when a human speaks while
            #     the bot is generating audio
            #   * periodically flushing realtime counters into status.json
            deadline = (time.time() + duration_s) if duration_s else None
            lobby_deadline = time.time() + float(
                os.environ.get("HERMES_MEET_LOBBY_TIMEOUT", "300")
            )
            last_admission_check = 0.0
            while not stop_flag["stop"]:
                now = time.time()
                if deadline and now > deadline:
                    state.set(leave_reason="duration_expired")
                    break

                # Admission detection every ~3s until admitted.
                if not state.in_call and (now - last_admission_check) > 3.0:
                    last_admission_check = now
                    admitted = _detect_admission(page)
                    if admitted:
                        state.set(
                            in_call=True,
                            lobby_waiting=False,
                            joined_at=now,
                        )
                    elif now > lobby_deadline:
                        state.set(
                            error=(
                                "lobby timeout — host never admitted the bot "
                                f"within {int(lobby_deadline - state.join_attempted_at) if state.join_attempted_at else 0}s"
                            ),
                            leave_reason="lobby_timeout",
                        )
                        break
                    elif _detect_denied(page):
                        state.set(
                            error="host denied admission",
                            leave_reason="denied",
                        )
                        break

                try:
                    queued = page.evaluate("window.__hermesMeetDrain && window.__hermesMeetDrain()")
                    if isinstance(queued, list):
                        for entry in queued:
                            if not isinstance(entry, dict):
                                continue
                            speaker = str(entry.get("speaker", ""))
                            text = str(entry.get("text", ""))
                            state.record_caption(speaker=speaker, text=text)
                            # Barge-in: if the bot is currently generating
                            # audio AND a real human just spoke, cancel the
                            # in-flight response so we don't talk over them.
                            if rt["enabled"] and rt["session"] is not None:
                                if _looks_like_human_speaker(speaker, guest_name):
                                    try:
                                        cancelled = rt["session"].cancel_response()
                                        if cancelled:
                                            state.set(last_barge_in_at=now)
                                    except Exception:
                                        pass
                except Exception:
                    # Meet reloaded or we got booted — try to detect and
                    # exit gracefully rather than spinning.
                    if page.is_closed():
                        state.set(leave_reason="page_closed")
                        break

                # Fold the realtime session's byte/timestamp counters into
                # the status file so meet_status can surface them.
                if rt["session"] is not None:
                    state.set(
                        audio_bytes_out=getattr(rt["session"], "audio_bytes_out", 0),
                        last_audio_out_at=getattr(rt["session"], "last_audio_out_at", None),
                    )

                time.sleep(1.0)

            # Try to leave cleanly — click "Leave call" button if present.
            try:
                page.evaluate(
                    "() => { const b = document.querySelector('button[aria-label*=\"eave call\"]');"
                    " if (b) b.click(); }"
                )
            except Exception:
                pass

            context.close()
            browser.close()
            # v2: teardown realtime speaker + audio bridge.
            if rt["speaker_stop"]:
                try:
                    rt["speaker_stop"]()
                except Exception:
                    pass
            if rt["speaker_thread"] is not None:
                try:
                    rt["speaker_thread"].join(timeout=5.0)
                except Exception:
                    pass
            if rt["session"]:
                try:
                    rt["session"].close()
                except Exception:
                    pass
            if rt["bridge"]:
                try:
                    rt["bridge"].teardown()
                except Exception:
                    pass
            state.set(in_call=False, captioning=False, exited=True)
            return 0

    except Exception as e:
        state.set(error=f"unhandled: {e}", exited=True)
        return 1


def _try_guest_name(page, guest_name: str) -> None:
    """If Meet is showing a guest-name input, type *guest_name* into it."""
    try:
        # Meet's guest name input has placeholder "Your name".
        locator = page.locator('input[aria-label*="name" i]').first
        if locator.count() and locator.is_visible():
            locator.fill(guest_name, timeout=2_000)
    except Exception:
        pass
735 locator = page.locator('input[aria-label*="name" i]').first 736 if locator.count() and locator.is_visible(): 737 locator.fill(guest_name, timeout=2_000) 738 except Exception: 739 pass 740 741 742 def _detect_admission(page) -> bool: 743 """True if we're clearly past the lobby and in the call itself. 744 745 Uses a JS-side probe because Meet's DOM structure varies by client 746 version. We check several high-signal indicators and declare admission 747 on the first hit: 748 749 1. Leave-call button is present (``aria-label`` contains "eave call"). 750 2. Caption region has appeared (we installed the observer and it attached). 751 3. The participant list container is visible. 752 753 Conservative by default — returns False on any error. 754 """ 755 probe = r""" 756 (() => { 757 const leave = document.querySelector('button[aria-label*="eave call" i]'); 758 if (leave) return true; 759 if (window.__hermesMeetInstalled) { 760 const caps = document.querySelector( 761 '[role="region"][aria-label*="aption" i], ' + 762 'div[jsname="YSxPC"], div[jsname="tgaKEf"]' 763 ); 764 if (caps) return true; 765 } 766 const parts = document.querySelector('[aria-label*="articipants" i]'); 767 if (parts) return true; 768 return false; 769 })(); 770 """ 771 try: 772 return bool(page.evaluate(probe)) 773 except Exception: 774 return False 775 776 777 def _detect_denied(page) -> bool: 778 """True when Meet is showing a 'you were denied' / 'no one admitted' page.""" 779 probe = r""" 780 (() => { 781 const text = document.body ? document.body.innerText || '' : ''; 782 // English only — matches what shows up when the host denies or 783 // removes a guest. 
784 if (/You can't join this video call/i.test(text)) return true; 785 if (/You were removed from the meeting/i.test(text)) return true; 786 if (/No one responded to your request to join/i.test(text)) return true; 787 return false; 788 })(); 789 """ 790 try: 791 return bool(page.evaluate(probe)) 792 except Exception: 793 return False 794 795 796 def _looks_like_human_speaker(speaker: str, bot_guest_name: str) -> bool: 797 """Whether a caption line's speaker is probably a human, not our bot echo. 798 799 Meet attributes captions to the speaker's display name. When Chrome is 800 reading our fake mic, Meet still attributes captions to *our* bot name 801 (because the bot is the one "speaking"). We don't want those to trigger 802 barge-in. Anything else — real participant names — does. 803 804 Conservative: unknown / blank speakers (common when caption scraping 805 falls back to raw text) do NOT trigger barge-in, because we can't tell 806 whether it was a human or us. 807 """ 808 if not speaker or not speaker.strip(): 809 return False 810 spk = speaker.strip().lower() 811 if spk in ("unknown", "you", bot_guest_name.strip().lower()): 812 return False 813 return True 814 815 816 def _click_join(page, state: _BotState) -> None: 817 """Click 'Join now' or 'Ask to join' if either button is visible. 818 819 Flags ``lobby_waiting`` when we hit the "waiting for host to admit you" 820 state so the agent can surface that in status. 
821 """ 822 for label in ("Join now", "Ask to join"): 823 try: 824 btn = page.get_by_role("button", name=label, exact=False).first 825 if btn.count() and btn.is_visible(): 826 btn.click(timeout=3_000) 827 if label == "Ask to join": 828 state.set(lobby_waiting=True) 829 break 830 except Exception: 831 continue 832 833 834 def _parse_duration(raw: str) -> Optional[float]: 835 """Parse ``30m`` / ``2h`` / ``90`` (seconds) → float seconds, or None.""" 836 if not raw: 837 return None 838 raw = raw.strip().lower() 839 try: 840 if raw.endswith("h"): 841 return float(raw[:-1]) * 3600 842 if raw.endswith("m"): 843 return float(raw[:-1]) * 60 844 if raw.endswith("s"): 845 return float(raw[:-1]) 846 return float(raw) 847 except ValueError: 848 return None 849 850 851 if __name__ == "__main__": # pragma: no cover — subprocess entry point 852 sys.exit(run_bot())