# plugins/google_meet/meet_bot.py
  1  """Headless Google Meet bot — Playwright + live-caption scraping.
  2  
  3  Runs as a standalone subprocess spawned by ``process_manager.py``. Reads config
  4  from env vars, writes status + transcript to files under
  5  ``$HERMES_HOME/workspace/meetings/<meeting-id>/``. The main hermes process
  6  reads those files via the ``meet_*`` tools — no IPC beyond filesystem.
  7  
  8  The scraping strategy mirrors OpenUtter (sumansid/openutter): we don't parse
  9  WebRTC audio, we enable Google Meet's built-in live captions and observe the
 10  captions container in the DOM via a MutationObserver. This is lossy and
 11  English-biased but it is:
 12  
 13  * deterministic (no API keys, no STT billing),
 14  * works behind Meet's normal login / admission,
 15  * survives Meet UI rewrites fairly well because the caption container has a
 16    stable ARIA role.
 17  
 18  Run standalone for debugging::
 19  
 20      HERMES_MEET_URL=https://meet.google.com/abc-defg-hij \\
 21      HERMES_MEET_OUT_DIR=/tmp/meet-debug \\
 22      HERMES_MEET_HEADED=1 \\
 23      python -m plugins.google_meet.meet_bot
 24  
 25  No meet.google.com URL → exits non-zero. Any URL that doesn't start with
 26  ``https://meet.google.com/`` is rejected (explicit-by-design).
 27  """
 28  
 29  from __future__ import annotations
 30  
 31  import json
 32  import os
 33  import re
 34  import signal
 35  import sys
 36  import threading
 37  import time
 38  from pathlib import Path
 39  from typing import Optional
 40  
 41  # Match ``https://meet.google.com/abc-defg-hij`` or ``.../lookup/...`` — the
 42  # short three-segment code or a lookup URL. Anything else is rejected.
 43  MEET_URL_RE = re.compile(
 44      r"^https://meet\.google\.com/("
 45      r"[a-z0-9]{3,}-[a-z0-9]{3,}-[a-z0-9]{3,}"
 46      r"|lookup/[^/?#]+"
 47      r"|new"
 48      r")(?:[/?#].*)?$"
 49  )
 50  
 51  
 52  # Filenames the bot reads/writes in ``HERMES_MEET_OUT_DIR``.
 53  SAY_QUEUE_FILENAME = "say_queue.jsonl"
 54  SAY_PCM_FILENAME = "speaker.pcm"
 55  
 56  
 57  def _is_safe_meet_url(url: str) -> bool:
 58      """Return True if *url* is a Google Meet URL we're willing to navigate to."""
 59      if not isinstance(url, str):
 60          return False
 61      return bool(MEET_URL_RE.match(url.strip()))
 62  
 63  
 64  def _meeting_id_from_url(url: str) -> str:
 65      """Extract the 3-segment meeting code from a Meet URL.
 66  
 67      For ``https://meet.google.com/abc-defg-hij`` → ``abc-defg-hij``.
 68      For ``.../lookup/<id>`` or ``/new`` we fall back to a timestamped id — the
 69      bot won't know the real code until after redirect, and callers pass this
 70      through to filename anyway.
 71      """
 72      m = re.search(
 73          r"meet\.google\.com/([a-z0-9]{3,}-[a-z0-9]{3,}-[a-z0-9]{3,})",
 74          url or "",
 75      )
 76      if m:
 77          return m.group(1)
 78      return f"meet-{int(time.time())}"
 79  
 80  
 81  # ---------------------------------------------------------------------------
 82  # Status + transcript file writers
 83  # ---------------------------------------------------------------------------
 84  
 85  class _BotState:
 86      """Single-process mutable state, flushed to ``status.json`` on each change."""
 87  
 88      def __init__(self, out_dir: Path, meeting_id: str, url: str):
 89          self.out_dir = out_dir
 90          self.meeting_id = meeting_id
 91          self.url = url
 92          self.in_call = False
 93          self.captioning = False
 94          self.captions_enabled_attempted = False
 95          self.lobby_waiting = False
 96          self.join_attempted_at: Optional[float] = None
 97          self.joined_at: Optional[float] = None
 98          self.last_caption_at: Optional[float] = None
 99          self.transcript_lines = 0
100          self.error: Optional[str] = None
101          self.exited = False
102          # v2 realtime fields.
103          self.realtime = False
104          self.realtime_ready = False
105          self.realtime_device: Optional[str] = None
106          self.audio_bytes_out: int = 0
107          self.last_audio_out_at: Optional[float] = None
108          self.last_barge_in_at: Optional[float] = None
109          self.leave_reason: Optional[str] = None
110          # Scraped captions, in order, deduped. Each entry is a dict of
111          # {"ts": <epoch>, "speaker": str, "text": str}.
112          self._seen: set = set()
113          out_dir.mkdir(parents=True, exist_ok=True)
114          self.transcript_path = out_dir / "transcript.txt"
115          self.status_path = out_dir / "status.json"
116          self._flush()
117  
118      # -------- transcript ------------------------------------------------
119  
120      def record_caption(self, speaker: str, text: str) -> None:
121          """Append a caption line if we haven't seen this exact (speaker, text)."""
122          speaker = (speaker or "").strip() or "Unknown"
123          text = (text or "").strip()
124          if not text:
125              return
126          key = f"{speaker}|{text}"
127          if key in self._seen:
128              return
129          self._seen.add(key)
130          self.transcript_lines += 1
131          self.last_caption_at = time.time()
132          ts = time.strftime("%H:%M:%S", time.localtime(self.last_caption_at))
133          line = f"[{ts}] {speaker}: {text}\n"
134          # Atomic-ish append — good enough for a single-writer.
135          with self.transcript_path.open("a", encoding="utf-8") as f:
136              f.write(line)
137          self._flush()
138  
139      # -------- status file ----------------------------------------------
140  
141      def _flush(self) -> None:
142          data = {
143              "meetingId": self.meeting_id,
144              "url": self.url,
145              "inCall": self.in_call,
146              "captioning": self.captioning,
147              "captionsEnabledAttempted": self.captions_enabled_attempted,
148              "lobbyWaiting": self.lobby_waiting,
149              "joinAttemptedAt": self.join_attempted_at,
150              "joinedAt": self.joined_at,
151              "lastCaptionAt": self.last_caption_at,
152              "transcriptLines": self.transcript_lines,
153              "transcriptPath": str(self.transcript_path),
154              "error": self.error,
155              "exited": self.exited,
156              "pid": os.getpid(),
157              # v2 realtime telemetry.
158              "realtime": self.realtime,
159              "realtimeReady": self.realtime_ready,
160              "realtimeDevice": self.realtime_device,
161              "audioBytesOut": self.audio_bytes_out,
162              "lastAudioOutAt": self.last_audio_out_at,
163              "lastBargeInAt": self.last_barge_in_at,
164              "leaveReason": self.leave_reason,
165          }
166          tmp = self.status_path.with_suffix(".json.tmp")
167          tmp.write_text(json.dumps(data, indent=2), encoding="utf-8")
168          tmp.replace(self.status_path)
169  
170      def set(self, **kwargs) -> None:
171          for k, v in kwargs.items():
172              setattr(self, k, v)
173          self._flush()
174  
175  
176  # ---------------------------------------------------------------------------
177  # Playwright bot entry point
178  # ---------------------------------------------------------------------------
179  
# JavaScript injected into the Meet tab to observe captions. Captures
# {speaker, text} tuples via a MutationObserver on the caption container,
# and exposes ``window.__hermesMeetDrain()`` to pull new entries. This
# mirrors the OpenUtter caption scraping approach.
#
# NOTE: everything below is a runtime string evaluated inside the page —
# the jsname/class selectors in it are data, not comments to this module,
# and must track the specific Meet DOM builds they name. ``run_bot`` calls
# ``window.__hermesMeetDrain()`` once per loop iteration; entries queue up
# in ``window.__hermesMeetQueue`` between drains. The script is idempotent
# (guarded by ``__hermesMeetInstalled``), so re-injecting is harmless.
_CAPTION_OBSERVER_JS = r"""
(() => {
  if (window.__hermesMeetInstalled) return;
  window.__hermesMeetInstalled = true;
  window.__hermesMeetQueue = [];

  const captionSelector = '[role="region"][aria-label*="aption" i], ' +
                          'div[jsname="YSxPC"], ' +  // legacy
                          'div[jsname="tgaKEf"]';    // current (Apr 2026)

  function pushEntry(speaker, text) {
    if (!text || !text.trim()) return;
    window.__hermesMeetQueue.push({
      ts: Date.now(),
      speaker: (speaker || '').trim(),
      text: text.trim(),
    });
  }

  function scan(root) {
    // Meet captions render as a list of rows; each row contains a speaker
    // label and a text block. Selectors vary across Meet rewrites; we try
    // a few shapes and fall back to raw text.
    const rows = root.querySelectorAll('div[jsname="dsyhDe"], div.CNusmb, div.TBMuR');
    if (rows.length) {
      rows.forEach((row) => {
        const spkEl = row.querySelector('div.KcIKyf, div.zs7s8d, span[jsname="YSxPC"]');
        const txtEl = row.querySelector('div.bh44bd, span[jsname="tgaKEf"], div.iTTPOb');
        const speaker = spkEl ? spkEl.innerText : '';
        const text = txtEl ? txtEl.innerText : row.innerText;
        pushEntry(speaker, text);
      });
      return;
    }
    // Fallback: treat the whole region's innerText as one anonymous line.
    const text = (root.innerText || '').split('\n').filter(Boolean).pop();
    pushEntry('', text);
  }

  function attach() {
    const el = document.querySelector(captionSelector);
    if (!el) return false;
    const obs = new MutationObserver(() => scan(el));
    obs.observe(el, { childList: true, subtree: true, characterData: true });
    scan(el);
    return true;
  }

  // Try now and retry on interval — the caption region only appears after
  // captions are enabled and someone speaks.
  if (!attach()) {
    const iv = setInterval(() => { if (attach()) clearInterval(iv); }, 1500);
  }

  window.__hermesMeetDrain = () => {
    const out = window.__hermesMeetQueue.slice();
    window.__hermesMeetQueue = [];
    return out;
  };
})();
"""
245  
246  
247  def _enable_captions_js() -> str:
248      """Return a small JS snippet that tries to click the 'Turn on captions' button.
249  
250      Best-effort — Meet's caption toggle is keyboard-accessible via ``c``. We
251      dispatch that keystroke as a cheap fallback. Real click targeting is too
252      brittle to rely on.
253      """
254      return r"""
255      (() => {
256        const ev = new KeyboardEvent('keydown', {
257          key: 'c', code: 'KeyC', keyCode: 67, which: 67, bubbles: true,
258        });
259        document.body.dispatchEvent(ev);
260        return true;
261      })();
262      """
263  
264  
265  def _start_realtime_speaker(
266      *,
267      rt: dict,
268      out_dir: Path,
269      bridge_info: dict,
270      api_key: str,
271      model: str,
272      voice: str,
273      instructions: str,
274      stop_flag: dict,
275      state: "_BotState",
276  ) -> None:
277      """Wire up the OpenAI Realtime session + speaker thread + PCM pump.
278  
279      The speaker thread reads text lines from ``say_queue.jsonl``, sends each
280      to OpenAI Realtime, and writes PCM audio into ``speaker.pcm``. A
281      separate *pump* thread forwards that PCM into the OS audio sink so
282      Chrome's fake mic picks it up. On Linux we pipe to ``paplay`` against
283      the null-sink; on macOS the caller is expected to have the BlackHole
284      device selected as default input.
285      """
286      try:
287          from plugins.google_meet.realtime.openai_client import (
288              RealtimeSession,
289              RealtimeSpeaker,
290          )
291      except Exception as e:
292          state.set(error=f"realtime import failed: {e}")
293          return
294  
295      pcm_path = out_dir / SAY_PCM_FILENAME
296      queue_path = out_dir / SAY_QUEUE_FILENAME
297      processed_path = out_dir / "say_processed.jsonl"
298      # Reset the sink file so we start clean each session.
299      pcm_path.write_bytes(b"")
300      # Make sure the queue exists so the speaker poller doesn't error on
301      # first iteration.
302      queue_path.touch()
303  
304      try:
305          session = RealtimeSession(
306              api_key=api_key,
307              model=model,
308              voice=voice,
309              instructions=instructions,
310              audio_sink_path=pcm_path,
311              sample_rate=24000,
312          )
313          session.connect()
314      except Exception as e:
315          state.set(error=f"realtime connect failed: {e}")
316          return
317  
318      rt["session"] = session
319  
320      def _stop_fn():
321          return stop_flag.get("stop", False)
322  
323      rt["speaker_stop"] = lambda: stop_flag.__setitem__("stop", stop_flag.get("stop", False))
324  
325      speaker = RealtimeSpeaker(
326          session=session,
327          queue_path=queue_path,
328          processed_path=processed_path,
329      )
330  
331      def _speaker_loop():
332          try:
333              speaker.run_until_stopped(_stop_fn)
334          except Exception as e:
335              state.set(error=f"realtime speaker crashed: {e}")
336  
337      t_speaker = threading.Thread(target=_speaker_loop, name="meet-speaker", daemon=True)
338      t_speaker.start()
339      rt["speaker_thread"] = t_speaker
340  
341      # PCM pump: feeds speaker.pcm (24kHz s16le mono) into the OS audio
342      # device that Chrome's fake mic reads from. Different tools per
343      # platform, but the contract is the same — block-read the growing
344      # PCM file and stream it to the device in near-real-time.
345      platform_tag = (bridge_info or {}).get("platform")
346      if platform_tag == "linux":
347          import subprocess as _sp
348  
349          sink = (bridge_info or {}).get("write_target") or "hermes_meet_sink"
350          try:
351              proc = _sp.Popen(
352                  [
353                      "paplay",
354                      "--raw",
355                      "--rate=24000",
356                      "--format=s16le",
357                      "--channels=1",
358                      f"--device={sink}",
359                      str(pcm_path),
360                  ],
361                  stdin=_sp.DEVNULL,
362                  stdout=_sp.DEVNULL,
363                  stderr=_sp.DEVNULL,
364              )
365              rt["pcm_pump"] = proc
366          except FileNotFoundError:
367              state.set(error="paplay not found — install pulseaudio-utils for realtime on Linux")
368      elif platform_tag == "darwin":
369          # macOS: use ffmpeg to tail-read speaker.pcm and write it to the
370          # BlackHole output device. The user must have BlackHole selected
371          # as the default input in System Settings → Sound for Chrome to
372          # pick it up. We prefer ffmpeg because it's scriptable and can
373          # target AVFoundation devices by name; fall back to afplay-ing
374          # the file in a tight loop if ffmpeg is absent.
375          import shutil as _shutil
376          import subprocess as _sp
377  
378          device_name = (bridge_info or {}).get("write_target") or "BlackHole 2ch"
379          if _shutil.which("ffmpeg"):
380              try:
381                  # -re: read input at native frame rate.
382                  # -f avfoundation -i: speaker path as raw PCM.
383                  # -f s16le -ar 24000 -ac 1 -i <pcm>: interpret the file.
384                  # -f audiotoolbox -audio_device_index: write to BlackHole.
385                  # Simpler: output as raw via coreaudio using "-f audiotoolbox".
386                  # ffmpeg's audiotoolbox output picks the current default
387                  # output device, which isn't what we want. Instead we use
388                  # -f avfoundation with the named device as OUTPUT via
389                  # -vn and the device name.
390                  proc = _sp.Popen(
391                      [
392                          "ffmpeg",
393                          "-nostdin", "-hide_banner", "-loglevel", "error",
394                          "-re",
395                          "-f", "s16le", "-ar", "24000", "-ac", "1",
396                          "-i", str(pcm_path),
397                          "-f", "audiotoolbox",
398                          "-audio_device_index", _mac_audio_device_index(device_name),
399                          "-",
400                      ],
401                      stdin=_sp.DEVNULL,
402                      stdout=_sp.DEVNULL,
403                      stderr=_sp.DEVNULL,
404                  )
405                  rt["pcm_pump"] = proc
406              except FileNotFoundError:
407                  state.set(error="ffmpeg not found — install via `brew install ffmpeg` for realtime on macOS")
408              except Exception as e:
409                  state.set(error=f"macOS pcm pump failed to start: {e}")
410          else:
411              state.set(error="ffmpeg not found — install via `brew install ffmpeg` for realtime on macOS")
412  
413  
414  def _mac_audio_device_index(device_name: str) -> str:
415      """Return the ffmpeg ``-audio_device_index`` for *device_name*, as a string.
416  
417      Probes ``ffmpeg -f avfoundation -list_devices true -i ''`` (which prints
418      the device table on stderr) and matches *device_name* case-insensitively.
419      Defaults to ``"0"`` if the device can't be found — caller will get a
420      misrouted stream but not a crash, and the error will be obvious.
421      """
422      import subprocess as _sp
423  
424      try:
425          out = _sp.run(
426              ["ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", ""],
427              capture_output=True,
428              text=True,
429              timeout=10,
430          )
431      except Exception:
432          return "0"
433      # ffmpeg prints the table on stderr. Lines look like:
434      #   [AVFoundation indev @ 0x...] [0] BlackHole 2ch
435      import re as _re
436  
437      needle = device_name.strip().lower()
438      for line in (out.stderr or "").splitlines():
439          m = _re.search(r"\[(\d+)\]\s+(.+)$", line)
440          if not m:
441              continue
442          if m.group(2).strip().lower() == needle:
443              return m.group(1)
444      return "0"
445  
446  
def run_bot() -> int:  # noqa: C901 — orchestration, explicit branches
    """Join the Meet URL from the environment, scrape captions, and exit.

    All configuration comes from ``HERMES_MEET_*`` env vars; all output is
    written under ``HERMES_MEET_OUT_DIR`` via ``_BotState`` (status.json +
    transcript.txt). Exit codes as used below: 0 clean exit, 1 unhandled
    exception, 2 bad/missing config, 3 playwright not installed,
    4 navigation failure.
    """
    url = os.environ.get("HERMES_MEET_URL", "").strip()
    out_dir_env = os.environ.get("HERMES_MEET_OUT_DIR", "").strip()
    headed = os.environ.get("HERMES_MEET_HEADED", "").lower() in ("1", "true", "yes")
    auth_state = os.environ.get("HERMES_MEET_AUTH_STATE", "").strip()
    guest_name = os.environ.get("HERMES_MEET_GUEST_NAME", "Hermes Agent")
    # ``_parse_duration`` is defined elsewhere in this file — presumably it
    # returns seconds, or a falsy value meaning "no time limit"; the
    # ``deadline`` computation below relies on that. TODO confirm.
    duration_s = _parse_duration(os.environ.get("HERMES_MEET_DURATION", ""))
    # v2: optional realtime mode. Enabled when HERMES_MEET_MODE=realtime.
    mode = os.environ.get("HERMES_MEET_MODE", "transcribe").strip().lower()
    realtime_model = os.environ.get("HERMES_MEET_REALTIME_MODEL", "gpt-realtime")
    realtime_voice = os.environ.get("HERMES_MEET_REALTIME_VOICE", "alloy")
    realtime_instructions = os.environ.get("HERMES_MEET_REALTIME_INSTRUCTIONS", "")
    realtime_api_key = os.environ.get("HERMES_MEET_REALTIME_KEY") or os.environ.get("OPENAI_API_KEY", "")

    if not url or not _is_safe_meet_url(url):
        sys.stderr.write(
            "google_meet bot: refusing to launch — HERMES_MEET_URL must be a "
            "meet.google.com URL. got: %r\n" % url
        )
        return 2
    if not out_dir_env:
        sys.stderr.write("google_meet bot: HERMES_MEET_OUT_DIR is required\n")
        return 2

    out_dir = Path(out_dir_env)
    meeting_id = _meeting_id_from_url(url)
    state = _BotState(out_dir=out_dir, meeting_id=meeting_id, url=url)

    # SIGTERM → exit cleanly so the parent ``meet_leave`` gets a finalized
    # transcript. We set a flag instead of raising so the main loop below
    # breaks out and runs the inline teardown at the end of the with-block.
    # NOTE(review): there is no ``finally`` here — if an exception escapes
    # mid-loop, the realtime/bridge teardown below is skipped and only the
    # outer ``except`` records the error. Confirm whether that is intended.
    stop_flag = {"stop": False}

    def _on_signal(_sig, _frame):
        stop_flag["stop"] = True

    signal.signal(signal.SIGTERM, _on_signal)
    signal.signal(signal.SIGINT, _on_signal)

    # v2 realtime: provision virtual audio device + start speaker thread.
    # We track these handles in a dict so the teardown code at the bottom
    # can close them regardless of how we exit. If anything in the realtime
    # setup fails we fall back to transcribe mode with a status flag.
    rt = {
        "enabled": mode == "realtime",
        "bridge": None,            # AudioBridge | None
        "bridge_info": None,       # dict | None
        "session": None,           # RealtimeSession | None
        "speaker_thread": None,    # threading.Thread | None
        "speaker_stop": None,      # callable | None
    }
    if rt["enabled"]:
        if not realtime_api_key:
            state.set(error="realtime mode requested but no API key in HERMES_MEET_REALTIME_KEY/OPENAI_API_KEY — falling back to transcribe")
            rt["enabled"] = False
        else:
            try:
                from plugins.google_meet.audio_bridge import AudioBridge
                bridge = AudioBridge()
                rt["bridge_info"] = bridge.setup()
                rt["bridge"] = bridge
                state.set(realtime=True, realtime_device=rt["bridge_info"].get("device_name"))
            except Exception as e:
                state.set(error=f"audio bridge setup failed: {e} — falling back to transcribe")
                rt["enabled"] = False

    try:
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        state.set(error=f"playwright not installed: {e}", exited=True)
        sys.stderr.write(
            "google_meet bot: playwright is not installed. Run "
            "`pip install playwright && python -m playwright install chromium`\n"
        )
        # The bridge may already have created OS-level audio devices — undo.
        if rt["bridge"]:
            rt["bridge"].teardown()
        return 3

    # Chrome env: if realtime is live on Linux, point PULSE_SOURCE at the
    # virtual source so Chrome's fake mic reads the audio we generate.
    chrome_env = os.environ.copy()
    chrome_args = [
        "--use-fake-ui-for-media-stream",
        "--disable-blink-features=AutomationControlled",
    ]
    if not rt["enabled"]:
        # v1-style fake device (silence) — we don't care about mic content
        # when we're not speaking.
        chrome_args.insert(1, "--use-fake-device-for-media-stream")
    elif rt["bridge_info"] and rt["bridge_info"].get("platform") == "linux":
        chrome_env["PULSE_SOURCE"] = rt["bridge_info"].get("device_name", "")

    try:
        with sync_playwright() as pw:
            # Playwright's launch() doesn't take env; we set PULSE_SOURCE
            # via the process env before launch so the child Chrome inherits it.
            # NOTE(review): this mutates os.environ for the whole process,
            # not just the child — harmless here since chrome_env is a copy
            # of os.environ plus PULSE_SOURCE, but worth knowing.
            for k, v in chrome_env.items():
                os.environ[k] = v
            browser = pw.chromium.launch(
                headless=not headed,
                args=chrome_args,
            )
            context_args = {
                "viewport": {"width": 1280, "height": 800},
                "user_agent": (
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
                ),
                "permissions": ["microphone", "camera"],
            }
            # Reuse a saved login session if the caller provided one.
            if auth_state and Path(auth_state).is_file():
                context_args["storage_state"] = auth_state
            context = browser.new_context(**context_args)
            page = context.new_page()

            try:
                page.goto(url, wait_until="domcontentloaded", timeout=30_000)
            except Exception as e:
                state.set(error=f"navigate failed: {e}", exited=True)
                return 4

            # Guest-mode: Meet shows a name field before "Ask to join". When
            # we're authed, we instead see "Join now".
            _try_guest_name(page, guest_name)
            _click_join(page, state)

            # Install caption observer and attempt to enable captions.
            try:
                page.evaluate(_enable_captions_js())
                state.set(captions_enabled_attempted=True)
            except Exception:
                pass
            try:
                page.evaluate(_CAPTION_OBSERVER_JS)
            except Exception as e:
                state.set(error=f"caption observer install failed: {e}")

            # Note: in_call=False until admission is confirmed (we detect
            # either the Leave button or the caption region, signalling we
            # made it past the lobby).
            state.set(captioning=True, join_attempted_at=time.time())

            # v2 realtime: start the speaker thread reading from the
            # plugin-side say queue. The thread reads JSONL lines written by
            # meet_say, calls OpenAI Realtime, and streams the audio PCM to
            # the virtual sink that Chrome's fake-mic is pointed at.
            if rt["enabled"]:
                _start_realtime_speaker(
                    rt=rt,
                    out_dir=out_dir,
                    bridge_info=rt["bridge_info"],
                    api_key=realtime_api_key,
                    model=realtime_model,
                    voice=realtime_voice,
                    instructions=realtime_instructions,
                    stop_flag=stop_flag,
                    state=state,
                )
                # ``rt["session"]`` is only set on a successful connect.
                if rt["session"] is not None:
                    state.set(realtime_ready=True)

            # Admission + drain loop. Runs until SIGTERM, duration expiry,
            # or the page detects "You were removed / you left the
            # meeting". Responsible for:
            #   * detecting admission (Leave button visible → in_call=True)
            #   * timing out stuck-in-lobby (default 5 minutes)
            #   * draining scraped captions into the transcript
            #   * triggering realtime barge-in when a human speaks while
            #     the bot is generating audio
            #   * periodically flushing realtime counters into status.json
            deadline = (time.time() + duration_s) if duration_s else None
            lobby_deadline = time.time() + float(
                os.environ.get("HERMES_MEET_LOBBY_TIMEOUT", "300")
            )
            # Throttle marker: last time we ran the (DOM-probing) admission check.
            last_admission_check = 0.0
            while not stop_flag["stop"]:
                now = time.time()
                if deadline and now > deadline:
                    state.set(leave_reason="duration_expired")
                    break

                # Admission detection every ~3s until admitted.
                if not state.in_call and (now - last_admission_check) > 3.0:
                    last_admission_check = now
                    admitted = _detect_admission(page)
                    if admitted:
                        state.set(
                            in_call=True,
                            lobby_waiting=False,
                            joined_at=now,
                        )
                    elif now > lobby_deadline:
                        state.set(
                            error=(
                                "lobby timeout — host never admitted the bot "
                                f"within {int(lobby_deadline - state.join_attempted_at) if state.join_attempted_at else 0}s"
                            ),
                            leave_reason="lobby_timeout",
                        )
                        break
                    elif _detect_denied(page):
                        state.set(
                            error="host denied admission",
                            leave_reason="denied",
                        )
                        break

                try:
                    # Pull whatever the injected MutationObserver queued since
                    # the last iteration; `&&` guards against the script not
                    # (yet) being installed, in which case this yields falsy.
                    queued = page.evaluate("window.__hermesMeetDrain && window.__hermesMeetDrain()")
                    if isinstance(queued, list):
                        for entry in queued:
                            if not isinstance(entry, dict):
                                continue
                            speaker = str(entry.get("speaker", ""))
                            text = str(entry.get("text", ""))
                            state.record_caption(speaker=speaker, text=text)
                            # Barge-in: if the bot is currently generating
                            # audio AND a real human just spoke, cancel the
                            # in-flight response so we don't talk over them.
                            if rt["enabled"] and rt["session"] is not None:
                                if _looks_like_human_speaker(speaker, guest_name):
                                    try:
                                        cancelled = rt["session"].cancel_response()
                                        if cancelled:
                                            state.set(last_barge_in_at=now)
                                    except Exception:
                                        pass
                except Exception:
                    # Meet reloaded or we got booted — try to detect and
                    # exit gracefully rather than spinning.
                    if page.is_closed():
                        state.set(leave_reason="page_closed")
                        break

                # Fold the realtime session's byte/timestamp counters into
                # the status file so meet_status can surface them.
                if rt["session"] is not None:
                    state.set(
                        audio_bytes_out=getattr(rt["session"], "audio_bytes_out", 0),
                        last_audio_out_at=getattr(rt["session"], "last_audio_out_at", None),
                    )

                time.sleep(1.0)

            # Try to leave cleanly — click "Leave call" button if present.
            try:
                page.evaluate(
                    "() => { const b = document.querySelector('button[aria-label*=\"eave call\"]');"
                    " if (b) b.click(); }"
                )
            except Exception:
                pass

            context.close()
            browser.close()
            # v2: teardown realtime speaker + audio bridge.
            if rt["speaker_stop"]:
                try:
                    rt["speaker_stop"]()
                except Exception:
                    pass
            if rt["speaker_thread"] is not None:
                try:
                    rt["speaker_thread"].join(timeout=5.0)
                except Exception:
                    pass
            if rt["session"]:
                try:
                    rt["session"].close()
                except Exception:
                    pass
            if rt["bridge"]:
                try:
                    rt["bridge"].teardown()
                except Exception:
                    pass
            state.set(in_call=False, captioning=False, exited=True)
            return 0

    except Exception as e:
        # Last-resort handler: persist the error so the parent can read it.
        state.set(error=f"unhandled: {e}", exited=True)
        return 1
729  
730  
731  def _try_guest_name(page, guest_name: str) -> None:
732      """If Meet is showing a guest-name input, type *guest_name* into it."""
733      try:
734          # Meet's guest name input has placeholder "Your name".
735          locator = page.locator('input[aria-label*="name" i]').first
736          if locator.count() and locator.is_visible():
737              locator.fill(guest_name, timeout=2_000)
738      except Exception:
739          pass
740  
741  
742  def _detect_admission(page) -> bool:
743      """True if we're clearly past the lobby and in the call itself.
744  
745      Uses a JS-side probe because Meet's DOM structure varies by client
746      version. We check several high-signal indicators and declare admission
747      on the first hit:
748  
749        1. Leave-call button is present (``aria-label`` contains "eave call").
750        2. Caption region has appeared (we installed the observer and it attached).
751        3. The participant list container is visible.
752  
753      Conservative by default — returns False on any error.
754      """
755      probe = r"""
756      (() => {
757        const leave = document.querySelector('button[aria-label*="eave call" i]');
758        if (leave) return true;
759        if (window.__hermesMeetInstalled) {
760          const caps = document.querySelector(
761            '[role="region"][aria-label*="aption" i], ' +
762            'div[jsname="YSxPC"], div[jsname="tgaKEf"]'
763          );
764          if (caps) return true;
765        }
766        const parts = document.querySelector('[aria-label*="articipants" i]');
767        if (parts) return true;
768        return false;
769      })();
770      """
771      try:
772          return bool(page.evaluate(probe))
773      except Exception:
774          return False
775  
776  
777  def _detect_denied(page) -> bool:
778      """True when Meet is showing a 'you were denied' / 'no one admitted' page."""
779      probe = r"""
780      (() => {
781        const text = document.body ? document.body.innerText || '' : '';
782        // English only — matches what shows up when the host denies or
783        // removes a guest.
784        if (/You can't join this video call/i.test(text)) return true;
785        if (/You were removed from the meeting/i.test(text)) return true;
786        if (/No one responded to your request to join/i.test(text)) return true;
787        return false;
788      })();
789      """
790      try:
791          return bool(page.evaluate(probe))
792      except Exception:
793          return False
794  
795  
796  def _looks_like_human_speaker(speaker: str, bot_guest_name: str) -> bool:
797      """Whether a caption line's speaker is probably a human, not our bot echo.
798  
799      Meet attributes captions to the speaker's display name. When Chrome is
800      reading our fake mic, Meet still attributes captions to *our* bot name
801      (because the bot is the one "speaking"). We don't want those to trigger
802      barge-in. Anything else — real participant names — does.
803  
804      Conservative: unknown / blank speakers (common when caption scraping
805      falls back to raw text) do NOT trigger barge-in, because we can't tell
806      whether it was a human or us.
807      """
808      if not speaker or not speaker.strip():
809          return False
810      spk = speaker.strip().lower()
811      if spk in ("unknown", "you", bot_guest_name.strip().lower()):
812          return False
813      return True
814  
815  
816  def _click_join(page, state: _BotState) -> None:
817      """Click 'Join now' or 'Ask to join' if either button is visible.
818  
819      Flags ``lobby_waiting`` when we hit the "waiting for host to admit you"
820      state so the agent can surface that in status.
821      """
822      for label in ("Join now", "Ask to join"):
823          try:
824              btn = page.get_by_role("button", name=label, exact=False).first
825              if btn.count() and btn.is_visible():
826                  btn.click(timeout=3_000)
827                  if label == "Ask to join":
828                      state.set(lobby_waiting=True)
829                  break
830          except Exception:
831              continue
832  
833  
834  def _parse_duration(raw: str) -> Optional[float]:
835      """Parse ``30m`` / ``2h`` / ``90`` (seconds) → float seconds, or None."""
836      if not raw:
837          return None
838      raw = raw.strip().lower()
839      try:
840          if raw.endswith("h"):
841              return float(raw[:-1]) * 3600
842          if raw.endswith("m"):
843              return float(raw[:-1]) * 60
844          if raw.endswith("s"):
845              return float(raw[:-1])
846          return float(raw)
847      except ValueError:
848          return None
849  
850  
# Entry point when spawned as a subprocess (see module docstring): run_bot()
# reads its configuration from HERMES_MEET_* env vars and its return value
# becomes the process exit code (0 = clean exit, non-zero = error).
if __name__ == "__main__":  # pragma: no cover — subprocess entry point
    sys.exit(run_bot())