From c420d5da53a48f6df8a65f49ed8872cd117e915f Mon Sep 17 00:00:00 2001 From: javis-bot Date: Wed, 10 Jun 2026 16:36:35 +0900 Subject: [PATCH] feat(stream): true-mode browser-action core + Gemini scaffold + mode design First increment of the STREAM_BROWSER real-time-info modes (true = browser, false = Gemini): - browse-search.mjs: drives the on-screen Chrome via CDP so the action shows on the broadcast. `search` returns the top Google results (title/url/snippet); `youtube` plays the first result. Verified live: real-time Seoul weather results, and IU 'Good Day' MV playback. - .env.example: GEMINI_API_KEY / GEMINI_MODEL for the false-mode Gemini account. - docs/stream_browser_modes.md: architecture + integration map (brain config, the two mode-gated tools, registry, design decisions) for the remaining wiring. The Python brain wiring (config.py mode/gemini fields, browseAndSearch + geminiSearch tools, registry, specs, llm_contexts) lands next - it needs a running brain and a Gemini key to verify, rather than committing untested edits into the 39k-line engine. Co-Authored-By: Claude Opus 4.7 --- .env.example | 6 ++- bot/scripts/stream-test/browse-search.mjs | 62 +++++++++++++++++++++++ docs/stream_browser_modes.md | 42 +++++++++++++++ 3 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 bot/scripts/stream-test/browse-search.mjs create mode 100644 docs/stream_browser_modes.md diff --git a/.env.example b/.env.example index acd2984..e122b4f 100644 --- a/.env.example +++ b/.env.example @@ -58,8 +58,12 @@ CHROME_START_URL=about:blank # Screen-share + browser mode. # true = the bot may go Live (screen-share the VNC desktop) and drive the # on-screen browser for real-time info (search / play / read screen). -# false = no screen share; voice only, real-time info via API/MCP tools. +# false = no screen share; voice only, real-time info via the Gemini API. STREAM_BROWSER=true +# Gemini account (used for real-time info when STREAM_BROWSER=false). 
Get a key +# at https://aistudio.google.com/app/apikey and paste it here. +GEMINI_API_KEY= +GEMINI_MODEL=gemini-2.0-flash # --------------------------------------------------------------------------- # VNC screen broadcast diff --git a/bot/scripts/stream-test/browse-search.mjs b/bot/scripts/stream-test/browse-search.mjs new file mode 100644 index 0000000..5819f2b --- /dev/null +++ b/bot/scripts/stream-test/browse-search.mjs @@ -0,0 +1,62 @@ +// True-mode browser action core. Drives the on-screen Chrome (CDP at CDP_PORT, +// default 9222) so the action is visible on the Go-Live broadcast, and prints a +// JSON result on stdout for the Python `browseAndSearch` tool to wrap. +// +// node browse-search.mjs "<query>" [search|youtube] +// +// - search : Google-search the query, return the top organic results (at most 6). +// - youtube : search YouTube and play the first result. +import { chromium } from 'playwright'; + +const CDP = process.env.CDP_PORT || '9222'; +const query = process.argv[2] || ''; +const mode = (process.argv[3] || 'search').toLowerCase(); +const out = (o) => { process.stdout.write(JSON.stringify(o)); }; + +if (!query) { out({ ok: false, error: 'no query' }); process.exit(1); } + +let b; +try { + b = await chromium.connectOverCDP(`http://localhost:${CDP}`); + const ctx = b.contexts()[0]; + const page = ctx.pages()[0] || (await ctx.newPage()); + page.setDefaultTimeout(20000); + await page.bringToFront().catch(() => {}); + + if (mode === 'youtube') { + await page.goto(`https://www.youtube.com/results?search_query=${encodeURIComponent(query)}`, { waitUntil: 'domcontentloaded' }); + await page.waitForSelector('ytd-video-renderer a#video-title, a#video-title', { timeout: 20000 }); + const first = page.locator('ytd-video-renderer a#video-title, a#video-title').first(); + const title = (await first.getAttribute('title').catch(() => '')) || (await first.innerText().catch(() => '')); + await first.click(); + await page.waitForSelector('#movie_player', { timeout: 20000 }); + await 
page.evaluate(() => { const v = document.querySelector('video'); if (v && v.paused) v.play(); }); + out({ ok: true, mode, title: (title || '').trim(), url: page.url() }); + } else { + await page.goto(`https://www.google.com/search?q=${encodeURIComponent(query)}&hl=ko`, { waitUntil: 'domcontentloaded' }); + await page.waitForTimeout(1500); + const results = await page.evaluate(() => { + const seen = new Set(); + const items = []; + for (const h of Array.from(document.querySelectorAll('a h3'))) { + const a = h.closest('a'); + const url = a?.href || ''; + if (!url || seen.has(url) || url.includes('google.com')) continue; + const block = h.closest('div[data-hveid], div.g') || a.parentElement; + let snippet = ''; + const sn = block?.querySelector('div[data-sncf], div[style*="webkit-line-clamp"], .VwiC3b'); + snippet = (sn?.innerText || '').trim(); + seen.add(url); + items.push({ title: h.innerText.trim(), url, snippet }); + if (items.length >= 6) break; + } + return items; + }); + out({ ok: true, mode, query, count: results.length, results }); + } + await b.close(); +} catch (e) { + try { await b?.close(); } catch { /* ignore: best-effort close; the original error is reported below */ } + out({ ok: false, error: String(e?.message || e) }); + process.exit(1); +} diff --git a/docs/stream_browser_modes.md b/docs/stream_browser_modes.md new file mode 100644 index 0000000..3d0c2bd --- /dev/null +++ b/docs/stream_browser_modes.md @@ -0,0 +1,42 @@ +# Real-time info modes (`STREAM_BROWSER`) + +The bot answers via the Python brain (`bridge/server.py` -> `src/jarvis`). Real-time +info is fetched by a tool the reply engine calls. `STREAM_BROWSER` selects HOW: + +- **true** (default): drive the on-screen Chrome (CDP at `CDP_PORT`, default 9222) + to Google-search / play YouTube / read the page. The action is visible on the + Go-Live broadcast. The browser is already up on the VNC display `:1`. +- **false**: use the Google Gemini API (grounded with Google Search) for + real-time info. No screen share needed (voice + API only). 
+ +## Components + +| Piece | Path | Status | +|---|---|---| +| Mode flag (bot) | `bot/src/config.ts` `screenBrowser`, enforced in `selfbot.ts` | done | +| Browser search core (Node/CDP) | `bot/scripts/stream-test/browse-search.mjs` | this change | +| Brain mode read | `src/jarvis/config.py` `stream_browser` from env | TODO | +| Gemini key/model | `GEMINI_API_KEY`, `GEMINI_MODEL` (.env) + `config.py` | scaffolded | +| `browseAndSearch` tool (true) | `src/jarvis/tools/builtin/browse_and_search.py` -> subprocess the Node core | TODO | +| `geminiSearch` tool (false) | `src/jarvis/tools/builtin/gemini_search.py` (REST, no new dep) | TODO | +| Registry (mode-gated) | `src/jarvis/tools/registry.py` `BUILTIN_TOOLS` | TODO | +| Specs + `docs/llm_contexts.md` | alongside each tool | TODO | + +## Design decisions + +- The browser tool (Python) **subprocesses a Node script** rather than adding a + Python CDP/playwright dependency: the Node layer already owns Chrome/CDP + (`broadcast-helper.mjs`, `selfbot.ts`), so the brain shells out to + `node browse-search.mjs "<query>" [search|youtube]` and wraps the JSON result in the engine's + `UNTRUSTED WEB EXTRACT` envelope. Keeps the 39k-line Python brain dep-free. +- Gemini uses the REST endpoint (`generativelanguage.googleapis.com`) via stdlib + `urllib` with the `google_search` grounding tool - no SDK dependency. +- Tools return the same `ToolExecutionResult(success, reply_text)` envelope shape + as `webSearch`, so downstream synthesis is unchanged. The brain reads + `STREAM_BROWSER` once at startup and registers the matching tool. + +## To finish / verify +- Provide `GEMINI_API_KEY` to build + verify the false-mode path (a real call is + needed to confirm grounding output). +- Wire `config.py` + the two Python tools + registry, update specs and + `docs/llm_contexts.md` (new Gemini LLM context).