init: restore source from @anthropic-ai/claude-code@2.1.88 sourcemap

This commit is contained in:
huo0
2026-03-31 16:30:12 +08:00
commit a8a678cb62
4494 changed files with 982833 additions and 0 deletions

View File

@@ -0,0 +1,553 @@
/**
* App category lookup for tiered CU permissions. Three categories land at a
* restricted tier instead of `"full"`:
*
* - **browser** → `"read"` tier — visible in screenshots, NO interaction.
* The model can read an already-open page but must use the Claude-in-Chrome
* MCP for navigation/clicking/typing.
* - **terminal** → `"click"` tier — visible + clickable, NO typing. The
* model can click a Run button or scroll test output in an IDE, but can't
* type into the integrated terminal. Use the Bash tool for shell work.
* - **trading** → `"read"` tier — same restrictions as browsers, but no
* CiC-MCP alternative exists. For platforms where a stray click can
* execute a trade or send a message to a counterparty.
*
* Uncategorized apps default to `"full"`. See `getDefaultTierForApp`.
*
* Identification is two-layered:
* 1. Bundle ID match (macOS-only; `InstalledApp.bundleId` is a
* CFBundleIdentifier and meaningless on Windows). Fast, exact, the
* primary mechanism while CU is darwin-gated.
* 2. Display-name substring match (cross-platform fallback). Catches
* unresolved requests ("Chrome" when Chrome isn't installed) AND will
* be the primary mechanism on Windows/Linux where there's no bundle ID.
* Windows-relevant names (PowerShell, cmd, Windows Terminal) are
* included now so they activate the moment the darwin gate lifts.
*
* Keep this file **import-free** (like sentinelApps.ts) — the renderer may
* import it via a package.json subpath export, and pulling in
* `@modelcontextprotocol/sdk` (a devDep) through the index → mcpServer chain
* would fail module resolution in Next.js. The `CuAppPermTier` type is
* duplicated as a string literal below rather than imported.
*/
/** App categories that are granted at a restricted tier instead of `"full"`. */
export type DeniedCategory = "browser" | "terminal" | "trading";

/**
 * Resolve the hardcoded permission tier for a category.
 *
 * The return type is spelled out as an inline string-literal union because
 * this file is import-free; the authoritative type is `CuAppPermTier` in
 * types.ts and the two must be kept in sync.
 *
 * The mapping is not one-to-one: `"browser"` and `"trading"` both resolve to
 * `"read"`. Copy that differs by category (the "use CiC" hint is
 * browser-only) must therefore branch on the category, not on the tier.
 *
 * @param category - Category of the app, or `null` when uncategorized.
 * @returns `"read"` for browser/trading, `"click"` for terminal, and
 *   `"full"` for anything else (including `null`).
 */
export function categoryToTier(
  category: DeniedCategory | null,
): "read" | "click" | "full" {
  switch (category) {
    case "browser":
    case "trading":
      return "read";
    case "terminal":
      return "click";
    default:
      return "full";
  }
}
// ─── Bundle-ID deny sets (macOS) ─────────────────────────────────────────
/**
* macOS bundle IDs of web browsers. `getDeniedCategory` does an exact
* `.has()` lookup against this set and reports `"browser"` on a hit, which
* `categoryToTier` maps to the `"read"` tier.
*/
const BROWSER_BUNDLE_IDS: ReadonlySet<string> = new Set([
// Apple
"com.apple.Safari",
"com.apple.SafariTechnologyPreview",
// Google
"com.google.Chrome",
"com.google.Chrome.beta",
"com.google.Chrome.dev",
"com.google.Chrome.canary",
// Microsoft
"com.microsoft.edgemac",
"com.microsoft.edgemac.Beta",
"com.microsoft.edgemac.Dev",
"com.microsoft.edgemac.Canary",
// Mozilla
"org.mozilla.firefox",
"org.mozilla.firefoxdeveloperedition",
"org.mozilla.nightly",
// Chromium-based
"org.chromium.Chromium",
"com.brave.Browser",
"com.brave.Browser.beta",
"com.brave.Browser.nightly",
"com.operasoftware.Opera",
"com.operasoftware.OperaGX",
"com.operasoftware.OperaDeveloper",
"com.vivaldi.Vivaldi",
// The Browser Company
"company.thebrowser.Browser", // Arc
"company.thebrowser.dia", // Dia (agentic)
// Privacy-focused
"org.torproject.torbrowser",
"com.duckduckgo.macos.browser",
"ru.yandex.desktop.yandex-browser",
// Agentic / AI browsers — newer entrants with LLM integrations
"ai.perplexity.comet",
"com.sigmaos.sigmaos.macos", // SigmaOS
// Webkit-based misc
"com.kagi.kagimacOS", // Orion
]);
/**
* Terminals + IDEs with integrated terminals. A superset of
* `SHELL_ACCESS_BUNDLE_IDS` from sentinelApps.ts — terminals proceed to the
* approval dialog at tier "click", and the sentinel warning renders
* alongside the tier badge.
*
* `getDeniedCategory` does an exact `.has()` lookup against this set and
* reports `"terminal"` on a hit.
*/
const TERMINAL_BUNDLE_IDS: ReadonlySet<string> = new Set([
// Dedicated terminals
"com.apple.Terminal",
"com.googlecode.iterm2",
"dev.warp.Warp-Stable",
"dev.warp.Warp-Beta",
"com.github.wez.wezterm",
"org.alacritty",
"io.alacritty", // pre-v0.11.0 (renamed 2022-07) — kept for legacy installs
"net.kovidgoyal.kitty",
"co.zeit.hyper",
"com.mitchellh.ghostty",
"org.tabby",
"com.termius-dmg.mac", // Termius
// IDEs with integrated terminals — we can't distinguish "type in the
// editor" from "type in the integrated terminal" via screenshot+click.
// VS Code family
"com.microsoft.VSCode",
"com.microsoft.VSCodeInsiders",
"com.vscodium", // VSCodium
"com.todesktop.230313mzl4w4u92", // Cursor
"com.exafunction.windsurf", // Windsurf / Codeium
"dev.zed.Zed",
"dev.zed.Zed-Preview",
// JetBrains family (all have integrated terminals)
"com.jetbrains.intellij",
"com.jetbrains.intellij.ce",
"com.jetbrains.pycharm",
"com.jetbrains.pycharm.ce",
"com.jetbrains.WebStorm",
"com.jetbrains.CLion",
"com.jetbrains.goland",
"com.jetbrains.rubymine",
"com.jetbrains.PhpStorm",
"com.jetbrains.datagrip",
"com.jetbrains.rider",
"com.jetbrains.AppCode",
"com.jetbrains.rustrover",
"com.jetbrains.fleet",
"com.google.android.studio", // Android Studio (JetBrains-based)
// Other IDEs
// GitKraken has an integrated terminal panel. Listing it here also keeps
// the "kraken" trading substring from miscategorizing it when a bundle ID
// is available — the bundle-ID check runs before the name check.
"com.axosoft.gitkraken",
"com.sublimetext.4",
"com.sublimetext.3",
"org.vim.MacVim",
"com.neovim.neovim",
"org.gnu.Emacs",
// Xcode's previous carve-out (full tier for Interface Builder / simulator)
// was reversed — at tier "click" IB and simulator taps still work (both are
// plain clicks) while the integrated terminal is blocked from keyboard input.
"com.apple.dt.Xcode",
"org.eclipse.platform.ide",
"org.netbeans.ide",
"com.microsoft.visual-studio", // Visual Studio for Mac
// AppleScript/automation execution surfaces — same threat as terminals:
// type(script) → key("cmd+r") runs arbitrary code. Added after #28011
// removed the osascript MCP server, making CU the only tool-call route
// to AppleScript.
"com.apple.ScriptEditor2",
"com.apple.Automator",
"com.apple.shortcuts",
]);
/**
* Trading / crypto platforms — granted at tier `"read"` so the agent can see
* balances and prices but can't click into an order, transfer, or IB chat.
* Bundle IDs populated from Homebrew cask `uninstall.quit` stanzas as they're
* verified; the name-substring fallback below is the primary check. Bloomberg
* Terminal has no native macOS build per their FAQ (web/Citrix only).
*
* Budgeting/accounting apps (Quicken, YNAB, QuickBooks, etc.) are NOT listed
* here — they default to tier `"full"`. The risk model for brokerage/crypto
* (a stray click can execute a trade) doesn't apply to budgeting apps; the
* Cowork system prompt carries the soft instruction to never execute trades
* or transfer money on the user's behalf.
*
* `getDeniedCategory` does an exact `.has()` lookup against this set and
* reports `"trading"` on a hit.
*/
const TRADING_BUNDLE_IDS: ReadonlySet<string> = new Set([
// Verified via Homebrew quit/zap stanzas + mdls + electron-builder source.
// Trading
"com.webull.desktop.v1", // Webull (direct download, Qt)
"com.webull.trade.mac.v1", // Webull (Mac App Store)
"com.tastytrade.desktop",
"com.tradingview.tradingviewapp.desktop",
"com.fidelity.activetrader", // Fidelity Trader+ (new)
"com.fmr.activetrader", // Fidelity Active Trader Pro (legacy)
// Interactive Brokers TWS — install4j wrapper; Homebrew quit stanza is
// authoritative for this exact value but install4j IDs can drift across
// major versions — name-substring "trader workstation" is the fallback.
"com.install4j.5889-6375-8446-2021",
// Crypto
"com.binance.BinanceDesktop",
"com.electron.exodus",
// Electrum uses PyInstaller with bundle_identifier=None → defaults to
// org.pythonmac.unspecified.<AppName>. Confirmed in spesmilo/electrum
// source + Homebrew zap. IntuneBrew's "org.electrum.electrum" is a fork.
"org.pythonmac.unspecified.Electrum",
"com.ledger.live",
"io.trezor.TrezorSuite",
// No native macOS app (name-substring only): Schwab, E*TRADE, TradeStation,
// Robinhood, NinjaTrader, Coinbase, Kraken, Bloomberg. thinkorswim
// install4j ID drifts per-install — substring safer.
]);
// ─── Policy-deny (not a tier — cannot be granted at all) ─────────────────
//
// Streaming / ebook / music apps and a handful of publisher apps. These
// are auto-denied before the approval dialog — no tier can be granted.
// Rationale is copyright / content-control (the agent has no legitimate
// need to screenshot Netflix or click Play on Spotify).
//
// Sourced from the ACP CU-apps blocklist xlsx ("Full block" tab). See
// /tmp/extract_cu_blocklist.py for the extraction script.
//
// `isPolicyDenied` checks this set first (exact bundle-ID match) and only
// then falls back to the display-name substrings.
const POLICY_DENIED_BUNDLE_IDS: ReadonlySet<string> = new Set([
// Verified via Homebrew quit/zap + mdls /System/Applications + IntuneBrew.
// Apple built-ins
"com.apple.TV",
"com.apple.Music",
"com.apple.iBooksX",
"com.apple.podcasts",
// Music
"com.spotify.client",
"com.amazon.music",
"com.tidal.desktop",
"com.deezer.deezer-desktop",
"com.pandora.desktop",
"com.electron.pocket-casts", // direct-download Electron wrapper
"au.com.shiftyjelly.PocketCasts", // Mac App Store
// Video
"tv.plex.desktop",
"tv.plex.htpc",
"tv.plex.plexamp",
"com.amazon.aiv.AIVApp", // Prime Video (iOS-on-Apple-Silicon)
// Ebooks
"net.kovidgoyal.calibre",
"com.amazon.Kindle", // legacy desktop, discontinued
"com.amazon.Lassen", // current Mac App Store (iOS-on-Mac)
"com.kobo.desktop.Kobo",
// No native macOS app (name-substring only): Netflix, Disney+, Hulu,
// HBO Max, Peacock, Paramount+, YouTube, Crunchyroll, Tubi, Vudu,
// Audible, Reddit, NYTimes. Their iOS apps don't opt into iPad-on-Mac.
]);
/**
* Display-name fallback for the policy deny-list. Every entry must be
* lowercase: `isPolicyDenied` lowercases the display name and tests each
* entry with `.includes()`, so an entry containing an uppercase character
* could never match.
*/
const POLICY_DENIED_NAME_SUBSTRINGS: readonly string[] = [
// Video streaming
"netflix",
"disney+",
"hulu",
"prime video",
"apple tv",
"peacock",
"paramount+",
// "plex" is too generic — would match "Perplexity". Covered by
// tv.plex.* bundle IDs on macOS.
"tubi",
"crunchyroll",
"vudu",
// E-readers / audiobooks
"kindle",
"apple books",
"kobo",
"play books",
"calibre",
"libby",
"readium",
"audible",
"libro.fm",
"speechify",
// Music
"spotify",
"apple music",
"amazon music",
"youtube music",
"tidal",
"deezer",
"pandora",
"pocket casts",
// Publisher / social apps (from the same blocklist tab)
"naver",
"reddit",
"sony music",
"vegas pro",
"pitchfork",
"economist",
"nytimes",
// Skipped (too generic for substring matching — need bundle ID):
// HBO Max / Max, YouTube (non-Music), Nook, Sony Catalyst, Wired
];
/**
 * Policy-level auto-deny. Unlike `userDeniedBundleIds` (per-user Settings
 * page), this list is baked into the build. `buildAccessRequest` strips
 * matching apps before the approval dialog with "blocked by policy"
 * guidance; the agent is told to not retry.
 *
 * @param bundleId - Bundle ID of the app, or `undefined` when unresolved.
 * @param displayName - Requested display name; compared case-insensitively.
 * @returns `true` when the bundle ID is in the policy set, or when the
 *   lowercased display name contains any policy substring.
 */
export function isPolicyDenied(
  bundleId: string | undefined,
  displayName: string,
): boolean {
  // Exact bundle-ID hit short-circuits before the name is touched.
  if (bundleId !== undefined && POLICY_DENIED_BUNDLE_IDS.has(bundleId)) {
    return true;
  }
  const needle = displayName.toLowerCase();
  return POLICY_DENIED_NAME_SUBSTRINGS.some((fragment) =>
    needle.includes(fragment),
  );
}
/**
 * Exact bundle-ID lookup against the browser, terminal and trading sets,
 * checked in that order.
 *
 * @param bundleId - Bundle ID to classify.
 * @returns The category of the first set containing `bundleId`, or `null`
 *   when none does.
 */
export function getDeniedCategory(bundleId: string): DeniedCategory | null {
  const lookups: ReadonlyArray<readonly [ReadonlySet<string>, DeniedCategory]> =
    [
      [BROWSER_BUNDLE_IDS, "browser"],
      [TERMINAL_BUNDLE_IDS, "terminal"],
      [TRADING_BUNDLE_IDS, "trading"],
    ];
  for (const [ids, category] of lookups) {
    if (ids.has(bundleId)) {
      return category;
    }
  }
  return null;
}
// ─── Display-name fallback (cross-platform) ──────────────────────────────
/**
* Lowercase substrings checked against the requested display name. Catches:
* - Unresolved requests (app not installed, Spotlight miss)
* - Future Windows/Linux support where bundleId is meaningless
*
* Matched via `.includes()` on `name.toLowerCase()`. Within a single list the
* entry order does not affect the result — any hit yields the same category —
* so entries are grouped for readability only. The order in which the LISTS
* are consulted does matter; see `getDeniedCategoryByDisplayName`.
*/
const BROWSER_NAME_SUBSTRINGS: readonly string[] = [
"safari",
"chrome",
"firefox",
"microsoft edge",
"brave",
"opera",
"vivaldi",
"chromium",
// Arc/Dia: the canonical display name is just "Arc"/"Dia" — too short for
// substring matching (false-positives: "Arcade", "Diagram"). Covered by
// bundle ID on macOS. The "... browser" entries below catch natural-language
// phrasings ("the arc browser") but NOT the canonical short name.
"arc browser",
"tor browser",
"duckduckgo",
"yandex",
"orion browser",
// Agentic / AI browsers
"comet", // Perplexity's browser — "Comet" substring risks false positives
// but leaving for now; "comet" in an app name is rare
"sigmaos",
"dia browser",
];
/**
* Lowercase display-name substrings for terminals, shells, script runners and
* IDEs with integrated terminals. `getDeniedCategoryByDisplayName` consults
* this list last (after the trading and browser lists) and reports
* `"terminal"` on a hit.
*/
const TERMINAL_NAME_SUBSTRINGS: readonly string[] = [
// macOS / cross-platform terminals
"terminal", // catches Terminal, Windows Terminal (NOT iTerm — separate entry)
"iterm",
"wezterm",
"alacritty",
"kitty",
"ghostty",
"tabby",
"termius",
// AppleScript runners — see bundle-ID comment above. "shortcuts" is too
// generic for substring matching (many apps have "shortcuts" in the name);
// covered by bundle ID only, like warp/hyper.
"script editor",
"automator",
// NOTE: "warp" and "hyper" are too generic for substring matching —
// they'd false-positive on "Warpaint" or "Hyperion". Covered by bundle ID
// (dev.warp.Warp-Stable, co.zeit.hyper) for macOS; Windows exe-name
// matching can be added when Windows CU ships.
// Windows shells (activate when the darwin gate lifts)
"powershell",
"cmd.exe",
"command prompt",
"git bash",
"conemu",
"cmder",
// IDEs (VS Code family)
"visual studio code",
"visual studio", // catches VS for Mac + Windows
"vscode",
"vs code",
"vscodium",
"cursor", // Cursor IDE — "cursor" is generic but IDE is the only common app
"windsurf",
// Zed: display name is just "Zed" — too short for substring matching
// (false-positives). Covered by bundle ID (dev.zed.Zed) on macOS.
// IDEs (JetBrains family)
"intellij",
"pycharm",
"webstorm",
"clion",
"goland",
"rubymine",
"phpstorm",
"datagrip",
"rider",
"appcode",
"rustrover",
"fleet",
"android studio",
// Other IDEs
"sublime text",
"macvim",
"neovim",
"emacs",
"xcode",
"eclipse",
"netbeans",
];
/**
* Lowercase display-name substrings for brokerage, trading and crypto apps.
* `getDeniedCategoryByDisplayName` consults this list FIRST, so a name that
* also contains a browser or terminal substring (e.g. "Bloomberg Terminal")
* is reported as `"trading"`.
*/
const TRADING_NAME_SUBSTRINGS: readonly string[] = [
// Trading — brokerage apps. Sourced from the ACP CU-apps blocklist xlsx
// ("Read Only" tab). Name-substring safe for proper nouns below; generic
// names (IG, Delta, HTX) are skipped and need bundle-ID matching once
// verified.
"bloomberg",
"ameritrade",
"thinkorswim",
"schwab",
"fidelity",
"e*trade",
"interactive brokers",
"trader workstation", // Interactive Brokers TWS
"tradestation",
"webull",
"robinhood",
"tastytrade",
"ninjatrader",
"tradingview",
"moomoo",
"tradezero",
"prorealtime",
"plus500",
"saxotrader",
"oanda",
"metatrader",
"forex.com",
"avaoptions",
"ctrader",
"jforex",
"iq option",
"olymp trade",
"binomo",
"pocket option",
"raceoption",
"expertoption",
"quotex",
"naga",
"morgan stanley",
"ubs neo",
"eikon", // Thomson Reuters / LSEG Workspace
// Crypto — exchanges, wallets, portfolio trackers
"coinbase",
"kraken",
"binance",
"okx",
"bybit",
// "gate.io" is too generic — the ".io" TLD suffix is common in app names
// (e.g., "Draw.io"). Needs bundle-ID matching once verified.
"phemex",
"stormgain",
"crypto.com",
// "exodus" is too generic — it's a common noun and would match unrelated
// apps/games. Needs bundle-ID matching once verified.
"electrum",
"ledger live",
"trezor",
"guarda",
"atomic wallet",
"bitpay",
"bisq",
"koinly",
"cointracker",
"blockfi",
"stripe cli",
// Crypto games / metaverse (same trade-execution risk model)
"decentraland",
"axie infinity",
"gods unchained",
];
/**
 * Display-name substring match. Called when bundle-ID resolution returned
 * nothing (`resolved === undefined`) or when no bundle-ID deny-list entry
 * matched.
 *
 * Matching is case-insensitive and substring-based, so `"Google Chrome"`,
 * `"chrome"` and `"Chrome Canary"` all hit the `"chrome"` entry.
 *
 * @param name - Requested display name.
 * @returns The category of the first list with a matching substring, or
 *   `null` when no list matches.
 */
export function getDeniedCategoryByDisplayName(
  name: string,
): DeniedCategory | null {
  const haystack = name.toLowerCase();
  const matchesAny = (fragments: readonly string[]): boolean =>
    fragments.some((fragment) => haystack.includes(fragment));

  // Trading goes first: it is a proper-noun-only list, and "Bloomberg
  // Terminal" contains "terminal" — it would be miscategorized if the
  // terminal list were consulted earlier.
  if (matchesAny(TRADING_NAME_SUBSTRINGS)) return "trading";
  if (matchesAny(BROWSER_NAME_SUBSTRINGS)) return "browser";
  if (matchesAny(TERMINAL_NAME_SUBSTRINGS)) return "terminal";
  return null;
}
/**
 * Combined check — exact bundle-ID lookup first, display-name fallback
 * second. This is the function tool-call handlers should use.
 *
 * `bundleId` may be undefined (unresolved request — the model asked for an
 * app that isn't installed or Spotlight didn't find). An undefined or empty
 * bundle ID skips straight to the display-name check.
 *
 * @param bundleId - Bundle ID of the app, if resolved.
 * @param displayName - Requested display name.
 * @returns The matched category, or `null` when neither check matches.
 */
export function getDeniedCategoryForApp(
  bundleId: string | undefined,
  displayName: string,
): DeniedCategory | null {
  const byBundleId = bundleId ? getDeniedCategory(bundleId) : null;
  return byBundleId ?? getDeniedCategoryByDisplayName(displayName);
}
/**
 * Default tier for an app at grant time: `getDeniedCategoryForApp` followed
 * by `categoryToTier`. Browsers and trading apps → `"read"`, terminals/IDEs
 * → `"click"`, everything else → `"full"`.
 *
 * Called by `buildAccessRequest` to populate `ResolvedAppRequest.proposedTier`
 * before the approval dialog shows.
 *
 * @param bundleId - Bundle ID of the app, if resolved.
 * @param displayName - Requested display name.
 * @returns The tier proposed for the app.
 */
export function getDefaultTierForApp(
  bundleId: string | undefined,
  displayName: string,
): "read" | "click" | "full" {
  const category = getDeniedCategoryForApp(bundleId, displayName);
  return categoryToTier(category);
}
/**
* Exposes the module-private lookup tables under a single export. The name
* suggests it is meant for tests only — NOTE(review): confirm no production
* code reads it.
*/
export const _test = {
BROWSER_BUNDLE_IDS,
TERMINAL_BUNDLE_IDS,
TRADING_BUNDLE_IDS,
POLICY_DENIED_BUNDLE_IDS,
BROWSER_NAME_SUBSTRINGS,
TERMINAL_NAME_SUBSTRINGS,
TRADING_NAME_SUBSTRINGS,
POLICY_DENIED_NAME_SUBSTRINGS,
};

View File

@@ -0,0 +1,108 @@
/**
* Port of the API's image transcoder target-size algorithm. Pre-sizing
* screenshots to this function's output means the API's early-return fires
* (tokens ≤ max) and the image is NOT resized server-side — so the model
* sees exactly the dimensions in `ScreenshotResult.width/height` and
* `scaleCoord` stays coherent.
*
* Rust reference: api/api/image_transcoder/rust_transcoder/src/utils/resize.rs
* Sibling TS port: apps/claude-browser-use/src/utils/imageResize.ts (identical
* algorithm, lives in the Chrome extension tree — not a shared package).
*
* See COORDINATES.md for why this matters for click accuracy.
*/
/** Tunables for the target-size search performed by `targetImageSize`. */
export interface ResizeParams {
  /** Edge length, in pixels, covered by one token along each dimension. */
  pxPerToken: number;
  /** Upper limit, in pixels, for either edge of the output image. */
  maxTargetPx: number;
  /** Upper limit for tokens-across × tokens-down of the output image. */
  maxTargetTokens: number;
}

/**
 * Production defaults — match `resize.rs:160-164` and Chrome's
 * `CDPService.ts:638-642`. Vision encoder uses 28px tiles; 1568 is both
 * the long-edge cap (56 tiles) AND the token budget.
 */
export const API_RESIZE_PARAMS: ResizeParams = {
  pxPerToken: 28,
  maxTargetPx: 1568,
  maxTargetTokens: 1568,
};

/**
 * ceil(px / pxPerToken) for whole-pixel inputs, written as the integer
 * ceil-div `floor((px - 1) / pxPerToken) + 1` to match resize.rs:74-76.
 */
export function nTokensForPx(px: number, pxPerToken: number): number {
  return Math.floor((px - 1) / pxPerToken) + 1;
}

/** Token count of a `width` × `height` image: tokens across × tokens down. */
function nTokensForImg(
  width: number,
  height: number,
  pxPerToken: number,
): number {
  return nTokensForPx(width, pxPerToken) * nTokensForPx(height, pxPerToken);
}

/**
 * Binary-search along the width dimension for the largest image that:
 *   - preserves the input aspect ratio
 *   - has long edge ≤ maxTargetPx
 *   - has ceil(w/pxPerToken) × ceil(h/pxPerToken) ≤ maxTargetTokens
 *
 * Returns [width, height]. No-op if input already satisfies all three.
 *
 * The long-edge constraint alone (what we used to use) is insufficient on
 * squarer-than-16:9 displays: 1568×1014 (MBP 16" AR) is 56×37 = 2072 tokens,
 * over budget, and gets server-resized to 1372×887 — model then clicks in
 * 1372-space but scaleCoord assumed 1568-space → ~14% coord error.
 *
 * For whole-pixel inputs the midpoint sequence is the one in
 * resize.rs:91-155, so results match its test vectors. Fractional widths
 * (which the integer Rust reference never sees) are rounded up to form the
 * exclusive upper bound; otherwise the search could stall one step short of
 * the bound and never terminate.
 *
 * @param width - Source width in pixels.
 * @param height - Source height in pixels.
 * @param params - Pixel and token limits to satisfy.
 * @returns `[width, height]` of the pre-sized image.
 * @throws RangeError if `width` or `height` is NaN or infinite — the search
 *   has no meaningful bounds for such input.
 */
export function targetImageSize(
  width: number,
  height: number,
  params: ResizeParams,
): [number, number] {
  if (!Number.isFinite(width) || !Number.isFinite(height)) {
    throw new RangeError(
      `targetImageSize: width and height must be finite, got ${width}x${height}`,
    );
  }

  const { pxPerToken, maxTargetPx, maxTargetTokens } = params;

  if (
    width <= maxTargetPx &&
    height <= maxTargetPx &&
    nTokensForImg(width, height, pxPerToken) <= maxTargetTokens
  ) {
    return [width, height];
  }

  // Normalize to landscape for the search; transpose result back.
  if (height > width) {
    const [w, h] = targetImageSize(height, width, params);
    return [h, w];
  }

  const aspectRatio = width / height;
  // Height that keeps the aspect ratio at a given width, never below 1px.
  const heightFor = (w: number): number =>
    Math.max(Math.round(w / aspectRatio), 1);

  // Loop invariant: lowerBoundWidth is always valid, upperBoundWidth is
  // treated as invalid and is never returned. ~12 iterations for a 4000px
  // image. Math.ceil keeps both bounds integral so every midpoint lies
  // strictly between them and the gap always shrinks.
  let upperBoundWidth = Math.ceil(width);
  let lowerBoundWidth = 1;

  while (upperBoundWidth - lowerBoundWidth > 1) {
    const middleWidth = Math.floor((lowerBoundWidth + upperBoundWidth) / 2);
    const middleHeight = heightFor(middleWidth);

    if (
      middleWidth <= maxTargetPx &&
      nTokensForImg(middleWidth, middleHeight, pxPerToken) <= maxTargetTokens
    ) {
      lowerBoundWidth = middleWidth;
    } else {
      upperBoundWidth = middleWidth;
    }
  }

  return [lowerBoundWidth, heightFor(lowerBoundWidth)];
}

View File

@@ -0,0 +1,69 @@
export type {
ComputerExecutor,
DisplayGeometry,
FrontmostApp,
InstalledApp,
ResolvePrepareCaptureResult,
RunningApp,
ScreenshotResult,
} from "./executor.js";
export type {
AppGrant,
CuAppPermTier,
ComputerUseHostAdapter,
ComputerUseOverrides,
ComputerUseSessionContext,
CoordinateMode,
CuGrantFlags,
CuPermissionRequest,
CuPermissionResponse,
CuSubGates,
CuTeachPermissionRequest,
Logger,
ResolvedAppRequest,
ScreenshotDims,
TeachStepRequest,
TeachStepResult,
} from "./types.js";
export { DEFAULT_GRANT_FLAGS } from "./types.js";
export {
SENTINEL_BUNDLE_IDS,
getSentinelCategory,
} from "./sentinelApps.js";
export type { SentinelCategory } from "./sentinelApps.js";
export {
categoryToTier,
getDefaultTierForApp,
getDeniedCategory,
getDeniedCategoryByDisplayName,
getDeniedCategoryForApp,
isPolicyDenied,
} from "./deniedApps.js";
export type { DeniedCategory } from "./deniedApps.js";
export { isSystemKeyCombo, normalizeKeySequence } from "./keyBlocklist.js";
export { ALL_SUB_GATES_OFF, ALL_SUB_GATES_ON } from "./subGates.js";
export { API_RESIZE_PARAMS, targetImageSize } from "./imageResize.js";
export type { ResizeParams } from "./imageResize.js";
export { defersLockAcquire, handleToolCall } from "./toolCalls.js";
export type {
CuCallTelemetry,
CuCallToolResult,
CuErrorKind,
} from "./toolCalls.js";
export { bindSessionContext, createComputerUseMcpServer } from "./mcpServer.js";
export { buildComputerUseTools } from "./tools.js";
export {
comparePixelAtLocation,
validateClickTarget,
} from "./pixelCompare.js";
export type { CropRawPatchFn, PixelCompareResult } from "./pixelCompare.js";

View File

@@ -0,0 +1,153 @@
/**
* Key combos that cross app boundaries or terminate processes. Gated behind
* the `systemKeyCombos` grant flag. When that flag is off, the `key` tool
* rejects these and returns a tool error telling the model to request the
* flag; all other combos work normally.
*
* Matching is canonicalized: every modifier alias the Rust executor accepts
* collapses to one canonical name. Without this, `command+q` / `meta+q` /
* `cmd+alt+escape` bypass the gate — see keyBlocklist.test.ts for the three
* bypass forms and the Rust parity check that catches future alias drift.
*/
/**
 * Every modifier alias enigo_wrap.rs accepts (two copies: :351-359, :564-572),
 * mapped to one canonical per Key:: variant. Left/right variants collapse —
 * the blocklist doesn't distinguish which Ctrl.
 *
 * Canonical names are Rust's own variant names lowercased. Blocklist entries
 * below use ONLY these. "meta" reads odd for Cmd+Q but it's honest: Rust
 * sends Key::Meta, which is Cmd on darwin and Win on win32.
 *
 * This is a plain object, so it also inherits `Object.prototype` members
 * (`constructor`, `__proto__`, …). Look aliases up through
 * `canonicalModifierFor`, never by bare indexing with caller-supplied text.
 */
const CANONICAL_MODIFIER: Readonly<Record<string, string>> = {
  // Key::Meta — "meta"|"super"|"command"|"cmd"|"windows"|"win"
  meta: "meta",
  super: "meta",
  command: "meta",
  cmd: "meta",
  windows: "meta",
  win: "meta",
  // Key::Control + LControl + RControl
  ctrl: "ctrl",
  control: "ctrl",
  lctrl: "ctrl",
  lcontrol: "ctrl",
  rctrl: "ctrl",
  rcontrol: "ctrl",
  // Key::Shift + LShift + RShift
  shift: "shift",
  lshift: "shift",
  rshift: "shift",
  // Key::Alt and Key::Option — distinct Rust variants but same keycode on
  // darwin (kVK_Option). Collapse: cmd+alt+escape and cmd+option+escape
  // both Force Quit.
  alt: "alt",
  option: "alt",
};

/** Sort order for canonicals. ctrl < alt < shift < meta. */
const MODIFIER_ORDER = ["ctrl", "alt", "shift", "meta"];

/**
 * Canonical-form entries only. Every modifier must be a CANONICAL_MODIFIER
 * *value* (not key), modifiers must be in MODIFIER_ORDER, non-modifier last.
 * The self-consistency test enforces this.
 */
const BLOCKED_DARWIN = new Set([
  "meta+q", // Cmd+Q — quit frontmost app
  "shift+meta+q", // Cmd+Shift+Q — log out
  "alt+meta+escape", // Cmd+Option+Esc — Force Quit dialog
  "meta+tab", // Cmd+Tab — app switcher
  "meta+space", // Cmd+Space — Spotlight
  "ctrl+meta+q", // Ctrl+Cmd+Q — lock screen
]);

/** Windows counterpart of `BLOCKED_DARWIN`; same canonical-form rules. */
const BLOCKED_WIN32 = new Set([
  "ctrl+alt+delete", // Secure Attention Sequence
  "alt+f4", // close window
  "alt+tab", // window switcher
  "meta+l", // Win+L — lock
  "meta+d", // Win+D — show desktop
]);

/**
 * Canonical modifier name for `token`, or `undefined` if it is not a known
 * alias. Only OWN properties of the alias table count: a bare
 * `CANONICAL_MODIFIER[token]` would resolve tokens such as "constructor" or
 * "__proto__" to inherited `Object.prototype` members and misclassify them
 * as modifiers.
 */
function canonicalModifierFor(token: string): string | undefined {
  return Object.prototype.hasOwnProperty.call(CANONICAL_MODIFIER, token)
    ? CANONICAL_MODIFIER[token]
    : undefined;
}

/**
 * Partition into sorted-canonical modifiers and non-modifier keys.
 * Shared by normalizeKeySequence (join for display) and isSystemKeyCombo
 * (check mods+each-key to catch the cmd+q+a suffix bypass).
 *
 * The input is lowercased, split on "+", and each part trimmed; empty parts
 * are dropped. Non-modifier keys keep their input order.
 */
function partitionKeys(seq: string): { mods: string[]; keys: string[] } {
  const parts = seq
    .toLowerCase()
    .split("+")
    .map((p) => p.trim())
    .filter(Boolean);

  const mods: string[] = [];
  const keys: string[] = [];
  for (const part of parts) {
    const canonical = canonicalModifierFor(part);
    if (canonical !== undefined) {
      mods.push(canonical);
    } else {
      keys.push(part);
    }
  }

  // Dedupe: "cmd+command+q" → "meta+q", not "meta+meta+q".
  const uniqueMods = [...new Set(mods)];
  uniqueMods.sort(
    (a, b) => MODIFIER_ORDER.indexOf(a) - MODIFIER_ORDER.indexOf(b),
  );
  return { mods: uniqueMods, keys };
}

/**
 * Normalize "Cmd + Shift + Q" → "shift+meta+q": lowercase, trim, alias →
 * canonical, dedupe, sort modifiers, non-modifiers last.
 */
export function normalizeKeySequence(seq: string): string {
  const { mods, keys } = partitionKeys(seq);
  return [...mods, ...keys].join("+");
}

/**
 * True if the sequence would fire a blocked OS shortcut.
 *
 * Checks mods + EACH non-modifier key individually, not just the full
 * joined string. `cmd+q+a` → Rust presses Cmd, then Q (Cmd+Q fires here),
 * then A. Exact-match against "meta+q+a" misses; checking "meta+q" and
 * "meta+a" separately catches the Q. The same holds when the trailing token
 * is not a real key at all (`cmd+q+constructor`): it is treated as an
 * ordinary key, so "meta+q" is still checked.
 *
 * Modifiers-only sequences ("cmd+shift") are checked as-is — no key to
 * pair with, and no blocklist entry is modifier-only, so this is a no-op
 * that falls through to false. Covers the click-modifier case where
 * `left_click(text="cmd")` is legitimate.
 *
 * @param seq - "+"-separated key sequence as supplied by the caller.
 * @param platform - Selects which platform's blocklist applies.
 * @returns `true` if the modifiers combined with any single key are blocked.
 */
export function isSystemKeyCombo(
  seq: string,
  platform: "darwin" | "win32",
): boolean {
  const blocklist = platform === "darwin" ? BLOCKED_DARWIN : BLOCKED_WIN32;
  const { mods, keys } = partitionKeys(seq);
  const joinedMods = mods.join("+");

  // No non-modifier keys (e.g. "cmd+shift" as click-modifiers) — check the
  // whole thing. Never matches (no blocklist entry is modifier-only) but
  // keeps the contract simple: every call reaches a .has().
  if (keys.length === 0) {
    return blocklist.has(joinedMods);
  }

  // mods + each key. Any hit blocks the whole sequence.
  const prefix = mods.length > 0 ? joinedMods + "+" : "";
  return keys.some((key) => blocklist.has(prefix + key));
}

/** Module-private tables gathered under one export (named for test use). */
export const _test = {
  CANONICAL_MODIFIER,
  BLOCKED_DARWIN,
  BLOCKED_WIN32,
  MODIFIER_ORDER,
};

View File

@@ -0,0 +1,313 @@
/**
* MCP server factory + session-context binder.
*
* Two entry points:
*
* `bindSessionContext` — the wrapper closure. Takes a `ComputerUseSessionContext`
* (getters + callbacks backed by host session state), returns a dispatcher.
* Reusable by both the MCP CallTool handler here AND Cowork's
* `InternalServerDefinition.handleToolCall` (which doesn't go through MCP).
* This replaces the duplicated wrapper closures in apps/desktop/…/serverDef.ts
* and the Claude Code CLI's CU host wrapper — both did the same thing: build `ComputerUseOverrides`
* fresh from getters, call `handleToolCall`, stash screenshot, merge permissions.
*
* `createComputerUseMcpServer` — the Server object. When `context` is provided,
* the CallTool handler is real (uses `bindSessionContext`). When not, it's the
* legacy stub that returns a not-wired error. The tool-schema ListTools handler
* is the same either way.
*/
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
import {
CallToolRequestSchema,
ListToolsRequestSchema,
} from "@modelcontextprotocol/sdk/types.js";
import type { ScreenshotResult } from "./executor.js";
import type { CuCallToolResult } from "./toolCalls.js";
import {
defersLockAcquire,
handleToolCall,
resetMouseButtonHeld,
} from "./toolCalls.js";
import { buildComputerUseTools } from "./tools.js";
import type {
AppGrant,
ComputerUseHostAdapter,
ComputerUseOverrides,
ComputerUseSessionContext,
CoordinateMode,
CuGrantFlags,
CuPermissionResponse,
} from "./types.js";
import { DEFAULT_GRANT_FLAGS } from "./types.js";
/**
 * Fallback text returned to the model when another session holds the
 * computer-use lock and the host supplies no `formatLockHeldMessage`.
 */
const DEFAULT_LOCK_HELD_MESSAGE = [
  "Another Claude session is currently using the computer.",
  "Wait for that session to finish, or find a non-computer-use approach.",
].join(" ");
/**
 * Dedupe `granted` into `existing` on bundleId, spread truthy-only flags over
 * defaults+existing. Truthy-only: a subsequent `request_access` that doesn't
 * request clipboard can't revoke an earlier clipboard grant — revocation lives
 * in a Settings page, not here.
 *
 * Deduplication covers both directions: a grant whose bundleId is already in
 * `existing` is dropped (the existing entry wins), and when `response.granted`
 * itself carries the same bundleId more than once only the first occurrence
 * is appended.
 *
 * Same merge both hosts implemented independently today.
 *
 * @param existing - Grants already held by the session; never mutated.
 * @param existingFlags - Grant flags already held by the session.
 * @param response - Result of the latest permission request.
 * @returns A new app list (existing first, then new grants in response
 *   order) and the merged flags.
 */
function mergePermissionResponse(
  existing: readonly AppGrant[],
  existingFlags: CuGrantFlags,
  response: CuPermissionResponse,
): { apps: AppGrant[]; flags: CuGrantFlags } {
  const seen = new Set(existing.map((a) => a.bundleId));
  const apps = [...existing];
  for (const grant of response.granted) {
    if (seen.has(grant.bundleId)) {
      continue;
    }
    // Record the ID as we go so a repeat later in this response is skipped.
    seen.add(grant.bundleId);
    apps.push(grant);
  }

  // Only flags explicitly set to `true` may override; `false`/absent values
  // must not clear a flag granted earlier.
  const truthyFlags = Object.fromEntries(
    Object.entries(response.flags).filter(([, v]) => v === true),
  );
  const flags: CuGrantFlags = {
    ...DEFAULT_GRANT_FLAGS,
    ...existingFlags,
    ...truthyFlags,
  };
  return { apps, flags };
}
/**
* Bind session state to a reusable dispatcher. Each call of the returned
* function runs: async lock gate → build overrides fresh → `handleToolCall`
* → stash screenshot.
*
* The dispatcher returns the full result, including the piggybacked
* `screenshot` and `telemetry` fields. Stripping them before the result goes
* out over MCP is the caller's job (`createComputerUseMcpServer` does this).
*
* The last-screenshot blob is held in a closure cell here (not on `ctx`), so
* hosts don't need to guarantee `ctx` object identity across calls — they just
* need to hold onto the returned dispatcher. Cowork caches per
* `InternalServerContext` in a WeakMap; the CLI host constructs once at server creation.
*
* @param adapter Host adapter. Supplies `logger` and `serverName`, and is
* forwarded to `handleToolCall` on every call.
* @param coordinateMode Coordinate convention, forwarded unchanged in the
* overrides on every call.
* @param ctx Per-session getters and callbacks. Getters are re-read on every
* call so the overrides always reflect current session state.
* @returns Dispatcher `(name, args)` that resolves to the tool result, or to
* an `isError` result with `telemetry.error_kind: "cu_lock_held"` when another
* session holds the lock.
*/
export function bindSessionContext(
adapter: ComputerUseHostAdapter,
coordinateMode: CoordinateMode,
ctx: ComputerUseSessionContext,
): (name: string, args: unknown) => Promise<CuCallToolResult> {
const { logger, serverName } = adapter;
// Screenshot blob persists here across calls — NOT on `ctx`. Hosts hold
// onto the returned dispatcher; that's the identity that matters.
let lastScreenshot: ScreenshotResult | undefined;
// Wrap the host's permission callback: merge the response into the current
// allowlist + grant flags, report the merged state via
// `onAllowedAppsChanged`, then hand the response back unchanged.
const wrapPermission = ctx.onPermissionRequest
? async (
req: Parameters<NonNullable<typeof ctx.onPermissionRequest>>[0],
signal: AbortSignal,
): Promise<CuPermissionResponse> => {
const response = await ctx.onPermissionRequest!(req, signal);
const { apps, flags } = mergePermissionResponse(
ctx.getAllowedApps(),
ctx.getGrantFlags(),
response,
);
logger.debug(
`[${serverName}] permission result: granted=${response.granted.length} denied=${response.denied.length}`,
);
ctx.onAllowedAppsChanged?.(apps, flags);
return response;
}
: undefined;
// Same wrapping for the teach-mode permission callback, except only the
// merged app list is used; grant flags are re-published as they were.
const wrapTeachPermission = ctx.onTeachPermissionRequest
? async (
req: Parameters<NonNullable<typeof ctx.onTeachPermissionRequest>>[0],
signal: AbortSignal,
): Promise<CuPermissionResponse> => {
const response = await ctx.onTeachPermissionRequest!(req, signal);
logger.debug(
`[${serverName}] teach permission result: granted=${response.granted.length} denied=${response.denied.length}`,
);
// Teach doesn't request grant flags — preserve existing.
const { apps } = mergePermissionResponse(
ctx.getAllowedApps(),
ctx.getGrantFlags(),
response,
);
ctx.onAllowedAppsChanged?.(apps, {
...DEFAULT_GRANT_FLAGS,
...ctx.getGrantFlags(),
});
return response;
}
: undefined;
return async (name, args) => {
// ─── Async lock gate ─────────────────────────────────────────────────
// Replaces the sync Gate-3 in `handleToolCall` — we pass
// `checkCuLock: undefined` below so it no-ops. Hosts with
// cross-process locks (O_EXCL file) await the real primitive here
// instead of pre-computing + feeding a fake sync result.
if (ctx.checkCuLock) {
const lock = await ctx.checkCuLock();
// Someone else holds the lock → refuse without touching any state.
if (lock.holder !== undefined && !lock.isSelf) {
const text =
ctx.formatLockHeldMessage?.(lock.holder) ?? DEFAULT_LOCK_HELD_MESSAGE;
return {
content: [{ type: "text", text }],
isError: true,
telemetry: { error_kind: "cu_lock_held" },
};
}
// Lock is free and this tool does not defer acquisition → take it now.
if (lock.holder === undefined && !defersLockAcquire(name)) {
await ctx.acquireCuLock?.();
// Re-check: the awaits above yield the microtask queue, so another
// session's check+acquire can interleave with ours. Hosts where
// acquire is a no-op when already held (Cowork's CuLockManager) give
// no signal that we lost — verify we're now the holder before
// proceeding. The CLI's O_EXCL file lock would surface this as a throw from
// acquire instead; this re-check is a belt-and-suspenders for that
// path too.
const recheck = await ctx.checkCuLock();
if (recheck.holder !== undefined && !recheck.isSelf) {
const text =
ctx.formatLockHeldMessage?.(recheck.holder) ??
DEFAULT_LOCK_HELD_MESSAGE;
return {
content: [{ type: "text", text }],
isError: true,
telemetry: { error_kind: "cu_lock_held" },
};
}
// Fresh holder → any prior session's mouseButtonHeld is stale.
// Mirrors what Gate-3 does on the acquire branch. After the
// re-check so we only clear module state when we actually won.
resetMouseButtonHeld();
}
}
// ─── Build overrides fresh ───────────────────────────────────────────
// Blob-first; dims-fallback with base64:"" when the closure cell is
// unset (cross-respawn). scaleCoord reads dims; pixelCompare sees "" →
// isEmpty → skip.
const dimsFallback = lastScreenshot
? undefined
: ctx.getLastScreenshotDims?.();
// Per-call AbortController for dialog dismissal. Aborted in `finally` —
// if handleToolCall finishes (MCP timeout, throw) before the user
// answers, the host's dialog handler sees the abort and tears down.
const dialogAbort = new AbortController();
const overrides: ComputerUseOverrides = {
allowedApps: [...ctx.getAllowedApps()],
grantFlags: ctx.getGrantFlags(),
userDeniedBundleIds: ctx.getUserDeniedBundleIds(),
coordinateMode,
selectedDisplayId: ctx.getSelectedDisplayId(),
displayPinnedByModel: ctx.getDisplayPinnedByModel?.(),
displayResolvedForApps: ctx.getDisplayResolvedForApps?.(),
lastScreenshot:
lastScreenshot ??
(dimsFallback ? { ...dimsFallback, base64: "" } : undefined),
onPermissionRequest: wrapPermission
? (req) => wrapPermission(req, dialogAbort.signal)
: undefined,
onTeachPermissionRequest: wrapTeachPermission
? (req) => wrapTeachPermission(req, dialogAbort.signal)
: undefined,
onAppsHidden: ctx.onAppsHidden,
getClipboardStash: ctx.getClipboardStash,
onClipboardStashChanged: ctx.onClipboardStashChanged,
onResolvedDisplayUpdated: ctx.onResolvedDisplayUpdated,
onDisplayPinned: ctx.onDisplayPinned,
onDisplayResolvedForApps: ctx.onDisplayResolvedForApps,
onTeachModeActivated: ctx.onTeachModeActivated,
onTeachStep: ctx.onTeachStep,
onTeachWorking: ctx.onTeachWorking,
getTeachModeActive: ctx.getTeachModeActive,
// Undefined → handleToolCall's sync Gate-3 no-ops. The async gate
// above already ran.
checkCuLock: undefined,
acquireCuLock: undefined,
isAborted: ctx.isAborted,
};
logger.debug(
`[${serverName}] tool=${name} allowedApps=${overrides.allowedApps.length} coordMode=${coordinateMode}`,
);
// ─── Dispatch ────────────────────────────────────────────────────────
try {
const result = await handleToolCall(adapter, name, args, overrides);
if (result.screenshot) {
// Keep the full blob in the closure cell; only the dims (everything
// except `base64`) are logged and reported to the host.
lastScreenshot = result.screenshot;
const { base64: _blob, ...dims } = result.screenshot;
logger.debug(`[${serverName}] screenshot dims: ${JSON.stringify(dims)}`);
ctx.onScreenshotCaptured?.(dims);
}
return result;
} finally {
dialogAbort.abort();
}
};
}
/**
 * Build an MCP `Server` that exposes the computer-use tools.
 *
 * Tool listing is always wired: it returns an empty list while the adapter
 * reports itself disabled, and the list built once at construction otherwise.
 *
 * Tool calls depend on `context`:
 *  - with a session context, calls go through `bindSessionContext` and the
 *    piggybacked `screenshot` / `telemetry` fields are stripped from the
 *    result before it is returned over MCP;
 *  - without one, a stub handler logs a warning and returns an error result,
 *    so a wiring regression fails loudly instead of silently.
 *
 * @param adapter Host adapter (server name, logger, executor capabilities).
 * @param coordinateMode Coordinate convention used for both tool descriptions
 *   and dispatch.
 * @param context Optional per-session context to bind.
 * @returns The configured server.
 */
export function createComputerUseMcpServer(
  adapter: ComputerUseHostAdapter,
  coordinateMode: CoordinateMode,
  context?: ComputerUseSessionContext,
): Server {
  const { serverName, logger } = adapter;
  const server = new Server(
    { name: serverName, version: "0.1.3" },
    { capabilities: { tools: {}, logging: {} } },
  );

  const toolList = buildComputerUseTools(
    adapter.executor.capabilities,
    coordinateMode,
  );
  server.setRequestHandler(ListToolsRequestSchema, async () => {
    if (adapter.isDisabled()) {
      return { tools: [] };
    }
    return { tools: toolList };
  });

  if (!context) {
    // Legacy: no context → stub handler. Reached only if something calls the
    // server over MCP transport WITHOUT going through a binder (a wiring
    // regression). Clear error instead of silent failure.
    server.setRequestHandler(
      CallToolRequestSchema,
      async (request): Promise<CallToolResult> => {
        logger.warn(
          `[${serverName}] tool call "${request.params.name}" reached the stub handler — no session context bound. Per-session state unavailable.`,
        );
        return {
          content: [
            {
              type: "text",
              text: "This computer-use server instance is not wired to a session. Per-session app permissions are not available on this code path.",
            },
          ],
          isError: true,
        };
      },
    );
    return server;
  }

  const dispatch = bindSessionContext(adapter, coordinateMode, context);
  server.setRequestHandler(
    CallToolRequestSchema,
    async (request): Promise<CallToolResult> => {
      const dispatched = await dispatch(
        request.params.name,
        request.params.arguments ?? {},
      );
      // Drop the host-only fields; everything else is the MCP result.
      const {
        screenshot: _screenshot,
        telemetry: _telemetry,
        ...mcpResult
      } = dispatched;
      return mcpResult;
    },
  );
  return server;
}

View File

@@ -0,0 +1,171 @@
/**
* Staleness guard ported from the Vercept acquisition.
*
* Compares the model's last-seen screenshot against a fresh-right-now
* screenshot at the click target, so the model never clicks pixels it hasn't
* seen. If the 9×9 patch around the target differs, the click is aborted and
* the model is told to re-screenshot. This is NOT a popup detector.
*
* Semantics preserved exactly:
* - Skip on no `lastScreenshot` (cold start) — click proceeds.
* - Skip on any internal error (crop throws, screenshot fails, etc.) —
* click proceeds. Validation failure must never block the action.
* - 9×9 exact byte equality on raw pixel bytes. No fuzzing, no tolerance.
* - Compare in percentage coords so Retina scale doesn't matter.
*
* JPEG decode + crop is INJECTED via `ComputerUseHostAdapter.cropRawPatch`.
* The original used `sharp` (LGPL, native `.node` addon); we inject Electron's
* `nativeImage` (Chromium decoders, BSD, nothing to bundle) from the host, so
* this package never imports it — the crop is a function parameter.
*/
import type { ScreenshotResult } from "./executor.js";
import type { Logger } from "./types.js";
/**
* Injected by the host. See `ComputerUseHostAdapter.cropRawPatch`.
*
* Decodes the base64 JPEG in `jpegBase64` and returns the raw bytes of the
* `rect` region as a `Buffer`, or `null` when no patch can be produced (the
* callers in this file describe that case as a decode failure).
*
* NOTE(review): `rect` is presumably in image pixels, matching what
* `computeCropRect` produces — confirm against the host implementation.
*/
export type CropRawPatchFn = (
jpegBase64: string,
rect: { x: number; y: number; width: number; height: number },
) => Buffer | null;
/**
* Default side length, in pixels, of the square patch compared around the
* click target.
*
* 9×9 is empirically the sweet spot — large enough to catch a tooltip
* appearing, small enough to not false-positive on surrounding animation.
*/
const DEFAULT_GRID_SIZE = 9;
/** Outcome of `validateClickTarget`. */
export interface PixelCompareResult {
/** true → click may proceed. false → patch changed, abort the click. */
valid: boolean;
/** true → validation did not run (cold start, sub-gate off, or internal
* error). The caller MUST treat this identically to `valid: true`. */
skipped: boolean;
/** Populated when valid === false. Returned to the model verbatim. */
warning?: string;
}
/**
 * Tri-state outcome of comparing one patch across two screenshots.
 *
 *  - `"match"`       → both patches were produced and are byte-identical.
 *  - `"mismatch"`    → both patches were produced and differ.
 *  - `"unavailable"` → no comparison was possible (no usable crop rect, or
 *                      the injected crop returned null for either image).
 */
type PatchComparison = "match" | "mismatch" | "unavailable";

/**
 * Compute the crop rect for a patch centered on (xPercent, yPercent).
 *
 * Dimensions come from ScreenshotResult.width/height (physical pixels). Both
 * screenshots have the same dimensions (same display, consecutive captures),
 * so the rect is the same for both.
 *
 * @param imgW Image width in pixels.
 * @param imgH Image height in pixels.
 * @param xPercent Horizontal target, 0–100; values outside are clamped.
 * @param yPercent Vertical target, 0–100; values outside are clamped.
 * @param gridSize Desired side length of the square patch.
 * @returns The rect, clipped to the image bounds, or `null` when any input is
 *   non-finite, a dimension is zero, or the clipped rect would be empty.
 */
function computeCropRect(
  imgW: number,
  imgH: number,
  xPercent: number,
  yPercent: number,
  gridSize: number,
): { x: number; y: number; width: number; height: number } | null {
  // NaN/Infinity would otherwise slip through every comparison below and
  // produce a rect full of NaN for the host's crop to choke on.
  const inputs = [imgW, imgH, xPercent, yPercent, gridSize];
  if (!inputs.every((value) => Number.isFinite(value))) return null;
  if (!imgW || !imgH) return null;

  const clampedX = Math.max(0, Math.min(100, xPercent));
  const clampedY = Math.max(0, Math.min(100, yPercent));
  const centerX = Math.round((clampedX / 100.0) * imgW);
  const centerY = Math.round((clampedY / 100.0) * imgH);
  const halfGrid = Math.floor(gridSize / 2);

  // Clip to the image: the origin never goes negative and the extent never
  // runs past the right/bottom edge.
  const cropX = Math.max(0, centerX - halfGrid);
  const cropY = Math.max(0, centerY - halfGrid);
  const cropW = Math.min(gridSize, imgW - cropX);
  const cropH = Math.min(gridSize, imgH - cropY);
  if (cropW <= 0 || cropH <= 0) return null;

  return { x: cropX, y: cropY, width: cropW, height: cropH };
}

/**
 * Crop the same patch out of both screenshots and classify the result.
 *
 * Kept separate from `comparePixelAtLocation` so that callers which need to
 * tell "pixels differ" apart from "could not compare" can do so.
 */
function comparePatch(
  crop: CropRawPatchFn,
  lastScreenshot: ScreenshotResult,
  freshScreenshot: ScreenshotResult,
  xPercent: number,
  yPercent: number,
  gridSize: number,
): PatchComparison {
  // Both screenshots are of the same display — use the fresh one's
  // dimensions (less likely to be stale than last's).
  const rect = computeCropRect(
    freshScreenshot.width,
    freshScreenshot.height,
    xPercent,
    yPercent,
    gridSize,
  );
  if (!rect) return "unavailable";

  const lastPatch = crop(lastScreenshot.base64, rect);
  const freshPatch = crop(freshScreenshot.base64, rect);
  if (!lastPatch || !freshPatch) return "unavailable";

  // Direct buffer equality. Note: nativeImage.toBitmap() gives BGRA, sharp's
  // .raw() gave RGB. Doesn't matter — we're comparing two same-format
  // buffers for equality.
  return lastPatch.equals(freshPatch) ? "match" : "mismatch";
}

/**
 * Compare the same patch location between two screenshots.
 *
 * @param crop Host-injected decode + crop function.
 * @param lastScreenshot The screenshot the model last saw.
 * @param freshScreenshot A screenshot captured just now.
 * @param xPercent Horizontal target, 0–100.
 * @param yPercent Vertical target, 0–100.
 * @param gridSize Side length of the compared patch.
 * @returns true when the raw pixel bytes are identical. false on any
 *   difference, and also when no comparison could be made (no crop rect, or
 *   the crop returned null). Callers that must not block on the latter should
 *   go through `validateClickTarget`, which treats it as skipped.
 */
export function comparePixelAtLocation(
  crop: CropRawPatchFn,
  lastScreenshot: ScreenshotResult,
  freshScreenshot: ScreenshotResult,
  xPercent: number,
  yPercent: number,
  gridSize: number = DEFAULT_GRID_SIZE,
): boolean {
  const outcome = comparePatch(
    crop,
    lastScreenshot,
    freshScreenshot,
    xPercent,
    yPercent,
    gridSize,
  );
  return outcome === "match";
}

/**
 * Click-target validation ported from the Vercept acquisition, with the
 * fresh-screenshot capture delegated to the caller (we don't have a global
 * `SystemActions.takeScreenshot()` — the executor is injected).
 *
 * Skip conditions (any of these → `{ valid: true, skipped: true }`):
 *   - `lastScreenshot` is undefined (cold start).
 *   - `takeFreshScreenshot()` throws or returns null.
 *   - No crop rect can be computed, or the injected crop function returns
 *     null (decode failure, e.g. an empty `base64`).
 *   - Any other exception.
 *
 * The caller decides whether to invoke this at all (sub-gate check lives
 * in toolCalls.ts, not here).
 *
 * @param crop Host-injected decode + crop function.
 * @param lastScreenshot The screenshot the model last saw, if any.
 * @param xPercent Horizontal click target, 0–100.
 * @param yPercent Vertical click target, 0–100.
 * @param takeFreshScreenshot Captures the current screen; may return null.
 * @param logger Receives a debug line whenever validation is skipped because
 *   of an error or an unavailable patch.
 * @param gridSize Side length of the compared patch.
 * @returns `valid: false` only when both patches were produced and differ.
 */
export async function validateClickTarget(
  crop: CropRawPatchFn,
  lastScreenshot: ScreenshotResult | undefined,
  xPercent: number,
  yPercent: number,
  takeFreshScreenshot: () => Promise<ScreenshotResult | null>,
  logger: Logger,
  gridSize: number = DEFAULT_GRID_SIZE,
): Promise<PixelCompareResult> {
  if (!lastScreenshot) {
    return { valid: true, skipped: true };
  }
  try {
    const fresh = await takeFreshScreenshot();
    if (!fresh) {
      return { valid: true, skipped: true };
    }

    const outcome = comparePatch(
      crop,
      lastScreenshot,
      fresh,
      xPercent,
      yPercent,
      gridSize,
    );
    if (outcome === "unavailable") {
      // Could not compare ≠ pixels changed. Validation failure must never
      // block the click, so this is a skip, not a rejection.
      logger.debug("[pixelCompare] patch unavailable, skipping");
      return { valid: true, skipped: true };
    }
    if (outcome === "match") {
      return { valid: true, skipped: false };
    }
    return {
      valid: false,
      skipped: false,
      warning:
        "Screen content at the target location changed since the last screenshot. Take a new screenshot before clicking.",
    };
  } catch (err) {
    // Skip validation on technical errors, execute action anyway.
    // Validation failure must never block the click.
    logger.debug("[pixelCompare] validation error, skipping", err);
    return { valid: true, skipped: true };
  }
}

View File

@@ -0,0 +1,43 @@
/**
* Bundle IDs that are escalations-in-disguise. The approval UI shows a warning
* badge for these; they are NOT blocked. Power users may legitimately want the
* model controlling a terminal.
*
* Imported by the renderer via the `./sentinelApps` subpath (package.json
* `exports`), which keeps Next.js from reaching index.ts → mcpServer.ts →
* @modelcontextprotocol/sdk (devDep, would fail module resolution). Keep
* this file import-free so the subpath stays clean.
*/
/** Bundle IDs of apps that can execute arbitrary shell commands. */
const SHELL_ACCESS_BUNDLE_IDS = new Set<string>([
  "com.apple.Terminal",
  "com.googlecode.iterm2",
  "com.microsoft.VSCode",
  "dev.warp.Warp-Stable",
  "com.github.wez.wezterm",
  "io.alacritty",
  "net.kovidgoyal.kitty",
  "com.jetbrains.intellij",
  "com.jetbrains.pycharm",
]);

/** Finder in the allowlist ≈ browse + open-any-file. */
const FILESYSTEM_ACCESS_BUNDLE_IDS = new Set<string>(["com.apple.finder"]);

/** Bundle IDs of the system settings app. */
const SYSTEM_SETTINGS_BUNDLE_IDS = new Set<string>([
  "com.apple.systempreferences",
]);

/** Union of every sentinel set, in shell → filesystem → settings order. */
export const SENTINEL_BUNDLE_IDS: ReadonlySet<string> = new Set([
  ...SHELL_ACCESS_BUNDLE_IDS,
  ...FILESYSTEM_ACCESS_BUNDLE_IDS,
  ...SYSTEM_SETTINGS_BUNDLE_IDS,
]);

export type SentinelCategory = "shell" | "filesystem" | "system_settings";

/** Lookup table scanned in order by `getSentinelCategory`. */
const SENTINEL_CATEGORY_TABLE: ReadonlyArray<
  readonly [ReadonlySet<string>, SentinelCategory]
> = [
  [SHELL_ACCESS_BUNDLE_IDS, "shell"],
  [FILESYSTEM_ACCESS_BUNDLE_IDS, "filesystem"],
  [SYSTEM_SETTINGS_BUNDLE_IDS, "system_settings"],
];

/**
 * Classify a bundle ID as a sentinel app.
 *
 * @param bundleId Bundle identifier to look up. Matching is exact and
 *   case-sensitive.
 * @returns The sentinel category, or `null` when the ID is not a sentinel.
 */
export function getSentinelCategory(bundleId: string): SentinelCategory | null {
  for (const [bundleIds, category] of SENTINEL_CATEGORY_TABLE) {
    if (bundleIds.has(bundleId)) {
      return category;
    }
  }
  return null;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,706 @@
/**
* MCP tool schemas for the computer-use server. Mirrors
* claude-for-chrome-mcp/src/browserTools.ts in shape (plain `Tool`-shaped
* object literals, no zod).
*
* Coordinate descriptions are baked in at tool-list build time from the
* `chicago_coordinate_mode` gate. The model sees exactly ONE coordinate
* convention in the param descriptions and never learns the other exists.
* The host (`serverDef.ts`) reads the same frozen gate value for
* `scaleCoord` — both must agree or clicks land in the wrong space.
*/
import type { Tool } from "@modelcontextprotocol/sdk/types.js";
import type { CoordinateMode } from "./types.js";
// See packages/desktop/computer-use-mcp/COORDINATES.md before touching any
// model-facing coordinate text. Chrome's browserTools.ts:143 is the reference
// phrasing — "pixels from the left edge", no geometry, no number to do math with.
/**
 * Model-facing descriptions of the x and y coordinate parameters, one pair
 * per coordinate mode. Only the pair for the active mode is spliced into the
 * tool schemas.
 *
 * The percentage range must read "0.0–100.0"; without the dash the text
 * collapses to the single number "0.0100.0" and no longer describes a range.
 */
const COORD_DESC: Record<CoordinateMode, { x: string; y: string }> = {
  pixels: {
    x: "Horizontal pixel position read directly from the most recent screenshot image, measured from the left edge. The server handles all scaling.",
    y: "Vertical pixel position read directly from the most recent screenshot image, measured from the top edge. The server handles all scaling.",
  },
  normalized_0_100: {
    x: "Horizontal position as a percentage of screen width, 0.0–100.0 (0 = left edge, 100 = right edge).",
    y: "Vertical position as a percentage of screen height, 0.0–100.0 (0 = top edge, 100 = bottom edge).",
  },
};
/**
* Model-facing sentence describing the frontmost-app gate. Spliced into the
* description of every tool that sends input (clicks, typing, keys, scroll,
* drag, mouse move/down/up, batch).
*/
const FRONTMOST_GATE_DESC =
"The frontmost application must be in the session allowlist at the time of this call, or this tool returns an error and does nothing.";
/**
 * Item schema for the `actions` array in `computer_batch`, `teach_step`, and
 * `teach_batch`. All three dispatch through the same `dispatchAction` path
 * with the same validation — keep this enum in sync with `BATCHABLE_ACTIONS`
 * in toolCalls.ts.
 *
 * Only `action` is required; which of the other properties apply depends on
 * the action, as spelled out in each property's description.
 */
const BATCH_ACTION_ITEM_SCHEMA = {
  type: "object",
  properties: {
    action: {
      type: "string",
      enum: [
        "key",
        "type",
        "mouse_move",
        "left_click",
        "left_click_drag",
        "right_click",
        "middle_click",
        "double_click",
        "triple_click",
        "scroll",
        "hold_key",
        "screenshot",
        "cursor_position",
        "left_mouse_down",
        "left_mouse_up",
        "wait",
      ],
      description: "The action to perform.",
    },
    coordinate: {
      type: "array",
      items: { type: "number" },
      minItems: 2,
      maxItems: 2,
      description:
        "(x, y) for click/mouse_move/scroll/left_click_drag end point.",
    },
    start_coordinate: {
      type: "array",
      items: { type: "number" },
      minItems: 2,
      maxItems: 2,
      description:
        "(x, y) drag start — left_click_drag only. Omit to drag from current cursor.",
    },
    text: {
      type: "string",
      description:
        "For type: the text. For key/hold_key: the chord string. For click/scroll: modifier keys to hold.",
    },
    scroll_direction: {
      type: "string",
      enum: ["up", "down", "left", "right"],
    },
    scroll_amount: { type: "integer", minimum: 0, maximum: 100 },
    duration: {
      type: "number",
      // Range, not a single number: the dash between 0 and 100 is required.
      description: "Seconds (0–100). For hold_key/wait.",
    },
    repeat: {
      type: "integer",
      minimum: 1,
      maximum: 100,
      description: "For key: repeat count.",
    },
  },
  required: ["action"],
};
/**
 * Build the tool list. Parameterized by capabilities and coordinate mode so
 * descriptions are honest and unambiguous (plan §1 — "Unfiltered + honest").
 *
 * `coordinateMode` MUST match what the host passes to `scaleCoord` at
 * tool-call time. Both should read the same frozen-at-load gate constant.
 *
 * `installedAppNames` — optional pre-sanitized list of app display names to
 * enumerate in the `request_access` description. The caller is responsible
 * for sanitization (length cap, character allowlist, sort, count cap) —
 * this function just splices the list into the description verbatim. Omit
 * to fall back to the generic "display names or bundle IDs" wording.
 *
 * @param caps Executor capabilities. `screenshotFiltering` selects the
 *   screenshot description; `teachMode` appends the teach-mode tools.
 *   `platform` is not read by this function.
 * @param coordinateMode Selects which coordinate phrasing is baked into the
 *   parameter descriptions.
 * @param installedAppNames Optional app names appended to the `apps`
 *   parameter descriptions.
 * @returns The tool definitions, in a fixed order, with teach-mode tools last.
 */
export function buildComputerUseTools(
  caps: {
    screenshotFiltering: "native" | "none";
    platform: "darwin" | "win32";
    /** Include request_teach_access + teach_step. Read once at server construction. */
    teachMode?: boolean;
  },
  coordinateMode: CoordinateMode,
  installedAppNames?: string[],
): Tool[] {
  // NOTE(review): only `coord.x` is spliced into the (x, y) tuple
  // descriptions below; `coord.y` is never read in this function — confirm
  // that is intentional before changing any model-facing coordinate text.
  const coord = COORD_DESC[coordinateMode];

  // Shared hint suffix for BOTH request_access and request_teach_access —
  // they use the same resolveRequestedApps path, so the model should get
  // the same enumeration for both.
  const installedAppsHint =
    installedAppNames && installedAppNames.length > 0
      ? ` Available applications on this machine: ${installedAppNames.join(", ")}.`
      : "";

  // `[x, y]` tuple — param shape for all click/move/scroll tools.
  const coordinateTuple = {
    type: "array",
    items: { type: "number" },
    minItems: 2,
    maxItems: 2,
    description: `(x, y): ${coord.x}`,
  };

  // Modifier hold during click. Shared across all 5 click variants.
  const clickModifierText = {
    type: "string",
    description:
      'Modifier keys to hold during the click (e.g. "shift", "ctrl+shift"). Supports the same syntax as the key tool.',
  };

  const screenshotDesc =
    caps.screenshotFiltering === "native"
      ? "Take a screenshot of the primary display. Applications not in the session allowlist are excluded at the compositor level — only granted apps and the desktop are visible."
      : "Take a screenshot of the primary display. On this platform, screenshots are NOT filtered — all open windows are visible. Input actions targeting apps not in the session allowlist are rejected.";

  return [
    {
      name: "request_access",
      description:
        "Request user permission to control a set of applications for this session. Must be called before any other tool in this server. " +
        "The user sees a single dialog listing all requested apps and either allows the whole set or denies it. " +
        "Call this again mid-session to add more apps; previously granted apps remain granted. " +
        "Returns the granted apps, denied apps, and screenshot filtering capability.",
      inputSchema: {
        type: "object" as const,
        properties: {
          apps: {
            type: "array",
            items: { type: "string" },
            description:
              "Application display names (e.g. \"Slack\", \"Calendar\") or bundle identifiers (e.g. \"com.tinyspeck.slackmacgap\"). Display names are resolved case-insensitively against installed apps." +
              installedAppsHint,
          },
          reason: {
            type: "string",
            description:
              "One-sentence explanation shown to the user in the approval dialog. Explain the task, not the mechanism.",
          },
          clipboardRead: {
            type: "boolean",
            description:
              "Also request permission to read the user's clipboard (separate checkbox in the dialog).",
          },
          clipboardWrite: {
            type: "boolean",
            description:
              "Also request permission to write the user's clipboard. When granted, multi-line `type` calls use the clipboard fast path.",
          },
          systemKeyCombos: {
            type: "boolean",
            description:
              "Also request permission to send system-level key combos (quit app, switch app, lock screen). Without this, those specific combos are blocked.",
          },
        },
        required: ["apps", "reason"],
      },
    },
    {
      name: "screenshot",
      description:
        screenshotDesc +
        " Returns an error if the allowlist is empty. The returned image is what subsequent click coordinates are relative to.",
      inputSchema: {
        type: "object" as const,
        properties: {
          save_to_disk: {
            type: "boolean",
            description:
              "Save the image to disk so it can be attached to a message for the user. Returns the saved path in the tool result. Only set this when you intend to share the image — screenshots you're just looking at don't need saving.",
          },
        },
        required: [],
      },
    },
    {
      name: "zoom",
      description:
        "Take a higher-resolution screenshot of a specific region of the last full-screen screenshot. Use this liberally to inspect small text, button labels, or fine UI details that are hard to read in the downsampled full-screen image. " +
        "IMPORTANT: Coordinates in subsequent click calls always refer to the full-screen screenshot, never the zoomed image. This tool is read-only for inspecting detail.",
      inputSchema: {
        type: "object" as const,
        properties: {
          region: {
            type: "array",
            items: { type: "integer" },
            minItems: 4,
            maxItems: 4,
            description:
              "(x0, y0, x1, y1): Rectangle to zoom into, in the coordinate space of the most recent full-screen screenshot. x0,y0 = top-left, x1,y1 = bottom-right.",
          },
          save_to_disk: {
            type: "boolean",
            description:
              "Save the image to disk so it can be attached to a message for the user. Returns the saved path in the tool result. Only set this when you intend to share the image.",
          },
        },
        required: ["region"],
      },
    },
    {
      name: "left_click",
      description: `Left-click at the given coordinates. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },
    {
      name: "double_click",
      description: `Double-click at the given coordinates. Selects a word in most text editors. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },
    {
      name: "triple_click",
      description: `Triple-click at the given coordinates. Selects a line in most text editors. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },
    {
      name: "right_click",
      description: `Right-click at the given coordinates. Opens a context menu in most applications. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },
    {
      name: "middle_click",
      description: `Middle-click (scroll-wheel click) at the given coordinates. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },
    {
      name: "type",
      description: `Type text into whatever currently has keyboard focus. ${FRONTMOST_GATE_DESC} Newlines are supported. For keyboard shortcuts use \`key\` instead.`,
      inputSchema: {
        type: "object" as const,
        properties: {
          text: { type: "string", description: "Text to type." },
        },
        required: ["text"],
      },
    },
    {
      name: "key",
      description:
        `Press a key or key combination (e.g. "return", "escape", "cmd+a", "ctrl+shift+tab"). ${FRONTMOST_GATE_DESC} ` +
        "System-level combos (quit app, switch app, lock screen) require the `systemKeyCombos` grant — without it they return an error. All other combos work.",
      inputSchema: {
        type: "object" as const,
        properties: {
          text: {
            type: "string",
            description: 'Modifiers joined with "+", e.g. "cmd+shift+a".',
          },
          repeat: {
            type: "integer",
            minimum: 1,
            maximum: 100,
            description: "Number of times to repeat the key press. Default is 1.",
          },
        },
        required: ["text"],
      },
    },
    {
      name: "scroll",
      description: `Scroll at the given coordinates. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          scroll_direction: {
            type: "string",
            enum: ["up", "down", "left", "right"],
            description: "Direction to scroll.",
          },
          scroll_amount: {
            type: "integer",
            minimum: 0,
            maximum: 100,
            description: "Number of scroll ticks.",
          },
        },
        required: ["coordinate", "scroll_direction", "scroll_amount"],
      },
    },
    {
      name: "left_click_drag",
      description: `Press, move to target, and release. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: {
            ...coordinateTuple,
            description: `(x, y) end point: ${coord.x}`,
          },
          start_coordinate: {
            ...coordinateTuple,
            description: `(x, y) start point. If omitted, drags from the current cursor position. ${coord.x}`,
          },
        },
        required: ["coordinate"],
      },
    },
    {
      name: "mouse_move",
      description: `Move the mouse cursor without clicking. Useful for triggering hover states. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
        },
        required: ["coordinate"],
      },
    },
    {
      name: "open_application",
      description:
        "Bring an application to the front, launching it if necessary. The target application must already be in the session allowlist — call request_access first.",
      inputSchema: {
        type: "object" as const,
        properties: {
          app: {
            type: "string",
            description:
              "Display name (e.g. \"Slack\") or bundle identifier (e.g. \"com.tinyspeck.slackmacgap\").",
          },
        },
        required: ["app"],
      },
    },
    {
      name: "switch_display",
      description:
        "Switch which monitor subsequent screenshots capture. Use this when the " +
        "application you need is on a different monitor than the one shown. " +
        "The screenshot tool tells you which monitor it captured and lists " +
        "other attached monitors by name — pass one of those names here. " +
        "After switching, call screenshot to see the new monitor. " +
        'Pass "auto" to return to automatic monitor selection.',
      inputSchema: {
        type: "object" as const,
        properties: {
          display: {
            type: "string",
            description:
              'Monitor name from the screenshot note (e.g. "Built-in Retina Display", ' +
              '"LG UltraFine"), or "auto" to re-enable automatic selection.',
          },
        },
        required: ["display"],
      },
    },
    {
      name: "list_granted_applications",
      description:
        "List the applications currently in the session allowlist, plus the active grant flags and coordinate mode. No side effects.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },
    {
      name: "read_clipboard",
      description:
        "Read the current clipboard contents as text. Requires the `clipboardRead` grant.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },
    {
      name: "write_clipboard",
      description:
        "Write text to the clipboard. Requires the `clipboardWrite` grant.",
      inputSchema: {
        type: "object" as const,
        properties: {
          text: { type: "string" },
        },
        required: ["text"],
      },
    },
    {
      name: "wait",
      description: "Wait for a specified duration.",
      inputSchema: {
        type: "object" as const,
        properties: {
          duration: {
            type: "number",
            // Range, not a single number: the dash between 0 and 100 is required.
            description: "Duration in seconds (0–100).",
          },
        },
        required: ["duration"],
      },
    },
    {
      name: "cursor_position",
      description:
        "Get the current mouse cursor position. Returns image-pixel coordinates relative to the most recent screenshot, or logical points if no screenshot has been taken.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },
    {
      name: "hold_key",
      description:
        `Press and hold a key or key combination for the specified duration, then release. ${FRONTMOST_GATE_DESC} ` +
        "System-level combos require the `systemKeyCombos` grant.",
      inputSchema: {
        type: "object" as const,
        properties: {
          text: {
            type: "string",
            description: 'Key or chord to hold, e.g. "space", "shift+down".',
          },
          duration: {
            type: "number",
            // Same range fix as the `wait` tool.
            description: "Duration in seconds (0–100).",
          },
        },
        required: ["text", "duration"],
      },
    },
    {
      name: "left_mouse_down",
      description:
        `Press the left mouse button at the current cursor position and leave it held. ${FRONTMOST_GATE_DESC} ` +
        "Use mouse_move first to position the cursor. Call left_mouse_up to release. Errors if the button is already held.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },
    {
      name: "left_mouse_up",
      description:
        `Release the left mouse button at the current cursor position. ${FRONTMOST_GATE_DESC} ` +
        "Pairs with left_mouse_down. Safe to call even if the button is not currently held.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },
    {
      name: "computer_batch",
      description:
        "Execute a sequence of actions in ONE tool call. Each individual tool call requires a model→API round trip (seconds); " +
        "batching a predictable sequence eliminates all but one. Use this whenever you can predict the outcome of several actions ahead — " +
        "e.g. click a field, type into it, press Return. Actions execute sequentially and stop on the first error. " +
        `${FRONTMOST_GATE_DESC} The frontmost check runs before EACH action inside the batch — if an action opens a non-allowed app, the next action's gate fires and the batch stops there. ` +
        "Mid-batch screenshot actions are allowed for inspection but coordinates in subsequent clicks always refer to the PRE-BATCH full-screen screenshot.",
      inputSchema: {
        type: "object" as const,
        properties: {
          actions: {
            type: "array",
            minItems: 1,
            items: BATCH_ACTION_ITEM_SCHEMA,
            description:
              'List of actions. Example: [{"action":"left_click","coordinate":[100,200]},{"action":"type","text":"hello"},{"action":"key","text":"Return"}]',
          },
        },
        required: ["actions"],
      },
    },
    // Teach-mode tools are appended only when the capability is on.
    ...(caps.teachMode ? buildTeachTools(coord, installedAppsHint) : []),
  ];
}
/**
 * Factory for the teach-mode tool definitions (`request_teach_access`,
 * `teach_step`, `teach_batch`). It lives in its own function so the caller
 * can splice the result into the tool list with one conditional spread.
 *
 * @param coord Frozen coordinate-mode phrasing. `coord.x` is interpolated
 *   into the `anchor` description so it reads like the click-coordinate
 *   params.
 * @param installedAppsHint Appended verbatim to the
 *   `request_teach_access.apps` description, giving it the same enumeration
 *   as `request_access.apps` (same resolution path → same hint).
 * @returns The three teach tools, in the order listed above.
 */
function buildTeachTools(
  coord: { x: string; y: string },
  installedAppsHint: string,
): Tool[] {
  // ── Per-step fields ───────────────────────────────────────────────────
  // Used twice: as teach_step's top-level properties and as the item shape
  // of teach_batch.steps[]. `anchorField` interpolates `coord`, so these are
  // built per call rather than hoisted to module scope.
  const explanationField = {
    type: "string",
    description: [
      "Tooltip body text. Explain what the user is looking at and why it matters. ",
      "This is the ONLY place the user sees your words — be complete but concise.",
    ].join(""),
  } as const;

  const nextPreviewField = {
    type: "string",
    description: [
      "One line describing exactly what will happen when the user clicks Next. ",
      'Example: "Next: I\'ll click Create Bucket and type the name." ',
      "Shown below the explanation in a smaller font.",
    ].join(""),
  } as const;

  const anchorField = {
    type: "array",
    items: { type: "number" },
    minItems: 2,
    maxItems: 2,
    description: [
      `(x, y) — where the tooltip arrow points. ${coord.x} `,
      "Omit to center the tooltip with no arrow (for general-context steps).",
    ].join(""),
  } as const;

  const actionsField = {
    type: "array",
    // Deliberately no minItems: an empty list is a "read this, click Next" step.
    items: BATCH_ACTION_ITEM_SCHEMA,
    description: [
      "Actions to execute when the user clicks Next. Same item schema as computer_batch.actions. ",
      "Empty array is valid for purely explanatory steps. Actions run sequentially and stop on first error.",
    ].join(""),
  } as const;

  const teachStepProperties = {
    explanation: explanationField,
    next_preview: nextPreviewField,
    anchor: anchorField,
    actions: actionsField,
  } as const;

  // ── Tool 1: permission request that switches the session into teach mode.
  const requestTeachAccessTool: Tool = {
    name: "request_teach_access",
    description: [
      "Request permission to guide the user through a task step-by-step with on-screen tooltips. ",
      "Use this INSTEAD OF request_access when the user wants to LEARN how to do something ",
      '(phrases like "teach me", "walk me through", "show me how", "help me learn"). ',
      "On approval the main Claude window hides and a fullscreen tooltip overlay appears. ",
      "You then call teach_step repeatedly; each call shows one tooltip and waits for the user to click Next. ",
      "Same app-allowlist semantics as request_access, but no clipboard/system-key flags. ",
      "Teach mode ends automatically when your turn ends.",
    ].join(""),
    inputSchema: {
      type: "object" as const,
      properties: {
        apps: {
          type: "array",
          items: { type: "string" },
          // Plain concatenation (not join) so the hint is stringified exactly
          // as the `+` operator would.
          description:
            'Application display names (e.g. "Slack", "Calendar") or bundle identifiers. Resolved case-insensitively against installed apps.' +
            installedAppsHint,
        },
        reason: {
          type: "string",
          description:
            'What you will be teaching. Shown in the approval dialog as "Claude wants to guide you through {reason}". Keep it short and task-focused.',
        },
      },
      required: ["apps", "reason"],
    },
  };

  // ── Tool 2: one tooltip → wait for Next → run actions → screenshot.
  const teachStepTool: Tool = {
    name: "teach_step",
    description: [
      "Show one guided-tour tooltip and wait for the user to click Next. On Next, execute the actions, ",
      "take a fresh screenshot, and return both — you do NOT need a separate screenshot call between steps. ",
      "The returned image shows the state after your actions ran; anchor the next teach_step against it. ",
      "IMPORTANT — the user only sees the tooltip during teach mode. Put ALL narration in `explanation`. ",
      "Text you emit outside teach_step calls is NOT visible until teach mode ends. ",
      "Pack as many actions as possible into each step's `actions` array — the user waits through ",
      "the whole round trip between clicks, so one step that fills a form beats five steps that fill one field each. ",
      "Returns {exited:true} if the user clicks Exit — do not call teach_step again after that. ",
      "Take an initial screenshot before your FIRST teach_step to anchor it.",
    ].join(""),
    inputSchema: {
      type: "object" as const,
      properties: teachStepProperties,
      required: ["explanation", "next_preview", "actions"],
    },
  };

  // ── Tool 3: several steps queued in a single tool call.
  const teachBatchTool: Tool = {
    name: "teach_batch",
    description: [
      "Queue multiple teach steps in one tool call. Parallels computer_batch: ",
      "N steps → one model↔API round trip instead of N. Each step still shows a tooltip ",
      "and waits for the user's Next click, but YOU aren't waiting for a round trip between steps. ",
      "You can call teach_batch multiple times in one tour — treat each batch as one predictable ",
      "SEGMENT (typically: all the steps on one page). The returned screenshot shows the state ",
      "after the batch's final actions; anchor the NEXT teach_batch against it. ",
      "WITHIN a batch, all anchors and click coordinates refer to the PRE-BATCH screenshot ",
      "(same invariant as computer_batch) — for steps 2+ in a batch, either omit anchor ",
      "(centered tooltip) or target elements you know won't have moved. ",
      "Good pattern: batch 5 tooltips on page A (last step navigates) → read returned screenshot → ",
      "batch 3 tooltips on page B → done. ",
      "Returns {exited:true, stepsCompleted:N} if the user clicks Exit — do NOT call again after that; ",
      "{stepsCompleted, stepFailed, ...} if an action errors mid-batch; ",
      "otherwise {stepsCompleted, results:[...]} plus a final screenshot. ",
      "Fall back to individual teach_step calls when you need to react to each intermediate screenshot.",
    ].join(""),
    inputSchema: {
      type: "object" as const,
      properties: {
        steps: {
          type: "array",
          minItems: 1,
          items: {
            type: "object",
            properties: teachStepProperties,
            required: ["explanation", "next_preview", "actions"],
          },
          description:
            "Ordered steps. Validated upfront — a typo in step 5 errors before any tooltip shows.",
        },
      },
      required: ["steps"],
    },
  };

  return [requestTeachAccessTool, teachStepTool, teachBatchTool];
}

View File

@@ -0,0 +1,622 @@
import type {
ComputerExecutor,
InstalledApp,
ScreenshotResult,
} from "./executor.js";
/**
* `ScreenshotResult` minus its `base64` image payload; every other field is
* kept unchanged. This is the shape hosts persist so `scaleCoord` survives a
* respawn — see `ComputerUseSessionContext.getLastScreenshotDims`, which
* hands it back when no full screenshot is held.
*/
export type ScreenshotDims = Omit<ScreenshotResult, "base64">;
/**
* Leveled logging sink supplied by the host (see
* `ComputerUseHostAdapter.logger`). All five levels share one signature: a
* message string followed by any number of extra values.
*
* Shape mirrors claude-for-chrome-mcp/src/types.ts:1-7
*/
export interface Logger {
info: (message: string, ...args: unknown[]) => void;
error: (message: string, ...args: unknown[]) => void;
warn: (message: string, ...args: unknown[]) => void;
debug: (message: string, ...args: unknown[]) => void;
silly: (message: string, ...args: unknown[]) => void;
}
/**
* Per-app permission tier. Hardcoded by category at grant time — the
* approval dialog displays the tier but the user cannot change it (for now).
* Tiers are ordered least → most permissive: `"read"` < `"click"` < `"full"`.
*
* - `"read"` — visible in screenshots, NO interaction (no clicks, no typing).
* Browsers land here: the model can read a page that's already open, but
* must use the Claude-in-Chrome MCP for any navigation/clicking. Trading
* platforms land here too (no CiC alternative — the model asks the user).
* - `"click"` — visible + plain left-click, scroll. NO typing/keys,
* NO right/middle-click, NO modifier-clicks, NO drag-drop (all text-
* injection vectors). Terminals/IDEs land here: the model can click a
* Run button or scroll test output, but `type("rm -rf /")` is blocked
* and so is right-click→Paste and dragging text onto the terminal.
* - `"full"` — visible + click + type/key/paste. Everything else.
*
* Enforced in `runInputActionGates` via the frontmost-app check: keyboard
* actions require `"full"`, mouse actions require `"click"` or higher.
*
* A grant that carries no tier is treated as `"full"` (see `AppGrant.tier`).
*/
export type CuAppPermTier = "read" | "click" | "full";
/**
* A single app the user has approved for the current session. Session-scoped
* only — there is no "once" or "forever" scope (unlike Chrome's per-domain
* three-way). CU has no natural "once" unit; one task = hundreds of clicks.
* Mirrors how `chromeAllowedDomains` is a plain `string[]` with no per-item
* scope.
*/
export interface AppGrant {
/** App identifier; the allowlist is deduplicated on this value (see
* `ComputerUseSessionContext.onAllowedAppsChanged`). */
bundleId: string;
/** Name of the app as shown to the user.
* NOTE(review): presumably copied from the resolved `InstalledApp` — confirm. */
displayName: string;
/** Epoch ms. For Settings-page display ("Granted 3m ago"). */
grantedAt: number;
/** Undefined → `"full"` (back-compat for pre-tier grants persisted in
* session state). */
tier?: CuAppPermTier;
}
/**
 * Session-level capability flags. Orthogonal to the app allowlist: these are
 * granted (or not) independently of which apps are approved.
 */
export interface CuGrantFlags {
  /** NOTE(review): presumably gates the model reading the clipboard — confirm. */
  clipboardRead: boolean;
  /** NOTE(review): presumably gates the model writing the clipboard — confirm. */
  clipboardWrite: boolean;
  /**
   * When false, the `key` tool rejects combos in `keyBlocklist.ts`
   * (cmd+q, cmd+tab, cmd+space, cmd+shift+q, ctrl+alt+delete). All other
   * key sequences work regardless.
   */
  systemKeyCombos: boolean;
}

/**
 * Baseline flags for a session that has been granted nothing: every flag is
 * `false`.
 *
 * The object is frozen and typed `Readonly` because it is a process-wide
 * singleton holding security-relevant defaults. If a consumer aliased it
 * into session state and later flipped a flag in place, every subsequent
 * session would silently start with that capability granted. Copy it
 * (`{ ...DEFAULT_GRANT_FLAGS }`) before modifying.
 */
export const DEFAULT_GRANT_FLAGS: Readonly<CuGrantFlags> = Object.freeze({
  clipboardRead: false,
  clipboardWrite: false,
  systemKeyCombos: false,
});
/**
* Coordinate convention the model uses for click/anchor positions.
*
* Host picks via GrowthBook JSON feature `chicago_coordinate_mode`, baked
* into tool param descriptions at server-construction time. The model sees
* ONE convention and never learns the other exists. `normalized_0_100`
* sidesteps the Retina scaleFactor bug class entirely.
*
* NOTE(review): `normalized_0_100` presumably means coordinates expressed on
* a 0–100 scale of the screenshot's width/height rather than raw pixels —
* confirm against the tool param descriptions.
*/
export type CoordinateMode = "pixels" | "normalized_0_100";
/**
* Independent kill switches for subtle/risky ported behaviors. Read from
* GrowthBook by the host adapter, consulted in `toolCalls.ts`. Each flag is
* a plain boolean: `true` enables the behavior described on the field.
*/
export interface CuSubGates {
/** 9×9 exact-byte staleness guard before click. */
pixelValidation: boolean;
/** Route `type("foo\nbar")` through clipboard instead of keystroke-by-keystroke. */
clipboardPasteMultiline: boolean;
/**
* Ease-out-cubic mouse glide at 60fps, distance-proportional duration
* (2000 px/sec, capped at 0.5s). Adds up to ~0.5s latency
* per click. When off, cursor teleports instantly.
*/
mouseAnimation: boolean;
/**
* Pre-action sequence: hide non-allowlisted apps, then defocus us (from the
* Vercept acquisition). When off, the frontmost gate fires in the normal
* case and the model gets stuck — this is the
* A/B-test-the-old-broken-behavior switch.
*/
hideBeforeAction: boolean;
/**
* Auto-resolve the target display before each screenshot when the
* selected display has no allowed-app windows. When on, `handleScreenshot`
* uses the atomic Swift path; off → sticks with `selectedDisplayId`.
*/
autoTargetDisplay: boolean;
/**
* Stash+clear the clipboard while a tier-"click" app is frontmost.
* Closes the gap where a click-tier terminal/IDE has a UI Paste button
* that's plain-left-clickable — without this, the tier "click"
* keyboard block can be routed around by clicking Paste. Restored when
* a non-"click" app becomes frontmost, or at turn end.
*/
clipboardGuard: boolean;
}
// ----------------------------------------------------------------------------
// Permission request/response (mirror of BridgePermissionRequest, types.ts:77-94)
// ----------------------------------------------------------------------------
/** One entry per app the model asked for, after name → bundle ID resolution. */
export interface ResolvedAppRequest {
/** What the model asked for (e.g. "Slack", "com.tinyspeck.slackmacgap"). */
requestedName: string;
/** The resolved InstalledApp if found, else undefined (shown greyed in the UI). */
resolved?: InstalledApp;
/** Shell-access-equivalent bundle IDs get a UI warning. See sentinelApps.ts. */
isSentinel: boolean;
/** Already in the allowlist → skip the checkbox, return in `granted` immediately. */
alreadyGranted: boolean;
/** Hardcoded tier for this app (browser→"read", trading→"read",
* terminal→"click", else "full"). The dialog displays this read-only; the
* renderer passes it through verbatim in the AppGrant. */
proposedTier: CuAppPermTier;
}
/**
* Payload for the renderer approval dialog. Rides through the existing
* `ToolPermissionRequest.input: unknown` field
* (packages/utils/desktop/bridge/common/claude.web.ts:1262) — no IPC schema
* change needed.
*/
export interface CuPermissionRequest {
/** Identifier for this request.
* NOTE(review): presumably used to correlate the dialog's answer with the
* pending tool call — confirm against the host wiring. */
requestId: string;
/** Model-provided reason string. Shown prominently in the approval UI. */
reason: string;
/** One entry per app the model asked for, already resolved (see
* `ResolvedAppRequest`). */
apps: ResolvedAppRequest[];
/** What the model asked for. User can toggle independently of apps. */
requestedFlags: Partial<CuGrantFlags>;
/**
* For the "On Windows, Claude can see all apps..." footnote. Taken from
* `executor.capabilities.screenshotFiltering` so the renderer doesn't
* need to know about platforms.
*/
screenshotFiltering: "native" | "none";
/**
* Present only when TCC permissions are NOT yet granted. When present,
* the renderer shows a TCC toggle panel (two rows: Accessibility, Screen
* Recording) INSTEAD OF the app list. Clicking a row's "Request" button
* triggers the OS prompt; the store polls on window-focus and flips the
* toggle when the grant is detected. macOS itself prompts the user to
* restart after granting Screen Recording — we don't.
*/
tccState?: {
accessibility: boolean;
screenRecording: boolean;
};
/**
* Apps with windows on the CU display that aren't in the requested
* allowlist. These will be hidden the first time Claude takes an action.
* Computed at request_access time — may be slightly stale by the time the
* user clicks Allow, but it's a preview, not a contract. Absent when
* empty so the renderer can skip the section cleanly.
*/
willHide?: Array<{ bundleId: string; displayName: string }>;
/**
* `chicagoAutoUnhide` app preference at request time. The renderer picks
* between "...then restored when Claude is done" and "...will be hidden"
* copy. Absent when `willHide` is absent (same condition).
*/
autoUnhideEnabled?: boolean;
}
/**
* What the renderer stuffs into `updatedInput._cuGrants` when the user clicks
* "Allow for this session" (mirror of the `_allowAllSites` sentinel at
* LocalAgentModeSessionManager.ts:2794).
*/
export interface CuPermissionResponse {
/** Grants produced by this dialog. May be empty — see `userConsented`. */
granted: AppGrant[];
/** Bundle IDs the user unchecked, or apps that weren't installed. */
denied: Array<{ bundleId: string; reason: "user_denied" | "not_installed" }>;
/** Grant flags returned with the response.
* NOTE(review): presumably reflects the dialog's flag toggles (the request
* doc says the user can toggle flags independently of apps) — confirm. */
flags: CuGrantFlags;
/**
* Whether the user clicked Allow in THIS dialog. Only set by the
* teach-mode handler — regular request_access doesn't need it (the
* session manager's `result.behavior` gates the merge there). Needed
* because when all requested apps are already granted (skipDialogGrants
* non-empty, needDialog empty), Allow and Deny produce identical
* `{granted:[], denied:[]}` payloads and the tool handler can't tell
* them apart without this. Undefined → legacy/regular path, do not
* gate on it.
*/
userConsented?: boolean;
}
// ----------------------------------------------------------------------------
// Host adapter (mirror of ClaudeForChromeContext, types.ts:33-62)
// ----------------------------------------------------------------------------
/**
* Process-lifetime singleton dependencies. Everything that does NOT vary per
* tool call. Built once by `apps/desktop/src/main/nest-only/chicago/hostAdapter.ts`.
* No Electron imports in this package — the host injects everything.
*/
export interface ComputerUseHostAdapter {
/** NOTE(review): presumably the name this MCP server is registered under —
* confirm against the host adapter. */
serverName: string;
/** Logging sink injected by the host; see `Logger`. */
logger: Logger;
/** Host-provided `ComputerExecutor`. Its
* `capabilities.screenshotFiltering` is what populates
* `CuPermissionRequest.screenshotFiltering`. */
executor: ComputerExecutor;
/**
* TCC state check — Accessibility + Screen Recording on macOS. Pure check,
* no dialog, no relaunch. When either is missing, `request_access` threads
* the state through to the renderer which shows a toggle panel; all other
* tools return a tool error.
*/
ensureOsPermissions(): Promise<
| { granted: true }
| { granted: false; accessibility: boolean; screenRecording: boolean }
>;
/** The Settings-page kill switch (`chicagoEnabled` app preference). */
isDisabled(): boolean;
/**
* The `chicagoAutoUnhide` app preference. Consumed by `buildAccessRequest`
* to populate `CuPermissionRequest.autoUnhideEnabled` so the renderer's
* "will be hidden" copy can say "then restored" only when true.
*/
getAutoUnhideEnabled(): boolean;
/**
* Sub-gates re-read on every tool call so GrowthBook flips take effect
* mid-session without restart.
*/
getSubGates(): CuSubGates;
/**
* JPEG decode + crop + raw pixel bytes, for the PixelCompare staleness guard.
* Injected so this package stays Electron-free. The host implements it via
* `nativeImage.createFromBuffer(jpeg).crop(rect).toBitmap()` — Chromium's
* decoders, BSD-licensed, no `.node` binary.
*
* Returns null on decode/crop failure — caller treats null as `skipped`,
* click proceeds (validation failure must never block the action).
*/
cropRawPatch(
jpegBase64: string,
rect: { x: number; y: number; width: number; height: number },
): Buffer | null;
}
// ----------------------------------------------------------------------------
// Session context (getter/callback bag for bindSessionContext)
// ----------------------------------------------------------------------------
/**
* Per-session state binding for `bindSessionContext`. Hosts build this once
* per session with getters that read fresh from their session store and
* callbacks that write back. The returned dispatcher builds
* `ComputerUseOverrides` from these getters on every call.
*
* Callbacks must be set at construction time — `bindSessionContext` reads
* them once at bind, not per call.
*
* The lock hooks are **async** — `bindSessionContext` awaits them before
* `handleToolCall`, then passes `checkCuLock: undefined` in overrides so the
* sync Gate-3 in `handleToolCall` no-ops. Hosts with in-memory sync locks
* (Cowork) wrap them trivially; hosts with cross-process locks (the CLI's
* O_EXCL file) call the real async primitive directly.
*/
export interface ComputerUseSessionContext {
// ── Read state fresh per call ──────────────────────────────────────
/** Apps approved for this session. See `ComputerUseOverrides.allowedApps`. */
getAllowedApps(): readonly AppGrant[];
/** Session grant flags. See `CuGrantFlags`. */
getGrantFlags(): CuGrantFlags;
/** Per-user auto-deny list (Settings page). Empty array = none. */
getUserDeniedBundleIds(): readonly string[];
/** See `ComputerUseOverrides.selectedDisplayId`. */
getSelectedDisplayId(): number | undefined;
/** See `ComputerUseOverrides.displayPinnedByModel`. */
getDisplayPinnedByModel?(): boolean;
/** See `ComputerUseOverrides.displayResolvedForApps`. */
getDisplayResolvedForApps?(): string | undefined;
/** See `ComputerUseOverrides.getTeachModeActive`. */
getTeachModeActive?(): boolean;
/** Dims-only fallback when `lastScreenshot` is unset (cross-respawn).
* `bindSessionContext` reconstructs `{...dims, base64: ""}` so scaleCoord
* works and pixelCompare correctly skips. */
getLastScreenshotDims?(): ScreenshotDims | undefined;
// ── Write-back callbacks ───────────────────────────────────────────
/** Shows the approval dialog. Host routes to its UI, awaits user. The
* signal is aborted if the tool call finishes before the user answers
* (MCP timeout, etc.) — hosts dismiss the dialog on abort. */
onPermissionRequest?(
req: CuPermissionRequest,
signal: AbortSignal,
): Promise<CuPermissionResponse>;
/** Teach-mode sibling of `onPermissionRequest`. */
onTeachPermissionRequest?(
req: CuTeachPermissionRequest,
signal: AbortSignal,
): Promise<CuPermissionResponse>;
/** Called by `bindSessionContext` after merging a permission response into
* the allowlist (dedupe on bundleId, truthy-only flag spread). Host
* persists for resume survival. */
onAllowedAppsChanged?(apps: readonly AppGrant[], flags: CuGrantFlags): void;
/** See `ComputerUseOverrides.onAppsHidden`. */
onAppsHidden?(bundleIds: string[]): void;
/** Reads the session's clipboardGuard stash. undefined → no stash held. */
getClipboardStash?(): string | undefined;
/** Writes the clipboardGuard stash. undefined clears it. */
onClipboardStashChanged?(stash: string | undefined): void;
/** See `ComputerUseOverrides.onResolvedDisplayUpdated`. */
onResolvedDisplayUpdated?(displayId: number): void;
/** See `ComputerUseOverrides.onDisplayPinned`. */
onDisplayPinned?(displayId: number | undefined): void;
/** See `ComputerUseOverrides.onDisplayResolvedForApps`. */
onDisplayResolvedForApps?(sortedBundleIdsKey: string): void;
/** Called after each screenshot. Host persists for respawn survival. */
onScreenshotCaptured?(dims: ScreenshotDims): void;
/** See `ComputerUseOverrides.onTeachModeActivated`. */
onTeachModeActivated?(): void;
/** See `ComputerUseOverrides.onTeachStep`. */
onTeachStep?(req: TeachStepRequest): Promise<TeachStepResult>;
/** See `ComputerUseOverrides.onTeachWorking`. */
onTeachWorking?(): void;
// ── Lock (async) ───────────────────────────────────────────────────
/** At most one session uses CU at a time. Awaited by `bindSessionContext`
* before dispatch. Undefined → no lock gating (proceed). */
checkCuLock?(): Promise<{ holder: string | undefined; isSelf: boolean }>;
/** Take the lock. Called when `checkCuLock` returned `holder: undefined`
* on a non-deferring tool. Host emits enter-CU signals here. */
acquireCuLock?(): Promise<void>;
/** Host-specific lock-held error text. Default is the package's generic
* message. The CLI host includes the holder session-ID prefix. */
formatLockHeldMessage?(holder: string): string;
/** User-abort signal. Passed through to `ComputerUseOverrides.isAborted`
* for the mid-loop checks in handleComputerBatch / handleType. See that
* field for semantics. */
isAborted?(): boolean;
}
// ----------------------------------------------------------------------------
// Per-call overrides (mirror of PermissionOverrides, types.ts:97-102)
// ----------------------------------------------------------------------------
/**
* Built FRESH on every tool call by `bindSessionContext` from
* `ComputerUseSessionContext` getters. This is what lets a singleton MCP
* server carry per-session state — the state lives on the host's session
* store, not the server.
*/
export interface ComputerUseOverrides {
/** Apps the user has approved for the current session (see `AppGrant`). */
allowedApps: AppGrant[];
/** Session grant flags, orthogonal to the app allowlist (see `CuGrantFlags`). */
grantFlags: CuGrantFlags;
/** Coordinate convention in effect for this server (see `CoordinateMode`). */
coordinateMode: CoordinateMode;
/**
* User-configured auto-deny list (Settings → Desktop app → Computer Use).
* Bundle IDs
* here are stripped from request_access BEFORE the approval dialog — they
* never reach the user for approval regardless of tier. The response tells
* the agent to ask the user to remove the app from their deny list in
* Settings if access is genuinely needed.
*
* Per-USER, persists across restarts (read from appPreferences per call,
* not session state). Contrast with `allowedApps` which is per-session.
* Empty array = no user-configured denies (the default).
*/
userDeniedBundleIds: readonly string[];
/**
* Display CU operates on; read fresh per call. `scaleCoord` uses the
* `originX/Y` snapshotted in `lastScreenshot`, so mid-session switches
* only affect the NEXT screenshot/prepare call.
*/
selectedDisplayId?: number;
/**
* The `request_access` tool handler calls this and awaits. The wrapper
* closure in serverDef.ts (mirroring InternalMcpServerManager.ts:131-177)
* routes through `handleToolPermission` → IPC → renderer ChicagoApproval.
* When it resolves, the wrapper side-effectfully mutates
* `InternalServerContext.cuAllowedApps` BEFORE returning here.
*
* Undefined when the session wasn't wired with a permission handler (e.g.
* a future headless mode). `request_access` returns a tool error in that case.
*/
onPermissionRequest?: (req: CuPermissionRequest) => Promise<CuPermissionResponse>;
/**
* For the pixel-validation staleness guard. The model's-last-screenshot,
* stashed by serverDef.ts after each `screenshot` tool call. Undefined on
* cold start → pixel validation skipped (click proceeds).
*/
lastScreenshot?: ScreenshotResult;
/**
* Fired after every `prepareForAction` with the bundle IDs it just hid.
* The wrapper closure in serverDef.ts accumulates these into
* `Session.cuHiddenDuringTurn` via a write-through callback (same pattern
* as `onCuPermissionUpdated`). At turn end (`sdkMessage.type === "result"`),
* if the `chicagoAutoUnhide` setting is on, everything in the set is
* unhidden. Set is cleared regardless of the setting so it doesn't leak
* across turns.
*
* Undefined when the session wasn't wired with a tracker — unhide just
* doesn't happen.
*/
onAppsHidden?: (bundleIds: string[]) => void;
/**
* Reads the clipboardGuard stash from session state. `undefined` means no
* stash is held — `syncClipboardStash` stashes on first entry to click-tier
* and clears on restore. Sibling of the `cuHiddenDuringTurn` getter pattern
* — state lives on the host's session, not module-level here.
*/
getClipboardStash?: () => string | undefined;
/**
* Writes the clipboardGuard stash to session state. `undefined` clears.
* Sibling of `onAppsHidden` — the wrapper closure writes through to
* `Session.cuClipboardStash`. At turn end the host reads + clears it
* directly and restores via Electron's `clipboard.writeText` (no nest-only
* import surface).
*/
onClipboardStashChanged?: (stash: string | undefined) => void;
/**
* Write the resolver's picked display back to session so teach overlay
* positioning and subsequent non-resolver calls use the same display.
* Fired by `handleScreenshot` in the atomic `autoTargetDisplay` path when
* `resolvePrepareCapture`'s pick differs from `selectedDisplayId`.
* Fire-and-forget.
*/
onResolvedDisplayUpdated?: (displayId: number) => void;
/**
* Set when the model explicitly picked a display via `switch_display`.
* When true, `handleScreenshot` passes `autoResolve: false` so the Swift
* resolver honors `selectedDisplayId` directly (straight cuDisplayInfo
* passthrough) instead of running the co-location/chase chain. The
* resolver's Step 2 ("host + allowed co-located → host") otherwise
* overrides any `selectedDisplayId` whenever an allowed app shares the
* host's monitor.
*/
displayPinnedByModel?: boolean;
/**
* Write the model's explicit display pick to session. `displayId:
* undefined` clears both `selectedDisplayId` and the pin (back to auto).
* Sibling of `onResolvedDisplayUpdated` but also sets the pin flag —
* the two are semantically distinct (resolver-picked vs model-picked).
*/
onDisplayPinned?: (displayId: number | undefined) => void;
/**
* Sorted comma-joined bundle-ID set the display was last auto-resolved
* for. `handleScreenshot` compares this to the current allowed set and
* only passes `autoResolve: true` when they differ — so the resolver
* doesn't yank the display on every screenshot, only when the app set
* has changed since the last resolve (or manual switch).
*/
displayResolvedForApps?: string;
/**
* Records which app set the current display selection was made for. Fired
* alongside `onResolvedDisplayUpdated` when the resolver picks, so the next
* screenshot sees a matching set and skips auto-resolve.
*/
onDisplayResolvedForApps?: (sortedBundleIdsKey: string) => void;
/**
* Global CU lock — at most one session actively uses CU at a time. Checked
* in `handleToolCall` after kill-switch/TCC, before dispatch. Every CU tool
* including `request_access` goes through it.
*
* - `holder === undefined` → lock is free, safe to acquire
* - `isSelf === true` → this session already holds it (no-op, proceed)
* - `holder !== undefined && !isSelf` → blocked, return tool error
*
* `undefined` callback → lock system not wired (e.g. CCD). Proceed without
* gating — absence of the mechanism ≠ locked out.
*
* The host manages release (on session idle/stop/archive) — this package
* never releases.
*/
checkCuLock?: () => { holder: string | undefined; isSelf: boolean };
/**
* Take the lock for this session. `handleToolCall` calls this exactly once
* per turn, on the FIRST CU tool call when `checkCuLock().holder` is
* undefined. No-op if already held (defensive — the check should have
* short-circuited). Host emits an event the overlay listens to.
*/
acquireCuLock?: () => void;
/**
* User-abort signal. Checked mid-iteration inside `handleComputerBatch`
* and `handleType`'s grapheme loop so an in-flight batch/type stops
* promptly on overlay Stop instead of running to completion after the
* host has already abandoned the tool result.
*
* Undefined → never aborts (e.g. unwired host). Live per-check read —
* same lazy-getter pattern as `checkCuLock`.
*/
isAborted?: () => boolean;
// ── Teach mode ───────────────────────────────────────────────────────
// Wired only when the host's teachModeEnabled gate is on. All five
// undefined → `request_teach_access` / `teach_step` return tool errors
// and teach mode is effectively off.
/**
* Sibling of `onPermissionRequest`. Same blocking-await-on-renderer-dialog
* semantics, but routes to ComputerUseTeachApproval.tsx (which explains
* the window-hides-during-guide behavior) instead of ComputerUseApproval.
* The wrapper closure in serverDef.ts writes grants through to session state
* via `onCuPermissionUpdated` exactly as `onPermissionRequest` does.
*/
onTeachPermissionRequest?: (
req: CuTeachPermissionRequest,
) => Promise<CuPermissionResponse>;
/**
* Called by `handleRequestTeachAccess` after the user approves and at least
* one app was granted. Host sets `session.teachModeActive = true`, emits
* `teachModeChanged` → teach controller hides the main window and shows the
* fullscreen overlay. Cleared by the host on turn end (`transitionTo("idle")`)
* alongside the CU lock release.
*/
onTeachModeActivated?: () => void;
/**
* Read by `handleRequestAccess` and `handleRequestTeachAccess` to
* short-circuit with a clear tool error when teach mode is active. The
* main window is hidden during teach mode, so permission dialogs render
* invisibly and handleToolPermission blocks forever on an invisible
* prompt. Better to tell the model to exit teach mode first. Getter
* (not a boolean field) because teach mode state lives on the session,
* not on this per-call overrides object.
*/
getTeachModeActive?: () => boolean;
/**
* Called by `handleTeachStep` with the scaled anchor + text. Host stores
* the resolver, emits `teachStepRequested` → teach controller pushes the
* payload to the overlay → user reads, clicks Next → IPC → host calls the
* stored resolver → this promise resolves. `{action: "exit"}` when the user
* clicks Exit (or the turn is interrupted) — `handleTeachStep` short-circuits
* without executing actions.
*
* Same blocking-promise pattern as `onPermissionRequest`, but resolved by
* the teach overlay's own preload (not the main renderer's tool-approval UI).
*/
onTeachStep?: (req: TeachStepRequest) => Promise<TeachStepResult>;
/**
* Called immediately after `onTeachStep` resolves with "next", before
* action dispatch begins. Host emits `teachStepWorking` → overlay flips to
* the spinner state (Next button gone, Exit stays, "Working…" + rotating
* notch). The next `onTeachStep` call replaces the spinner with the new
* tooltip content.
*/
onTeachWorking?: () => void;
}
// ----------------------------------------------------------------------------
// Teach mode (guided-tour tooltips with Next-button action execution)
// ----------------------------------------------------------------------------
/**
* Payload the host pushes to the teach overlay BrowserWindow. Built by
* `handleTeachStep` in toolCalls.ts from the model's `teach_step` args.
*
* `anchorLogical` here is POST-`scaleCoord` — **full-display** logical
* macOS points (origin = monitor top-left, menu bar included, since
* cuDisplayInfo returns CGDisplayBounds). The overlay window is positioned
* at `workArea.{x,y}` (excludes menu bar/Dock), so `updateTeachStep` in
* teach/window.ts subtracts the workArea offset before IPC so the HTML's
* CSS coords match.
*/
export interface TeachStepRequest {
/** Tooltip body text.
* NOTE(review): presumably the model's `teach_step.explanation`, passed
* through unchanged — confirm in `handleTeachStep`. */
explanation: string;
/** One-line preview of what happens when the user clicks Next.
* NOTE(review): presumably the model's `teach_step.next_preview` —
* confirm in `handleTeachStep`. */
nextPreview: string;
/** Full-display logical points. Undefined → overlay centers the tooltip, hides the arrow. */
anchorLogical?: { x: number; y: number };
}
/**
* Outcome of one teach tooltip, as resolved by `onTeachStep`:
* - `{ action: "next" }` — the user clicked Next.
* - `{ action: "exit" }` — the user clicked Exit (or the turn was
* interrupted); `handleTeachStep` then skips the step's actions.
*/
export type TeachStepResult = { action: "next" } | { action: "exit" };
/**
* Payload for the renderer's ComputerUseTeachApproval dialog. Rides through
* `ToolPermissionRequest.input: unknown` same as `CuPermissionRequest`.
* Separate type (not a flag on `CuPermissionRequest`) so the two approval
* components can narrow independently and the teach dialog is free to drop
* fields it doesn't render (no grant-flag checkboxes in teach mode).
*/
export interface CuTeachPermissionRequest {
/** Identifier for this request.
* NOTE(review): presumably same role as `CuPermissionRequest.requestId` — confirm. */
requestId: string;
/** Model-provided reason. Shown in the dialog headline ("guide you through {reason}"). */
reason: string;
/** One entry per app the model asked for, already resolved (see
* `ResolvedAppRequest`). */
apps: ResolvedAppRequest[];
/** NOTE(review): presumably same semantics as
* `CuPermissionRequest.screenshotFiltering` — confirm. */
screenshotFiltering: "native" | "none";
/** Present only when TCC is ungranted — same semantics as `CuPermissionRequest.tccState`. */
tccState?: {
accessibility: boolean;
screenRecording: boolean;
};
/** NOTE(review): presumably same semantics as
* `CuPermissionRequest.willHide` — confirm. */
willHide?: Array<{ bundleId: string; displayName: string }>;
/** Same semantics as `CuPermissionRequest.autoUnhideEnabled`. */
autoUnhideEnabled?: boolean;
}