diff --git a/.github/workflows/npm-publish.yml b/.github/workflows/npm-publish.yml index 791f350da..a32dfc361 100644 --- a/.github/workflows/npm-publish.yml +++ b/.github/workflows/npm-publish.yml @@ -354,6 +354,78 @@ jobs: echo "EXTRA_TAGS=${EXTRA_TAGS}" >> "$GITHUB_OUTPUT" echo "Resolved dist-tags: ${NPM_TAG} ${EXTRA_TAGS:++ $EXTRA_TAGS} for ${TAG}" + - name: Generate build-info.json for published packages (RFC-41 §4.9) + # CI writes `build-info.json` into each public package root + # before `pnpm pack` runs, so the tarball — and therefore + # the installed npm package — carries the build's commit + # SHA + ISO timestamp + dist-tag + CI run id. + # + # The daemon's `loadBuildInfo()` (packages/cli/src/daemon/manifest.ts) + # reads this file at startup. `/api/status` exposes it as + # `{commit, commitShort, buildTime, distTag}`. `dkg doctor`'s + # §4.7.0 state summary surfaces it alongside the rest of the + # install context. Pre-release dist-tag testing becomes + # auditable: operators see exactly which commit they're + # running, not just a semver tag that may have multiple + # sequential builds. + # + # `files` field on each package.json must include + # `build-info.json` for it to be picked up by pnpm pack + # (cli has been updated; other public packages will be + # added on a follow-up if they also serve build-info via + # an HTTP route). 
+ env: + GITHUB_SHA: ${{ github.sha }} + GITHUB_RUN_ID: ${{ github.run_id }} + NPM_TAG: ${{ steps.dist-tag.outputs.NPM_TAG }} + TAG_NAME: ${{ github.ref_name }} + run: | + set -euo pipefail + VER="${TAG_NAME#v}" + COMMIT="${GITHUB_SHA}" + COMMIT_SHORT="${COMMIT:0:8}" + BUILD_TIME="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" + CI_RUN="${GITHUB_RUN_ID}" + DIST_TAG="${NPM_TAG}" + echo "build-info.json fields:" + echo " version = ${VER}" + echo " commit = ${COMMIT}" + echo " commitShort= ${COMMIT_SHORT}" + echo " buildTime = ${BUILD_TIME}" + echo " distTag = ${DIST_TAG}" + echo " ciRun = ${CI_RUN}" + VER="${VER}" COMMIT="${COMMIT}" COMMIT_SHORT="${COMMIT_SHORT}" BUILD_TIME="${BUILD_TIME}" DIST_TAG="${DIST_TAG}" CI_RUN="${CI_RUN}" node -e " + const fs = require('fs'); + const path = require('path'); + const payload = { // values come from process.env; never splice shell text into this JS source + version: process.env.VER, + commit: process.env.COMMIT, + commitShort: process.env.COMMIT_SHORT, + buildTime: process.env.BUILD_TIME, + distTag: process.env.DIST_TAG, + ciRun: process.env.CI_RUN, + }; + const pkgsDir = path.join(process.cwd(), 'packages'); + for (const dir of fs.readdirSync(pkgsDir)) { + const pkgPath = path.join(pkgsDir, dir, 'package.json'); + if (!fs.existsSync(pkgPath)) continue; + const pkg = JSON.parse(fs.readFileSync(pkgPath, 'utf-8')); + if (pkg.private) continue; + // Only write build-info.json into packages whose 'files' + // field explicitly includes it. Other packages would + // accept the file on disk during pack-time but the + // tarball pruning step (npm's standard files-list + // behaviour) would silently drop it. Explicit gating + // keeps the file's presence on a published tarball + // a deterministic per-package choice. + const files = Array.isArray(pkg.files) ?
pkg.files : null; + if (files && !files.includes('build-info.json')) continue; + const outPath = path.join(pkgsDir, dir, 'build-info.json'); + fs.writeFileSync(outPath, JSON.stringify(payload, null, 2) + '\n'); + console.log(' wrote ' + outPath); + } + " + - name: Pack public packages into tarballs # `pnpm pack` runs the same prepack/prepare lifecycle hooks # `pnpm publish` would, but EMITS A TARBALL instead of pushing diff --git a/CHANGELOG.md b/CHANGELOG.md index 3abad88c6..614e91562 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,20 @@ All notable changes to the DKG V9 node are documented here. The format is based ## [Unreleased] +### Changed — V10 EVM module hardening pass + +Consistency and defense-in-depth refinements across the V10 EVM-module contracts. No behaviour change for valid callers. + +- **CEI ordering on `DKGStakingConvictionNFT.withdraw`.** The receipt NFT is now burned before `StakingV10.withdraw` drives the CSS teardown + TRAC payout. `StakingV10.withdraw` gates on the CSS position (`pos.identityId == 0`), not NFT existence, so the CSS teardown is unaffected. +- **`nonReentrant` perimeter on KAV10 entrypoints.** `publish` / `update` / `extendKnowledgeCollectionLifetime` now carry OZ `ReentrancyGuard.nonReentrant` as a defense-in-depth perimeter against the ERC-1155 receiver-hook callback path. ~50 gas/call overhead. KAV10 version: `10.1.0` → `10.1.1`. +- **Strict-positive `tokenAmount` floor in KAV10 `_validateTokenAmount` AND post-discount floor in `PublishingConviction.coverPublishingCost`.** Both branches of the publish flow now charge a non-zero economic cost regardless of input rounding: direct-spend reverts `InvalidTokenAmount(1, 0)` on `tokenAmount == 0`; the conviction (PCA) branch inflates a truncated `discountedCost == 0` to `1` wei TRAC when `baseCost > 0` so the active-sink reward distribution + `windowSpent` accounting always fire. 
**BREAKING for any caller that previously relied on dust-CG zero-amount publishes** — the on-chain revert is the floor of truth; off-chain callers must encode `tokenAmount >= 1`. PublishingConviction version: `1.0.0` → `1.0.1`. +- **Single source of truth for op-wallet validation in `Identity.addOperationalWallets`.** Same-identity collisions (primary added by `createIdentity` OR intra-array duplicate within the same call) surface as `OperationalWalletDuplicate(wallet)`; cross-identity collisions still fire `OperationalKeyTaken(key)`; admin/operational wallet overlap surfaces as the existing `KeyAlreadyAttached(key)`. `Profile.createProfile`'s pre-flight validation loop is removed — atomic-revert semantics make the prior "fail-fast at the entrypoint" rationale moot, and the relocation removes the duplicate validation pass on the happy path. Identity version: `1.0.0` → `1.1.0`; Profile version: `1.3.0` → `1.4.2`. +- **`Profile.recreateProfile` signature refinement.** Drops the `uint16 initialOperatorFee` argument; the recovered profile is seeded at fee = 0 and the admin sets the real value via the cooldown-gated `updateOperatorFee` path. Keeps the recovery and steady-state surfaces symmetric on the operator-fee dimension. ADR `docs/adr/0001-recreate-profile-admin-only.md` updated. +- **Chain-package ABI sync.** `packages/chain/abi/KnowledgeAssetsV10.json` and `packages/chain/abi/Profile.json` re-exported from the freshly regenerated `evm-module/abi/` copies so the chain adapter's error decoder (which prefers its local override over the published artifact) resolves `ReentrancyGuardReentrantCall`, `InvalidTokenAmount`, and the new Profile error surface as structured errors instead of opaque reverts. 
+- **Regression coverage.** New unit tests pin every new revert surface: `publish` + `extendKnowledgeCollectionLifetime` revert `InvalidTokenAmount(1, 0)` on `tokenAmount == 0`; `publish` reverts `ReentrancyGuardReentrantCall()` when re-entered from the ERC-1155 mint acceptance callback (via the new `MockReentrantPublisher` test harness); `PublishingConviction.coverPublishingCost(baseCost=1, ...)` floors `discountedCost` at 1 and propagates that floor through the active-sink reward distribution + `windowSpent` accounting; `Identity.addOperationalWallets` per-class disambiguation; `Profile.createProfile` per-class diagnostics. + +Compatibility: `recreateProfile`'s signature is a BREAKING change for the recovery script. The `tokenAmount > 0` floor is a BREAKING change for any zero-cost publish flows. Off-chain consumers pinned to `KnowledgeAssetsV10@10.1.0` or `Profile@1.3.0` need a version bump. No storage-layout changes — KAV10's added `ReentrancyGuard` storage slot lands at the end of the inheritance chain, and V10's redeploy-and-reinit pattern doesn't preserve storage across upgrades anyway. + ## [10.0.0-rc.11] - 2026-05-26 ### Added — Node release visible on the libp2p wire diff --git a/README.md b/README.md index 585b23ba1..72a2fbbe9 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,34 @@ TOKEN=$(dkg auth show) curl -H "Authorization: Bearer $TOKEN" http://127.0.0.1:9200/api/agents ``` +### Updating your node + +To update DKG, run one command: + +```bash +dkg update # pull the latest release from npm and restart +dkg update --check # check what's available without applying +dkg update --allow-prerelease # follow the `next` dist-tag for pre-release builds +dkg rollback # revert to the previous version +``` + +Do **not** `git pull` or clone the repository to update — `dkg update` is the canonical verb. 
If anything looks off (multiple repositories on disk, served UI doesn't match version, version skew between daemon and CLI), run `dkg doctor` for a structured diagnostic of the install state. See [`OT-RFC-41`](https://github.com/OriginTrail/dkgv10-spec/blob/main/rfcs/OT-RFC-41-edge-node-npm-only-install-and-update.md) for the design rationale. + +### Contributors / monorepo development + +Hacking on DKG itself? Don't go through `npm install -g`. Clone, install, and run from the workspace: + +```bash +git clone https://github.com/OriginTrail/dkg.git +cd dkg +pnpm install +pnpm dkg start # or `pnpm dkg <command>` +``` + +Contributor state lives under `~/.dkg-dev/` (separated from `~/.dkg/` so a contributor's dev work doesn't stomp on their own Edge install). `dkg update` is intentionally disabled in monorepo-checkout mode — use `git pull && pnpm install && pnpm build` instead. + +The legacy `install.sh` git-checkout installer was removed in rc.12 (OT-RFC-41 §5 PR 6). If you have an existing `install.sh`-style install, run `npm install -g @origintrail-official/dkg` to take over the install; the daemon will detect the legacy `~/.dkg/releases/` tree on first start, record the active slot version into `~/.dkg/previous-version` (rollback target), and resume from the npm-global install. `dkg doctor` flags any leftover cleanable state. See [`docs/archive/MIGRATE_TO_NPM.md`](docs/archive/MIGRATE_TO_NPM.md) for historical context on the pre-rc.12 procedure.
+ --- ## Community integrations @@ -236,9 +264,10 @@ dkg integration list [--tier community] # default tier filter is `verified`+ dkg integration info # show details for one entry dkg integration install # install cli/mcp kind; --allow-community for community-tier entries -# Update / rollback -dkg update [--check] [--allow-prerelease] # update node software +# Update / rollback / diagnose +dkg update [--check] [--allow-prerelease] # update node software via npm registry dkg rollback # roll back to previous version +dkg doctor [--json] # diagnostic report: install layout, version skew, orphan clones, UI mismatch, plugin root, config sanity ``` Run `dkg --help` for per-command options. @@ -311,6 +340,109 @@ analysis reports are under `bench/results/profiles/`, including --- +## Triple Store Backends + +A DKG node keeps every assertion in an [RDF](https://www.w3.org/RDF/) triple store. Out of the box the node runs an embedded [Oxigraph](https://github.com/oxigraph/oxigraph) instance, which is everything you need on a workstation — no extra process, no extra port, no extra config. Heavier deployments can swap in [Blazegraph](https://blazegraph.com/) (the mainnet store) or any SPARQL 1.1 server. + +| Backend | When to pick it | +|---|---| +| `oxigraph-worker` (default) | Single-operator nodes, dev, CI. No setup. File-backed, capped at process RAM. | +| `blazegraph` | High-throughput nodes, mainnet parity, very large graphs (10M+ quads). Run as a separate daemon (Docker or `java -jar`). Shares cleanly with V6 / V8 instances — DKG scopes its writes to the `did:dkg:context-graph:` named-graph prefix. | +| `sparql-http` | Any SPARQL 1.1 Protocol server (Fuseki, GraphDB, Stardog, Neptune…). Bring your own URL + (optional) auth header. | + +### Configure via `dkg init` + +Two paths: + +**1. 
Point at an existing Blazegraph instance:** + +``` +$ dkg init +… +Triple store backend (oxigraph / blazegraph) (oxigraph): blazegraph +Blazegraph SPARQL endpoint URL: http://127.0.0.1:9999/bigdata/namespace/mynode/sparql + Store endpoint reachable: blazegraph http://127.0.0.1:9999/bigdata/namespace/mynode/sparql +``` + +**2. Let `dkg init` provision a Blazegraph container via Docker:** + +``` +$ dkg init +… +Triple store backend (oxigraph / blazegraph) (oxigraph): blazegraph +Blazegraph SPARQL endpoint URL: ← leave blank +No URL provided. Provision a Blazegraph container via Docker? (y/n) (y): y + Starting Blazegraph in Docker (namespace: mynode)… + Docker available: Docker version 24.0.6, build ed223bc + Created container "dkg-blazegraph-mynode" on port 9999. + Created Blazegraph namespace "mynode". +``` + +The Docker provisioner pins `lyrasis/blazegraph:2.1.5` (same image and tag as mainnet and the devnet test fixture), uses `--restart unless-stopped`, auto-bumps the host port if 9999 is taken, and is idempotent — re-running `dkg init` against an already-provisioned namespace reuses the running container. + +The wizard validates non-Docker URLs via an `ASK { ?s ?p ?o }` probe before saving — typos or unreachable namespaces are caught at setup time, not at first boot. A 404 surfaces a specific "namespace likely doesn't exist" message rather than the generic network-failure hint. 
+ +### Configure via flags (scripted setup) + +Every setup entry point honours `--store` / `--store-url`: + +```bash +# Init only +dkg init --store blazegraph --store-url http://127.0.0.1:9999/bigdata/namespace/mynode/sparql + +# Adapter setups (validated + persisted after the adapter step completes) +dkg hermes setup --store blazegraph --store-url http://blaze.example/sparql +dkg openclaw setup --store blazegraph --store-url http://blaze.example/sparql +dkg mcp setup --store blazegraph --store-url http://blaze.example/sparql +``` + +`--store oxigraph` on a previously-Blazegraph node clears the persisted block (force-fall-back to the local default). + +### Configure via `~/.dkg/config.json` + +```json +{ + "store": { + "backend": "blazegraph", + "options": { + "url": "http://127.0.0.1:9999/bigdata/namespace/mynode/sparql", + "managedByDkg": false + } + } +} +``` + +For `sparql-http`: + +```json +{ + "store": { + "backend": "sparql-http", + "options": { + "queryEndpoint": "http://server.example/query", + "updateEndpoint": "http://server.example/update", + "auth": "Bearer YOUR_TOKEN" + } + } +} +``` + +### What changes when you pick an external backend + +- **Boot-time health check**: the daemon refuses to start until the endpoint answers `ASK`. Unreachable URLs print an actionable message naming the URL — no half-broken daemon. +- **Namespace identity tag**: on first boot the daemon writes a triple into a reserved `` graph recording its node name. Subsequent boots verify the tag before doing any writes — two DKG nodes pointed at the same Blazegraph namespace can't silently corrupt each other any more. Mismatches print the cleanup recipe (`DELETE WHERE { ... }`). +- **Backend-aware reset**: chain-reset (and rebooting against a different backend) scopes its `DELETE` to the `did:dkg:context-graph:` prefix, leaving any V6/V8 data on the same Blazegraph instance untouched. Docker-provisioned namespaces (`managedByDkg: true`) use the faster `DROP ALL` path. 
+- **Backend-switch guard**: switching backends between boots is treated like a destructive operation. The daemon prints a multi-line warning and refuses to start unless you set `DKG_ACCEPT_STORE_RESET=1`. Reverting `store.backend` in your config recovers the previous backend's data. +- **Metrics**: `/api/status` exposes `storeUrl` and `storeQuads` (cached for 30 s) instead of the `storeBytes` file size — quad count is what's meaningful when the store isn't a local file. +- **Required config**: when you enable `largeLiteralStorage` or `sharedMemoryPublicSnapshotStorage` with an external backend, you must set their `directory` explicitly (no local store path to infer from). The daemon fails fast at config-load if either is missing. + +### Limitations + +- **Auth / TLS**: only the generic `sparql-http` backend accepts an `Authorization` header. For Blazegraph behind auth or HTTPS-with-custom-CA, run a reverse proxy in front of it for now. +- **Migration tool**: there is no `dkg migrate-store` between backends. Plan a chain-reset window if you need to switch on a node that holds important non-VM state. + +--- + ## Testnet Funding A DKG testnet node needs Base Sepolia ETH (to pay gas for on-chain operations) and test TRAC (for staking and publishing). The Origin Trail testnet faucet hands out both in a single API call, so first-setup paths auto-fund the generated admin wallet plus the three operational wallets when a faucet is configured in the network config. diff --git a/devnet/agent-provenance/README.md b/devnet/agent-provenance/README.md index 3206a4410..562de3356 100644 --- a/devnet/agent-provenance/README.md +++ b/devnet/agent-provenance/README.md @@ -14,7 +14,7 @@ prefer running them before falling back to the manual recipes below. | Suite | Scope | Runtime | Command | | --- | --- | --- | --- | | Hardhat e2e | All 10 sequence diagrams from `RFC-001-implementation-walkthrough.md` (incl. 
Phase 4 author override + pre-signed AuthorAttestation), run against an in-process Hardhat EVM. Covers contract correctness + publisher integration in a single process. | ~30s | `pnpm test:e2e:agent-provenance` | -| 5-node devnet | All 4 modes (a/b/c/d) + negative case from §4 + §9.5, run against `./scripts/devnet.sh start 5`. Mode (b) registers a custodial agent on core 2 and validates `KC.author = agent.wallet` while `publisherNodeIdentityId = core2.id`. | ~35s after devnet is up | `pnpm test:devnet:agent-provenance` | +| 5-node devnet | All 4 modes (a/b/c/d) + negative case from §4 + §9.5, run against `./scripts/devnet.sh start 5`. Mode (b) registers a custodial agent on core 2 and validates `KC.author = agent.wallet` while `publisherNodeIdentityId = core2.id`. Includes a `mode (a) strict` variant that drains edge op-wallets' TRAC to literal zero via `hardhat_setStorageAt` and asserts the publish still succeeds via the PCA — pinning the "publisher EOA holds only gas tokens, spends TRAC from PCA" operator scenario. | ~35s after devnet is up | `pnpm test:devnet:agent-provenance` | Phase 4 author override (RFC §4(b)) is now wired end-to-end via the agent-keystore: end-user agents register on a daemon (`POST diff --git a/devnet/agent-provenance/automated.test.ts b/devnet/agent-provenance/automated.test.ts index 547aa395d..942a82bfe 100644 --- a/devnet/agent-provenance/automated.test.ts +++ b/devnet/agent-provenance/automated.test.ts @@ -239,6 +239,37 @@ async function ensurePcaAccountForOpWallets( return accountId; } +/** + * Zero out the TRAC balance of every op wallet on `node` via direct + * storage writes against the Hardhat TRAC contract. Native ETH is + * untouched, so the wallets keep enough gas to submit publishes. + * + * Token = `Ownable, ERC20, AccessControl` — same layout + * `ensurePcaAccountForOpWallets` exploits to MINT TRAC. ERC20's + * `_balances` mapping is at slot 1 (Ownable's `_owner` takes slot 0). 
+ * Storage key for `mapping(address => uint256)`: + * `keccak256(abi.encode(holder, slot))`. + * + * Used by the gas-only mode (a) variant below to construct the + * literal "publisher EOA has zero TRAC, only ETH for gas" precondition. + */ +async function drainOpWalletTrac(s: DevnetState, node: DevnetNode): Promise<void> { + const tokenAddress = await s.token.getAddress(); + for (const w of node.opWallets) { + const slotKey = ethers.keccak256( + ethers.AbiCoder.defaultAbiCoder().encode( + ['address', 'uint256'], + [w.address, 1n], + ), + ); + await s.provider.send('hardhat_setStorageAt', [ + tokenAddress, + ethers.toQuantity(slotKey), // slot is a JSON-RPC QUANTITY: Hardhat rejects hex with leading zeros + ethers.ZeroHash, + ]); + } +} + async function fetchStatus(node: DevnetNode): Promise<{ identityId: bigint; nodeRole: string }> { const res = await fetch(`http://127.0.0.1:${node.apiPort}/api/status`); if (!res.ok) { @@ -578,6 +609,98 @@ describe('Agent provenance — automated 5-node devnet validation', () => { expect(afterBalance).toBe(beforeBalance); }, 180_000); + // ========================================================================= + // Mode (a) — STRICT: publisher EOA holds ZERO TRAC, only gas (ETH). + // + // Tightens the cost-coverage invariant in mode (a). The plain mode (a) + // asserts "balance unchanged across publish"; that's necessary but + // doesn't pin the actual operator scenario where a publishing agent + // wallet is provisioned with ONLY gas tokens and never holds TRAC. + // + // Setup: + // 1. Reuse / create the mode (a) PCA so every edge op wallet is a + // registered conviction agent (`agentToAccountId != 0`). + // 2. Zero the TRAC `_balances` slot for every edge op wallet via + // `hardhat_setStorageAt`. ETH is untouched — daemon can still + // pay gas. After this step `sumOpBalances(edge) == 0n`. + // + // Action: edge runs `dkg publish` naming core1 for attribution. The + // daemon will pick one of the now-zero-TRAC op wallets as `msg.sender` + // for `KAV10.publish()`.
The conviction branch fires + // (`agentToAccountId[msg.sender] != 0`, `epochs == lockDurationEpochs`, + // not expired) → `NFT.coverPublishingCost` updates `windowSpent` + // without calling `transferFrom(msg.sender, ...)` for TRAC, so the + // publish must succeed even though the EOA's TRAC balance is zero. + // + // Assertions: + // - Pre-publish `sumOpBalances(edge) == 0n` (precondition pinned). + // - Publish status == confirmed. + // - KC author is one of edge's op wallets. + // - `NFT.windowSpent(accountId, currentBillingWindow)` grew. + // - core1's `EpochStorage` publishing-value counter grew. + // - Post-publish `sumOpBalances(edge) == 0n` (the agent EOA's TRAC + // ledger entry was NEVER touched — strongest possible "agent only + // spent gas" assertion). + // + // Side effect: edge op-wallets stay at zero TRAC for the rest of the + // suite. This is fine for mode (c) (the `firstOpAccount !== 0n` skip + // already kicks in once mode (a) has run), mode (d) (unattributed + // publish still covered by PCA), the unauthorized-fall-through + // negative case (Eps-only assertion), and mode (b) (uses core2, + // independent of edge wallets). 
+ // ========================================================================= + it('mode (a) strict — gas-only edge op-wallets (zero TRAC) publish via PCA', async () => { + const s = state.v!; + const core1 = s.nodes[1]!; + const edge = s.nodes[5]!; + if (core1.identityId === 0n) throw new Error('core1 has no identity'); + + const accountId = await ensurePcaAccountForOpWallets(s, edge); + + await drainOpWalletTrac(s, edge); + const drainedBalance = await sumOpBalances(s.token, edge); + expect(drainedBalance).toBe(0n); + + const epoch: bigint = await s.chronos.getCurrentEpoch(); + const beforeWindow: bigint = BigInt(await s.nft.getCurrentBillingWindow(accountId)); + const beforeSpent: bigint = + (await s.nft.windowSpent(accountId, beforeWindow)) + + (await s.nft.windowSpent(accountId, beforeWindow + 1n)); + const beforeEps: bigint = await s.eps.getNodeEpochProducedKnowledgeValue(core1.identityId, epoch); + + const file = makeNquadsFile('mode-a-strict'); + const result = await publishViaCli(edge, CONTEXT_GRAPH, file, { + publisherNodeIdentityId: core1.identityId, + }); + + expect(result.status.toLowerCase()).toBe('confirmed'); + expect(result.kcId).toBeDefined(); + + const onChainAuthor: string = await s.kcs.getLatestMerkleRootAuthor(result.kcId!); + const matchesAnyOpWallet = edge.opWallets.some( + (w) => w.address.toLowerCase() === onChainAuthor.toLowerCase(), + ); + expect(matchesAnyOpWallet).toBe(true); + + const afterWindow: bigint = BigInt(await s.nft.getCurrentBillingWindow(accountId)); + const afterSpent: bigint = + (await s.nft.windowSpent(accountId, beforeWindow)) + + (await s.nft.windowSpent(accountId, beforeWindow + 1n)) + + (afterWindow > beforeWindow + 1n + ? 
await s.nft.windowSpent(accountId, afterWindow) + : 0n); + expect(afterSpent - beforeSpent).toBeGreaterThan(0n); + + const afterEps: bigint = await s.eps.getNodeEpochProducedKnowledgeValue(core1.identityId, epoch); + expect(afterEps).toBeGreaterThan(beforeEps); + + // The strict invariant: every edge op-wallet's TRAC balance is STILL + // ZERO. The conviction branch never called `transferFrom` on the + // publishing agent's TRAC ledger entry — the EOA only spent gas. + const finalBalance = await sumOpBalances(s.token, edge); + expect(finalBalance).toBe(0n); + }, 180_000); + // ========================================================================= // Mode (c) — Same-operator edge + core, no PCA, full TRAC, attribution. // ========================================================================= diff --git a/devnet/edge-update-flow/automated.test.ts b/devnet/edge-update-flow/automated.test.ts new file mode 100644 index 000000000..6906b2f1a --- /dev/null +++ b/devnet/edge-update-flow/automated.test.ts @@ -0,0 +1,104 @@ +/** + * OT-RFC-41 Bundle B — Edge npm-only update + rollback flow integration test. + * + * The unit suite in `packages/cli/test/rfc-41-bundle-b.test.ts` covers + * the LOGIC of `performNpmUpdateEdge` + `dkg rollback` (Edge branch) + * with a mocked `_autoUpdateIo.exec`. That gets us 95% of the + * confidence. The last 5% — that `npm install -g` actually mutates + * disk, that the installed binary actually runs, that the + * previous-version breadcrumb survives a real exec round-trip — is + * what this integration test closes. + * + * The actual orchestration lives in `scripts/devnet-test-edge-update.sh` + * for two reasons: + * + * 1. The script is the form an operator can also run by hand for + * ad-hoc validation. Keeping the wiring in shell means the + * runbook in `docs/devnet/EDGE_UPDATE_VALIDATION.md` is a direct + * copy-paste of what CI executes — no test-runner-specific glue. + * 2. 
Running `npm install -g` requires a clean process env (npm + * reads `NPM_CONFIG_*`, `.npmrc`, etc. up front, not per-spawn), + * so a child process boundary is the path of least resistance. + * Vitest+forks gives us that boundary naturally. + * + * This file is the thinnest possible vitest wrapper: it shells out to + * the script, asserts a clean exit code, and surfaces failures with + * the script's own stderr so the failure report is grep-friendly. + * + * Preconditions: + * - `pnpm install` + `pnpm build` from the repo root. + * - `npx -y verdaccio --version` works on the host (the script + * uses `npx -y verdaccio@latest` to boot the registry). + * + * Runtime: ~3-5 minutes — the publish stage iterates ~15 public + * workspace packages and the install stage does two `npm install -g` + * round-trips with the MarkItDown postinstall. + * + * Run via `pnpm test:devnet:edge-update-flow` from the repo root. + */ +import { describe, it, expect } from 'vitest'; +import { spawn } from 'node:child_process'; +import { resolve, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { existsSync } from 'node:fs'; + +const HERE = dirname(fileURLToPath(import.meta.url)); +const REPO_ROOT = resolve(HERE, '..', '..'); +const SCRIPT = resolve(REPO_ROOT, 'scripts', 'devnet-test-edge-update.sh'); + +describe('RFC-41 Bundle B — Edge npm-only update + rollback round-trip', () => { + it('runs scripts/devnet-test-edge-update.sh to completion', async () => { + expect(existsSync(SCRIPT), `expected ${SCRIPT} to exist`).toBe(true); + + const stdoutChunks: Buffer[] = []; + const stderrChunks: Buffer[] = []; + + const proc = spawn('bash', [SCRIPT], { + cwd: REPO_ROOT, + env: { + ...process.env, + // Honour any operator overrides the runbook documents, but + // do not silently inject anything else — keep this transparent + // for debugging. 
+ }, + stdio: ['ignore', 'pipe', 'pipe'], + }); + + proc.stdout.on('data', (c: Buffer) => { + stdoutChunks.push(c); + // Stream the script's progress so a stalled stage is visible + // in CI logs without having to wait for the test to time out. + process.stdout.write(c); + }); + proc.stderr.on('data', (c: Buffer) => { + stderrChunks.push(c); + process.stderr.write(c); + }); + + const exitCode: number = await new Promise((res, rej) => { + proc.once('error', rej); + proc.once('close', (code) => res(code ?? -1)); + }); + + if (exitCode !== 0) { + const stderr = Buffer.concat(stderrChunks).toString('utf-8'); + const stdout = Buffer.concat(stdoutChunks).toString('utf-8'); + const tail = (s: string, n: number) => s.split('\n').slice(-n).join('\n'); + throw new Error( + `devnet-test-edge-update.sh exited ${exitCode}\n` + + `--- last 40 lines of stderr ---\n${tail(stderr, 40)}\n` + + `--- last 40 lines of stdout ---\n${tail(stdout, 40)}`, + ); + } + + const stdout = Buffer.concat(stdoutChunks).toString('utf-8'); + // Final assertion lines emitted by the script — anchor the test + // to the production contract so a regression that quietly skips + // a stage still fails here. 
+ expect(stdout).toMatch(/\[edge-update\] v1 installed and reports version /); + expect(stdout).toMatch(/\[edge-update\] previous-version=.* \(matches v1\)/); + expect(stdout).toMatch(/\[edge-update\] binary now reports /); + expect(stdout).toMatch(/\[edge-update\] binary back to .* after rollback — round-trip complete/); + expect(stdout).toMatch(/\[edge-update\] PASS — Edge npm-only update \+ rollback round-trip/); + }); +}); diff --git a/devnet/edge-update-flow/package.json b/devnet/edge-update-flow/package.json new file mode 100644 index 000000000..f4377b0fd --- /dev/null +++ b/devnet/edge-update-flow/package.json @@ -0,0 +1,12 @@ +{ + "name": "@origintrail-official/dkg-devnet-edge-update-flow", + "version": "0.0.0", + "private": true, + "type": "module", + "scripts": { + "test:devnet": "vitest run --config vitest.config.ts" + }, + "devDependencies": { + "vitest": "4.0.18" + } +} diff --git a/devnet/edge-update-flow/vitest.config.ts b/devnet/edge-update-flow/vitest.config.ts new file mode 100644 index 000000000..931c27d7a --- /dev/null +++ b/devnet/edge-update-flow/vitest.config.ts @@ -0,0 +1,45 @@ +import { defineConfig } from 'vitest/config'; +import { resolve } from 'node:path'; + +/** + * OT-RFC-41 Bundle B — Edge npm-only update + rollback flow. + * + * Wraps `scripts/devnet-test-edge-update.sh`, which spins up a local + * verdaccio registry, packs + publishes every public workspace + * package, installs the CLI globally into a scratch npm prefix, + * exercises the full `dkg update` → `dkg rollback` round-trip against + * a real `npm install -g`, and asserts the production + * `performNpmUpdateEdge` + `dkg rollback` (Edge branch) code paths + * actually mutate disk the way unit tests expect. + * + * This is the §6.2 testing-gap closer for RFC-41: unit tests in + * `packages/cli/test/rfc-41-bundle-b.test.ts` cover the LOGIC with a + * mocked `_autoUpdateIo.exec`; this suite covers the INTEGRATION with + * the real npm CLI. 
+ * + * Pre-requisites (see `docs/devnet/EDGE_UPDATE_VALIDATION.md`): + * - `pnpm install` from the repo root + * - `pnpm build` (so `packages/* /dist/` exists for `pnpm pack`) + * - `npx -y verdaccio --version` works on the host (warm the cache) + * + * Run: `pnpm test:devnet:edge-update-flow` + * + * Runtime: ~3-5 minutes. Dominated by verdaccio cold-start and two + * `npm install -g` round-trips against the scratch prefix. + */ +export default defineConfig({ + test: { + include: [resolve(import.meta.dirname, 'automated.test.ts')], + testTimeout: 600_000, + hookTimeout: 60_000, + pool: 'forks', + sequence: { concurrent: false }, + globals: false, + }, + resolve: { + modules: [ + resolve(import.meta.dirname, '../../node_modules'), + 'node_modules', + ], + }, +}); diff --git a/docs/operator/MIGRATE_TO_NPM.md b/docs/archive/MIGRATE_TO_NPM.md similarity index 70% rename from docs/operator/MIGRATE_TO_NPM.md rename to docs/archive/MIGRATE_TO_NPM.md index ee0440290..a86634a7a 100644 --- a/docs/operator/MIGRATE_TO_NPM.md +++ b/docs/archive/MIGRATE_TO_NPM.md @@ -1,8 +1,36 @@ +# ARCHIVED — Migrate a git-checkout install to the npm path + +> **This document is archived as of rc.12 (OT-RFC-41 §5 PR 6).** +> The `dkg migrate-to-npm` command and the procedures below have been removed from the CLI. They are kept here for historical reference only. +> +> **What replaced this:** +> - **Edge nodes (default):** First-start migration is automatic. On first `dkg start` under rc.12, an Edge node with a legacy `~/.dkg/releases/` tree from a pre-rc.12 install records the active slot's version into `~/.dkg/previous-version` (so `dkg rollback` keeps working) and resumes running directly from the npm-global install. No operator action is needed. The legacy `~/.dkg/releases/` directory is left in place — `dkg doctor` will flag it as cleanable legacy state; `rm -rf ~/.dkg/releases/` is safe once you've verified the new install runs. +> - **Core nodes:** Keep using blue-green slots. 
Updates flow through `dkg update` (npm install into the inactive slot + atomic swap). No migration needed unless you were on a git-checkout install — in that case, follow the (historical) procedure below while still on a pre-rc.12 release (the `dkg migrate-to-npm` command no longer exists in rc.12), then explicitly set `autoUpdate.source: "npm"` in `~/.dkg/config.json`. +> - **Diagnostics:** `dkg doctor` is the canonical "what does my install look like?" command. It reports an 18-field state summary plus six anomaly checks (install layout, version skew, orphan repos, plugin root, served-UI mismatch, config sanity). Use it before reasoning about install-state issues. +> +> See [OT-RFC-41](https://github.com/OriginTrail/dkgv10-spec/blob/main/rfcs/OT-RFC-41-edge-node-npm-only-install-and-update.md) for the full design. + +--- + # Migrate a git-checkout install to the npm-pinned auto-update path This guide is for operators currently running a DKG node from a `git clone`d checkout (typical layout: `~/dkg-v9/` with `.git`, `packages/`, `node_modules/`, `pnpm-lock.yaml`, `package.json`). It walks through converting that install to use the npm-pinned auto-update path without re-installing. -The end-state: the daemon's auto-updater fetches pre-built artifacts of a specific `@origintrail-official/dkg` version from npm into `~/.dkg/releases/{a,b}/`, instead of building from source against the tracked git branch on every update cycle. +The end-state: the daemon's auto-updater fetches pre-built artifacts of a specific `@origintrail-official/dkg` version from npm, instead of building from source against the tracked git branch on every update cycle. + +## rc.12 changes — read first + +This runbook predates [OT-RFC-41](https://github.com/OriginTrail/dkgv10-spec/blob/main/rfcs/OT-RFC-41-edge-node-npm-only-install-and-update.md) ("Edge Node NPM-Only Install and Update"). 
The RFC ships in rc.12 and changes several details described below: + +- **`dkg migrate-to-npm` was removed in rc.12.** The migration logic is automated: on first daemon start under rc.12, an Edge node with a legacy `~/.dkg/releases/` tree records the active slot's version into `~/.dkg/previous-version` (so `dkg rollback` continues to work) and resumes running from the npm-global install. No operator action required. This runbook is retained for historical context and for operators upgrading on an older release that still has the command. +- **`install.sh` was removed in rc.12.** Use `npm install -g @origintrail-official/dkg` for fresh installs. +- **Edge nodes no longer use blue-green release slots in rc.12.** The daemon runs directly from the npm-global install (`/usr/local/lib/node_modules/@origintrail-official/dkg/`); `~/.dkg/releases/` is not created. Core nodes (operators running `dkg init --role core`) retain blue-green slots — they still earn their keep on the 24/7 SLA. +- **`dkg update` semantics differ by `nodeRole` in rc.12.** Edge: `npm install -g @origintrail-official/dkg@<version>` + restart. Core: in-slot install + atomic symlink swap, unchanged from prior behaviour. +- **Route plugins now resolve from `~/.dkg/plugins/node_modules/`** (a stable, install-mode-independent root). If you operate a fork with bare-name `routePlugins` entries previously installed via `npm install -g <plugin>`, re-install them with `npm install --prefix ~/.dkg/plugins <plugin>` so they survive update cycles. +- **`dkg doctor`** ships as a first-class diagnostic command. Run it before reasoning about install-state confusion — it surfaces orphan repository clones, version skew between the CLI and the daemon, served-UI / source mismatch, plugin install root health, and (on Core) blue-green slot health. +- **The `installMode`, `commit`, `commitShort`, `buildTime`, and `distTag` fields are exposed on `/api/status`** (per RFC §4.9). 
When opening a support ticket, paste the output of `curl http://localhost:9200/api/status | jq` rather than just the version string. + +The rest of this document describes the now-removed `dkg migrate-to-npm` flow as it existed pre-rc.12. If you are on rc.11 or earlier and need to migrate before upgrading, follow it. Otherwise upgrade to rc.12 and let the automated migration handle it. ## Why migrate diff --git a/docs/devnet/EDGE_UPDATE_VALIDATION.md b/docs/devnet/EDGE_UPDATE_VALIDATION.md new file mode 100644 index 000000000..b1e052356 --- /dev/null +++ b/docs/devnet/EDGE_UPDATE_VALIDATION.md @@ -0,0 +1,305 @@ +# Edge npm-only update + rollback validation (RFC-41 Bundle B) + +This runbook is the operator-facing companion to +`scripts/devnet-test-edge-update.sh` and the vitest harness at +`devnet/edge-update-flow/automated.test.ts`. It closes RFC-41 §6.2: +end-to-end validation of the Edge node `dkg update` + `dkg rollback` +flow using a real `npm install -g` against a real (local) npm +registry, _without_ requiring a Trace Labs NPM publish. + +The §6.2 gap exists because automated devnet update testing has been +gated on the prerequisites in §6.5 (Trace Labs publishing a `next` +dist-tag with the Bundle B build). Until that lands, this runbook + +script let any contributor exercise the full flow on their own laptop +against a self-hosted `verdaccio` registry. + +## What this validates + +Bundle B's contract (RFC-41 §4.7, §4.8): + +1. `npm install -g @origintrail-official/dkg@<version>` resolves through + the registry and installs the global binary at the npm prefix. +2. `dkg --version` reports the installed version. +3. `dkg update <version>` runs `npm install -g + @origintrail-official/dkg@<version>`, writes the OLD version to + `~/.dkg/previous-version`, and the binary reports the new version. +4. `dkg rollback` reads `~/.dkg/previous-version`, runs `npm install + -g @origintrail-official/dkg@<previous-version>`, and the binary + reports the previous version. 
Round-trip lands back on the + starting version. + +What this does **not** validate (separate suites cover these): + +- `dkg init --role edge` flow — covered by Bundle B unit tests in + `packages/cli/test/rfc-41-bundle-b.test.ts` (monorepo guard, + `--role` parsing, config layout). The runbook below bootstraps a + minimal `$DKG_HOME/config.json` directly to keep the focus on the + update + rollback path. +- Daemon HTTP behavior under the new install — covered by + `devnet/v10-core-flows/` once the daemon is started against an + Edge-mode `config.json`. +- Core slot-based update — RFC-41 §4.7.2 keeps that path for Core + nodes; this runbook is Edge-only. +- Build-info / install-mode telemetry — Bundle A's `/api/status` + + `dkg doctor` output is covered by Bundle A's unit suite. + +## Prerequisites + +1. **Built workspace.** `pnpm pack` packs `dist/`, not source; without + a prior `pnpm build` the packed tarballs are stubs. + ```bash + pnpm install + pnpm build + ``` +2. **Network access for `npx`.** The script boots a pinned + verdaccio (default `verdaccio@6.7.2`, see `VERDACCIO_VERSION` + in `scripts/devnet-test-edge-update.sh`) via `npx`. Warm the + npx cache once: + ```bash + npx -y verdaccio@6.7.2 --version + ``` + Override the pin per-run with `VERDACCIO_VERSION=<version> + ./scripts/devnet-test-edge-update.sh` when debugging against + a different verdaccio. Bumping the default pin should be a + deliberate edit + a manual re-run against the updated runbook. +3. **No conflicting verdaccio.** Port `4873` must be free, or set + `VERDACCIO_PORT=<port>` when invoking the script. +4. **Node ≥ what the repo requires.** Use `nvm use` (reads `.nvmrc`) + to pin the right version. + +The script does NOT touch: + +- The host's `~/.dkg` (uses `DKG_HOME=<scratch-root>/dkg-home`). +- The host's npm global prefix (uses `NPM_CONFIG_PREFIX=<scratch-root>/npm-global`). +- The host's `~/.npmrc` (uses `NPM_CONFIG_USERCONFIG=<scratch-root>/.npmrc`). 
+ +## Run it (automated) + +From the repo root: + +```bash +pnpm test:devnet:edge-update-flow +``` + +That registers the script as a vitest scenario alongside the other +`pnpm test:devnet:*` suites, gives you streamed progress, and fails +loud with the script's own stderr captured. + +To debug a failure, re-run the script directly with the scratch root +preserved: + +```bash +EDGE_UPDATE_KEEP_SCRATCH=1 ./scripts/devnet-test-edge-update.sh +``` + +The path it printed under `[edge-update] scratch root:` survives the +exit trap and contains: + +- `dkg-home/` — `config.json`, `previous-version`, etc. +- `npm-global/` — the scratch npm prefix with `bin/dkg`. +- `verdaccio-storage/` — the local registry's package storage. +- `verdaccio.log` — the registry's stderr. +- `tarballs/` — every packed tarball (v1 for all public packages, + plus v2 of the CLI). +- `.npmrc` — the registry/auth config the script used. + +## Run it (manual, step by step) + +If the script is failing in a way that's hard to read from logs alone, +here's the same flow broken into copy-pasteable shell. Same env vars, +same paths. + +### 1. Scratch root + env + +```bash +export SCRATCH_ROOT="$(mktemp -d -t dkg-edge-update.XXXXXX)" +export NPM_CONFIG_PREFIX="$SCRATCH_ROOT/npm-global" +export DKG_HOME="$SCRATCH_ROOT/dkg-home" +export NPM_CONFIG_USERCONFIG="$SCRATCH_ROOT/.npmrc" +export NPM_CONFIG_REGISTRY="http://127.0.0.1:4873/" +export PATH="$NPM_CONFIG_PREFIX/bin:$PATH" +mkdir -p "$NPM_CONFIG_PREFIX" "$DKG_HOME" "$SCRATCH_ROOT/tarballs" + +cat > "$NPM_CONFIG_USERCONFIG" < "$SCRATCH_ROOT/verdaccio-config.yaml" <"$SCRATCH_ROOT/verdaccio.log" 2>&1 & +echo "verdaccio pid: $!" + +# Wait until /ping returns 200. +until curl -fsS "$NPM_CONFIG_REGISTRY/-/ping" >/dev/null; do sleep 1; done +``` + +### 3. Pack + publish v1 of every public package + +```bash +cd +V1="$(node -p "require('./packages/cli/package.json').version")" +echo "v1 = $V1" + +# Discover public packages. 
+for d in packages/*; do + [ -f "$d/package.json" ] || continue + IS_PRIVATE="$(node -p "JSON.parse(require('fs').readFileSync('$d/package.json','utf-8')).private === true")" + [ "$IS_PRIVATE" = "true" ] && continue + ( cd "$d" && pnpm pack --pack-destination "$SCRATCH_ROOT/tarballs" >/dev/null ) +done + +for tgz in "$SCRATCH_ROOT/tarballs"/*.tgz; do + npm publish "$tgz" +done +``` + +### 4. Bump CLI to v2 and publish + +```bash +V2="${V1}-edge-update-test.1" +cp packages/cli/package.json "$SCRATCH_ROOT/cli-package.json.bak" +node -e " + const fs = require('fs'); + const p = JSON.parse(fs.readFileSync('packages/cli/package.json','utf-8')); + p.version = '$V2'; + fs.writeFileSync('packages/cli/package.json', JSON.stringify(p, null, 2) + '\n'); +" +( cd packages/cli && pnpm pack --pack-destination "$SCRATCH_ROOT/tarballs" >/dev/null ) +npm publish "$SCRATCH_ROOT/tarballs/origintrail-official-dkg-${V2}.tgz" +cp "$SCRATCH_ROOT/cli-package.json.bak" packages/cli/package.json +echo "v2 = $V2" +``` + +### 5. Install v1 globally + +```bash +npm install -g "@origintrail-official/dkg@$V1" +which dkg # → $NPM_CONFIG_PREFIX/bin/dkg +dkg --version # → $V1 +``` + +### 6. Bootstrap a minimal Edge config + +(`dkg init` is interactive and asks ~12 questions; we bypass it here +because the focus is the UPDATE flow. Bundle B unit tests cover the +init flow.) + +```bash +cat > "$DKG_HOME/config.json" <` — the headline assertion + +```bash +dkg update "$V2" +cat "$DKG_HOME/previous-version" # → $V1 (rollback breadcrumb) +dkg --version # → $V2 +``` + +### 8. `dkg rollback` + +```bash +dkg rollback +dkg --version # → $V1 (back to starting version) +``` + +### 9. Cleanup + +```bash +kill %1 2>/dev/null # stop verdaccio +rm -rf "$SCRATCH_ROOT" +``` + +## Expected output (script form) + +A passing run looks like: + +``` +[edge-update] scratch root: /tmp/dkg-edge-update.XXXXXX +[edge-update] verdaccio: http://127.0.0.1:4873 ... 
+[edge-update] stage 1: launching verdaccio +[edge-update] verdaccio up after 2s (pid=12345) +[edge-update] stage 2: v1=10.0.0-rc.11 v2=10.0.0-rc.11-edge-update-test.1 +[edge-update] packing public packages at v1 (10.0.0-rc.11) +[edge-update] packed 15 v1 tarballs into ... +[edge-update] packing v2 (CLI only, with bumped version ...) +[edge-update] v2 tarball: .../origintrail-official-dkg-10.0.0-rc.11-edge-update-test.1.tgz +[edge-update] restored .../packages/cli/package.json +[edge-update] stage 3: publishing 16 tarballs to verdaccio +[edge-update] published all tarballs +[edge-update] verdaccio knows CLI v1 + v2 +[edge-update] stage 4: npm install -g @origintrail-official/dkg@10.0.0-rc.11 +[edge-update] v1 installed and reports version 10.0.0-rc.11 +[edge-update] stage 5: bootstrap minimal Edge config at .../dkg-home/config.json +[edge-update] config.json nodeRole=edge, autoUpdate.source=npm +[edge-update] stage 6: dkg update 10.0.0-rc.11-edge-update-test.1 +[edge-update] previous-version=10.0.0-rc.11 (matches v1) +[edge-update] binary now reports 10.0.0-rc.11-edge-update-test.1 +[edge-update] stage 7: dkg rollback (expected back to 10.0.0-rc.11) +[edge-update] binary back to 10.0.0-rc.11 after rollback — round-trip complete +[edge-update] PASS — Edge npm-only update + rollback round-trip ... 
+``` + +## Troubleshooting + +| Symptom | Likely cause | Fix | +|---|---|---| +| `verdaccio did not respond to /-/ping within 60s` | First-time `npx verdaccio` is downloading | Pre-warm with `npx -y verdaccio@6.7.2 --version` | +| `port 4873 in use` | Existing verdaccio | `lsof -i :4873` → kill it, or `VERDACCIO_PORT=4874 ./scripts/devnet-test-edge-update.sh` | +| `pnpm pack failed` | Workspace not built | `pnpm build` from repo root | +| `expected dkg binary at .../bin/dkg after install` | `npm install -g` silently failed | Re-run with `EDGE_UPDATE_KEEP_SCRATCH=1`, then `cat <scratch-root>/verdaccio.log` | +| `dkg update failed` | Pre-flight doctor check found an error-severity issue | Re-run script directly with stderr visible: `bash -x scripts/devnet-test-edge-update.sh 2>&1 \| tee /tmp/dkg-update-log` | +| `previous-version` missing after `dkg update` | `getCurrentCliVersion()` returned empty | Check that v1 tarball includes a non-empty `package.json#version` | +| Test passes locally, fails in CI | CI host has a stale npm global prefix | Confirm CI uses an ephemeral runner; the script's scratch prefix is per-run | + +## Tying back to RFC-41 + +Run this **before** you merge any change that touches: + +- `packages/cli/src/daemon/auto-update.ts` (especially + `performNpmUpdateEdge` and `_performNpmUpdateInnerEdge`). +- `packages/cli/src/cli.ts` `dkg update` / `dkg rollback` action + handlers. +- `packages/cli/src/migration.ts` `noteEdgeLegacyReleases`. +- Any code under `packages/cli/src/daemon/manifest.ts` that exports + `_autoUpdateIo`. + +It's also a pre-merge gate for the RFC-41 follow-up PRs (deletion of +the dead git-build code paths) once Bundle B has soaked on devnet — +the script ensures the deletions did not accidentally break the +update flow that survived the cleanup. 
diff --git a/docs/onboarding/04-package-map.md b/docs/onboarding/04-package-map.md index 673abf6ab..d34fac01b 100644 --- a/docs/onboarding/04-package-map.md +++ b/docs/onboarding/04-package-map.md @@ -195,7 +195,7 @@ The `dkg` command-line tool. Provides commands for node lifecycle (`init`, `star ### @origintrail-official/dkg-node-ui `packages/node-ui/` -A dashboard backend and React frontend for monitoring a running DKG node. The backend provides `DashboardDB` (SQLite-based metrics, operation tracking, chat history, query logs), `StructuredLogger`, `MetricsCollector`, `OperationTracker`, and OpenTelemetry integration. The frontend (built with Vite) provides a visual dashboard with charts (Recharts), a SPARQL query editor (CodeMirror), and a knowledge graph explorer (using `graph-viz`). +A dashboard backend and React frontend for monitoring a running DKG node. The backend provides `DashboardDB` (SQLite-based metrics, operation tracking, chat history, query logs), `MetricsCollector`, `OperationTracker`, and OpenTelemetry integration. The frontend (built with Vite) provides a visual dashboard with charts (Recharts), a SPARQL query editor (CodeMirror), and a knowledge graph explorer (using `graph-viz`). **Depends on**: `core`, `graph-viz`. diff --git a/docs/setup/SETUP_CUSTOM.md b/docs/setup/SETUP_CUSTOM.md index e3dfbb2e3..d3df679b4 100644 --- a/docs/setup/SETUP_CUSTOM.md +++ b/docs/setup/SETUP_CUSTOM.md @@ -280,11 +280,27 @@ await node.start(); // Option A: Oxigraph directly const store = new OxigraphStore(); -// Option B: Any registered backend via the factory +// Option B: Any registered backend via the factory. +// +// Use this when you want the same backend the daemon uses — Blazegraph +// or any SPARQL 1.1 server (Fuseki, GraphDB, Stardog, …). The factory +// runs whichever adapter was registered for the named backend; see the +// "Triple Store Backends" section of the repo README for the full +// matrix and config shapes. 
+// // const store = await createTripleStore({ // backend: 'blazegraph', // options: { url: 'http://127.0.0.1:9999/bigdata/namespace/mynode/sparql' }, // }); +// +// const store = await createTripleStore({ +// backend: 'sparql-http', +// options: { +// queryEndpoint: 'http://server.example/query', +// updateEndpoint: 'http://server.example/update', +// auth: 'Bearer YOUR_TOKEN', // optional +// }, +// }); const router = new ProtocolRouter(node); const gossip = new GossipSubManager(node); diff --git a/docs/specs/SPEC_LU11_CHUNKED_CIPHERTEXT_COMMITMENT.md b/docs/specs/SPEC_LU11_CHUNKED_CIPHERTEXT_COMMITMENT.md new file mode 100644 index 000000000..380ebe8bb --- /dev/null +++ b/docs/specs/SPEC_LU11_CHUNKED_CIPHERTEXT_COMMITMENT.md @@ -0,0 +1,119 @@ +# LU-11: Chunked Ciphertext Commitment for Curated VM Publish + +**Status**: Draft — design delta for discussion. +**Author**: agent (claude-opus-4.7), drafting against feedback from the random-sampling agent on PR #595 / RFC-39. +**Depends on**: OT-RFC-38 LU-6 Phase B (PR [#610](https://github.com/OriginTrail/dkg/pull/610)) substrate. +**Unblocks**: OT-RFC-39 (curated random sampling), PR #114. + +--- + +## 1. Problem statement + +OT-RFC-38 §5.4.1 (in `docs/specs/SPEC_CG_HOSTING_MEMBERSHIP.md` on the LU-6 stack) specifies that the curated `ACKRequest` carries per-SWM-message ciphertext-chunk digests + a `ciphertextChunksRoot`, indexed under `(contextGraphId, batchId, swmMessageIndex)`. The spec further mandates a "persist-before-sign" invariant: cores MUST durably persist + index every chunk they intend to ACK before signing. + +**The current Phase A implementation does none of this.** Instead, the curated VM-publish path: + +1. Reads from SWM (which IS fed per-message via `swmHostModeStore.append` — confirmed by SCENARIO E of `devnet-test-rfc38-late-joiner.sh`). +2. Decrypts member-side, materialises the merged plaintext. +3. 
Re-encrypts the **entire merged plaintext** with a single chain-key AES-256-GCM blob via `v10-publish-payload.ts:encryptInlinePayload`. +4. Ships that one opaque blob inline as `PublishIntent.stagingQuads` (`storage-ack-handler.ts:197-260`). +5. Cores stage the blob under TTL, sign the existing V10 digest the publisher claimed, with **no per-message chunk linkage** to what they hold in `swmHostModeStore`. + +So today there is **no cryptographic binding between the on-chain commitment and the per-SWM-message ciphertext cores actually host**. RFC-39 curated random sampling — which needs to pick a per-message chunk by index and verify against an on-chain root — therefore has nothing well-defined to sample. + +**LU-11 (Chunked Ciphertext Commitment, short form CCC) closes this gap** by making the curated VM-publish path produce a `ciphertextChunksRoot` over the same per-message ciphertexts that SWM gossiped, and threading that root through to the ACK envelope (§5.4.1) and on-chain (RFC-39 §3.4). + +## 2. Today's curated publish, in three call sites + +| Step | File | Behavior | +|---|---|---| +| Member-side: per-message SWM gossip envelope | `dkg-agent.ts:publishWorkspaceGossip` → `swmHostModeStore.append` on receivers | Per-message ciphertext keyed by `seqno`, signed by `agentAddress`. Already correct shape for LU-11 — this is the substrate. | +| Member-side: aggregate + chain-key AEAD | `agent._resolveEncryptInlinePayload` → `core/v10-publish-payload.ts:encryptInlinePayload` | Concatenates all SWM-derived plaintext, encrypts in one AES-256-GCM call with a derived chain key. **This is where the chunking is lost.** | +| Core-side: ACK without chunk verification | `publisher/storage-ack-handler.ts:197-260` | Receives one `stagingQuads` blob, persists opaquely, signs V10 digest the publisher claimed. No `ciphertextChunks[]`, no `ciphertextChunksRoot`, no `swmMessageIndex` cross-reference. | + +## 3. 
Target behavior + +Per spec §5.4.1: + +- Curator emits **N** per-SWM-message ciphertexts `ct_1 .. ct_N` keyed to `swmMessageIndex_1 .. swmMessageIndex_N` (the SWM seqnos cores already hold under `swmHostModeStore`). +- Curator computes `ciphertextChunksRoot = merkleRoot([H(ct_i) for i in 1..N])`. +- ACK envelope (`ackProtocolVersion: 2`) carries `ciphertextChunks[]` + `ciphertextChunksRoot`; bytes stay in `swmHostModeStore` (cores already have them via gossip — no second copy on the ACK wire). +- Core verifies it holds every `ct_i` at `(contextGraphId, batchId, swmMessageIndex_i)` before signing. Missing chunks → `ChunkPullRequest` fallback (§5.4.3) or `DECLINE`. +- On-chain: `KnowledgeAssetsV10.PublishParams` gains a `ciphertextChunksRoot bytes32` field (RFC-39's contract change). Curated random sampling weights against this root; public CGs pass `bytes32(0)` and use the existing leaf-root path. + +## 4. Two convergence options for the publisher + +The architectural question is **what ciphertexts the publisher should emit per-message**: + +### Option A — Drop chain-key re-encryption, use SWM sender-key ciphertexts as authoritative + +- Publisher reads SWM, materialises plaintext, **does not re-encrypt**. The SWM sender-key envelopes ARE the authoritative ciphertext. +- `ct_i = swmHostModeStore.iterate(cgId).map(entry => entry.envelopeBytes)`. +- `ciphertextChunksRoot = merkleRoot([keccak256(ct_i)])`, leaves indexed by SWM `seqno`. +- Cores already hold `ct_i` under `(cgId, seqno)` — `swmMessageIndex == seqno`, zero translation. + +**Pros**: Simplest. Single ciphertext per message, no double-encryption overhead, perfect 1:1 mapping with the substrate. The "persist-before-sign" invariant becomes trivially satisfied because SWM ingest IS the persistence. + +**Cons**: Couples VM persistence key to SWM sender keys. Sender keys rotate (LU-4), so the on-chain commitment effectively binds to a key generation the curator can revoke. 
**Member key rotation could orphan an on-chain commitment** — once the old sender key is forgotten, the ciphertext is undecryptable even by members. This is a real problem: today's chain-key re-encryption exists precisely to give the publish a separate, stable key independent of member-state churn. + +### Option B — Keep chain-key AEAD, but chunk it 1:1 with SWM messages + +- Publisher reads SWM, materialises plaintext, re-encrypts **per-SWM-message** with the chain key — one AEAD call per source message instead of one over the whole batch. +- `ct_i = AES-GCM(chainKey, nonce_i, plaintext_i)` where `plaintext_i` is the i-th decrypted SWM envelope's payload and `nonce_i = HKDF(batchId || swmMessageIndex_i)` (deterministic from public inputs). +- `ciphertextChunksRoot = merkleRoot([keccak256(ct_i)])`, leaves indexed by `swmMessageIndex_i`. +- Cores hold the chain-key ciphertext `ct_i` keyed by `(cgId, batchId, swmMessageIndex_i)` — a **new index alongside SWM seqno**, both populated by the same ingest. + +**Pros**: Preserves the existing key-separation invariant (sender keys rotate freely without orphaning on-chain commitments). Drop-in for the existing `chain-key AEAD` security story. + +**Cons**: Two ciphertext copies per message at core ingest (sender-key envelope for member catchup, chain-key chunk for ACK verification). ~2x storage on cores for curated CGs. More code: ingest path needs to materialise the chain-key chunk alongside the sender-key envelope. + +### Recommendation: **Option B** + +Option A's "member key rotation orphans the on-chain commitment" risk is unacceptable for a permanent attestation surface. Mainnet curators MUST be able to rotate sender keys (member revocation, post-compromise) without losing access to prior on-chain attestations. + +The 2x storage cost is bounded by the existing `swmHostModeStore` retention policy and is small in absolute terms (curated CGs are a fraction of total traffic; ciphertext is already roughly plaintext-sized). 
The "two ciphertexts per message" framing is also slightly misleading — the sender-key envelope is short-lived (members consume + ack), while the chain-key chunk is the long-lived persisted artefact tied to the batch's `epochs`. + +## 5. Implementation plan (this PR) + +Phase-gated commits, each independently mergeable: + +| # | Commit | Touches | Verifiable when | +|---|---|---|---| +| 1 | **Design delta** (this doc) | `docs/specs/SPEC_LU11_CHUNKED_CIPHERTEXT_COMMITMENT.md` | Other-team review-approved. | +| 2 | **Chunked AEAD helper** in `@origintrail-official/dkg-core` | `core/v10-publish-payload.ts:encryptInlinePayloadChunked`, deterministic nonce derivation `nonce_i = HKDF(batchId, swmMessageIndex_i)` | Unit test: round-trip N messages, verify deterministic ciphertext, verify Merkle root over `H(ct_i)` matches a known fixture. | +| 3 | **Ciphertext-chunk Merkle builder** | `core/src/v10-merkle-tree.ts:buildCiphertextChunksRoot` (pure function, no chain coupling) | Unit test: 0, 1, 2, 32, 1023 chunks; verify against an oracle implementation. | +| 4 | **ACKRequest v2 wire format** | `core/src/proto/publish-intent.ts` adds optional `ciphertextChunks[]`, `ciphertextChunksRoot`, `ackProtocolVersion` fields. Backwards-compatible: missing fields imply `v1`. | Wire roundtrip test + decode of legacy v1 still works. | +| 5 | **Publisher emit** | `publisher/v10-publish-runner.ts` or wherever `isEncryptedPayload=true` is set: replace `stagingQuads`-as-blob with per-message chunks. SWM seqno → `swmMessageIndex` mapping. | Publish a curated CG with 5 SWM-derived messages, assert ACK request carries 5 `ciphertextChunks[]` with matching SWM seqnos. | +| 6 | **Core verify** | `publisher/storage-ack-handler.ts:197+` branches on `ackProtocolVersion`. For v2: read `ciphertextChunks[]`, look up each in `swmHostModeStore.get(cgId, swmMessageIndex)`, recompute root, decline on `BYTESIZE_MISMATCH` or missing chunks. 
| Devnet test: 2 cores host CG, publish triggers ACK round, both cores verify per-chunk before signing. Replace SCENARIO E's existing assertions with chunk-aware variants. | +| 7 | **ChunkPullRequest fallback** (§5.4.3) | `agent/src/swm/chunk-pull.ts` + wire format. Triggered when ACK verification can't find a chunk locally. | Devnet test: artificially evict a chunk from one core before ACK round, verify it pulls from a peer before signing. | +| 8 | **`ciphertextChunksRoot` to chain** (separates LU-11 publisher emit from RFC-39 contract field) | `chain/evm-adapter.ts` threads the new on-chain field. | Coordinated with RFC-39 contract PR (other agent). Feature-flagged: `bytes32(0)` until both sides shipped. | + +Commits 1-4 are pure-function / wire-format; can land in any order against any base. +Commits 5-6 require Phase B substrate (depends on PR #610 merging or rebasing onto its head). +Commit 7 is a separate sub-feature, could be its own PR. +Commit 8 is the handshake with the RFC-39 contract PR. + +## 6. Open questions + +1. **`swmMessageIndex` namespace**. SWM `seqno` is per-(cgId, host) — different cores may have different seqno counts for the same CG depending on when they started hosting. Spec §5.4.1 says "swmMessageIndex" — must be a curator-assigned monotonic counter (not core-local), threaded into the SWM envelope at publish time. **Add a new `swmMessageIndex` field to the SWM gossip envelope?** Or derive from `(timestamp, hash(payload))`? + +2. **Nonce derivation determinism**. Option B's `nonce_i = HKDF(batchId, swmMessageIndex_i)` must produce a unique nonce per `(batchId, swmMessageIndex)` pair. If a curator re-publishes the same logical batch (e.g. quorum failure → retry), does `batchId` change? If yes, no nonce collision. If no, we re-use a nonce under the same key → catastrophic AES-GCM failure. **Recommendation**: bind `batchId` to `publishOperationId` (unique per attempt) and document the invariant. + +3. **Chunk size policy**. 
§5.4.1 leaves chunk size to the publisher. Per-SWM-message is the obvious unit but means small chunks (~1KB typical) → high AEAD overhead (16-byte tag is ~1.5% of 1KB). Should the curator be allowed to coalesce N SWM messages into one chunk (trading sample granularity for storage efficiency)? **Recommendation**: ship 1:1 SWM message → chunk in v1; revisit coalescing as a separate proposal once we have curated-traffic data. + +4. **Migration**. Existing curated CGs published under Phase A use the inline-blob path with no `ciphertextChunksRoot`. The chain treats `bytes32(0)` as "no curated random-sampling commitment" (RFC-39 feature flag), so they keep working. No migration needed for the substrate. Open question: do we want a curator-driven "re-attest" path to upgrade old publishes? **Recommendation**: no. Old publishes stay as-is; curators publishing fresh batches automatically get the new path once LU-11 + RFC-39 ship. + +## 7. Non-goals for this PR + +- RFC-39's contract change (`ciphertextChunksRoot` on `KnowledgeAssetsV10.PublishParams` + the `_pickWeightedChallenge` branch). That's the other agent's PR. This PR's commit 8 only threads the field through the chain adapter; the contract diff lives in their PR. +- Curated random-sampling proof submission (`RandomSampling.submitProof` curated branch). Also their PR. +- ChunkPullRequest implementation (§5.4.3 fallback) — broken out as commit 7, may split into a follow-up PR depending on review size. +- Coalescing policy / curator-tunable chunk size — deferred per open question §6.3. + +## 8. Acceptance criteria + +- [ ] Other agent (random sampling / RFC-39) signs off on §4 Option B + the on-chain commit 8 handshake shape. +- [ ] SCENARIO E of `devnet-test-rfc38-late-joiner.sh` passes with `ackProtocolVersion: 2` (chunks verified per-message before sign). 
+- [ ] New devnet scenario: publish a curated CG, then independently verify the on-chain `ciphertextChunksRoot` matches a recompute from `swmHostModeStore.iterate()`. +- [ ] Backwards compat: a Phase-A curated CG published before this PR's commit 5 lands continues to be valid (no on-chain root, sampling falls back to leaf-root path). +- [ ] Unit tests for the chunk Merkle builder against a known test-vector set. diff --git a/docs/specs/SPEC_NODE_DASHBOARD.md b/docs/specs/SPEC_NODE_DASHBOARD.md index 1b3dcf98f..b17395e38 100644 --- a/docs/specs/SPEC_NODE_DASHBOARD.md +++ b/docs/specs/SPEC_NODE_DASHBOARD.md @@ -5,6 +5,34 @@ --- +> **2026-05 changelog — V15 of `DashboardDB`** +> +> The original design (below) included a `StructuredLogger` class that +> mirrored every log line into a SQLite `logs` table + FTS5 free-text +> index, exposed via `/api/logs?q=...`. After a production incident in +> which the FTS5 shadow tables grew to multiple GB on a 12-day-old node +> and corrupted the SQLite file, that path was removed: +> +> - `StructuredLogger` (class) — deleted; the dashboard was never wired +> to substitute it for `Logger` in production, so removal was a no-op +> for the daemon. +> - `logs_fts` virtual table + its two triggers — dropped in the V15 +> migration; one-shot `VACUUM` reclaims disk on upgrade. +> - `/api/logs` and `fetchLogs()` — removed; the dashboard log viewer +> uses `/api/node-log` (file-tail over `daemon.log`) which has always +> been the file-backed read path. +> +> The base `logs` table itself was retained: it backs the +> operation-correlated log lookup in `/api/operations/:id` and the +> failed-ops list (simple `WHERE operation_id = ?` queries — no FTS5 +> involved). Retention was lowered from 90 days to 14 to bound table +> growth in the absence of free-text search. +> +> Sections below describing the original FTS5/StructuredLogger design +> are kept for historical context; treat them as superseded. 
+ +--- + ## Overview A unified web interface for operating a DKG node — monitoring, diff --git a/docs/testing/AUTO_UPDATE_LOCAL_TESTING.md b/docs/testing/AUTO_UPDATE_LOCAL_TESTING.md index b13da5bf1..ea63122ec 100644 --- a/docs/testing/AUTO_UPDATE_LOCAL_TESTING.md +++ b/docs/testing/AUTO_UPDATE_LOCAL_TESTING.md @@ -1,5 +1,13 @@ # Auto-Update Local Testing (Practical Runbook) +> **rc.12 note (OT-RFC-41):** This runbook predates RFC-41 and assumes blue-green slot updates. Under rc.12+: +> +> - **Edge nodes** no longer use slots. To validate an Edge update locally, run `npm install -g @origintrail-official/dkg@<version>` and verify `~/.dkg/previous-version` was updated. `dkg rollback` re-installs the previous version from npm. +> - **Core nodes** still use blue-green slots — the rest of this runbook applies as-is when you set `dkg init --role core`. +> - **`install.sh` was removed in rc.12.** From a monorepo checkout, link the local CLI into your test home with `pnpm --filter @origintrail-official/dkg link --global` (or use `node packages/cli/dist/cli.js` directly) instead of `./install.sh`. Set `DKG_HOME` to a scratch directory so the test does not touch your real `~/.dkg/`. +> +> The procedure below is retained for Core operators and historical reference. + This runbook is for validating blue-green auto-update on your local machine before merge.
## 0) Use an isolated DKG home diff --git a/install.sh b/install.sh deleted file mode 100755 index e9bc98d19..000000000 --- a/install.sh +++ /dev/null @@ -1,161 +0,0 @@ -#!/bin/sh -set -e - -DKG_HOME="${DKG_HOME:-$HOME/.dkg}" -REPO_URL="${DKG_REPO:-https://github.com/OriginTrail/dkg-v9.git}" -BRANCH="${DKG_BRANCH:-main}" -BIN_DIR="${BIN_DIR:-$HOME/.local/bin}" - -red() { printf '\033[0;31m%s\033[0m\n' "$1"; } -green() { printf '\033[0;32m%s\033[0m\n' "$1"; } -info() { printf ' %s\n' "$1"; } - -echo "" -echo "DKG V9 Node Installer" -echo "=====================" -echo "" - -# Check prerequisites -command -v node >/dev/null 2>&1 || { red "Error: node is not installed (>= 20 required)."; exit 1; } -NODE_MAJOR=$(node -e "process.stdout.write(String(process.versions.node.split('.')[0]))") -if [ "$NODE_MAJOR" -lt 20 ]; then - red "Error: Node.js >= 20 required (found v$(node -v))." - exit 1 -fi -command -v pnpm >/dev/null 2>&1 || { red "Error: pnpm is not installed. Install with: npm install -g pnpm"; exit 1; } -command -v git >/dev/null 2>&1 || { red "Error: git is not installed."; exit 1; } - -green "Prerequisites OK (node v$(node -v | tr -d v), pnpm $(pnpm -v), git $(git --version | awk '{print $3}'))" -echo "" - -RELEASES_DIR="$DKG_HOME/releases" -SLOT_A="$RELEASES_DIR/a" -SLOT_B="$RELEASES_DIR/b" -SLOT_A_ENTRY="$SLOT_A/packages/cli/dist/cli.js" -SLOT_B_ENTRY="$SLOT_B/packages/cli/dist/cli.js" - -slot_ready() { - slot_path="$1" - entry_path="$2" - [ -d "$slot_path/.git" ] && [ -f "$entry_path" ] -} - -# Pick an install build that leaves a complete runnable checkout. Prefer the -# UI-inclusive `build:runtime` wrapper when present; the auto-updater can use -# `dkgBuild.releaseRuntimeBuildScript` because it builds/verifies the Node UI -# static bundle in a separate step, but fresh install.sh slots do not. 
-runtime_build_script() { - slot_path="$1" - node -e " - try { - const fs = require('fs'); - const pkg = JSON.parse(fs.readFileSync('$slot_path/package.json', 'utf-8')); - const isSafe = (s) => typeof s === 'string' && /^[A-Za-z0-9:_-]+\$/.test(s); - const rrbs = pkg.dkgBuild && pkg.dkgBuild.releaseRuntimeBuildScript; - if (pkg.scripts && typeof pkg.scripts['build:runtime'] === 'string') { - process.stdout.write('build:runtime'); - } else if (isSafe(rrbs) && pkg.scripts && typeof pkg.scripts[rrbs] === 'string') { - process.stdout.write(rrbs); - } else if (pkg.scripts && typeof pkg.scripts['build:runtime:packages'] === 'string') { - process.stdout.write('build:runtime:packages'); - } else { - process.stdout.write('build'); - } - } catch (e) { - process.stdout.write('build'); - } - " -} - -stage_markitdown() { - slot_path="$1" - slot_name="$2" - cli_dir="$slot_path/packages/cli" - script_path="$cli_dir/scripts/bundle-markitdown-binaries.mjs" - if [ ! -d "$cli_dir" ]; then - return - fi - if [ ! -f "$script_path" ]; then - info "Skipping MarkItDown staging in slot $slot_name (this checkout predates bundled MarkItDown support)." - return - fi - info "Staging MarkItDown binary in slot $slot_name ..." - (cd "$cli_dir" && node ./scripts/bundle-markitdown-binaries.mjs --build-current-platform --best-effort) -} - -mkdir -p "$RELEASES_DIR" - -if [ -L "$RELEASES_DIR/current" ] && slot_ready "$SLOT_A" "$SLOT_A_ENTRY" && slot_ready "$SLOT_B" "$SLOT_B_ENTRY"; then - green "Blue-green slots already exist. Skipping clone." - stage_markitdown "$SLOT_A" "a" - stage_markitdown "$SLOT_B" "b" -else - if [ -L "$RELEASES_DIR/current" ]; then - info "Detected incomplete slots. Rebuilding missing/broken slots..." - fi - info "Creating $DKG_HOME ..." - - if slot_ready "$SLOT_A" "$SLOT_A_ENTRY"; then - info "Slot a already exists and is ready." - else - rm -rf "$SLOT_A" - info "Cloning into slot a ..." 
- git clone --branch "$BRANCH" "$REPO_URL" "$SLOT_A" - info "Installing dependencies in slot a ..." - (cd "$SLOT_A" && pnpm install --frozen-lockfile) - # Runtime install build — skips evm-module's hardhat compile while still - # producing the Node UI static bundle on refs that expose `build:runtime`. - # The committed `packages/evm-module/abi/*.json` files are the runtime - # contract surface; CI enforces they stay in sync. - SLOT_A_BUILD_SCRIPT=$(runtime_build_script "$SLOT_A") - info "Building slot a (pnpm run $SLOT_A_BUILD_SCRIPT) ..." - (cd "$SLOT_A" && pnpm run "$SLOT_A_BUILD_SCRIPT") - fi - stage_markitdown "$SLOT_A" "a" - - if slot_ready "$SLOT_B" "$SLOT_B_ENTRY"; then - info "Slot b already exists and is ready." - else - rm -rf "$SLOT_B" - info "Cloning slot b (shared objects with a) ..." - git clone --reference "$SLOT_A" --dissociate --branch "$BRANCH" "$REPO_URL" "$SLOT_B" - info "Installing dependencies in slot b ..." - (cd "$SLOT_B" && pnpm install --frozen-lockfile) - SLOT_B_BUILD_SCRIPT=$(runtime_build_script "$SLOT_B") - info "Building slot b (pnpm run $SLOT_B_BUILD_SCRIPT) ..." - (cd "$SLOT_B" && pnpm run "$SLOT_B_BUILD_SCRIPT") - fi - stage_markitdown "$SLOT_B" "b" - - # Ensure current points to a known-good active slot. - ln -sfn a "$RELEASES_DIR/current" - echo "a" > "$RELEASES_DIR/active" - - green "Slots created: a (active), b (standby)" -fi - -# Create dkg symlink in bin dir -mkdir -p "$BIN_DIR" -DKG_BIN="$BIN_DIR/dkg" -CLI_ENTRY="$RELEASES_DIR/current/packages/cli/dist/cli.js" - -if [ -f "$CLI_ENTRY" ]; then - cat > "$DKG_BIN" <<'WRAPPER' -#!/bin/sh -DKG_HOME="${DKG_HOME:-$HOME/.dkg}" -exec node "$DKG_HOME/releases/current/packages/cli/dist/cli.js" "$@" -WRAPPER - chmod +x "$DKG_BIN" - info "Created $DKG_BIN" -else - info "Warning: $CLI_ENTRY not found. You may need to build first." -fi - -echo "" -green "Installation complete!" -echo "" -echo "Next steps:" -echo " 1. Ensure $BIN_DIR is in your PATH" -echo " 2. Run: dkg init" -echo " 3. 
Run: dkg start" -echo "" diff --git a/network/mainnet-base.json b/network/mainnet-base.json index f380084d1..87cd452dc 100644 --- a/network/mainnet-base.json +++ b/network/mainnet-base.json @@ -1,13 +1,25 @@ { - "_status": "pre-deployment: hubAddress must be set after V10 contracts are deployed on Base", - "chain": { - "name": "base", - "type": "evm", - "chainId": "base:8453", - "rpcUrl": "https://mainnet.base.org" - }, + "networkName": "DKG V10 Mainnet", + "networkId": "7449c543ff04a550b2dafa999fe8ee577a00b212023bb4d4244e8d58a4792c7b", + "genesisVersion": 1, + "relays": [ + "/ip4/178.105.87.39/tcp/9090/p2p/PEER_ID_SOLARIS", + "/ip4/178.105.105.102/tcp/9090/p2p/PEER_ID_LUNARIS", + "/ip4/178.156.214.4/tcp/9090/p2p/PEER_ID_ORIONIS", + "/ip4/178.105.111.185/tcp/9090/p2p/PEER_ID_KEPLER" + ], + "defaultContextGraphs": [], + "defaultNodeRole": "edge", "autoUpdate": { "enabled": true, - "branch": "main" + "repo": "OriginTrail/dkg", + "branch": "main", + "checkIntervalMinutes": 5 + }, + "chain": { + "type": "evm", + "rpcUrl": "https://mainnet.base.org", + "hubAddress": "0x99Aa571fD5e681c2D27ee08A7b7989DB02541d13", + "chainId": "base:8453" } } diff --git a/network/mainnet-gnosis.json b/network/mainnet-gnosis.json index dfd8b7dae..18a7623a0 100644 --- a/network/mainnet-gnosis.json +++ b/network/mainnet-gnosis.json @@ -1,13 +1,25 @@ { - "_status": "pre-deployment: hubAddress must be set after V10 contracts are deployed on Gnosis", - "chain": { - "name": "gnosis", - "type": "evm", - "chainId": "gnosis:100", - "rpcUrl": "https://rpc.gnosischain.com" - }, + "networkName": "DKG V10 Mainnet", + "networkId": "7449c543ff04a550b2dafa999fe8ee577a00b212023bb4d4244e8d58a4792c7b", + "genesisVersion": 1, + "relays": [ + "/ip4/178.105.87.39/tcp/9090/p2p/PEER_ID_SOLARIS", + "/ip4/178.105.105.102/tcp/9090/p2p/PEER_ID_LUNARIS", + "/ip4/178.156.214.4/tcp/9090/p2p/PEER_ID_ORIONIS", + "/ip4/178.105.111.185/tcp/9090/p2p/PEER_ID_KEPLER" + ], + "defaultContextGraphs": [], + "defaultNodeRole": 
"edge", "autoUpdate": { "enabled": true, - "branch": "main" + "repo": "OriginTrail/dkg", + "branch": "main", + "checkIntervalMinutes": 5 + }, + "chain": { + "type": "evm", + "rpcUrl": "https://rpc.gnosischain.com", + "hubAddress": "0x882D0BF07F956b1b94BBfe9E77F47c6fc7D4EC8f", + "chainId": "gnosis:100" } } diff --git a/network/mainnet-neuroweb.json b/network/mainnet-neuroweb.json index d6ced7f70..47e7f4c23 100644 --- a/network/mainnet-neuroweb.json +++ b/network/mainnet-neuroweb.json @@ -1,13 +1,25 @@ { - "_status": "pre-deployment: hubAddress must be set after V10 contracts are deployed on NeuroWeb", - "chain": { - "name": "neuroweb", - "type": "evm", - "chainId": "neuroweb:2043", - "rpcUrl": "https://astrosat-parachain-rpc.origin-trail.network" - }, + "networkName": "DKG V10 Mainnet", + "networkId": "7449c543ff04a550b2dafa999fe8ee577a00b212023bb4d4244e8d58a4792c7b", + "genesisVersion": 1, + "relays": [ + "/ip4/178.105.87.39/tcp/9090/p2p/PEER_ID_SOLARIS", + "/ip4/178.105.105.102/tcp/9090/p2p/PEER_ID_LUNARIS", + "/ip4/178.156.214.4/tcp/9090/p2p/PEER_ID_ORIONIS", + "/ip4/178.105.111.185/tcp/9090/p2p/PEER_ID_KEPLER" + ], + "defaultContextGraphs": [], + "defaultNodeRole": "edge", "autoUpdate": { "enabled": true, - "branch": "main" + "repo": "OriginTrail/dkg", + "branch": "main", + "checkIntervalMinutes": 5 + }, + "chain": { + "type": "evm", + "rpcUrl": "https://astrosat-parachain-rpc.origin-trail.network", + "hubAddress": "0x0957e25BD33034948abc28204ddA54b6E1142D6F", + "chainId": "neuroweb:2043" } } diff --git a/package.json b/package.json index 9f2e3418c..0483e65e5 100644 --- a/package.json +++ b/package.json @@ -47,6 +47,7 @@ "test:devnet:v10-e2e": "vitest run --config devnet/v10-end-to-end/vitest.config.ts", "test:devnet:v10-stress": "vitest run --config devnet/v10-stress/vitest.config.ts", "test:devnet:conviction-lazy-settle": "vitest run --config devnet/conviction-lazy-settle/vitest.config.ts", + "test:devnet:edge-update-flow": "vitest run --config 
devnet/edge-update-flow/vitest.config.ts", "test:all": "pnpm test && pnpm test:evm" }, "devDependencies": { @@ -69,17 +70,31 @@ "protobufjs" ], "overrides": { - "axios@<1.13.5": "1.13.5", + "axios@<1.15.2": "1.15.2", "minimatch@<10.2.3": "10.2.3", - "serialize-javascript@<7.0.3": "7.0.3", + "serialize-javascript@>=5.0.0 <7.0.5": "7.0.5", "bn.js@<4.12.3": "4.12.3", "undici@>=7.0.0 <7.18.2": "7.18.2", - "undici@>=5.0.0 <6.0.0": "6.23.0", + "undici@>=5.0.0 <6.24.0": "6.24.0", "cookie@<0.7.0": "0.7.0", "tmp@<0.2.4": "0.2.4", - "hono@<4.12.4": "4.12.4", - "@hono/node-server@<1.19.10": "1.19.10", - "immutable@<4.3.8": "4.3.8" + "hono@>=4.0.0 <4.12.18": "4.12.18", + "@hono/node-server@>=1.0.0 <1.19.13": "1.19.13", + "immutable@<4.3.8": "4.3.8", + "handlebars@>=4.0.0 <4.7.9": "4.7.9", + "fast-uri@>=3.0.0 <3.1.2": "3.1.2", + "express-rate-limit@>=8.0.0 <8.2.2": "8.2.2", + "lodash@>=4.0.0 <4.18.1": "4.18.1", + "lodash-es@>=4.0.0 <4.18.1": "4.18.1", + "path-to-regexp@>=8.0.0 <8.4.0": "8.4.0", + "vite@>=7.0.0 <7.3.2": "7.3.2", + "happy-dom@>=20.0.0 <20.8.9": "20.8.9", + "brace-expansion@>=5.0.0 <5.0.6": "5.0.6", + "follow-redirects@>=1.0.0 <1.16.0": "1.16.0", + "ip-address@>=10.0.0 <10.1.1": "10.1.1", + "picomatch@>=4.0.0 <4.0.4": "4.0.4", + "qs@>=6.0.0 <6.15.2": "6.15.2", + "ws@>=8.0.0 <8.20.1": "8.20.1" }, "patchedDependencies": { "hardhat@2.28.6": "patches/hardhat@2.28.6.patch" diff --git a/packages/agent/ontology/dkgskill.ttl b/packages/agent/ontology/dkgskill.ttl index 672974968..629cbb86b 100644 --- a/packages/agent/ontology/dkgskill.ttl +++ b/packages/agent/ontology/dkgskill.ttl @@ -176,4 +176,4 @@ dkgskill:contextGraphsServed a owl:DatatypeProperty ; rdfs:domain dkgskill:HostingProfile ; rdfs:range xsd:string ; rdfs:label "contextGraphs served" ; - rdfs:comment "Comma-separated contextGraph IDs this node serves." . + rdfs:comment "Context graph ID this node serves. Multi-valued: one triple per CG." . 
/**
 * Look up the agent-registry profile whose `dkg:peerId` literal equals
 * `peerId`.
 *
 * Two queries are issued against the agent-registry context graph:
 *   1. a single-row SELECT for the scalar columns (name, framework,
 *      nodeRole, relayAddress, agentAddress, lastSeen);
 *   2. a SELECT that collects every `dkg:multiaddr` value of the subject
 *      found by the first query.
 * Keeping the multi-valued column in its own query avoids a GROUP_CONCAT
 * whose ordering / separator behaviour is engine-specific.
 *
 * @param peerId - peer id to match; escaped before interpolation.
 * @returns the discovered profile, or `null` when nothing matches or the
 *   matched subject fails the safe-IRI check.
 */
async findAgentByPeerId(peerId: string): Promise<DiscoveredAgent | null> {
  // `FILTER(isIRI(?agent))` drops blank-node subjects at the engine layer,
  // so only IRI bindings reach the code below.
  const profileQuery = `
      SELECT ?agent ?name ?framework ?nodeRole ?relayAddress ?agentAddress ?lastSeen WHERE {
        ?agent a <${DKG}Agent> ;
               <${SCHEMA}name> ?name ;
               <${DKG}peerId> "${escapeSparqlLiteral(peerId)}" .
        FILTER(isIRI(?agent))
        OPTIONAL { ?agent <${SKILL}framework> ?framework }
        OPTIONAL { ?agent <${DKG}nodeRole> ?nodeRole }
        OPTIONAL { ?agent <${DKG}relayAddress> ?relayAddress }
        OPTIONAL { ?agent <${DKG}agentAddress> ?agentAddress }
        OPTIONAL { ?agent <${DKG}lastSeen> ?lastSeen }
      }
      LIMIT 1
    `;
  const profileResult = await this.engine.query(profileQuery, { contextGraphId: AGENT_REGISTRY_CONTEXT_GRAPH });
  if (profileResult.bindings.length === 0) return null;
  const profile = profileResult.bindings[0];

  // Hardening fence: an IRI that passed `isIRI` may still carry a character
  // that breaks `<...>` interpolation. Treat such an entry as not-found
  // instead of returning a partial profile that downstream consumers might
  // re-interpolate into their own queries.
  let subjectIri: string;
  try {
    subjectIri = assertSafeIri(profile['agent']);
  } catch {
    return null;
  }

  const addressQuery = `
      SELECT ?multiaddr WHERE {
        ${sparqlIri(subjectIri)} <${DKG}multiaddr> ?multiaddr .
      }
    `;
  const addressResult = await this.engine.query(addressQuery, { contextGraphId: AGENT_REGISTRY_CONTEXT_GRAPH });

  // Keep only bound, non-empty multiaddr literals, in result order.
  const dialable: string[] = [];
  for (const binding of addressResult.bindings) {
    const raw = binding['multiaddr'];
    if (!raw) continue;
    const unquoted = stripQuotes(raw);
    if (unquoted.length > 0) dialable.push(unquoted);
  }

  // Unbound OPTIONAL columns surface as `undefined` on the profile.
  const optionalLiteral = (raw: string | undefined): string | undefined =>
    raw ? stripQuotes(raw) : undefined;

  return {
    agentUri: subjectIri,
    name: stripQuotes(profile['name']),
    peerId,
    framework: optionalLiteral(profile['framework']),
    nodeRole: optionalLiteral(profile['nodeRole']),
    relayAddress: optionalLiteral(profile['relayAddress']),
    // `agentAddress` mirrors the scalar surface of `findAgents()` so both
    // discovery entry points resolve the same identity for the same peer.
    agentAddress: optionalLiteral(profile['agentAddress']),
    multiaddrs: dialable.length > 0 ? dialable : undefined,
    lastSeen: optionalLiteral(profile['lastSeen']),
  };
}
*/ export const MESSAGE_OUTBOX_TICK_MS = 30_000; + +/** + * Cadence at which a daemon re-publishes its own agent profile to + * the `agents` Context Graph (PR feat/chain-agents-cg-phonebook). + * + * Each heartbeat refreshes the profile's `dkg:multiaddr` triples + * (current dialable addrs) and `dkg:lastSeen` timestamp, so other + * peers querying agents-CG see fresh phonebook entries even when + * direct connections haven't been exchanged recently. Mirrors the + * `beaconReannounceTimer` (5 min) cadence and the relay reservation + * lifecycle (~30 min default duration limit), so we publish at least + * a few times per reservation epoch. + * + * Tuning: lower for chatty small networks (more responsive but more + * gossip volume), higher for large meshes (less volume; slower + * propagation of stale entries). Operators override via + * `config.network.agentProfileHeartbeatMs`. Set to `0` to disable + * (the one-shot startup publish still fires). + */ +export const AGENT_PROFILE_HEARTBEAT_MS = 5 * 60 * 1000; + +/** + * Staleness threshold for an agents-CG profile read during dial + * fallback. If `dkg:lastSeen` is older than this, the profile's + * `dkg:multiaddr` triples are ignored (the relay address is still + * tried — it's the safer minimum). 24h matches the existing peer- + * inactivity assumption built into the soak data. 
+ */ +export const AGENT_PROFILE_STALE_THRESHOLD_MS = 24 * 60 * 60 * 1000; diff --git a/packages/agent/src/dkg-agent-types.ts b/packages/agent/src/dkg-agent-types.ts index 71f44c73f..b5d0e1c8d 100644 --- a/packages/agent/src/dkg-agent-types.ts +++ b/packages/agent/src/dkg-agent-types.ts @@ -33,7 +33,7 @@ import type { LiftAuthorityProof, SharedMemoryPublicSnapshotStorageConfig, } from '@origintrail-official/dkg-publisher'; -import type { ChainAdapter } from '@origintrail-official/dkg-chain'; +import type { ApprovalPolicy, ChainAdapter } from '@origintrail-official/dkg-chain'; import type { QueryAccessConfig } from '@origintrail-official/dkg-query'; import type { SkillHandler } from './messaging.js'; import type { CclFactResolutionMode } from './ccl-fact-resolution.js'; @@ -88,6 +88,37 @@ export type LocalSwmSenderKeyReceiveState = { skippedChainKeys: Map; }; +/** + * A SWM sender-key package that landed in the "no advertised peerId" + * branch of `createAndDistributeSwmSenderKeyEpoch` and is held for + * delivery once we learn a peerId for the recipient agent (via + * connection:open or a subsequent publish that re-resolves the + * recipient set). + * + * Keyed in-memory by lowercased `recipientAgentAddress`. The triple + * `(senderAgentAddress, recipientKeyId, epochId)` dedupes within an + * agent's queue; newer epochs supersede older ones for the same + * `(senderAgentAddress, recipientAgentAddress)` pair. + */ +export type PendingSenderKeyEntry = { + /** Lower-cased EIP-55 sender agent address. */ + senderAgentAddress: string; + /** Lower-cased EIP-55 recipient agent address (matches the map key). */ + recipientAgentAddress: string; + recipientKeyId: string; + epochId: string; + contextGraphId: string; + subGraphName?: string; + /** + * Canonical encoded `SwmSenderKeyPackageMsg` wire bytes — exactly + * what gets passed to `messenger.sendReliable(peerId, PROTOCOL_SWM_ + * SENDER_KEY, ...)` when the recipient becomes reachable. 
+ */ + packageBytes: Uint8Array; + /** Wall-clock when the row was enqueued; used for diagnostics + future TTL. */ + createdAtMs: number; +}; + export type RandomSamplingStartResult = 'started' | 'retryable' | 'disabled'; export type ACKSignerResolution = { @@ -646,6 +677,25 @@ export interface DKGAgentConfig { * `getPeerDiagnostics()`. */ nodeVersion?: string; + /** + * libp2p networking tunables for small / sparse networks. All three + * fields are optional and forwarded straight into the matching + * `DKGNodeConfig` slots. Omitting any field preserves the upstream + * default. See `packages/core/src/types.ts` for per-field semantics + * and the operator-facing surface in `packages/cli/src/config.ts` + * (`network` block). + */ + peerStoreMaxAddressAgeMs?: number; + peerStoreMaxPeerAgeMs?: number; + dhtQuerySelfIntervalMs?: number; + /** + * Cadence at which the daemon re-publishes its own agent profile + * (PR feat/chain-agents-cg-phonebook). Forwarded straight from + * `DkgConfig.network.agentProfileHeartbeatMs`. Defaults to + * `AGENT_PROFILE_HEARTBEAT_MS` (5 min) when omitted; `0` disables + * the timer (the one-shot startup publish still fires). + */ + agentProfileHeartbeatMs?: number; /** * Path to the V10 Random Sampling prover write-ahead log. Core * nodes only; ignored on edge. When omitted, an in-memory WAL is @@ -688,10 +738,18 @@ export interface DKGAgentConfig { */ chainConfig?: { rpcUrl: string; + rpcUrls?: string[]; hubAddress: string; adminPrivateKey?: string; operationalKeys: string[]; chainId?: string; + /** + * Optional V10 allowance-sizing policy. Threaded straight through to + * the `EVMChainAdapter`; see `ApprovalPolicy` in + * `@origintrail-official/dkg-chain`. Omit to inherit the default + * (`'per-publish'`, bounded-per-publish with on-chain 1n floor). + */ + approvalPolicy?: ApprovalPolicy; }; /** Cross-agent query access configuration. 
*/ queryAccess?: QueryAccessConfig; diff --git a/packages/agent/src/dkg-agent.ts b/packages/agent/src/dkg-agent.ts index 5538e4c1c..cf207fe01 100644 --- a/packages/agent/src/dkg-agent.ts +++ b/packages/agent/src/dkg-agent.ts @@ -1,7 +1,7 @@ import { DKGNode, ProtocolRouter, GossipSubManager, TypedEventBus, DKGEvent, LibP2PNetwork, PeerResolver, StubNetworkStateRegistry, - PROTOCOL_ACCESS, PROTOCOL_PUBLISH, PROTOCOL_SYNC, PROTOCOL_QUERY_REMOTE, PROTOCOL_STORAGE_ACK, PROTOCOL_VERIFY_PROPOSAL, PROTOCOL_JOIN_REQUEST, + PROTOCOL_ACCESS, PROTOCOL_PUBLISH, PROTOCOL_SYNC, PROTOCOL_QUERY_REMOTE, PROTOCOL_STORAGE_ACK, PROTOCOL_STORAGE_ACK_V2, PROTOCOL_GET_CIPHERTEXT_CHUNK, PROTOCOL_VERIFY_PROPOSAL, PROTOCOL_JOIN_REQUEST, PROTOCOL_SWM_SENDER_KEY, PROTOCOL_SWM_UPDATE, PROTOCOL_SWM_SHARE_ACK, PROTOCOL_SWM_HOST_CATCHUP, PROTOCOL_MESSAGE, contextGraphPublishTopic, contextGraphWorkspaceTopic, contextGraphAppTopic, contextGraphUpdateTopic, contextGraphFinalizationTopic, contextGraphDataGraphUri, contextGraphMetaGraphUri, contextGraphWorkspaceGraphUri, contextGraphWorkspaceMetaGraphUri, @@ -69,8 +69,16 @@ import { type ProtocolOutboxStore, type ProtocolOutboxEntry, encryptV10PublishPayload, + encryptChunked, + buildCiphertextChunksRoot, + computeGossipSigningPayloadV2, + GOSSIP_TYPE_WORKSPACE_PUBLISH_CHUNKED, + ciphertextChunkStoreGraph, + ciphertextChunkStoreSubject, + CIPHERTEXT_CHUNK_PREDICATE, type SubscriptionSource, SUBSCRIPTION_SOURCES, + pickNetworkTunables, } from '@origintrail-official/dkg-core'; import { GraphManager, PrivateContentStore, createTripleStore, type TripleStore, type TripleStoreConfig, type Quad, type LargeLiteralStorageConfig } from '@origintrail-official/dkg-storage'; import { EVMChainAdapter, NoChainAdapter, enrichEvmError, type EVMAdapterConfig, type ChainAdapter, type CreateContextGraphParams, type CreateOnChainContextGraphParams, type CreateOnChainContextGraphResult, type TxResult, type V10PublishingConvictionAccountInfo } from 
'@origintrail-official/dkg-chain'; @@ -113,7 +121,7 @@ import { ProfileManager } from './profile-manager.js'; import { DiscoveryClient, type SkillSearchOptions, type DiscoveredAgent, type DiscoveredOffering } from './discovery.js'; import { MessageHandler, type SkillHandler, type SkillRequest, type SkillResponse, type ChatHandler, type ChatAclCheck } from './messaging.js'; import { ed25519ToX25519Private, ed25519ToX25519Public } from './encryption.js'; -import { AGENT_REGISTRY_CONTEXT_GRAPH, canonicalAgentDidSubject, type AgentProfileConfig } from './profile.js'; +import { AGENT_REGISTRY_CONTEXT_GRAPH, canonicalAgentDidSubject, collectPublishableMultiaddrs, type AgentProfileConfig } from './profile.js'; import { signAgentDelegation, verifyAgentDelegation, @@ -167,6 +175,18 @@ import { mintSignedCatchupRequest, verifySignedCatchupRequest, } from './swm/host-catchup-sign.js'; +import { + createCiphertextChunkCatchupReplayGuard, + decodeCiphertextChunkCatchupRequest, + encodeCiphertextChunkCatchupRequest, + encodeCiphertextChunkCatchupResponse, + decodeCiphertextChunkCatchupResponse, + mintSignedCiphertextChunkCatchupRequest, + verifySignedCiphertextChunkCatchupRequest, + CIPHERTEXT_CHUNK_CATCHUP_WIRE_VERSION, + type CiphertextChunkCatchupRequest, + type CiphertextChunkCatchupResponse, +} from './swm/ciphertext-chunk-catchup.js'; import { waitForPeerProtocol } from './p2p/protocol-readiness.js'; import { orderCatchupPeers } from './p2p/peer-selection.js'; import { fetchSyncPages, type SyncPageResult } from './sync/requester/page-fetch.js'; @@ -249,6 +269,8 @@ import { STORAGE_ACK_REGISTRATION_RETRY_MS, JOIN_APPROVAL_RETRY_TICK_MS, MESSAGE_OUTBOX_TICK_MS, + AGENT_PROFILE_HEARTBEAT_MS, + AGENT_PROFILE_STALE_THRESHOLD_MS, } from './dkg-agent-constants.js'; import { ContextGraphNotFoundError, @@ -258,6 +280,7 @@ import { type PreSignedAuthorAttestation, type LocalSwmSenderKeySendState, type LocalSwmSenderKeyReceiveState, + type PendingSenderKeyEntry, type 
/**
 * OT-RFC-38 LU-11. Target ciphertext-chunk size on the SWM gossip
 * wire. 32 KiB stays well under libp2p's per-message ceiling (the
 * mesh defaults to 1 MiB) so chunks rarely fragment at the transport
 * layer, and produces a tree shallow enough that on-chain proof
 * verification per RFC-39 sampling tick stays cheap. The last chunk
 * is whatever fraction remains.
 */
const CIPHERTEXT_CHUNK_SIZE_BYTES = 32 * 1024;

/**
 * OT-RFC-38 LU-11. Split a single plaintext buffer into the
 * fixed-size pieces the chunked AEAD path expects.
 *
 * Every returned chunk is a `subarray` view over `plaintext` (no bytes are
 * copied), so mutating the input afterwards also changes the chunks.
 *
 * @param plaintext - buffer to split; must be non-empty.
 * @param chunkSizeBytes - size of every chunk except possibly the last.
 *   Defaults to {@link CIPHERTEXT_CHUNK_SIZE_BYTES}; the override exists so
 *   callers and tests can exercise multi-chunk inputs without 32 KiB buffers.
 * @returns the chunks in order; the last one holds the remainder.
 * @throws Error when `plaintext` is empty — the publisher computes
 *   `merkleRoot` from non-empty `kaCount` quads, so an empty plaintext
 *   upstream is always a bug.
 * @throws RangeError when `chunkSizeBytes` is not a positive safe integer.
 */
function sliceIntoCiphertextChunks(
  plaintext: Uint8Array,
  chunkSizeBytes: number = CIPHERTEXT_CHUNK_SIZE_BYTES,
): Uint8Array[] {
  if (plaintext.length === 0) {
    throw new Error('LU-11: sliceIntoCiphertextChunks rejects empty plaintext');
  }
  // A zero, negative, fractional or NaN step would never advance the offset
  // correctly (or at all), so reject it before entering the loop.
  if (!Number.isSafeInteger(chunkSizeBytes) || chunkSizeBytes <= 0) {
    throw new RangeError(
      `LU-11: sliceIntoCiphertextChunks requires a positive integer chunk size (got ${chunkSizeBytes})`,
    );
  }
  const chunks: Uint8Array[] = [];
  for (let off = 0; off < plaintext.length; off += chunkSizeBytes) {
    chunks.push(plaintext.subarray(off, Math.min(off + chunkSizeBytes, plaintext.length)));
  }
  return chunks;
}
+ */ + private readonly ciphertextChunkCatchupReplayGuard = createCiphertextChunkCatchupReplayGuard(); /** * OT-RFC-38 / LU-6 Phase B — periodic beacon re-announce timer * (curators only). See {@link beaconRegistry} jsdoc. */ private beaconReannounceTimer?: ReturnType; + /** + * PR feat/chain-agents-cg-phonebook — periodic agent-profile + * heartbeat. Re-publishes the profile to the `agents` Context + * Graph on `AGENT_PROFILE_HEARTBEAT_MS` cadence (operator + * override via `config.network.agentProfileHeartbeatMs`; `0` + * disables). Undefined until {@link start} runs and `null` after + * {@link stop} clears it. + */ + private agentProfileHeartbeatTimer?: ReturnType; + /** + * Heartbeat-tick coalescing flag. When a heartbeat is already + * in flight, the next tick logs + skips instead of queueing — this + * keeps the queue depth at 1 even if publish latency exceeds the + * heartbeat cadence (slow chain RPC, congested gossip mesh). + * + * NOT a correctness gate against concurrent `publishProfile()` + * callers — startup, key-rotation, and revocation also call + * `publishProfile()` directly, and they bypass this flag. The + * correctness gate is the `publishProfileTail` mutex below. + */ + private agentProfileHeartbeatInFlight = false; + /** + * Serialization mutex for `publishProfile()`. Tail-promise chain: + * each new caller `await`s the prior call (success or failure) and + * only then runs its own publish. Codex review of PR #700 round 2 + * flagged that the heartbeat-only inFlight guard left a race + * window between the heartbeat tick and the existing `startup` / + * key-rotation / revocation callers, both of which mutate + * `ProfileManager.currentKcId` and rewrite the registry triples + * on every call. Serializing inside `publishProfile()` covers + * every entry point at the lowest level instead of duplicating + * the guard at every caller. 
+ */ + private publishProfileTail: Promise = Promise.resolve(); /** * OT-RFC-38 / LU-6 Phase B — sliding-window rate-limiter applied * to pre-registration (beacon-discovered) ciphertext writes. @@ -914,6 +1005,27 @@ export class DKGAgent { private readonly swmSenderKeySendStates = new Map(); private readonly swmSenderKeyReceiveStates = new Map(); private swmSenderKeyStateLoaded = false; + /** + * PR-2 (SWM-fanout plan): pending sender-key package fanouts that + * landed in the "no advertised peerId" branch of + * `createAndDistributeSwmSenderKeyEpoch`. Keyed by lowercased + * `recipientAgentAddress` so the connection:open listener can drain + * by agent identity (the only handle we have when we minted the row + * without a peerId). Per-key triple `(senderAgentAddress, + * recipientKeyId, epochId)` deduplicates within an agent. + * + * In-memory only for now. Older epochs are evicted whenever a newer + * epoch is enqueued for the same `(sender, recipient)` pair — the + * supersession matches what the membership-hash flow already implies + * sender-side, and avoids the "queued package for epoch N replays + * after we've rolled to N+1" footgun. + * + * A future PR will plumb a SQLite-backed store through + * `config.swmSenderKeyStores?.pendingByAgent` so durability survives + * daemon restart. Today, restart loses pending rows and the next + * publish re-enqueues if the same member still has no peerId. 
+ */ + private readonly pendingSenderKeyByAgent = new Map(); private constructor( config: DKGAgentConfig, @@ -990,10 +1102,12 @@ export class DKGAgent { } else if (config.chainConfig && opKeys?.length) { const evmConfigBase = { rpcUrl: config.chainConfig.rpcUrl, + rpcUrls: config.chainConfig.rpcUrls, privateKey: opKeys[0], additionalKeys: opKeys.slice(1), hubAddress: config.chainConfig.hubAddress, chainId: config.chainConfig.chainId, + approvalPolicy: config.chainConfig.approvalPolicy, }; if (config.chainConfig.adminPrivateKey) { chain = new EVMChainAdapter({ ...evmConfigBase, adminPrivateKey: config.chainConfig.adminPrivateKey }); @@ -1023,6 +1137,7 @@ export class DKGAgent { relayServerCapacity: config.relayServerCapacity, relayReservationCount: config.relayReservationCount, nodeVersion: config.nodeVersion, + ...pickNetworkTunables(config), }; const node = new DKGNode(nodeConfig); @@ -1192,6 +1307,25 @@ export class DKGAgent { } const network = new LibP2PNetwork(this.node); + // Local helper: race a lookup against an optional AbortSignal so + // an in-flight SPARQL query honours the resolver's outer deadline. + // Codex PR #499 round 5 race: re-check signal.aborted INSIDE the + // listener-attach Promise so we don't lose the one-shot 'abort' + // event between the early gate and addEventListener. + const raceAgainstAbort = (lookup: Promise, signal: AbortSignal | undefined): Promise => { + if (!signal) return lookup; + return Promise.race([ + lookup, + new Promise((resolve) => { + if (signal.aborted) { + resolve(null); + return; + } + signal.addEventListener('abort', () => resolve(null), { once: true }); + }), + ]); + }; + const peerResolver = new PeerResolver({ network, registry: new StubNetworkStateRegistry(), @@ -1224,35 +1358,40 @@ export class DKGAgent { if (opts?.signal?.aborted) return null; const lookup = this.discovery.findAgentByPeerId(peerId) .then((agent) => agent?.relayAddress ?? 
null); - const signal = opts?.signal; - if (!signal) return lookup; - return Promise.race([ - lookup, - new Promise((resolve) => { - // Codex PR #499 round 5 (dkg-agent.ts:1354): the early - // `signal.aborted` check above and `addEventListener` - // are not atomic — the signal could fire in between, and - // since `abort` is a one-shot event, our late listener - // would never see it and this Promise would hang for the - // full lookup duration. Re-check INSIDE the constructor - // before subscribing so the abort branch resolves - // immediately if we lost that race. - if (signal.aborted) { - resolve(null); - return; - } - signal.addEventListener( - 'abort', - () => resolve(null), - { once: true }, - ); - }), - ]); + return raceAgainstAbort(lookup, opts?.signal); + }, + // PR feat/chain-agents-cg-phonebook: richer lookup that + // returns direct multiaddrs + relayAddress + lastSeen so the + // resolver can prime the peerStore with current dialable + // addrs and filter by freshness. The resolver falls through + // to `findRelayForPeer` if this returns null. + findAgentDialAddresses: async (peerId, opts) => { + if (opts?.signal?.aborted) return null; + const lookup = this.discovery.findAgentByPeerId(peerId) + .then((agent) => { + if (!agent) return null; + const lastSeenMs = agent.lastSeen ? Date.parse(agent.lastSeen) : undefined; + return { + multiaddrs: agent.multiaddrs ?? [], + relayAddress: agent.relayAddress, + lastSeenMs: Number.isFinite(lastSeenMs) ? lastSeenMs : undefined, + }; + }); + return raceAgainstAbort(lookup, opts?.signal); }, }, + agentDirectoryStaleThresholdMs: AGENT_PROFILE_STALE_THRESHOLD_MS, // Bootstrap is a libp2p-startup concern (`bootstrap({ list })` in // peerDiscovery, see node.ts) — not a per-peer resolution concern. // Removed here per Codex review feedback on PR #496. + // + // Note: `defaultPerStepTimeoutMs` is intentionally NOT wired from + // operator config. 
Production callers (`connectToPeerId`, chat / + // routed sends) always pass an explicit `perStepTimeoutMs` + // derived from their own deadline budget, so any constructor + // default would be a silent no-op for those paths. The + // constructor option survives as a test-fixture surface. + // Codex review of PR #698 round 2 caught this. }); this.peerResolver = peerResolver; this.router = new ProtocolRouter(this.node, { peerResolver }); @@ -1275,10 +1414,19 @@ export class DKGAgent { router: this.router, idempotencyStore, outboxStore, + // PR feat/chain-agents-cg-phonebook: stall-recovery now routes + // through the full PeerResolver instead of raw DHT findPeer. + // The dial fast-path (ProtocolRouter) already prefers + // PeerResolver.resolve() on every attempt, but the outbox + // stall-walk (`messenger.maybeScheduleDhtWalk`) was hardcoded + // to a DHT-only path — so an entry that timed out 5x because + // its addresses were stale couldn't recover by consulting + // agents-CG. Routing through PeerResolver picks up the + // phonebook fallback automatically; the raw findPeer call + // remains the step-2 DHT lookup inside resolve(), so we don't + // lose any pre-existing recovery path. resolvePeer: async (peerId, { signal }) => { - const { peerIdFromString } = await import('@libp2p/peer-id'); - const pid = peerIdFromString(peerId); - await this.node.libp2p.peerRouting.findPeer(pid, { signal }); + await peerResolver.resolve(peerId, { signal }).catch(() => undefined); }, }); this.gossip = new GossipSubManager(this.node, this.eventBus); @@ -1415,6 +1563,14 @@ export class DKGAgent { // envelope versioning, idempotency cache, and `/api/slo` stats. this.messenger.register(PROTOCOL_SWM_HOST_CATCHUP, (data, fromPeerId) => this.handleSwmHostCatchup(data, fromPeerId)); + // OT-RFC-38 LU-11 / OT-RFC-39: per-chunk ciphertext sync verb. 
+ // Symmetric to PROTOCOL_SWM_HOST_CATCHUP but pulls one + // (cgId, batchId, chunkIndex) ciphertext at a time from the + // triple-store-backed chunk store the V2 ACK verifier reads + // against. Registered unconditionally — the handler itself + // gates by node role + per-CG authorization. + this.messenger.register(PROTOCOL_GET_CIPHERTEXT_CHUNK, (data, fromPeerId) => this.handleGetCiphertextChunk(data, fromPeerId)); + const effectiveRole = this.config.nodeRole ?? 'edge'; const ackSignerCandidates = this.getACKSignerCandidateWallets(ctx); let onChainIdentityId = 0n; @@ -1546,6 +1702,16 @@ export class DKGAgent { contextGraphSharedMemoryUri, chainId: chainIdForHandler, kav10Address: kav10AddressForHandler, + // Codex review (round 2) on PR #727: must NOT collapse to a + // plain `gossipWireIdFor` because `PublishIntent.swmGraphId` + // may be absent on a chunked V2 intent (the handler then + // falls back to the numeric `cgId`). Pass through + // `canonicalChunkStoreCgIdOrNull` so numeric ids resolve via + // the local on-chain map, and unknown shapes return null → + // handler widens to wildcard `GRAPH ?g` instead of pinning + // to a fabricated keccak-of-decimal-string. + normalizeContextGraphIdForChunkStore: (rawCgId: string) => + this.canonicalChunkStoreCgIdOrNull(rawCgId), // Codex PR #608: independently verify the publisher's // `isEncryptedPayload=true` claim against this node's // local view of the CG. `isPrivateContextGraph()` is the @@ -1638,6 +1804,7 @@ export class DKGAgent { // router.register under the hood (see Messenger.register // implementation), so router.unregister still removes it. 
this.router.unregister(PROTOCOL_STORAGE_ACK); + this.router.unregister(PROTOCOL_STORAGE_ACK_V2); this.log.warn( attemptCtx, `Unregistered V10 StorageACK handler: signer ${ackSignerWallet.address} ` + @@ -1714,6 +1881,20 @@ export class DKGAgent { const peerId = { toString: () => peerIdStr, toBytes: () => new Uint8Array() }; return ackHandler.handler(data, peerId); }); + // OT-RFC-38 LU-11 / OT-RFC-39 — V2 protocol id. Same + // handler instance, distinct libp2p protocol. Publishers + // running the chunked emit path negotiate V2 explicitly + // so pre-LU-11 cores (V1-only) never see a V2 envelope; + // the handler dispatches on `intent.ackProtocolVersion` + // internally — V2 envelopes hit the chunked verify + // branch, V1 envelopes (if any ever arrive on the V2 + // protocol id, which spec-conforming clients won't send) + // fall through to the legacy single-blob / public-CG + // paths. + this.messenger.register(PROTOCOL_STORAGE_ACK_V2, async (data, peerIdStr) => { + const peerId = { toString: () => peerIdStr, toBytes: () => new Uint8Array() }; + return ackHandler.handler(data, peerId); + }); storageACKProtocolRegistered = true; this.clearStorageACKRegistrationRetry(); this.log.info( @@ -1951,6 +2132,34 @@ export class DKGAgent { this.beaconReannounceTimer.unref(); } + // PR feat/chain-agents-cg-phonebook: schedule the periodic + // profile heartbeat alongside the beacon timer. The one-shot + // startup publish happens in `lifecycle.ts` (setTimeout 0); this + // timer is the steady-state refresh that keeps `dkg:multiaddr` + + // `dkg:lastSeen` fresh for peers' dial fallback. Default 5 min; + // operator-tunable; `0` disables. + const heartbeatMs = this.config.agentProfileHeartbeatMs ?? 
AGENT_PROFILE_HEARTBEAT_MS; + if (Number.isFinite(heartbeatMs) && Number.isInteger(heartbeatMs) && heartbeatMs > 0) { + this.agentProfileHeartbeatTimer = setInterval(() => { + if (this.agentProfileHeartbeatInFlight) { + this.log.debug?.(ctx, 'Agent profile heartbeat skipped: previous publish still in flight'); + return; + } + this.agentProfileHeartbeatInFlight = true; + this.publishProfile() + .catch((err) => { + const msg = err instanceof Error ? err.message : String(err); + this.log.warn(ctx, `Agent profile heartbeat publish failed: ${msg}`); + }) + .finally(() => { + this.agentProfileHeartbeatInFlight = false; + }); + }, heartbeatMs); + if (typeof this.agentProfileHeartbeatTimer.unref === 'function') { + this.agentProfileHeartbeatTimer.unref(); + } + } + // Set up messaging const x25519Priv = ed25519ToX25519Private(this.wallet.keypair.secretKey); this.messageHandler = new MessageHandler( @@ -2397,6 +2606,19 @@ export class DKGAgent { const message = err instanceof Error ? err.message : String(err); this.log.warn(ctx, `Opportunistic Messenger-outbox retry on connect failed for ${remotePeer}: ${message}`); } + // PR-2 (SWM-fanout plan): drain pending sender-key packages + // that were queued because the recipient had no advertised + // peerId at publish time. Tolerant of profile-lookup failure + // (the next connection:open will retry). + try { + const drained = await this.drainPendingSenderKeyForPeer(remotePeer); + if (drained > 0) { + this.log.info(ctx, `Drained ${drained} pending SWM sender-key package(s) for ${remotePeer}`); + } + } catch (err: unknown) { + const message = err instanceof Error ? err.message : String(err); + this.log.warn(ctx, `Pending SWM sender-key drain on connect failed for ${remotePeer}: ${message}`); + } })(); const now = Date.now(); @@ -2634,6 +2856,34 @@ export class DKGAgent { useWorkerThread: this.config.randomSamplingUseWorkerThread ?? 
true, tickIntervalMs: this.config.randomSamplingTickIntervalMs, log: this.randomSamplingLogger(ctx), + // OT-RFC-39 late-join sync — gives the prover an escape hatch + // when its tick fires on a curated KC whose ciphertext chunks + // never reached this core's local triple store (typically: the + // core was offline during the curator's publish, or joined the + // CG after the gossip envelopes rolled off the mesh). The hook + // pulls the missing chunks from authorized peers on demand via + // `PROTOCOL_GET_CIPHERTEXT_CHUNK` and persists them, after + // which the prover retries the extract exactly once. See + // `buildCiphertextChunkBackfill` for the discovery + fetch + // policy. + ciphertextChunkBackfill: this.buildCiphertextChunkBackfill(ctx), + // Codex review on PR #715 — let the prover's extractor pin + // the per-CG named graph instead of scanning `GRAPH ?g`. We + // chain `resolveLocalCgIdByOnChainId` (numeric → cleartext) + // then `gossipWireIdFor` (cleartext → curator nameHash, the + // wire form), matching what `ingestSwmCiphertextChunkEnvelope` + // and the V2 ACK loadChunk persist/look up under. Returns + // null when the local node doesn't have the CG metadata yet + // (chain replay still catching up); the extractor falls back + // to wildcard scanning for that tick, identical to pre-fix + // behaviour, so a missing local map degrades to "no + // cross-CG collision guard for this tick" rather than + // "extract fails outright". 
+ canonicalCgIdForChunkStore: (cgId: bigint): string | null => { + const local = this.resolveLocalCgIdByOnChainId(cgId); + if (local === null) return null; + return this.gossipWireIdFor(local); + }, }); if (this.randomSamplingHandle && this.randomSamplingHandle !== handle) { try { await this.randomSamplingHandle.stop(); } catch { /* swallow bind replacement cleanup */ } @@ -3850,6 +4100,24 @@ export class DKGAgent { } async publishProfile(): Promise { + // Tail-chain serialization: every caller waits for the prior + // `publishProfile()` to settle (success or failure) before + // running its own publish. Prevents the startup / heartbeat / + // key-rotation / revocation paths from racing each other on + // `ProfileManager.currentKcId` and the registry triples. + // Codex review of PR #700 round 2. + const run = this.publishProfileTail + .catch(() => { + // swallow prior errors so a transient publish failure does + // not poison every subsequent publish for the lifetime of + // the agent + }) + .then(() => this.publishProfileImpl()); + this.publishProfileTail = run; + return run; + } + + private async publishProfileImpl(): Promise { const pubKeyBase64 = Buffer.from(this.wallet.keypair.publicKey).toString('base64'); const relayAddrs = this.config.relayPeers; const defaultAgent = this.defaultAgentAddress ? this.localAgents.get(this.defaultAgentAddress) : undefined; @@ -3892,6 +4160,8 @@ export class DKGAgent { publicKey: pubKeyBase64, relayAddress: relayAddrs?.[0], agentAddress: this.defaultAgentAddress, + multiaddrs: collectPublishableMultiaddrs(this.node.multiaddrs), + lastSeen: new Date().toISOString(), encryptionKeys: defaultAgent?.workspaceEncryptionKeys.map((k) => ({ encryptionKeyAlgorithm: k.encryptionKeyAlgorithm, publicEncryptionKey: k.publicEncryptionKey, @@ -5610,93 +5880,162 @@ export class DKGAgent { // (the recipient daemon has no matching local privkey for them) — that's // expected, not a hard error. 
We only abort when EVERY key for a given // agent failed. - const failuresByAgent = new Map(); - const successByAgent = new Set(); - const recordFailure = (agent: string, keyId: string, err: unknown) => { - const msg = err instanceof Error ? err.message : String(err); - const key = agent.toLowerCase(); - const list = failuresByAgent.get(key) ?? []; - list.push(`${keyId}: ${msg}`); - failuresByAgent.set(key, list); - }; - - for (const recipient of input.recipients) { - const recipientAgentAddress = ethers.getAddress(recipient.agentAddress); - const pkg = await this.createSignedSwmSenderKeyPackage({ - state, - recipient, - senderPrivateKey: input.sender.privateKey, - }); + // + // Fanout runs in parallel via Promise.allSettled. The pre-rc.12 loop + // awaited each `messenger.sendReliable` sequentially, so foreground + // publish latency scaled as `O(n_recipients × n_keys × send_timeout)` — + // a single offline member paid the full per-send timeout before the + // loop advanced. Concurrent fanout keeps the wall-clock cost bounded + // by the slowest individual send (~`DEFAULT_SEND_TIMEOUT_MS`). + // + // Concurrent mutation is moot: each per-recipient async closure runs + // on the single JS event loop and yields only at `await` points; the + // aggregation maps are appended to ONLY in the post-settle pass below. 
+ type PerRecipientOutcome = + | { kind: 'success'; agentAddress: string } + | { kind: 'failure'; agentAddress: string; keyId: string; error: Error }; + + const settled = await Promise.allSettled( + input.recipients.map(async (recipient): Promise => { + const recipientAgentAddress = ethers.getAddress(recipient.agentAddress); + const pkg = await this.createSignedSwmSenderKeyPackage({ + state, + recipient, + senderPrivateKey: input.sender.privateKey, + }); - const isLocalRecipient = this.hasLocalAgent(recipientAgentAddress); - if (isLocalRecipient) { - try { - await this.acceptSwmSenderKeyPackage(pkg, this.node.peerId.toString(), input.ctx); - successByAgent.add(recipientAgentAddress.toLowerCase()); - } catch (err) { - recordFailure(recipientAgentAddress, recipient.recipientKeyId, err); + if (this.hasLocalAgent(recipientAgentAddress)) { + try { + await this.acceptSwmSenderKeyPackage(pkg, this.node.peerId.toString(), input.ctx); + return { kind: 'success', agentAddress: recipientAgentAddress }; + } catch (err) { + return { + kind: 'failure', + agentAddress: recipientAgentAddress, + keyId: recipient.recipientKeyId, + error: err instanceof Error ? err : new Error(String(err)), + }; + } } - continue; - } - - if (!recipient.peerId) { - recordFailure(recipientAgentAddress, recipient.recipientKeyId, new Error('no advertised peerId')); - continue; - } - this.log.info( - input.ctx, - `SWM sender-key setup send: senderAgent=${senderAgentAddress} recipientAgent=${recipientAgentAddress} ` + - `peerId=${recipient.peerId} contextGraph=${state.contextGraphId}${state.subGraphName ? `/${state.subGraphName}` : ''} ` + - `epoch=${state.epochId} membershipHash=${state.membershipHash} recipientKeyId=${recipient.recipientKeyId}`, - ); - try { - // rc.9 PR-8: route through messenger.sendReliable so - // sender-side idempotency + durable outbox + retry-with- - // backoff cover this protocol the same way they cover chat. 
- // - // Delivery semantics (C2 integration-pass relaxation): - // • `delivered=true && ack.accepted=true` → success. - // • `delivered=true && ack.accepted=false` → HARD failure - // (recipient explicitly rejected the package — bad key, - // bad membership hash, etc; queuing won't help). - // • `delivered=false` → SOFT success. - // The setup-package landed in the messenger's durable - // outbox and will be replayed when the recipient comes - // back online. Treating this as a hard failure used to - // block any open-publish-CG write whenever the curator - // was offline mid-batch, breaking the "members keep - // publishing under intermittent curator availability" - // contract C2 exercises. The recipient still gets the - // epoch + chain key eventually; the only cost is that - // they can't decrypt the broadcast that immediately - // follows until the queued setup catches up. - const sendResult = await this.messenger.sendReliable( - recipient.peerId, - PROTOCOL_SWM_SENDER_KEY, - encodeSwmSenderKeyPackage(pkg), - ); - if (!sendResult.delivered) { + if (!recipient.peerId) { + // PR-2 (SWM-fanout plan): the recipient agent has no advertised + // `dkg:peerId` triple in our local store (typically because we + // haven't synced their profile yet, or they really were never + // online). Pre-PR-2 this was a HARD failure for that key, and + // if every key for the agent landed here the whole publish + // threw — turning "one never-seen member" into "publish blocked + // for everyone". We now match the messenger.sendReliable + // soft-success contract: remember the package in memory and + // attempt delivery once the agent shows up (via the + // connection:open drain below). 
+ this.enqueuePendingSenderKey({ + senderAgentAddress: senderAgentAddress.toLowerCase(), + recipientAgentAddress: recipientAgentAddress.toLowerCase(), + recipientKeyId: recipient.recipientKeyId, + epochId: state.epochId, + contextGraphId: state.contextGraphId, + subGraphName: state.subGraphName, + packageBytes: encodeSwmSenderKeyPackage(pkg), + createdAtMs: Date.now(), + }); this.log.warn( input.ctx, `SWM sender-key setup for ${recipientAgentAddress} keyId=${recipient.recipientKeyId} ` + - `queued (not synchronously deliverable): ${sendResult.error} — recipient will receive on next reconnect`, + `queued (no advertised peerId) — will deliver when recipient connects`, ); - successByAgent.add(recipientAgentAddress.toLowerCase()); - continue; + return { kind: 'success', agentAddress: recipientAgentAddress }; } - const ack = decodeSwmSenderKeyPackageAck(sendResult.response); - if ( - ack.version !== SWM_SENDER_KEY_PACKAGE_VERSION || - ack.type !== SWM_SENDER_KEY_PACKAGE_ACK_TYPE || - !ack.accepted - ) { - recordFailure(recipientAgentAddress, recipient.recipientKeyId, new Error(ack.reason ?? 'unknown reason')); - } else { - successByAgent.add(recipientAgentAddress.toLowerCase()); + + this.log.info( + input.ctx, + `SWM sender-key setup send: senderAgent=${senderAgentAddress} recipientAgent=${recipientAgentAddress} ` + + `peerId=${recipient.peerId} contextGraph=${state.contextGraphId}${state.subGraphName ? `/${state.subGraphName}` : ''} ` + + `epoch=${state.epochId} membershipHash=${state.membershipHash} recipientKeyId=${recipient.recipientKeyId}`, + ); + try { + // rc.9 PR-8: route through messenger.sendReliable so + // sender-side idempotency + durable outbox + retry-with- + // backoff cover this protocol the same way they cover chat. + // + // Delivery semantics (C2 integration-pass relaxation): + // • `delivered=true && ack.accepted=true` → success. 
+ // • `delivered=true && ack.accepted=false` → HARD failure + // (recipient explicitly rejected the package — bad key, + // bad membership hash, etc; queuing won't help). + // • `delivered=false` → SOFT success. + // The setup-package landed in the messenger's durable + // outbox and will be replayed when the recipient comes + // back online. Treating this as a hard failure used to + // block any open-publish-CG write whenever the curator + // was offline mid-batch, breaking the "members keep + // publishing under intermittent curator availability" + // contract C2 exercises. The recipient still gets the + // epoch + chain key eventually; the only cost is that + // they can't decrypt the broadcast that immediately + // follows until the queued setup catches up. + const sendResult = await this.messenger.sendReliable( + recipient.peerId, + PROTOCOL_SWM_SENDER_KEY, + encodeSwmSenderKeyPackage(pkg), + ); + if (!sendResult.delivered) { + this.log.warn( + input.ctx, + `SWM sender-key setup for ${recipientAgentAddress} keyId=${recipient.recipientKeyId} ` + + `queued (not synchronously deliverable): ${sendResult.error} — recipient will receive on next reconnect`, + ); + return { kind: 'success', agentAddress: recipientAgentAddress }; + } + const ack = decodeSwmSenderKeyPackageAck(sendResult.response); + if ( + ack.version !== SWM_SENDER_KEY_PACKAGE_VERSION || + ack.type !== SWM_SENDER_KEY_PACKAGE_ACK_TYPE || + !ack.accepted + ) { + return { + kind: 'failure', + agentAddress: recipientAgentAddress, + keyId: recipient.recipientKeyId, + error: new Error(ack.reason ?? 'unknown reason'), + }; + } + return { kind: 'success', agentAddress: recipientAgentAddress }; + } catch (err) { + return { + kind: 'failure', + agentAddress: recipientAgentAddress, + keyId: recipient.recipientKeyId, + error: err instanceof Error ? 
err : new Error(String(err)), + }; } - } catch (err) { - recordFailure(recipientAgentAddress, recipient.recipientKeyId, err); + }), + ); + + const failuresByAgent = new Map(); + const successByAgent = new Set(); + for (let i = 0; i < settled.length; i++) { + const r = settled[i]; + if (r.status === 'rejected') { + // The per-recipient closure catches all throw paths and returns a + // failure outcome, so a rejection here means the closure itself + // crashed (programmer error). Record it against the recipient so + // the surrounding logic doesn't lose track of the slot. + const recipient = input.recipients[i]; + const agent = ethers.getAddress(recipient.agentAddress).toLowerCase(); + const list = failuresByAgent.get(agent) ?? []; + list.push(`${recipient.recipientKeyId}: ${String(r.reason)}`); + failuresByAgent.set(agent, list); + continue; + } + const outcome = r.value; + if (outcome.kind === 'success') { + successByAgent.add(outcome.agentAddress.toLowerCase()); + } else { + const agent = outcome.agentAddress.toLowerCase(); + const list = failuresByAgent.get(agent) ?? []; + list.push(`${outcome.keyId}: ${outcome.error.message}`); + failuresByAgent.set(agent, list); } } @@ -5722,6 +6061,100 @@ export class DKGAgent { return state; } + /** + * PR-2 (SWM-fanout plan): enqueue a sender-key package whose recipient + * has no advertised `dkg:peerId` (so we can't even ask the messenger + * to queue it). Older epochs for the same `(sender, recipient)` pair + * are evicted — a newer epoch supersedes them by definition. + * + * Per-key dedup: `(senderAgentAddress, recipientKeyId, epochId)` + * matches an existing row, we replace it (idempotent re-enqueue). + */ + private enqueuePendingSenderKey(entry: PendingSenderKeyEntry): void { + const recipientKey = entry.recipientAgentAddress.toLowerCase(); + const existing = this.pendingSenderKeyByAgent.get(recipientKey) ?? 
[]; + // Drop every other-epoch row from the same sender for this recipient. + // NOTE(review): no older/newer or contextGraphId check, so rows for a + // different context graph are evicted too. OTHER senders are kept. + const filtered = existing.filter((e) => { + if (e.senderAgentAddress !== entry.senderAgentAddress) return true; + if (e.epochId === entry.epochId) { + // Same epoch: dedupe by recipientKeyId — caller may re-enqueue + // on retry. Replace by dropping the old slot; the new one is + // appended below. + return e.recipientKeyId !== entry.recipientKeyId; + } + return false; + }); + filtered.push(entry); + this.pendingSenderKeyByAgent.set(recipientKey, filtered); + } + + /** + * Drain queued sender-key packages whose recipient agent is one of + * the agent addresses advertised by `peerId`. Returns the number of + * rows delivered (acked, accepted or rejected) and removed. + * + * Fired from the `connection:open` listener, so the + * cost lives on the cold path of "we just connected to a new peer", + * not on every share. Each successful `sendReliable` with + * `delivered=true && ack.accepted=true` deletes the row; soft + * (`delivered=false`) leaves it queued for the next attempt; hard + * negative acks also delete it (the package is permanently invalid + * for this recipient). + */ + private async drainPendingSenderKeyForPeer(peerId: string): Promise { + if (this.pendingSenderKeyByAgent.size === 0) return 0; + let drained = 0; + let agentAddresses: string[] = []; + try { + const profile = await this.discovery.findAgentByPeerId(peerId); + if (profile?.agentAddress) { + agentAddresses = [profile.agentAddress.toLowerCase()]; + } + } catch { + // Resolution failure is benign — we'll try again on the next + // connection:open burst. Don't propagate. 
+ } + if (agentAddresses.length === 0) return 0; + + for (const recipientAgentAddress of agentAddresses) { + const queue = this.pendingSenderKeyByAgent.get(recipientAgentAddress); + if (!queue || queue.length === 0) continue; + const remaining: PendingSenderKeyEntry[] = []; + for (const entry of queue) { + try { + const sendResult = await this.messenger.sendReliable( + peerId, + PROTOCOL_SWM_SENDER_KEY, + entry.packageBytes, + ); + if (!sendResult.delivered) { + // Messenger queued for retry — keep our row so the next + // connection:open / publish has another shot. + remaining.push(entry); + continue; + } + // Both accepted=true and accepted=false are terminal: the + // recipient saw the package. Don't retry — the messenger's + // idempotency key would block re-delivery anyway. + drained += 1; + } catch { + // Wire error: keep the row queued. Next connection:open + // attempt has its own try/catch wrapper so this never + // propagates out of the listener. + remaining.push(entry); + } + } + if (remaining.length === 0) { + this.pendingSenderKeyByAgent.delete(recipientAgentAddress); + } else { + this.pendingSenderKeyByAgent.set(recipientAgentAddress, remaining); + } + } + return drained; + } + private async createSignedSwmSenderKeyPackage(input: { state: LocalSwmSenderKeySendState; recipient: WorkspaceAgentRecipient; @@ -7014,6 +7447,17 @@ export class DKGAgent { undefined, onChainId ?? undefined, ); + // OT-RFC-38 LU-11 — also resolve the chunked emitter for curated + // CGs. When set, the publisher prefers this path: chunks fan out + // via SWM gossip and the V2 ACK carries only the commitment. + // Public CGs short-circuit to `undefined` here just like the + // single-blob resolver above. + const encryptInlineChunked = await this._resolveEncryptInlineChunked( + contextGraphId, + opts?.subGraphName, + undefined, + onChainId ?? 
undefined, + ); const result = await this.publisher.publish({ contextGraphId, @@ -7029,6 +7473,7 @@ export class DKGAgent { publishContextGraphId: onChainId ?? undefined, precomputedAttestation, encryptInlinePayload, + encryptInlineChunked, }); onPhase?.('broadcast', 'start'); @@ -7876,33 +8321,27 @@ export class DKGAgent { * NO_DATA_IN_SWM (same observable as today, the §1.1 bug). The * agent surfaces a warn so operators see the configuration miss. */ - private async _resolveEncryptInlinePayload( + /** + * Shared resolution between LU-5 (`_resolveEncryptInlinePayload`) and + * LU-11 (`_resolveEncryptInlineChunked`). Probes the access policy, + * bootstraps / rotates the swm-sender-key epoch, and returns the + * effective `chainKey` + AEAD CG-id binding. Returns `undefined` for + * public CGs so the caller stays on the plaintext-inline path. + * + * The original LU-5 method body lived inline here pre-LU-11; pulling + * it into a helper avoided drifting two near-identical curated- + * probe / epoch-rotation blocks once chunked emission joined the + * picture. All semantics (probe order, rotation triggers, fail- + * closed branches, error texts) are preserved. + */ + private async _resolveCuratedChainKeyContext( contextGraphId: string, - subGraphName?: string, - authorAgentAddress?: string, - publishContextGraphId?: string, - ): Promise<((plaintext: Uint8Array) => Promise) | undefined> { + subGraphName: string | undefined, + authorAgentAddress: string | undefined, + publishContextGraphId: string | undefined, + logPrefix: string, + ): Promise<{ chainKey: Uint8Array; aeadCgId: string; senderAddress: string } | undefined> { const ctx = createOperationContext('publish'); - // Codex PR #608 R4 #7375: the encryption decision must be keyed - // off the TARGET on-chain CG, not the source SWM graph. 
On remap - // publishes (`publishContextGraphId` differs from the local SWM - // `contextGraphId`), the prior source-only probe produced two - // distinct failure modes: - // - // public source → curated target: skipped encryption → plaintext - // leaked to the curated target's ACK peers (security). - // private source → public target: applied encryption → core's - // `isCgCurated` check (R3 #1325, now target-keyed) correctly - // rejected the opaque ACK → publish blocked (correctness). - // - // The probe mirrors the SWM data-plane `isCgCurated` callback at - // line 1499: local meta-graph first (works for URL-style ids the - // local store knows about), then chain access-policy fallback - // for numeric on-chain ids (covers the C2 case where the target - // is just the numeric `cgId` from the publish intent and the - // local store has no triple keyed by that id). Numeric IDs are - // chain-owned; if chain truth is unavailable, return UNKNOWN and - // fail closed instead of silently publishing plaintext. const targetCgId = publishContextGraphId ?? contextGraphId; const probeIsCurated = async (cgId: string): Promise => { try { @@ -7919,10 +8358,6 @@ export class DKGAgent { if (numericId <= 0n) return false; const getAccessPolicy = this.chain.getContextGraphAccessPolicy; if (typeof getAccessPolicy !== 'function') { - // Numeric ids are chain-owned policy surfaces. If the adapter - // cannot expose chain truth, choosing plaintext would risk a - // curated-target leak, so keep the UNKNOWN path and let the - // caller fail closed below. return null; } try { @@ -7933,7 +8368,7 @@ export class DKGAgent { } return null; } catch (err) { - this.log.warn(ctx, `_resolveEncryptInlinePayload: chain.getContextGraphAccessPolicy(${cgId}) failed — treating as UNKNOWN (fail-closed): ${err instanceof Error ? err.message : String(err)}`); + this.log.warn(ctx, `${logPrefix}: chain.getContextGraphAccessPolicy(${cgId}) failed — treating as UNKNOWN (fail-closed): ${err instanceof Error ? 
err.message : String(err)}`); } return null; }; @@ -7943,19 +8378,15 @@ export class DKGAgent { : await probeIsCurated(targetCgId); if (targetIsCurated == null || (targetCgId !== contextGraphId && sourceIsCurated == null)) { throw new Error( - `LU-5: publish access-policy is unknown — ` + + `${logPrefix}: publish access-policy is unknown — ` + `source CG "${contextGraphId}" curated=${sourceIsCurated ?? 'unknown'}, ` + `target CG "${targetCgId}" curated=${targetIsCurated ?? 'unknown'}. ` + `Refusing to choose plaintext vs encrypted inline payload without chain-confirmed policy.`, ); } if (targetCgId !== contextGraphId && sourceIsCurated !== targetIsCurated) { - // Fail-closed: a remap publish that crosses the privacy - // boundary in either direction is almost certainly an - // operator/caller mistake. Refuse rather than silently picking - // one side and producing the wrong wire shape. throw new Error( - `LU-5: remap publish source/target access-policy mismatch — ` + + `${logPrefix}: remap publish source/target access-policy mismatch — ` + `source CG "${contextGraphId}" curated=${sourceIsCurated}, ` + `target CG "${targetCgId}" curated=${targetIsCurated}. ` + `Refusing to publish: encrypting against the wrong CG's policy ` + @@ -7970,45 +8401,32 @@ export class DKGAgent { ?? this.defaultAgentAddress ?? this.peerId; - // Codex PR #608 R3 #7: mirror the rotation contract from - // `encryptWorkspacePayloadWithSenderKey` — always load persisted - // state FIRST so a daemon restart reuses the existing epoch - // instead of minting a new one, and ALWAYS recompute the current - // membership hash so an allowlist change forces an epoch - // rotation. 
The prior implementation only entered the bootstrap - // branch when the in-memory map happened to be empty AND never - // compared the current membership against the cached state, so - // (a) every restart silently rotated and (b) revocations / - // additions kept reusing a stale epoch until the next manual - // SWM write through `share()`. await this.loadSwmSenderKeyState(); const sender = this.getLocalSigningAgentForAddress(senderAddress); if (!sender) { throw new Error( - `LU-5: curated CG ${contextGraphId}: cannot bootstrap swm-sender-key — ` + + `${logPrefix}: curated CG ${contextGraphId}: cannot bootstrap swm-sender-key — ` + `no local custodial signing key for agent ${senderAddress}. ` + `Refusing to publish curated CG payload via the plaintext-inline fallback.`, ); } const resolution = await resolveWorkspaceAgentRecipients(this.store, { contextGraphId }); if (!resolution.requiresEncryption) { - // Access policy lookup said curated, but the recipient resolver - // disagrees. Conservative: refuse rather than silently downgrade. throw new Error( - `LU-5: curated CG ${contextGraphId}: access-policy says curated but recipient resolver ` + + `${logPrefix}: curated CG ${contextGraphId}: access-policy says curated but recipient resolver ` + `returned no agent recipients. 
Refusing to publish to avoid plaintext leak.`, ); } if (resolution.recipients.length === 0) { throw new Error( - `LU-5: curated CG ${contextGraphId}: no DKG agent recipients available — ` + + `${logPrefix}: curated CG ${contextGraphId}: no DKG agent recipients available — ` + `add at least one allowed agent before publishing.`, ); } const recipientSet = new Set(resolution.recipients.map((r) => r.agentAddress.toLowerCase())); if (!recipientSet.has(ethers.getAddress(senderAddress).toLowerCase())) { throw new Error( - `LU-5: curated CG ${contextGraphId}: sender ${senderAddress} is not in the recipient set — ` + + `${logPrefix}: curated CG ${contextGraphId}: sender ${senderAddress} is not in the recipient set — ` + `add yourself to the allowedAgents before publishing.`, ); } @@ -8029,7 +8447,7 @@ export class DKGAgent { : `membership changed (was=${state.membershipHash} now=${membershipHash})`; this.log.info( ctx, - `LU-5: bootstrapping/rotating swm-sender-key epoch for curated CG ${contextGraphId} ` + + `${logPrefix}: bootstrapping/rotating swm-sender-key epoch for curated CG ${contextGraphId} ` + `(sender=${senderAddress}, recipients=${resolution.recipients.length}, reason=${reason})`, ); state = await this.createAndDistributeSwmSenderKeyEpoch({ @@ -8044,15 +8462,24 @@ export class DKGAgent { await this.saveSwmSenderKeyState(); } - const chainKey = state.chainKey; - // Codex PR #608 R2 #12: the AEAD key must be derived from the - // *target* on-chain CG id (the one the published KC is bound to - // on chain) — not the source SWM CG id. On remap publishes - // (where the source `contextGraphId` differs from the target - // `publishContextGraphId`/`onChainId`), consumers verifying the - // KC use the canonical on-chain id; if we derive with the - // source id here, every consumer's decrypt fails. - const aeadCgId = publishContextGraphId ?? contextGraphId; + return { + chainKey: state.chainKey, + aeadCgId: publishContextGraphId ?? 
contextGraphId, + senderAddress, + }; + } + + private async _resolveEncryptInlinePayload( + contextGraphId: string, + subGraphName?: string, + authorAgentAddress?: string, + publishContextGraphId?: string, + ): Promise<((plaintext: Uint8Array) => Promise) | undefined> { + const resolved = await this._resolveCuratedChainKeyContext( + contextGraphId, subGraphName, authorAgentAddress, publishContextGraphId, 'LU-5', + ); + if (!resolved) return undefined; + const { chainKey, aeadCgId } = resolved; return async (plaintextNquads: Uint8Array): Promise => { return encryptV10PublishPayload({ chainKey, @@ -8062,6 +8489,148 @@ export class DKGAgent { }; } + /** + * OT-RFC-38 LU-11 / OT-RFC-39 — produce the chunked-AEAD inline + * callback for curated CGs. Returns `undefined` for public CGs so + * the LU-5 callback (also resolved unconditionally for curated CGs) + * stays as the only path. + * + * The returned closure does THREE things on the publish hot path: + * + * 1. slice plaintext into `CIPHERTEXT_CHUNK_SIZE_BYTES`-sized + * pieces (last chunk smaller), + * 2. AEAD-encrypt each chunk with a publish-operation-deterministic + * nonce (`deriveChunkNonce(publishOperationId, chunkIndex)`) so + * retries produce bit-identical ciphertext and idempotent SWM + * writes (idempotency is the spec's only protection against double- + * gossip racing the on-chain commitment), while distinct publish + * attempts rotate the AEAD nonce domain even if they share the + * same merkle root, + * 3. fan each ciphertext chunk out as a V2 SWM gossip envelope + * (`type = 'share-write-chunked'`, `swmMessageIndex = i`, + * payload = `[batchId(32)][ct_i]`) on the curated CG's + * workspace topic — so hosting cores (RFC-38 LU-6 host-mode) + * persist the bytes opaquely keyed by + * `(cgId, batchId, swmMessageIndex)` and members decrypt + * locally with the same chainKey they already hold. 
+ * + * The returned `ciphertextChunksRoot` is the keccak256 root over + * `keccak256(ct_i)` leaves in `swmMessageIndex` order (see + * `buildCiphertextChunksRoot` in `@origintrail-official/dkg-core`). + * That same root lands on-chain via + * `KnowledgeAssetsV10.PublishParams.ciphertextChunksRoot` and binds + * the SWM-gossiped bytes to the chain commitment — RFC-39 random + * sampling samples `(cgId, batchId, chunkId)` against this root. + */ + private async _resolveEncryptInlineChunked( + contextGraphId: string, + subGraphName?: string, + authorAgentAddress?: string, + publishContextGraphId?: string, + ): Promise< + | ((input: { plaintextNquads: Uint8Array; batchId: Uint8Array; publishOperationId: string }) => Promise<{ + ciphertextChunksRoot: Uint8Array; + ciphertextChunkCount: number; + totalCiphertextBytes: number; + }>) + | undefined + > { + const resolved = await this._resolveCuratedChainKeyContext( + contextGraphId, subGraphName, authorAgentAddress, publishContextGraphId, 'LU-11', + ); + if (!resolved) return undefined; + const { chainKey, aeadCgId } = resolved; + const wireCgId = this.gossipWireIdFor(contextGraphId); + const topic = contextGraphWorkspaceTopic(wireCgId); + const signer = await this.resolveWorkspaceGossipSigningAgent(contextGraphId); + if (!signer) { + throw new Error( + `LU-11: curated CG ${contextGraphId}: cannot resolve a workspace-gossip signing agent — ` + + `cores reject unsigned chunked envelopes. 
Add a local custodial signing key for an ` + + `allowed agent before publishing.`, + ); + } + const signerWallet = new ethers.Wallet(signer.privateKey); + const signerAgentAddress = signer.agentAddress; + const log = this.log; + const ctx = createOperationContext('publish'); + const gossip = this.gossip; + + return async (input: { plaintextNquads: Uint8Array; batchId: Uint8Array; publishOperationId: string }): Promise<{ + ciphertextChunksRoot: Uint8Array; + ciphertextChunkCount: number; + totalCiphertextBytes: number; + }> => { + if (input.batchId.length !== 32) { + throw new Error( + `LU-11: chunked emit requires a 32-byte batchId (V10 KC merkleRoot); got ${input.batchId.length}`, + ); + } + if (input.publishOperationId.length === 0) { + throw new Error('LU-11: chunked emit requires a non-empty publishOperationId'); + } + const plaintextChunks = sliceIntoCiphertextChunks(input.plaintextNquads); + const { ciphertextChunks } = encryptChunked({ + chainKey, + contextGraphId: aeadCgId, + plaintextChunks, + publishOperationId: input.publishOperationId, + }); + const { root, leafCount } = buildCiphertextChunksRoot(ciphertextChunks); + const batchIdHex = ethers.hexlify(input.batchId); + let totalCiphertextBytes = 0; + for (let i = 0; i < ciphertextChunks.length; i++) { + const ct = ciphertextChunks[i]; + totalCiphertextBytes += ct.length; + const payload = new Uint8Array(input.batchId.length + ct.length); + payload.set(input.batchId, 0); + payload.set(ct, input.batchId.length); + const timestamp = new Date().toISOString(); + const signingPayload = computeGossipSigningPayloadV2( + GOSSIP_TYPE_WORKSPACE_PUBLISH_CHUNKED, + contextGraphId, + timestamp, + payload, + i, + ); + const signature = await signerWallet.signMessage(signingPayload); + const envelope = encodeGossipEnvelope({ + version: GOSSIP_ENVELOPE_VERSION, + type: GOSSIP_TYPE_WORKSPACE_PUBLISH_CHUNKED, + contextGraphId, + agentAddress: signerAgentAddress, + timestamp, + signature: ethers.getBytes(signature), + 
payload, + swmMessageIndex: i, + }); + try { + await gossip.publish(topic, envelope); + } catch (err) { + log.warn( + ctx, + `LU-11: chunked gossip publish failed for cgId=${contextGraphId} ` + + `batchId=${batchIdHex.slice(0, 18)}... op=${input.publishOperationId} chunkIndex=${i}: ${ + err instanceof Error ? err.message : String(err) + } — cores without this chunk will DECLINE the V2 ACK; ` + + `late-join sync can backfill once the catchup verb lands.`, + ); + } + } + log.info( + ctx, + `LU-11: emitted ${ciphertextChunks.length} ciphertext chunks ` + + `(${totalCiphertextBytes} bytes total) for curated CG ${contextGraphId} ` + + `batchId=${batchIdHex.slice(0, 18)}... op=${input.publishOperationId} on topic ${topic}`, + ); + return { + ciphertextChunksRoot: root, + ciphertextChunkCount: leafCount, + totalCiphertextBytes, + }; + }; + } + private async _loadSelectedSWMQuads( contextGraphId: string, selection: 'all' | { rootEntities: string[] }, @@ -8318,6 +8887,10 @@ export class DKGAgent { ): Promise { const ctx = options?.operationCtx ?? createOperationContext('publishFromSWM'); const effectiveSubCG = options?.subContextGraphId ?? options?.contextGraphId; + // `ctxGraphIdStr` doubles as `publishContextGraphId` for REMAP-flow + // publishes — the publisher uses its presence as a signal to DELETE the + // original copy from the default data graph. Keep it empty for non-REMAP + // publishes so we don't accidentally trigger the delete. const ctxGraphIdStr = effectiveSubCG != null ? String(effectiveSubCG) : undefined; const onChainId = ctxGraphIdStr ?? (await this.getContextGraphOnChainId(contextGraphId)) ?? undefined; @@ -8382,6 +8955,21 @@ export class DKGAgent { if (encryptInlinePayload) { this.log.info(ctx, `LU-5: curated CG ${contextGraphId} — wrapping inline ACK payload with chain-key AEAD`); } + // OT-RFC-38 LU-11 — also resolve the chunked emitter. 
Publisher + // prefers the chunked path when both are set; single-blob remains + // the unconditional fallback for any code path that resolves the + // chunked callback to `undefined` (currently impossible since + // both helpers share the curated probe, but kept defensively to + // future-proof CG types whose chunked path might lag rollout). + const encryptInlineChunked = await this._resolveEncryptInlineChunked( + contextGraphId, + options?.subGraphName, + options?.authorAgentAddress, + onChainId ?? undefined, + ); + if (encryptInlineChunked) { + this.log.info(ctx, `LU-11: curated CG ${contextGraphId} — chunked path active (per-chunk SWM gossip + V2 ACK)`); + } const result = await this.publisher.publishFromSharedMemory(contextGraphId, selection, { operationCtx: ctx, @@ -8395,11 +8983,35 @@ export class DKGAgent { publisherNodeIdentityIdOverride: options?.publisherNodeIdentityIdOverride, precomputedAttestation: resolvedSeal, encryptInlinePayload, + encryptInlineChunked, }); if (result.status === 'confirmed' && result.onChainResult) { const rootEntities = result.kaManifest.map(ka => ka.rootEntity); + // Always carry the resolved on-chain CG id in the finalization gossip + // so receiving cores promote SWM into the per-cgId `_meta` namespace + // (`/context//_meta`) that the RS prover reads from. + // Without this the prover 404'd with `KCNotFoundError` on every + // freshly-published KC even though the SWM payload had been + // replicated — see scripts/devnet-test-rfc39-comprehensive.sh + // Scenario A. We pass the *publisher-resolved* `onChainId` (which + // includes both explicit REMAP targets and the auto-lookup fallback) + // rather than the REMAP-only `ctxGraphIdStr`. + const broadcastCgId = onChainId != null ? String(onChainId) : undefined; + // PR #779 / #774 followup: tell receivers whether the publisher + // kept a root-graph copy of the canonical quads. 
Same-graph + // publishes (no explicit `subContextGraphId` / `publishContextGraphId`) + // dual-write to the root `` graph and the per-on-chain-id + // partition `/context/` so label-scoped queries + // resolve. Explicit-`subContextGraphId` / remap publishes delete + // the root copy on purpose (`dkg-publisher.ts` ~line 1393), so + // receivers MUST NOT dual-write either — otherwise a remap-style + // KC would re-appear under the source CG's label on every replica. + // `ctxGraphIdStr` is the publisher-side `publishContextGraphId` + // (set on REMAP/explicit-subCG calls, undefined otherwise) — the + // exact same signal the publisher uses to gate its own root delete. + const keepRootCopyOnLabel = !ctxGraphIdStr; const msg: FinalizationMessageMsg = { ual: result.ual, contextGraphId: contextGraphId, @@ -8413,14 +9025,15 @@ export class DKGAgent { rootEntities, timestampMs: Date.now(), operationId: ctx.operationId, - targetContextGraphId: result.contextGraphError ? undefined : ctxGraphIdStr, + targetContextGraphId: result.contextGraphError ? undefined : broadcastCgId, subGraphName: options?.subGraphName, + keepRootCopyOnLabel, }; const topic = contextGraphFinalizationTopic(contextGraphId); try { await this.gossip.publish(topic, encodeFinalizationMessage(msg)); - this.log.info(ctx, `Broadcast finalization for ${result.ual} to ${topic}${ctxGraphIdStr ? ` (contextGraph=${ctxGraphIdStr})` : ''}${result.contextGraphError ? ' (ctx-graph registration failed, omitting targetContextGraphId)' : ''}`); + this.log.info(ctx, `Broadcast finalization for ${result.ual} to ${topic}${broadcastCgId ? ` (contextGraph=${broadcastCgId})` : ''}${result.contextGraphError ? 
' (ctx-graph registration failed, omitting targetContextGraphId)' : ''}`); } catch { this.log.warn(ctx, `No peers subscribed to ${topic} yet`); } @@ -9445,6 +10058,27 @@ export class DKGAgent { this.swmHostModeSubscribed.set(wireCgId, source); this.gossip.subscribe(swmTopic); const handler = (_topic: string, data: Uint8Array, from: string) => { + // OT-RFC-38 LU-11: peek envelope type and dispatch. Chunked + // envelopes (`type='share-write-chunked'`) take the V2 chunk + // persistence path; everything else flows through the legacy + // host-mode store unchanged. Failed decode falls through to + // `ingestSwmHostModeEnvelope` which is also defensive — the + // dispatch here is best-effort, not a security boundary. + let envelopeType: string | undefined; + try { + const peek = decodeGossipEnvelope(data); + envelopeType = peek?.type; + } catch { /* drop into legacy path */ } + if (envelopeType === GOSSIP_TYPE_WORKSPACE_PUBLISH_CHUNKED) { + this.ingestSwmCiphertextChunkEnvelope(contextGraphId, data, from).catch((err: unknown) => { + const msg = err instanceof Error ? err.message : String(err); + this.log.warn( + createOperationContext('system'), + `LU-11: chunked SWM ingest failed for "${contextGraphId}": ${msg}`, + ); + }); + return; + } this.ingestSwmHostModeEnvelope(contextGraphId, data, from).catch((err: unknown) => { const msg = err instanceof Error ? err.message : String(err); this.log.warn( @@ -9780,6 +10414,131 @@ export class DKGAgent { ); } + /** + * OT-RFC-38 LU-11 / OT-RFC-39 — chunked-ciphertext SWM ingest. 
+ * Receives per-chunk SWM gossip envelopes + * (`type='share-write-chunked'`) that the publisher fans out via + * `_resolveEncryptInlineChunked`, verifies envelope authority + * against the curated CG's agent allowlist (same gate as the + * legacy host-mode store), strips the 32-byte `batchId` prefix + * from the payload, and persists the remaining ciphertext bytes + * under the deterministic chunk-store subject so the V2 ACK + * verifier can recompute the publisher's claimed + * `ciphertextChunksRoot` keyed by `(cgId, batchId, chunkIndex)`. + * + * Persistence model: one base64-encoded literal per chunk, in the + * per-CG named graph `ciphertextChunkStoreGraph(cgId)` under the + * subject `ciphertextChunkStoreSubject(batchId, chunkIndex)`. The + * store insert is idempotent — the same chunk arriving twice (or + * out of order) overwrites the existing triple harmlessly because + * `subject + predicate + graph` is unique. + * + * Late-join cores that come online after a publish has finalised + * end up here only opportunistically (if a peer's mesh re-floods + * the chunked envelope), which is unreliable; commit 7 adds the + * `GetCiphertextChunk` sync verb that pulls missing chunks + * explicitly via the protocol router. + */ + private async ingestSwmCiphertextChunkEnvelope( + contextGraphId: string, + data: Uint8Array, + fromPeerId: string, + ): Promise { + if (data.length === 0) return; + const ctx = createOperationContext('share'); + let envelope: GossipEnvelopeMsg | undefined; + try { + envelope = decodeGossipEnvelope(data); + } catch { + return; + } + if (!envelope || envelope.type !== GOSSIP_TYPE_WORKSPACE_PUBLISH_CHUNKED) { + return; + } + if (envelope.payload.length <= 32) { + // Chunked payload format: [32-byte batchId][ciphertext...]. + // Anything shorter can't carry a single ciphertext byte. 
+ this.log.debug( + ctx, + `LU-11: ignoring chunked envelope on cg=${contextGraphId} from=${fromPeerId} with truncated payload (${envelope.payload.length} bytes)`, + ); + return; + } + if (typeof envelope.swmMessageIndex !== 'number' || envelope.swmMessageIndex < 0) { + this.log.debug( + ctx, + `LU-11: ignoring chunked envelope on cg=${contextGraphId} with invalid swmMessageIndex=${envelope.swmMessageIndex}`, + ); + return; + } + + // Subscription CG-id can be either cleartext (operator / member + // path) or wire-form hash (chain-event auto-subscribe). Compare + // both sides in wire-form so any combination accepts. + const envelopeWireId = this.gossipWireIdFor(envelope.contextGraphId); + const subscriptionWireId = this.gossipWireIdFor(contextGraphId); + if (envelopeWireId !== subscriptionWireId) return; + const storageCgId = envelope.contextGraphId; + + // Verify envelope signature against the curated CG's agent + // allowlist — exactly the same authority check the host-mode + // store uses; without it, any topic-reachable peer could plant + // arbitrary ciphertext under a victim's (cgId, batchId) keys. + const handlerSm = this.getOrCreateSharedMemoryHandler(); + const verdict = await handlerSm.verifyHostModeEnvelopeAuthority(data, storageCgId, fromPeerId); + if (!verdict.accepted) { + // Same transient-race classification as the LU-6 host-mode + // path: "no agent allowlist yet" is the post-create / pre- + // chain-event window; everything else is a real auth failure. + const isTransientRace = verdict.reason === 'no agent allowlist on context graph'; + const logFn = isTransientRace ? this.log.debug.bind(this.log) : this.log.warn.bind(this.log); + logFn( + ctx, + `LU-11: chunked envelope auth ${isTransientRace ? 
'deferred' : 'rejected'} for cg=${storageCgId} from=${fromPeerId} swmMessageIndex=${envelope.swmMessageIndex}: ${verdict.reason}`, + ); + return; + } + + const batchId = envelope.payload.subarray(0, 32); + const ciphertext = envelope.payload.subarray(32); + const chunkIndex = envelope.swmMessageIndex; + // Codex review on PR #715 (refined round 2 on PR #727): canonicalize + // the cgId used in the per-CG named graph via + // `canonicalChunkStoreCgIdOrNull` so persist (here) and lookup + // (`handleGetCiphertextChunk`, V2 ACK loadChunk, prover extractor) + // converge on the same wire-form key. The persist site falls back + // to the raw `storageCgId` (legacy shape) when canonicalization + // can't safely resolve — the gossip envelope's `contextGraphId` + // is typically already cleartext / wire-form, so the null path is + // unlikely here, but the fallback keeps insert semantics safe and + // mirrors the lookup-side wildcard fallback rather than + // fabricating a bad keccak-of-decimal-string. + const persistCanonical = this.canonicalChunkStoreCgIdOrNull(storageCgId); + const chunksGraph = ciphertextChunkStoreGraph(persistCanonical ?? storageCgId); + const subject = ciphertextChunkStoreSubject(batchId, chunkIndex); + const literal = `"${Buffer.from(ciphertext).toString('base64')}"`; + try { + await this.store.insert([{ + subject, + predicate: CIPHERTEXT_CHUNK_PREDICATE, + object: literal, + graph: chunksGraph, + }]); + } catch (err) { + this.log.warn( + ctx, + `LU-11: failed to persist chunk cg=${storageCgId} batchId=${ethers.hexlify(batchId).slice(0, 18)}... chunkIndex=${chunkIndex}: ${ + err instanceof Error ? err.message : String(err) + }`, + ); + return; + } + this.log.info( + ctx, + `LU-11: persisted ciphertext chunk cg=${storageCgId} batchId=${ethers.hexlify(batchId).slice(0, 18)}... 
chunkIndex=${chunkIndex} bytes=${ciphertext.length}`, + ); + } + /** * OT-RFC-38 / LU-6 Phase B — curator-side: record a CG so the * periodic beacon timer keeps re-announcing it AND broadcast an @@ -10112,6 +10871,483 @@ export class DKGAgent { }); } + /** + * OT-RFC-38 LU-11 / OT-RFC-39 — responder for the + * `/dkg/10.0.2/get-ciphertext-chunk` sync verb. Loads one + * `(cgId, batchId, chunkIndex)` ciphertext from the local + * triple-store-backed chunk store and returns the base64 bytes + * (or a typed denial: bad signature, unauthorized, missing + * chunk). Authorization piggybacks on the existing LU-6 + * UNION-of-authorities gate: any source that recognises the + * requester EOA accepts (on-chain participants, beacon curator, + * local agent gate, libp2p peer allowlist). PR-B will refine + * this to include a sharding-table-membership chain probe so + * late-joining hosting cores (which won't be on the agent + * allowlist) can backfill ciphertexts they need to participate + * in RFC-39 random sampling. + */ + private async handleGetCiphertextChunk(data: Uint8Array, fromPeerId: string): Promise { + const ctx = createOperationContext('share'); + let req: CiphertextChunkCatchupRequest; + try { + req = decodeCiphertextChunkCatchupRequest(data); + } catch (err) { + const reason = err instanceof Error ? 
err.message : String(err); + return encodeCiphertextChunkCatchupResponse({ + version: CIPHERTEXT_CHUNK_CATCHUP_WIRE_VERSION, + contextGraphId: '', + batchIdHex: '', + chunkIndex: -1, + denied: `malformed request: ${reason}`, + }); + } + const nowMs = Date.now(); + const verify = verifySignedCiphertextChunkCatchupRequest(req, nowMs); + if (!verify.ok || !verify.recoveredSigner) { + this.log.info( + ctx, + `LU-11 chunk-catchup denied cg=${req.contextGraphId} from=${fromPeerId} requesterEoa=${req.requesterEoa} chunkIndex=${req.chunkIndex}: ${verify.reason}`, + ); + return encodeCiphertextChunkCatchupResponse({ + version: CIPHERTEXT_CHUNK_CATCHUP_WIRE_VERSION, + contextGraphId: req.contextGraphId, + batchIdHex: ethers.hexlify(req.batchId), + chunkIndex: req.chunkIndex, + denied: verify.reason ?? 'signature verification failed', + }); + } + const requesterEoa = verify.recoveredSigner; + if (!this.ciphertextChunkCatchupReplayGuard.recordIfFresh(requesterEoa, req.nonce, req.issuedAtMs, nowMs)) { + return encodeCiphertextChunkCatchupResponse({ + version: CIPHERTEXT_CHUNK_CATCHUP_WIRE_VERSION, + contextGraphId: req.contextGraphId, + batchIdHex: ethers.hexlify(req.batchId), + chunkIndex: req.chunkIndex, + denied: 'replayed chunk-catchup nonce', + }); + } + + // Reuse the LU-6 host-catchup authorization shape via a thin + // adapter — same UNION-of-authorities logic, but the chunk-catchup + // request payload lacks `sinceSeqno`/`maxEntries`/`maxBytes` so + // we pack the chunked-request fields into the shared verifier's + // shape with zero-defaults for the unused slots. (The shared + // authorization helper only reads `contextGraphId` and the EOA; + // the other fields are signature-digest input, not authorization + // input.) 
+ let authOk = false; + let authReason: string = 'no authority source available for context graph'; + const requesterLower = requesterEoa.toLowerCase(); + let anyAuthorityFound = false; + try { + const chainParticipants = await this.resolveOnChainParticipantAgents(req.contextGraphId); + if (chainParticipants !== null) { + anyAuthorityFound = true; + if (chainParticipants.some((a) => a.toLowerCase() === requesterLower)) authOk = true; + } + } catch { /* probe failure non-fatal */ } + if (!authOk) { + try { + const beaconCurator = await this.resolveBeaconPinnedCuratorEoa(req.contextGraphId); + if (beaconCurator) { + anyAuthorityFound = true; + if (beaconCurator.toLowerCase() === requesterLower) authOk = true; + } + } catch { /* probe failure non-fatal */ } + } + if (!authOk) { + try { + const agentGate = await this.getContextGraphAgentGateAddresses(req.contextGraphId); + if (agentGate !== null) { + anyAuthorityFound = true; + if (agentGate.some((a) => a.toLowerCase() === requesterLower)) authOk = true; + } + } catch { /* probe failure non-fatal */ } + } + if (!authOk) { + try { + const allowedPeers = await this.getContextGraphAllowedPeers(req.contextGraphId); + if (allowedPeers !== null) { + anyAuthorityFound = true; + if (allowedPeers.includes(fromPeerId)) authOk = true; + } + } catch { /* probe failure non-fatal */ } + } + // OT-RFC-39 fifth authority — registered node operator. + // + // The four authorities above are MEMBER- or CURATOR-shaped: they + // gate "can this EOA decrypt / participate in" the CG. Curated + // CGs almost never list every sharding-table core in + // `allowedAgents` (curators only enrol agents that need to + // decrypt), so the existing layers deny EVERY core-to-core + // chunk fetch — exactly the late-join scenario OT-RFC-39 is + // designed to fix. Closing that gap means admitting any peer + // whose EOA is a registered node operator (identityId > 0n on + // chain). 
Three reasons this is safe for the CIPHERTEXT path + // (and not generalisable to plaintext catchup): + // + // 1. The bytes carried are AEAD-encrypted with the curator's + // sender key. A node operator without the sender key gets + // opaque ciphertext that is computationally indistinguishable + // from random, so no decryption power leaks. + // + // 2. The on-chain `(ciphertextChunksRoot, ciphertextChunkCount)` + // commitment is already public — anyone observing chain state + // learns "curated KC X has N chunks of size up to S each" + // without needing the wire fetch. The metadata our responder + // reveals is a strict subset of what the chain already + // reveals. + // + // 3. Registering an on-chain identity costs TRAC stake — it's + // a Sybil-resistant credential. Pairing the EOA recovery + // above (which proves the requester holds the operator key) + // with a non-zero identityId restricts ciphertext fetch to + // the same trust set the random-sampling picker draws from, + // which is the spec-intended population for hosting. + // + // Wire effect: the late-join sync verb now succeeds for any + // sharding-table core requesting chunks for any curated CG. The + // prover's auto-backfill can complete; the missed core proves + // its hosting and earns rewards on the period it would otherwise + // forfeit. + if (!authOk && typeof this.chain.getIdentityIdForAddress === 'function') { + try { + const reqIdentityId = await this.chain.getIdentityIdForAddress(requesterEoa); + if (reqIdentityId > 0n) { + anyAuthorityFound = true; + authOk = true; + this.log.debug( + ctx, + `LU-11 chunk-catchup admitted via OT-RFC-39 node-operator authority cg=${req.contextGraphId} requesterEoa=${requesterEoa} identityId=${reqIdentityId.toString()}`, + ); + } + } catch (err) { + this.log.debug( + ctx, + `LU-11 chunk-catchup node-operator probe failed cg=${req.contextGraphId} requesterEoa=${requesterEoa}: ${err instanceof Error ? 
err.message : String(err)}`, + ); + } + } + if (!authOk) { + authReason = anyAuthorityFound + ? 'requester EOA not in any of: on-chain participants, beacon curator, local agent-gate, allowedPeers, node-operator-registry' + : 'no authority source available for context graph'; + this.log.info( + ctx, + `LU-11 chunk-catchup denied cg=${req.contextGraphId} from=${fromPeerId} requesterEoa=${requesterEoa} chunkIndex=${req.chunkIndex}: ${authReason}`, + ); + return encodeCiphertextChunkCatchupResponse({ + version: CIPHERTEXT_CHUNK_CATCHUP_WIRE_VERSION, + contextGraphId: req.contextGraphId, + batchIdHex: ethers.hexlify(req.batchId), + chunkIndex: req.chunkIndex, + denied: authReason, + }); + } + + // Locate the chunk. Codex review (round 2) on PR #727: pin to the + // per-CG named graph when we can safely canonicalize `req.contextGraphId` + // (cleartext / bare-hex / locally-registered numeric on-chain id), + // and fall back to the wildcard `GRAPH ?g` scan when we can't. The + // previous PR #715 fix would have keccak'd a literal decimal string + // like "42" and produced a hash that did NOT match the curator + // nameHash → "chunk not found" for any requester that addressed + // the CG by its numeric on-chain id, narrowing the public API in + // a way that wasn't advertised. Scoped pinning still gives us the + // multi-CG identical-KC isolation we wanted from PR #715 whenever + // canonicalization succeeds; the wildcard fallback preserves the + // historical responder contract for the catching-up / numeric-id + // cases. + const canonicalCgIdForChunks = this.canonicalChunkStoreCgIdOrNull(req.contextGraphId); + const chunksGraphForLookup = canonicalCgIdForChunks + ? ciphertextChunkStoreGraph(canonicalCgIdForChunks) + : null; + const graphClause = chunksGraphForLookup + ? 
`GRAPH <${chunksGraphForLookup}>` + : 'GRAPH ?g'; + const subject = ciphertextChunkStoreSubject(req.batchId, req.chunkIndex); + const sparql = `SELECT ?o WHERE { ${graphClause} { <${subject}> <${CIPHERTEXT_CHUNK_PREDICATE}> ?o } } LIMIT 1`; + let result; + try { + result = await this.store.query(sparql); + } catch (err) { + const reason = err instanceof Error ? err.message : String(err); + this.log.warn(ctx, `LU-11 chunk-catchup store query failed cg=${req.contextGraphId} chunkIndex=${req.chunkIndex}: ${reason}`); + return encodeCiphertextChunkCatchupResponse({ + version: CIPHERTEXT_CHUNK_CATCHUP_WIRE_VERSION, + contextGraphId: req.contextGraphId, + batchIdHex: ethers.hexlify(req.batchId), + chunkIndex: req.chunkIndex, + denied: `store error: ${reason}`, + }); + } + if (result.type !== 'bindings' || result.bindings.length === 0) { + return encodeCiphertextChunkCatchupResponse({ + version: CIPHERTEXT_CHUNK_CATCHUP_WIRE_VERSION, + contextGraphId: req.contextGraphId, + batchIdHex: ethers.hexlify(req.batchId), + chunkIndex: req.chunkIndex, + denied: 'chunk not found', + }); + } + const literal = result.bindings[0]?.['o']; + if (typeof literal !== 'string') { + return encodeCiphertextChunkCatchupResponse({ + version: CIPHERTEXT_CHUNK_CATCHUP_WIRE_VERSION, + contextGraphId: req.contextGraphId, + batchIdHex: ethers.hexlify(req.batchId), + chunkIndex: req.chunkIndex, + denied: 'chunk stored value malformed', + }); + } + const ciphertextB64 = literal.startsWith('"') && literal.endsWith('"') + ? literal.slice(1, -1) + : literal; + this.log.debug( + ctx, + `LU-11 chunk-catchup served cg=${req.contextGraphId} from=${fromPeerId} batchId=${ethers.hexlify(req.batchId).slice(0, 18)}... 
chunkIndex=${req.chunkIndex} bytes=${Buffer.from(ciphertextB64, 'base64').length}`, + ); + return encodeCiphertextChunkCatchupResponse({ + version: CIPHERTEXT_CHUNK_CATCHUP_WIRE_VERSION, + contextGraphId: req.contextGraphId, + batchIdHex: ethers.hexlify(req.batchId), + chunkIndex: req.chunkIndex, + ciphertextB64, + }); + } + + /** + * OT-RFC-38 LU-11 / OT-RFC-39 — requester for the + * `/dkg/10.0.2/get-ciphertext-chunk` sync verb. Pulls one + * `(cgId, batchId, chunkIndex)` ciphertext from a known host and + * (when `persist === true`) writes it into the local per-chunk + * store so the V2 ACK verifier sees it on the next pass. Returns + * the raw decoded response so callers can inspect denial reasons + * or feed bytes to a member-side verifier. + * + * Late-joining hosting cores call this in a loop to backfill the + * `(cgId, batchId, 0..count-1)` set after seeing + * `KnowledgeCollectionCiphertextCommitmentSet` on chain or + * `MISSING_CIPHERTEXT_CHUNKS` from a V2 ACK request they + * routed forward. Loop policy + peer selection are intentionally + * caller-owned — this method is the single-pull primitive. + */ + async fetchCiphertextChunkFromPeer( + remotePeerId: string, + contextGraphId: string, + batchId: Uint8Array, + chunkIndex: number, + options?: { + persist?: boolean; + /** + * @deprecated Reserved for a future alternate-signer plumb-through. + * No-op today: the closure below always uses + * `this.chain.signMessage`. Kept on the public signature so + * existing TypeScript callers continue to compile through the + * rc.12 line (Codex review round 2 on PR #727 flagged + * removing it as a breaking API change). Will be removed in a + * future intentional major-version break — either replaced by + * a real signer callback (`sign?: (digest) => Promise`) + * or dropped entirely if no caller ever materialises. 
+ */ + signWithChainAdapter?: boolean; + }, + ): Promise { + if (batchId.length !== 32) { + throw new Error(`fetchCiphertextChunkFromPeer requires a 32-byte batchId; got ${batchId.length}`); + } + if (!Number.isInteger(chunkIndex) || chunkIndex < 0) { + throw new Error(`fetchCiphertextChunkFromPeer requires a non-negative chunkIndex; got ${chunkIndex}`); + } + const ctx = createOperationContext('share'); + // Codex review on PR #715 / #717 / #727: the option above is a + // back-compat no-op. The implementation requires a chain adapter + // with `signMessage`; there is no real alternate-signer path yet, + // so callers must wire the chain. Honest error if absent. + if (typeof this.chain.signMessage !== 'function') { + throw new Error('fetchCiphertextChunkFromPeer: chain adapter does not expose signMessage; the LU-11 sync verb requires an operator-key signer'); + } + const sign = async (digest: Uint8Array) => { + // Match the host-catchup pattern: chain.signMessage returns + // {r, vs}; re-serialise to the 65-byte EIP-191 hex shape. 
+ const { r, vs } = await this.chain.signMessage!(digest); + const sig = ethers.Signature.from({ r: ethers.hexlify(r), yParityAndS: ethers.hexlify(vs) }); + return sig.serialized; + }; + const signedReq = await mintSignedCiphertextChunkCatchupRequest({ + contextGraphId, + batchId, + chunkIndex, + sign, + }); + const reqBytes = encodeCiphertextChunkCatchupRequest(signedReq); + const sendResult = await this.messenger.sendReliable(remotePeerId, PROTOCOL_GET_CIPHERTEXT_CHUNK, reqBytes); + if (!sendResult.delivered) { + throw new Error(`LU-11 chunk-catchup transport failed: ${sendResult.error}`); + } + const resp = decodeCiphertextChunkCatchupResponse(sendResult.response); + if (options?.persist && resp.ciphertextB64) { + const subject = ciphertextChunkStoreSubject(batchId, chunkIndex); + const literal = `"${resp.ciphertextB64}"`; + // Codex review on PR #715 (refined round 2 on PR #727): use the + // central canonical helper so this persist site matches the + // ingest persist site exactly, including the safe fallback when + // canonicalization can't resolve. `contextGraphId` here is the + // local CG id the prover-side backfill passed in (cleartext + // resolved via `resolveLocalCgIdByOnChainId` in + // `buildCiphertextChunkBackfill`), so the helper normally + // returns a wire hash; the null path is theoretical defense. + const persistCanonical = this.canonicalChunkStoreCgIdOrNull(contextGraphId); + const chunksGraphForPersist = ciphertextChunkStoreGraph(persistCanonical ?? contextGraphId); + try { + await this.store.insert([{ + subject, + predicate: CIPHERTEXT_CHUNK_PREDICATE, + object: literal, + graph: chunksGraphForPersist, + }]); + this.log.debug( + ctx, + `LU-11 chunk-catchup persisted cg=${contextGraphId} batchId=${ethers.hexlify(batchId).slice(0, 18)}... chunkIndex=${chunkIndex} from=${remotePeerId}`, + ); + } catch (err) { + this.log.warn( + ctx, + `LU-11 chunk-catchup persistence failed cg=${contextGraphId} batchId=${ethers.hexlify(batchId).slice(0, 18)}... 
chunkIndex=${chunkIndex}: ${err instanceof Error ? err.message : String(err)}`, + ); + } + } + return resp; + } + + /** + * OT-RFC-39 — resolve a numeric on-chain CG id (the form the prover + * sees from `createChallenge` / `getKCContextGraphId`) back to the + * local cleartext id this agent registered the CG under. Scans + * `subscribedContextGraphs` because the reverse map is keyed by the + * wire-form `onChainHash`, not the numeric id. Returns null when + * this node has never seen the CG (legitimate during the chain-event + * replay race window after restart — caller falls back to passing + * the numeric id as a string, which the responder's authorization + * layer also resolves via on-chain participant lookup). + */ + private resolveLocalCgIdByOnChainId(onChainId: bigint): string | null { + const target = onChainId.toString(); + for (const [localId, sub] of this.subscribedContextGraphs) { + if (sub.onChainId === target) return localId; + } + return null; + } + + /** + * OT-RFC-39 — build the per-tick auto-backfill closure handed to the + * Random Sampling prover via {@link bindRandomSampling}. The closure + * is invoked when `extractCiphertextChunksFromStore` reports + * `CiphertextChunksMissingError`; it pulls the missing chunks from + * authorized peers and persists them so the prover's one-shot retry + * can build the proof. + * + * Peer discovery uses the same source the publish path uses: + * `gossip.getSubscribers(contextGraphWorkspaceTopic(wireId))`. Every + * authorized hosting core subscribes to that topic to receive the + * chunked-publish gossip, so the subscriber snapshot is the natural + * "who can answer me right now" set. Falls back to "no peers" when + * the local cleartext CG id is unknown (chain replay hasn't caught + * up yet) — the prover then logs `kc-not-synced` and re-ticks in + * 30s, by which time the chain handler has populated + * `subscribedContextGraphs`. 
+ * + * Authorization happens on the RESPONDER side + * (`handleGetCiphertextChunk`): every peer the requester contacts + * verifies the request's recovered EOA against the on-chain + * participant set / beacon curator / agent-gate / allowedPeers. + * Requesters that aren't in any authority set get a `denied` ACK + * and we skip to the next peer. + * + * Cap policy: one fetch per missing chunk per peer; iterate peers + * until a chunk lands or we exhaust the list. No retries inside the + * hook — the prover's outer 30s loop is the natural retry boundary. + */ + private buildCiphertextChunkBackfill( + ctx: OperationContext, + ): (req: { cgId: bigint; batchId: Uint8Array; missingIndexes: number[] }) => Promise<{ fetched: number; failures: number; reason?: string }> { + return async ({ cgId, batchId, missingIndexes }) => { + if (missingIndexes.length === 0) return { fetched: 0, failures: 0 }; + + const localCgId = this.resolveLocalCgIdByOnChainId(cgId); + if (!localCgId) { + return { + fetched: 0, + failures: missingIndexes.length, + reason: 'cg-not-locally-registered', + }; + } + + const wireId = this.gossipWireIdFor(localCgId); + const workspaceTopic = contextGraphWorkspaceTopic(wireId); + let selfPeer: string | null = null; + try { selfPeer = this.peerId; } catch { /* pre-start */ } + const allSubscribers = this.gossip.getSubscribers(workspaceTopic); + const candidatePeers = Array.from(new Set( + allSubscribers.filter((p) => p && p !== selfPeer), + )); + + if (candidatePeers.length === 0) { + return { + fetched: 0, + failures: missingIndexes.length, + reason: 'no-peers', + }; + } + + const batchIdHex = ethers.hexlify(batchId).slice(0, 18); + this.log.info( + ctx, + `LU-11 backfill start cg=${localCgId} batchId=${batchIdHex}... 
missing=${missingIndexes.length} peers=${candidatePeers.length}`, + ); + + let fetched = 0; + let failures = 0; + let lastDenied: string | undefined; + for (const idx of missingIndexes) { + let got = false; + for (const peer of candidatePeers) { + try { + const resp = await this.fetchCiphertextChunkFromPeer(peer, localCgId, batchId, idx, { + persist: true, + }); + if (resp.denied) { + lastDenied = resp.denied; + continue; + } + if (resp.ciphertextB64) { + got = true; + break; + } + } catch (err) { + this.log.debug( + ctx, + `LU-11 backfill peer=${peer} chunk=${idx} cg=${localCgId} error: ${err instanceof Error ? err.message.slice(0, 200) : String(err).slice(0, 200)}`, + ); + } + } + if (got) fetched++; + else failures++; + } + + this.log.info( + ctx, + `LU-11 backfill done cg=${localCgId} batchId=${batchIdHex}... fetched=${fetched} failures=${failures}${lastDenied ? ` lastDenied=${lastDenied}` : ''}`, + ); + return { + fetched, + failures, + ...(failures > 0 && fetched === 0 && lastDenied ? { reason: `all-denied: ${lastDenied}` } : {}), + ...(failures > 0 && fetched === 0 && !lastDenied ? { reason: 'no-responders' } : {}), + }; + }; + } + /** * OT-RFC-38 LU-6 B1 — authorize a signed `swm-host-catchup` request. * @@ -11549,6 +12785,11 @@ export class DKGAgent { this.store, this.chain.chainId === 'none' ? undefined : this.chain, this.eventBus, + // Defensive: when a peer's finalization gossip omits + // `targetContextGraphId` (pre-cd68fa689 publisher in the mesh), + // resolve the on-chain id locally so per-cgId promotion still + // fires and the RS prover sees the KC. 
+ (cgName: string) => this.getContextGraphOnChainId(cgName), ); } return this.finalizationHandler; @@ -12149,8 +13390,52 @@ export class DKGAgent { if (resolvedLocalAccessPolicy !== undefined && resolvedLocalAccessPolicy !== LOCAL_ACCESS_OPEN && resolvedLocalAccessPolicy !== LOCAL_ACCESS_CURATED) { throw new Error('accessPolicy must be 0 (open) or 1 (private/curated)'); } + // Closes #774 finding #1 — `dkg context-graph register + // --access-policy 1` against a CG that was created public used to + // silently register on-chain as curated while the local CG stayed + // public. The subsequent `dkg publish my-cg` then tripped the + // pre-publish LU-5 guard with a CG-policy-mismatch error and the + // operator had no easy way to recover (the LU-5 guard is correct + // — encrypting against the wrong CG's policy would leak plaintext + // OR be rejected by cores). + // + // Fail fast at register time with a clear remediation pointer so + // the operator never gets into the half-registered state. The + // single-call API (`POST /api/context-graph/create + // {accessPolicy:1, register:true, allowedAgents:[…]}`) and the + // CLI `dkg context-graph create my-cg --access-policy 1` BOTH set + // the local access policy at create time — those remain the + // supported paths for curated CGs. + // `isPrivateContextGraph()` reflects the CURRENT local ACL state, + // not strictly the create-time policy: a CG created public can + // later be locked down via allowlist mutations and would then + // also report `actualLocalIsCurated = true`. Phrase the error in + // terms of the current ACL state so the message stays accurate + // regardless of which write flipped the local policy (Codex r2 + // on #777). The remediation pointer covers both atomic-create + // paths because that is the only supported way to bring the CG + // out of the mismatched state. 
+ const actualLocalIsCurated = await this.isPrivateContextGraph(id); + if ( + resolvedLocalAccessPolicy !== undefined + && ((resolvedLocalAccessPolicy === LOCAL_ACCESS_CURATED) !== actualLocalIsCurated) + ) { + const localStr = actualLocalIsCurated ? 'private/curated (1)' : 'public/open (0)'; + const requestedStr = resolvedLocalAccessPolicy === LOCAL_ACCESS_CURATED + ? 'private/curated (1)' + : 'public/open (0)'; + throw new Error( + `Context graph "${id}" currently has local access policy=${localStr} but register was called with --access-policy ${requestedStr}. ` + + `register cannot change the local access policy — encrypting against a different policy than the CG actually has would either leak plaintext or be rejected by cores ` + + `(this is what the pre-publish LU-5 guard then refuses). ` + + `To create a curated CG atomically, use one of: ` + + `(a) \`dkg context-graph create --access-policy 1 --allowed-agent \`, ` + + `(b) the single-call API \`POST /api/context-graph/create { accessPolicy: 1, register: true, allowedAgents: [...] }\`. ` + + `Then register without --access-policy.`, + ); + } if (resolvedLocalAccessPolicy === undefined) { - resolvedLocalAccessPolicy = await this.isPrivateContextGraph(id) + resolvedLocalAccessPolicy = actualLocalIsCurated ? LOCAL_ACCESS_CURATED : LOCAL_ACCESS_OPEN; } @@ -15825,6 +17110,52 @@ export class DKGAgent { return ethers.keccak256(ethers.toUtf8Bytes(localId)).toLowerCase(); } + /** + * OT-RFC-39 Codex review (round 2) on PR #727: + * `gossipWireIdFor(rawId)` would happily keccak a literal numeric + * string ("42") as if it were cleartext, producing a hash that does + * NOT equal the curator-committed `nameHash`. That's fine in any + * context where the input is guaranteed to be either cleartext or + * bare hex (gossip-topic construction, host-mode bookkeeping). 
The + * LU-11 ciphertext-chunk-store named graph is more sensitive: a + * remote requester / ACK PublishIntent may legitimately carry the + * numeric on-chain id, and pinning a SPARQL `GRAPH` to the wrong + * hash means the lookup misses every persisted chunk and declines + * a valid publish (Bug #4) or returns `chunk not found` (Bug #5). + * + * This helper resolves the canonical wire form for chunk-store + * routing OR returns null to signal "use wildcard `GRAPH ?g` + * fallback" — caller's responsibility. Numeric ids that can't be + * resolved through the local subscription map (chain replay hasn't + * caught up; CG isn't locally registered) return null rather than + * silently producing the wrong hash. + * + * Routing rules (first match wins): + * 1. `0x[64-hex]` → lowercase, already wire form + * 2. Tracked in `subscribedContextGraphs` → `gossipWireIdFor` (returns the onChainHash) + * 3. Pure decimal → `resolveLocalCgIdByOnChainId` then wire-form; null if unknown + * 4. Everything else (cleartext) → `gossipWireIdFor` (keccak of the cleartext bytes) + * + * Rule 3 NEVER falls through to a raw keccak of the decimal string — + * that would reproduce the exact bug Codex called out. The caller + * MUST handle the null return by widening to a wildcard scan. + */ + private canonicalChunkStoreCgIdOrNull(rawId: string): string | null { + if (typeof rawId !== 'string' || rawId.length === 0) return null; + if (/^0x[0-9a-fA-F]{64}$/.test(rawId)) return rawId.toLowerCase(); + if (this.subscribedContextGraphs.has(rawId)) return this.gossipWireIdFor(rawId); + if (/^\d+$/.test(rawId)) { + try { + const local = this.resolveLocalCgIdByOnChainId(BigInt(rawId)); + if (local === null) return null; + return this.gossipWireIdFor(local); + } catch { + return null; + } + } + return this.gossipWireIdFor(rawId); + } + /** * Canonical key for the host-mode subscription bookkeeping maps * (`swmHostModeSubscribed`, `swmHostModeHandlers`). 
@@ -17792,6 +19123,10 @@ export class DKGAgent { clearInterval(this.beaconReannounceTimer); this.beaconReannounceTimer = undefined; } + if (this.agentProfileHeartbeatTimer) { + clearInterval(this.agentProfileHeartbeatTimer); + this.agentProfileHeartbeatTimer = undefined; + } if (this.syncReconcilerTimer) { clearInterval(this.syncReconcilerTimer); this.syncReconcilerTimer = null; @@ -17979,6 +19314,20 @@ export class DKGAgent { } } : undefined, + // Surface the structured verifier when the chain adapter implements + // it. Translates a thrown chain-side exception into an explicit + // `'rpc-error'` reason so the ACKCollector can log infra failures + // distinctly from definitive key/stake rejections — pre-PR this + // try/catch swallowed RPC errors as `false`, conflating them. + verifyIdentityDetailed: typeof this.chain.verifyACKIdentityDetailed === 'function' + ? async (recoveredAddress: string, claimedIdentityId: bigint) => { + try { + return await this.chain.verifyACKIdentityDetailed!(recoveredAddress, claimedIdentityId); + } catch { + return { valid: false, reason: 'rpc-error' as const }; + } + } + : undefined, log: (msg: string) => { const ctx = createOperationContext('publish'); this.log.info(ctx, msg); @@ -18000,6 +19349,14 @@ export class DKGAgent { subGraphName: string | undefined, merkleLeafCount: number, isEncryptedPayload?: boolean, + // OT-RFC-38 LU-11 — when present, the publisher's chunked + // emitter has already AEAD-encrypted + SWM-gossiped per-chunk + // ciphertexts. The collector routes through V2 ACK with empty + // stagingQuads and these fields populating PublishIntent. 
+ chunkedCommitment?: { + ciphertextChunksRoot: Uint8Array; + ciphertextChunkCount: number; + }, ) => { // Fail loud on non-numeric or non-positive CG ids: V10 publish requires // a real on-chain context graph and the contract rejects `cgId == 0` @@ -18087,6 +19444,7 @@ export class DKGAgent { subGraphName, merkleLeafCount, isEncryptedPayload, + chunkedCommitment, }); return result.acks; }; diff --git a/packages/agent/src/finalization-handler.ts b/packages/agent/src/finalization-handler.ts index b770a775c..5d32b8ae3 100644 --- a/packages/agent/src/finalization-handler.ts +++ b/packages/agent/src/finalization-handler.ts @@ -19,17 +19,35 @@ import { const DKG_NS = 'http://dkg.io/ontology/'; import { ethers } from 'ethers'; +/** + * Resolves a local context-graph id (the topic/CG name used in gossip) to + * its on-chain numeric id. Returns `null`/`undefined` for CGs that aren't + * registered on-chain. Used as a fallback when a peer-finalization gossip + * envelope omits `targetContextGraphId` (e.g. a pre-cd68fa689 publisher + * still in the mesh). 
+ */ +export type ResolveContextGraphOnChainId = ( + contextGraphId: string, +) => Promise; + export class FinalizationHandler { private readonly store: TripleStore; private readonly chain: ChainAdapter | undefined; private readonly eventBus: EventBus | undefined; + private readonly resolveContextGraphOnChainId: ResolveContextGraphOnChainId | undefined; private readonly log = new Logger('FinalizationHandler'); private readonly processedUals = new Set(); - constructor(store: TripleStore, chain: ChainAdapter | undefined, eventBus?: EventBus) { + constructor( + store: TripleStore, + chain: ChainAdapter | undefined, + eventBus?: EventBus, + resolveContextGraphOnChainId?: ResolveContextGraphOnChainId, + ) { this.store = store; this.chain = chain; this.eventBus = eventBus; + this.resolveContextGraphOnChainId = resolveContextGraphOnChainId; } async handleFinalizationMessage(data: Uint8Array, contextGraphId: string): Promise { @@ -61,7 +79,29 @@ export class FinalizationHandler { const startKAId = protoToBigInt(msg.startKAId); const endKAId = protoToBigInt(msg.endKAId); - const ctxGraphId = msg.targetContextGraphId || undefined; + // The publisher's `cd68fa689` fix threads the resolved on-chain CG id + // into `targetContextGraphId` so receivers route SWM promotion into + // the per-cgId `/context//_meta` graph that the RS + // prover reads from. Pre-fix publishers (or any publisher whose + // `getContextGraphOnChainId` lookup returns null at gossip time) emit + // `targetContextGraphId: undefined`, which used to silently downgrade + // the receiver to legacy `/_meta` promotion — leaving the + // prover stuck on `kc-not-synced` until every publisher in the mesh + // ships the fix. As a belt-and-braces for rolling upgrades we resolve + // the id locally when the wire is empty; resolver failures or + // not-on-chain CGs fall back to legacy behavior unchanged. 
+ let ctxGraphId = msg.targetContextGraphId || undefined; + if (!ctxGraphId && this.resolveContextGraphOnChainId) { + try { + const resolved = await this.resolveContextGraphOnChainId(contextGraphId); + if (resolved !== null && resolved !== undefined && String(resolved).length > 0) { + ctxGraphId = String(resolved); + this.log.info(ctx, `Finalization: gossip omitted targetContextGraphId; resolved locally to ${ctxGraphId} (defensive lookup)`); + } + } catch (err) { + this.log.warn(ctx, `Finalization: defensive on-chain CG id lookup failed for ${contextGraphId}: ${err instanceof Error ? err.message : String(err)}`); + } + } // Validate sub-graph name from gossip — reject invalid names entirely let subGraphName: string | undefined; @@ -100,11 +140,35 @@ export class FinalizationHandler { ); if (verified) { + // Codex r5b — drop the rolling-upgrade legacy-publisher + // fallback. Earlier rounds inferred same-graph intent from + // `targetContextGraphId === local-on-chain-id-for(contextGraphId)`, + // but Codex r5b correctly observed that signal is ambiguous: + // it ALSO matches an explicit-remap-to-self publish (one where + // the legacy publisher passed `subContextGraphId === ownCG's + // own on-chain id` to deliberately drop the root copy). Both + // shapes hit `targetContextGraphId === local id` on the wire, + // so the fallback would re-add a root copy that the publisher + // had intentionally removed — a data-isolation regression. + // + // The cure is worse than the disease: trading a hard + // data-isolation bug for a soft query-discoverability gap is + // unacceptable. Without an unambiguous version/intent signal + // on the wire, a legacy publisher's same-graph publish stays + // queryable on receivers via per-cgId partitions but not via + // the bare `` label until the publisher upgrades to a + // tristate-emitting build and re-emits. 
New publishers always + // set the tristate (encoded with explicit KEEP/DROP) so the + // gap is bounded by the upgrade window. PR #779 has not + // shipped to any production peer yet, so this is the right + // moment to harden the contract. + const keepRootCopyOnLabel: boolean = msg.keepRootCopyOnLabel === true; await this.promoteSharedMemoryToCanonical( contextGraphId, sharedMemoryQuads, msg.ual, msg.rootEntities, msg.publisherAddress, msg.txHash, blockNumber, startKAId, endKAId, protoToBigInt(msg.batchId), ctx, ctxGraphId, subGraphName, authorAddress, + keepRootCopyOnLabel, ); this.markProcessed(dedupeKey); this.log.info(ctx, `Finalization: promoted SWM snapshot to ${ctxGraphId ? `context graph ${ctxGraphId}` : 'canonical'} for ${msg.ual} (tx=${msg.txHash.slice(0, 10)}…)`); @@ -370,6 +434,16 @@ export class FinalizationHandler { * unattributed-publish path's no-author behaviour from RFC-001 §3.6). */ authorAddress?: string, + /** + * PR #779 same-graph signal: when `true` the publisher kept a root-graph + * copy of the canonical quads, so receivers mirror the dual-write so + * label-scoped queries resolve. When `false` (or omitted on older + * publishers) the publisher used the explicit-`subContextGraphId` / + * remap path and deleted its own root copy on purpose — receivers + * MUST NOT dual-write or they re-expose the KC under the source CG + * label and double-count it in unscoped queries. + */ + keepRootCopyOnLabel?: boolean, ): Promise { const graphManager = new GraphManager(this.store); await graphManager.ensureContextGraph(contextGraphId); @@ -399,6 +473,36 @@ export class FinalizationHandler { : ctxGraphId ? contextGraphDataUri(contextGraphId, ctxGraphId) : graphManager.dataGraphUri(contextGraphId); + // Devnet test #774-followup (v10-rc-validation §5 gossip replication): + // when `ctxGraphId` is set on a non-sub-graph publish, the canonical + // data lands in the per-on-chain-id partition + // `/context/` only. 
The publisher path + // (`dkg-publisher.ts` ~line 1382) intentionally ALSO writes the same + // quads to the root `` graph "so `agent.query(label)` (which + // resolves to `did:dkg:context-graph: