From 4bed1070ad780f223defb0227e676d2241ae38c2 Mon Sep 17 00:00:00 2001 From: youzini Date: Tue, 9 Jun 2026 04:54:00 +0000 Subject: [PATCH] feat(extraction): default to split extraction pipeline --- maintenance/extractor.js | 7 +- prompting/prompt-profiles.js | 42 ++++++ runtime/settings-defaults.js | 2 +- tests/default-settings.mjs | 2 +- tests/extractor-split-pipeline.mjs | 208 +++++++++++++++++++++++++---- 5 files changed, 233 insertions(+), 28 deletions(-) diff --git a/maintenance/extractor.js b/maintenance/extractor.js index 0d6d95b..6e70e9d 100644 --- a/maintenance/extractor.js +++ b/maintenance/extractor.js @@ -41,6 +41,7 @@ import { buildTaskLlmPayload, buildTaskPrompt, } from "../prompting/prompt-builder.js"; +import { isExtractProfileSplitSafe } from "../prompting/prompt-profiles.js"; import { RELATION_TYPES } from "../graph/schema.js"; import { applyTaskRegex } from "../prompting/task-regex.js"; import { getSTContextForPrompt, getSTContextSnapshot } from "../host/st-context.js"; @@ -1110,7 +1111,11 @@ async function applyExtractionPostCommit({ } function resolveExtractPipelineVersion(settings = {}) { - return String(settings?.extractPipelineVersion || "legacy-single").trim().toLowerCase(); + const requested = String(settings?.extractPipelineVersion || "split-v1").trim().toLowerCase(); + if (requested === "split-v1" && !isExtractProfileSplitSafe(settings)) { + return "legacy-single"; + } + return requested; } function shouldUseSplitExtractionPipeline(settings = {}) { diff --git a/prompting/prompt-profiles.js b/prompting/prompt-profiles.js index b17688d..00588cd 100644 --- a/prompting/prompt-profiles.js +++ b/prompting/prompt-profiles.js @@ -2035,6 +2035,48 @@ function shouldRefreshBuiltinDefaultProfile(taskType, profile = {}) { return false; } +export function isExtractProfileSplitSafe(settings = {}) { + if (String(settings?.extractPrompt || "").trim()) { + return false; + } + + const rawTaskProfiles = settings?.taskProfiles?.extract; + if (!rawTaskProfiles) return true; + + const profiles = Array.isArray(rawTaskProfiles?.profiles) ? rawTaskProfiles.profiles : []; + const activeProfileId = String(rawTaskProfiles?.activeProfileId || DEFAULT_PROFILE_ID); + const rawActiveProfile = profiles.find((profile) => String(profile?.id || "") === activeProfileId); + if (!rawActiveProfile) return false; + if (String(rawActiveProfile?.id || "") !== DEFAULT_PROFILE_ID) return false; + if (rawActiveProfile?.builtin !== true) return false; + if (rawActiveProfile?.metadata?.migratedFromLegacy === true) return false; + + const canonicalDefault = createDefaultTaskProfile("extract"); + if (shouldRefreshBuiltinDefaultProfile("extract", rawActiveProfile)) return false; + if ( + JSON.stringify(buildPromptBlockComparisonPayload(rawActiveProfile?.blocks || [])) !== + JSON.stringify(buildPromptBlockComparisonPayload(canonicalDefault.blocks || [])) + ) { + return false; + } + if (JSON.stringify(rawActiveProfile?.generation || {}) !== JSON.stringify(canonicalDefault.generation || {})) { + return false; + } + if (JSON.stringify(rawActiveProfile?.input || {}) !== JSON.stringify(canonicalDefault.input || {})) { + return false; + } + if (JSON.stringify(rawActiveProfile?.regex || {}) !== JSON.stringify(canonicalDefault.regex || {})) { + return false; + } + if (String(rawActiveProfile?.promptMode || "") !== String(canonicalDefault.promptMode || "")) { + return false; + } + if ((rawActiveProfile?.enabled !== false) !== (canonicalDefault.enabled !== false)) { + return false; + } + return true; +} + function createFallbackDefaultTaskProfile(taskType) { const legacyPromptField = LEGACY_PROMPT_FIELD_MAP[taskType]; const templateStamp = getDefaultTaskProfileTemplateStamp(taskType); diff --git a/runtime/settings-defaults.js b/runtime/settings-defaults.js index d7eb786..f2083a3 100644 --- a/runtime/settings-defaults.js +++ b/runtime/settings-defaults.js @@ -37,7 +37,7 @@ export const defaultSettings = { extractIncludeStoryTime: true, extractIncludeSummaries: true, extractActionMode: "pending", - extractPipelineVersion: "legacy-single", + extractPipelineVersion: "split-v1", // 召回设置 recallEnabled: true, diff --git a/tests/default-settings.mjs b/tests/default-settings.mjs index df514f2..bea2400 100644 --- a/tests/default-settings.mjs +++ b/tests/default-settings.mjs @@ -109,7 +109,7 @@ assert.equal(defaultSettings.loadNativeHydrateThresholdRecords, 30000); assert.equal(defaultSettings.nativeRolloutVersion, 2); assert.equal(defaultSettings.nativeEngineFailOpen, true); assert.equal(defaultSettings.graphNativeForceDisable, false); -assert.equal(defaultSettings.extractPipelineVersion, "legacy-single"); +assert.equal(defaultSettings.extractPipelineVersion, "split-v1"); assert.equal(defaultSettings.taskProfilesVersion, 3); assert.equal(defaultSettings.extractObjectivePrompt, ""); assert.equal(defaultSettings.extractSubjectivePrompt, ""); diff --git a/tests/extractor-split-pipeline.mjs b/tests/extractor-split-pipeline.mjs index c397e43..eef2990 100644 --- a/tests/extractor-split-pipeline.mjs +++ b/tests/extractor-split-pipeline.mjs @@ -65,6 +65,7 @@ installResolveHooks([ const { createEmptyGraph, createNode, addNode } = await import("../graph/graph.js"); const { DEFAULT_NODE_SCHEMA } = await import("../graph/schema.js"); const { extractMemories } = await import("../maintenance/extractor.js"); +const { defaultSettings } = await import("../runtime/settings-defaults.js"); function setTestOverrides(overrides = {}) { globalThis.__stBmeTestOverrides = overrides; @@ -190,6 +191,106 @@ function characterKnowledgeEntries(graph) { ); } +async function captureTaskTypesForExtract(settings, options = {}) { + const graph = createGraphWithCharacter(); + const capturedTaskTypes = []; + const restore = setTestOverrides({ + llm: { + async callLLMForJSON(payload = {}) { + capturedTaskTypes.push(payload.taskType); + if (payload.taskType === "extract_objective") return objectivePayload(); + if (payload.taskType === "extract_subjective") return subjectivePayload(); + if (payload.taskType === "extract") return { operations: [], cognitionUpdates: [], regionUpdates: {} }; + return { operations: [], cognitionUpdates: [], regionUpdates: {} }; + }, + }, + }); + + try { + const params = { + graph, + ...baseExtractParams, + }; + if (options.includeSettings !== false) { + params.settings = settings; + } + const result = await extractMemories(params); + return { graph, result, capturedTaskTypes }; + } finally { + restore(); + } +} + +function cloneJson(value) { + return JSON.parse(JSON.stringify(value)); +} + +function createCustomizedLegacyExtractProfileSettings() { + const taskProfiles = cloneJson(defaultSettings.taskProfiles); + const baseProfile = taskProfiles.extract.profiles[0]; + const customProfile = { + ...baseProfile, + id: "custom-legacy-extract-profile", + name: "Custom legacy extract profile", + builtin: false, + blocks: (Array.isArray(baseProfile.blocks) ? baseProfile.blocks : []).map((block, index) => + index === 0 + ? { ...block, content: `${String(block.content || "")}\nCUSTOM_LEGACY_EXTRACT_SENTINEL` } + : { ...block }, + ), + }; + taskProfiles.extract = { + activeProfileId: customProfile.id, + profiles: [baseProfile, customProfile], + }; + return { + ...defaultSettings, + extractPipelineVersion: "split-v1", + taskProfiles, + }; +} + +function createDefaultExtractProfileSettings(mutator) { + const taskProfiles = cloneJson(defaultSettings.taskProfiles); + const extractProfiles = taskProfiles.extract.profiles || []; + const defaultProfile = extractProfiles.find((profile) => profile.id === "default") || extractProfiles[0]; + mutator?.(defaultProfile, taskProfiles.extract); + return { + ...defaultSettings, + extractPipelineVersion: "split-v1", + taskProfiles, + }; +} + +// Phase 4 default switch: omitting settings should use the split pipeline by default. +{ + const { result, capturedTaskTypes } = await captureTaskTypesForExtract(undefined, { + includeSettings: false, + }); + + assert.equal(result.success, true); + assert.deepEqual( + capturedTaskTypes, + ["extract_objective", "extract_subjective"], + "extractMemories without explicit settings should default to split objective+subjective extraction", + ); +} + +// Phase 4 default switch: the default settings object should request split-v1. +{ + const { result, capturedTaskTypes } = await captureTaskTypesForExtract({ + ...defaultSettings, + }); + + assert.equal(result.success, true); + assert.equal(defaultSettings.extractPipelineVersion, "split-v1"); + assert.deepEqual( + capturedTaskTypes, + ["extract_objective", "extract_subjective"], + "defaultSettings should call split objective+subjective extraction", + ); +} + // split-v1 calls objective then subjective, merges both stage outputs, and commits once. { const graph = createGraphWithCharacter(); @@ -282,35 +383,92 @@ function characterKnowledgeEntries(graph) { } } -// Legacy/default extraction keeps the single extract taskType path. +// Legacy guard: a non-empty legacy extractPrompt should force the single extract taskType path. { - const graph = createGraphWithCharacter(); - const capturedTaskTypes = []; - const restore = setTestOverrides({ - llm: { - async callLLMForJSON(payload = {}) { - capturedTaskTypes.push(payload.taskType); - return { operations: [], cognitionUpdates: [], regionUpdates: {} }; - }, - }, + const { result, capturedTaskTypes } = await captureTaskTypesForExtract({ + ...defaultSettings, + extractPipelineVersion: "split-v1", + extractPrompt: "CUSTOM LEGACY EXTRACT PROMPT", }); - try { - const result = await extractMemories({ - graph, - ...baseExtractParams, - settings: {}, - }); + assert.equal(result.success, true); + assert.deepEqual( + capturedTaskTypes, + ["extract"], + "non-empty extractPrompt should guard back to legacy taskType extract", + ); +} - assert.equal(result.success, true); - assert.deepEqual( - capturedTaskTypes, - ["extract"], - "default extraction should keep calling only legacy taskType extract", - ); - } finally { - restore(); - } +// Legacy guard: an active customized legacy extract task profile should force the single extract path. +{ + const { result, capturedTaskTypes } = await captureTaskTypesForExtract( + createCustomizedLegacyExtractProfileSettings(), + ); + + assert.equal(result.success, true); + assert.deepEqual( + capturedTaskTypes, + ["extract"], + "customized active taskProfiles.extract profile should guard back to legacy taskType extract", + ); +} + +// Legacy guard: an explicit legacy override should always keep the single extract path. +{ + const { result, capturedTaskTypes } = await captureTaskTypesForExtract({ + ...defaultSettings, + extractPipelineVersion: "legacy-single", + }); + + assert.equal(result.success, true); + assert.deepEqual(capturedTaskTypes, ["extract"]); +} + +// Legacy guard: migrated legacy default-looking profiles are conservative legacy. +{ + const { result, capturedTaskTypes } = await captureTaskTypesForExtract( + createDefaultExtractProfileSettings((profile) => { + profile.metadata = { + ...(profile.metadata || {}), + migratedFromLegacy: true, + }; + }), + ); + + assert.equal(result.success, true); + assert.deepEqual(capturedTaskTypes, ["extract"]); +} + +// Legacy guard: stale default profile metadata is conservative legacy. +{ + const { result, capturedTaskTypes } = await captureTaskTypesForExtract( + createDefaultExtractProfileSettings((profile) => { + profile.metadata = { + ...(profile.metadata || {}), + defaultTemplateFingerprint: "stale-fingerprint", + }; + }), + ); + + assert.equal(result.success, true); + assert.deepEqual(capturedTaskTypes, ["extract"]); +} + +// Legacy guard: modified default profile content is conservative legacy even if id/builtin remain default. +{ + const { result, capturedTaskTypes } = await captureTaskTypesForExtract( + createDefaultExtractProfileSettings((profile) => { + profile.blocks = (profile.blocks || []).map((block, index) => + index === 0 + ? { ...block, content: `${String(block.content || "")} +CUSTOM_DEFAULT_PROFILE_SENTINEL` } + : { ...block }, + ); + }), + ); + + assert.equal(result.success, true); + assert.deepEqual(capturedTaskTypes, ["extract"]); } console.log("extractor-split-pipeline tests passed");