feat(extraction): default to split extraction pipeline

This commit is contained in:
youzini
2026-06-09 04:54:00 +00:00
parent e4f263a9ce
commit 4bed1070ad
5 changed files with 233 additions and 28 deletions

View File

@@ -41,6 +41,7 @@ import {
buildTaskLlmPayload,
buildTaskPrompt,
} from "../prompting/prompt-builder.js";
import { isExtractProfileSplitSafe } from "../prompting/prompt-profiles.js";
import { RELATION_TYPES } from "../graph/schema.js";
import { applyTaskRegex } from "../prompting/task-regex.js";
import { getSTContextForPrompt, getSTContextSnapshot } from "../host/st-context.js";
@@ -1110,7 +1111,11 @@ async function applyExtractionPostCommit({
}
/**
 * Resolve which extraction pipeline should run for the given settings.
 *
 * The requested version is read from `settings.extractPipelineVersion`
 * (trimmed and lower-cased) and falls back to "split-v1" when that value is
 * missing or empty. A "split-v1" request is downgraded to "legacy-single"
 * whenever `isExtractProfileSplitSafe(settings)` returns false.
 *
 * @param {object} [settings] - Extension settings object.
 * @returns {string} The normalized pipeline version identifier.
 */
function resolveExtractPipelineVersion(settings = {}) {
  const requested = String(settings?.extractPipelineVersion || "split-v1").trim().toLowerCase();
  // Only the split pipeline needs the safety check; any other explicit
  // request (for example "legacy-single") is returned unchanged.
  if (requested === "split-v1" && !isExtractProfileSplitSafe(settings)) {
    return "legacy-single";
  }
  return requested;
}
function shouldUseSplitExtractionPipeline(settings = {}) {

View File

@@ -2035,6 +2035,48 @@ function shouldRefreshBuiltinDefaultProfile(taskType, profile = {}) {
return false;
}
/**
 * Report whether the extract task configuration is untouched enough to be
 * routed through the split extraction pipeline.
 *
 * Returns false when:
 *  - `settings.extractPrompt` is a non-blank string;
 *  - `settings.taskProfiles.extract` exists but its active profile cannot be
 *    found, is not the default profile id, is not flagged `builtin: true`, or
 *    carries `metadata.migratedFromLegacy === true`;
 *  - `shouldRefreshBuiltinDefaultProfile("extract", profile)` flags the profile;
 *  - the profile's blocks, generation, input, regex, promptMode or enabled
 *    state differ from `createDefaultTaskProfile("extract")`.
 *
 * Returns true when `settings.taskProfiles.extract` is absent or when every
 * check above passes.
 *
 * Object comparisons ignore property order, so a profile whose keys were
 * reordered (for example by a merge or a storage round trip) is not mistaken
 * for a customized one. Array order still matters.
 *
 * @param {object} [settings] - Extension settings object.
 * @returns {boolean} True when the split pipeline may be used.
 */
export function isExtractProfileSplitSafe(settings = {}) {
  // Rebuild a JSON-compatible value with object keys sorted recursively.
  const sortKeysDeep = (node) => {
    if (Array.isArray(node)) {
      return node.map(sortKeysDeep);
    }
    if (node && typeof node === "object") {
      return Object.fromEntries(
        Object.keys(node)
          .sort()
          .map((key) => [key, sortKeysDeep(node[key])]),
      );
    }
    return node;
  };
  // Serialize first so values JSON would drop or convert are handled exactly
  // as a plain JSON.stringify comparison would handle them, then normalize
  // key order.
  const canonicalJson = (value) => {
    const serialized = JSON.stringify(value);
    if (serialized === undefined) {
      return undefined;
    }
    return JSON.stringify(sortKeysDeep(JSON.parse(serialized)));
  };
  const sameJson = (left, right) => canonicalJson(left) === canonicalJson(right);

  if (String(settings?.extractPrompt || "").trim()) {
    return false;
  }
  const rawTaskProfiles = settings?.taskProfiles?.extract;
  if (!rawTaskProfiles) return true;

  const profiles = Array.isArray(rawTaskProfiles?.profiles) ? rawTaskProfiles.profiles : [];
  const activeProfileId = String(rawTaskProfiles?.activeProfileId || DEFAULT_PROFILE_ID);
  const rawActiveProfile = profiles.find((profile) => String(profile?.id || "") === activeProfileId);
  if (!rawActiveProfile) return false;
  if (String(rawActiveProfile?.id || "") !== DEFAULT_PROFILE_ID) return false;
  if (rawActiveProfile?.builtin !== true) return false;
  if (rawActiveProfile?.metadata?.migratedFromLegacy === true) return false;

  const canonicalDefault = createDefaultTaskProfile("extract");
  if (shouldRefreshBuiltinDefaultProfile("extract", rawActiveProfile)) return false;
  if (
    !sameJson(
      buildPromptBlockComparisonPayload(rawActiveProfile?.blocks || []),
      buildPromptBlockComparisonPayload(canonicalDefault.blocks || []),
    )
  ) {
    return false;
  }
  if (!sameJson(rawActiveProfile?.generation || {}, canonicalDefault.generation || {})) {
    return false;
  }
  if (!sameJson(rawActiveProfile?.input || {}, canonicalDefault.input || {})) {
    return false;
  }
  if (!sameJson(rawActiveProfile?.regex || {}, canonicalDefault.regex || {})) {
    return false;
  }
  if (String(rawActiveProfile?.promptMode || "") !== String(canonicalDefault.promptMode || "")) {
    return false;
  }
  // A missing `enabled` flag counts as enabled on both sides.
  if ((rawActiveProfile?.enabled !== false) !== (canonicalDefault.enabled !== false)) {
    return false;
  }
  return true;
}
function createFallbackDefaultTaskProfile(taskType) {
const legacyPromptField = LEGACY_PROMPT_FIELD_MAP[taskType];
const templateStamp = getDefaultTaskProfileTemplateStamp(taskType);

View File

@@ -37,7 +37,7 @@ export const defaultSettings = {
extractIncludeStoryTime: true,
extractIncludeSummaries: true,
extractActionMode: "pending",
extractPipelineVersion: "legacy-single",
extractPipelineVersion: "split-v1",
// 召回设置
recallEnabled: true,

View File

@@ -109,7 +109,7 @@ assert.equal(defaultSettings.loadNativeHydrateThresholdRecords, 30000);
assert.equal(defaultSettings.nativeRolloutVersion, 2);
assert.equal(defaultSettings.nativeEngineFailOpen, true);
assert.equal(defaultSettings.graphNativeForceDisable, false);
// The split pipeline is the shipped default; the earlier "legacy-single"
// expectation contradicted this assertion and could never pass alongside it.
assert.equal(defaultSettings.extractPipelineVersion, "split-v1");
assert.equal(defaultSettings.taskProfilesVersion, 3);
assert.equal(defaultSettings.extractObjectivePrompt, "");
assert.equal(defaultSettings.extractSubjectivePrompt, "");

View File

@@ -65,6 +65,7 @@ installResolveHooks([
const { createEmptyGraph, createNode, addNode } = await import("../graph/graph.js");
const { DEFAULT_NODE_SCHEMA } = await import("../graph/schema.js");
const { extractMemories } = await import("../maintenance/extractor.js");
const { defaultSettings } = await import("../runtime/settings-defaults.js");
function setTestOverrides(overrides = {}) {
globalThis.__stBmeTestOverrides = overrides;
@@ -190,6 +191,106 @@ function characterKnowledgeEntries(graph) {
);
}
/**
 * Run `extractMemories` on a fresh character graph while recording the
 * `taskType` of every `callLLMForJSON` request issued through the test
 * override.
 *
 * The stubbed LLM answers "extract_objective" with `objectivePayload()`,
 * "extract_subjective" with `subjectivePayload()`, and every other task type
 * with an empty extraction result.
 *
 * @param {object} [settings] - Settings forwarded to `extractMemories`.
 * @param {object} [options]
 * @param {boolean} [options.includeSettings] - Pass `false` to omit the
 *   `settings` key from the call entirely.
 * @returns {Promise<{graph: object, result: object, capturedTaskTypes: string[]}>}
 */
async function captureTaskTypesForExtract(settings, options = {}) {
  const emptyExtraction = () => ({ operations: [], cognitionUpdates: [], regionUpdates: {} });
  const graph = createGraphWithCharacter();
  const capturedTaskTypes = [];

  const restore = setTestOverrides({
    llm: {
      async callLLMForJSON(payload = {}) {
        const requestedType = payload.taskType;
        capturedTaskTypes.push(requestedType);
        switch (requestedType) {
          case "extract_objective":
            return objectivePayload();
          case "extract_subjective":
            return subjectivePayload();
          default:
            return emptyExtraction();
        }
      },
    },
  });

  try {
    const shouldPassSettings = options.includeSettings !== false;
    const params = shouldPassSettings
      ? { graph, ...baseExtractParams, settings }
      : { graph, ...baseExtractParams };
    const result = await extractMemories(params);
    return { graph, result, capturedTaskTypes };
  } finally {
    // Always undo the override, including when extraction throws.
    restore();
  }
}
/**
 * Deep-copy a value by serializing it to JSON and parsing it back.
 *
 * @param {*} value - Value to copy.
 * @returns {*} The parsed copy of the serialized value.
 */
function cloneJson(value) {
  const serialized = JSON.stringify(value);
  return JSON.parse(serialized);
}
/**
 * Build settings whose active extract profile is a non-builtin copy of the
 * first default extract profile, with a sentinel line appended to the content
 * of its first block.
 *
 * The returned object spreads `defaultSettings`, requests "split-v1", and
 * carries a cloned `taskProfiles` so the shared defaults are not mutated.
 *
 * @returns {object} Settings with a customized, active extract profile.
 */
function createCustomizedLegacyExtractProfileSettings() {
  const clonedTaskProfiles = cloneJson(defaultSettings.taskProfiles);
  const builtinProfile = clonedTaskProfiles.extract.profiles[0];

  const sourceBlocks = Array.isArray(builtinProfile.blocks) ? builtinProfile.blocks : [];
  const customizedBlocks = sourceBlocks.map((block, position) => {
    if (position !== 0) {
      return { ...block };
    }
    // Only the first block receives the sentinel that marks the profile as customized.
    return { ...block, content: `${String(block.content || "")}\nCUSTOM_LEGACY_EXTRACT_SENTINEL` };
  });

  const customizedProfile = {
    ...builtinProfile,
    id: "custom-legacy-extract-profile",
    name: "Custom legacy extract profile",
    builtin: false,
    blocks: customizedBlocks,
  };

  clonedTaskProfiles.extract = {
    activeProfileId: customizedProfile.id,
    profiles: [builtinProfile, customizedProfile],
  };

  return {
    ...defaultSettings,
    extractPipelineVersion: "split-v1",
    taskProfiles: clonedTaskProfiles,
  };
}
/**
 * Build split-v1 settings from a clone of the default task profiles, letting
 * the caller tweak the default extract profile in place.
 *
 * The profile handed to `mutator` is the extract profile whose id is
 * "default", or the first extract profile when no such id exists.
 *
 * @param {(profile: object, extractProfiles: object) => void} [mutator] -
 *   Optional callback receiving the chosen profile and the cloned
 *   `taskProfiles.extract` container.
 * @returns {object} Settings spreading `defaultSettings` with the cloned profiles.
 */
function createDefaultExtractProfileSettings(mutator) {
  const clonedTaskProfiles = cloneJson(defaultSettings.taskProfiles);
  const extractContainer = clonedTaskProfiles.extract;
  const candidates = extractContainer.profiles || [];
  const byDefaultId = candidates.find((candidate) => candidate.id === "default");
  const targetProfile = byDefaultId || candidates[0];

  if (mutator != null) {
    mutator(targetProfile, extractContainer);
  }

  return {
    ...defaultSettings,
    extractPipelineVersion: "split-v1",
    taskProfiles: clonedTaskProfiles,
  };
}
// Phase 4 default switch: when no settings key is passed at all, extraction
// must run the split objective + subjective stages.
{
  const outcome = await captureTaskTypesForExtract(undefined, { includeSettings: false });
  assert.equal(outcome.result.success, true);
  assert.deepEqual(
    outcome.capturedTaskTypes,
    ["extract_objective", "extract_subjective"],
    "extractMemories without explicit settings should default to split objective+subjective extraction",
  );
}
// Phase 4 default switch: a plain copy of defaultSettings requests split-v1
// and therefore drives both split stages.
{
  const settingsCopy = { ...defaultSettings };
  const outcome = await captureTaskTypesForExtract(settingsCopy);
  assert.equal(outcome.result.success, true);
  assert.equal(defaultSettings.extractPipelineVersion, "split-v1");
  assert.deepEqual(
    outcome.capturedTaskTypes,
    ["extract_objective", "extract_subjective"],
    "defaultSettings should call split objective+subjective extraction",
  );
}
// split-v1 calls objective then subjective, merges both stage outputs, and commits once.
{
const graph = createGraphWithCharacter();
@@ -282,35 +383,92 @@ function characterKnowledgeEntries(graph) {
}
}
// Legacy guard: a non-empty legacy extractPrompt should force the single extract taskType path.
{
  const { result, capturedTaskTypes } = await captureTaskTypesForExtract({
    ...defaultSettings,
    extractPipelineVersion: "split-v1",
    extractPrompt: "CUSTOM LEGACY EXTRACT PROMPT",
  });
  assert.equal(result.success, true);
  assert.deepEqual(
    capturedTaskTypes,
    ["extract"],
    "non-empty extractPrompt should guard back to legacy taskType extract",
  );
}

// Legacy guard: an active customized legacy extract task profile should force the single extract path.
{
  const { result, capturedTaskTypes } = await captureTaskTypesForExtract(
    createCustomizedLegacyExtractProfileSettings(),
  );
  assert.equal(result.success, true);
  assert.deepEqual(
    capturedTaskTypes,
    ["extract"],
    "customized active taskProfiles.extract profile should guard back to legacy taskType extract",
  );
}
// Legacy guard: explicitly requesting "legacy-single" always keeps the single extract path.
{
  const legacySettings = { ...defaultSettings, extractPipelineVersion: "legacy-single" };
  const outcome = await captureTaskTypesForExtract(legacySettings);
  assert.equal(outcome.result.success, true);
  assert.deepEqual(outcome.capturedTaskTypes, ["extract"]);
}
// Legacy guard: a default-looking profile flagged as migrated from legacy is
// treated conservatively and stays on the single extract path.
{
  const markMigrated = (profile) => {
    profile.metadata = { ...(profile.metadata || {}), migratedFromLegacy: true };
  };
  const migratedSettings = createDefaultExtractProfileSettings(markMigrated);
  const outcome = await captureTaskTypesForExtract(migratedSettings);
  assert.equal(outcome.result.success, true);
  assert.deepEqual(outcome.capturedTaskTypes, ["extract"]);
}
// Legacy guard: a default profile carrying a stale template fingerprint is
// treated conservatively and stays on the single extract path.
{
  const markStale = (profile) => {
    profile.metadata = { ...(profile.metadata || {}), defaultTemplateFingerprint: "stale-fingerprint" };
  };
  const staleSettings = createDefaultExtractProfileSettings(markStale);
  const outcome = await captureTaskTypesForExtract(staleSettings);
  assert.equal(outcome.result.success, true);
  assert.deepEqual(outcome.capturedTaskTypes, ["extract"]);
}
// Legacy guard: editing the default profile's block content falls back to the
// single extract path even though its id and builtin flag are unchanged.
{
  const appendSentinel = (block) => ({
    ...block,
    content: `${String(block.content || "")}
CUSTOM_DEFAULT_PROFILE_SENTINEL`,
  });
  const editFirstBlock = (profile) => {
    const existingBlocks = profile.blocks || [];
    profile.blocks = existingBlocks.map((block, position) =>
      position === 0 ? appendSentinel(block) : { ...block },
    );
  };
  const editedSettings = createDefaultExtractProfileSettings(editFirstBlock);
  const outcome = await captureTaskTypesForExtract(editedSettings);
  assert.equal(outcome.result.success, true);
  assert.deepEqual(outcome.capturedTaskTypes, ["extract"]);
}
console.log("extractor-split-pipeline tests passed");