feat(extraction): default to split extraction pipeline

2026-06-13 18:31:16 +08:00 · 2026-06-09 04:54:00 +00:00
parent e4f263a9ce
commit 4bed1070ad
5 changed files with 233 additions and 28 deletions
--- a/maintenance/extractor.js
+++ b/maintenance/extractor.js
@@ -41,6 +41,7 @@ import {
  buildTaskLlmPayload,
  buildTaskPrompt,
 } from "../prompting/prompt-builder.js";
+import { isExtractProfileSplitSafe } from "../prompting/prompt-profiles.js";
 import { RELATION_TYPES } from "../graph/schema.js";
 import { applyTaskRegex } from "../prompting/task-regex.js";
 import { getSTContextForPrompt, getSTContextSnapshot } from "../host/st-context.js";
@@ -1110,7 +1111,11 @@ async function applyExtractionPostCommit({
 }

 function resolveExtractPipelineVersion(settings = {}) {
-  return String(settings?.extractPipelineVersion || "legacy-single").trim().toLowerCase();
+  const requested = String(settings?.extractPipelineVersion || "split-v1").trim().toLowerCase();
+  if (requested === "split-v1" && !isExtractProfileSplitSafe(settings)) {
+    return "legacy-single";
+  }
+  return requested;
 }

 function shouldUseSplitExtractionPipeline(settings = {}) {
--- a/prompting/prompt-profiles.js
+++ b/prompting/prompt-profiles.js
@@ -2035,6 +2035,48 @@ function shouldRefreshBuiltinDefaultProfile(taskType, profile = {}) {
  return false;
 }

+export function isExtractProfileSplitSafe(settings = {}) {
+  if (String(settings?.extractPrompt || "").trim()) {
+    return false;
+  }
+
+  const rawTaskProfiles = settings?.taskProfiles?.extract;
+  if (!rawTaskProfiles) return true;
+
+  const profiles = Array.isArray(rawTaskProfiles?.profiles) ? rawTaskProfiles.profiles : [];
+  const activeProfileId = String(rawTaskProfiles?.activeProfileId || DEFAULT_PROFILE_ID);
+  const rawActiveProfile = profiles.find((profile) => String(profile?.id || "") === activeProfileId);
+  if (!rawActiveProfile) return false;
+  if (String(rawActiveProfile?.id || "") !== DEFAULT_PROFILE_ID) return false;
+  if (rawActiveProfile?.builtin !== true) return false;
+  if (rawActiveProfile?.metadata?.migratedFromLegacy === true) return false;
+
+  const canonicalDefault = createDefaultTaskProfile("extract");
+  if (shouldRefreshBuiltinDefaultProfile("extract", rawActiveProfile)) return false;
+  if (
+    JSON.stringify(buildPromptBlockComparisonPayload(rawActiveProfile?.blocks || [])) !==
+    JSON.stringify(buildPromptBlockComparisonPayload(canonicalDefault.blocks || []))
+  ) {
+    return false;
+  }
+  if (JSON.stringify(rawActiveProfile?.generation || {}) !== JSON.stringify(canonicalDefault.generation || {})) {
+    return false;
+  }
+  if (JSON.stringify(rawActiveProfile?.input || {}) !== JSON.stringify(canonicalDefault.input || {})) {
+    return false;
+  }
+  if (JSON.stringify(rawActiveProfile?.regex || {}) !== JSON.stringify(canonicalDefault.regex || {})) {
+    return false;
+  }
+  if (String(rawActiveProfile?.promptMode || "") !== String(canonicalDefault.promptMode || "")) {
+    return false;
+  }
+  if ((rawActiveProfile?.enabled !== false) !== (canonicalDefault.enabled !== false)) {
+    return false;
+  }
+  return true;
+}
+
 function createFallbackDefaultTaskProfile(taskType) {
  const legacyPromptField = LEGACY_PROMPT_FIELD_MAP[taskType];
  const templateStamp = getDefaultTaskProfileTemplateStamp(taskType);
--- a/runtime/settings-defaults.js
+++ b/runtime/settings-defaults.js
@@ -37,7 +37,7 @@ export const defaultSettings = {
  extractIncludeStoryTime: true,
  extractIncludeSummaries: true,
  extractActionMode: "pending",
-  extractPipelineVersion: "legacy-single",
+  extractPipelineVersion: "split-v1",

  // 召回设置
  recallEnabled: true,
--- a/tests/default-settings.mjs
+++ b/tests/default-settings.mjs
@@ -109,7 +109,7 @@ assert.equal(defaultSettings.loadNativeHydrateThresholdRecords, 30000);
 assert.equal(defaultSettings.nativeRolloutVersion, 2);
 assert.equal(defaultSettings.nativeEngineFailOpen, true);
 assert.equal(defaultSettings.graphNativeForceDisable, false);
-assert.equal(defaultSettings.extractPipelineVersion, "legacy-single");
+assert.equal(defaultSettings.extractPipelineVersion, "split-v1");
 assert.equal(defaultSettings.taskProfilesVersion, 3);
 assert.equal(defaultSettings.extractObjectivePrompt, "");
 assert.equal(defaultSettings.extractSubjectivePrompt, "");
--- a/tests/extractor-split-pipeline.mjs
+++ b/tests/extractor-split-pipeline.mjs
@@ -65,6 +65,7 @@ installResolveHooks([
 const { createEmptyGraph, createNode, addNode } = await import("../graph/graph.js");
 const { DEFAULT_NODE_SCHEMA } = await import("../graph/schema.js");
 const { extractMemories } = await import("../maintenance/extractor.js");
+const { defaultSettings } = await import("../runtime/settings-defaults.js");

 function setTestOverrides(overrides = {}) {
  globalThis.__stBmeTestOverrides = overrides;
@@ -190,6 +191,106 @@ function characterKnowledgeEntries(graph) {
  );
 }

+async function captureTaskTypesForExtract(settings, options = {}) {
+  const graph = createGraphWithCharacter();
+  const capturedTaskTypes = [];
+  const restore = setTestOverrides({
+    llm: {
+      async callLLMForJSON(payload = {}) {
+        capturedTaskTypes.push(payload.taskType);
+        if (payload.taskType === "extract_objective") return objectivePayload();
+        if (payload.taskType === "extract_subjective") return subjectivePayload();
+        if (payload.taskType === "extract") return { operations: [], cognitionUpdates: [], regionUpdates: {} };
+        return { operations: [], cognitionUpdates: [], regionUpdates: {} };
+      },
+    },
+  });
+
+  try {
+    const params = {
+      graph,
+      ...baseExtractParams,
+    };
+    if (options.includeSettings !== false) {
+      params.settings = settings;
+    }
+    const result = await extractMemories(params);
+    return { graph, result, capturedTaskTypes };
+  } finally {
+    restore();
+  }
+}
+
+function cloneJson(value) {
+  return JSON.parse(JSON.stringify(value));
+}
+
+function createCustomizedLegacyExtractProfileSettings() {
+  const taskProfiles = cloneJson(defaultSettings.taskProfiles);
+  const baseProfile = taskProfiles.extract.profiles[0];
+  const customProfile = {
+    ...baseProfile,
+    id: "custom-legacy-extract-profile",
+    name: "Custom legacy extract profile",
+    builtin: false,
+    blocks: (Array.isArray(baseProfile.blocks) ? baseProfile.blocks : []).map((block, index) =>
+      index === 0
+        ? { ...block, content: `${String(block.content || "")}\nCUSTOM_LEGACY_EXTRACT_SENTINEL` }
+        : { ...block },
+    ),
+  };
+  taskProfiles.extract = {
+    activeProfileId: customProfile.id,
+    profiles: [baseProfile, customProfile],
+  };
+  return {
+    ...defaultSettings,
+    extractPipelineVersion: "split-v1",
+    taskProfiles,
+  };
+}
+
+function createDefaultExtractProfileSettings(mutator) {
+  const taskProfiles = cloneJson(defaultSettings.taskProfiles);
+  const extractProfiles = taskProfiles.extract.profiles || [];
+  const defaultProfile = extractProfiles.find((profile) => profile.id === "default") || extractProfiles[0];
+  mutator?.(defaultProfile, taskProfiles.extract);
+  return {
+    ...defaultSettings,
+    extractPipelineVersion: "split-v1",
+    taskProfiles,
+  };
+}
+
+// Phase 4 default switch: omitting settings should use the split pipeline by default.
+{
+  const { result, capturedTaskTypes } = await captureTaskTypesForExtract(undefined, {
+    includeSettings: false,
+  });
+
+  assert.equal(result.success, true);
+  assert.deepEqual(
+    capturedTaskTypes,
+    ["extract_objective", "extract_subjective"],
+    "extractMemories without explicit settings should default to split objective+subjective extraction",
+  );
+}
+
+// Phase 4 default switch: the default settings object should request split-v1.
+{
+  const { result, capturedTaskTypes } = await captureTaskTypesForExtract({
+    ...defaultSettings,
+  });
+
+  assert.equal(result.success, true);
+  assert.equal(defaultSettings.extractPipelineVersion, "split-v1");
+  assert.deepEqual(
+    capturedTaskTypes,
+    ["extract_objective", "extract_subjective"],
+    "defaultSettings should call split objective+subjective extraction",
+  );
+}
+
 // split-v1 calls objective then subjective, merges both stage outputs, and commits once.
 {
  const graph = createGraphWithCharacter();
@@ -282,35 +383,92 @@ function characterKnowledgeEntries(graph) {
  }
 }

-// Legacy/default extraction keeps the single extract taskType path.
+// Legacy guard: a non-empty legacy extractPrompt should force the single extract taskType path.
 {
-  const graph = createGraphWithCharacter();
-  const capturedTaskTypes = [];
-  const restore = setTestOverrides({
-    llm: {
-      async callLLMForJSON(payload = {}) {
-        capturedTaskTypes.push(payload.taskType);
-        return { operations: [], cognitionUpdates: [], regionUpdates: {} };
-      },
-    },
+  const { result, capturedTaskTypes } = await captureTaskTypesForExtract({
+    ...defaultSettings,
+    extractPipelineVersion: "split-v1",
+    extractPrompt: "CUSTOM LEGACY EXTRACT PROMPT",
  });

-  try {
-    const result = await extractMemories({
-      graph,
-      ...baseExtractParams,
-      settings: {},
-    });
+  assert.equal(result.success, true);
+  assert.deepEqual(
+    capturedTaskTypes,
+    ["extract"],
+    "non-empty extractPrompt should guard back to legacy taskType extract",
+  );
+}

-    assert.equal(result.success, true);
-    assert.deepEqual(
-      capturedTaskTypes,
-      ["extract"],
-      "default extraction should keep calling only legacy taskType extract",
-    );
-  } finally {
-    restore();
-  }
+// Legacy guard: an active customized legacy extract task profile should force the single extract path.
+{
+  const { result, capturedTaskTypes } = await captureTaskTypesForExtract(
+    createCustomizedLegacyExtractProfileSettings(),
+  );
+
+  assert.equal(result.success, true);
+  assert.deepEqual(
+    capturedTaskTypes,
+    ["extract"],
+    "customized active taskProfiles.extract profile should guard back to legacy taskType extract",
+  );
+}
+
+// Legacy guard: an explicit legacy override should always keep the single extract path.
+{
+  const { result, capturedTaskTypes } = await captureTaskTypesForExtract({
+    ...defaultSettings,
+    extractPipelineVersion: "legacy-single",
+  });
+
+  assert.equal(result.success, true);
+  assert.deepEqual(capturedTaskTypes, ["extract"]);
+}
+
+// Legacy guard: migrated legacy default-looking profiles are conservative legacy.
+{
+  const { result, capturedTaskTypes } = await captureTaskTypesForExtract(
+    createDefaultExtractProfileSettings((profile) => {
+      profile.metadata = {
+        ...(profile.metadata || {}),
+        migratedFromLegacy: true,
+      };
+    }),
+  );
+
+  assert.equal(result.success, true);
+  assert.deepEqual(capturedTaskTypes, ["extract"]);
+}
+
+// Legacy guard: stale default profile metadata is conservative legacy.
+{
+  const { result, capturedTaskTypes } = await captureTaskTypesForExtract(
+    createDefaultExtractProfileSettings((profile) => {
+      profile.metadata = {
+        ...(profile.metadata || {}),
+        defaultTemplateFingerprint: "stale-fingerprint",
+      };
+    }),
+  );
+
+  assert.equal(result.success, true);
+  assert.deepEqual(capturedTaskTypes, ["extract"]);
+}
+
+// Legacy guard: modified default profile content is conservative legacy even if id/builtin remain default.
+{
+  const { result, capturedTaskTypes } = await captureTaskTypesForExtract(
+    createDefaultExtractProfileSettings((profile) => {
+      profile.blocks = (profile.blocks || []).map((block, index) =>
+        index === 0
+          ? { ...block, content: `${String(block.content || "")}
+CUSTOM_DEFAULT_PROFILE_SENTINEL` }
+          : { ...block },
+      );
+    }),
+  );
+
+  assert.equal(result.success, true);
+  assert.deepEqual(capturedTaskTypes, ["extract"]);
 }

 console.log("extractor-split-pipeline tests passed");