feat: improve shujuku-compatible extraction and recall input

This commit is contained in:
Youzini-afk
2026-04-11 16:05:06 +08:00
parent 154e553589
commit 322752bb43
15 changed files with 1642 additions and 35 deletions

View File

@@ -283,6 +283,9 @@ export function buildExtractionMessages(chat, startIdx, endIdx, settings) {
seq: index,
role: msg.is_user ? "user" : "assistant",
content,
rawContent: String(msg?.mes ?? ""),
name: String(msg?.name ?? "").trim(),
speaker: String(msg?.name ?? "").trim(),
});
}

View File

@@ -0,0 +1,453 @@
/**
 * Break a free-form configuration string into its individual entries.
 *
 * Entries are separated by commas and/or line breaks; each entry is trimmed
 * and blank entries are discarded.
 *
 * @param {*} [value=""] - Raw config text; falsy values are treated as "".
 * @returns {string[]} Trimmed, non-empty entries in their original order.
 */
function splitConfigText(value = "") {
  const entries = [];
  for (const piece of String(value || "").split(/[\r\n,]+/)) {
    const cleaned = piece.trim();
    if (cleaned) {
      entries.push(cleaned);
    }
  }
  return entries;
}
/**
 * Escape every regular-expression metacharacter in a value so it can be
 * embedded in a RegExp source and matched literally.
 *
 * @param {*} [value=""] - Text to escape; falsy values are treated as "".
 * @returns {string} The text with each metacharacter prefixed by a backslash.
 */
function escapeRegex(value = "") {
  const literal = String(value || "");
  return literal.replace(/[.*+?^${}()|[\]\\]/g, (metaChar) => `\\${metaChar}`);
}
/**
 * Convert one raw rule definition into a normalized boundary rule.
 *
 * Accepted shapes:
 * - a string: treated as a tag name;
 * - an object with `tag` (or `name`): a tag rule, optionally with `label`;
 * - an object with `start`/`open`/`begin` and `end`/`close`/`finish`:
 *   a literal boundary rule, optionally with `label` and `caseSensitive`.
 *
 * @param {*} rawRule - Raw rule value from settings.
 * @param {string} [mode="exclude"] - Rule mode stored on the result and used in its id.
 * @param {number} [index=0] - Position of the rule, used in its id.
 * @returns {object|null} The normalized rule, or null when the input is not usable.
 */
function normalizeBoundaryRule(rawRule, mode = "exclude", index = 0) {
  const buildTagRule = (tagName, labelText) => ({
    id: `${mode}:tag:${index}:${tagName}`,
    mode,
    kind: "tag",
    label: labelText,
    tag: tagName,
  });

  if (typeof rawRule === "string") {
    const tagName = rawRule.trim();
    return tagName ? buildTagRule(tagName, tagName) : null;
  }

  const isPlainObject =
    Boolean(rawRule) && typeof rawRule === "object" && !Array.isArray(rawRule);
  if (!isPlainObject) {
    return null;
  }

  const tagName = String(rawRule.tag || rawRule.name || "").trim();
  if (tagName) {
    // A whitespace-only label falls back to the tag name itself.
    const labelText = String(rawRule.label || tagName).trim() || tagName;
    return buildTagRule(tagName, labelText);
  }

  const opener = String(rawRule.start ?? rawRule.open ?? rawRule.begin ?? "").trim();
  const closer = String(rawRule.end ?? rawRule.close ?? rawRule.finish ?? "").trim();
  if (opener === "" || closer === "") {
    return null;
  }

  const fallbackLabel = `${opener}${closer}`;
  return {
    id: `${mode}:boundary:${index}`,
    mode,
    kind: "boundary",
    label: String(rawRule.label || fallbackLabel).trim() || fallbackLabel,
    start: opener,
    end: closer,
    caseSensitive: rawRule.caseSensitive === true,
  };
}
/**
 * Build the normalized rule list for one mode from two settings sources:
 * a structured rules value and a comma/newline-separated tag string.
 *
 * Structured rules come first, followed by the tags parsed from `rawTags`.
 * Each candidate's position in that combined list is passed on as its index;
 * candidates that fail normalization are dropped.
 *
 * @param {*} [rawRules=null] - An array of raw rules, a single raw rule, or nothing.
 * @param {*} [rawTags=""] - Tag names separated by commas and/or line breaks.
 * @param {string} [mode="exclude"] - Mode assigned to every resulting rule.
 * @returns {object[]} Normalized rules in source order.
 */
function normalizeBoundaryRules(rawRules = null, rawTags = "", mode = "exclude") {
  let structuredCandidates;
  if (Array.isArray(rawRules)) {
    structuredCandidates = [...rawRules];
  } else if (rawRules === null || rawRules === undefined || rawRules === "") {
    structuredCandidates = [];
  } else {
    structuredCandidates = [rawRules];
  }

  const candidates = [...structuredCandidates, ...splitConfigText(rawTags)];
  const rules = [];
  candidates.forEach((candidate, position) => {
    const rule = normalizeBoundaryRule(candidate, mode, position);
    if (rule) {
      rules.push(rule);
    }
  });
  return rules;
}
/**
 * Apply a tag rule (`<tag ...>…</tag>`) to a piece of text.
 *
 * Tag matching is case-insensitive and, when the tag occurs several times,
 * only the LAST occurrence is acted upon. In "extract" mode the trimmed inner
 * content of that occurrence becomes the output; in any other mode that
 * occurrence (tags included) is cut out and the remainder is trimmed.
 *
 * @param {*} text - Source text; falsy values are treated as "".
 * @param {object} rule - Normalized rule; reads `tag`, `mode` and `label`.
 * @returns {{changed: boolean, output: string, ruleLabel: string, matchedText: string}}
 *   The outcome; `matchedText` is the full matched element or "" when nothing matched.
 */
function applyTagBoundaryRule(text, rule) {
  const source = String(text || "");
  const untouched = (ruleLabel) => ({
    changed: false,
    output: source,
    ruleLabel,
    matchedText: "",
  });

  const tagPattern = escapeRegex(rule?.tag || "");
  if (!tagPattern) {
    return untouched(String(rule?.label || ""));
  }

  const matcher = new RegExp(
    `<${tagPattern}\\b[^>]*>([\\s\\S]*?)<\\/${tagPattern}>`,
    "gi",
  );
  const hits = Array.from(source.matchAll(matcher));
  if (hits.length === 0) {
    return untouched(String(rule?.label || ""));
  }

  // Only the final occurrence in the text is considered.
  const lastHit = hits[hits.length - 1];
  const matchedText = String(lastHit[0] || "");
  const ruleLabel = String(rule?.label || rule?.tag || "");

  if (rule?.mode === "extract") {
    return {
      changed: true,
      output: String(lastHit[1] || "").trim(),
      ruleLabel,
      matchedText,
    };
  }

  const offset = Number(lastHit.index);
  const offsetIsValid = Number.isFinite(offset) && offset >= 0;
  if (!offsetIsValid) {
    return untouched(ruleLabel);
  }

  const before = source.slice(0, offset);
  const after = source.slice(offset + matchedText.length);
  return {
    changed: true,
    output: `${before}${after}`.trim(),
    ruleLabel,
    matchedText,
  };
}
/**
 * Apply a literal start/end boundary rule to a piece of text.
 *
 * The LAST occurrence of `rule.start` is located, then the first occurrence
 * of `rule.end` after it. In "extract" mode the trimmed text between the two
 * markers becomes the output; in any other mode the whole marked span
 * (markers included) is cut out and the remainder is trimmed.
 *
 * Matching is case-insensitive unless `rule.caseSensitive === true`.
 * Case folding is done per code point and only when it keeps the UTF-16
 * length unchanged: `String.prototype.toLowerCase()` on the whole string can
 * change its length (e.g. U+0130 lowercases to two code units), which would
 * make indices found in the folded text point at the wrong place in the
 * original text.
 *
 * @param {*} text - Source text; falsy values are treated as "".
 * @param {object} rule - Normalized rule; reads `start`, `end`, `mode`,
 *   `label` and `caseSensitive`.
 * @returns {{changed: boolean, output: string, ruleLabel: string, matchedText: string}}
 *   The outcome; `matchedText` is the removed/extracted span including its
 *   markers, or "" when nothing matched.
 */
function applyLiteralBoundaryRule(text, rule) {
  const input = String(text || "");
  const start = String(rule?.start || "");
  const end = String(rule?.end || "");
  const ruleLabel = String(rule?.label || "");
  const unchanged = {
    changed: false,
    output: input,
    ruleLabel,
    matchedText: "",
  };
  if (!start || !end) {
    return unchanged;
  }

  // Length-preserving fold: every index in the folded string is also a valid
  // index into the original string.
  const foldCase = (value) =>
    Array.from(value, (ch) => {
      const lowered = ch.toLowerCase();
      return lowered.length === ch.length ? lowered : ch;
    }).join("");
  const normalize = rule?.caseSensitive === true ? (value) => value : foldCase;

  const haystack = normalize(input);
  const startNeedle = normalize(start);
  const endNeedle = normalize(end);

  const startIndex = haystack.lastIndexOf(startNeedle);
  if (startIndex < 0) {
    return unchanged;
  }
  const endIndex = haystack.indexOf(endNeedle, startIndex + startNeedle.length);
  if (endIndex < 0) {
    return unchanged;
  }

  const spanEnd = endIndex + end.length;
  const matchedText = input.slice(startIndex, spanEnd);
  const output =
    rule?.mode === "extract"
      ? input.slice(startIndex + start.length, endIndex).trim()
      : `${input.slice(0, startIndex)}${input.slice(spanEnd)}`.trim();
  return {
    changed: true,
    output,
    ruleLabel,
    matchedText,
  };
}
/**
 * Dispatch a normalized rule to the handler matching its `kind`.
 *
 * @param {*} text - Source text.
 * @param {object} rule - Normalized rule; `kind` is "tag" or "boundary".
 * @returns {{changed: boolean, output: string, ruleLabel: string, matchedText: string}}
 *   The handler's result, or an unchanged result for an unknown kind.
 */
function applyBoundaryRule(text, rule) {
  switch (rule?.kind) {
    case "tag":
      return applyTagBoundaryRule(text, rule);
    case "boundary":
      return applyLiteralBoundaryRule(text, rule);
    default:
      // Unknown rule kinds are ignored rather than treated as errors.
      return {
        changed: false,
        output: String(text || ""),
        ruleLabel: String(rule?.label || ""),
        matchedText: "",
      };
  }
}
/**
 * Try each rule in order against the same original text and stop at the
 * first one that changes it.
 *
 * @param {*} text - Source text; falsy values are treated as "".
 * @param {object[]} [rules=[]] - Normalized rules; non-arrays are treated as empty.
 * @returns {{changed: boolean, output: string, operation: (object|null)}}
 *   The first successful rule's output plus a description of the operation,
 *   or the untouched text with `operation: null`.
 */
function applyFirstExtractRule(text, rules = []) {
  const source = String(text || "");
  const ruleList = Array.isArray(rules) ? rules : [];

  for (let position = 0; position < ruleList.length; position += 1) {
    const attempt = applyBoundaryRule(source, ruleList[position]);
    if (!attempt.changed) {
      continue;
    }
    const operation = {
      mode: "extract",
      rule: attempt.ruleLabel,
      matchedLength: String(attempt.matchedText || "").length,
    };
    return { changed: true, output: attempt.output, operation };
  }

  return { changed: false, output: source, operation: null };
}
/**
 * Apply every rule in sequence, each one operating on the output of the
 * previous one, and record which rules actually changed the text.
 *
 * @param {*} text - Source text; falsy values are treated as "".
 * @param {object[]} [rules=[]] - Normalized rules; non-arrays are treated as empty.
 * @returns {{changed: boolean, output: string, operations: object[]}}
 *   Final text, whether it differs from the input, and one operation record
 *   per rule that changed it.
 */
function applyExcludeRules(text, rules = []) {
  const original = String(text || "");
  const operations = [];

  const remaining = (Array.isArray(rules) ? rules : []).reduce((current, rule) => {
    const outcome = applyBoundaryRule(current, rule);
    if (!outcome.changed) {
      return current;
    }
    operations.push({
      mode: "exclude",
      rule: outcome.ruleLabel,
      matchedLength: String(outcome.matchedText || "").length,
    });
    return outcome.output;
  }, original);

  return {
    changed: remaining !== original,
    output: remaining,
    operations,
  };
}
/**
 * Normalize a role value to one of "user", "assistant" or "system".
 *
 * The value is stringified, trimmed and lower-cased. Anything that is not one
 * of the three known roles (including "ai", blank or missing values) maps to
 * "assistant".
 *
 * @param {*} [value=""] - Raw role value.
 * @returns {"user"|"assistant"|"system"} The normalized role.
 */
function normalizeRole(value = "") {
  const knownRoles = ["user", "assistant", "system"];
  const role = String(value || "assistant").trim().toLowerCase();
  // The previous `role === "ai" ? "assistant" : "assistant"` had identical
  // branches; every unrecognized role simply falls back to "assistant".
  return knownRoles.includes(role) ? role : "assistant";
}
/**
 * Pick the processed text of a message: `content` when it is a string,
 * otherwise `mes` when it is a string, otherwise "".
 *
 * @param {object} [message={}] - Message-like object.
 * @returns {string} The selected text (possibly empty).
 */
function resolveMessageContent(message = {}) {
  for (const field of ["content", "mes"]) {
    const candidate = message?.[field];
    if (typeof candidate === "string") {
      return candidate;
    }
  }
  return "";
}
/**
 * Pick the unprocessed text of a message, preferring `rawContent`, then
 * `mes`, then `content`; only string values are accepted.
 *
 * @param {object} [message={}] - Message-like object.
 * @returns {string} The first string-valued field, or "" when there is none.
 */
function resolveMessageRawContent(message = {}) {
  const firstString = [message?.rawContent, message?.mes, message?.content].find(
    (candidate) => typeof candidate === "string",
  );
  // `??` keeps a legitimately empty string instead of skipping it.
  return firstString ?? "";
}
/**
 * Determine the display name of a message's speaker.
 *
 * The first NON-BLANK value among `message.speaker`, `message.name` and
 * `message.displayName` wins. (A plain `??` chain would stop at an empty
 * `speaker` string and never look at `name`/`displayName`.) When none is set,
 * the name falls back by role: `names.userName` for "user" and
 * `names.charName` for "assistant", each with a built-in default; any other
 * role is returned as-is.
 *
 * @param {object} [message={}] - Message-like object.
 * @param {string} [role="assistant"] - Normalized role of the message.
 * @param {{userName?: string, charName?: string}} [names={}] - Fallback names.
 * @returns {string} A non-empty speaker name.
 */
function resolveSpeakerName(message = {}, role = "assistant", names = {}) {
  const explicitSpeaker = [message?.speaker, message?.name, message?.displayName]
    .map((candidate) => String(candidate ?? "").trim())
    .find(Boolean);
  if (explicitSpeaker) {
    return explicitSpeaker;
  }
  if (role === "user") {
    return String(names?.userName || "用户").trim() || "用户";
  }
  if (role === "assistant") {
    return String(names?.charName || "角色").trim() || "角色";
  }
  return role || "assistant";
}
/**
 * Convert a raw chat message into the uniform record used by the extraction
 * pipeline.
 *
 * @param {object} [message={}] - Raw message; reads `role`, `is_user`, `seq`
 *   and the text/speaker fields consumed by the resolver helpers.
 * @param {number} [index=0] - Position of the message in its source list.
 * @param {{userName?: string, charName?: string}} [names={}] - Fallback speaker names.
 * @returns {object} Record with `index`, `seq` (number or null), `role`,
 *   `speaker`, `name` (same as `speaker`), trimmed `content` and `rawContent`,
 *   and `sourceType` ("user_input" for user messages, otherwise "ai_output").
 */
function normalizeExtractionMessage(message = {}, index = 0, names = {}) {
  // An explicit role wins; otherwise the role is derived from the is_user flag.
  const roleFromFlag = message?.is_user === true ? "user" : "assistant";
  const role = normalizeRole(message?.role ?? roleFromFlag);

  const content = String(resolveMessageContent(message) || "").trim();
  // Raw text falls back to the processed content when no raw field is present.
  const rawContent = String(resolveMessageRawContent(message) || content).trim();
  const speaker = resolveSpeakerName(message, role, names);

  const numericSeq = Number(message?.seq);
  return {
    index,
    seq: Number.isFinite(numericSeq) ? numericSeq : null,
    role,
    speaker,
    name: speaker,
    content,
    rawContent,
    sourceType: role === "user" ? "user_input" : "ai_output",
  };
}
/**
 * Tally messages per normalized role.
 *
 * @param {object[]} [messages=[]] - Message records; non-arrays are treated as empty.
 * @returns {{user: number, assistant: number, system: number}} Count per role.
 */
function countRoles(messages = []) {
  const tally = { user: 0, assistant: 0, system: 0 };
  const list = Array.isArray(messages) ? messages : [];
  list.forEach((message) => {
    const role = normalizeRole(message?.role || "assistant");
    tally[role] = Number(tally[role] || 0) + 1;
  });
  return tally;
}
/**
 * Render messages as a plain-text transcript, one entry per message,
 * separated by blank lines:
 *
 *   `#<seq> [<role>|<speaker>]: <content>`
 *
 * `#<seq>` uses the message's numeric `seq` when available, otherwise its
 * 1-based position in the input list. The `|<speaker>` part is omitted when
 * no speaker/name is set.
 *
 * Messages whose content is blank are skipped. The previous implementation
 * filtered the already formatted lines, which always contain the
 * `#n [role]:` prefix, so that filter could never reject anything and blank
 * messages produced empty transcript entries.
 *
 * @param {object[]} [messages=[]] - Message records; non-arrays are treated as empty.
 * @returns {string} The transcript, or "" when there is nothing to render.
 */
export function formatExtractionTranscript(messages = []) {
  return (Array.isArray(messages) ? messages : [])
    .map((message, index) => {
      const content = String(message?.content || "");
      if (!content.trim()) {
        return "";
      }
      const seqLabel = Number.isFinite(Number(message?.seq))
        ? `#${Number(message.seq)}`
        : `#${index + 1}`;
      const role = normalizeRole(message?.role || "assistant");
      const speaker = String(message?.speaker || message?.name || "").trim();
      const speakerLabel = speaker ? `|${speaker}` : "";
      return `${seqLabel} [${role}${speakerLabel}]: ${content}`;
    })
    .filter(Boolean)
    .join("\n\n");
}
/**
 * Prepare chat messages for the memory-extraction prompt.
 *
 * Steps:
 * 1. Normalize every message and drop those with neither content nor raw content.
 * 2. Build "extract" and "exclude" boundary rules from
 *    `settings.extractAssistantExtractRules/Tags` and
 *    `settings.extractAssistantExcludeRules/Tags`.
 * 3. For assistant messages only: apply the first matching extract rule, then
 *    all exclude rules. Messages left empty afterwards are dropped.
 * 4. Render transcripts for both the raw and the filtered message lists.
 *
 * @param {object[]} [messages=[]] - Raw chat messages; non-arrays are treated as empty.
 * @param {object} [options]
 * @param {object} [options.settings={}] - Settings holding the boundary-rule configuration.
 * @param {string} [options.userName=""] - Fallback speaker name for user messages.
 * @param {string} [options.charName=""] - Fallback speaker name for assistant messages.
 * @returns {{rawMessages: object[], filteredMessages: object[], rawTranscript: string,
 *   filteredTranscript: string, debug: object}} Normalized and filtered
 *   messages, their transcripts, and a debug summary (counts, rule labels and
 *   per-message operations).
 */
export function buildExtractionInputContext(
  messages = [],
  { settings = {}, userName = "", charName = "" } = {},
) {
  const isNonBlank = (value) => String(value || "").trim().length > 0;

  const normalizedMessages = [];
  (Array.isArray(messages) ? messages : []).forEach((message, index) => {
    const normalized = normalizeExtractionMessage(message, index, {
      userName,
      charName,
    });
    if (isNonBlank(normalized?.content) || isNonBlank(normalized?.rawContent)) {
      normalizedMessages.push(normalized);
    }
  });

  const extractRules = normalizeBoundaryRules(
    settings?.extractAssistantExtractRules,
    settings?.extractAssistantExtractTags,
    "extract",
  );
  const excludeRules = normalizeBoundaryRules(
    settings?.extractAssistantExcludeRules,
    settings?.extractAssistantExcludeTags,
    "exclude",
  );

  const assistantCounts = { changed: 0, dropped: 0, extracted: 0, excluded: 0 };
  const filteredMessages = [];
  const messageOperations = [];

  normalizedMessages.forEach((message) => {
    const originalContent = String(message.content || "");
    const isAssistant = message.role === "assistant";
    const operations = [];
    let workingContent = originalContent;

    // Boundary rules only ever touch assistant output.
    if (isAssistant) {
      const extracted = applyFirstExtractRule(workingContent, extractRules);
      if (extracted.changed) {
        workingContent = extracted.output;
        assistantCounts.extracted += 1;
        operations.push(extracted.operation);
      }
      const excluded = applyExcludeRules(workingContent, excludeRules);
      if (excluded.changed) {
        workingContent = excluded.output;
        assistantCounts.excluded += 1;
        operations.push(...excluded.operations);
      }
    }

    const finalContent = String(workingContent || "").trim();
    const contentDiffers = finalContent !== originalContent.trim();
    if (operations.length > 0 || contentDiffers) {
      if (isAssistant) {
        assistantCounts.changed += 1;
      }
      messageOperations.push({
        seq: message.seq,
        role: message.role,
        speaker: message.speaker,
        beforeLength: originalContent.length,
        afterLength: finalContent.length,
        operations,
      });
    }

    if (finalContent) {
      filteredMessages.push({
        ...message,
        content: finalContent,
        extractionFilterOperations: operations,
      });
      return;
    }

    // Only count a drop when the assistant message had content to begin with.
    if (isAssistant && originalContent.trim()) {
      assistantCounts.dropped += 1;
    }
  });

  const messagesWithContent = normalizedMessages.filter((message) =>
    String(message.content || "").trim(),
  );
  const rawTranscript = formatExtractionTranscript(messagesWithContent);
  const filteredTranscript = formatExtractionTranscript(filteredMessages);

  const debug = {
    rawMessageCount: normalizedMessages.length,
    filteredMessageCount: filteredMessages.length,
    rawRoleCounts: countRoles(normalizedMessages),
    filteredRoleCounts: countRoles(filteredMessages),
    rawTranscriptLength: rawTranscript.length,
    filteredTranscriptLength: filteredTranscript.length,
    changedAssistantMessageCount: assistantCounts.changed,
    droppedAssistantMessageCount: assistantCounts.dropped,
    extractedAssistantMessageCount: assistantCounts.extracted,
    excludedAssistantMessageCount: assistantCounts.excluded,
    assistantBoundaryConfig: {
      extractRuleCount: extractRules.length,
      excludeRuleCount: excludeRules.length,
      extractRules: extractRules.map((rule) => rule.label),
      excludeRules: excludeRules.map((rule) => rule.label),
    },
    rawMessages: normalizedMessages,
    filteredMessages,
    messageOperations,
  };

  return {
    rawMessages: normalizedMessages,
    filteredMessages,
    rawTranscript,
    filteredTranscript,
    debug,
  };
}

View File

@@ -32,8 +32,10 @@ import {
deriveStoryTimeSpanFromNodes,
describeNodeStoryTime,
normalizeStoryTime,
resolveActiveStoryContext,
upsertTimelineSegment,
} from "../graph/story-timeline.js";
import { getActiveSummaryEntries } from "../graph/summary-state.js";
import {
buildTaskExecutionDebugContext,
buildTaskLlmPayload,
@@ -42,6 +44,7 @@ import {
import { RELATION_TYPES } from "../graph/schema.js";
import { applyTaskRegex } from "../prompting/task-regex.js";
import { getSTContextForPrompt, getSTContextSnapshot } from "../host/st-context.js";
import { buildExtractionInputContext } from "./extraction-context.js";
import {
aliasSetMatchesValue,
buildUserPovAliasNormalizedSet,
@@ -61,6 +64,17 @@ function createTaskLlmDebugContext(promptBuild, regexInput) {
: null;
}
/**
 * Build the LLM debug context for the extract task, optionally enriched with
 * the extraction input diagnostics.
 *
 * @param {*} promptBuild - Forwarded to `createTaskLlmDebugContext`.
 * @param {*} regexInput - Forwarded to `createTaskLlmDebugContext`.
 * @param {object|null} [inputContext=null] - Extra diagnostics to attach.
 * @returns {*} The base debug context unchanged when `inputContext` is not an
 *   object; otherwise a shallow copy of it with an `inputContext` property.
 */
function createExtractTaskLlmDebugContext(promptBuild, regexInput, inputContext = null) {
  const baseContext = createTaskLlmDebugContext(promptBuild, regexInput);
  const hasInputContext = Boolean(inputContext) && typeof inputContext === "object";
  return hasInputContext ? { ...baseContext, inputContext } : baseContext;
}
function resolveTaskPromptPayload(promptBuild, fallbackUserPrompt = "") {
if (typeof buildTaskLlmPayload === "function") {
return buildTaskLlmPayload(promptBuild, fallbackUserPrompt);
@@ -86,6 +100,54 @@ function resolveTaskLlmSystemPrompt(promptPayload, fallbackSystemPrompt = "") {
return String(promptPayload?.systemPrompt || fallbackSystemPrompt || "");
}
/**
 * Render the graph's active summary entries as a numbered, newline-separated
 * list for the extraction prompt:
 *
 *   `[n] (<from>~<to> L<level>) <text>`
 *
 * The parenthesized prefix appears only when a non-negative two-element
 * `messageRange` and/or a truthy `level` is present. The text comes from
 * `entry.text`, falling back to `entry.summary`.
 *
 * Entries without any text are skipped before numbering. The previous
 * implementation filtered the already formatted lines, which always start
 * with `[n]`, so that filter could never reject anything; a nullish entry
 * would also have thrown.
 *
 * @param {*} graph - Graph passed to `getActiveSummaryEntries`.
 * @returns {string} The rendered list, or "" when there are no usable entries.
 */
function buildActiveSummariesText(graph) {
  const entries = getActiveSummaryEntries(graph);
  if (!Array.isArray(entries) || entries.length === 0) return "";
  return entries
    .map((entry) => ({
      entry,
      text: String(entry?.text || entry?.summary || "").trim(),
    }))
    // A non-empty text also guarantees that `entry` itself is not nullish.
    .filter(({ text }) => text)
    .map(({ entry, text }, index) => {
      const range = entry.messageRange;
      const hasRange = Array.isArray(range) && range.length >= 2
        && range[0] >= 0 && range[1] >= 0;
      const rangeLabel = hasRange ? `${range[0]}~${range[1]}` : "";
      const levelLabel = entry.level ? `L${entry.level}` : "";
      const prefix = [rangeLabel, levelLabel].filter(Boolean).join(" ");
      return `[${index + 1}]${prefix ? ` (${prefix})` : ""} ${text}`;
    })
    .join("\n");
}
/**
 * Describe the graph's active story-time context as a single line for the
 * extraction prompt. Parts are joined with " | " and include, when available,
 * the active story-time label, its source, and the segment tense (unless it
 * is "unknown").
 *
 * @param {*} graph - Graph passed to `resolveActiveStoryContext`.
 * @returns {string} The description, or "" when no context is resolved.
 */
function buildStoryTimeContextText(graph) {
  const context = resolveActiveStoryContext(graph);
  if (!context?.resolved) {
    return "";
  }

  const tense = context.segment?.tense;
  const hasKnownTense = Boolean(tense) && tense !== "unknown";

  return [
    context.activeStoryTimeLabel ? `当前活跃剧情时间:${context.activeStoryTimeLabel}` : "",
    context.source ? `来源:${context.source}` : "",
    hasKnownTense ? `时态:${tense}` : "",
  ]
    .filter(Boolean)
    .join(" | ");
}
/**
 * Keep only the most recent `cap` messages.
 *
 * The input is returned as-is (same reference) when it is not a non-empty
 * array, when `cap` is not a positive finite number, or when the list already
 * fits within the cap.
 *
 * @param {*} messages - Message list, oldest first.
 * @param {*} [cap=0] - Maximum number of messages to keep; 0 disables the cap.
 * @returns {*} The original value, or a new array with the trailing `cap` items.
 */
function applyRecentMessageCap(messages, cap = 0) {
  const hasMessages = Array.isArray(messages) && messages.length > 0;
  if (!hasMessages) {
    return messages;
  }
  const limit = Number(cap);
  const capIsUsable = Number.isFinite(limit) && limit > 0;
  return capIsUsable && messages.length > limit ? messages.slice(-limit) : messages;
}
/**
 * Read `settings.extractPromptStructuredMode` and normalize it to one of
 * "transcript", "structured" or "both" (case-insensitive, trimmed).
 * Missing or unrecognized values resolve to "both".
 *
 * @param {object} settings - Settings object; may be nullish.
 * @returns {"transcript"|"structured"|"both"} The effective mode.
 */
function resolveExtractPromptStructuredMode(settings) {
  const requested = String(settings?.extractPromptStructuredMode || "both")
    .trim()
    .toLowerCase();
  switch (requested) {
    case "transcript":
    case "structured":
    case "both":
      return requested;
    default:
      return "both";
  }
}
/**
 * Tell whether a thrown value is an abort signal error, identified by its
 * `name` property being exactly "AbortError".
 *
 * @param {*} error - Any thrown value, possibly nullish.
 * @returns {boolean} True only when `error.name === "AbortError"`.
 */
function isAbortError(error) {
  if (error === null || error === undefined) {
    return false;
  }
  return error.name === "AbortError";
}
@@ -799,13 +861,42 @@ export async function extractMemories({
`[ST-BME] 提取开始: chat[${effectiveStartSeq}..${effectiveEndSeq}], ${messages.length} 条消息`,
);
// 构建对话文本
const dialogueText = messages
.map((m) => {
const seqLabel = Number.isFinite(m.seq) ? `#${m.seq}` : "#?";
return `${seqLabel} [${m.role}]: ${m.content}`;
})
.join("\n\n");
const extractionInput = buildExtractionInputContext(messages, {
settings,
userName: stContext?.prompt?.userName || "",
charName: stContext?.prompt?.charName || "",
});
const allStructuredMessages = Array.isArray(extractionInput?.filteredMessages)
? extractionInput.filteredMessages.map((message) => ({
seq: message?.seq,
role: message?.role,
content: message?.content,
speaker: message?.speaker,
name: message?.name,
}))
: [];
// Phase 3: apply recent message cap
const structuredMessages = applyRecentMessageCap(
allStructuredMessages,
settings?.extractRecentMessageCap,
);
const cappedMessageCount = allStructuredMessages.length - structuredMessages.length;
if (cappedMessageCount > 0) {
debugLog(
`[ST-BME][extract-p3] extractRecentMessageCap=${settings?.extractRecentMessageCap}, ` +
`capped ${cappedMessageCount} messages (${allStructuredMessages.length} -> ${structuredMessages.length})`,
);
}
// Phase 3: structured mode determines what goes into recentMessages/dialogueText
const structuredMode = resolveExtractPromptStructuredMode(settings);
const dialogueText = structuredMode === "structured"
? ""
: String(extractionInput?.filteredTranscript || "");
const promptRecentMessages = structuredMode === "transcript"
? dialogueText
: structuredMessages;
// 构建当前图概览(让 LLM 知道已有哪些节点,避免重复)
const graphOverview = buildGraphOverview(graph, schema);
@@ -817,16 +908,36 @@ export async function extractMemories({
? `${messages[0]?.seq ?? "?"} ~ ${messages[messages.length - 1]?.seq ?? "?"}`
: "";
// Phase 3: layered context — active summaries and story time
const activeSummaries = settings?.extractIncludeSummaries !== false
? buildActiveSummariesText(graph)
: "";
const storyTimeContext = settings?.extractIncludeStoryTime !== false
? buildStoryTimeContextText(graph)
: "";
debugLog(
`[ST-BME][extract-p3] structuredMode=${structuredMode}, ` +
`activeSummaries=${activeSummaries ? activeSummaries.split("\n").length + " entries" : "none"}, ` +
`storyTimeContext=${storyTimeContext ? "present" : "none"}, ` +
`worldbookMode=${String(settings?.extractWorldbookMode || "active")}`,
);
const extractWorldbookMode = String(settings?.extractWorldbookMode || "active").trim().toLowerCase();
const promptBuild = await buildTaskPrompt(settings, "extract", {
taskName: "extract",
schema: schemaDescription,
schemaDescription,
recentMessages: dialogueText,
chatMessages: messages,
recentMessages: promptRecentMessages,
chatMessages: structuredMessages,
dialogueText,
graphStats: graphOverview,
graphOverview,
currentRange,
activeSummaries,
storyTimeContext,
taskInputDebug: extractionInput?.debug || null,
__skipWorldInfo: extractWorldbookMode === "none",
...getSTContextForPrompt(),
});
@@ -843,19 +954,50 @@ export async function extractMemories({
"system",
);
// 用户提示词
const userPrompt = [
"## 当前对话内容(需提取记忆)",
dialogueText,
"",
// 用户提示词 — Phase 3 分层信息结构
const userPromptSections = [];
// Layer 1: 当前对话切片
if (dialogueText) {
userPromptSections.push("## 当前对话内容(需提取记忆)", dialogueText, "");
} else if (structuredMode === "structured" && structuredMessages.length > 0) {
userPromptSections.push(
"## 当前对话内容(结构化消息,需提取记忆)",
"(结构化消息已通过 profile blocks 注入,请参考上方 recentMessages 块。)",
"",
);
}
// Layer 2: 当前图谱状态
userPromptSections.push(
"## 当前图谱状态",
graphOverview || "(空图谱,尚无节点)",
"",
"## 节点类型定义",
schemaDescription,
"",
"请分析对话,按 JSON 格式输出操作列表。",
].join("\n");
);
// Layer 3: 已有总结快照(帮助避免重复提取)
if (activeSummaries) {
userPromptSections.push(
"## 近期局面总结(已有覆盖,避免重复)",
activeSummaries,
"",
);
}
// Layer 4: 故事时间线位置
if (storyTimeContext) {
userPromptSections.push(
"## 当前故事时间",
storyTimeContext,
"",
);
}
// Layer 5: 节点类型定义
userPromptSections.push("## 节点类型定义", schemaDescription, "");
userPromptSections.push("请分析对话,按 JSON 格式输出操作列表。");
const userPrompt = userPromptSections.join("\n");
const promptPayload = resolveTaskPromptPayload(promptBuild, userPrompt);
const extractionAugmentPrompt = buildCognitiveExtractAugmentPrompt();
const promptPayloadAdditionalMessages = Array.isArray(
@@ -904,6 +1046,16 @@ export async function extractMemories({
`[ST-BME][prompt-diag] NO user messages in promptMessages! Fallback userPrompt will be used.`,
);
}
if (extractionInput?.debug) {
debugLog(
`[ST-BME][extract-input] raw=${Number(extractionInput.debug.rawMessageCount || 0)}, ` +
`filtered=${Number(extractionInput.debug.filteredMessageCount || 0)}, ` +
`assistantChanged=${Number(extractionInput.debug.changedAssistantMessageCount || 0)}, ` +
`assistantDropped=${Number(extractionInput.debug.droppedAssistantMessageCount || 0)}, ` +
`extractRules=${Number(extractionInput.debug.assistantBoundaryConfig?.extractRuleCount || 0)}, ` +
`excludeRules=${Number(extractionInput.debug.assistantBoundaryConfig?.excludeRuleCount || 0)}`,
);
}
}
// 调用 LLM
@@ -913,7 +1065,11 @@ export async function extractMemories({
maxRetries: 2,
signal,
taskType: "extract",
debugContext: createTaskLlmDebugContext(promptBuild, extractRegexInput),
debugContext: createExtractTaskLlmDebugContext(
promptBuild,
extractRegexInput,
extractionInput?.debug || null,
),
promptMessages: promptPayload.promptMessages,
additionalMessages: promptPayloadAdditionalMessages,
onStreamProgress,