feat: improve shujuku-compatible extraction and recall input

This commit is contained in:
Youzini-afk
2026-04-11 16:05:06 +08:00
parent 154e553589
commit 322752bb43
15 changed files with 1642 additions and 35 deletions

View File

@@ -0,0 +1,453 @@
function splitConfigText(value = "") {
return String(value || "")
.split(/[\r\n,]+/)
.map((item) => String(item || "").trim())
.filter(Boolean);
}
function escapeRegex(value = "") {
return String(value || "").replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
function normalizeBoundaryRule(rawRule, mode = "exclude", index = 0) {
if (typeof rawRule === "string") {
const tag = String(rawRule || "").trim();
if (!tag) return null;
return {
id: `${mode}:tag:${index}:${tag}`,
mode,
kind: "tag",
label: tag,
tag,
};
}
if (!rawRule || typeof rawRule !== "object" || Array.isArray(rawRule)) {
return null;
}
const tag = String(rawRule.tag || rawRule.name || "").trim();
if (tag) {
return {
id: `${mode}:tag:${index}:${tag}`,
mode,
kind: "tag",
label: String(rawRule.label || tag).trim() || tag,
tag,
};
}
const start = String(rawRule.start ?? rawRule.open ?? rawRule.begin ?? "").trim();
const end = String(rawRule.end ?? rawRule.close ?? rawRule.finish ?? "").trim();
if (!start || !end) {
return null;
}
return {
id: `${mode}:boundary:${index}`,
mode,
kind: "boundary",
label: String(rawRule.label || `${start}${end}`).trim() || `${start}${end}`,
start,
end,
caseSensitive: rawRule.caseSensitive === true,
};
}
function normalizeBoundaryRules(rawRules = null, rawTags = "", mode = "exclude") {
const values = [];
if (Array.isArray(rawRules)) {
values.push(...rawRules);
} else if (rawRules !== null && rawRules !== undefined && rawRules !== "") {
values.push(rawRules);
}
values.push(...splitConfigText(rawTags));
return values
.map((item, index) => normalizeBoundaryRule(item, mode, index))
.filter(Boolean);
}
function applyTagBoundaryRule(text, rule) {
const input = String(text || "");
const escapedTag = escapeRegex(rule?.tag || "");
if (!escapedTag) {
return {
changed: false,
output: input,
ruleLabel: String(rule?.label || ""),
matchedText: "",
};
}
const regex = new RegExp(
`<${escapedTag}\\b[^>]*>([\\s\\S]*?)<\\/${escapedTag}>`,
"gi",
);
let match = null;
for (const candidate of input.matchAll(regex)) {
match = candidate;
}
if (!match) {
return {
changed: false,
output: input,
ruleLabel: String(rule?.label || ""),
matchedText: "",
};
}
const matchedText = String(match[0] || "");
if (rule?.mode === "extract") {
return {
changed: true,
output: String(match[1] || "").trim(),
ruleLabel: String(rule?.label || rule?.tag || ""),
matchedText,
};
}
const matchIndex = Number(match.index);
if (!Number.isFinite(matchIndex) || matchIndex < 0) {
return {
changed: false,
output: input,
ruleLabel: String(rule?.label || rule?.tag || ""),
matchedText: "",
};
}
return {
changed: true,
output: `${input.slice(0, matchIndex)}${input.slice(matchIndex + matchedText.length)}`.trim(),
ruleLabel: String(rule?.label || rule?.tag || ""),
matchedText,
};
}
function applyLiteralBoundaryRule(text, rule) {
const input = String(text || "");
const start = String(rule?.start || "");
const end = String(rule?.end || "");
if (!start || !end) {
return {
changed: false,
output: input,
ruleLabel: String(rule?.label || ""),
matchedText: "",
};
}
const sourceText = rule?.caseSensitive === true ? input : input.toLowerCase();
const startNeedle = rule?.caseSensitive === true ? start : start.toLowerCase();
const endNeedle = rule?.caseSensitive === true ? end : end.toLowerCase();
const startIndex = sourceText.lastIndexOf(startNeedle);
if (startIndex < 0) {
return {
changed: false,
output: input,
ruleLabel: String(rule?.label || ""),
matchedText: "",
};
}
const endIndex = sourceText.indexOf(endNeedle, startIndex + startNeedle.length);
if (endIndex < 0) {
return {
changed: false,
output: input,
ruleLabel: String(rule?.label || ""),
matchedText: "",
};
}
const matchedText = input.slice(startIndex, endIndex + end.length);
if (rule?.mode === "extract") {
return {
changed: true,
output: input.slice(startIndex + start.length, endIndex).trim(),
ruleLabel: String(rule?.label || ""),
matchedText,
};
}
return {
changed: true,
output: `${input.slice(0, startIndex)}${input.slice(endIndex + end.length)}`.trim(),
ruleLabel: String(rule?.label || ""),
matchedText,
};
}
function applyBoundaryRule(text, rule) {
if (rule?.kind === "tag") {
return applyTagBoundaryRule(text, rule);
}
if (rule?.kind === "boundary") {
return applyLiteralBoundaryRule(text, rule);
}
return {
changed: false,
output: String(text || ""),
ruleLabel: String(rule?.label || ""),
matchedText: "",
};
}
function applyFirstExtractRule(text, rules = []) {
const input = String(text || "");
for (const rule of Array.isArray(rules) ? rules : []) {
const result = applyBoundaryRule(input, rule);
if (result.changed) {
return {
changed: true,
output: result.output,
operation: {
mode: "extract",
rule: result.ruleLabel,
matchedLength: String(result.matchedText || "").length,
},
};
}
}
return {
changed: false,
output: input,
operation: null,
};
}
function applyExcludeRules(text, rules = []) {
const input = String(text || "");
let output = input;
const operations = [];
for (const rule of Array.isArray(rules) ? rules : []) {
const result = applyBoundaryRule(output, rule);
if (!result.changed) {
continue;
}
output = result.output;
operations.push({
mode: "exclude",
rule: result.ruleLabel,
matchedLength: String(result.matchedText || "").length,
});
}
return {
changed: output !== input,
output,
operations,
};
}
function normalizeRole(value = "") {
const role = String(value || "assistant").trim().toLowerCase();
if (["user", "assistant", "system"].includes(role)) {
return role;
}
return role === "ai" ? "assistant" : "assistant";
}
function resolveMessageContent(message = {}) {
if (typeof message?.content === "string") {
return message.content;
}
if (typeof message?.mes === "string") {
return message.mes;
}
return "";
}
function resolveMessageRawContent(message = {}) {
if (typeof message?.rawContent === "string") {
return message.rawContent;
}
if (typeof message?.mes === "string") {
return message.mes;
}
if (typeof message?.content === "string") {
return message.content;
}
return "";
}
function resolveSpeakerName(message = {}, role = "assistant", names = {}) {
const explicitSpeaker = String(
message?.speaker ?? message?.name ?? message?.displayName ?? "",
).trim();
if (explicitSpeaker) {
return explicitSpeaker;
}
if (role === "user") {
return String(names?.userName || "用户").trim() || "用户";
}
if (role === "assistant") {
return String(names?.charName || "角色").trim() || "角色";
}
return role || "assistant";
}
function normalizeExtractionMessage(message = {}, index = 0, names = {}) {
const role = normalizeRole(
message?.role ?? (message?.is_user === true ? "user" : "assistant"),
);
const content = String(resolveMessageContent(message) || "").trim();
const rawContent = String(resolveMessageRawContent(message) || content).trim();
const speaker = resolveSpeakerName(message, role, names);
const seq = Number.isFinite(Number(message?.seq)) ? Number(message.seq) : null;
return {
index,
seq,
role,
speaker,
name: speaker,
content,
rawContent,
sourceType: role === "user" ? "user_input" : "ai_output",
};
}
function countRoles(messages = []) {
return (Array.isArray(messages) ? messages : []).reduce(
(acc, message) => {
const role = normalizeRole(message?.role || "assistant");
acc[role] = Number(acc[role] || 0) + 1;
return acc;
},
{ user: 0, assistant: 0, system: 0 },
);
}
export function formatExtractionTranscript(messages = []) {
return (Array.isArray(messages) ? messages : [])
.map((message, index) => {
const seqLabel = Number.isFinite(Number(message?.seq))
? `#${Number(message.seq)}`
: `#${index + 1}`;
const role = normalizeRole(message?.role || "assistant");
const speaker = String(message?.speaker || message?.name || "").trim();
const speakerLabel = speaker ? `|${speaker}` : "";
return `${seqLabel} [${role}${speakerLabel}]: ${String(message?.content || "")}`;
})
.filter((item) => String(item || "").trim())
.join("\n\n");
}
export function buildExtractionInputContext(
messages = [],
{ settings = {}, userName = "", charName = "" } = {},
) {
const normalizedMessages = (Array.isArray(messages) ? messages : [])
.map((message, index) => normalizeExtractionMessage(message, index, {
userName,
charName,
}))
.filter(
(message) =>
String(message?.content || "").trim().length > 0 ||
String(message?.rawContent || "").trim().length > 0,
);
const extractRules = normalizeBoundaryRules(
settings?.extractAssistantExtractRules,
settings?.extractAssistantExtractTags,
"extract",
);
const excludeRules = normalizeBoundaryRules(
settings?.extractAssistantExcludeRules,
settings?.extractAssistantExcludeTags,
"exclude",
);
const filteredMessages = [];
const messageOperations = [];
let changedAssistantMessageCount = 0;
let droppedAssistantMessageCount = 0;
let extractedAssistantMessageCount = 0;
let excludedAssistantMessageCount = 0;
for (const message of normalizedMessages) {
const operations = [];
let nextContent = String(message.content || "");
if (message.role === "assistant") {
const extractResult = applyFirstExtractRule(nextContent, extractRules);
if (extractResult.changed) {
nextContent = extractResult.output;
extractedAssistantMessageCount += 1;
operations.push(extractResult.operation);
}
const excludeResult = applyExcludeRules(nextContent, excludeRules);
if (excludeResult.changed) {
nextContent = excludeResult.output;
excludedAssistantMessageCount += 1;
operations.push(...excludeResult.operations);
}
}
const normalizedContent = String(nextContent || "").trim();
if (operations.length > 0 || normalizedContent !== String(message.content || "").trim()) {
if (message.role === "assistant") {
changedAssistantMessageCount += 1;
}
messageOperations.push({
seq: message.seq,
role: message.role,
speaker: message.speaker,
beforeLength: String(message.content || "").length,
afterLength: normalizedContent.length,
operations,
});
}
if (!normalizedContent) {
if (message.role === "assistant" && String(message.content || "").trim()) {
droppedAssistantMessageCount += 1;
}
continue;
}
filteredMessages.push({
...message,
content: normalizedContent,
extractionFilterOperations: operations,
});
}
const rawTranscript = formatExtractionTranscript(
normalizedMessages.filter((message) => String(message.content || "").trim()),
);
const filteredTranscript = formatExtractionTranscript(filteredMessages);
return {
rawMessages: normalizedMessages,
filteredMessages,
rawTranscript,
filteredTranscript,
debug: {
rawMessageCount: normalizedMessages.length,
filteredMessageCount: filteredMessages.length,
rawRoleCounts: countRoles(normalizedMessages),
filteredRoleCounts: countRoles(filteredMessages),
rawTranscriptLength: rawTranscript.length,
filteredTranscriptLength: filteredTranscript.length,
changedAssistantMessageCount,
droppedAssistantMessageCount,
extractedAssistantMessageCount,
excludedAssistantMessageCount,
assistantBoundaryConfig: {
extractRuleCount: extractRules.length,
excludeRuleCount: excludeRules.length,
extractRules: extractRules.map((rule) => rule.label),
excludeRules: excludeRules.map((rule) => rule.label),
},
rawMessages: normalizedMessages,
filteredMessages,
messageOperations,
},
};
}