ST-Bionic-Memory-Ecology/retrieval/retrieval-enhancer.js

import { embedText, searchSimilar } from "../vector/embedding.js";
import { getNode } from "../graph/graph.js";
import { isDirectVectorConfig } from "../vector/vector-index.js";

const COOCCURRENCE_EXCLUDED_TYPES = new Set([
  "event",
  "synopsis",
  "reflection",
]);

const cooccurrenceCache = new WeakMap();

export function splitIntentSegments(
  text,
  { maxSegments = 4, minLength = 3 } = {},
) {
  const raw = String(text || "").trim();
  if (!raw) return [];

  const segments = raw
    .split(/[，,。.；;！!？?\n]+|(?:顺便|另外|还有|对了|然后|而且|并且|同时)/)
    .map((item) => item.trim())
    .filter((item) => item.length >= minLength);

  return uniqueStrings(segments).slice(0, Math.max(1, maxSegments));
}

export function mergeVectorResults(resultGroups = [], limit = Infinity) {
  const merged = new Map();
  let rawHitCount = 0;

  for (const group of resultGroups) {
    for (const item of Array.isArray(group) ? group : []) {
      if (!item?.nodeId) continue;
      rawHitCount += 1;
      const score = Number(item.score) || 0;
      const existing = merged.get(item.nodeId);
      if (!existing || score > existing.score) {
        merged.set(item.nodeId, { ...item, score });
      }
    }
  }

  const results = [...merged.values()]
    .sort((a, b) => {
      if (b.score !== a.score) return b.score - a.score;
      return String(a.nodeId).localeCompare(String(b.nodeId));
    })
    .slice(0, Number.isFinite(limit) ? limit : merged.size);

  return {
    rawHitCount,
    results,
  };
}

export function isEligibleAnchorNode(node) {
  if (!node || node.archived) return false;
  if (COOCCURRENCE_EXCLUDED_TYPES.has(node.type)) return false;
  return getAnchorTerms(node).length > 0;
}

export function getAnchorTerms(node) {
  return [node?.fields?.name, node?.fields?.title]
    .filter((value) => typeof value === "string")
    .map((value) => value.trim())
    .filter((value) => value.length >= 2);
}

export function collectSupplementalAnchorNodeIds(
  graph,
  vectorResults = [],
  primaryAnchorIds = [],
  maxCount = 5,
) {
  const selected = [];
  const seen = new Set(primaryAnchorIds || []);

  for (const result of vectorResults) {
    if (selected.length >= maxCount) break;
    const node = getNode(graph, result?.nodeId);
    if (!isEligibleAnchorNode(node) || seen.has(node.id)) continue;
    seen.add(node.id);
    selected.push(node.id);
  }

  return selected;
}

export function createCooccurrenceIndex(
  graph,
  {
    maxAnchorsPerBatch = 10,
    eligibleNodes = null,
  } = {},
) {
  const nodes = Array.isArray(eligibleNodes)
    ? eligibleNodes.filter(isEligibleAnchorNode)
    : [];
  const eligibleNodeKey = nodes.map((node) => node.id).sort().join("|");
  const cacheKey = [
    graph?.batchJournal?.length || 0,
    graph?.nodes?.length || 0,
    graph?.historyState?.lastProcessedAssistantFloor ?? -1,
    maxAnchorsPerBatch,
    eligibleNodeKey,
  ].join(":");
  const cached = cooccurrenceCache.get(graph);
  if (cached?.key === cacheKey) {
    return cached.value;
  }

  const index = new Map();
  let pairCount = 0;
  let batchCount = 0;
  let source = "seqRange";

  if (nodes.length >= 2 && Array.isArray(graph?.batchJournal)) {
    for (const journal of graph.batchJournal) {
      const range = Array.isArray(journal?.processedRange)
        ? journal.processedRange
        : null;
      if (!range || !Number.isFinite(range[0]) || !Number.isFinite(range[1])) {
        continue;
      }

      const batchNodes = nodes
        .filter((node) => rangesOverlap(node.seqRange, range))
        .sort(compareBySeqDesc)
        .slice(0, Math.max(2, maxAnchorsPerBatch));
      if (batchNodes.length < 2) continue;

      batchCount += 1;
      pairCount += appendPairs(index, batchNodes, 1);
    }
  }

  if (batchCount === 0) {
    source = "seqRange";
    pairCount = 0;
    index.clear();

    for (let i = 0; i < nodes.length; i++) {
      for (let j = i + 1; j < nodes.length; j++) {
        const overlap = rangeOverlapSize(nodes[i].seqRange, nodes[j].seqRange);
        if (overlap <= 0) continue;
        addCooccurrence(index, nodes[i].id, nodes[j].id, overlap);
        addCooccurrence(index, nodes[j].id, nodes[i].id, overlap);
        pairCount += 1;
      }
    }
  } else {
    source = "batchJournal";
  }

  const result = {
    map: normalizeCooccurrenceMap(index),
    source,
    batchCount,
    pairCount,
  };
  cooccurrenceCache.set(graph, { key: cacheKey, value: result });
  return result;
}

export function applyCooccurrenceBoost(
  baseScores,
  anchorWeights,
  cooccurrenceIndex,
  { scale = 0.1, maxNeighbors = 10 } = {},
) {
  const nextScores = new Map(baseScores || []);
  const boostedNodes = [];
  const map = cooccurrenceIndex?.map instanceof Map
    ? cooccurrenceIndex.map
    : new Map();

  for (const [anchorId, anchorScore] of anchorWeights.entries()) {
    const neighbors = map.get(anchorId) || [];
    const capped = neighbors.slice(0, Math.max(1, maxNeighbors));

    for (const item of capped) {
      const bonus =
        Math.max(0, Number(anchorScore) || 0) *
        Math.log(1 + Math.max(0, Number(item.count) || 0)) *
        Math.max(0, Number(scale) || 0);
      if (!bonus) continue;

      nextScores.set(item.nodeId, (nextScores.get(item.nodeId) || 0) + bonus);
      boostedNodes.push({
        anchorId,
        nodeId: item.nodeId,
        count: item.count,
        bonus,
      });
    }
  }

  return {
    scores: nextScores,
    boostedNodes,
  };
}

export function dppGreedySelect(
  candidateVecs = [],
  candidateScores = [],
  k,
  qualityWeight = 1,
) {
  const total = Math.min(candidateVecs.length, candidateScores.length);
  const target = Math.max(0, Math.min(k, total));
  if (target >= total) {
    return Array.from({ length: total }, (_, index) => index);
  }

  const normalized = candidateVecs.map((vector) => normalizeVector(vector));
  const q = candidateScores.map((score) =>
    Math.pow(Math.max(Number(score) || 0, 1e-10), Math.max(0, qualityWeight)),
  );
  const diag = q.map((value) => value * value + 1e-8);
  const chol = Array.from({ length: target }, () =>
    Array(total).fill(0),
  );
  const selected = [];

  for (let j = 0; j < target; j++) {
    let bestIndex = -1;
    let bestValue = Number.NEGATIVE_INFINITY;

    for (let i = 0; i < total; i++) {
      if (selected.includes(i)) continue;
      if (diag[i] > bestValue) {
        bestValue = diag[i];
        bestIndex = i;
      }
    }

    if (bestIndex === -1) break;
    selected.push(bestIndex);

    if (j === target - 1 || diag[bestIndex] < 1e-10) {
      continue;
    }

    const row = normalized.map(
      (vector, index) => q[bestIndex] * dot(normalized[bestIndex], vector) * q[index],
    );
    const next = [...row];
    for (let i = 0; i < j; i++) {
      const pivot = chol[i][bestIndex];
      for (let index = 0; index < total; index++) {
        next[index] -= pivot * chol[i][index];
      }
    }

    const inv = 1 / Math.sqrt(diag[bestIndex]);
    for (let index = 0; index < total; index++) {
      chol[j][index] = next[index] * inv;
      diag[index] = Math.max(0, diag[index] - chol[j][index] ** 2);
    }
  }

  return selected;
}

export function applyDiversitySampling(
  candidates = [],
  { k, qualityWeight = 1 } = {},
) {
  const target = Math.max(1, Math.floor(Number(k) || 0));
  if (candidates.length <= target) {
    return {
      applied: false,
      reason: "candidate-pool-too-small",
      selected: candidates.slice(0, target),
      beforeCount: candidates.length,
      afterCount: Math.min(candidates.length, target),
    };
  }

  if (
    candidates.some(
      (item) =>
        !Array.isArray(item?.node?.embedding) || item.node.embedding.length === 0,
    )
  ) {
    return {
      applied: false,
      reason: "candidate-embeddings-missing",
      selected: candidates.slice(0, target),
      beforeCount: candidates.length,
      afterCount: Math.min(candidates.length, target),
    };
  }

  const indexes = dppGreedySelect(
    candidates.map((item) => item.node.embedding),
    candidates.map((item) => item.finalScore),
    target,
    qualityWeight,
  );

  const selected = indexes
    .map((index) => candidates[index])
    .filter(Boolean);

  if (selected.length !== target) {
    return {
      applied: false,
      reason: "dpp-selection-incomplete",
      selected: candidates.slice(0, target),
      beforeCount: candidates.length,
      afterCount: Math.min(candidates.length, target),
    };
  }

  return {
    applied: true,
    reason: "",
    selected,
    beforeCount: candidates.length,
    afterCount: selected.length,
  };
}

export function nmfQueryAnalysis(
  queryVec,
  entityVecs,
  { nTopics = 15, maxIter = 100, tolerance = 1e-4 } = {},
) {
  const vectors = normalizeMatrix(entityVecs);
  const query = vectorAbs(queryVec);
  if (vectors.length < 2 || query.length === 0) {
    return {
      semanticDepth: 0,
      topicCoverage: 0,
      novelty: 1,
      topTopics: [],
    };
  }

  const k = Math.min(Math.max(1, Math.floor(nTopics)), vectors.length);
  const matrix = vectors.map((vector) => vectorAbs(vector));
  const { h } = nmfMultiplicativeUpdate(matrix, k, maxIter, tolerance);
  const rawScores = h.map((topic) => dot(query, topic));
  const topics = softmax(rawScores);

  const entropy = -topics.reduce((sum, value) => {
    return value > 1e-10 ? sum + value * Math.log(value) : sum;
  }, 0);
  const maxEntropy = k > 1 ? Math.log(k) : 1;
  const semanticDepth = 1 - entropy / maxEntropy;
  const topicCoverage = topics.filter((value) => value > 0.5 / k).length;
  const reconstruction = Array(query.length).fill(0);

  for (let topicIndex = 0; topicIndex < topics.length; topicIndex++) {
    const weight = topics[topicIndex];
    for (let dim = 0; dim < reconstruction.length; dim++) {
      reconstruction[dim] += weight * h[topicIndex][dim];
    }
  }

  const novelty =
    l2Norm(subtractVectors(query, reconstruction)) / Math.max(l2Norm(query), 1e-10);

  return {
    semanticDepth,
    topicCoverage,
    novelty,
    topTopics: topics,
  };
}

export function sparseCodeResidual(
  queryVec,
  entityVecs,
  { lambda = 0.1, maxIter = 80 } = {},
) {
  const query = normalizeVector(queryVec, false);
  const entities = normalizeMatrix(entityVecs);
  const total = entities.length;
  if (total === 0 || query.length === 0) {
    return {
      alpha: [],
      residual: [...query],
      residualNorm: l2Norm(query),
    };
  }

  const gram = Array.from({ length: total }, () => Array(total).fill(0));
  const etq = Array(total).fill(0);

  for (let i = 0; i < total; i++) {
    etq[i] = dot(entities[i], query);
    for (let j = i; j < total; j++) {
      const value = dot(entities[i], entities[j]);
      gram[i][j] = value;
      gram[j][i] = value;
    }
  }

  let lipschitz = 0;
  for (let i = 0; i < total; i++) {
    const rowSum = gram[i].reduce((sum, value) => sum + Math.abs(value), 0);
    lipschitz = Math.max(lipschitz, rowSum);
  }
  if (lipschitz < 1e-10) {
    return {
      alpha: Array(total).fill(0),
      residual: [...query],
      residualNorm: l2Norm(query),
    };
  }

  const step = 1 / lipschitz;
  let alpha = Array(total).fill(0);
  let y = [...alpha];
  let t = 1;

  for (let iteration = 0; iteration < maxIter; iteration++) {
    const grad = matVecMul(gram, y).map((value, index) => value - etq[index]);
    const nextAlpha = softThreshold(
      y.map((value, index) => value - step * grad[index]),
      lambda * step,
    );
    const nextT = (1 + Math.sqrt(1 + 4 * t * t)) / 2;
    const momentum = (t - 1) / nextT;
    y = nextAlpha.map(
      (value, index) => value + momentum * (value - alpha[index]),
    );
    alpha = nextAlpha;
    t = nextT;
  }

  const reconstruction = Array(query.length).fill(0);
  for (let i = 0; i < total; i++) {
    if (Math.abs(alpha[i]) < 1e-10) continue;
    for (let dim = 0; dim < query.length; dim++) {
      reconstruction[dim] += alpha[i] * entities[i][dim];
    }
  }

  const residual = subtractVectors(query, reconstruction);
  return {
    alpha,
    residual,
    residualNorm: l2Norm(residual),
  };
}

export async function runResidualRecall({
  queryText,
  graph,
  embeddingConfig,
  basisNodes = [],
  candidateNodes = [],
  basisLimit = 24,
  nTopics = 15,
  noveltyThreshold = 0.4,
  residualThreshold = 0.3,
  residualTopK = 5,
  signal,
}) {
  if (!isDirectVectorConfig(embeddingConfig)) {
    return {
      triggered: false,
      hits: [],
      skipReason: "residual-direct-mode-required",
    };
  }

  const filteredBasis = basisNodes
    .filter(
      (node) =>
        Array.isArray(node?.embedding) && node.embedding.length > 0,
    )
    .slice(0, Math.max(2, basisLimit));
  if (filteredBasis.length < 2) {
    return {
      triggered: false,
      hits: [],
      skipReason: "residual-basis-insufficient",
    };
  }

  const queryVec = await embedText(queryText, embeddingConfig, { signal, isQuery: true });
  if (!queryVec || queryVec.length === 0) {
    return {
      triggered: false,
      hits: [],
      skipReason: "residual-query-embedding-missing",
    };
  }

  const nmfResult = nmfQueryAnalysis(queryVec, filteredBasis.map((node) => node.embedding), {
    nTopics,
  });
  if (!Number.isFinite(nmfResult.novelty) || nmfResult.novelty < noveltyThreshold) {
    return {
      triggered: false,
      hits: [],
      nmf: nmfResult,
      skipReason: "residual-novelty-below-threshold",
    };
  }

  const sparse = sparseCodeResidual(queryVec, filteredBasis.map((node) => node.embedding));
  if (!Number.isFinite(sparse.residualNorm) || sparse.residualNorm <= residualThreshold) {
    return {
      triggered: false,
      hits: [],
      nmf: nmfResult,
      sparse,
      skipReason: "residual-norm-below-threshold",
    };
  }

  const searchableCandidates = (candidateNodes || [])
    .filter(
      (node) =>
        Array.isArray(node?.embedding) &&
        node.embedding.length > 0 &&
        !filteredBasis.some((basisNode) => basisNode.id === node.id),
    )
    .map((node) => ({
      nodeId: node.id,
      embedding: node.embedding,
    }));

  if (searchableCandidates.length === 0) {
    return {
      triggered: true,
      hits: [],
      nmf: nmfResult,
      sparse,
      skipReason: "residual-search-space-empty",
    };
  }

  const hits = searchSimilar(sparse.residual, searchableCandidates, residualTopK)
    .map((item) => ({
      ...item,
      node: getNode(graph, item.nodeId),
    }))
    .filter((item) => item.node);

  return {
    triggered: true,
    hits,
    nmf: nmfResult,
    sparse,
    skipReason: hits.length > 0 ? "" : "residual-no-hit",
  };
}

function uniqueStrings(items = []) {
  return [...new Set(items.filter(Boolean))];
}

function normalizeCooccurrenceMap(index) {
  const normalized = new Map();
  for (const [nodeId, neighborMap] of index.entries()) {
    normalized.set(
      nodeId,
      [...neighborMap.entries()]
        .map(([neighborId, count]) => ({ nodeId: neighborId, count }))
        .sort((a, b) => {
          if (b.count !== a.count) return b.count - a.count;
          return String(a.nodeId).localeCompare(String(b.nodeId));
        }),
    );
  }
  return normalized;
}

function appendPairs(index, nodes, increment) {
  let count = 0;
  for (let i = 0; i < nodes.length; i++) {
    for (let j = i + 1; j < nodes.length; j++) {
      addCooccurrence(index, nodes[i].id, nodes[j].id, increment);
      addCooccurrence(index, nodes[j].id, nodes[i].id, increment);
      count += 1;
    }
  }
  return count;
}

function addCooccurrence(index, fromId, toId, increment) {
  if (!index.has(fromId)) {
    index.set(fromId, new Map());
  }
  const map = index.get(fromId);
  map.set(toId, (map.get(toId) || 0) + increment);
}

function rangesOverlap(a, b) {
  return rangeOverlapSize(a, b) > 0;
}

function rangeOverlapSize(a, b) {
  const rangeA = normalizeRange(a);
  const rangeB = normalizeRange(b);
  if (!rangeA || !rangeB) return 0;
  const start = Math.max(rangeA[0], rangeB[0]);
  const end = Math.min(rangeA[1], rangeB[1]);
  return end >= start ? end - start + 1 : 0;
}

function normalizeRange(range) {
  if (!Array.isArray(range) || range.length < 2) return null;
  const start = Number(range[0]);
  const end = Number(range[1]);
  if (!Number.isFinite(start) || !Number.isFinite(end)) return null;
  return [Math.min(start, end), Math.max(start, end)];
}

function compareBySeqDesc(a, b) {
  const seqA = a?.seqRange?.[1] ?? a?.seq ?? 0;
  const seqB = b?.seqRange?.[1] ?? b?.seq ?? 0;
  if (seqB !== seqA) return seqB - seqA;
  return (b.importance || 0) - (a.importance || 0);
}

function vectorAbs(vector = []) {
  return vector.map((value) => Math.abs(Number(value) || 0));
}

function normalizeVector(vector = [], useUnitNorm = true) {
  const normalized = vector.map((value) => Number(value) || 0);
  if (!useUnitNorm) return normalized;
  const norm = l2Norm(normalized);
  if (norm < 1e-10) return normalized.map(() => 0);
  return normalized.map((value) => value / norm);
}

function normalizeMatrix(vectors = []) {
  return vectors
    .filter((vector) => Array.isArray(vector) && vector.length > 0)
    .map((vector) => normalizeVector(vector));
}

function dot(a = [], b = []) {
  const length = Math.min(a.length, b.length);
  let sum = 0;
  for (let index = 0; index < length; index++) {
    sum += (Number(a[index]) || 0) * (Number(b[index]) || 0);
  }
  return sum;
}

function l2Norm(vector = []) {
  return Math.sqrt(vector.reduce((sum, value) => sum + value * value, 0));
}

function subtractVectors(a = [], b = []) {
  const length = Math.max(a.length, b.length);
  const result = Array(length).fill(0);
  for (let index = 0; index < length; index++) {
    result[index] = (Number(a[index]) || 0) - (Number(b[index]) || 0);
  }
  return result;
}

function matVecMul(matrix = [], vector = []) {
  return matrix.map((row) => dot(row, vector));
}

function softThreshold(vector = [], threshold = 0) {
  return vector.map((value) => {
    const absValue = Math.abs(value);
    if (absValue <= threshold) return 0;
    return Math.sign(value) * (absValue - threshold);
  });
}

function softmax(values = []) {
  if (values.length === 0) return [];
  const max = Math.max(...values);
  const exp = values.map((value) => Math.exp(value - max));
  const total = exp.reduce((sum, value) => sum + value, 0) || 1;
  return exp.map((value) => value / total);
}

function nmfMultiplicativeUpdate(matrix, k, maxIter, tolerance) {
  const m = matrix.length;
  const d = matrix[0]?.length || 0;
  const mean =
    matrix.reduce((sum, row) => sum + row.reduce((acc, value) => acc + value, 0), 0) /
      Math.max(1, m * d) || 0.01;
  const avg = Math.max(Math.sqrt(mean / Math.max(1, k)), 0.01);
  const rand = createDeterministicRandom(42);
  const w = Array.from({ length: m }, () =>
    Array.from({ length: k }, () => Math.abs(avg + avg * 0.5 * (rand() - 0.5)) + 1e-6),
  );
  const h = Array.from({ length: k }, () =>
    Array.from({ length: d }, () => Math.abs(avg + avg * 0.5 * (rand() - 0.5)) + 1e-6),
  );
  const eps = 1e-10;

  for (let iteration = 0; iteration < maxIter; iteration++) {
    const wtV = Array.from({ length: k }, () => Array(d).fill(0));
    const wtW = Array.from({ length: k }, () => Array(k).fill(0));

    for (let i = 0; i < k; i++) {
      for (let dim = 0; dim < d; dim++) {
        let sum = 0;
        for (let row = 0; row < m; row++) {
          sum += w[row][i] * matrix[row][dim];
        }
        wtV[i][dim] = sum;
      }
      for (let j = 0; j < k; j++) {
        let sum = 0;
        for (let row = 0; row < m; row++) {
          sum += w[row][i] * w[row][j];
        }
        wtW[i][j] = sum;
      }
    }

    for (let i = 0; i < k; i++) {
      for (let dim = 0; dim < d; dim++) {
        let denominator = 0;
        for (let topic = 0; topic < k; topic++) {
          denominator += wtW[i][topic] * h[topic][dim];
        }
        h[i][dim] *= wtV[i][dim] / (denominator + eps);
      }
    }

    const vHt = Array.from({ length: m }, () => Array(k).fill(0));
    const hHt = Array.from({ length: k }, () => Array(k).fill(0));

    for (let row = 0; row < m; row++) {
      for (let topic = 0; topic < k; topic++) {
        let sum = 0;
        for (let dim = 0; dim < d; dim++) {
          sum += matrix[row][dim] * h[topic][dim];
        }
        vHt[row][topic] = sum;
      }
    }

    for (let i = 0; i < k; i++) {
      for (let j = 0; j < k; j++) {
        let sum = 0;
        for (let dim = 0; dim < d; dim++) {
          sum += h[i][dim] * h[j][dim];
        }
        hHt[i][j] = sum;
      }
    }

    for (let row = 0; row < m; row++) {
      for (let topic = 0; topic < k; topic++) {
        let denominator = 0;
        for (let inner = 0; inner < k; inner++) {
          denominator += w[row][inner] * hHt[inner][topic];
        }
        w[row][topic] *= vHt[row][topic] / (denominator + eps);
      }
    }

    if (iteration % 10 === 9) {
      let residualSq = 0;
      let matrixSq = 0;
      for (let row = 0; row < m; row++) {
        for (let dim = 0; dim < d; dim++) {
          let reconstructed = 0;
          for (let topic = 0; topic < k; topic++) {
            reconstructed += w[row][topic] * h[topic][dim];
          }
          const diff = matrix[row][dim] - reconstructed;
          residualSq += diff * diff;
          matrixSq += matrix[row][dim] * matrix[row][dim];
        }
      }

      if (matrixSq > 0 && Math.sqrt(residualSq / matrixSq) < tolerance) {
        break;
      }
    }
  }

  return { w, h };
}

function createDeterministicRandom(seed) {
  let current = seed >>> 0;
  return () => {
    current = (1664525 * current + 1013904223) >>> 0;
    return current / 0xffffffff;
  };
}