feat(embed): docs_embedded tracking, buffer reuse, retry hardening

Embedding pipeline improvements building on the concurrent batching
foundation:

- Track docs_embedded vs chunks_embedded separately. A document counts
  as embedded only when ALL its chunks succeed, giving accurate
  progress reporting. The sync command reads docs_embedded for its
  document count.

- Reuse a single Vec<u8> buffer (embed_buf) across all store_embedding
  calls instead of allocating per chunk. Eliminates a ~3 KB heap
  allocation (768 dims x 4 bytes) per embedding stored.

- Detect and record errors when Ollama silently returns fewer
  embeddings than inputs (batch mismatch). Previously these dropped
  chunks were invisible.

- Improve retry error messages: distinguish "retry returned unexpected
  result" (wrong dims/count) from "retry request failed" (network
  error) instead of the generic "chunk too large" message.

- Convert all hot-path SQL from conn.execute() to prepare_cached() for
  statement cache reuse (clear_document_embeddings, store_embedding,
  record_embedding_error).

- Record embedding_metadata errors for empty documents so they don't
  appear as perpetually pending on subsequent runs.

- Accept concurrency parameter (configurable via config.embedding.concurrency)
  instead of hardcoded EMBED_CONCURRENCY=2.

- Add schema version pre-flight check in embed command to fail fast
  with an actionable error instead of cryptic SQL errors.

- Fix --retry-failed to use DELETE instead of UPDATE. UPDATE clears
  last_error but the row still matches config params in the LEFT JOIN,
  making the doc permanently invisible to find_pending_documents.
  DELETE removes the row entirely so the LEFT JOIN returns NULL.
  Regression test added (old_update_approach_leaves_doc_invisible).

- Add chunking forward-progress guard: after floor_char_boundary()
  rounds backward, ensure start advances by at least one full
  character to prevent infinite loops on multi-byte sequences
  (box-drawing chars, smart quotes). Test cases cover the exact
  patterns that caused production hangs on document 18526.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-06 22:42:08 -05:00
parent 39cb0cb087
commit c2036c64e9
5 changed files with 497 additions and 66 deletions

View File

@@ -2,16 +2,17 @@ use console::style;
use serde::Serialize;
use crate::Config;
use crate::core::db::create_connection;
use crate::core::error::Result;
use crate::core::db::{LATEST_SCHEMA_VERSION, create_connection, get_schema_version};
use crate::core::error::{LoreError, Result};
use crate::core::paths::get_db_path;
use crate::core::shutdown::ShutdownSignal;
use crate::embedding::ollama::{OllamaClient, OllamaConfig};
use crate::embedding::pipeline::embed_documents;
use crate::embedding::pipeline::{DEFAULT_EMBED_CONCURRENCY, embed_documents};
#[derive(Debug, Default, Serialize)]
pub struct EmbedCommandResult {
pub embedded: usize,
pub docs_embedded: usize,
pub chunks_embedded: usize,
pub failed: usize,
pub skipped: usize,
}
@@ -26,6 +27,18 @@ pub async fn run_embed(
let db_path = get_db_path(config.storage.db_path.as_deref());
let conn = create_connection(&db_path)?;
let schema_version = get_schema_version(&conn);
if schema_version < LATEST_SCHEMA_VERSION {
return Err(LoreError::MigrationFailed {
version: schema_version,
message: format!(
"Database is at schema version {schema_version} but {LATEST_SCHEMA_VERSION} is required. \
Run 'lore migrate' first."
),
source: None,
});
}
let ollama_config = OllamaConfig {
base_url: config.embedding.base_url.clone(),
model: config.embedding.model.clone(),
@@ -43,18 +56,39 @@ pub async fn run_embed(
COMMIT;",
)?;
} else if retry_failed {
conn.execute(
"UPDATE embedding_metadata SET last_error = NULL, attempt_count = 0
WHERE last_error IS NOT NULL",
[],
// DELETE (not UPDATE) so the LEFT JOIN in find_pending_documents returns NULL,
// making the doc appear pending again. UPDATE would leave a matching row that
// still satisfies the config-param check, making the doc permanently invisible.
conn.execute_batch(
"BEGIN;
DELETE FROM embeddings WHERE rowid / 1000 IN (
SELECT DISTINCT document_id FROM embedding_metadata
WHERE last_error IS NOT NULL
);
DELETE FROM embedding_metadata WHERE last_error IS NOT NULL;
COMMIT;",
)?;
}
let model_name = &config.embedding.model;
let result = embed_documents(&conn, &client, model_name, progress_callback, signal).await?;
let concurrency = if config.embedding.concurrency > 0 {
config.embedding.concurrency as usize
} else {
DEFAULT_EMBED_CONCURRENCY
};
let result = embed_documents(
&conn,
&client,
model_name,
concurrency,
progress_callback,
signal,
)
.await?;
Ok(EmbedCommandResult {
embedded: result.embedded,
docs_embedded: result.docs_embedded,
chunks_embedded: result.chunks_embedded,
failed: result.failed,
skipped: result.skipped,
})
@@ -62,7 +96,10 @@ pub async fn run_embed(
pub fn print_embed(result: &EmbedCommandResult) {
println!("{} Embedding complete", style("done").green().bold(),);
println!(" Embedded: {}", result.embedded);
println!(
" Embedded: {} documents ({} chunks)",
result.docs_embedded, result.chunks_embedded
);
if result.failed > 0 {
println!(" Failed: {}", style(result.failed).red());
}