feat(embed): docs_embedded tracking, buffer reuse, retry hardening

Embedding pipeline improvements building on the concurrent batching foundation:

- Track docs_embedded vs chunks_embedded separately. A document counts as embedded only when ALL its chunks succeed, giving accurate progress reporting. The sync command reads docs_embedded for its document count.
- Reuse a single Vec<u8> buffer (embed_buf) across all store_embedding calls instead of allocating per chunk. Eliminates ~3KB allocation per 768-dim embedding.
- Detect and record errors when Ollama silently returns fewer embeddings than inputs (batch mismatch). Previously these dropped chunks were invisible.
- Improve retry error messages: distinguish "retry returned unexpected result" (wrong dims/count) from "retry request failed" (network error) instead of generic "chunk too large" message.
- Convert all hot-path SQL from conn.execute() to prepare_cached() for statement cache reuse (clear_document_embeddings, store_embedding, record_embedding_error).
- Record embedding_metadata errors for empty documents so they don't appear as perpetually pending on subsequent runs.
- Accept concurrency parameter (configurable via config.embedding.concurrency) instead of hardcoded EMBED_CONCURRENCY=2.
- Add schema version pre-flight check in embed command to fail fast with actionable error instead of cryptic SQL errors.
- Fix --retry-failed to use DELETE instead of UPDATE. UPDATE clears last_error but the row still matches config params in the LEFT JOIN, making the doc permanently invisible to find_pending_documents. DELETE removes the row entirely so the LEFT JOIN returns NULL. Regression test added (old_update_approach_leaves_doc_invisible).
- Add chunking forward-progress guard: after floor_char_boundary() rounds backward, ensure start advances by at least one full character to prevent infinite loops on multi-byte sequences (box-drawing chars, smart quotes). Test cases cover the exact patterns that caused production hangs on document 18526.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 22:42:08 -05:00
parent 39cb0cb087
commit c2036c64e9
5 changed files with 497 additions and 66 deletions
--- a/src/cli/commands/embed.rs
+++ b/src/cli/commands/embed.rs
@@ -2,16 +2,17 @@ use console::style;
 use serde::Serialize;

 use crate::Config;
-use crate::core::db::create_connection;
-use crate::core::error::Result;
+use crate::core::db::{LATEST_SCHEMA_VERSION, create_connection, get_schema_version};
+use crate::core::error::{LoreError, Result};
 use crate::core::paths::get_db_path;
 use crate::core::shutdown::ShutdownSignal;
 use crate::embedding::ollama::{OllamaClient, OllamaConfig};
-use crate::embedding::pipeline::embed_documents;
+use crate::embedding::pipeline::{DEFAULT_EMBED_CONCURRENCY, embed_documents};

 #[derive(Debug, Default, Serialize)]
 pub struct EmbedCommandResult {
-    pub embedded: usize,
+    pub docs_embedded: usize,
+    pub chunks_embedded: usize,
    pub failed: usize,
    pub skipped: usize,
 }
@@ -26,6 +27,18 @@ pub async fn run_embed(
    let db_path = get_db_path(config.storage.db_path.as_deref());
    let conn = create_connection(&db_path)?;

+    let schema_version = get_schema_version(&conn);
+    if schema_version < LATEST_SCHEMA_VERSION {
+        return Err(LoreError::MigrationFailed {
+            version: schema_version,
+            message: format!(
+                "Database is at schema version {schema_version} but {LATEST_SCHEMA_VERSION} is required. \
+                 Run 'lore migrate' first."
+            ),
+            source: None,
+        });
+    }
+
    let ollama_config = OllamaConfig {
        base_url: config.embedding.base_url.clone(),
        model: config.embedding.model.clone(),
@@ -43,18 +56,39 @@ pub async fn run_embed(
             COMMIT;",
        )?;
    } else if retry_failed {
-        conn.execute(
-            "UPDATE embedding_metadata SET last_error = NULL, attempt_count = 0
-             WHERE last_error IS NOT NULL",
-            [],
+        // DELETE (not UPDATE) so the LEFT JOIN in find_pending_documents returns NULL,
+        // making the doc appear pending again. UPDATE would leave a matching row that
+        // still satisfies the config-param check, making the doc permanently invisible.
+        conn.execute_batch(
+            "BEGIN;
+             DELETE FROM embeddings WHERE rowid / 1000 IN (
+               SELECT DISTINCT document_id FROM embedding_metadata
+               WHERE last_error IS NOT NULL
+             );
+             DELETE FROM embedding_metadata WHERE last_error IS NOT NULL;
+             COMMIT;",
        )?;
    }

    let model_name = &config.embedding.model;
-    let result = embed_documents(&conn, &client, model_name, progress_callback, signal).await?;
+    let concurrency = if config.embedding.concurrency > 0 {
+        config.embedding.concurrency as usize
+    } else {
+        DEFAULT_EMBED_CONCURRENCY
+    };
+    let result = embed_documents(
+        &conn,
+        &client,
+        model_name,
+        concurrency,
+        progress_callback,
+        signal,
+    )
+    .await?;

    Ok(EmbedCommandResult {
-        embedded: result.embedded,
+        docs_embedded: result.docs_embedded,
+        chunks_embedded: result.chunks_embedded,
        failed: result.failed,
        skipped: result.skipped,
    })
@@ -62,7 +96,10 @@ pub async fn run_embed(

 pub fn print_embed(result: &EmbedCommandResult) {
    println!("{} Embedding complete", style("done").green().bold(),);
-    println!("  Embedded: {}", result.embedded);
+    println!(
+        "  Embedded: {} documents ({} chunks)",
+        result.docs_embedded, result.chunks_embedded
+    );
    if result.failed > 0 {
        println!("  Failed:   {}", style(result.failed).red());
    }