[Spelunker] Run evaluations and show F1 score; scored a 2nd question (#…

…788)
microsoft · Mar 5, 2025 · 45d1f49 · 45d1f49
1 parent f249c38
commit 45d1f49
Show file tree

Hide file tree

Showing 17 changed files with 393 additions and 559 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,8 @@
 # --------------------
 # Rules SqlLite db
 *.db
+*.db-shm
+*.db-wal
 
 # --------------------
 # VS Code

diff --git a/ts/.prettierignore b/ts/.prettierignore
@@ -21,3 +21,6 @@ packages/defaultAgentProvider/data/**/*.json
 # Test result
 packages/shell/test-results/*
 packages/shell/playwright-report/*
+
+# Eval sources
+packages/agents/spelunker/evals/eval-*/source
diff --git a/ts/packages/agents/spelunker/evals/design.md b/ts/packages/agents/spelunker/evals/design.md
@@ -134,6 +134,37 @@ An eval run needs to do the following:
   - Print some JSON with the question, the F1 score, and the algorithm
     (and perhaps a timestamp).
 
+### Schema for evaluation scoring
+
+The database needs to store for each run, for each question,
+for each chunk, whether it was selected or not. Runs identify
+the algorithm and its variations. Run names must be unique.
+Since most algorithms have it available, we also store the score.
+
+```sql
+CREATE TABLE IF NOT EXISTS Runs (
+  runId TEXT PRIMARY KEY,
+  runName TEXT UNIQUE,
+  comments TEXT,
+  startTimestamp TEXT,
+  endTimestamp TEXT
+);
+CREATE TABLE IF NOT EXISTS RunScores (
+  runId TEXT REFERENCES Runs(runId),
+  questionId TEXT REFERENCES Questions(questionId),
+  chunkHash TEXT REFERENCES Hashes(chunkHash),
+  score INTEGER,  -- 0 or 1
+  relevance FLOAT,  -- Range [0.0 ... 1.0]
+  CONSTRAINT triple UNIQUE (runId, questionId, chunkHash)
+);
+```
+
+To compute precision and recall (all numbers in range [0.0 ... 1.0]):
+
+- p(recision) = fraction of selected chunks that have score==1 in Scores
+- r(ecall) = fraction of chunks with score==1 in Scores that were selected
+- `f1 = 2 * (p * r) / (p + r)`
+
 ## Tooling needed for automatic eval runs
 
 We need to write a new TypeScript program that reuses much of
@@ -149,9 +180,32 @@ APIs available.
 
 Do we need more versatility in the scoring tool? E.g.
 
-- Pass the question _ID_ on the command line instead of the text.
 - A way to set a fixed score for all chunks in a given file
   (or a file pattern).
 - A way to review scores (possibly by date range).
 - A way to set a fixed score for a list of chunk IDs
   (e.g. the References section of an actual answer).
+
+# Refactoring the selector
+
+Currently we have these steps:
+
+1. Using embeddings for fuzzy matching, select N nearest neighbors
+2. For those N chunks, ask an AI for a relevance score
+3. Pick the highest-scoring K chunks
+
+We can envision other steps:
+
+a. Ask an AI to construct a set of words or phrases to select nearest
+neighbors
+b. Ask an AI to select which files to pay attention to (bool or score?)
+
+In a sense we are collecting multiple types of scores, and we should
+combine them using
+[RRF ranking](https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking#how-rrf-ranking-works).
+
+For this to work, everything that returns a list of chunks/chunk IDs
+should return a list of _scored_ chunks/chunk IDs. In practice, this
+just means that we have to change (1) (embeddings) to return the score.
+Or perhaps we just sort the results from highest to lowest score,
+since the RRF algorithm just takes the rank order for each score type.
diff --git a/ts/packages/agents/spelunker/evals/eval-1/dbdump.sql b/ts/packages/agents/spelunker/evals/eval-1/dbdump.sql
diff --git a/ts/packages/agents/spelunker/evals/eval-2/dbdump.sql b/ts/packages/agents/spelunker/evals/eval-2/dbdump.sql
diff --git a/ts/packages/agents/spelunker/evals/src/dump.sh b/ts/packages/agents/spelunker/evals/src/dump.sh
@@ -9,7 +9,7 @@ case $1 in
         ;;
 esac
 
-TABLES="Questions Hashes Scores"
+TABLES="Questions Scores"
 
 sqlite3 $1/eval.db ".dump $TABLES" >$1/dbdump.sql  || exit 1
 echo "Dumped $TABLES $1/eval.db to $1/dbdump.sql"

diff --git a/ts/packages/agents/spelunker/evals/src/evalscore.py b/ts/packages/agents/spelunker/evals/src/evalscore.py
@@ -167,7 +167,7 @@ def get_chunk_text(cursor: sqlite3.Cursor, chunkid):
 
 
 def score_chunk(question, chunk_text, language):
-    separator = "-" * 50
+    separator = "-" * 79
     print(separator)
     pipe = os.popen(f"pygmentize -l {language} | less -FRX", "w")
     pipe.write(chunk_text)
@@ -177,7 +177,7 @@ def score_chunk(question, chunk_text, language):
         print(line)
     yes = no = False
     while not yes and not no:
-        score = input(question + "\nScore: ")
+        score = input(question + "\nInclude this chunk (y/n): ")
         yes = score.lower() in ("1", "y", "yes")
         no = score.lower() in ("0", "n", "no")
     assert yes != no

diff --git a/ts/packages/agents/spelunker/evals/src/evalsetup.py b/ts/packages/agents/spelunker/evals/src/evalsetup.py
@@ -93,6 +93,8 @@ def main():
     copy_table(src_cur, dst_cur, "Files", filename_prefix)
     copy_table(src_cur, dst_cur, "Chunks")
     copy_table(src_cur, dst_cur, "Blobs")
+    copy_table(src_cur, dst_cur, "Summaries")
+    copy_table(src_cur, dst_cur, "ChunkEmbeddings")
     src_conn.close()
 
     add_new_tables(dst_cur)
@@ -115,7 +117,10 @@ def copy_table(src_cur, dst_cur, table_name, prefix=None):
 
     # Read rows
     rows = src_cur.execute(f"SELECT * FROM {table_name}").fetchall()
-    if prefix and  table_name.lower() == "files":
+    if not rows:
+        print(f"Table {table_name} is empty")
+        return
+    if prefix and table_name.lower() == "files":
         # Check the filenames start with the prefix
         for row in rows:
             filename = row[0]
@@ -143,6 +148,21 @@ def copy_table(src_cur, dst_cur, table_name, prefix=None):
     score INTEGER,  -- 0 or 1
     timestamp TEXT
 );
+CREATE TABLE IF NOT EXISTS Runs (
+  runId INTEGER PRIMARY KEY,
+  runName TEXT UNIQUE,
+  questionId INTEGER REFERENCES Questions(questionId),
+  comments TEXT,
+  startTimestamp TEXT,
+  endTimestamp TEXT
+);
+CREATE TABLE IF NOT EXISTS RunScores (
+  runId INTEGER REFERENCES Runs(runId),
+  chunkHash TEXT REFERENCES Hashes(chunkHash),
+  score INTEGER,  -- 0 or 1
+  relevance FLOAT,  -- Range [0.0 ... 1.0]
+  CONSTRAINT unique_columns UNIQUE (runId, chunkHash)
+);
 """
 # TODO: Table to record eval runs (the eval tool can create-or-insert that)
 
@@ -155,12 +175,11 @@ def add_new_tables(dst_cur):
         table_name = sql.split()[5]
         print(f"Creating table {table_name}")
         dst_cur.execute(sql)
-        if table_name == "Hashes":
-            print(f"Clearing contents of table {table_name}")
-            dst_cur.execute(f"DELETE FROM {table_name}")
 
 
 def fill_in_hashes(dst_cur, prefix):
+    print(f"Clearing Hashes")
+    dst_cur.execute(f"DELETE FROM Hashes")
     count = 0
     # Fetch all chunks
     selection = dst_cur.execute(

diff --git a/ts/packages/agents/spelunker/package.json b/ts/packages/agents/spelunker/package.json
@@ -29,6 +29,7 @@
     "better-sqlite3": "11.8.1",
     "code-processor": "workspace:*",
     "common-utils": "workspace:*",
+    "dotenv": "^16.3.1",
     "typeagent": "workspace:*",
     "typechat": "^0.1.1",
     "typescript": "^5.4.2"

diff --git a/ts/packages/agents/spelunker/src/databaseUtils.ts b/ts/packages/agents/spelunker/src/databaseUtils.ts
@@ -22,7 +22,7 @@ CREATE TABLE IF NOT EXISTS Chunks (
     treeName TEXT NOT NULL,
     codeName TEXT NOT NULL,
     parentId TEXT KEY REFERENCES Chunks(chunkId), -- May be null
-    fileName TEXT KEY REFERENCES files(fileName) NOT NULL,
+    fileName TEXT KEY REFERENCES Files(fileName) NOT NULL,
     lineNo INTEGER NOT NULL -- 1-based
 );
 CREATE TABLE IF NOT EXISTS Blobs (

diff --git a/ts/packages/agents/spelunker/src/embeddings.ts b/ts/packages/agents/spelunker/src/embeddings.ts
@@ -126,6 +126,18 @@ export async function preSelectChunks(
     input: string,
     maxChunks = 1000,
 ): Promise<ChunkId[]> {
+    const tb0 = new Date().getTime();
+    const queryEmbedding = await getEmbedding(context, input);
+    const tb1 = new Date().getTime();
+    const tail = !queryEmbedding ? " (failure)" : "";
+    console_log(
+        `  [Embedding input of ${input.length} characters took ${((tb1 - tb0) / 1000).toFixed(3)} seconds${tail}]`,
+    );
+    if (!queryEmbedding) {
+        // Fail fast if we can't get an embedding.
+        return [];
+    }
+
     const ta0 = new Date().getTime();
     const db = context.queryContext!.database!;
     const prepAllEmbeddings = db.prepare(
@@ -144,17 +156,6 @@ export async function preSelectChunks(
         return allEmbeddingRows.map((row) => row.chunkId);
     }
 
-    const tb0 = new Date().getTime();
-    const queryEmbedding = await getEmbedding(context, input);
-    const tb1 = new Date().getTime();
-    const tail = !queryEmbedding ? " (failure)" : "";
-    console_log(
-        `  [Embedding input of ${input.length} characters took ${((tb1 - tb0) / 1000).toFixed(3)} seconds${tail}]`,
-    );
-    if (!queryEmbedding) {
-        return [];
-    }
-
     const embeddings = allEmbeddingRows.map(
         (row) => new Float32Array(Buffer.from(row.embedding)),
     );

diff --git a/ts/packages/agents/spelunker/src/eval.ts b/ts/packages/agents/spelunker/src/eval.ts
@@ -0,0 +1,151 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+import path from "path";
+
+import dotenv from "dotenv";
+
+import { createDatabase } from "./databaseUtils.js";
+import { console_log, getDirName, resetEpoch } from "./logging.js";
+import { createQueryContext } from "./queryContext.js";
+import {
+    loadDatabase,
+    readAllChunksFromDatabase,
+    selectChunks,
+} from "./searchCode.js";
+import {
+    initializeSpelunkerContext,
+    SpelunkerContext,
+} from "./spelunkerActionHandler.js";
+import { ChunkId } from "./chunkSchema.js";
+
+const __dirname = getDirName(); // .../ts/packages/agents/spelunker/dist
+dotenv.config({ path: path.join(__dirname, "../../../../.env") }); // .../ts/.env
+
+// Loosely typed bag of eval settings, keyed by setting name.
+type ConfigRecord = Record<string, any>;
+
+// TODO: Read this from a file that can be edited before each run,
+// or alternatively, read from command line args.
+const CONFIG: ConfigRecord = {
+    // Folder joined onto the parent of __dirname by fillSpelunkerContext.
+    evalFolder: "evals/eval-2",
+    // Value matched against Questions.questionId and Scores.questionId.
+    questionId: 2,
+};
+
+/**
+ * Entry point of the eval script.
+ *
+ * Resets the logging epoch, logs the configuration, initializes and fills in
+ * a Spelunker context, reads the configured question, loads the database,
+ * and finally conducts the eval for that question.
+ */
+async function main() {
+    resetEpoch();
+    console_log("Starting eval script.");
+    const configDump = JSON.stringify(CONFIG, undefined, 4);
+    console_log(`CONFIG = ${configDump}`);
+
+    const spelunker = await initializeSpelunkerContext();
+    fillSpelunkerContext(spelunker, CONFIG);
+    const questionText = readQuestion(spelunker, CONFIG);
+
+    await loadDatabase(spelunker);
+    await conductEval(spelunker, CONFIG, questionText);
+    console_log("Eval script finished.");
+}
+
+/**
+ * Point the Spelunker context at the eval folder named in the config.
+ *
+ * @param context Context to mutate: receives `focusFolders` and `queryContext`.
+ * @param config Eval configuration; `config.evalFolder` is resolved relative
+ *     to the parent directory of `__dirname`.
+ */
+function fillSpelunkerContext(
+    context: SpelunkerContext,
+    config: ConfigRecord,
+): void {
+    // The eval folder lives next to (not inside) the directory of this module.
+    const packageRoot = path.dirname(__dirname);
+    const evalDir = path.join(packageRoot, config.evalFolder);
+
+    context.focusFolders = [path.join(evalDir, "source")];
+    context.queryContext = createQueryContext(path.join(evalDir, "eval.db"));
+    createDatabase(context);
+}
+
+/**
+ * Look up the text of the question to evaluate.
+ *
+ * @param context Spelunker context whose query context holds the eval database.
+ * @param config Eval configuration; `config.questionId` selects the row in
+ *     the `Questions` table.
+ * @returns The `question` column of the matching row.
+ * @throws Error if no row in `Questions` has the configured questionId.
+ */
+function readQuestion(context: SpelunkerContext, config: ConfigRecord): string {
+    const db = context.queryContext!.database!;
+    // Use the config passed in by the caller, not the module-level CONFIG.
+    const row = db
+        .prepare<
+            [number],
+            { question: string }
+        >("SELECT question FROM Questions WHERE questionId = ?")
+        .get(config.questionId);
+    if (!row) {
+        throw new Error(
+            `No question found for questionId ${config.questionId}`,
+        );
+    }
+    return row.question;
+}
+
+/**
+ * Run chunk selection for one question and log precision, recall and F1.
+ *
+ * The selected set is the hashes of the chunks returned by `selectChunks`;
+ * the correct set is the hashes in `Scores` with `score == 1` for the
+ * configured question.
+ *
+ * @param context Spelunker context whose query context holds the eval database.
+ * @param config Eval configuration; `config.questionId` selects the scored rows.
+ * @param question Text of the question passed to `selectChunks`.
+ * @throws Error (via lookupHashFromChunkId) if a selected chunk has no hash.
+ */
+async function conductEval(
+    context: SpelunkerContext,
+    config: ConfigRecord,
+    question: string,
+): Promise<void> {
+    console_log("*** Conducting eval ***");
+    console_log(`Question: ${question}`);
+    const db = context.queryContext!.database!;
+    const allChunks = await readAllChunksFromDatabase(db);
+    const chunks = await selectChunks(context, allChunks, question);
+    // if (!chunks.length) {
+    //     throw new Error("No chunks returned from selectChunks!");
+    // }
+    const selectedHashes = new Set<string>();
+    for (const chunk of chunks) {
+        const hash = lookupHashFromChunkId(context, chunk.chunkId);
+        selectedHashes.add(hash);
+    }
+    const correctHashes = new Set<string>();
+    const prep = db.prepare<[number], { chunkHash: string }>(
+        "SELECT chunkHash FROM Scores WHERE score == 1 AND questionId == ?",
+    );
+    // Use the config passed in by the caller, not the module-level CONFIG.
+    for (const row of prep.iterate(config.questionId)) {
+        const hash = row.chunkHash;
+        correctHashes.add(hash);
+    }
+    console_log("Computing F1 score:");
+    // precision = len(selectedHashes ∩ correctHashes) / len(selectedHashes)
+    // recall = len(selectedHashes ∩ correctHashes) / len(correctHashes)
+    // F1 = 2 * (precision * recall) / (precision + recall)
+    // Each division can be 0/0 (empty set, or precision + recall == 0),
+    // which yields NaN; `|| 0` maps that to 0.
+    const intersection = intersect<string>(selectedHashes, correctHashes);
+    const precision = intersection.size / selectedHashes.size || 0; // If NaN
+    const recall = intersection.size / correctHashes.size || 0; // If NaN
+    const F1 = (2 * (precision * recall)) / (precision + recall) || 0; // If NaN
+    console_log(
+        `precision: ${precision.toFixed(3)}, recall: ${recall.toFixed(3)}, F1: ${F1.toFixed(3)}`,
+    );
+}
+
+/**
+ * Return a new set holding the elements of `a` that are also in `b`,
+ * in the iteration order of `a`. Neither input is modified.
+ */
+function intersect<T>(a: Set<T>, b: Set<T>): Set<T> {
+    const common = new Set<T>();
+    for (const item of a) {
+        if (b.has(item)) {
+            common.add(item);
+        }
+    }
+    return common;
+}
+
+/**
+ * Translate a chunk ID into its chunk hash using the `Hashes` table.
+ *
+ * @param context Spelunker context whose query context holds the eval database.
+ * @param chunkId ID of the chunk to look up.
+ * @returns The `chunkHash` column of the matching row.
+ * @throws Error if `Hashes` has no row for the given chunk ID.
+ */
+function lookupHashFromChunkId(
+    context: SpelunkerContext,
+    chunkId: ChunkId,
+): string {
+    const database = context.queryContext!.database!;
+    const hashQuery = database.prepare<[string], { chunkHash: string }>(
+        "SELECT chunkHash FROM Hashes WHERE chunkId = ?",
+    );
+    const match = hashQuery.get(chunkId);
+    if (match) {
+        return match.chunkHash;
+    }
+    throw new Error(`No hash found for chunkId ${chunkId}`);
+}
+
+// function lookupChunkIdFromHash(
+//     context: SpelunkerContext,
+//     chunkHash: string,
+// ): ChunkId {
+//     const db = context.queryContext!.database!;
+//     const row = db
+//         .prepare<
+//             [string],
+//             {chunkId: string}
+//         >("SELECT chunkId FROM Hashes WHERE chunkHash = ?")
+//         .get(chunkHash);
+//     if (!row) {
+//         throw new Error(`No chunkId found for hash ${chunkHash}`);
+//     }
+//     return row.chunkId;
+// }
+
+// Run the eval when this module is executed (uses top-level await).
+await main();
diff --git a/ts/packages/agents/spelunker/src/logging.ts b/ts/packages/agents/spelunker/src/logging.ts
@@ -1,6 +1,9 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
+import path from "path";
+import { fileURLToPath } from "url";
+
 let epoch: number = 0;
 
 export function resetEpoch(): void {
@@ -15,3 +18,10 @@ export function console_log(...rest: any[]): void {
     const t = Date.now();
     console.log(((t - epoch) / 1000).toFixed(3).padStart(6), ...rest);
 }
+
+// Not really related to logging, but it needs a home...
+
+/**
+ * Return the directory containing this module's file,
+ * derived from `import.meta.url`.
+ */
+export function getDirName(): string {
+    return path.dirname(fileURLToPath(import.meta.url));
+}