Skip to content

Commit

Permalink
[Spelunker] Run evaluations and show F1 score; scored a 2nd question (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
gvanrossum-ms authored Mar 5, 2025
1 parent f249c38 commit 45d1f49
Show file tree
Hide file tree
Showing 17 changed files with 393 additions and 559 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# --------------------
# Rules SQLite db
*.db
*.db-shm
*.db-wal

# --------------------
# VS Code
Expand Down
3 changes: 3 additions & 0 deletions ts/.prettierignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,6 @@ packages/defaultAgentProvider/data/**/*.json
# Test result
packages/shell/test-results/*
packages/shell/playwright-report/*

# Eval sources
packages/agents/spelunker/evals/eval-*/source
56 changes: 55 additions & 1 deletion ts/packages/agents/spelunker/evals/design.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,37 @@ An eval run needs to do the following:
- Print some JSON with the question, the F1 score, and the algorithm
(and perhaps a timestamp).

### Schema for evaluation scoring

The database needs to store for each run, for each question,
for each chunk, whether it was selected or not. Runs identify
the algorithm and its variations. Run names must be unique.
Since most algorithms have it available, we also store the relevance score
(the `relevance` column, as opposed to the 0/1 `score` column).

```sql
CREATE TABLE IF NOT EXISTS Runs (
runId TEXT PRIMARY KEY,
runName TEXT UNIQUE,
comments TEXT,
startTimestamp TEXT,
endTimestamp TEXT
);
CREATE TABLE IF NOT EXISTS RunScores (
runId TEXT REFERENCES Runs(runId),
questionId TEXT REFERENCES Questions(questionId),
chunkHash TEXT REFERENCES Hashes(chunkHash),
score INTEGER, -- 0 or 1
relevance FLOAT, -- Range [0.0 ... 1.0]
CONSTRAINT triple UNIQUE (runId, questionId, chunkHash)
);
```

To compute precision and recall (all numbers in range [0.0 ... 1.0]):

- p(recision) = fraction of selected chunks that have score==1 in Scores
- r(ecall) = fraction of chunks with score==1 in Scores that were selected
- `f1 = 2 * (p * r) / (p + r)`

## Tooling needed for automatic eval runs

We need to write a new TypeScript program that reuses much of
Expand All @@ -149,9 +180,32 @@ APIs available.

Do we need more versatility in the scoring tool? E.g.

- Pass the question _ID_ on the command line instead of the text.
- A way to set a fixed score for all chunks in a given file
(or a file pattern).
- A way to review scores (possibly by date range).
- A way to set a fixed score for a list of chunk IDs
(e.g. the References section of an actual answer).

# Refactoring the selector

Currently we have these steps:

1. Using embeddings for fuzzy matching, select N nearest neighbors
2. For those N chunks, ask an AI for a relevance score
3. Pick the highest-scoring K chunks

We can envision other steps:

a. Ask an AI to construct a set of words or phrases to select nearest
neighbors
b. Ask an AI to select which files to pay attention to (bool or score?)

In a sense we are collecting multiple types of scores, and we should
combine them using
[RRF ranking](https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking#how-rrf-ranking-works).

For this to work, everything that returns a list of chunks/chunk IDs
should return a list of _scored_ chunks/chunk IDs. In practice, this
just means that we have to change (1) (embeddings) to return the score.
Or perhaps we just sort the results from highest to lowest score,
since the RRF algorithm just takes the rank order for each score type.
430 changes: 0 additions & 430 deletions ts/packages/agents/spelunker/evals/eval-1/dbdump.sql

Large diffs are not rendered by default.

201 changes: 99 additions & 102 deletions ts/packages/agents/spelunker/evals/eval-2/dbdump.sql

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion ts/packages/agents/spelunker/evals/src/dump.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ case $1 in
;;
esac

TABLES="Questions Hashes Scores"
TABLES="Questions Scores"

sqlite3 $1/eval.db ".dump $TABLES" >$1/dbdump.sql || exit 1
echo "Dumped $TABLES $1/eval.db to $1/dbdump.sql"
Expand Down
4 changes: 2 additions & 2 deletions ts/packages/agents/spelunker/evals/src/evalscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def get_chunk_text(cursor: sqlite3.Cursor, chunkid):


def score_chunk(question, chunk_text, language):
separator = "-" * 50
separator = "-" * 79
print(separator)
pipe = os.popen(f"pygmentize -l {language} | less -FRX", "w")
pipe.write(chunk_text)
Expand All @@ -177,7 +177,7 @@ def score_chunk(question, chunk_text, language):
print(line)
yes = no = False
while not yes and not no:
score = input(question + "\nScore: ")
score = input(question + "\nInclude this chunk (y/n): ")
yes = score.lower() in ("1", "y", "yes")
no = score.lower() in ("0", "n", "no")
assert yes != no
Expand Down
27 changes: 23 additions & 4 deletions ts/packages/agents/spelunker/evals/src/evalsetup.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ def main():
copy_table(src_cur, dst_cur, "Files", filename_prefix)
copy_table(src_cur, dst_cur, "Chunks")
copy_table(src_cur, dst_cur, "Blobs")
copy_table(src_cur, dst_cur, "Summaries")
copy_table(src_cur, dst_cur, "ChunkEmbeddings")
src_conn.close()

add_new_tables(dst_cur)
Expand All @@ -115,7 +117,10 @@ def copy_table(src_cur, dst_cur, table_name, prefix=None):

# Read rows
rows = src_cur.execute(f"SELECT * FROM {table_name}").fetchall()
if prefix and table_name.lower() == "files":
if not rows:
print(f"Table {table_name} is empty")
return
if prefix and table_name.lower() == "files":
# Check the filenames start with the prefix
for row in rows:
filename = row[0]
Expand Down Expand Up @@ -143,6 +148,21 @@ def copy_table(src_cur, dst_cur, table_name, prefix=None):
score INTEGER, -- 0 or 1
timestamp TEXT
);
CREATE TABLE IF NOT EXISTS Runs (
runId INTEGER PRIMARY KEY,
runName TEXT UNIQUE,
questionId INTEGER REFERENCES Questions(questionId),
comments TEXT,
startTimestamp TEXT,
endTimestamp TEXT
);
CREATE TABLE IF NOT EXISTS RunScores (
runId INTEGER REFERENCES Runs(runId),
chunkHash TEXT REFERENCES Hashes(chunkHash),
score INTEGER, -- 0 or 1
relevance FLOAT, -- Range [0.0 ... 1.0]
CONSTRAINT unique_columns UNIQUE (runId, chunkHash)
);
"""
# TODO: Table to record eval runs (the eval tool can create-or-insert that)

Expand All @@ -155,12 +175,11 @@ def add_new_tables(dst_cur):
table_name = sql.split()[5]
print(f"Creating table {table_name}")
dst_cur.execute(sql)
if table_name == "Hashes":
print(f"Clearing contents of table {table_name}")
dst_cur.execute(f"DELETE FROM {table_name}")


def fill_in_hashes(dst_cur, prefix):
print(f"Clearing Hashes")
dst_cur.execute(f"DELETE FROM Hashes")
count = 0
# Fetch all chunks
selection = dst_cur.execute(
Expand Down
1 change: 1 addition & 0 deletions ts/packages/agents/spelunker/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"better-sqlite3": "11.8.1",
"code-processor": "workspace:*",
"common-utils": "workspace:*",
"dotenv": "^16.3.1",
"typeagent": "workspace:*",
"typechat": "^0.1.1",
"typescript": "^5.4.2"
Expand Down
2 changes: 1 addition & 1 deletion ts/packages/agents/spelunker/src/databaseUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ CREATE TABLE IF NOT EXISTS Chunks (
treeName TEXT NOT NULL,
codeName TEXT NOT NULL,
parentId TEXT KEY REFERENCES Chunks(chunkId), -- May be null
fileName TEXT KEY REFERENCES files(fileName) NOT NULL,
fileName TEXT KEY REFERENCES Files(fileName) NOT NULL,
lineNo INTEGER NOT NULL -- 1-based
);
CREATE TABLE IF NOT EXISTS Blobs (
Expand Down
23 changes: 12 additions & 11 deletions ts/packages/agents/spelunker/src/embeddings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,18 @@ export async function preSelectChunks(
input: string,
maxChunks = 1000,
): Promise<ChunkId[]> {
const tb0 = new Date().getTime();
const queryEmbedding = await getEmbedding(context, input);
const tb1 = new Date().getTime();
const tail = !queryEmbedding ? " (failure)" : "";
console_log(
` [Embedding input of ${input.length} characters took ${((tb1 - tb0) / 1000).toFixed(3)} seconds${tail}]`,
);
if (!queryEmbedding) {
// Fail fast if we can't get an embedding.
return [];
}

const ta0 = new Date().getTime();
const db = context.queryContext!.database!;
const prepAllEmbeddings = db.prepare(
Expand All @@ -144,17 +156,6 @@ export async function preSelectChunks(
return allEmbeddingRows.map((row) => row.chunkId);
}

const tb0 = new Date().getTime();
const queryEmbedding = await getEmbedding(context, input);
const tb1 = new Date().getTime();
const tail = !queryEmbedding ? " (failure)" : "";
console_log(
` [Embedding input of ${input.length} characters took ${((tb1 - tb0) / 1000).toFixed(3)} seconds${tail}]`,
);
if (!queryEmbedding) {
return [];
}

const embeddings = allEmbeddingRows.map(
(row) => new Float32Array(Buffer.from(row.embedding)),
);
Expand Down
151 changes: 151 additions & 0 deletions ts/packages/agents/spelunker/src/eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import path from "path";

import dotenv from "dotenv";

import { createDatabase } from "./databaseUtils.js";
import { console_log, getDirName, resetEpoch } from "./logging.js";
import { createQueryContext } from "./queryContext.js";
import {
loadDatabase,
readAllChunksFromDatabase,
selectChunks,
} from "./searchCode.js";
import {
initializeSpelunkerContext,
SpelunkerContext,
} from "./spelunkerActionHandler.js";
import { ChunkId } from "./chunkSchema.js";

// Directory used as the anchor for all relative paths in this script.
// Note that getDirName() is defined in logging.ts.
const __dirname = getDirName(); // .../ts/packages/agents/spelunker/dist
// Load environment variables from the .env file four directory levels above
// __dirname, before any of the code below runs.
dotenv.config({ path: path.join(__dirname, "../../../../.env") }); // .../ts/.env

/**
 * Settings for a single eval run.
 *
 * Declared with explicit fields (rather than Record<string, any>) so that a
 * misspelled key or a value of the wrong type is rejected by the compiler.
 */
type ConfigRecord = {
    /** Eval folder; joined onto the parent directory of the script directory. */
    evalFolder: string;
    /** Value matched against Questions.questionId and Scores.questionId. */
    questionId: number;
};

// TODO: Read this from a file that can be edited before each run,
// or alternatively, read from command line args.
const CONFIG: ConfigRecord = {
    evalFolder: "evals/eval-2",
    questionId: 2,
};

/**
 * Entry point of the eval script.
 *
 * Steps run strictly in this order: reset the log epoch, log the config,
 * create the Spelunker context, point it at the eval folder, read the
 * question text, load the database, and finally conduct the eval.
 */
async function main() {
    resetEpoch();
    console_log("Starting eval script.");
    const configDump = JSON.stringify(CONFIG, undefined, 4);
    console_log(`CONFIG = ${configDump}`);

    const spelunker = await initializeSpelunkerContext();
    // The context must be filled in before the question can be read from it.
    fillSpelunkerContext(spelunker, CONFIG);
    const questionText = readQuestion(spelunker, CONFIG);

    await loadDatabase(spelunker);
    await conductEval(spelunker, CONFIG, questionText);
    console_log("Eval script finished.");
}

/**
 * Point the Spelunker context at the eval folder named by the config.
 *
 * Sets `focusFolders` to the eval folder's "source" subfolder, creates the
 * query context for the eval folder's "eval.db" file, and then calls
 * createDatabase() on the context.
 *
 * @param context The context to fill in (mutated in place).
 * @param config Supplies `evalFolder`, resolved against the parent of the
 *     script directory.
 */
function fillSpelunkerContext(
    context: SpelunkerContext,
    config: ConfigRecord,
): void {
    const packageRoot = path.dirname(__dirname);
    const evalRoot = path.join(packageRoot, config.evalFolder);

    context.focusFolders = [path.join(evalRoot, "source")];
    context.queryContext = createQueryContext(path.join(evalRoot, "eval.db"));
    createDatabase(context);
}

/**
 * Look up the text of the question to evaluate.
 *
 * @param context Context whose query context holds the eval database.
 * @param config Supplies `questionId`, matched against Questions.questionId.
 * @returns The `question` column of the matching row.
 * @throws Error if the Questions table has no row for that questionId.
 */
function readQuestion(context: SpelunkerContext, config: ConfigRecord): string {
    const db = context.queryContext!.database!;
    // Use the config handed in by the caller rather than the module-level
    // CONFIG, so the function honors its own parameter.
    const questionId: number = config.questionId;
    const row = db
        .prepare<
            [number],
            { question: string }
        >("SELECT question FROM Questions WHERE questionId = ?")
        .get(questionId);
    if (!row) {
        throw new Error(`No question found for questionId ${questionId}`);
    }
    return row.question;
}

/**
 * Run the chunk selector for one question and log precision, recall and F1.
 *
 * The selected chunks are mapped to chunk hashes and compared against the
 * hashes that have score 1 in the Scores table for the configured question.
 *
 * @param context Context whose query context holds the eval database.
 * @param config Supplies `questionId`, used to pick the rows in Scores.
 * @param question The question text passed to the selector.
 */
async function conductEval(
    context: SpelunkerContext,
    config: ConfigRecord,
    question: string,
): Promise<void> {
    console_log("*** Conducting eval ***");
    console_log(`Question: ${question}`);
    const db = context.queryContext!.database!;
    const allChunks = await readAllChunksFromDatabase(db);
    const chunks = await selectChunks(context, allChunks, question);
    // An empty selection is deliberately not an error; it scores as 0 below.
    // if (!chunks.length) {
    //     throw new Error("No chunks returned from selectChunks!");
    // }

    // Hashes of the chunks the selector picked.
    const selectedHashes = new Set<string>();
    for (const chunk of chunks) {
        selectedHashes.add(lookupHashFromChunkId(context, chunk.chunkId));
    }

    // Hashes of the chunks marked as correct (score 1) for this question.
    const correctHashes = new Set<string>();
    const prep = db.prepare<[number], { chunkHash: string }>(
        "SELECT chunkHash FROM Scores WHERE score == 1 AND questionId == ?",
    );
    // Use the config handed in by the caller rather than the module-level
    // CONFIG, so the function honors its own parameter.
    for (const row of prep.iterate(config.questionId)) {
        correctHashes.add(row.chunkHash);
    }

    console_log("Computing F1 score:");
    // precision = len(selectedHashes ∩ correctHashes) / len(selectedHashes)
    // recall = len(selectedHashes ∩ correctHashes) / len(correctHashes)
    // F1 = 2 * (precision * recall) / (precision + recall)
    // Each `|| 0` turns the NaN produced by 0/0 (an empty set, or
    // precision + recall == 0) into 0.
    const intersection = intersect<string>(selectedHashes, correctHashes);
    const precision = intersection.size / selectedHashes.size || 0;
    const recall = intersection.size / correctHashes.size || 0;
    const F1 = (2 * (precision * recall)) / (precision + recall) || 0;
    console_log(
        `precision: ${precision.toFixed(3)}, recall: ${recall.toFixed(3)}, F1: ${F1.toFixed(3)}`,
    );
}

/**
 * Return a new set holding the elements of `a` that are also in `b`.
 * Elements keep the iteration order they have in `a`; neither input is
 * modified.
 */
function intersect<T>(a: Set<T>, b: Set<T>): Set<T> {
    const common = new Set<T>();
    for (const item of a) {
        if (b.has(item)) {
            common.add(item);
        }
    }
    return common;
}

/**
 * Translate a chunk ID into its chunk hash using the Hashes table.
 *
 * @param context Context whose query context holds the eval database.
 * @param chunkId The chunk ID to look up.
 * @returns The `chunkHash` column of the matching row.
 * @throws Error if the Hashes table has no row for that chunk ID.
 */
function lookupHashFromChunkId(
    context: SpelunkerContext,
    chunkId: ChunkId,
): string {
    const database = context.queryContext!.database!;
    const statement = database.prepare<[string], { chunkHash: string }>(
        "SELECT chunkHash FROM Hashes WHERE chunkId = ?",
    );
    const match = statement.get(chunkId);
    if (match) {
        return match.chunkHash;
    }
    throw new Error(`No hash found for chunkId ${chunkId}`);
}

// function lookupChunkIdFromHash(
// context: SpelunkerContext,
// chunkHash: string,
// ): ChunkId {
// const db = context.queryContext!.database!;
// const row = db
// .prepare<
// [string],
// {chunkId: string}
// >("SELECT chunkId FROM Hashes WHERE chunkHash = ?")
// .get(chunkHash);
// if (!row) {
// throw new Error(`No chunkId found for hash ${chunkHash}`);
// }
// return row.chunkId;
// }

// Script entry point: run the eval and wait for it to finish. This is a
// top-level await, so a rejection from main() propagates out of the module.
await main();
10 changes: 10 additions & 0 deletions ts/packages/agents/spelunker/src/logging.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import path from "path";
import { fileURLToPath } from "url";

let epoch: number = 0;

export function resetEpoch(): void {
Expand All @@ -15,3 +18,10 @@ export function console_log(...rest: any[]): void {
const t = Date.now();
console.log(((t - epoch) / 1000).toFixed(3).padStart(6), ...rest);
}

// Not really related to logging, but it needs a home...

/**
 * Return the directory that contains this module's file.
 *
 * The path is derived from this module's own `import.meta.url`, so the
 * result is the directory of this file, not of the caller's file.
 */
export function getDirName(): string {
    return path.dirname(fileURLToPath(import.meta.url));
}
Loading

0 comments on commit 45d1f49

Please sign in to comment.