From abb7367c3d090d0de73c7ea405a42397d794c4df Mon Sep 17 00:00:00 2001 From: Piali Choudhury Date: Wed, 5 Feb 2025 15:25:00 -0800 Subject: [PATCH] ongoing changes to handle pdf files --- ts/examples/docuProc/src/pdfChunker.py | 146 +++ ts/examples/docuProc/src/pdfChunker.ts | 186 +++ ts/examples/docuProc/src/pdfChunkyIndexer.ts | 135 ++ .../docuProc/src/pdfDocAnswerSchema.ts | 15 + ts/examples/docuProc/src/pdfDocChunkSchema.ts | 35 + ts/examples/docuProc/src/pdfDocQuerySchema.ts | 24 + ts/examples/docuProc/src/pdfFileDocumenter.ts | 97 ++ ts/examples/docuProc/src/pdfImporter.ts | 305 +++++ ts/examples/docuProc/src/pdfQNAInterface.ts | 1113 +++++++++++++++++ 9 files changed, 2056 insertions(+) create mode 100644 ts/examples/docuProc/src/pdfChunker.py create mode 100644 ts/examples/docuProc/src/pdfChunker.ts create mode 100644 ts/examples/docuProc/src/pdfChunkyIndexer.ts create mode 100644 ts/examples/docuProc/src/pdfDocAnswerSchema.ts create mode 100644 ts/examples/docuProc/src/pdfDocChunkSchema.ts create mode 100644 ts/examples/docuProc/src/pdfDocQuerySchema.ts create mode 100644 ts/examples/docuProc/src/pdfFileDocumenter.ts create mode 100644 ts/examples/docuProc/src/pdfImporter.ts create mode 100644 ts/examples/docuProc/src/pdfQNAInterface.ts diff --git a/ts/examples/docuProc/src/pdfChunker.py b/ts/examples/docuProc/src/pdfChunker.py new file mode 100644 index 000000000..0902154c3 --- /dev/null +++ b/ts/examples/docuProc/src/pdfChunker.py @@ -0,0 +1,146 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import subprocess +import sys + +# Ensure required dependencies are installed +required_packages = ["pdfplumber", "pymupdf"] +for package in required_packages: + try: + __import__(package) + except ImportError: + print(f"Installing missing package: {package}...") + subprocess.check_call([sys.executable, "-m", "pip", "install", package]) + +import pdfplumber # type: ignore +import fitz # type: ignore +import json +import os +import datetime +import csv +from dataclasses import dataclass +from typing import List, Dict, Any, Optional + +IdType = str + +@dataclass +class Blob: + """A sequence of text, table, or image data plus metadata.""" + type: str # "text", "table", "image" + content: Any # Text (list of lines), table (list of lists), image path (str) + start: int # Page number (0-based) + bbox: Optional[List[float]] = None # Bounding box if applicable + + def to_dict(self) -> Dict[str, Any]: + result = { + "type": self.type, + "content": self.content, + "start": self.start, + } + if self.bbox: + result["bbox"] = self.bbox + return result + +@dataclass +class Chunk: + """A chunk at any level of nesting (root, inner, leaf).""" + id: IdType + pageid: IdType + blobs: List[Blob] # Blobs around the placeholders + parentId: IdType + children: List[IdType] # len() is one less than len(blobs) + + def to_dict(self) -> Dict[str, object]: + return { + "id": self.id, + "pageid": self.pageid, + "blobs": [blob.to_dict() for blob in self.blobs], + "parentId": self.parentId, + "children": self.children, + } + +@dataclass +class ChunkedFile: + """A file with extracted chunks.""" + file_name: str + chunks: List[Chunk] + + def to_dict(self) -> Dict[str, Any]: + return { + "file_name": self.file_name, + "chunks": [chunk.to_dict() for chunk in self.chunks], + } + +class PDFChunker: + def __init__(self, file_path: str, output_dir: str = "output"): + self.file_path = file_path + self.output_dir = output_dir + os.makedirs(output_dir, exist_ok=True) + + def generate_id(self) -> str: 
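+        """Generate a timestamp-based chunk ID (YYYYMMDD-HHMMSS.microseconds)."""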
+ return datetime.datetime.now().strftime("%Y%m%d-%H%M%S.%f") + + def extract_text_chunks(self, pdf, by_paragraph: bool = True) -> List[Chunk]: + chunks = [] + for page_num, page in enumerate(pdf.pages): + text = page.extract_text() + if text: + lines = text.split("\n") + chunk_id = self.generate_id() + if by_paragraph: + paragraphs = text.split("\n\n") + blobs = [Blob("text", para.split("\n"), page_num) for para in paragraphs] + else: + blobs = [Blob("text", lines, page_num)] + chunks.append(Chunk(chunk_id, str(page_num), blobs, "", [])) + return chunks + + def extract_tables(self, pdf) -> List[Chunk]: + chunks = [] + for page_num, page in enumerate(pdf.pages): + tables = page.extract_tables() + for table in tables: + table_path = os.path.join(self.output_dir, f"table_{page_num}_{self.generate_id()}.csv") + with open(table_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerows(table) + chunk_id = self.generate_id() + blobs = [Blob("table", table_path, page_num)] + chunks.append(Chunk(chunk_id, str(page_num), blobs, "", [])) + return chunks + + def extract_images(self) -> List[Chunk]: + chunks = [] + doc = fitz.open(self.file_path) + for page_num in range(len(doc)): + for img_index, img in enumerate(doc[page_num].get_images(full=True)): + xref = img[0] + pix = fitz.Pixmap(doc, xref) + img_path = os.path.join(self.output_dir, f"image_{page_num}_{img_index}.png") + pix.save(img_path) + bbox = list(doc[page_num].get_image_bbox(xref)) + chunk_id = self.generate_id() + blobs = [Blob("image", img_path, page_num, bbox)] + chunks.append(Chunk(chunk_id, str(page_num), blobs, "", [])) + return chunks + + def chunkify(self, by_paragraph: bool = True) -> ChunkedFile: + with pdfplumber.open(self.file_path) as pdf: + text_chunks = self.extract_text_chunks(pdf, by_paragraph) + table_chunks = self.extract_tables(pdf) + image_chunks = self.extract_images() + all_chunks = text_chunks + table_chunks + image_chunks + return ChunkedFile(self.file_path, all_chunks) + + def save_json(self, output_path: str, by_paragraph: bool = True): + chunked_file = self.chunkify(by_paragraph) + with open(output_path, "w") as f: + json.dump(chunked_file.to_dict(), f, indent=2) + +# Example usage +if __name__ == "__main__": + pdf_path = "sample.pdf" + output_json = "output.json" + chunker = PDFChunker(pdf_path) + chunker.save_json(output_json) diff --git a/ts/examples/docuProc/src/pdfChunker.ts b/ts/examples/docuProc/src/pdfChunker.ts new file mode 100644 index 000000000..80aa9a18a --- /dev/null +++ b/ts/examples/docuProc/src/pdfChunker.ts @@ -0,0 +1,186 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// This requires that python3 is on the PATH +// and the pdfChunker.py script is in the dist directory. + +import { exec } from "child_process"; +import path from "path"; +import { fileURLToPath } from "url"; +import { promisify } from "util"; + +import { PdfFileDocumentation } from "./pdfDocChunkSchema.js"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +const execPromise = promisify(exec); + +export type ChunkId = string; + +export interface Blob { + start: number; // int; 0-based! + lines: string[]; + breadcrumb?: boolean; +} + +export interface Chunk { + // Names here must match names in pdfChunker.py. + id: ChunkId; + treeName: string; + blobs: Blob[]; + parentId: ChunkId; + children: ChunkId[]; + fileName: string; // Set upon receiving end from ChunkedFile.fileName. 
+ docs?: PdfFileDocumentation; // Computed later by fileDocumenter. +} + +export interface ChunkedFile { + fileName: string; + chunks: Chunk[]; +} + +export interface ErrorItem { + error: string; + filename?: string; + output?: string; +} + +export async function chunkifyPdfFiles( + filenames: string[], +): Promise<(ChunkedFile | ErrorItem)[]> { + let output, + errors, + success = false; + try { + const chunkerPath = path.join(__dirname, "pdfChunker.py"); + let { stdout, stderr } = await execPromise( + `python3 -X utf8 ${chunkerPath} ${filenames.join(" ")}`, + { maxBuffer: 64 * 1024 * 1024 }, // Super large buffer + ); + output = stdout; + errors = stderr; + success = true; + } catch (error: any) { + output = error?.stdout || ""; + errors = error?.stderr || error.message || "Unknown error"; + } + + if (!success) { + return [{ error: errors, output: output }]; + } + if (errors) { + return [{ error: errors, output: output }]; + } + if (!output) { + return [{ error: "No output from chunker script" }]; + } + + const results: (ChunkedFile | ErrorItem)[] = JSON.parse(output); + // TODO: validate that JSON matches our schema. + + // Ensure all chunks have a filename. + for (const result of results) { + if (!("error" in result)) { + for (const chunk of result.chunks) { + chunk.fileName = result.fileName; + } + } + } + return splitLargeFiles(results); +} + +const CHUNK_COUNT_LIMIT = 25; // How many chunks at most. +const FILE_SIZE_LIMIT = 25000; // How many characters at most. + +function splitLargeFiles( + items: (ChunkedFile | ErrorItem)[], +): (ChunkedFile | ErrorItem)[] { + const results: (ChunkedFile | ErrorItem)[] = []; + for (const item of items) { + if ( + "error" in item || + (item.chunks.length <= CHUNK_COUNT_LIMIT && + fileSize(item) <= FILE_SIZE_LIMIT) + ) { + results.push(item); + } else { + results.push(...splitFile(item)); + } + } + return results; +} + +// This algorithm is too complex. I needed a debugger and logging to get it right. +function splitFile(file: ChunkedFile): ChunkedFile[] { + const fileName = file.fileName; + const parentMap: Map = new Map(); + for (const chunk of file.chunks) { + // Only nodes with children will be looked up in this map. + if (chunk.children.length) parentMap.set(chunk.id, chunk); + } + + const results: ChunkedFile[] = []; // Where output accumulates. + let chunks = Array.from(file.chunks); // The chunks yet to emit. + let minNumChunks = 1; + + outer: while (true) { + // Keep going until we exit the inner loop. + let totalSize = 0; // Size in characters of chunks to be output. + for (let i = 0; i < chunks.length; i++) { + // Iterate in pre-order. + const currentChunk = chunks[i]; + const size = chunkSize(currentChunk); + if ( + i < minNumChunks || + (i < CHUNK_COUNT_LIMIT && totalSize + size <= FILE_SIZE_LIMIT) + ) { + totalSize += size; + continue; + } + + // Split the file here (current chunk goes into ancestors). + const rest = chunks.splice(i); + if (rest.shift() !== currentChunk) + throw Error( + "Internal error: expected current chunk at head of rest", + ); + results.push({ fileName, chunks }); + const ancestors: Chunk[] = []; + + let c: Chunk | undefined = currentChunk; + do { + ancestors.unshift(c); + c = parentMap.get(c.parentId); + } while (c); + // Note that the current chunk is the last ancestor. + chunks = [...ancestors, ...rest]; + minNumChunks = ancestors.length; + continue outer; + } + // Append the final chunk. 
+ results.push({ fileName, chunks }); + break; + } + // console.log( + // `Split ${file.fileName} (${file.chunks.length} chunks) into ${results.length} files.`, + // ); + // console.log(`Sizes: ${results.map((f) => f.chunks.length).join(", ")}`); + return results; +} + +function fileSize(file: ChunkedFile): number { + return file.chunks.reduce((acc, chunk) => acc + chunkSize(chunk), 0); +} + +function chunkSize(chunk: Chunk): number { + let totalCharacters = 0; + for (const blob of chunk.blobs) { + if (!blob.breadcrumb) { + for (const line of blob.lines) { + totalCharacters += line.length; + } + } + } + return totalCharacters; +} diff --git a/ts/examples/docuProc/src/pdfChunkyIndexer.ts b/ts/examples/docuProc/src/pdfChunkyIndexer.ts new file mode 100644 index 000000000..d27b19efb --- /dev/null +++ b/ts/examples/docuProc/src/pdfChunkyIndexer.ts @@ -0,0 +1,135 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { openai, ChatModel, TextEmbeddingModel } from "aiclient"; +import * as knowLib from "knowledge-processor"; +import { createObjectFolder, loadSchema, ObjectFolder } from "typeagent"; + +import { createFileDocumenter, FileDocumenter } from "./pdfFileDocumenter.js"; +import { Chunk, ChunkId } from "./pdfChunker.js"; +import { QuerySpecs } from "./pdfDocQuerySchema.js"; +import { createJsonTranslator, TypeChatJsonTranslator } from "typechat"; +import { createTypeScriptJsonValidator } from "typechat/ts"; +import { AnswerSpecs } from "./pdfDocAnswerSchema.js"; + +export const IndexNames = [ + "summaries", + "keywords", + "tags", + "synonyms", + "dependencies", +]; +export type IndexType = (typeof IndexNames)[number]; +export type NamedIndex = [IndexType, knowLib.TextIndex]; + +// A bundle of object stores and indexes etc. +export class ChunkyIndex { + chatModel: ChatModel; + miniModel: ChatModel; // E.g. gpt-3.5-turbo or gpt-4-mini or o1-mini. + embeddingModel: TextEmbeddingModel; + fileDocumenter: FileDocumenter; + queryMaker: TypeChatJsonTranslator; + answerMaker: TypeChatJsonTranslator; + + // The rest are asynchronously initialized by reInitialize(rootDir). + rootDir!: string; + answerFolder!: ObjectFolder; + chunkFolder!: ObjectFolder; + indexes!: Map>; + + private constructor() { + this.chatModel = openai.createChatModelDefault("spelunkerChat"); + this.miniModel = openai.createChatModel( + "GPT_35_TURBO", + undefined, + undefined, + ["spelunkerMini"], + ); + this.embeddingModel = knowLib.createEmbeddingCache( + openai.createEmbeddingModel(), + 1000, + ); + this.fileDocumenter = createFileDocumenter(this.chatModel); + this.queryMaker = createQueryMaker(this.chatModel); + this.answerMaker = createAnswerMaker(this.chatModel); + } + + static async createInstance(rootDir: string): Promise { + const instance = new ChunkyIndex(); + await instance.reInitialize(rootDir); + return instance; + } + + async reInitialize(rootDir: string): Promise { + const instance = this; // So makeIndex can see it. 
+        instance.rootDir = rootDir;
+        instance.chunkFolder = await createObjectFolder<Chunk>(
+            instance.rootDir + "/chunks",
+            { serializer: (obj) => JSON.stringify(obj, null, 2) },
+        );
+        instance.answerFolder = await createObjectFolder<AnswerSpecs>(
+            instance.rootDir + "/answers",
+            { serializer: (obj) => JSON.stringify(obj, null, 2) },
+        );
+        instance.indexes = new Map();
+        for (const name of IndexNames) {
+            instance.indexes.set(name, await makeIndex(name));
+        }
+
+        async function makeIndex(
+            name: string,
+        ): Promise<knowLib.TextIndex<string, ChunkId>> {
+            return await knowLib.createTextIndex(
+                {
+                    caseSensitive: false,
+                    concurrency: 4,
+                    semanticIndex: true,
+                    embeddingModel: instance.embeddingModel,
+                },
+                instance.rootDir + "/" + name,
+            );
+        }
+    }
+
+    getIndexByName(indexName: IndexType): knowLib.TextIndex<string, ChunkId> {
+        for (const [name, index] of this.allIndexes()) {
+            if (name === indexName) {
+                return index;
+            }
+        }
+        throw new Error(`Unknown index: ${indexName}`);
+    }
+
+    allIndexes(): NamedIndex[] {
+        return [...this.indexes.entries()];
+    }
+}
+
+function createQueryMaker(
+    model: ChatModel,
+): TypeChatJsonTranslator<QuerySpecs> {
+    const typeName = "QuerySpecs";
+    const schema = loadSchema(
+        ["pdfDocQuerySchema.ts", "pdfDocAnswerSchema.ts"],
+        import.meta.url,
+    );
+    const validator = createTypeScriptJsonValidator<QuerySpecs>(
+        schema,
+        typeName,
+    );
+    const translator = createJsonTranslator(model, validator);
+    return translator;
+}
+
+function createAnswerMaker(
+    model: ChatModel,
+): TypeChatJsonTranslator<AnswerSpecs> {
+    const typeName = "AnswerSpecs";
+    const schema = loadSchema(["pdfDocAnswerSchema.ts"], import.meta.url);
+    const validator = createTypeScriptJsonValidator<AnswerSpecs>(
+        schema,
+        typeName,
+    );
+    const translator = createJsonTranslator(model, validator);
+    return translator;
+}
diff --git a/ts/examples/docuProc/src/pdfDocAnswerSchema.ts b/ts/examples/docuProc/src/pdfDocAnswerSchema.ts
new file mode 100644
index 000000000..e65ea2189
--- /dev/null
+++ b/ts/examples/docuProc/src/pdfDocAnswerSchema.ts
@@ -0,0 +1,15 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Identifier for a chunk of the document.
+export type ChunkId = string;
+
+// Answer to the original question.
+export type AnswerSpecs = {
+    question: string; // Original question (e.g. "How can items be related")
+    answer: string; // Answer to the question. It is readable and complete, with suitable formatting (line breaks, bullet points etc.)
+
+    references: ChunkId[]; // Chunks that support this answer
+    confidence: number; // Between 0 and 1
+    message?: string; // Optional message to the user (e.g. for low confidence); might request more input
+};
diff --git a/ts/examples/docuProc/src/pdfDocChunkSchema.ts b/ts/examples/docuProc/src/pdfDocChunkSchema.ts
new file mode 100644
index 000000000..1b6e3714c
--- /dev/null
+++ b/ts/examples/docuProc/src/pdfDocChunkSchema.ts
@@ -0,0 +1,35 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Extracted information for a chunk of text.
+export type PdfDocChunk = {
+    // Optional file identifier for this chunk.
+    fileName?: string;
+
+    // Paragraph number in the file.
+    paraNumber?: number;
+
+    name: string;
+
+    // One-paragraph summary of the PDF chunk.
+    // Concise and informative.
+    summary: string;
+
+    // Propose keywords/phrases capturing the chunk's highlights,
+    // context, and notable topics. Make them concise but descriptive,
+    // ensuring users can find these points with common queries or synonyms.
+    keywords?: string[];
+
+    // Optional high-level labels (e.g., "transformer", "algorithm").
+ tags?: string[]; + + // Additional synonyms or related domain concepts. + synonyms?: string[]; + + // References to other chunks or external files. + dependencies?: string[]; +}; + +export type PdfFileDocumentation = { + chunkDocs?: PdfDocChunk[]; +}; diff --git a/ts/examples/docuProc/src/pdfDocQuerySchema.ts b/ts/examples/docuProc/src/pdfDocQuerySchema.ts new file mode 100644 index 000000000..4c1834df6 --- /dev/null +++ b/ts/examples/docuProc/src/pdfDocQuerySchema.ts @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { AnswerSpecs } from "./pdfDocAnswerSchema.js"; + +// Proposed query for one index. +export type QuerySpec = { + query: string; // Query the index for nearest neighbors to this + maxHits?: number; // Omit to use system default + confidence: number; // Between 0 and 1 (how much value you expect from this query) +}; + +// Proposed queries for some of the indexes; or a direct answer. +export type QuerySpecs = { + // Queries directed to various indexes. Comments describe what's in each index. + summaries?: QuerySpec; // A paragraph describing the paper paragraph + keywords?: QuerySpec; // Short key words and phrases extracted from the text + tags?: QuerySpec; // Optional high-level labels (e.g. "algorithmic", "scientific") + synonyms?: QuerySpec; // Additional synonyms or related domain concepts + dependencies?: QuerySpec; // External dependencies + + // If the question can be answered based on chat history and general knowledge. + answer?: AnswerSpecs; +}; diff --git a/ts/examples/docuProc/src/pdfFileDocumenter.ts b/ts/examples/docuProc/src/pdfFileDocumenter.ts new file mode 100644 index 000000000..87d43576e --- /dev/null +++ b/ts/examples/docuProc/src/pdfFileDocumenter.ts @@ -0,0 +1,97 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { ChatModel } from "aiclient"; +import { loadSchema } from "typeagent"; +import { createJsonTranslator, TypeChatJsonTranslator } from "typechat"; +import { createTypeScriptJsonValidator } from "typechat/ts"; + +import { PdfFileDocumentation } from "./pdfDocChunkSchema.js"; +import { Chunk } from "./pdfChunker.js"; + +// For various reasons we want to index chunks separately, +// but we want to produce their documentation in the context of the whole file. +// FileDocumenter.document(chunks) produces documentation comments +// and then assigns each comment to the appropriate chunk. + +// Document an entire file and assign comments to chunks. 
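+//
+// Hypothetical usage sketch (assumes a ChatModel instance and chunks produced by
+// chunkifyPdfFiles; names are illustrative only):
+//   const documenter = createFileDocumenter(chatModel);
+//   const fileDocs = await documenter.document(chunkedFile.chunks);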
+
+export interface FileDocumenter {
+    document(chunks: Chunk[]): Promise<PdfFileDocumentation>;
+}
+
+export function createFileDocumenter(model: ChatModel): FileDocumenter {
+    const fileDocTranslator = createFileDocTranslator(model);
+    return {
+        document,
+    };
+
+    async function document(chunks: Chunk[]): Promise<PdfFileDocumentation> {
+        let text = "";
+        for (const chunk of chunks) {
+            text += `***: Document the following ${chunk.treeName}:\n`;
+            for (const blob of chunk.blobs) {
+                for (let i = 0; i < blob.lines.length; i++) {
+                    text += `[${blob.start + i + 1}]: ${blob.lines[i]}\n`;
+                }
+            }
+        }
+        const request =
+            "Document the given PDF content: its purpose and any relevant details.\n" +
+            "The text has (non-contiguous) line numbers, e.g.: `[1]: Introduction`\n" +
+            "There are also marker lines, e.g.: `***: Document the following chunk`\n" +
+            "Write a concise paragraph for EACH marker.\n" +
+            "For example, the summary could be:\n" +
+            "```\n" +
+            "Section 2 describes the chunking pipeline used to split PDF files.\n" +
+            "It also summarizes the data structures the pipeline produces.\n" +
+            "```\n" +
+            "Also fill in the lists of keywords, tags, synonyms, and dependencies.\n";
+        const result = await fileDocTranslator.translate(request, text);
+
+        // Now assign each comment to its chunk.
+        if (result.success) {
+            const fileDocs: PdfFileDocumentation = result.data;
+            // (The chunkDoc-to-chunk assignment below is still disabled; see commented-out code.)
+            for (const chunkDoc of fileDocs.chunkDocs ?? []) {
+                console.log(chunkDoc.name);
+                for (const chunk of chunks) {
+                    for (const blob of chunk.blobs) {
+
+                        console.log(blob);
+                        // Reminder: blob.start is 0-based, comment.lineNumber is 1-based.
+                        /*if (
+                            !blob.breadcrumb &&
+                            blob.start < chunkDoc.lineNumber &&
+                            chunkDoc.lineNumber <=
+                                blob.start + blob.lines.length
+                        ) {
+                            const chunkDocs = chunk?.docs?.chunkDocs ?? [];
+                            chunkDocs.push(chunkDoc);
+                            chunk.docs = { chunkDocs };
+                        }*/
+                    }
+                }
+            }
+            return fileDocs;
+        } else {
+            throw new Error(result.message);
+        }
+    }
+}
+
+function createFileDocTranslator(
+    model: ChatModel,
+): TypeChatJsonTranslator<PdfFileDocumentation> {
+    const typeName = "PdfFileDocumentation";
+    const schema = loadSchema(["pdfDocChunkSchema.ts"], import.meta.url);
+    const validator = createTypeScriptJsonValidator<PdfFileDocumentation>(
+        schema,
+        typeName,
+    );
+    const translator = createJsonTranslator<PdfFileDocumentation>(
+        model,
+        validator,
+    );
+    return translator;
+}
diff --git a/ts/examples/docuProc/src/pdfImporter.ts b/ts/examples/docuProc/src/pdfImporter.ts
new file mode 100644
index 000000000..e2739238d
--- /dev/null
+++ b/ts/examples/docuProc/src/pdfImporter.ts
@@ -0,0 +1,305 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// TODO: Most of this is not Python specific; generalize to other languages.
+ +import chalk, { ChalkInstance } from "chalk"; +import * as fs from "fs"; +import * as knowLib from "knowledge-processor"; +import { asyncArray } from "typeagent"; + +import * as iapp from "interactive-app"; +import { ChunkyIndex, IndexNames } from "./pdfChunkyIndexer.js"; +import { PdfDocChunk, PdfFileDocumentation } from "./pdfDocChunkSchema.js"; +import { + Chunk, + ChunkedFile, + ChunkId, + chunkifyPdfFiles, + ErrorItem, +} from "./pdfChunker.js"; +import { purgeNormalizedFile } from "./pdfQNAInterface.js"; + +function log( + io: iapp.InteractiveIo | undefined, + message: string, + color: ChalkInstance, +): void { + message = color(message); + if (io) { + io.writer.writeLine(message); + } else { + console.log(message); + } +} + +export async function importAllFiles( + files: string[], + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo | undefined, + verbose: boolean, +): Promise { + log(io, `[Importing ${files.length} files]`, chalk.grey); + + const t0 = Date.now(); + await importPdfFiles(files, chunkyIndex, io, verbose); + const t1 = Date.now(); + + log( + io, + `[Imported ${files.length} files in ${((t1 - t0) * 0.001).toFixed(3)} seconds]`, + chalk.grey, + ); +} + +async function importPdfFiles( + files: string[], + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo | undefined, + verbose = false, +): Promise { + // Canonicalize filenames. + let filenames = files.map((file) => + fs.existsSync(file) ? fs.realpathSync(file) : file, + ); + + // Purge previous occurrences of these files. + for (const fileName of filenames) { + await purgeNormalizedFile(io, chunkyIndex, fileName, verbose); + } + + // Chunkify PDF files using a helper program. + const t0 = Date.now(); + const results = await chunkifyPdfFiles(filenames); + const t1 = Date.now(); + if (results.length !== filenames.length) { + log( + io, + `[Some over-long files were split into multiple partial files]`, + chalk.yellow, + ); + } + + // Print stats for chunkifying. + let numLines = 0; + let numBlobs = 0; + let numChunks = 0; + let numErrors = 0; + for (const result of results) { + if ("error" in result) { + numErrors++; + } else { + const chunkedFile = result; + numChunks += chunkedFile.chunks.length; + for (const chunk of chunkedFile.chunks) { + numBlobs += chunk.blobs.length; + for (const blob of chunk.blobs) { + numLines += blob.lines.length; + } + } + } + } + log( + io, + `[Chunked ${results.length} files ` + + `(${numLines} lines, ${numBlobs} blobs, ${numChunks} chunks, ${numErrors} errors) ` + + `in ${((t1 - t0) * 0.001).toFixed(3)} seconds]`, + chalk.gray, + ); + + const chunkingErrors = results.filter( + (result:any): result is ErrorItem => "error" in result, + ); + for (const error of chunkingErrors) { + log( + io, + `[Error: ${error.error}; Output: ${error.output ?? 
""}]`, + chalk.redBright, + ); + } + + const chunkedFiles = results.filter( + (result:any): result is ChunkedFile => "chunks" in result, + ); + log(io, `[Documenting ${chunkedFiles.length} files]`, chalk.grey); + + const tt0 = Date.now(); + const documentedFiles: PdfFileDocumentation[] = []; + const concurrency = 8; // TODO: Make this a function argument + let nChunks = 0; + await asyncArray.forEachAsync( + chunkedFiles, + concurrency, + async (chunkedFile) => { + const t0 = Date.now(); + let docs: PdfFileDocumentation; + nChunks += chunkedFile.chunks.length; + try { + docs = await exponentialBackoff( + io, + chunkyIndex.fileDocumenter.document, + chunkedFile.chunks, + ); + } catch (error) { + const t1 = Date.now(); + log( + io, + ` [Error documenting ${chunkedFile.fileName} in ${((t1 - t0) * 0.001).toFixed(3)} seconds: ${error}]`, + chalk.redBright, + ); + return; + } + const t1 = Date.now(); + + if (verbose) { + log( + io, + ` [Documented ${chunkedFile.chunks.length} chunks in ${((t1 - t0) * 0.001).toFixed(3)} seconds for ${chunkedFile.fileName}]`, + chalk.grey, + ); + } + documentedFiles.push(docs); + }, + ); + const tt1 = Date.now(); + + log( + io, + `[Documented ${documentedFiles.length} files (${nChunks} chunks) in ${((tt1 - tt0) * 0.001).toFixed(3)} seconds]`, + chalk.grey, + ); + + const nonEmptyFiles = chunkedFiles.filter( + (cf) => cf.chunks.filter((c) => c.docs).length, + ); + + log(io, `[Embedding ${nonEmptyFiles.length} files]`, chalk.grey); + + if (nonEmptyFiles.length) { + const ttt0 = Date.now(); + // Cannot parallelize this because of concurrent writing to TextIndex. + // TODO: Try pre-computing embeddings in parallel to fill the embeddings cache (is that cache safe?) + for (const chunkedFile of nonEmptyFiles) { + await embedChunkedFile(chunkedFile, chunkyIndex, io, verbose); + } + const ttt1 = Date.now(); + + log( + io, + `[Embedded ${documentedFiles.length} files in ${((ttt1 - ttt0) * 0.001).toFixed(3)} seconds]`, + chalk.grey, + ); + } +} + +export async function embedChunkedFile( + chunkedFile: ChunkedFile, + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo | undefined, + verbose = false, +): Promise { + const chunks: Chunk[] = chunkedFile.chunks; + if (chunks.length === 0) { + log(io, `[Skipping empty file ${chunkedFile.fileName}]`, chalk.yellow); + return; + } + const t0 = Date.now(); + for (const chunk of chunkedFile.chunks) { + await embedChunk(chunk, chunkyIndex, io, verbose); + } + const t1 = Date.now(); + if (verbose) { + log( + io, + ` [Embedded ${chunks.length} chunks in ${((t1 - t0) * 0.001).toFixed(3)} seconds for ${chunkedFile.fileName}]`, + chalk.grey, + ); + } +} + +async function embedChunk( + chunk: Chunk, + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo | undefined, + verbose = false, +): Promise { + const t0 = Date.now(); + const lineCount = chunk.blobs.reduce( + (acc, blob) => acc + blob.lines.length, + 0, + ); + await exponentialBackoff(io, chunkyIndex.chunkFolder.put, chunk, chunk.id); + + const summaries: string[] = []; + const chunkDocs: PdfDocChunk[] = chunk.docs?.chunkDocs ?? 
[]; + for (const chunkDoc of chunkDocs) { + summaries.push(chunkDoc.summary); + } + const combinedSummaries = summaries.join("\n").trimEnd(); + + for (const chunkDoc of chunkDocs) { + for (const indexName of IndexNames) { + let data: string[]; + if (indexName == "summaries") { + data = [combinedSummaries]; + } else { + data = (chunkDoc as any)[indexName]; + } + const index = chunkyIndex.indexes.get(indexName)!; + if (data && index) { + await writeToIndex(io, chunk.id, data, index); + } + } + } + + const t1 = Date.now(); + if (verbose) { + log( + io, + ` [Embedded ${chunk.id} (${lineCount} lines @ ${chunk.blobs[0].start + 1}) ` + + `in ${((t1 - t0) * 0.001).toFixed(3)} seconds for ${chunk.fileName}]`, + chalk.gray, + ); + } +} + +async function writeToIndex( + io: iapp.InteractiveIo | undefined, + chunkId: ChunkId, + phrases: string[] | undefined, // List of summaries, keywords, tags, etc. in chunk + index: knowLib.TextIndex, +) { + for (const phrase of phrases ?? []) { + await exponentialBackoff(io, index.put, phrase, [chunkId]); + } +} + +async function exponentialBackoff( + io: iapp.InteractiveIo | undefined, + callable: (...args: T) => Promise, + ...args: T +): Promise { + let timeout = 1; + for (;;) { + try { + return await callable(...args); + } catch (error) { + if (timeout > 1000) { + log(io, `[Error: ${error}; giving up]`, chalk.redBright); + throw error; + } + log( + io, + `[Error: ${error}; retrying in ${timeout} ms]`, + chalk.redBright, + ); + await new Promise((resolve) => setTimeout(resolve, timeout)); + timeout *= 2; + } + } +} + +// Apply URL escaping to key. NOTE: Currently unused. TODO: Therefore remove. +export function sanitizeKey(key: string): string { + return encodeURIComponent(key).replace(/%20/g, "+"); // Encode spaces as plus, others as %xx. +} diff --git a/ts/examples/docuProc/src/pdfQNAInterface.ts b/ts/examples/docuProc/src/pdfQNAInterface.ts new file mode 100644 index 000000000..dcdf4456c --- /dev/null +++ b/ts/examples/docuProc/src/pdfQNAInterface.ts @@ -0,0 +1,1113 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// User interface for querying the index. 
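+// Interactive commands include: import, clearMemory, chunk, search, summaries,
+// keywords, tags, synonyms, dependencies, files, and purgeFile.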
+ +import chalk, { ChalkInstance } from "chalk"; +import * as fs from "fs"; +import * as util from "util"; + +import * as iapp from "interactive-app"; +import * as knowLib from "knowledge-processor"; +import { NameValue, ScoredItem } from "typeagent"; + +import { IndexType, ChunkyIndex } from "./pdfChunkyIndexer.js"; +import { QuerySpec, QuerySpecs } from "./pdfDocQuerySchema.js"; +import { Chunk, ChunkId } from "./pdfChunker.js"; +import { importAllFiles } from "./pdfImporter.js"; +import { AnswerSpecs } from "./pdfDocAnswerSchema.js"; +import { PromptSection } from "typechat"; + +type QueryOptions = { + maxHits: number; + minScore: number; + verbose: boolean; +}; + +function writeColor( + io: iapp.InteractiveIo | undefined, + color: ChalkInstance, + message: string, +): void { + message = color(message); + if (io) { + io.writer.writeLine(message); + } else { + console.log(message); + } +} + +function writeNote(io: iapp.InteractiveIo | undefined, message: string): void { + writeColor(io, chalk.gray, message); +} + +function writeMain(io: iapp.InteractiveIo | undefined, message: string): void { + writeColor(io, chalk.white, message); +} + +function writeWarning( + io: iapp.InteractiveIo | undefined, + message: string, +): void { + writeColor(io, chalk.yellow, message); +} + +function writeError(io: iapp.InteractiveIo | undefined, message: string): void { + writeColor(io, chalk.redBright, message); +} + +function writeHeading( + io: iapp.InteractiveIo | undefined, + message: string, +): void { + writeColor(io, chalk.green, message); +} + +export async function interactiveQueryLoop( + chunkyIndex: ChunkyIndex, + verbose = false, +): Promise { + const handlers: Record = { + import: importHandler, // Since you can't name a function "import". + clearMemory, + chunk, + search, + summaries, + keywords, + tags, + synonyms, + dependencies, + files, + purgeFile, + }; + iapp.addStandardHandlers(handlers); + + function importDef(): iapp.CommandMetadata { + return { + description: "Import a file or files of Python code.", + options: { + fileName: { + description: + "File to import (or multiple files separated by commas)", + type: "string", + }, + files: { + description: "File containing the list of files to import", + type: "string", + }, + verbose: { + description: "More verbose output", + type: "boolean", + defaultValue: verbose, + }, + }, + }; + } + handlers.import.metadata = importDef(); + async function importHandler( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + const namedArgs = iapp.parseNamedArguments(args, importDef()); + const files = namedArgs.fileName + ? (namedArgs.fileName as string).trim().split(",") + : namedArgs.files + ? fs + .readFileSync(namedArgs.files as string, "utf-8") + .split("\n") + .map((line) => line.trim()) + .filter((line) => line.length > 0 && line[0] !== "#") + : []; + if (!files.length) { + writeError(io, "[No files to import (use --? for help)]"); + return; + } + await importAllFiles(files, chunkyIndex, io, namedArgs.verbose); + } + + handlers.clearMemory.metadata = "Clear all memory (and all indexes)"; + async function clearMemory( + args: string[], + io: iapp.InteractiveIo, + ): Promise { + await fs.promises.rm(chunkyIndex.rootDir, { + recursive: true, + force: true, + }); + await chunkyIndex.reInitialize(chunkyIndex.rootDir); + writeNote(io, "[All memory and all indexes cleared]"); + // Actually the embeddings cache isn't. But we shouldn't have to care. 
+ } + + handlers.chunk.metadata = "Print one or more chunks"; + async function chunk( + args: string[], + io: iapp.InteractiveIo, + ): Promise { + const joinedArgs = args.join(" "); + const splitArgs = joinedArgs.trim().split(/[\s,]+/); + for (const chunkId of splitArgs) { + const chunk = await chunkyIndex.chunkFolder.get(chunkId); + if (chunk) { + const chunkDocs = chunk.docs?.chunkDocs ?? []; + writeNote(io, `\nCHUNK ID: ${chunkId}`); + for (const chunkDoc of chunkDocs) { + for (const [name, _] of chunkyIndex.allIndexes()) { + if (name == "summaries") { + if (chunkDoc.summary) { + writeNote(io, "SUMMARY:"); + writeMain( + io, + // Indent by two + wordWrap(chunkDoc.summary).replace( + /^/gm, + " ", + ), + ); + } else { + writeNote(io, "SUMMARY: None"); + } + } else { + const docItem: string[] | undefined = ( + chunkDoc as any + )[name]; + if (docItem?.length) { + writeNote( + io, + `${name.toUpperCase()}: ${docItem.join(", ")}`, + ); + } + } + } + } + writeNote(io, "CODE:"); + writeChunkLines(chunk, io, 100); + } else { + writeNote(io, `[Chunk ID ${chunkId} not found]`); + } + } + } + + function searchDef(): iapp.CommandMetadata { + return { + description: "Search for a query string in the code index.", + args: { + query: { + description: "Natural language query", + type: "string", + }, + }, + options: { + maxHits: { + description: "Maximum number of hits to return", + type: "integer", + defaultValue: 10, + }, + minScore: { + description: "Minimum score to return", + type: "number", + defaultValue: 0.7, + }, + verbose: { + description: "More verbose output", + type: "boolean", + defaultValue: verbose, + }, + }, + }; + } + handlers.search.metadata = searchDef(); + async function search( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + const namedArgs = iapp.parseNamedArguments(args, searchDef()); + const query: string = namedArgs.query; + const queryOptions: QueryOptions = { + maxHits: namedArgs.maxHits, + minScore: namedArgs.minScore, + verbose: namedArgs.verbose, + }; + await processQuery(query, chunkyIndex, io, queryOptions); + } + + function commonOptions(): Record { + return { + verbose: { + description: "More verbose output", + type: "boolean", + defaultValue: verbose, + }, + filter: { + description: "Filter by keyword", + type: "string", + }, + query: { + description: "Natural language query", + type: "string", + }, + maxHits: { + description: + "Maximum number of hits to return (only for query)", + type: "integer", + defaultValue: 10, + }, + minScore: { + description: "Minimum score to return (only for query)", + type: "number", + defaultValue: 0.7, + }, + debug: { + description: "Show debug output", + type: "boolean", + }, + }; + } + + function summariesDef(): iapp.CommandMetadata { + return { + description: "Show all recorded summaries and their postings.", + options: commonOptions(), + }; + } + handlers.summaries.metadata = summariesDef(); + async function summaries( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + await _reportIndex(args, io, "summaries"); + } + + function keywordsDef(): iapp.CommandMetadata { + return { + description: "Show all recorded keywords and their postings.", + options: commonOptions(), + }; + } + handlers.keywords.metadata = keywordsDef(); + async function keywords( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + await _reportIndex(args, io, "keywords"); + } + + function tagsDef(): iapp.CommandMetadata { + return { + description: "Show all recorded tags and their 
postings.", + options: commonOptions(), + }; + } + handlers.tags.metadata = tagsDef(); + async function tags( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + await _reportIndex(args, io, "tags"); + } + + function synonymsDef(): iapp.CommandMetadata { + return { + description: "Show all recorded synonyms and their postings.", + options: commonOptions(), + }; + } + handlers.synonyms.metadata = synonymsDef(); + async function synonyms( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + await _reportIndex(args, io, "synonyms"); + } + + function dependenciesDef(): iapp.CommandMetadata { + return { + description: "Show all recorded dependencies and their postings.", + options: commonOptions(), + }; + } + handlers.dependencies.metadata = dependenciesDef(); + async function dependencies( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + await _reportIndex(args, io, "dependencies"); + } + + function filesDef(): iapp.CommandMetadata { + return { + description: "Show all recorded file names.", + options: { + filter: { + description: "Only show files containing this string", + type: "string", + }, + }, + }; + } + handlers.files.metadata = filesDef(); + async function files( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + const namedArgs = iapp.parseNamedArguments(args, filesDef()); + const filter = namedArgs.filter; + const filesPopularity: Map = new Map(); + for await (const chunk of chunkyIndex.chunkFolder.allObjects()) { + filesPopularity.set( + chunk.fileName, + (filesPopularity.get(chunk.fileName) ?? 0) + 1, + ); + } + if (!filesPopularity.size) { + writeMain(io, "[No files]"); + } else { + const sortedFiles = Array.from(filesPopularity) + .filter(([file, _]) => !filter || file.includes(filter)) + .sort(); + writeNote( + io, + `Found ${sortedFiles.length} ${filter ? "matching" : "total"} files.`, + ); + for (const [file, count] of sortedFiles) { + writeMain( + io, + `${chalk.blue(count.toFixed(0).padStart(7))} ${chalk.green(file)}`, + ); + } + } + } + + function purgeFileDef(): iapp.CommandMetadata { + return { + description: "Purge all mentions of a file.", + args: { + fileName: { + description: "File to purge", + type: "string", + }, + }, + options: { + verbose: { + description: "More verbose output", + type: "boolean", + defaultValue: verbose, + }, + }, + }; + } + handlers.purgeFile.metadata = purgeFileDef(); + async function purgeFile( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + const namedArgs = iapp.parseNamedArguments(args, purgeFileDef()); + const file = namedArgs.fileName as string; + const fileName = fs.existsSync(file) ? 
fs.realpathSync(file) : file; + await purgeNormalizedFile(io, chunkyIndex, fileName, namedArgs.verbose); + } + + async function _reportIndex( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + indexName: IndexType, + ): Promise { + const namedArgs = iapp.parseNamedArguments(args, keywordsDef()); + const index = chunkyIndex.getIndexByName(indexName); + if (namedArgs.debug) { + writeNote(io, `[Debug: ${indexName}]`); + await _debugIndex(io, index, indexName, verbose); + return; + } + + let matches: ScoredItem>[] = []; + if (namedArgs.query) { + matches = await index.nearestNeighborsPairs( + namedArgs.query, + namedArgs.maxHits, + namedArgs.minScore, + ); + } else { + for await (const textBlock of index.entries()) { + matches.push({ + score: 1, + item: textBlock, + }); + } + } + + let hits: ScoredItem>[] = []; + if (!namedArgs.filter) { + hits = matches; + } else { + const filterWords: string[] = (namedArgs.filter as string) + .split(" ") + .filter(Boolean) + .map((w) => w.toLowerCase()); + for await (const match of matches) { + const valueWords: string[] = match.item.value + .split(/\s+/) + .filter(Boolean) + .map((w) => w.toLowerCase()); + if (filterWords.every((word) => valueWords.includes(word))) { + hits.push(match); + } + } + } + + if (!hits.length) { + writeNote(io, `No ${indexName}.`); // E.g., "No keywords." + return; + } else { + writeNote(io, `Found ${hits.length} ${indexName}.`); + + // TFIDF = TF(t) * IDF(t) = 1 * log(N / (1 + nt)) + // - t is a term (in other contexts, a term occurring in a given chunk) + // - nt the number of chunks occurring for that term in this index + // - N the total number of chunks in the system + const numChunks = await chunkyIndex.chunkFolder.size(); + for (const hit of hits) { + const numSourceIds: number = hit.item.sourceIds?.length ?? 0; + hit.score *= Math.log(numChunks / (1 + numSourceIds)); + } + + hits.sort((a, b) => { + if (a.score != b.score) return b.score - a.score; + return a.item.value.localeCompare(b.item.value); + }); + for (const hit of hits) { + writeMain( + io, + `${hit.score.toFixed(3).padStart(7)}: ${chalk.green(hit.item.value)} :: ${(hit.item.sourceIds ?? []).join(", ")}`, + ); + } + } + } + + async function _debugIndex( + io: iapp.InteractiveIo, + index: knowLib.TextIndex, + indexName: string, + verbose: boolean, + ): Promise { + const allTexts = Array.from(index.text()); + for (const text of allTexts) { + if (verbose) writeNote(io, `Text: ${text}`); + const hits = await index.nearestNeighborsPairs( + text, + allTexts.length, + ); + if (verbose) { + for (const hit of hits) { + writeNote( + io, + `${hit.score.toFixed(3).padStart(7)} ${hit.item.value}`, + ); + } + } + if (hits.length < 2) { + writeNote(io, `No hits for ${text} in ${indexName}`); + } else { + const end = hits.length - 1; + writeMain( + io, + `${chalk.green(hits[0].item.value)}: ${chalk.blue(hits[1].item.value)} (${hits[1].score.toFixed(3)}) -- ` + + `${chalk.blue(hits[end].item.value)} (${hits[end].score.toFixed(3)})`, + ); + } + } + } + + async function _inputHandler( + input: string, + io: iapp.InteractiveIo, + ): Promise { + await processQuery(input, chunkyIndex, io, { + maxHits: 10, + minScore: 0.7, + verbose, + }); + } + + await iapp.runConsole({ + inputHandler: _inputHandler, + handlers, + prompt: "\n🤖> ", + }); +} + +export async function purgeNormalizedFile( + io: iapp.InteractiveIo | undefined, + chunkyIndex: ChunkyIndex, + fileName: string, + verbose: boolean, +): Promise { + // Step 1: Find chunks to remove. 
+ let toDelete: Set = new Set(); + for await (const chunk of chunkyIndex.chunkFolder.allObjects()) { + if (chunk.fileName === fileName) { + toDelete.add(chunk.id); + } + } + + // Step 1a: Logging and early return if nothing to purge. + if (!toDelete.size) { + writeNote(io, `[No chunks to purge for file ${fileName}]`); + return; + } + writeNote( + io, + `[Need to purge ${toDelete.size} chunks for file ${fileName}]`, + ); + + // Step 2: Remove chunk ids from indexes. + const chunkIdsToDelete: ChunkId[] = Array.from(toDelete); + for (const [name, index] of chunkyIndex.allIndexes()) { + const affectedValues: string[] = []; + // Collect values from which we need to remove the chunk ids about to be deleted. + for await (const textBlock of index.entries()) { + if ( + textBlock?.sourceIds?.some((id) => + chunkIdsToDelete.includes(id), + ) + ) { + if (verbose) { + writeNote(io, `[Purging ${name} entry ${textBlock.value}]`); + } + affectedValues.push(textBlock.value); + } + } + // Actually update the index (can't modify it while it's being iterated over). + for (const value of affectedValues) { + const id = await index.getId(value); + if (!id) { + writeWarning(io, `[No id for value {value}]`); + } else { + await index.remove(id, chunkIdsToDelete); + } + } + writeNote(io, `[Purged ${affectedValues.length} ${name}]`); // name is plural, e.g. "keywords". + } + + // Step 3: Remove chunks (do this last so if step 2 fails we can try again). + for (const id of toDelete) { + if (verbose) { + writeNote(io, `[Purging chunk ${id}]`); + } + await chunkyIndex.chunkFolder.remove(id); + } + writeNote(io, `[Purged ${toDelete.size} chunks]`); +} + +async function processQuery( + input: string, + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo, + queryOptions: QueryOptions, +): Promise { + // ** Step 0:** Find most recent answers. + + const recentAnswers: NameValue[] = await findRecentAnswers( + input, + chunkyIndex, + ); + + // **Step 1:** Ask LLM (queryMaker) to propose queries for each index. + + const proposedQueries = await proposeQueries( + input, + recentAnswers, + chunkyIndex, + io, + queryOptions, + ); + if (!proposedQueries) return; // Error message already printed by proposeQueries. + + // Step 1a: If step 1 gave the answer, print it and exit. + if ("answer" in proposedQueries) { + await chunkyIndex.answerFolder.put(proposedQueries.answer); + reportQuery( + undefined, + undefined, + proposedQueries.answer, + io, + queryOptions.verbose, + ); + writeNote(io, "[Answer produced by stage 1]"); + return; + } + + // **Step 2:** Run those queries on the indexes. + + const [hitsByIndex, chunkIdScores] = await runIndexQueries( + proposedQueries, + chunkyIndex, + io, + queryOptions, + ); + if (!chunkIdScores) return; // Error message already printed by runIndexQueries. + + // **Step 3:** Ask the LLM (answerMaker) to answer the question. + + const answer = await generateAnswer( + input, + hitsByIndex, + chunkIdScores, + recentAnswers, + chunkyIndex, + io, + queryOptions, + ); + if (!answer) return; // Error message already printed by generateAnswer. + + // **Step 4:** Print the answer. Also record it for posterity. 
+ + await chunkyIndex.answerFolder.put(answer); + reportQuery(hitsByIndex, chunkIdScores, answer, io, queryOptions.verbose); +} + +async function proposeQueries( + input: string, + recentAnswers: NameValue[], + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo, + queryOptions: QueryOptions, +): Promise { + const t0 = Date.now(); + + const promptPreamble = makeQueryMakerPrompt(recentAnswers); + if (queryOptions.verbose) { + for (const section of promptPreamble) { + writeNote(io, `[${section.role}: ${section.content}]`); + } + } + const result = await chunkyIndex.queryMaker.translate( + input, + promptPreamble, + ); + if (!result.success) { + const t1 = Date.now(); + writeError( + io, + `[Error: ${result.message} in ${((t1 - t0) * 0.001).toFixed(3)} seconds]`, + ); + return undefined; + } + const specs = result.data; + if (queryOptions.verbose) { + writeNote(io, `\n[Result: proposed queries]`); + // Use util.inspect() to colorize JSON; writeColor() doesn't do that. + io.writer.writeLine( + util.inspect(specs, { depth: null, colors: true, compact: false }), + ); + } + const t1 = Date.now(); + writeNote( + io, + `[proposeQueries took ${((t1 - t0) * 0.001).toFixed(3)} seconds]`, + ); + + return specs; +} + +type HitsMap = Map>[]>; // indexName -> hits from nearestNeighborsPairs + +async function runIndexQueries( + proposedQueries: QuerySpecs, + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo, + queryOptions: QueryOptions, +): Promise<[HitsMap, Map> | undefined]> { + const t0 = Date.now(); + + const hitsByIndex: HitsMap = new Map(); + const chunkIdScores: Map> = new Map(); // Record score of each chunk id. + const totalNumChunks = await chunkyIndex.chunkFolder.size(); // Nominator in IDF calculation. + + for (const [indexName, index] of chunkyIndex.allIndexes()) { + const spec: QuerySpec | undefined = (proposedQueries as any)[indexName]; + if (spec === undefined) { + writeNote(io, `[No query specified for ${indexName}]`); + continue; + } + + const specMaxHits = spec.maxHits; + const defaultMaxHits = queryOptions.maxHits; + const maxHits = specMaxHits ?? defaultMaxHits; + const maxHitsDisplay = + maxHits === specMaxHits + ? maxHits.toString() + : `${specMaxHits} ?? ${defaultMaxHits}`; + + const hits = await index.nearestNeighborsPairs( + spec.query, + maxHits, + queryOptions.minScore, + ); + if (!hits.length) { + writeNote( + io, + `[${indexName}: query ${spec.query} (maxHits ${maxHitsDisplay}) no hits]`, + ); + continue; + } + hitsByIndex.set(indexName, hits); + + // Update chunk id scores. + for (const hit of hits) { + // Literature suggests setting TF = 1 in this case, + // but the term's relevance score intuitively makes sense. + const tf = hit.score; + // IDF calculation ("inverse document frequency smooth"). + const fraction = + totalNumChunks / (1 + (hit.item.sourceIds?.length ?? 0)); + const idf = 1 + Math.log(fraction); + const newScore = tf * idf; + for (const chunkId of hit.item.sourceIds ?? []) { + const oldScoredItem = chunkIdScores.get(chunkId); + const oldScore = oldScoredItem?.score ?? 0; + // Combine scores by addition. (Alternatives: max, possibly others.) + const combinedScore = oldScore + newScore; + if (oldScoredItem) { + oldScoredItem.score = combinedScore; + } else { + chunkIdScores.set(chunkId, { + score: oldScore + newScore, + item: chunkId, + }); + } + } + } + + // Verbose logging. 
+ if (queryOptions.verbose) { + writeNote( + io, + `\nFound ${hits.length} ${indexName} for '${spec.query}':`, + ); + for (const hit of hits) { + writeNote( + io, + `${hit.score.toFixed(3).padStart(7)}: ${hit.item.value} -- ${hit.item.sourceIds?.join(", ")}`, + ); + } + } + + // Regular logging. + const numChunks = new Set(hits.flatMap((h) => h.item.sourceIds)).size; + const end = hits.length - 1; + writeNote( + io, + `[${indexName}: query '${spec.query}' (maxHits ${maxHitsDisplay}); ${hits.length} hits; ` + + `scores ${hits[0].score.toFixed(3)}--${hits[end].score.toFixed(3)}; ` + + `${numChunks} unique chunk ids]`, + ); + } + + const t1 = Date.now(); + writeNote( + io, + `[runIndexQueries took ${((t1 - t0) * 0.001).toFixed(3)} seconds]`, + ); + + return [hitsByIndex, chunkIdScores]; +} + +async function generateAnswer( + input: string, + hitsByIndex: HitsMap, + chunkIdScores: Map>, + recentAnswers: NameValue[], + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo, + queryOptions: QueryOptions, +): Promise { + const t0 = Date.now(); + + // Step 3a: Compute array of ids sorted by score, truncated to some limit. + const scoredChunkIds: ScoredItem[] = Array.from( + chunkIdScores.values(), + ); + + writeNote(io, `\n[Overall ${scoredChunkIds.length} unique chunk ids]`); + + scoredChunkIds.sort((a, b) => b.score - a.score); + + // Step 3b: Get the chunks themselves. + const chunks: Chunk[] = []; + const maxChunks = 30; + // Take the top N chunks that actually exist. + for (const scoredChunkId of scoredChunkIds) { + const maybeChunk = await chunkyIndex.chunkFolder.get( + scoredChunkId.item, + ); + if (maybeChunk) { + chunks.push(maybeChunk); + if (queryOptions.verbose) { + writeVerboseReferences( + io, + [scoredChunkId.item], + hitsByIndex, + chunkIdScores, + ); + } + if (chunks.length >= maxChunks) { + break; + } + } else { + writeNote(io, `[Chunk ${scoredChunkId.item} not found]`); + } + } + + writeNote(io, `[Sending ${chunks.length} chunks to answerMaker]`); + + const preamble = makeAnswerPrompt(recentAnswers, chunks); + if (queryOptions.verbose) { + const formatted = util.inspect(preamble, { + depth: null, + colors: true, + compact: false, + }); + writeNote(io, `Preamble: ${formatted}`); + } + + // Step 3c: Make the request and check for success. + const answerResult = await chunkyIndex.answerMaker.translate( + input, + preamble, + ); + + if (!answerResult.success) { + writeError(io, `[Error: ${answerResult.message}]`); + return undefined; + } + + if (queryOptions.verbose) { + writeNote( + io, + `AnswerResult: ${JSON.stringify(answerResult.data, null, 2)}`, + ); + } + + const t1 = Date.now(); + writeNote( + io, + `[generateAnswer took ${((t1 - t0) * 0.001).toFixed(3)} seconds]`, + ); + + return answerResult.data; +} + +async function findRecentAnswers( + input: string, + chunkyIndex: ChunkyIndex, +): Promise[]> { + // TODO: Allow for multiple concurrent sessions. + const recentAnswers: NameValue[] = []; + for await (const answer of chunkyIndex.answerFolder.all()) { + recentAnswers.push(answer); + } + // Assume the name field (the internal key) is a timestamp. + recentAnswers.sort((a, b) => b.name.localeCompare(a.name)); + recentAnswers.splice(20); // TODO: Cut off by total size, not count. + recentAnswers.reverse(); // Most recent last. 
+ return recentAnswers; +} + +function reportQuery( + hitsByIndex: HitsMap | undefined, + chunkIdScores: Map> | undefined, + answer: AnswerSpecs, + io: iapp.InteractiveIo, + verbose: boolean, +): void { + writeHeading( + io, + `\nAnswer (confidence ${answer.confidence.toFixed(3).replace(/0+$/, "")}):`, + ); + + writeMain(io, wordWrap(answer.answer)); + + if (answer.message) { + writeWarning(io, "\n" + wordWrap(`Message: ${answer.message}`)); + } + if (answer.references.length) { + writeNote( + io, + `\nReferences (${answer.references.length}): ${answer.references.join(",").replace(/,/g, ", ")}`, + ); + if (verbose && chunkIdScores && hitsByIndex) { + writeVerboseReferences( + io, + answer.references, + hitsByIndex, + chunkIdScores, + ); + } + } + // NOTE: If the user wants to see the contents of any chunk, they can use the @chunk command. +} + +function writeVerboseReferences( + io: iapp.InteractiveIo, + references: ChunkId[], + hitsByIndex: HitsMap, + chunkIdScores: Map>, +): void { + for (const ref of references) { + const score = chunkIdScores.get(ref)?.score ?? 0; + writeColor( + io, + chalk.blue, + ` ${ref} (${chalk.white(score.toFixed(3))})`, + ); + for (const indexName of hitsByIndex.keys()) { + const hits = hitsByIndex.get(indexName) ?? []; + for (const hit of hits) { + if (hit.item.sourceIds?.includes(ref)) { + writeColor( + io, + chalk.green, + ` ${chalk.gray(indexName)}: ${hit.item.value} (${chalk.white(hit.score.toFixed(3))})`, + ); + } + } + } + } +} + +function makeQueryMakerPrompt( + recentAnswers: NameValue[], +): PromptSection[] { + const prompt = ` +I have a code project split up in chunks indexed on several categories. +Please produce suitable queries for each applicable index based on +conversation history (especially if the query refers to a previous answer +indirectly, e.g. via "it" or "that"), and the user question given later. +Don't suggest "meta" queries about the conversation itself -- only the code is indexed. +`; + return makeAnyPrompt(recentAnswers, prompt); +} + +function makeAnswerPrompt( + recentAnswers: NameValue[], + chunks: Chunk[], +): PromptSection[] { + const prompt = `\ +Following are the chunks most relevant to the query. +Use the preceding conversation items as context for the user query given later. +`; + + const preamble = makeAnyPrompt(recentAnswers, prompt); + for (const chunk of chunks) { + const chunkData = { + fileName: chunk.fileName, + chunkId: chunk.id, + blobs: chunk.blobs, + summary: chunk.docs?.chunkDocs + ?.filter((cd) => cd.summary) + .map((cd) => cd.summary) + .join("\n"), + }; + preamble.push({ role: "user", content: JSON.stringify(chunkData) }); + } + return preamble; +} + +function makeAnyPrompt( + recentAnswers: NameValue[], + prompt: string, +): PromptSection[] { + const preamble: PromptSection[] = []; + for (const answer of recentAnswers) { + preamble.push({ role: "user", content: answer.value.question }); + preamble.push({ role: "assistant", content: answer.value.answer }); + } + preamble.push({ role: "user", content: prompt.trim() }); + return preamble; +} + +function writeChunkLines( + chunk: Chunk, + io: iapp.InteractiveIo, + lineBudget = 10, +): void { + // TODO: limit how much we write per blob too (if there are multiple). + outer: for (const blob of chunk.blobs) { + for (let i = 0; i < blob.lines.length; i++) { + if (lineBudget-- <= 0) { + writeNote(io, " ..."); + break outer; + } + writeMain( + io, + `${(1 + blob.start + i).toString().padStart(6)}: ${blob.lines[i].trimEnd()}`, + ); + } + } +} + +// Wrap long lines. 
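+// Breaks each line at word boundaries near wrapLength, re-indents continuation
+// lines to align under list-item prefixes, and leaves fenced code blocks unwrapped.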
+export function wordWrap(text: string, wrapLength: number = 80): string { + let inCodeBlock = false; + const lines: string[] = []; + const prefixRegex = /^\s*((-|\*|\d+\.)\s+)?/; + for (let line of text.split(/[ ]*\r?\n/)) { + if (line.startsWith("```")) inCodeBlock = !inCodeBlock; // TODO: Colorize code blocks. + if (line.length <= wrapLength || inCodeBlock) { + // The whole line is deemed to fit. + lines.push(line); + continue; + } + // We must try to break. + const prefixLength = prefixRegex.exec(line)?.[0]?.length ?? 0; + const indent = " ".repeat(prefixLength); + while (line.length > wrapLength) { + const shortenedLine = line.slice(0, wrapLength + 1); + let index = shortenedLine.lastIndexOf(" "); + if (index <= prefixLength) { + index = line.indexOf(" ", wrapLength); + if (index < 0) break; // The rest of the line is one "word". + } + lines.push(line.slice(0, index).trimEnd()); + line = indent + line.slice(index + 1).trimStart(); + } + lines.push(line); + } + return lines.join("\n"); +} + +export function testWordWrap(): void { + const sampleText = `\ +This is a long line that should be wrapped at some point. It's not clear where. + This is another long line but it is also indented. Let's make sure the breaks are also indented. + - This is a bullet point that should be wrapped at some point. It's not clear where. + 12. This is a numbered point that should be wrapped at some point. It's not clear where. +\`\`\`python +def generate_id() -> IdType: + """Generate a new unique ID. + + IDs are really timestamps formatted as YYYY_MM_DD-HH_MM_SS.UUUUUU, + where UUUUUU is microseconds. + + To ensure IDs are unique, if the next timestamp isn't greater than the last one, + we add 1 usec to the last one. This has the advantage of "gracefully" handling + time going backwards. + """ + global last_ts + next_ts = datetime.datetime.now() # Local time, for now + if next_ts <= last_ts: + next_ts = last_ts + datetime.timedelta(microseconds=1) + last_ts = next_ts + return next_ts.strftime("%Y%m%d-%H%M%S.%f") +\`\`\` +This is a short line. + * A short bullet. + * Another short one. + A short indented line. +End. +`; + console.log(wordWrap(sampleText, 40)); +} + +// testWordWrap();