From abb7367c3d090d0de73c7ea405a42397d794c4df Mon Sep 17 00:00:00 2001 From: Piali Choudhury Date: Wed, 5 Feb 2025 15:25:00 -0800 Subject: [PATCH] ongoing changes to handle pdf files --- ts/examples/docuProc/src/pdfChunker.py | 146 +++ ts/examples/docuProc/src/pdfChunker.ts | 186 +++ ts/examples/docuProc/src/pdfChunkyIndexer.ts | 135 ++ .../docuProc/src/pdfDocAnswerSchema.ts | 15 + ts/examples/docuProc/src/pdfDocChunkSchema.ts | 35 + ts/examples/docuProc/src/pdfDocQuerySchema.ts | 24 + ts/examples/docuProc/src/pdfFileDocumenter.ts | 97 ++ ts/examples/docuProc/src/pdfImporter.ts | 305 +++++ ts/examples/docuProc/src/pdfQNAInterface.ts | 1113 +++++++++++++++++ 9 files changed, 2056 insertions(+) create mode 100644 ts/examples/docuProc/src/pdfChunker.py create mode 100644 ts/examples/docuProc/src/pdfChunker.ts create mode 100644 ts/examples/docuProc/src/pdfChunkyIndexer.ts create mode 100644 ts/examples/docuProc/src/pdfDocAnswerSchema.ts create mode 100644 ts/examples/docuProc/src/pdfDocChunkSchema.ts create mode 100644 ts/examples/docuProc/src/pdfDocQuerySchema.ts create mode 100644 ts/examples/docuProc/src/pdfFileDocumenter.ts create mode 100644 ts/examples/docuProc/src/pdfImporter.ts create mode 100644 ts/examples/docuProc/src/pdfQNAInterface.ts diff --git a/ts/examples/docuProc/src/pdfChunker.py b/ts/examples/docuProc/src/pdfChunker.py new file mode 100644 index 000000000..0902154c3 --- /dev/null +++ b/ts/examples/docuProc/src/pdfChunker.py @@ -0,0 +1,146 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import subprocess +import sys + +# Ensure required dependencies are installed +required_packages = ["pdfplumber", "pymupdf"] +for package in required_packages: + try: + __import__(package) + except ImportError: + print(f"Installing missing package: {package}...") + subprocess.check_call([sys.executable, "-m", "pip", "install", package]) + +import pdfplumber # type: ignore +import fitz # type: ignore +import json +import os +import datetime +import csv +from dataclasses import dataclass +from typing import List, Dict, Any, Optional + +IdType = str + +@dataclass +class Blob: + """A sequence of text, table, or image data plus metadata.""" + type: str # "text", "table", "image" + content: Any # Text (list of lines), table (list of lists), image path (str) + start: int # Page number (0-based) + bbox: Optional[List[float]] = None # Bounding box if applicable + + def to_dict(self) -> Dict[str, Any]: + result = { + "type": self.type, + "content": self.content, + "start": self.start, + } + if self.bbox: + result["bbox"] = self.bbox + return result + +@dataclass +class Chunk: + """A chunk at any level of nesting (root, inner, leaf).""" + id: IdType + pageid: IdType + blobs: List[Blob] # Blobs around the placeholders + parentId: IdType + children: List[IdType] # len() is one less than len(blobs) + + def to_dict(self) -> Dict[str, object]: + return { + "id": self.id, + "pageid": self.pageid, + "blobs": [blob.to_dict() for blob in self.blobs], + "parentId": self.parentId, + "children": self.children, + } + +@dataclass +class ChunkedFile: + """A file with extracted chunks.""" + file_name: str + chunks: List[Chunk] + + def to_dict(self) -> Dict[str, Any]: + return { + "file_name": self.file_name, + "chunks": [chunk.to_dict() for chunk in self.chunks], + } + +class PDFChunker: + def __init__(self, file_path: str, output_dir: str = "output"): + self.file_path = file_path + self.output_dir = output_dir + os.makedirs(output_dir, exist_ok=True) + + def generate_id(self) -> str: 
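+        """Generate a timestamp-based chunk ID (YYYYMMDD-HHMMSS.microseconds)."""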
+ return datetime.datetime.now().strftime("%Y%m%d-%H%M%S.%f") + + def extract_text_chunks(self, pdf, by_paragraph: bool = True) -> List[Chunk]: + chunks = [] + for page_num, page in enumerate(pdf.pages): + text = page.extract_text() + if text: + lines = text.split("\n") + chunk_id = self.generate_id() + if by_paragraph: + paragraphs = text.split("\n\n") + blobs = [Blob("text", para.split("\n"), page_num) for para in paragraphs] + else: + blobs = [Blob("text", lines, page_num)] + chunks.append(Chunk(chunk_id, str(page_num), blobs, "", [])) + return chunks + + def extract_tables(self, pdf) -> List[Chunk]: + chunks = [] + for page_num, page in enumerate(pdf.pages): + tables = page.extract_tables() + for table in tables: + table_path = os.path.join(self.output_dir, f"table_{page_num}_{self.generate_id()}.csv") + with open(table_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerows(table) + chunk_id = self.generate_id() + blobs = [Blob("table", table_path, page_num)] + chunks.append(Chunk(chunk_id, str(page_num), blobs, "", [])) + return chunks + + def extract_images(self) -> List[Chunk]: + chunks = [] + doc = fitz.open(self.file_path) + for page_num in range(len(doc)): + for img_index, img in enumerate(doc[page_num].get_images(full=True)): + xref = img[0] + pix = fitz.Pixmap(doc, xref) + img_path = os.path.join(self.output_dir, f"image_{page_num}_{img_index}.png") + pix.save(img_path) + bbox = list(doc[page_num].get_image_bbox(xref)) + chunk_id = self.generate_id() + blobs = [Blob("image", img_path, page_num, bbox)] + chunks.append(Chunk(chunk_id, str(page_num), blobs, "", [])) + return chunks + + def chunkify(self, by_paragraph: bool = True) -> ChunkedFile: + with pdfplumber.open(self.file_path) as pdf: + text_chunks = self.extract_text_chunks(pdf, by_paragraph) + table_chunks = self.extract_tables(pdf) + image_chunks = self.extract_images() + all_chunks = text_chunks + table_chunks + image_chunks + return ChunkedFile(self.file_path, all_chunks) + + def save_json(self, output_path: str, by_paragraph: bool = True): + chunked_file = self.chunkify(by_paragraph) + with open(output_path, "w") as f: + json.dump(chunked_file.to_dict(), f, indent=2) + +# Example usage +if __name__ == "__main__": + pdf_path = "sample.pdf" + output_json = "output.json" + chunker = PDFChunker(pdf_path) + chunker.save_json(output_json) diff --git a/ts/examples/docuProc/src/pdfChunker.ts b/ts/examples/docuProc/src/pdfChunker.ts new file mode 100644 index 000000000..80aa9a18a --- /dev/null +++ b/ts/examples/docuProc/src/pdfChunker.ts @@ -0,0 +1,186 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// This requires that python3 is on the PATH +// and the pdfChunker.py script is in the dist directory. + +import { exec } from "child_process"; +import path from "path"; +import { fileURLToPath } from "url"; +import { promisify } from "util"; + +import { PdfFileDocumentation } from "./pdfDocChunkSchema.js"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +const execPromise = promisify(exec); + +export type ChunkId = string; + +export interface Blob { + start: number; // int; 0-based! + lines: string[]; + breadcrumb?: boolean; +} + +export interface Chunk { + // Names here must match names in pdfChunker.py. + id: ChunkId; + treeName: string; + blobs: Blob[]; + parentId: ChunkId; + children: ChunkId[]; + fileName: string; // Set upon receiving end from ChunkedFile.fileName. 
+ docs?: PdfFileDocumentation; // Computed later by fileDocumenter. +} + +export interface ChunkedFile { + fileName: string; + chunks: Chunk[]; +} + +export interface ErrorItem { + error: string; + filename?: string; + output?: string; +} + +export async function chunkifyPdfFiles( + filenames: string[], +): Promise<(ChunkedFile | ErrorItem)[]> { + let output, + errors, + success = false; + try { + const chunkerPath = path.join(__dirname, "pdfChunker.py"); + let { stdout, stderr } = await execPromise( + `python3 -X utf8 ${chunkerPath} ${filenames.join(" ")}`, + { maxBuffer: 64 * 1024 * 1024 }, // Super large buffer + ); + output = stdout; + errors = stderr; + success = true; + } catch (error: any) { + output = error?.stdout || ""; + errors = error?.stderr || error.message || "Unknown error"; + } + + if (!success) { + return [{ error: errors, output: output }]; + } + if (errors) { + return [{ error: errors, output: output }]; + } + if (!output) { + return [{ error: "No output from chunker script" }]; + } + + const results: (ChunkedFile | ErrorItem)[] = JSON.parse(output); + // TODO: validate that JSON matches our schema. + + // Ensure all chunks have a filename. + for (const result of results) { + if (!("error" in result)) { + for (const chunk of result.chunks) { + chunk.fileName = result.fileName; + } + } + } + return splitLargeFiles(results); +} + +const CHUNK_COUNT_LIMIT = 25; // How many chunks at most. +const FILE_SIZE_LIMIT = 25000; // How many characters at most. + +function splitLargeFiles( + items: (ChunkedFile | ErrorItem)[], +): (ChunkedFile | ErrorItem)[] { + const results: (ChunkedFile | ErrorItem)[] = []; + for (const item of items) { + if ( + "error" in item || + (item.chunks.length <= CHUNK_COUNT_LIMIT && + fileSize(item) <= FILE_SIZE_LIMIT) + ) { + results.push(item); + } else { + results.push(...splitFile(item)); + } + } + return results; +} + +// This algorithm is too complex. I needed a debugger and logging to get it right. +function splitFile(file: ChunkedFile): ChunkedFile[] { + const fileName = file.fileName; + const parentMap: Map = new Map(); + for (const chunk of file.chunks) { + // Only nodes with children will be looked up in this map. + if (chunk.children.length) parentMap.set(chunk.id, chunk); + } + + const results: ChunkedFile[] = []; // Where output accumulates. + let chunks = Array.from(file.chunks); // The chunks yet to emit. + let minNumChunks = 1; + + outer: while (true) { + // Keep going until we exit the inner loop. + let totalSize = 0; // Size in characters of chunks to be output. + for (let i = 0; i < chunks.length; i++) { + // Iterate in pre-order. + const currentChunk = chunks[i]; + const size = chunkSize(currentChunk); + if ( + i < minNumChunks || + (i < CHUNK_COUNT_LIMIT && totalSize + size <= FILE_SIZE_LIMIT) + ) { + totalSize += size; + continue; + } + + // Split the file here (current chunk goes into ancestors). + const rest = chunks.splice(i); + if (rest.shift() !== currentChunk) + throw Error( + "Internal error: expected current chunk at head of rest", + ); + results.push({ fileName, chunks }); + const ancestors: Chunk[] = []; + + let c: Chunk | undefined = currentChunk; + do { + ancestors.unshift(c); + c = parentMap.get(c.parentId); + } while (c); + // Note that the current chunk is the last ancestor. + chunks = [...ancestors, ...rest]; + minNumChunks = ancestors.length; + continue outer; + } + // Append the final chunk. 
+ results.push({ fileName, chunks }); + break; + } + // console.log( + // `Split ${file.fileName} (${file.chunks.length} chunks) into ${results.length} files.`, + // ); + // console.log(`Sizes: ${results.map((f) => f.chunks.length).join(", ")}`); + return results; +} + +function fileSize(file: ChunkedFile): number { + return file.chunks.reduce((acc, chunk) => acc + chunkSize(chunk), 0); +} + +function chunkSize(chunk: Chunk): number { + let totalCharacters = 0; + for (const blob of chunk.blobs) { + if (!blob.breadcrumb) { + for (const line of blob.lines) { + totalCharacters += line.length; + } + } + } + return totalCharacters; +} diff --git a/ts/examples/docuProc/src/pdfChunkyIndexer.ts b/ts/examples/docuProc/src/pdfChunkyIndexer.ts new file mode 100644 index 000000000..d27b19efb --- /dev/null +++ b/ts/examples/docuProc/src/pdfChunkyIndexer.ts @@ -0,0 +1,135 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { openai, ChatModel, TextEmbeddingModel } from "aiclient"; +import * as knowLib from "knowledge-processor"; +import { createObjectFolder, loadSchema, ObjectFolder } from "typeagent"; + +import { createFileDocumenter, FileDocumenter } from "./pdfFileDocumenter.js"; +import { Chunk, ChunkId } from "./pdfChunker.js"; +import { QuerySpecs } from "./pdfDocQuerySchema.js"; +import { createJsonTranslator, TypeChatJsonTranslator } from "typechat"; +import { createTypeScriptJsonValidator } from "typechat/ts"; +import { AnswerSpecs } from "./pdfDocAnswerSchema.js"; + +export const IndexNames = [ + "summaries", + "keywords", + "tags", + "synonyms", + "dependencies", +]; +export type IndexType = (typeof IndexNames)[number]; +export type NamedIndex = [IndexType, knowLib.TextIndex]; + +// A bundle of object stores and indexes etc. +export class ChunkyIndex { + chatModel: ChatModel; + miniModel: ChatModel; // E.g. gpt-3.5-turbo or gpt-4-mini or o1-mini. + embeddingModel: TextEmbeddingModel; + fileDocumenter: FileDocumenter; + queryMaker: TypeChatJsonTranslator; + answerMaker: TypeChatJsonTranslator; + + // The rest are asynchronously initialized by reInitialize(rootDir). + rootDir!: string; + answerFolder!: ObjectFolder; + chunkFolder!: ObjectFolder; + indexes!: Map>; + + private constructor() { + this.chatModel = openai.createChatModelDefault("spelunkerChat"); + this.miniModel = openai.createChatModel( + "GPT_35_TURBO", + undefined, + undefined, + ["spelunkerMini"], + ); + this.embeddingModel = knowLib.createEmbeddingCache( + openai.createEmbeddingModel(), + 1000, + ); + this.fileDocumenter = createFileDocumenter(this.chatModel); + this.queryMaker = createQueryMaker(this.chatModel); + this.answerMaker = createAnswerMaker(this.chatModel); + } + + static async createInstance(rootDir: string): Promise { + const instance = new ChunkyIndex(); + await instance.reInitialize(rootDir); + return instance; + } + + async reInitialize(rootDir: string): Promise { + const instance = this; // So makeIndex can see it. 
+        instance.rootDir = rootDir;
+        instance.chunkFolder = await createObjectFolder<Chunk>(
+            instance.rootDir + "/chunks",
+            { serializer: (obj) => JSON.stringify(obj, null, 2) },
+        );
+        instance.answerFolder = await createObjectFolder<AnswerSpecs>(
+            instance.rootDir + "/answers",
+            { serializer: (obj) => JSON.stringify(obj, null, 2) },
+        );
+        instance.indexes = new Map();
+        for (const name of IndexNames) {
+            instance.indexes.set(name, await makeIndex(name));
+        }
+
+        async function makeIndex(
+            name: string,
+        ): Promise<knowLib.TextIndex<string, ChunkId>> {
+            return await knowLib.createTextIndex(
+                {
+                    caseSensitive: false,
+                    concurrency: 4,
+                    semanticIndex: true,
+                    embeddingModel: instance.embeddingModel,
+                },
+                instance.rootDir + "/" + name,
+            );
+        }
+    }
+
+    getIndexByName(indexName: IndexType): knowLib.TextIndex<string, ChunkId> {
+        for (const [name, index] of this.allIndexes()) {
+            if (name === indexName) {
+                return index;
+            }
+        }
+        throw new Error(`Unknown index: ${indexName}`);
+    }
+
+    allIndexes(): NamedIndex[] {
+        return [...this.indexes.entries()];
+    }
+}
+
+function createQueryMaker(
+    model: ChatModel,
+): TypeChatJsonTranslator<QuerySpecs> {
+    const typeName = "QuerySpecs";
+    const schema = loadSchema(
+        ["pdfDocQuerySchema.ts", "pdfDocAnswerSchema.ts"],
+        import.meta.url,
+    );
+    const validator = createTypeScriptJsonValidator<QuerySpecs>(
+        schema,
+        typeName,
+    );
+    const translator = createJsonTranslator(model, validator);
+    return translator;
+}
+
+function createAnswerMaker(
+    model: ChatModel,
+): TypeChatJsonTranslator<AnswerSpecs> {
+    const typeName = "AnswerSpecs";
+    const schema = loadSchema(["pdfDocAnswerSchema.ts"], import.meta.url);
+    const validator = createTypeScriptJsonValidator<AnswerSpecs>(
+        schema,
+        typeName,
+    );
+    const translator = createJsonTranslator(model, validator);
+    return translator;
+}
diff --git a/ts/examples/docuProc/src/pdfDocAnswerSchema.ts b/ts/examples/docuProc/src/pdfDocAnswerSchema.ts
new file mode 100644
index 000000000..e65ea2189
--- /dev/null
+++ b/ts/examples/docuProc/src/pdfDocAnswerSchema.ts
@@ -0,0 +1,15 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Identifier for a chunk of the document.
+export type ChunkId = string;
+
+// Answer to the original question.
+export type AnswerSpecs = {
+    question: string; // Original question (e.g. "How can items be related")
+    answer: string; // Answer to the question. It is readable and complete, with suitable formatting (line breaks, bullet points etc.)
+
+    references: ChunkId[]; // Chunks that support this answer
+    confidence: number; // Between 0 and 1
+    message?: string; // Optional message to the user (e.g. for low confidence); might request more input
+};
diff --git a/ts/examples/docuProc/src/pdfDocChunkSchema.ts b/ts/examples/docuProc/src/pdfDocChunkSchema.ts
new file mode 100644
index 000000000..1b6e3714c
--- /dev/null
+++ b/ts/examples/docuProc/src/pdfDocChunkSchema.ts
@@ -0,0 +1,35 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Extracted information for a chunk of text.
+export type PdfDocChunk = {
+    // Optional file identifier for this chunk.
+    fileName?: string;
+
+    // Paragraph number in the file.
+    paraNumber?: number;
+
+    name: string;
+
+    // One-paragraph summary of the PDF chunk.
+    // Concise and informative.
+    summary: string;
+
+    // Propose keywords/phrases capturing the chunk's highlights,
+    // context, and notable topics. Make them concise but descriptive,
+    // ensuring users can find these points with common queries or synonyms.
+    keywords?: string[];
+
+    // Optional high-level labels (e.g., "transformer", "algorithm").
+ tags?: string[]; + + // Additional synonyms or related domain concepts. + synonyms?: string[]; + + // References to other chunks or external files. + dependencies?: string[]; +}; + +export type PdfFileDocumentation = { + chunkDocs?: PdfDocChunk[]; +}; diff --git a/ts/examples/docuProc/src/pdfDocQuerySchema.ts b/ts/examples/docuProc/src/pdfDocQuerySchema.ts new file mode 100644 index 000000000..4c1834df6 --- /dev/null +++ b/ts/examples/docuProc/src/pdfDocQuerySchema.ts @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { AnswerSpecs } from "./pdfDocAnswerSchema.js"; + +// Proposed query for one index. +export type QuerySpec = { + query: string; // Query the index for nearest neighbors to this + maxHits?: number; // Omit to use system default + confidence: number; // Between 0 and 1 (how much value you expect from this query) +}; + +// Proposed queries for some of the indexes; or a direct answer. +export type QuerySpecs = { + // Queries directed to various indexes. Comments describe what's in each index. + summaries?: QuerySpec; // A paragraph describing the paper paragraph + keywords?: QuerySpec; // Short key words and phrases extracted from the text + tags?: QuerySpec; // Optional high-level labels (e.g. "algorithmic", "scientific") + synonyms?: QuerySpec; // Additional synonyms or related domain concepts + dependencies?: QuerySpec; // External dependencies + + // If the question can be answered based on chat history and general knowledge. + answer?: AnswerSpecs; +}; diff --git a/ts/examples/docuProc/src/pdfFileDocumenter.ts b/ts/examples/docuProc/src/pdfFileDocumenter.ts new file mode 100644 index 000000000..87d43576e --- /dev/null +++ b/ts/examples/docuProc/src/pdfFileDocumenter.ts @@ -0,0 +1,97 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { ChatModel } from "aiclient"; +import { loadSchema } from "typeagent"; +import { createJsonTranslator, TypeChatJsonTranslator } from "typechat"; +import { createTypeScriptJsonValidator } from "typechat/ts"; + +import { PdfFileDocumentation } from "./pdfDocChunkSchema.js"; +import { Chunk } from "./pdfChunker.js"; + +// For various reasons we want to index chunks separately, +// but we want to produce their documentation in the context of the whole file. +// FileDocumenter.document(chunks) produces documentation comments +// and then assigns each comment to the appropriate chunk. + +// Document an entire file and assign comments to chunks. 
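+//
+// Hypothetical usage sketch (assumes a ChatModel instance and chunks produced by
+// chunkifyPdfFiles; names are illustrative only):
+//   const documenter = createFileDocumenter(chatModel);
+//   const fileDocs = await documenter.document(chunkedFile.chunks);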
+
+export interface FileDocumenter {
+    document(chunks: Chunk[]): Promise<PdfFileDocumentation>;
+}
+
+export function createFileDocumenter(model: ChatModel): FileDocumenter {
+    const fileDocTranslator = createFileDocTranslator(model);
+    return {
+        document,
+    };
+
+    async function document(chunks: Chunk[]): Promise<PdfFileDocumentation> {
+        let text = "";
+        for (const chunk of chunks) {
+            text += `***: Document the following ${chunk.treeName}:\n`;
+            for (const blob of chunk.blobs) {
+                for (let i = 0; i < blob.lines.length; i++) {
+                    text += `[${blob.start + i + 1}]: ${blob.lines[i]}\n`;
+                }
+            }
+        }
+        const request =
+            "Document the given PDF content: its purpose and any relevant details.\n" +
+            "The text has (non-contiguous) line numbers, e.g.: `[1]: Introduction`\n" +
+            "There are also marker lines, e.g.: `***: Document the following chunk`\n" +
+            "Write a concise paragraph for EACH marker.\n" +
+            "For example, the summary could be:\n" +
+            "```\n" +
+            "Section 2 describes the chunking pipeline used to split PDF files.\n" +
+            "It also summarizes the data structures the pipeline produces.\n" +
+            "```\n" +
+            "Also fill in the lists of keywords, tags, synonyms, and dependencies.\n";
+        const result = await fileDocTranslator.translate(request, text);
+
+        // Now assign each comment to its chunk.
+        if (result.success) {
+            const fileDocs: PdfFileDocumentation = result.data;
+            // (The chunkDoc-to-chunk assignment below is still disabled; see commented-out code.)
+            for (const chunkDoc of fileDocs.chunkDocs ?? []) {
+                console.log(chunkDoc.name);
+                for (const chunk of chunks) {
+                    for (const blob of chunk.blobs) {
+
+                        console.log(blob);
+                        // Reminder: blob.start is 0-based, comment.lineNumber is 1-based.
+                        /*if (
+                            !blob.breadcrumb &&
+                            blob.start < chunkDoc.lineNumber &&
+                            chunkDoc.lineNumber <=
+                                blob.start + blob.lines.length
+                        ) {
+                            const chunkDocs = chunk?.docs?.chunkDocs ?? [];
+                            chunkDocs.push(chunkDoc);
+                            chunk.docs = { chunkDocs };
+                        }*/
+                    }
+                }
+            }
+            return fileDocs;
+        } else {
+            throw new Error(result.message);
+        }
+    }
+}
+
+function createFileDocTranslator(
+    model: ChatModel,
+): TypeChatJsonTranslator<PdfFileDocumentation> {
+    const typeName = "PdfFileDocumentation";
+    const schema = loadSchema(["pdfDocChunkSchema.ts"], import.meta.url);
+    const validator = createTypeScriptJsonValidator<PdfFileDocumentation>(
+        schema,
+        typeName,
+    );
+    const translator = createJsonTranslator<PdfFileDocumentation>(
+        model,
+        validator,
+    );
+    return translator;
+}
diff --git a/ts/examples/docuProc/src/pdfImporter.ts b/ts/examples/docuProc/src/pdfImporter.ts
new file mode 100644
index 000000000..e2739238d
--- /dev/null
+++ b/ts/examples/docuProc/src/pdfImporter.ts
@@ -0,0 +1,305 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// TODO: Most of this is not Python specific; generalize to other languages.
+ +import chalk, { ChalkInstance } from "chalk"; +import * as fs from "fs"; +import * as knowLib from "knowledge-processor"; +import { asyncArray } from "typeagent"; + +import * as iapp from "interactive-app"; +import { ChunkyIndex, IndexNames } from "./pdfChunkyIndexer.js"; +import { PdfDocChunk, PdfFileDocumentation } from "./pdfDocChunkSchema.js"; +import { + Chunk, + ChunkedFile, + ChunkId, + chunkifyPdfFiles, + ErrorItem, +} from "./pdfChunker.js"; +import { purgeNormalizedFile } from "./pdfQNAInterface.js"; + +function log( + io: iapp.InteractiveIo | undefined, + message: string, + color: ChalkInstance, +): void { + message = color(message); + if (io) { + io.writer.writeLine(message); + } else { + console.log(message); + } +} + +export async function importAllFiles( + files: string[], + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo | undefined, + verbose: boolean, +): Promise { + log(io, `[Importing ${files.length} files]`, chalk.grey); + + const t0 = Date.now(); + await importPdfFiles(files, chunkyIndex, io, verbose); + const t1 = Date.now(); + + log( + io, + `[Imported ${files.length} files in ${((t1 - t0) * 0.001).toFixed(3)} seconds]`, + chalk.grey, + ); +} + +async function importPdfFiles( + files: string[], + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo | undefined, + verbose = false, +): Promise { + // Canonicalize filenames. + let filenames = files.map((file) => + fs.existsSync(file) ? fs.realpathSync(file) : file, + ); + + // Purge previous occurrences of these files. + for (const fileName of filenames) { + await purgeNormalizedFile(io, chunkyIndex, fileName, verbose); + } + + // Chunkify PDF files using a helper program. + const t0 = Date.now(); + const results = await chunkifyPdfFiles(filenames); + const t1 = Date.now(); + if (results.length !== filenames.length) { + log( + io, + `[Some over-long files were split into multiple partial files]`, + chalk.yellow, + ); + } + + // Print stats for chunkifying. + let numLines = 0; + let numBlobs = 0; + let numChunks = 0; + let numErrors = 0; + for (const result of results) { + if ("error" in result) { + numErrors++; + } else { + const chunkedFile = result; + numChunks += chunkedFile.chunks.length; + for (const chunk of chunkedFile.chunks) { + numBlobs += chunk.blobs.length; + for (const blob of chunk.blobs) { + numLines += blob.lines.length; + } + } + } + } + log( + io, + `[Chunked ${results.length} files ` + + `(${numLines} lines, ${numBlobs} blobs, ${numChunks} chunks, ${numErrors} errors) ` + + `in ${((t1 - t0) * 0.001).toFixed(3)} seconds]`, + chalk.gray, + ); + + const chunkingErrors = results.filter( + (result:any): result is ErrorItem => "error" in result, + ); + for (const error of chunkingErrors) { + log( + io, + `[Error: ${error.error}; Output: ${error.output ?? 
""}]`, + chalk.redBright, + ); + } + + const chunkedFiles = results.filter( + (result:any): result is ChunkedFile => "chunks" in result, + ); + log(io, `[Documenting ${chunkedFiles.length} files]`, chalk.grey); + + const tt0 = Date.now(); + const documentedFiles: PdfFileDocumentation[] = []; + const concurrency = 8; // TODO: Make this a function argument + let nChunks = 0; + await asyncArray.forEachAsync( + chunkedFiles, + concurrency, + async (chunkedFile) => { + const t0 = Date.now(); + let docs: PdfFileDocumentation; + nChunks += chunkedFile.chunks.length; + try { + docs = await exponentialBackoff( + io, + chunkyIndex.fileDocumenter.document, + chunkedFile.chunks, + ); + } catch (error) { + const t1 = Date.now(); + log( + io, + ` [Error documenting ${chunkedFile.fileName} in ${((t1 - t0) * 0.001).toFixed(3)} seconds: ${error}]`, + chalk.redBright, + ); + return; + } + const t1 = Date.now(); + + if (verbose) { + log( + io, + ` [Documented ${chunkedFile.chunks.length} chunks in ${((t1 - t0) * 0.001).toFixed(3)} seconds for ${chunkedFile.fileName}]`, + chalk.grey, + ); + } + documentedFiles.push(docs); + }, + ); + const tt1 = Date.now(); + + log( + io, + `[Documented ${documentedFiles.length} files (${nChunks} chunks) in ${((tt1 - tt0) * 0.001).toFixed(3)} seconds]`, + chalk.grey, + ); + + const nonEmptyFiles = chunkedFiles.filter( + (cf) => cf.chunks.filter((c) => c.docs).length, + ); + + log(io, `[Embedding ${nonEmptyFiles.length} files]`, chalk.grey); + + if (nonEmptyFiles.length) { + const ttt0 = Date.now(); + // Cannot parallelize this because of concurrent writing to TextIndex. + // TODO: Try pre-computing embeddings in parallel to fill the embeddings cache (is that cache safe?) + for (const chunkedFile of nonEmptyFiles) { + await embedChunkedFile(chunkedFile, chunkyIndex, io, verbose); + } + const ttt1 = Date.now(); + + log( + io, + `[Embedded ${documentedFiles.length} files in ${((ttt1 - ttt0) * 0.001).toFixed(3)} seconds]`, + chalk.grey, + ); + } +} + +export async function embedChunkedFile( + chunkedFile: ChunkedFile, + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo | undefined, + verbose = false, +): Promise { + const chunks: Chunk[] = chunkedFile.chunks; + if (chunks.length === 0) { + log(io, `[Skipping empty file ${chunkedFile.fileName}]`, chalk.yellow); + return; + } + const t0 = Date.now(); + for (const chunk of chunkedFile.chunks) { + await embedChunk(chunk, chunkyIndex, io, verbose); + } + const t1 = Date.now(); + if (verbose) { + log( + io, + ` [Embedded ${chunks.length} chunks in ${((t1 - t0) * 0.001).toFixed(3)} seconds for ${chunkedFile.fileName}]`, + chalk.grey, + ); + } +} + +async function embedChunk( + chunk: Chunk, + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo | undefined, + verbose = false, +): Promise { + const t0 = Date.now(); + const lineCount = chunk.blobs.reduce( + (acc, blob) => acc + blob.lines.length, + 0, + ); + await exponentialBackoff(io, chunkyIndex.chunkFolder.put, chunk, chunk.id); + + const summaries: string[] = []; + const chunkDocs: PdfDocChunk[] = chunk.docs?.chunkDocs ?? 
[]; + for (const chunkDoc of chunkDocs) { + summaries.push(chunkDoc.summary); + } + const combinedSummaries = summaries.join("\n").trimEnd(); + + for (const chunkDoc of chunkDocs) { + for (const indexName of IndexNames) { + let data: string[]; + if (indexName == "summaries") { + data = [combinedSummaries]; + } else { + data = (chunkDoc as any)[indexName]; + } + const index = chunkyIndex.indexes.get(indexName)!; + if (data && index) { + await writeToIndex(io, chunk.id, data, index); + } + } + } + + const t1 = Date.now(); + if (verbose) { + log( + io, + ` [Embedded ${chunk.id} (${lineCount} lines @ ${chunk.blobs[0].start + 1}) ` + + `in ${((t1 - t0) * 0.001).toFixed(3)} seconds for ${chunk.fileName}]`, + chalk.gray, + ); + } +} + +async function writeToIndex( + io: iapp.InteractiveIo | undefined, + chunkId: ChunkId, + phrases: string[] | undefined, // List of summaries, keywords, tags, etc. in chunk + index: knowLib.TextIndex, +) { + for (const phrase of phrases ?? []) { + await exponentialBackoff(io, index.put, phrase, [chunkId]); + } +} + +async function exponentialBackoff( + io: iapp.InteractiveIo | undefined, + callable: (...args: T) => Promise, + ...args: T +): Promise { + let timeout = 1; + for (;;) { + try { + return await callable(...args); + } catch (error) { + if (timeout > 1000) { + log(io, `[Error: ${error}; giving up]`, chalk.redBright); + throw error; + } + log( + io, + `[Error: ${error}; retrying in ${timeout} ms]`, + chalk.redBright, + ); + await new Promise((resolve) => setTimeout(resolve, timeout)); + timeout *= 2; + } + } +} + +// Apply URL escaping to key. NOTE: Currently unused. TODO: Therefore remove. +export function sanitizeKey(key: string): string { + return encodeURIComponent(key).replace(/%20/g, "+"); // Encode spaces as plus, others as %xx. +} diff --git a/ts/examples/docuProc/src/pdfQNAInterface.ts b/ts/examples/docuProc/src/pdfQNAInterface.ts new file mode 100644 index 000000000..dcdf4456c --- /dev/null +++ b/ts/examples/docuProc/src/pdfQNAInterface.ts @@ -0,0 +1,1113 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// User interface for querying the index. 
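+// Interactive commands include: import, clearMemory, chunk, search, summaries,
+// keywords, tags, synonyms, dependencies, files, and purgeFile.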
+ +import chalk, { ChalkInstance } from "chalk"; +import * as fs from "fs"; +import * as util from "util"; + +import * as iapp from "interactive-app"; +import * as knowLib from "knowledge-processor"; +import { NameValue, ScoredItem } from "typeagent"; + +import { IndexType, ChunkyIndex } from "./pdfChunkyIndexer.js"; +import { QuerySpec, QuerySpecs } from "./pdfDocQuerySchema.js"; +import { Chunk, ChunkId } from "./pdfChunker.js"; +import { importAllFiles } from "./pdfImporter.js"; +import { AnswerSpecs } from "./pdfDocAnswerSchema.js"; +import { PromptSection } from "typechat"; + +type QueryOptions = { + maxHits: number; + minScore: number; + verbose: boolean; +}; + +function writeColor( + io: iapp.InteractiveIo | undefined, + color: ChalkInstance, + message: string, +): void { + message = color(message); + if (io) { + io.writer.writeLine(message); + } else { + console.log(message); + } +} + +function writeNote(io: iapp.InteractiveIo | undefined, message: string): void { + writeColor(io, chalk.gray, message); +} + +function writeMain(io: iapp.InteractiveIo | undefined, message: string): void { + writeColor(io, chalk.white, message); +} + +function writeWarning( + io: iapp.InteractiveIo | undefined, + message: string, +): void { + writeColor(io, chalk.yellow, message); +} + +function writeError(io: iapp.InteractiveIo | undefined, message: string): void { + writeColor(io, chalk.redBright, message); +} + +function writeHeading( + io: iapp.InteractiveIo | undefined, + message: string, +): void { + writeColor(io, chalk.green, message); +} + +export async function interactiveQueryLoop( + chunkyIndex: ChunkyIndex, + verbose = false, +): Promise { + const handlers: Record = { + import: importHandler, // Since you can't name a function "import". + clearMemory, + chunk, + search, + summaries, + keywords, + tags, + synonyms, + dependencies, + files, + purgeFile, + }; + iapp.addStandardHandlers(handlers); + + function importDef(): iapp.CommandMetadata { + return { + description: "Import a file or files of Python code.", + options: { + fileName: { + description: + "File to import (or multiple files separated by commas)", + type: "string", + }, + files: { + description: "File containing the list of files to import", + type: "string", + }, + verbose: { + description: "More verbose output", + type: "boolean", + defaultValue: verbose, + }, + }, + }; + } + handlers.import.metadata = importDef(); + async function importHandler( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + const namedArgs = iapp.parseNamedArguments(args, importDef()); + const files = namedArgs.fileName + ? (namedArgs.fileName as string).trim().split(",") + : namedArgs.files + ? fs + .readFileSync(namedArgs.files as string, "utf-8") + .split("\n") + .map((line) => line.trim()) + .filter((line) => line.length > 0 && line[0] !== "#") + : []; + if (!files.length) { + writeError(io, "[No files to import (use --? for help)]"); + return; + } + await importAllFiles(files, chunkyIndex, io, namedArgs.verbose); + } + + handlers.clearMemory.metadata = "Clear all memory (and all indexes)"; + async function clearMemory( + args: string[], + io: iapp.InteractiveIo, + ): Promise { + await fs.promises.rm(chunkyIndex.rootDir, { + recursive: true, + force: true, + }); + await chunkyIndex.reInitialize(chunkyIndex.rootDir); + writeNote(io, "[All memory and all indexes cleared]"); + // Actually the embeddings cache isn't. But we shouldn't have to care. 
+ } + + handlers.chunk.metadata = "Print one or more chunks"; + async function chunk( + args: string[], + io: iapp.InteractiveIo, + ): Promise { + const joinedArgs = args.join(" "); + const splitArgs = joinedArgs.trim().split(/[\s,]+/); + for (const chunkId of splitArgs) { + const chunk = await chunkyIndex.chunkFolder.get(chunkId); + if (chunk) { + const chunkDocs = chunk.docs?.chunkDocs ?? []; + writeNote(io, `\nCHUNK ID: ${chunkId}`); + for (const chunkDoc of chunkDocs) { + for (const [name, _] of chunkyIndex.allIndexes()) { + if (name == "summaries") { + if (chunkDoc.summary) { + writeNote(io, "SUMMARY:"); + writeMain( + io, + // Indent by two + wordWrap(chunkDoc.summary).replace( + /^/gm, + " ", + ), + ); + } else { + writeNote(io, "SUMMARY: None"); + } + } else { + const docItem: string[] | undefined = ( + chunkDoc as any + )[name]; + if (docItem?.length) { + writeNote( + io, + `${name.toUpperCase()}: ${docItem.join(", ")}`, + ); + } + } + } + } + writeNote(io, "CODE:"); + writeChunkLines(chunk, io, 100); + } else { + writeNote(io, `[Chunk ID ${chunkId} not found]`); + } + } + } + + function searchDef(): iapp.CommandMetadata { + return { + description: "Search for a query string in the code index.", + args: { + query: { + description: "Natural language query", + type: "string", + }, + }, + options: { + maxHits: { + description: "Maximum number of hits to return", + type: "integer", + defaultValue: 10, + }, + minScore: { + description: "Minimum score to return", + type: "number", + defaultValue: 0.7, + }, + verbose: { + description: "More verbose output", + type: "boolean", + defaultValue: verbose, + }, + }, + }; + } + handlers.search.metadata = searchDef(); + async function search( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + const namedArgs = iapp.parseNamedArguments(args, searchDef()); + const query: string = namedArgs.query; + const queryOptions: QueryOptions = { + maxHits: namedArgs.maxHits, + minScore: namedArgs.minScore, + verbose: namedArgs.verbose, + }; + await processQuery(query, chunkyIndex, io, queryOptions); + } + + function commonOptions(): Record { + return { + verbose: { + description: "More verbose output", + type: "boolean", + defaultValue: verbose, + }, + filter: { + description: "Filter by keyword", + type: "string", + }, + query: { + description: "Natural language query", + type: "string", + }, + maxHits: { + description: + "Maximum number of hits to return (only for query)", + type: "integer", + defaultValue: 10, + }, + minScore: { + description: "Minimum score to return (only for query)", + type: "number", + defaultValue: 0.7, + }, + debug: { + description: "Show debug output", + type: "boolean", + }, + }; + } + + function summariesDef(): iapp.CommandMetadata { + return { + description: "Show all recorded summaries and their postings.", + options: commonOptions(), + }; + } + handlers.summaries.metadata = summariesDef(); + async function summaries( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + await _reportIndex(args, io, "summaries"); + } + + function keywordsDef(): iapp.CommandMetadata { + return { + description: "Show all recorded keywords and their postings.", + options: commonOptions(), + }; + } + handlers.keywords.metadata = keywordsDef(); + async function keywords( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + await _reportIndex(args, io, "keywords"); + } + + function tagsDef(): iapp.CommandMetadata { + return { + description: "Show all recorded tags and their 
postings.", + options: commonOptions(), + }; + } + handlers.tags.metadata = tagsDef(); + async function tags( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + await _reportIndex(args, io, "tags"); + } + + function synonymsDef(): iapp.CommandMetadata { + return { + description: "Show all recorded synonyms and their postings.", + options: commonOptions(), + }; + } + handlers.synonyms.metadata = synonymsDef(); + async function synonyms( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + await _reportIndex(args, io, "synonyms"); + } + + function dependenciesDef(): iapp.CommandMetadata { + return { + description: "Show all recorded dependencies and their postings.", + options: commonOptions(), + }; + } + handlers.dependencies.metadata = dependenciesDef(); + async function dependencies( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + await _reportIndex(args, io, "dependencies"); + } + + function filesDef(): iapp.CommandMetadata { + return { + description: "Show all recorded file names.", + options: { + filter: { + description: "Only show files containing this string", + type: "string", + }, + }, + }; + } + handlers.files.metadata = filesDef(); + async function files( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + const namedArgs = iapp.parseNamedArguments(args, filesDef()); + const filter = namedArgs.filter; + const filesPopularity: Map = new Map(); + for await (const chunk of chunkyIndex.chunkFolder.allObjects()) { + filesPopularity.set( + chunk.fileName, + (filesPopularity.get(chunk.fileName) ?? 0) + 1, + ); + } + if (!filesPopularity.size) { + writeMain(io, "[No files]"); + } else { + const sortedFiles = Array.from(filesPopularity) + .filter(([file, _]) => !filter || file.includes(filter)) + .sort(); + writeNote( + io, + `Found ${sortedFiles.length} ${filter ? "matching" : "total"} files.`, + ); + for (const [file, count] of sortedFiles) { + writeMain( + io, + `${chalk.blue(count.toFixed(0).padStart(7))} ${chalk.green(file)}`, + ); + } + } + } + + function purgeFileDef(): iapp.CommandMetadata { + return { + description: "Purge all mentions of a file.", + args: { + fileName: { + description: "File to purge", + type: "string", + }, + }, + options: { + verbose: { + description: "More verbose output", + type: "boolean", + defaultValue: verbose, + }, + }, + }; + } + handlers.purgeFile.metadata = purgeFileDef(); + async function purgeFile( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + ): Promise { + const namedArgs = iapp.parseNamedArguments(args, purgeFileDef()); + const file = namedArgs.fileName as string; + const fileName = fs.existsSync(file) ? 
fs.realpathSync(file) : file; + await purgeNormalizedFile(io, chunkyIndex, fileName, namedArgs.verbose); + } + + async function _reportIndex( + args: string[] | iapp.NamedArgs, + io: iapp.InteractiveIo, + indexName: IndexType, + ): Promise { + const namedArgs = iapp.parseNamedArguments(args, keywordsDef()); + const index = chunkyIndex.getIndexByName(indexName); + if (namedArgs.debug) { + writeNote(io, `[Debug: ${indexName}]`); + await _debugIndex(io, index, indexName, verbose); + return; + } + + let matches: ScoredItem>[] = []; + if (namedArgs.query) { + matches = await index.nearestNeighborsPairs( + namedArgs.query, + namedArgs.maxHits, + namedArgs.minScore, + ); + } else { + for await (const textBlock of index.entries()) { + matches.push({ + score: 1, + item: textBlock, + }); + } + } + + let hits: ScoredItem>[] = []; + if (!namedArgs.filter) { + hits = matches; + } else { + const filterWords: string[] = (namedArgs.filter as string) + .split(" ") + .filter(Boolean) + .map((w) => w.toLowerCase()); + for await (const match of matches) { + const valueWords: string[] = match.item.value + .split(/\s+/) + .filter(Boolean) + .map((w) => w.toLowerCase()); + if (filterWords.every((word) => valueWords.includes(word))) { + hits.push(match); + } + } + } + + if (!hits.length) { + writeNote(io, `No ${indexName}.`); // E.g., "No keywords." + return; + } else { + writeNote(io, `Found ${hits.length} ${indexName}.`); + + // TFIDF = TF(t) * IDF(t) = 1 * log(N / (1 + nt)) + // - t is a term (in other contexts, a term occurring in a given chunk) + // - nt the number of chunks occurring for that term in this index + // - N the total number of chunks in the system + const numChunks = await chunkyIndex.chunkFolder.size(); + for (const hit of hits) { + const numSourceIds: number = hit.item.sourceIds?.length ?? 0; + hit.score *= Math.log(numChunks / (1 + numSourceIds)); + } + + hits.sort((a, b) => { + if (a.score != b.score) return b.score - a.score; + return a.item.value.localeCompare(b.item.value); + }); + for (const hit of hits) { + writeMain( + io, + `${hit.score.toFixed(3).padStart(7)}: ${chalk.green(hit.item.value)} :: ${(hit.item.sourceIds ?? []).join(", ")}`, + ); + } + } + } + + async function _debugIndex( + io: iapp.InteractiveIo, + index: knowLib.TextIndex, + indexName: string, + verbose: boolean, + ): Promise { + const allTexts = Array.from(index.text()); + for (const text of allTexts) { + if (verbose) writeNote(io, `Text: ${text}`); + const hits = await index.nearestNeighborsPairs( + text, + allTexts.length, + ); + if (verbose) { + for (const hit of hits) { + writeNote( + io, + `${hit.score.toFixed(3).padStart(7)} ${hit.item.value}`, + ); + } + } + if (hits.length < 2) { + writeNote(io, `No hits for ${text} in ${indexName}`); + } else { + const end = hits.length - 1; + writeMain( + io, + `${chalk.green(hits[0].item.value)}: ${chalk.blue(hits[1].item.value)} (${hits[1].score.toFixed(3)}) -- ` + + `${chalk.blue(hits[end].item.value)} (${hits[end].score.toFixed(3)})`, + ); + } + } + } + + async function _inputHandler( + input: string, + io: iapp.InteractiveIo, + ): Promise { + await processQuery(input, chunkyIndex, io, { + maxHits: 10, + minScore: 0.7, + verbose, + }); + } + + await iapp.runConsole({ + inputHandler: _inputHandler, + handlers, + prompt: "\n🤖> ", + }); +} + +export async function purgeNormalizedFile( + io: iapp.InteractiveIo | undefined, + chunkyIndex: ChunkyIndex, + fileName: string, + verbose: boolean, +): Promise { + // Step 1: Find chunks to remove. 
+ let toDelete: Set = new Set(); + for await (const chunk of chunkyIndex.chunkFolder.allObjects()) { + if (chunk.fileName === fileName) { + toDelete.add(chunk.id); + } + } + + // Step 1a: Logging and early return if nothing to purge. + if (!toDelete.size) { + writeNote(io, `[No chunks to purge for file ${fileName}]`); + return; + } + writeNote( + io, + `[Need to purge ${toDelete.size} chunks for file ${fileName}]`, + ); + + // Step 2: Remove chunk ids from indexes. + const chunkIdsToDelete: ChunkId[] = Array.from(toDelete); + for (const [name, index] of chunkyIndex.allIndexes()) { + const affectedValues: string[] = []; + // Collect values from which we need to remove the chunk ids about to be deleted. + for await (const textBlock of index.entries()) { + if ( + textBlock?.sourceIds?.some((id) => + chunkIdsToDelete.includes(id), + ) + ) { + if (verbose) { + writeNote(io, `[Purging ${name} entry ${textBlock.value}]`); + } + affectedValues.push(textBlock.value); + } + } + // Actually update the index (can't modify it while it's being iterated over). + for (const value of affectedValues) { + const id = await index.getId(value); + if (!id) { + writeWarning(io, `[No id for value {value}]`); + } else { + await index.remove(id, chunkIdsToDelete); + } + } + writeNote(io, `[Purged ${affectedValues.length} ${name}]`); // name is plural, e.g. "keywords". + } + + // Step 3: Remove chunks (do this last so if step 2 fails we can try again). + for (const id of toDelete) { + if (verbose) { + writeNote(io, `[Purging chunk ${id}]`); + } + await chunkyIndex.chunkFolder.remove(id); + } + writeNote(io, `[Purged ${toDelete.size} chunks]`); +} + +async function processQuery( + input: string, + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo, + queryOptions: QueryOptions, +): Promise { + // ** Step 0:** Find most recent answers. + + const recentAnswers: NameValue[] = await findRecentAnswers( + input, + chunkyIndex, + ); + + // **Step 1:** Ask LLM (queryMaker) to propose queries for each index. + + const proposedQueries = await proposeQueries( + input, + recentAnswers, + chunkyIndex, + io, + queryOptions, + ); + if (!proposedQueries) return; // Error message already printed by proposeQueries. + + // Step 1a: If step 1 gave the answer, print it and exit. + if ("answer" in proposedQueries) { + await chunkyIndex.answerFolder.put(proposedQueries.answer); + reportQuery( + undefined, + undefined, + proposedQueries.answer, + io, + queryOptions.verbose, + ); + writeNote(io, "[Answer produced by stage 1]"); + return; + } + + // **Step 2:** Run those queries on the indexes. + + const [hitsByIndex, chunkIdScores] = await runIndexQueries( + proposedQueries, + chunkyIndex, + io, + queryOptions, + ); + if (!chunkIdScores) return; // Error message already printed by runIndexQueries. + + // **Step 3:** Ask the LLM (answerMaker) to answer the question. + + const answer = await generateAnswer( + input, + hitsByIndex, + chunkIdScores, + recentAnswers, + chunkyIndex, + io, + queryOptions, + ); + if (!answer) return; // Error message already printed by generateAnswer. + + // **Step 4:** Print the answer. Also record it for posterity. 
+ + await chunkyIndex.answerFolder.put(answer); + reportQuery(hitsByIndex, chunkIdScores, answer, io, queryOptions.verbose); +} + +async function proposeQueries( + input: string, + recentAnswers: NameValue[], + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo, + queryOptions: QueryOptions, +): Promise { + const t0 = Date.now(); + + const promptPreamble = makeQueryMakerPrompt(recentAnswers); + if (queryOptions.verbose) { + for (const section of promptPreamble) { + writeNote(io, `[${section.role}: ${section.content}]`); + } + } + const result = await chunkyIndex.queryMaker.translate( + input, + promptPreamble, + ); + if (!result.success) { + const t1 = Date.now(); + writeError( + io, + `[Error: ${result.message} in ${((t1 - t0) * 0.001).toFixed(3)} seconds]`, + ); + return undefined; + } + const specs = result.data; + if (queryOptions.verbose) { + writeNote(io, `\n[Result: proposed queries]`); + // Use util.inspect() to colorize JSON; writeColor() doesn't do that. + io.writer.writeLine( + util.inspect(specs, { depth: null, colors: true, compact: false }), + ); + } + const t1 = Date.now(); + writeNote( + io, + `[proposeQueries took ${((t1 - t0) * 0.001).toFixed(3)} seconds]`, + ); + + return specs; +} + +type HitsMap = Map>[]>; // indexName -> hits from nearestNeighborsPairs + +async function runIndexQueries( + proposedQueries: QuerySpecs, + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo, + queryOptions: QueryOptions, +): Promise<[HitsMap, Map> | undefined]> { + const t0 = Date.now(); + + const hitsByIndex: HitsMap = new Map(); + const chunkIdScores: Map> = new Map(); // Record score of each chunk id. + const totalNumChunks = await chunkyIndex.chunkFolder.size(); // Nominator in IDF calculation. + + for (const [indexName, index] of chunkyIndex.allIndexes()) { + const spec: QuerySpec | undefined = (proposedQueries as any)[indexName]; + if (spec === undefined) { + writeNote(io, `[No query specified for ${indexName}]`); + continue; + } + + const specMaxHits = spec.maxHits; + const defaultMaxHits = queryOptions.maxHits; + const maxHits = specMaxHits ?? defaultMaxHits; + const maxHitsDisplay = + maxHits === specMaxHits + ? maxHits.toString() + : `${specMaxHits} ?? ${defaultMaxHits}`; + + const hits = await index.nearestNeighborsPairs( + spec.query, + maxHits, + queryOptions.minScore, + ); + if (!hits.length) { + writeNote( + io, + `[${indexName}: query ${spec.query} (maxHits ${maxHitsDisplay}) no hits]`, + ); + continue; + } + hitsByIndex.set(indexName, hits); + + // Update chunk id scores. + for (const hit of hits) { + // Literature suggests setting TF = 1 in this case, + // but the term's relevance score intuitively makes sense. + const tf = hit.score; + // IDF calculation ("inverse document frequency smooth"). + const fraction = + totalNumChunks / (1 + (hit.item.sourceIds?.length ?? 0)); + const idf = 1 + Math.log(fraction); + const newScore = tf * idf; + for (const chunkId of hit.item.sourceIds ?? []) { + const oldScoredItem = chunkIdScores.get(chunkId); + const oldScore = oldScoredItem?.score ?? 0; + // Combine scores by addition. (Alternatives: max, possibly others.) + const combinedScore = oldScore + newScore; + if (oldScoredItem) { + oldScoredItem.score = combinedScore; + } else { + chunkIdScores.set(chunkId, { + score: oldScore + newScore, + item: chunkId, + }); + } + } + } + + // Verbose logging. 
+ if (queryOptions.verbose) { + writeNote( + io, + `\nFound ${hits.length} ${indexName} for '${spec.query}':`, + ); + for (const hit of hits) { + writeNote( + io, + `${hit.score.toFixed(3).padStart(7)}: ${hit.item.value} -- ${hit.item.sourceIds?.join(", ")}`, + ); + } + } + + // Regular logging. + const numChunks = new Set(hits.flatMap((h) => h.item.sourceIds)).size; + const end = hits.length - 1; + writeNote( + io, + `[${indexName}: query '${spec.query}' (maxHits ${maxHitsDisplay}); ${hits.length} hits; ` + + `scores ${hits[0].score.toFixed(3)}--${hits[end].score.toFixed(3)}; ` + + `${numChunks} unique chunk ids]`, + ); + } + + const t1 = Date.now(); + writeNote( + io, + `[runIndexQueries took ${((t1 - t0) * 0.001).toFixed(3)} seconds]`, + ); + + return [hitsByIndex, chunkIdScores]; +} + +async function generateAnswer( + input: string, + hitsByIndex: HitsMap, + chunkIdScores: Map>, + recentAnswers: NameValue[], + chunkyIndex: ChunkyIndex, + io: iapp.InteractiveIo, + queryOptions: QueryOptions, +): Promise { + const t0 = Date.now(); + + // Step 3a: Compute array of ids sorted by score, truncated to some limit. + const scoredChunkIds: ScoredItem[] = Array.from( + chunkIdScores.values(), + ); + + writeNote(io, `\n[Overall ${scoredChunkIds.length} unique chunk ids]`); + + scoredChunkIds.sort((a, b) => b.score - a.score); + + // Step 3b: Get the chunks themselves. + const chunks: Chunk[] = []; + const maxChunks = 30; + // Take the top N chunks that actually exist. + for (const scoredChunkId of scoredChunkIds) { + const maybeChunk = await chunkyIndex.chunkFolder.get( + scoredChunkId.item, + ); + if (maybeChunk) { + chunks.push(maybeChunk); + if (queryOptions.verbose) { + writeVerboseReferences( + io, + [scoredChunkId.item], + hitsByIndex, + chunkIdScores, + ); + } + if (chunks.length >= maxChunks) { + break; + } + } else { + writeNote(io, `[Chunk ${scoredChunkId.item} not found]`); + } + } + + writeNote(io, `[Sending ${chunks.length} chunks to answerMaker]`); + + const preamble = makeAnswerPrompt(recentAnswers, chunks); + if (queryOptions.verbose) { + const formatted = util.inspect(preamble, { + depth: null, + colors: true, + compact: false, + }); + writeNote(io, `Preamble: ${formatted}`); + } + + // Step 3c: Make the request and check for success. + const answerResult = await chunkyIndex.answerMaker.translate( + input, + preamble, + ); + + if (!answerResult.success) { + writeError(io, `[Error: ${answerResult.message}]`); + return undefined; + } + + if (queryOptions.verbose) { + writeNote( + io, + `AnswerResult: ${JSON.stringify(answerResult.data, null, 2)}`, + ); + } + + const t1 = Date.now(); + writeNote( + io, + `[generateAnswer took ${((t1 - t0) * 0.001).toFixed(3)} seconds]`, + ); + + return answerResult.data; +} + +async function findRecentAnswers( + input: string, + chunkyIndex: ChunkyIndex, +): Promise[]> { + // TODO: Allow for multiple concurrent sessions. + const recentAnswers: NameValue[] = []; + for await (const answer of chunkyIndex.answerFolder.all()) { + recentAnswers.push(answer); + } + // Assume the name field (the internal key) is a timestamp. + recentAnswers.sort((a, b) => b.name.localeCompare(a.name)); + recentAnswers.splice(20); // TODO: Cut off by total size, not count. + recentAnswers.reverse(); // Most recent last. 
+ return recentAnswers; +} + +function reportQuery( + hitsByIndex: HitsMap | undefined, + chunkIdScores: Map> | undefined, + answer: AnswerSpecs, + io: iapp.InteractiveIo, + verbose: boolean, +): void { + writeHeading( + io, + `\nAnswer (confidence ${answer.confidence.toFixed(3).replace(/0+$/, "")}):`, + ); + + writeMain(io, wordWrap(answer.answer)); + + if (answer.message) { + writeWarning(io, "\n" + wordWrap(`Message: ${answer.message}`)); + } + if (answer.references.length) { + writeNote( + io, + `\nReferences (${answer.references.length}): ${answer.references.join(",").replace(/,/g, ", ")}`, + ); + if (verbose && chunkIdScores && hitsByIndex) { + writeVerboseReferences( + io, + answer.references, + hitsByIndex, + chunkIdScores, + ); + } + } + // NOTE: If the user wants to see the contents of any chunk, they can use the @chunk command. +} + +function writeVerboseReferences( + io: iapp.InteractiveIo, + references: ChunkId[], + hitsByIndex: HitsMap, + chunkIdScores: Map>, +): void { + for (const ref of references) { + const score = chunkIdScores.get(ref)?.score ?? 0; + writeColor( + io, + chalk.blue, + ` ${ref} (${chalk.white(score.toFixed(3))})`, + ); + for (const indexName of hitsByIndex.keys()) { + const hits = hitsByIndex.get(indexName) ?? []; + for (const hit of hits) { + if (hit.item.sourceIds?.includes(ref)) { + writeColor( + io, + chalk.green, + ` ${chalk.gray(indexName)}: ${hit.item.value} (${chalk.white(hit.score.toFixed(3))})`, + ); + } + } + } + } +} + +function makeQueryMakerPrompt( + recentAnswers: NameValue[], +): PromptSection[] { + const prompt = ` +I have a code project split up in chunks indexed on several categories. +Please produce suitable queries for each applicable index based on +conversation history (especially if the query refers to a previous answer +indirectly, e.g. via "it" or "that"), and the user question given later. +Don't suggest "meta" queries about the conversation itself -- only the code is indexed. +`; + return makeAnyPrompt(recentAnswers, prompt); +} + +function makeAnswerPrompt( + recentAnswers: NameValue[], + chunks: Chunk[], +): PromptSection[] { + const prompt = `\ +Following are the chunks most relevant to the query. +Use the preceding conversation items as context for the user query given later. +`; + + const preamble = makeAnyPrompt(recentAnswers, prompt); + for (const chunk of chunks) { + const chunkData = { + fileName: chunk.fileName, + chunkId: chunk.id, + blobs: chunk.blobs, + summary: chunk.docs?.chunkDocs + ?.filter((cd) => cd.summary) + .map((cd) => cd.summary) + .join("\n"), + }; + preamble.push({ role: "user", content: JSON.stringify(chunkData) }); + } + return preamble; +} + +function makeAnyPrompt( + recentAnswers: NameValue[], + prompt: string, +): PromptSection[] { + const preamble: PromptSection[] = []; + for (const answer of recentAnswers) { + preamble.push({ role: "user", content: answer.value.question }); + preamble.push({ role: "assistant", content: answer.value.answer }); + } + preamble.push({ role: "user", content: prompt.trim() }); + return preamble; +} + +function writeChunkLines( + chunk: Chunk, + io: iapp.InteractiveIo, + lineBudget = 10, +): void { + // TODO: limit how much we write per blob too (if there are multiple). + outer: for (const blob of chunk.blobs) { + for (let i = 0; i < blob.lines.length; i++) { + if (lineBudget-- <= 0) { + writeNote(io, " ..."); + break outer; + } + writeMain( + io, + `${(1 + blob.start + i).toString().padStart(6)}: ${blob.lines[i].trimEnd()}`, + ); + } + } +} + +// Wrap long lines. 
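+// Breaks each line at word boundaries near wrapLength, re-indents continuation
+// lines to align under list-item prefixes, and leaves fenced code blocks unwrapped.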
+export function wordWrap(text: string, wrapLength: number = 80): string { + let inCodeBlock = false; + const lines: string[] = []; + const prefixRegex = /^\s*((-|\*|\d+\.)\s+)?/; + for (let line of text.split(/[ ]*\r?\n/)) { + if (line.startsWith("```")) inCodeBlock = !inCodeBlock; // TODO: Colorize code blocks. + if (line.length <= wrapLength || inCodeBlock) { + // The whole line is deemed to fit. + lines.push(line); + continue; + } + // We must try to break. + const prefixLength = prefixRegex.exec(line)?.[0]?.length ?? 0; + const indent = " ".repeat(prefixLength); + while (line.length > wrapLength) { + const shortenedLine = line.slice(0, wrapLength + 1); + let index = shortenedLine.lastIndexOf(" "); + if (index <= prefixLength) { + index = line.indexOf(" ", wrapLength); + if (index < 0) break; // The rest of the line is one "word". + } + lines.push(line.slice(0, index).trimEnd()); + line = indent + line.slice(index + 1).trimStart(); + } + lines.push(line); + } + return lines.join("\n"); +} + +export function testWordWrap(): void { + const sampleText = `\ +This is a long line that should be wrapped at some point. It's not clear where. + This is another long line but it is also indented. Let's make sure the breaks are also indented. + - This is a bullet point that should be wrapped at some point. It's not clear where. + 12. This is a numbered point that should be wrapped at some point. It's not clear where. +\`\`\`python +def generate_id() -> IdType: + """Generate a new unique ID. + + IDs are really timestamps formatted as YYYY_MM_DD-HH_MM_SS.UUUUUU, + where UUUUUU is microseconds. + + To ensure IDs are unique, if the next timestamp isn't greater than the last one, + we add 1 usec to the last one. This has the advantage of "gracefully" handling + time going backwards. + """ + global last_ts + next_ts = datetime.datetime.now() # Local time, for now + if next_ts <= last_ts: + next_ts = last_ts + datetime.timedelta(microseconds=1) + last_ts = next_ts + return next_ts.strftime("%Y%m%d-%H%M%S.%f") +\`\`\` +This is a short line. + * A short bullet. + * Another short one. + A short indented line. +End. +`; + console.log(wordWrap(sampleText, 40)); +} + +// testWordWrap();