Skip to content

Commit

Permalink
knowpro: message indexing, bugs (#794)
Browse files Browse the repository at this point in the history
- Starting message indexing. 
- Bug fixes to message matching.
- Deserialization bug in importPodcast
  • Loading branch information
umeshma authored Mar 6, 2025
1 parent b3a5385 commit c0b5d91
Show file tree
Hide file tree
Showing 9 changed files with 179 additions and 12 deletions.
36 changes: 34 additions & 2 deletions ts/examples/chat/src/memory/knowproMemory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ export async function createKnowproCommands(
commands.kpSearch = search;
commands.kpEntities = entities;
commands.kpPodcastBuildIndex = podcastBuildIndex;
commands.kpPodcastBuildMessageIndex = podcastBuildMessageIndex;

commands.kpImages = showImages;
commands.kpImagesImport = imagesImport;
Expand Down Expand Up @@ -539,7 +540,7 @@ export async function createKnowproCommands(
if (filter) {
context.printer.writeJson(filter, true);
}
if (searchResults) {
if (searchResults && searchResults.messageMatches.length > 0) {
if (namedArgs.showKnowledge) {
context.printer.writeKnowledgeSearchResults(
context.conversation!,
Expand Down Expand Up @@ -715,6 +716,37 @@ export async function createKnowproCommands(
progress.complete();
}

function podcastBuildMessageIndexDef(): CommandMetadata {
return {
description: "Build fuzzy message index for the podcast",
options: {
maxMessages: argNum("Maximum messages to index"),
batchSize: argNum("Batch size", 4),
},
};
}
commands.kpPodcastBuildMessageIndex.metadata =
podcastBuildMessageIndexDef();
async function podcastBuildMessageIndex(args: string[]): Promise<void> {
const namedArgs = parseNamedArguments(
args,
podcastBuildMessageIndexDef(),
);
context.printer.writeLine(
`Indexing ${context.conversation?.messages.length} messages`,
);
let progress = new ProgressBar(context.printer, namedArgs.maxMessages);
await context.podcast?.buildMessageIndex(
createIndexingEventHandler(
context,
progress,
namedArgs.maxMessages,
),
namedArgs.batchSize,
);
progress.complete();
}

function imageCollectionBuildIndexDef(): CommandMetadata {
return {
description: "Build image collection index",
Expand Down Expand Up @@ -831,7 +863,7 @@ function createIndexingEventHandler(
if (!startedRelated) {
progress.reset(sourceTexts.length);
context.printer.writeLine(
`Indexing ${sourceTexts.length} related terms`,
`Creating ${sourceTexts.length} embeddings`,
);
startedRelated = true;
}
Expand Down
38 changes: 37 additions & 1 deletion ts/packages/knowPro/src/collections.ts
Original file line number Diff line number Diff line change
Expand Up @@ -353,8 +353,34 @@ export class SemanticRefAccumulator extends MatchAccumulator<SemanticRefIndex> {
}

export class MessageAccumulator extends MatchAccumulator<MessageIndex> {
constructor() {
constructor(matches?: Match<MessageIndex>[]) {
super();
if (matches && matches.length > 0) {
this.setMatches(matches);
}
}

public override add(
value: number,
score: number,
isExactMatch: boolean,
): void {
if (isExactMatch) {
let match = this.getMatch(value);
if (match === undefined) {
match = {
value,
score,
hitCount: 1,
relatedHitCount: 0,
relatedScore: 0,
};
this.setMatch(match);
} else if (score > match.score) {
match.score = score;
match.hitCount++;
}
}
}

public addMessagesForSemanticRef(
Expand All @@ -376,6 +402,16 @@ export class MessageAccumulator extends MatchAccumulator<MessageIndex> {
}
}

public smoothScores() {
// Normalize the score relative to # of hits. Use log to reduce impact of very high score
for (const match of this.getMatches()) {
if (match.hitCount > 0) {
const avgScore = match.score / match.hitCount;
match.score = Math.log(avgScore + 1);
}
}
}

public toScoredMessageIndexes(): ScoredMessageIndex[] {
return this.getSortedByScore(0).map((m) => {
return {
Expand Down
4 changes: 3 additions & 1 deletion ts/packages/knowPro/src/fuzzyIndex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,11 @@ export class TextEmbeddingIndex {
public async addTextBatch(
textToIndex: string[],
eventHandler?: IndexingEventHandlers,
batchSize?: number,
): Promise<void> {
for (const batch of getIndexingBatches(
textToIndex,
this.settings.batchSize,
batchSize ?? this.settings.batchSize,
)) {
if (
eventHandler?.onEmbeddingsCreated &&
Expand All @@ -62,6 +63,7 @@ export class TextEmbeddingIndex {
) {
break;
}
// TODO: return IndexingResult to track how far we got before a non-recoverable failure
await this.addText(batch.values);
}
}
Expand Down
1 change: 1 addition & 0 deletions ts/packages/knowPro/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ export * from "./serialization.js";
export * from "./dateTimeSchema.js";
export * from "./searchSchema.js";
export * from "./searchTranslator.js";
export * from "./textLocationIndex.js";
1 change: 1 addition & 0 deletions ts/packages/knowPro/src/interfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ export interface IndexingEventHandlers {
batchStartAt: number,
) => boolean;
}

export type IndexingResults = {
chunksIndexedUpto?: TextLocation | undefined;
error?: string | undefined;
Expand Down
16 changes: 12 additions & 4 deletions ts/packages/knowPro/src/query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1088,10 +1088,12 @@ export function messageMatchesFromKnowledgeMatches(
semanticRefs: SemanticRef[],
knowledgeMatches: Map<KnowledgeType, SemanticRefSearchResult>,
): MessageAccumulator {
const messageMatches = new MessageAccumulator();
let messageMatches = new MessageAccumulator();
let expectedHitCount = 0;
for (const knowledgeType of knowledgeMatches.keys()) {
const matchesByType = knowledgeMatches.get(knowledgeType);
if (matchesByType) {
if (matchesByType && matchesByType.semanticRefMatches.length > 0) {
expectedHitCount++;
for (const match of matchesByType.semanticRefMatches) {
messageMatches.addMessagesForSemanticRef(
semanticRefs[match.semanticRefIndex],
Expand All @@ -1100,7 +1102,13 @@ export function messageMatchesFromKnowledgeMatches(
}
}
}
const maxHitCount = messageMatches.getMaxHitCount();
messageMatches.selectWithHitCount(maxHitCount);
if (expectedHitCount > 0) {
const relevantMessages =
messageMatches.getWithHitCount(expectedHitCount);
if (relevantMessages.length > 0) {
messageMatches = new MessageAccumulator(relevantMessages);
}
}
//messageMatches.smoothScores();
return messageMatches;
}
3 changes: 2 additions & 1 deletion ts/packages/knowPro/src/searchSchema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ export type FacetTerm = {
export type EntityTerm = {
// the name of the entity or thing such as "Bach", "Great Gatsby", "frog" or "piano"
name: string;
// the types of the entity such as "speaker", "person", "artist", "animal", "object", "instrument", "school", "room", "museum", "food" etc.
// the specific types of the entity such as "speaker", "person", "artist", "animal", "instrument", "school", "room", "museum", "food" etc.
// Generic types like "object", "thing" etc. are NOT allowed
// An entity can have multiple types; entity types should be single words
type: string[];
// A specific, inherent, defining, or non-immediate facet of the entity such as "blue", "old", "famous", "sister", "aunt_of", "weight: 4 kg"
Expand Down
57 changes: 56 additions & 1 deletion ts/packages/knowPro/src/textLocationIndex.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import { TextLocation } from "./interfaces.js";
import { IMessage, MessageIndex, TextLocation } from "./interfaces.js";
import { IndexingEventHandlers } from "./interfaces.js";
import {
TextEmbeddingIndex,
Expand All @@ -14,6 +14,11 @@ export type ScoredTextLocation = {
};

export interface ITextToTextLocationIndexFuzzy {
addTextLocation(text: string, textLocation: TextLocation): Promise<void>;
addTextLocationsBatched(
textAndLocations: [string, TextLocation][],
eventHandler?: IndexingEventHandlers,
): Promise<void>;
lookupText(
text: string,
maxMatches?: number,
Expand Down Expand Up @@ -51,10 +56,12 @@ export class TextToTextLocationIndexFuzzy
public async addTextLocationsBatched(
textAndLocations: [string, TextLocation][],
eventHandler?: IndexingEventHandlers,
batchSize?: number,
): Promise<void> {
await this.embeddingIndex.addTextBatch(
textAndLocations.map((tl) => tl[0]),
eventHandler,
batchSize,
);
this.textLocations.push(...textAndLocations.map((tl) => tl[1]));
}
Expand Down Expand Up @@ -94,3 +101,51 @@ export class TextToTextLocationIndexFuzzy
this.embeddingIndex.deserialize(data.embeddings);
}
}

export async function addMessagesToIndex(
textLocationIndex: TextToTextLocationIndexFuzzy,
messages: IMessage[],
baseMessageIndex: MessageIndex,
eventHandler?: IndexingEventHandlers,
batchSize?: number,
): Promise<void> {
const allChunks: [string, TextLocation][] = [];
// Collect everything so we can batch efficiently
for (let i = 0; i < messages.length; ++i) {
const message = messages[i];
let messageIndex = baseMessageIndex + i;
for (
let chunkIndex = 0;
chunkIndex < message.textChunks.length;
++chunkIndex
) {
allChunks.push([
message.textChunks[chunkIndex],
{ messageIndex, chunkIndex },
]);
}
}
// Todo: return an IndexingResult
await textLocationIndex.addTextLocationsBatched(
allChunks,
eventHandler,
batchSize,
);
}

export async function buildMessageIndex(
messages: IMessage[],
settings: TextEmbeddingIndexSettings,
eventHandler?: IndexingEventHandlers,
batchSize?: number,
) {
const textLocationIndex = new TextToTextLocationIndexFuzzy(settings);
await addMessagesToIndex(
textLocationIndex,
messages,
0,
eventHandler,
batchSize,
);
return textLocationIndex;
}
35 changes: 33 additions & 2 deletions ts/packages/memory/conversation/src/importPodcast.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import {
IConversationDataWithIndexes,
writeConversationDataToFile,
readConversationDataFromFile,
TextToTextLocationIndexFuzzy,
buildMessageIndex,
} from "knowpro";
import { conversation as kpLib, split } from "knowledge-processor";
import { collections, dateTime, getFileName, readAllText } from "typeagent";
Expand Down Expand Up @@ -88,12 +90,13 @@ function assignMessageListeners(
}

export class PodcastMessage implements IMessage<PodcastMessageMeta> {
public timestamp: string | undefined;
constructor(
public textChunks: string[],
public metadata: PodcastMessageMeta,
public tags: string[] = [],
public timestamp: string | undefined = undefined,
) {}

addTimestamp(timestamp: string) {
this.timestamp = timestamp;
}
Expand All @@ -106,6 +109,10 @@ export class Podcast implements IConversation<PodcastMessageMeta> {
public settings: ConversationSettings;
public semanticRefIndex: ConversationIndex;
public secondaryIndexes: PodcastSecondaryIndexes;
/**
* Work in progress
*/
public messageIndex?: TextToTextLocationIndexFuzzy | undefined;

constructor(
public nameTag: string = "",
Expand Down Expand Up @@ -149,6 +156,21 @@ export class Podcast implements IConversation<PodcastMessageMeta> {
return result;
}

/**
* Work in progress. This will get merged into "buildIndex" soon
*/
public async buildMessageIndex(
eventHandler?: IndexingEventHandlers,
batchSize?: number,
): Promise<void> {
this.messageIndex = await buildMessageIndex(
this.messages,
this.settings.relatedTermIndexSettings.embeddingIndexSettings!,
eventHandler,
batchSize,
);
}

public async serialize(): Promise<PodcastData> {
const data: PodcastData = {
nameTag: this.nameTag,
Expand All @@ -165,7 +187,16 @@ export class Podcast implements IConversation<PodcastMessageMeta> {

public async deserialize(podcastData: PodcastData): Promise<void> {
this.nameTag = podcastData.nameTag;
this.messages = podcastData.messages;
this.messages = podcastData.messages.map((m) => {
const metadata = new PodcastMessageMeta(m.metadata.speaker);
metadata.listeners = m.metadata.listeners;
return new PodcastMessage(
m.textChunks,
metadata,
m.tags,
m.timestamp,
);
});
this.semanticRefs = podcastData.semanticRefs;
this.tags = podcastData.tags;
if (podcastData.semanticIndexData) {
Expand Down

0 comments on commit c0b5d91

Please sign in to comment.