Skip to content

Commit

Permalink
[python/knowpro] Cleanups and tweaks, and a little demo main program (#…
Browse files Browse the repository at this point in the history
…798)

- Changed remaining camelCase methods and fields to snake_case.
- Added a partial translation of knowPro/conversationIndex.ts (as
memconv/convindex.py).
- Added (very informal) memconv/__main__.py which calls
import_podcast().
- Added a little bit of test data for memconv/__main__.py under
kp/testdata/.
  • Loading branch information
gvanrossum-ms authored Mar 7, 2025
1 parent c23b33d commit 071aee4
Show file tree
Hide file tree
Showing 11 changed files with 448 additions and 49 deletions.
File renamed without changes.
2 changes: 1 addition & 1 deletion python/knowpro/README.md → python/kp/knowpro/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# knowpro
# knowpro (knowledge processor)

**Experimental prototype**: Working toward a shared understanding
of the MVP for structured RAG.
Expand Down
2 changes: 2 additions & 0 deletions python/kp/knowpro/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
76 changes: 76 additions & 0 deletions python/kp/knowpro/convindex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from dataclasses import dataclass, field
from typing import Callable

from .interfaces import (
# Interfaces.
IConversation,
IMessage,
ITermToSemanticRefIndex,
ITermToSemanticRefIndexData,
# Other imports.
IndexingEventHandlers,
IndexingResults,
Knowledge,
KnowledgeType,
MessageIndex,
ScoredSemanticRef,
SemanticRef,
TextLocation,
TextRange,
)


def text_range_from_location(
    message_index: MessageIndex,
    chunk_index: int = 0,
) -> TextRange:
    """Return an open-ended TextRange anchored at one message chunk.

    The range's start is a TextLocation built from *message_index* and
    *chunk_index* (defaulting to chunk 0); its end is explicitly None.
    """
    start_location = TextLocation(message_index, chunk_index)
    return TextRange(start=start_location, end=None)


type KnowledgeValidator = Callable[
[
KnowledgeType, # knowledge_type
Knowledge, # knowledge
],
bool,
]


def add_metadata_to_index(
    messages: list[IMessage],
    semantic_refs: list[SemanticRef],
    semantic_ref_index: ITermToSemanticRefIndex,
    knowledge_validator: KnowledgeValidator | None = None,
) -> None:
    """Placeholder: not implemented yet.

    NOTE(review): judging by the name and parameters, this is meant to add
    message metadata to *semantic_refs* / *semantic_ref_index*, optionally
    filtered by *knowledge_validator* -- confirm against the TypeScript
    original before implementing.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()


@dataclass
class ConversationIndex(ITermToSemanticRefIndex):
    """Dict-backed ITermToSemanticRefIndex implementation (work in progress).

    The explicit ``__init__`` below is kept by ``@dataclass``, which never
    overwrites a method the class defines itself.
    """

    # Lists of scored semantic refs keyed by string
    # (presumably the term -- TODO confirm once lookup methods exist).
    _map: dict[str, list[ScoredSemanticRef]]

    def __init__(self, data: ITermToSemanticRefIndexData | None = None):
        """Start with an empty map, then deserialize *data* if it is truthy."""
        self._map = {}
        if not data:
            # Nothing to load (None or otherwise falsy data).
            return
        self.deserialize(data)

    # TODO: More methods

    def deserialize(self, data: ITermToSemanticRefIndexData) -> None:
        """Load index contents from *data*; not implemented yet.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError()


# ...


async def build_conversation_index(
    conversation: IConversation,
    event_handler: IndexingEventHandlers | None = None,
) -> IndexingResults:
    """Placeholder coroutine: not implemented yet.

    NOTE(review): presumably builds the index for *conversation*, reporting
    progress through *event_handler* -- confirm against the TypeScript
    original before implementing.

    Raises:
        NotImplementedError: Always, once the coroutine is awaited.
    """
    raise NotImplementedError()
65 changes: 34 additions & 31 deletions python/knowpro/interfaces.py → python/kp/knowpro/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,17 @@

# TODO:
# - See TODOs in kplib.py.
# - Do the Protocol classes need to be @runtime_checkable?
# - Should we use ABC instead of Protocol for certain classes?
#
# NOTE:
# - I took some liberty with index types and made them int.
# - I rearranged the order in some cases to ensure def-before-ref.
# - I translated readonly to @property.

from collections.abc import Sequence
from datetime import datetime as Date
from dataclasses import dataclass, field
from datetime import datetime as Datetime
from typing import Any, Callable, Literal, Protocol, runtime_checkable

from . import kplib
Expand All @@ -35,12 +38,12 @@ class DeletionInfo(Protocol):
@runtime_checkable
class IMessage[TMeta: IKnowledgeSource = Any](Protocol):
# The text of the message, split into chunks.
text_chunks: Sequence[str]
text_chunks: list[str]
# For example, e-mail has subject, from and to fields;
# a chat message has a sender and a recipient.
metadata: TMeta
timestamp: str | None = None
tags: Sequence[str]
tags: list[str]
deletion_info: DeletionInfo | None = None


Expand All @@ -61,20 +64,20 @@ class ScoredMessageIndex(Protocol):

@runtime_checkable
class ITermToSemanticRefIndex(Protocol):
def getTerms(self) -> Sequence[str]:
def get_terms(self) -> Sequence[str]:
raise NotImplementedError

def addTerm(
def add_term(
self,
term: str,
semantic_ref_index: SemanticRefIndex | ScoredSemanticRef,
) -> None:
raise NotImplementedError

def removeTerm(self, term: str, semantic_ref_index: SemanticRefIndex) -> None:
def remove_term(self, term: str, semantic_ref_index: SemanticRefIndex) -> None:
raise NotImplementedError

def lookupTerm(self, term: str) -> Sequence[ScoredSemanticRef] | None:
def lookup_term(self, term: str) -> Sequence[ScoredSemanticRef] | None:
raise NotImplementedError


Expand All @@ -94,45 +97,45 @@ class Tag(Protocol):
type Knowledge = kplib.ConcreteEntity | kplib.Action | Topic | Tag


@runtime_checkable
class TextLocation(Protocol):
@dataclass
class TextLocation:
# The index of the message.
message_index: MessageIndex
# The index of the chunk.
chunkIndex: int | None
chunk_index: int = 0
# The index of the character within the chunk.
charIndex: int | None
char_index: int = 0


# A text range within a session.
@runtime_checkable
class TextRange(Protocol):
@dataclass
class TextRange:
# The start of the range.
start: TextLocation
# The end of the range (exclusive).
end: TextLocation | None
end: TextLocation | None = None


@runtime_checkable
class SemanticRef(Protocol):
@dataclass
class SemanticRef:
semantic_ref_index: SemanticRefIndex
range: TextRange
knowledge_type: KnowledgeType
knowledge: Knowledge


@runtime_checkable
class DateRange(Protocol):
start: Date
# Inclusive.
end: Date | None
@dataclass
class DateRange:
start: Datetime
# Inclusive. # TODO: Really? Shouldn't this be exclusive?
end: Datetime | None = None


@runtime_checkable
class Term(Protocol):
@dataclass
class Term:
text: str
# Optional weighting for these matches.
weight: float | None
weight: float | None = None


@runtime_checkable
Expand Down Expand Up @@ -261,19 +264,19 @@ async def lookup_thread(
@runtime_checkable
class IConversationSecondaryIndexes(Protocol):
property_to_semantic_ref_index: IPropertyToSemanticRefIndex | None
timestampIndex: ITimestampToTextRangeIndex | None
termToRelatedTermsIndex: ITermToRelatedTermsIndex | None
timestamp_index: ITimestampToTextRangeIndex | None
terms_to_related_terms_index: ITermToRelatedTermsIndex | None
threads: IConversationThreads | None


@runtime_checkable
class IConversation[TMeta: IKnowledgeSource = Any](Protocol):
name_tag: str
tags: Sequence[str]
messages: Sequence[IMessage[TMeta]]
semantic_refs: Sequence[SemanticRef] | None
tags: list[str]
messages: list[IMessage[TMeta]]
semantic_refs: list[SemanticRef] | None
semantic_ref_index: ITermToSemanticRefIndex | None
secondaryIndexes: IConversationSecondaryIndexes | None
secondary_indexes: IConversationSecondaryIndexes | None


# ------------------------
Expand Down Expand Up @@ -334,5 +337,5 @@ class IndexingEventHandlers(Protocol):

@runtime_checkable
class IndexingResults(Protocol):
chunksIndexedUpto: TextLocation | None = None
chunks_indexed_upto: TextLocation | None = None
error: str | None = None
33 changes: 16 additions & 17 deletions python/knowpro/kplib.py → python/kp/knowpro/kplib.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,33 @@

# TODO:
# - What does the comment "Very concise values" in class Facet mean?
# - Do the protocols need to be @runtime_checkable?
# - Should the field names be camelCase to match the JSON schema?
# - For things of type float, should we add `| int` to emphasize that int is okay?
# - How to allow totally missing attributes? (facets, params, subject_entity_facet)
# - Should we use ABC instead of Protocol for certain classes?
# (Some users think float means float only.)

from typing import Literal, Protocol, runtime_checkable
from dataclasses import dataclass
from typing import Literal


@runtime_checkable
class Quantity(Protocol):
@dataclass
class Quantity:
amount: float
units: str


type Value = str | float | bool | Quantity


@runtime_checkable
class Facet(Protocol):
@dataclass
class Facet:
name: str
# Very concise values.
value: Value


# Specific, tangible people, places, institutions or things only
@runtime_checkable
class ConcreteEntity(Protocol):
@dataclass
class ConcreteEntity:
# The name of the entity or thing such as "Bach", "Great Gatsby",
# "frog" or "piano".
name: str
Expand All @@ -45,19 +44,19 @@ class ConcreteEntity(Protocol):
facets: list[Facet] | None = None


@runtime_checkable
class ActionParam(Protocol):
@dataclass
class ActionParam:
name: str
value: Value


type VerbTense = Literal["past", "present", "future"]


@runtime_checkable
class Action(Protocol):
@dataclass
class Action:
# Each verb is typically a word.
verb: list[str]
verbs: list[str]
verb_tense: VerbTense
subject_entity_name: str | Literal["none"]
object_entity_name: str | Literal["none"]
Expand All @@ -69,8 +68,8 @@ class Action(Protocol):


# Detailed and comprehensive knowledge response.
@runtime_checkable
class KnowledgeResponse(Protocol):
@dataclass
class KnowledgeResponse:
entities: list[ConcreteEntity]
# The 'subject_entity_name' and 'object_entity_name' must correspond
# to the 'name' of an entity listed in the 'entities' array.
Expand Down
25 changes: 25 additions & 0 deletions python/kp/memconv/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# memconv (memory conversations)

**Experimental prototype**: Working toward a shared understanding
of the MVP for structured RAG.

**Sample code**

This is an in-progress project aiming at a Pythonic translation of
`TypeAgent/ts/packages/memory/conversation` to Python.
(Pythonic because it uses Python conventions and types as appropriate.)

- Python class names correspond 1:1 to TS interface or type names.
- Field and method names are converted from camelCase to snake_case.
- Types and interfaces become runtime-checkable Protocol classes,
except union types which become type aliases.
- Unions of string literals become Literal types.

## Trademarks

This project may contain trademarks or logos for projects, products, or services.
Authorized use of Microsoft trademarks or logos is subject to and must follow
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
Use of Microsoft trademarks or logos in modified versions of this project
must not cause confusion or imply Microsoft sponsorship.
Any use of third-party trademarks or logos is subject to those third parties' policies.
2 changes: 2 additions & 0 deletions python/kp/memconv/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
28 changes: 28 additions & 0 deletions python/kp/memconv/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env python3.13
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import argparse
from datetime import datetime as Datetime
import sys

assert sys.version_info >= (3, 13), "Requires Python 3.13 or later"

from kp.memconv.import_podcasts import import_podcast


def main():
    """Import the podcast named on the command line and print its contents.

    Prints the podcast's name tag, its comma-separated tags, and then each
    message preceded by a blank line.
    """
    arg_parser = argparse.ArgumentParser(description="Import a podcast")
    arg_parser.add_argument("filename", help="The filename to import")
    # TODO: Add more arguments for the import_podcast function.
    filename = arg_parser.parse_args().filename

    # NOTE(review): the meaning of the None and 3.0 arguments is defined by
    # import_podcast, which is not visible here -- confirm before changing.
    podcast = import_podcast(filename, None, Datetime.now(), 3.0)

    print("Name-Tag:", podcast.name_tag)
    print("Tags:", ", ".join(podcast.tags))
    for message in podcast.messages:
        print()
        print(message)


# Run only when executed as a script or via ``python -m``.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 071aee4

Please sign in to comment.