# make_vectordb.py
import os

import chromadb
import dotenv
import pandas as pd
import requests
import tiktoken
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

dotenv.load_dotenv()


def num_tokens_from_string(string: str, model_name: str = "gpt-3.5-turbo") -> int:
    """Return the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(string))
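
# A quick sanity-check sketch (hypothetical input string; exact counts can vary
# slightly across tiktoken versions):
# num_tokens_from_string("Platform: Python\n Description: doublet detection")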


# Fetch the README content for a GitHub repository URL
def get_readme(url):
    if pd.isna(url) or url == "":
        return ""
    try:
        # Construct the URL to the raw README file; repositories may use
        # either 'master' or 'main' as their default branch
        base = url.replace('github.com', 'raw.githubusercontent.com').rstrip('/')
        for branch in ('master', 'main'):
            response = requests.get(f"{base}/{branch}/README.md", timeout=30)
            if response.status_code == 200:
                return response.text
        return ""
    except Exception:
        # Network errors, malformed URLs, etc. are treated as "no README"
        return ""


def save_dataframe(df, directory, base_filename):
    """
    Save a DataFrame to a CSV file, versioning the name if the file already exists.

    :param df: DataFrame to be saved.
    :param directory: Directory where the file will be saved.
    :param base_filename: The base name of the file, without version number.
    """
    os.makedirs(directory, exist_ok=True)
    # Construct the initial file path
    file_path = os.path.join(directory, base_filename)
    # If the file exists, bump the version suffix until it doesn't
    version = 0
    while os.path.exists(file_path):
        version += 1
        file_name, file_extension = os.path.splitext(base_filename)
        file_path = os.path.join(directory, f"{file_name}_v{version}{file_extension}")
    # Save the DataFrame
    df.to_csv(file_path, index=False)
    print(f"DataFrame saved as {file_path}")


# Prepare the DataFrame
def make_desc(row):
    # Combine the core catalogue fields into a single description string
    return 'Platform: ' + row['Platform'] + \
           '\n Description: ' + row['Description'] + \
           '\n Categories: ' + row['Categories']


def make_desc_readme(row):
    # Same as make_desc, with the fetched README appended
    return 'Platform: ' + row['Platform'] + \
           '\n Description: ' + row['Description'] + \
           '\n Categories: ' + row['Categories'] + \
           '\n Readme: ' + row['Readme']


def prepare_dataframe(df):
    counter = 0  # for status printing
    for index, row in df.iterrows():
        df.at[index, 'Readme'] = get_readme(row['Code'])
        counter += 1
        if counter % 100 == 0:
            print(f"Processed {counter} rows.")
    df['extented_desc'] = df.apply(make_desc, axis=1)
    df['extented_desc_readme'] = df.apply(make_desc_readme, axis=1)
    # Check for NaNs in the descriptions before building the embedding collection
    assert df.isna().sum()['extented_desc'] == 0
    df['tokens_in_ext_desc'] = df['extented_desc'].apply(num_tokens_from_string)
    # Save the data frame with full READMEs
    # save_dataframe(df, 'dataframes', 'tool-table-with-readmes.csv')
    # df['tokens_in_ext_desc'].hist()  # for a Jupyter notebook
    # Trim the README so the text fits the 8k-token limit of the embedding model
    # (find a better embedding model later)
    df['extented_desc_readme_trim'] = df['extented_desc_readme'].apply(
        lambda x: x[:22000] if pd.notna(x) else x)
    df['tokens_in_ext_desc_readme_trim'] = df['extented_desc_readme_trim'].apply(num_tokens_from_string)
    # df['tokens_in_ext_desc_readme_trim'].hist(bins=50)  # for a Jupyter notebook
    # As of Nov 7 2023 all short tool descriptions together are just ~68K tokens,
    # so strictly speaking no embedding DB is needed; one could also make Claude
    # calls every time a tool needs to be picked.
    assert df['tokens_in_ext_desc'].sum() < 100000
    # Names are used as collection IDs, so they must be unique
    assert df.Name.nunique() == df.shape[0]
    # Sanity checks for a Jupyter notebook:
    # df['extented_desc'].isna().sum()
    # df['Code'].isna().sum()
    # df['Readme'].isna().sum()
    # df['Readme'].head(20)
    df['Readme'] = df['Readme'].apply(
        lambda x: x.replace('\n', ' ').replace('\r', ' ').replace("'", "\\'") if pd.notna(x) else x)
    # Save the data frame with trimmed READMEs
    # save_dataframe(df, 'dataframes', 'tool-table-with-readmestrimmed.csv')
    return df


# Create the vector DB
def get_vector_db(df):
    chroma_client = chromadb.PersistentClient()
    EMBEDDING_MODEL = "text-embedding-ada-002"
    # Change this to a biotech-specialised model later
    embedding_function = OpenAIEmbeddingFunction(api_key=os.getenv("OPENAI_API_KEY"),
                                                 model_name=EMBEDDING_MODEL)
    scrnatools_description_collection = chroma_client.get_or_create_collection(
        name='scRNA_Tools', embedding_function=embedding_function)
    # Chroma metadata values must be str/int/float/bool, so fill NaNs first
    metadata_df = df.fillna('')
    # Add the content vectors. IDs must be unique within a collection, so the
    # README-extended documents get a '_readme' suffix.
    scrnatools_description_collection.add(
        documents=list(df['extented_desc']),
        metadatas=metadata_df.drop(['extented_desc'], axis=1).to_dict(orient='records'),
        ids=list(df.Name))
    scrnatools_description_collection.add(
        documents=list(df['extented_desc_readme_trim']),
        metadatas=metadata_df.drop(['extented_desc_readme_trim'], axis=1).to_dict(orient='records'),
        ids=[name + '_readme' for name in df.Name])
    return scrnatools_description_collection
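
# A minimal sketch of reloading the persisted collection in a later session
# (assumes Chroma's default PersistentClient path and the collection name above):
# client = chromadb.PersistentClient()
# collection = client.get_collection(
#     name='scRNA_Tools',
#     embedding_function=OpenAIEmbeddingFunction(
#         api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-ada-002"))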


# Query the DB
def query_collection(collection, query, max_results, dataframe):
    results = collection.query(query_texts=query, n_results=max_results, include=['distances'])
    # Returned IDs may carry the '_readme' suffix added above; map them back to
    # tool names, then look rows up by name so they stay aligned with the scores
    names = [i.removesuffix('_readme') for i in results['ids'][0]]
    lookup = dataframe.set_index('Name')
    result = pd.DataFrame({
        'id': results['ids'][0],
        'score': results['distances'][0],
        'content': lookup.loc[names, 'extented_desc'].values,
        'platform': lookup.loc[names, 'Platform'].values,
    })
    return result
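
# Chroma also supports metadata filters; a hedged sketch restricting results to
# Python tools (assumes the 'Platform' metadata stored by get_vector_db above):
# collection.query(query_texts='doublet detection', n_results=5,
#                  where={'Platform': 'Python'})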


if __name__ == "__main__":
    # Import the table of tools from scrna-tools.org.
    # You can download it from https://scrna-tools.org/tables/ (don't forget to select all columns)
    # or access the source programmatically from https://github.com/scRNA-tools/scRNA-tools/tree/master/database
    df = pd.read_csv('tableExport.csv')
    df_enriched = prepare_dataframe(df)
    save_dataframe(df_enriched, 'dataframes', 'tool-table-with-readmestrimmed.csv')
    scrnatools_description_collection = get_vector_db(df_enriched)
    print(query_collection(scrnatools_description_collection,
                           'quality control, python, 10M cells, human cells', 5, df_enriched))