Merge pull request #51 from bukosabino/develop-multiple-collections
Working with multiple collections/modules
bukosabino authored Feb 2, 2024
2 parents 243e5f3 + d169e4b commit 08a2f8d
Showing 20 changed files with 259 additions and 349 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -48,9 +48,9 @@ It is the web service, and it is a central component for the whole system, doing

#### Loading data

We download the BOE documents and break them into small chunks of text (e.g. 1200 characters). Each text chunk is transformed into an embedding (e.g. a dense numerical vector with 768 dimensions). Some additional metadata is also stored with the vectors so that we can pre- or post-filter the search results. [Check the code](https://github.com/bukosabino/ia-boe/blob/main/src/etls/boe/load/run.py)
We download the BOE documents and break them into small chunks of text (e.g. 1200 characters). Each text chunk is transformed into an embedding (e.g. a dense numerical vector with 768 dimensions). Some additional metadata is also stored with the vectors so that we can pre- or post-filter the search results. [Check the code](https://github.com/bukosabino/ia-boe/blob/main/src/etls/boe/load.py)

The BOE is updated every day, so we run a daily ETL job to retrieve the new documents, transform them into embeddings, link the metadata, and store them in the embedding database. [Check the code](https://github.com/bukosabino/ia-boe/blob/main/src/etls/boe/load/daily.py)
The BOE is updated every day, so we run a daily ETL job to retrieve the new documents, transform them into embeddings, link the metadata, and store them in the embedding database. [Check the code](https://github.com/bukosabino/ia-boe/blob/main/src/etls/boe/load.py)
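
For a rough picture of that load step, here is a minimal, hypothetical sketch (chunk → embed → store). The chunk size, overlap, embedding model and collection name mirror `config/config.yaml`; the document text and identifier are placeholders, and the real pipeline lives in `src/etls/boe/load.py` and `src/etls/common/etl.py`.

```
# Hypothetical sketch only: chunk one BOE document, embed each chunk, and store it
# with metadata in Qdrant. Values mirror config/config.yaml (chunk size, overlap,
# embeddings_model_name, collection "justicio"); document text and ID are placeholders.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from sentence_transformers import SentenceTransformer

splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=100)
model = SentenceTransformer("dariolopez/roberta-base-bne-finetuned-msmarco-qa-es-mnrl-mn")
client = QdrantClient(url="<your_qdrant_api_url>", api_key="<your_qdrant_api_key>")

document_text = "..."  # full text of one BOE document (placeholder)
chunks = splitter.split_text(document_text)
vectors = model.encode(chunks)  # one 768-dimensional vector per chunk

points = [
    PointStruct(
        id=i,
        vector=vector.tolist(),
        payload={"text": chunk, "metadata": {"identificador": "BOE-A-2024-XXXX"}},
    )
    for i, (chunk, vector) in enumerate(zip(chunks, vectors))
]
client.upsert(collection_name="justicio", points=points)
```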

#### Reading data

@@ -102,3 +102,4 @@ You are welcome! Please, contact us to see how you can help.
* [Darío López](https://www.linkedin.com/in/dar%C3%ADo-l%C3%B3pez-padial-45269150/)
* [Alex Dantart](https://www.linkedin.com/in/dantart/)
* [Jorge Iliarte](https://www.linkedin.com/in/jorge-iliarte-llop/)
* [Jorge Barrachina](https://www.linkedin.com/in/jorgebarrachina/)
23 changes: 11 additions & 12 deletions config/config.yaml
@@ -5,11 +5,11 @@ chunk_overlap: 100
admin_email: [email protected]

embeddings_model_name: dariolopez/roberta-base-bne-finetuned-msmarco-qa-es-mnrl-mn
embeddings_model_size: 768

vector_store: 'qdrant' # {'qdrant', 'pinecone', 'supabase'}
top_k_results: 10
date_start: 2000/01/01
date_end: 2004/12/12
distance_type: 'Cosine' # {'Cosine', 'Euclid', 'Dot'}

# Prompts
prompt_system: |
@@ -24,21 +24,20 @@ prompt_system_context: |
En la respuesta no menciones nada sobre el contexto o los scores.
# Qdrant
collection_name: justicio
collections:
- justicio
- bocm
- bopz

# Openai
llm_model_name: 'gpt-3.5-turbo-1106' # 'gpt-4-1106-preview'
llm_model_name: 'gpt-3.5-turbo-0125' # 'gpt-3.5-turbo-1106', 'gpt-4-1106-preview'
temperature: 0
seed: 42
max_tokens: 1024

# Deprecated

# llm_api: 'llama2' # {'llama2', 'openai'}

# Pinecone
# Not used
## Pinecone
vector_store_index_name: justicio

# Supabase
## Supabase
table_name: 'documents'
query_name: 'match_documents'
query_name: 'match_documents'
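
The new `collections` list is what lets an ETL job pick its vector store by name (see `INIT_OBJECTS.vector_store[collection_name]` in `src/etls/boe/load.py` below). The mapping itself is built in `src/initialize.py`, which is not part of this diff; a hypothetical sketch of the idea:

```
# Hypothetical sketch: build one Qdrant-backed vector store per configured collection,
# so callers can do vector_store[collection_name]. The actual implementation lives
# in src/initialize.py and is not shown in this diff.
import os

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient


def build_vector_stores(config: dict) -> dict:
    client = QdrantClient(url=os.environ["QDRANT_API_URL"], api_key=os.environ["QDRANT_API_KEY"])
    embeddings = HuggingFaceEmbeddings(model_name=config["embeddings_model_name"])
    # e.g. {"justicio": <Qdrant>, "bocm": <Qdrant>, "bopz": <Qdrant>}
    return {
        name: Qdrant(client=client, collection_name=name, embeddings=embeddings)
        for name in config["collections"]
    }
```
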
34 changes: 21 additions & 13 deletions doc/deployment_guide.md
@@ -6,30 +6,36 @@ At this moment we are working with pinecone as vector database, so, please creat

Once you have your pinecone index, please update `config/config.yaml`:

* vector_store_index_name: use the name of the pinecone index that you choose.
* date_start: Choose a start date for your system. You will load BOE document from this date.
* date_end: Choose an end date for your system. You will load BOE document to this date.
* vector_store: the vector database backend to use (`'qdrant'`, `'pinecone'`, or `'supabase'`, as defined in `config/config.yaml`).

Export environment variables:

```
export PINECONE_API_KEY=<your_pinecone_api_key>
export PINECONE_ENV=<your_pinecone_env>
export OPENAI_API_KEY=<your_open_api_key>
export SENDGRID_API_KEY=<your_sendgrid_api_key>
export APP_PATH="."
export SENDGRID_API_KEY=<your_sendgrid_api_key>
export OPENAI_API_KEY=<your_open_api_key>
export TOKENIZERS_PARALLELISM=false
export TAVILY_API_KEY=<your_tavily_api_key>
export QDRANT_API_KEY="<your_qdrant_api_key>"
export QDRANT_API_URL="<your_qdrant_api_url>"
```

Load BOE documents into your vector database (depending on the selected date range, this may take a few minutes):

```
python -m src.etls.boe.load.run
python -m src.etls.boe.load dates collection_name 2024/01/01 2024/01/31
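# collection_name must be one of the collections defined in config/config.yaml (e.g. justicio)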
```

To load only the BOE documents published today, run:

```
python -m src.etls.boe.load.daily
python -m src.etls.boe.load today collection_name
```

If you want to update the vector database on a daily basis (the BOE publishes new documents every day), run the scheduled job (see `src/etls/boe/daily_job.py`, which uses the `schedule` library):

```
python -m src.etls.boe.daily_job
```

## 2. Deploy the service
Expand All @@ -52,11 +58,13 @@ pip install -r requirements.txt
Export environment variables:

```
export PINECONE_API_KEY="<your_pinecone_api_key>"
export PINECONE_ENV="<your_pinecone_env>"
export OPENAI_API_KEY="<your_open_api_key>"
export SENDGRID_API_KEY="<your_sendgrid_api_key>"
export APP_PATH="."
export SENDGRID_API_KEY=<your_sendgrid_api_key>
export OPENAI_API_KEY=<your_open_api_key>
export TOKENIZERS_PARALLELISM=false
export TAVILY_API_KEY=<your_tavily_api_key>
export QDRANT_API_KEY="<your_qdrant_api_key>"
export QDRANT_API_URL="<your_qdrant_api_url>"
```

Run the service
3 changes: 3 additions & 0 deletions requirements.txt
@@ -10,6 +10,9 @@ pydantic==2.4.2

retry==0.9.2

typer==0.9.0
schedule==1.2.1

langchain==0.0.305
# langchainplus-sdk==0.0.20
langsmith==0.0.41
12 changes: 12 additions & 0 deletions src/etls/boe/daily_job.py
@@ -0,0 +1,12 @@
import time

import schedule

from src.etls.boe.load import today


# "<template>" is a placeholder: replace it with a collection from config/config.yaml (e.g. "justicio")
schedule.every().day.at("11:00").do(today, collection_name="<template>")

while True:
schedule.run_pending()
time.sleep(1)
57 changes: 57 additions & 0 deletions src/etls/boe/load.py
@@ -0,0 +1,57 @@
from datetime import date, datetime

import typer

from src.email.send_email import send_email
from src.etls.boe.scrapper import BOEScrapper
from src.etls.common.etl import ETL
from src.initialize import initialize_app


app = typer.Typer()
INIT_OBJECTS = initialize_app()


@app.command()
def today(collection_name: str):
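    """Download today's BOE documents, load them into the given collection, and send a summary email."""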
etl_job = ETL(config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store[collection_name])
boe_scrapper = BOEScrapper()
day = date.today()
docs = boe_scrapper.download_day(day)
if docs:
etl_job.run(docs)

subject = "[BOE] Daily ETL executed"
content = f"""
Daily ETL executed
- Date: {day}
- Documents loaded: {len(docs)}
- Database used: {INIT_OBJECTS.config_loader['vector_store']}
"""
send_email(INIT_OBJECTS.config_loader, subject, content)


@app.command()
def dates(collection_name: str, date_start: str, date_end: str):
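    """Download BOE documents between date_start and date_end (YYYY/MM/DD), load them into the given collection, and send a summary email."""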
etl_job = ETL(config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store[collection_name])
boe_scrapper = BOEScrapper()
docs = boe_scrapper.download_days(
date_start=datetime.strptime(date_start, "%Y/%m/%d").date(),
date_end=datetime.strptime(date_end, "%Y/%m/%d").date(),
)
if docs:
etl_job.run(docs)

subject = "[BOE] Load ETL executed"
content = f"""
Load ETL executed
- Date start: {date_start}
- Date end: {date_end}
- Documents loaded: {len(docs)}
- Database used: {INIT_OBJECTS.config_loader['vector_store']}
"""
send_email(INIT_OBJECTS.config_loader, subject, content)


if __name__ == "__main__":
app()
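
Both commands resolve their target vector store through `INIT_OBJECTS.vector_store[collection_name]`, which is what makes the same ETL reusable across the collections declared in `config/config.yaml`. A hypothetical local check of the CLI with Typer's test runner (it assumes the environment variables from the deployment guide are set):

```
# Hypothetical sketch: exercise the new CLI without a shell, using Typer's test runner.
# "justicio" is one of the collections declared in config/config.yaml.
from typer.testing import CliRunner

from src.etls.boe.load import app

runner = CliRunner()
result = runner.invoke(app, ["dates", "justicio", "2024/01/01", "2024/01/31"])
print(result.exit_code)  # 0 on success
```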
26 changes: 0 additions & 26 deletions src/etls/boe/load/daily.py

This file was deleted.

33 changes: 0 additions & 33 deletions src/etls/boe/load/run.py

This file was deleted.

@@ -98,7 +98,7 @@
"BOE-A-1999-637",
"BOE-A-1999-6568",
"BOE-A-1999-8910",
"BOE-A-1999-8994"
"BOE-A-1999-8994",
]


@@ -15,11 +15,7 @@

initialize_logging()

QDRANT_CLIENT = QdrantClient(
url=os.environ['QDRANT_API_URL'],
api_key=os.environ['QDRANT_API_KEY'],
timeout=1000
)
QDRANT_CLIENT = QdrantClient(url=os.environ["QDRANT_API_URL"], api_key=os.environ["QDRANT_API_KEY"], timeout=1000)


def load_important_ids(filename):
@@ -29,66 +25,56 @@ def load_important_ids(filename):


def filter_documents_by_year(documents: tp.List[str]) -> tp.List[str]:
"""
"""
documents_filtered = []
for document_id in documents:
id_split = document_id.split('-')
if id_split[0] != 'BOE' or int(id_split[2]) < 2000:
id_split = document_id.split("-")
if id_split[0] != "BOE" or int(id_split[2]) < 2000:
documents_filtered.append(document_id)
return documents_filtered


def filter_documents_loaded(documents: tp.List[str]) -> tp.List[str]:
"""Filters a list of document IDs that are not loaded on Embedding database.
"""
"""Filters a list of document IDs that are not loaded on Embedding database."""
logger = lg.getLogger(filter_documents_loaded.__name__)
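    # A random query vector is enough here: we only check whether any point matches the metadata filter below.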
query_vector = np.random.rand(768)
documents_filtered = []
for document_id in documents:
logger.info('Checking if document id is already loaded: %s', document_id)
logger.info("Checking if document id is already loaded: %s", document_id)
search_result = QDRANT_CLIENT.search(
collection_name="justicio",
query_vector=query_vector,
query_filter=Filter(
must=[
FieldCondition(
key="metadata.identificador",
match=MatchValue(value=document_id)
)
]
must=[FieldCondition(key="metadata.identificador", match=MatchValue(value=document_id))]
),
limit=1
limit=1,
)
if not search_result:
documents_filtered.append(document_id)
logger.info('Document id: %s is added', document_id)
logger.info("Document id: %s is added", document_id)

return documents_filtered


if __name__ == "__main__":
logger = lg.getLogger("__main__")
INIT_OBJECTS = initialize_app()
etl_job = ETL(
config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store
)
etl_job = ETL(config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store)
boe_scrapper = BOEScrapper()

documents = load_important_ids('src/etls/boe/load/defs_ids_importantes.txt')
documents = load_important_ids("src/etls/boe/load/defs_ids_importantes.txt")
documents += BOE_IDS
logger.info('Documents size: %s', len(documents))
logger.info("Documents size: %s", len(documents))
documents_filtered = list(set(documents))
logger.info('Documents filtered size: %s', len(documents_filtered))
logger.info("Documents filtered size: %s", len(documents_filtered))
documents_filtered = filter_documents_by_year(documents_filtered)
logger.info('Documents filtered size: %s', len(documents_filtered))
logger.info("Documents filtered size: %s", len(documents_filtered))
logger.info(documents_filtered)
# documents_filtered = filter_documents_loaded(documents_filtered)
# logger.info('Documents filtered size: %s', len(documents_filtered))

docs = []
for boe_id in documents_filtered:
logger.info('Loading BOE Id: %s', boe_id)
logger.info("Loading BOE Id: %s", boe_id)
url = f"https://www.boe.es/diario_boe/xml.php?id={boe_id}"
try:
meta_document = boe_scrapper.download_document(url)