Merge pull request #53 from bukosabino/develop-fix-bopz

Fixing module
bukosabino · Feb 4, 2024 · c061329 · c061329
2 parents e80e35b + a16fbf0
commit c061329
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 13 deletions.
diff --git a/src/etls/bopz/metadata.py b/src/etls/bopz/metadata.py
@@ -1,7 +1,7 @@
 import typing as tp
 from datetime import datetime
 
-from pydantic import BaseModel, field_validator
+from pydantic import field_validator
 
 from src.etls.common.metadata import MetadataDocument
 
@@ -19,7 +19,7 @@ class BOPZMetadataDocument(MetadataDocument):
     identificador: str
     numero_oficial: str = ""
     departamento: str
-    titulo: str
+    titulo: str = ""
     url_pdf: str
     url_html: str
     fecha_publicacion: str
@@ -37,5 +37,5 @@ class BOPZMetadataDocument(MetadataDocument):
     @classmethod
     def isoformat(cls, v):
         if v:
-            return datetime.strptime(v, "%Y%m%d").strftime("%Y-%m-%d")
-        return v
+            datetime.strptime(v, "%Y-%m-%d")
+        return v
diff --git a/src/etls/bopz/scrapper.py b/src/etls/bopz/scrapper.py
@@ -1,3 +1,4 @@
+import re
 import logging as lg
 import tempfile
 import typing as tp
@@ -14,32 +15,33 @@
 
 initialize_logging()
 
-def _extract_span_text(row: BeautifulSoup, text_label: str) -> str:
+
+def _extract_span_text(row: BeautifulSoup, regex: str) -> str:
     """
     Extracts the text of the next sibling for a span element that contains the specified text_label.
     
     :param row: The BeautifulSoup row element to search within.
-    :param text_label: The text to search for within the span element.
+    :param regex: The regular expression to search for within the span element.
     :return: The stripped text of the next sibling if found, otherwise an empty string.
     """
-    span_element = row.find('span', string=lambda t: text_label in t)
+    span_element = row.find('span', string=lambda t: re.search(regex, t))
     return span_element.next_sibling.strip() if span_element and span_element.next_sibling else None
 
 def _extract_metadata(soup) -> tp.Dict:
     metadata_dict = {}
-    
+
     # Metadatos
-    if numero_registro := _extract_span_text(soup, 'Nº. Reg:'):
+    if numero_registro := _extract_span_text(soup, r'N.\. Reg:'):
         metadata_dict["numero_oficial"] = numero_registro.split('/')[0]
         metadata_dict["titulo"] = f"BOPZ-{numero_registro.replace('/', '-')}"
-    
-    if departamento := _extract_span_text(soup, 'Publicador:'):
+
+    if departamento := _extract_span_text(soup, r'Publicador:'):
         metadata_dict["departamento"] = departamento
 
-    if materia := _extract_span_text(soup, 'Materia'):
+    if materia := _extract_span_text(soup, r'Materia'):
         metadata_dict["materia"] = [materia]
 
-    if fecha_publicacion := _extract_span_text(soup, 'Fecha Pub:'):
+    if fecha_publicacion := _extract_span_text(soup, r'Fecha Pub:'):
         fecha_publicacion = datetime.strptime(fecha_publicacion, "%d/%m/%Y").strftime("%Y-%m-%d")
         metadata_dict["fecha_publicacion"] = fecha_publicacion
         metadata_dict["fecha_disposicion"] = fecha_publicacion