Skip to content

Commit

Permalink
Merge pull request #53 from bukosabino/develop-fix-bopz
Browse files Browse the repository at this point in the history
Fixing module
  • Loading branch information
llop00 authored Feb 4, 2024
2 parents e80e35b + a16fbf0 commit c061329
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 13 deletions.
8 changes: 4 additions & 4 deletions src/etls/bopz/metadata.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import typing as tp
from datetime import datetime

from pydantic import BaseModel, field_validator
from pydantic import field_validator

from src.etls.common.metadata import MetadataDocument

Expand All @@ -19,7 +19,7 @@ class BOPZMetadataDocument(MetadataDocument):
identificador: str
numero_oficial: str = ""
departamento: str
titulo: str
titulo: str = ""
url_pdf: str
url_html: str
fecha_publicacion: str
Expand All @@ -37,5 +37,5 @@ class BOPZMetadataDocument(MetadataDocument):
@classmethod
def isoformat(cls, v):
if v:
return datetime.strptime(v, "%Y%m%d").strftime("%Y-%m-%d")
return v
datetime.strptime(v, "%Y-%m-%d")
return v
20 changes: 11 additions & 9 deletions src/etls/bopz/scrapper.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import logging as lg
import tempfile
import typing as tp
Expand All @@ -14,32 +15,33 @@

initialize_logging()

def _extract_span_text(row: "BeautifulSoup", regex: str) -> tp.Optional[str]:
    """
    Extracts the text that follows the first span element whose text matches the given regex.

    :param row: The BeautifulSoup element to search within.
    :param regex: The regular expression searched for (``re.search``) within the span's text.
    :return: The stripped text of the span's next sibling, or None when no span matches,
        the span has no next sibling, or the next sibling is not a text node.
    """
    pattern = re.compile(regex)

    def _label_matches(text) -> bool:
        # The matcher can be handed None for a span that has no single text child
        # (e.g. an empty span or one with nested tags); treat that as "no match"
        # instead of letting re.search raise TypeError.
        return text is not None and pattern.search(text) is not None

    span_element = row.find('span', string=_label_matches)
    if span_element is None:
        return None

    sibling = span_element.next_sibling
    # Only text nodes (str subclasses) carry the value we want; a sibling that is
    # another tag has no string-style .strip(), so it is reported as "not found".
    if isinstance(sibling, str):
        return sibling.strip()
    return None

def _extract_metadata(soup) -> tp.Dict:
metadata_dict = {}

# Metadatos
if numero_registro := _extract_span_text(soup, 'Nº. Reg:'):
if numero_registro := _extract_span_text(soup, r'N.\. Reg:'):
metadata_dict["numero_oficial"] = numero_registro.split('/')[0]
metadata_dict["titulo"] = f"BOPZ-{numero_registro.replace('/', '-')}"
if departamento := _extract_span_text(soup, 'Publicador:'):

if departamento := _extract_span_text(soup, r'Publicador:'):
metadata_dict["departamento"] = departamento

if materia := _extract_span_text(soup, 'Materia'):
if materia := _extract_span_text(soup, r'Materia'):
metadata_dict["materia"] = [materia]

if fecha_publicacion := _extract_span_text(soup, 'Fecha Pub:'):
if fecha_publicacion := _extract_span_text(soup, r'Fecha Pub:'):
fecha_publicacion = datetime.strptime(fecha_publicacion, "%d/%m/%Y").strftime("%Y-%m-%d")
metadata_dict["fecha_publicacion"] = fecha_publicacion
metadata_dict["fecha_disposicion"] = fecha_publicacion
Expand Down

0 comments on commit c061329

Please sign in to comment.