Merge pull request #72 from bukosabino/fixing-boja-bopv
Standardization in the BOJA and BOPV bulletins
ruben-quilez authored Mar 6, 2024
2 parents a84ffa3 + 1bc1e7d commit 4b9957f
Showing 7 changed files with 54 additions and 71 deletions.
1 change: 0 additions & 1 deletion src/etls/boja/load.py
@@ -1,5 +1,4 @@
from datetime import date, datetime
import json

import typer

6 changes: 1 addition & 5 deletions src/etls/boja/metadata.py
@@ -1,10 +1,6 @@
import typing as tp
from datetime import datetime
from typing import Optional

from pydantic import BaseModel, validator, Field
import re

from src.etls.common.metadata import MetadataDocument


@@ -21,7 +17,7 @@ class BOJAMetadataDocument(MetadataDocument):
source_type: str = "Boletin"

# Metadatos

identificador: str
departamento: str
tipologia: str

36 changes: 16 additions & 20 deletions src/etls/boja/scrapper.py
@@ -1,12 +1,12 @@
import logging as lg
import tempfile
import typing as tp
from datetime import date, datetime
from datetime import date
import re

from src.etls.boja.metadata import BOJAMetadataDocument
from src.etls.common.scrapper import BaseScrapper
from src.etls.common.utils import ScrapeError, HTTPRequester
from src.etls.common.utils import ScrapperError, HTTPRequester
from src.etls.boja.utils import mes_a_numero, clean_text
from src.initialize import initialize_logging

@@ -41,8 +41,7 @@ def extract_bojas_from_extraordinary(url):

return urls_bojas
except Exception as e:
print(f"Error inesperado: {e}")
raise
raise Exception(f"Error inesperado: {e}")

@staticmethod
def find_disposiciones(url_boletin):
@@ -65,10 +64,9 @@ def find_disposiciones(url_boletin):
for enlace_final in enlaces_intermedios:
enlaces_finales.append(enlace_final.get('href'))
else:
raise ScrapeError("No se encontró el listado ordenado con las clases especificadas.")
raise ScrapperError("No se encontró el listado ordenado con las clases especificadas.")
except Exception as e:
print(f"Error inesperado: {e}")
raise
raise Exception(f"Error inesperado: {e}")
return enlaces_finales

def _get_summary_link_from_date(self, fecha_busqueda):
@@ -105,8 +103,7 @@ def _get_summary_link_from_date(self, fecha_busqueda):
"extraordinario": False
}]
except Exception as e:
print(f"Error inesperado: {e}")
raise
raise Exception(f"Error inesperado: {e}")

def download_day(self, day: date) -> tp.List[BOJAMetadataDocument]:
"""Download all the documents for a specific date."""
@@ -123,20 +120,19 @@ def download_day(self, day: date) -> tp.List[BOJAMetadataDocument]:
document_data = self.download_document(disposicion)
if document_data:
disposition_summary = {
"url_boletin":boletin['url'],
"url_html":disposicion,
"fecha_disposicion": day.strftime("%Y-%m-%d"),
"anio": str(day.year),
"mes": str(day.month),
"dia": str(day.day),
"url_boletin": boletin['url'],
"url_html": disposicion,
"fecha_disposicion": day.strftime("%Y-%m-%d"),
"anio": str(day.year),
"mes": str(day.month),
"dia": str(day.day),
}
for atributo, valor in disposition_summary.items():
setattr(document_data, atributo, valor)
disposiciones.append(document_data)
return disposiciones
except Exception as e:
print(f"Error inesperado descargando dia {day}: {e}")
raise
raise Exception(f"Error inesperado descargando dia {day}: {e}")

def download_document(self, url: str) -> BOJAMetadataDocument:
"""
@@ -155,7 +151,7 @@ def download_document(self, url: str) -> BOJAMetadataDocument:
cuerpo = soup.find(id="cuerpo", class_="grid_11 contenidos_nivel3 boja_disposicion")
cabecera = soup.find(class_="punteado_izquierda cabecera_detalle_disposicion")
if not cabecera or not cuerpo:
raise ScrapeError("No se pudo encontrar la cabecera o el cuerpo del documento")
raise ScrapperError("No se pudo encontrar la cabecera o el cuerpo del documento")
h2 = cabecera.find('h2')
h5 = cabecera.find('h5')
h3 = cabecera.find('h3')
@@ -180,12 +176,12 @@ def download_document(self, url: str) -> BOJAMetadataDocument:
fn.write(text_cleaned)
logger.info("Scrapped document successfully %s", url)
metadata_doc = BOJAMetadataDocument(**{ "filepath": fn.name,
"identificador": '/'.join(url.split("/")[-3:]),
"titulo": titulo,
"departamento": clean_text(organo_disposicion),
"url_pdf": enlace_pdf,
"tipologia": re.sub(r"^\d+\.\s*", "", tipo_disposicion),
})
return metadata_doc
except Exception as e:
print(f"Error inesperado procesando el documento {url}: {e}")
raise
raise Exception(f"Error inesperado procesando el documento {url}: {e}")
2 changes: 1 addition & 1 deletion src/etls/bopv/README.md
@@ -1,7 +1,7 @@

# Web principal

[Web principal del BOJA](https://www.euskadi.eus/web01-bopv/es/bopv2/datos/Ultimo.shtml)
[Web principal del BOPV](https://www.euskadi.eus/web01-bopv/es/bopv2/datos/Ultimo.shtml)


# Portal de búsqueda avanzada
10 changes: 2 additions & 8 deletions src/etls/bopv/metadata.py
@@ -1,15 +1,9 @@
import typing as tp
from datetime import datetime
from typing import Optional

from pydantic import BaseModel, validator, Field
import re

from src.etls.common.metadata import MetadataDocument




class BOPVMetadataDocument(MetadataDocument):
"""Class for keeping metadata of a BOPV Document scrapped."""

@@ -21,9 +15,9 @@ class BOPVMetadataDocument(MetadataDocument):
source_type: str = "Boletin"

# Metadatos

identificador: str
departamento: Optional[str] = None
tipologia: str
tipologia: str

# Links
titulo: Optional[str] = None
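After this change both bulletin metadata classes expose the same core fields (identificador, departamento, tipologia) on top of MetadataDocument. A minimal, self-contained sketch of that shared shape, not the project classes themselves; it assumes pydantic is installed and uses made-up example values:

from typing import Optional
from pydantic import BaseModel

class BulletinMetadataSketch(BaseModel):
    source_type: str = "Boletin"
    identificador: str                  # e.g. the last URL path segments
    departamento: Optional[str] = None  # a plain required str on the BOJA side
    tipologia: str

doc = BulletinMetadataSketch(identificador="2024/44/s1", tipologia="Disposiciones Generales")
print(doc.departamento)  # None until the scrapper fills it in
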
26 changes: 11 additions & 15 deletions src/etls/bopv/scrapper.py
@@ -1,18 +1,17 @@
import logging as lg
import tempfile
import typing as tp
from datetime import date, datetime
from datetime import date
import re
import random

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from requests.exceptions import HTTPError

from src.etls.bopv.metadata import BOPVMetadataDocument
from src.etls.common.scrapper import BaseScrapper
from src.etls.common.utils import ScrapeError
from src.etls.common.utils import ScrapperError
from src.initialize import initialize_logging


@@ -79,7 +78,7 @@ def _get_summary_link_from_date(self, requested_date: date):
except requests.HTTPError as err:
raise ValueError(f"Error en la solicitud HTTP: {err}")
except ValueError as err:
raise
raise ValueError(f"Error en la solicitud HTTP: {err}")

def download_day(self, day: date) -> tp.List[BOPVMetadataDocument]:
"""Download all the documents for a specific date."""
@@ -99,7 +98,7 @@ def download_day(self, day: date) -> tp.List[BOPVMetadataDocument]:
for block in txt_blocks:
titulo = block.find('p', class_='BOPVSumarioTitulo')
if not titulo or not titulo.find('a'):
raise ScrapeError("No se pudo encontrar el título o el enlace en uno de los bloques.")
raise ScrapperError("No se pudo encontrar el título o el enlace en uno de los bloques.")
href = titulo.find('a')['href']
url_disposicion = summary_link.rsplit('/', 1)[0] + '/' + href
document_data = self.download_document(url_disposicion)
@@ -118,11 +117,9 @@ def download_day(self, day: date) -> tp.List[BOPVMetadataDocument]:
disposiciones.append(document_data)
return disposiciones
except requests.exceptions.RequestException as e:
print(f"Error de red o HTTP al intentar acceder a {summary_link}: {e}")
raise
raise Exception(f"Error de red o HTTP al intentar acceder a {summary_link}: {e}")
except Exception as e:
print(f"Error inesperado: {e}")
raise
raise Exception(f"Error inesperado: {e}")

def download_document(self, url: str) -> BOPVMetadataDocument:
"""
@@ -142,7 +139,7 @@ def download_document(self, url: str) -> BOPVMetadataDocument:
soup = BeautifulSoup(response.content, "html.parser")
seccion_tag = soup.find("h4", class_="BOPVSeccion")
if not seccion_tag:
raise ScrapeError("No se pudo encontrar la sección requerida.")
raise ScrapperError("No se pudo encontrar la sección requerida.")

seccion_text = seccion_tag.get_text(strip=True).upper()
if seccion_text not in ['DISPOSICIONES GENERALES', 'OTRAS DISPOSICIONES']:
@@ -153,7 +150,7 @@ def download_document(self, url: str) -> BOPVMetadataDocument:
pdf_link_tag = soup.find("li", class_="formatoPdf").find('a')

if not organismo_tag or not content_block or not pdf_link_tag:
raise ScrapeError("No se pudo encontrar algunos de los elementos requeridos.")
raise ScrapperError("No se pudo encontrar algunos de los elementos requeridos.")

organismo = organismo_tag.get_text(strip=True) if organismo_tag else ""
base_url = url.rsplit('/', 1)[0] + '/'
@@ -169,6 +166,7 @@ def download_document(self, url: str) -> BOPVMetadataDocument:
text_cleaned = clean_text(content)
fn.write(text_cleaned)
metadata_doc = BOPVMetadataDocument(**{"filepath": fn.name,
"identificador": '/'.join(url.split('.')[-2].split("/")[-3:]),
"departamento": organismo,
"url_pdf": pdf_url,
"tipologia": tipologia,
@@ -177,8 +175,6 @@ def download_document(self, url: str) -> BOPVMetadataDocument:
return metadata_doc

except requests.exceptions.RequestException as e:
print(f"Error de red o HTTP al intentar acceder a {url}: {e}")
raise
raise Exception(f"Error de red o HTTP al intentar acceder a {url}: {e}")
except Exception as e:
print(f"Error inesperado procesando el documento {url}: {e}")
raise
raise Exception(f"Error de red o HTTP al intentar acceder a {url}: {e}")
44 changes: 23 additions & 21 deletions src/etls/common/utils.py
@@ -28,35 +28,38 @@ def load(self) -> tp.List[Document]:
text = f.read()
return [Document(page_content=text, metadata=self.metadata)]

class ScrapeError(Exception):

class ScrapperError(Exception):
"""
Excepción personalizada para errores de scrapeo.
Custom exception for scraping errors.
"""

def __init__(self, message="Error durante el proceso de scraping", *args, **kwargs):
"""
Inicializa la excepción con un mensaje de error personalizado.
Initializes the exception with a custom error message.
:param message: Mensaje de error que describe el fallo.
:param args: Argumentos posicionales adicionales.
:param kwargs: Argumentos de palabra clave adicionales.
:param message: Error message describing the failure.
:param args: Additional positional arguments.
:param kwargs: Additional keyword arguments.
"""
super().__init__(message, *args, **kwargs)
self.message = message

def __str__(self):
"""
Devuelve una representación en string de la excepción, que incluye el mensaje de error.
Returns a string representation of the exception, including the error message.
"""
return f"ScrapeError: {self.message}"

return f"ScrapperError: {self.message}"


class HTTPRequestException(Exception):
"""
Excepción para errores ocurridos durante las solicitudes HTTP realizadas por HTTPRequester.
Exception for errors occurring during HTTP requests made by HTTPRequester.
"""
def __init__(self, message="Error en la solicitud HTTP", *args):
def __init__(self, message="HTTP request error", *args):
super().__init__(message, *args)



class HTTPRequester:
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
@@ -77,32 +80,31 @@ class HTTPRequester:
@classmethod
def get_random_user_agent(cls):
"""
Selecciona y devuelve un User-Agent aleatorio de la lista de user_agents.
Selects and returns a random User-Agent from the list of user_agents.
"""
return random.choice(cls.user_agents)

@classmethod
def get_headers(cls):
"""
Genera y devuelve headers incluyendo un User-Agent aleatorio.
Generates and returns headers including a random User-Agent.
"""
headers = cls.default_headers.copy()
headers["User-Agent"] = cls.get_random_user_agent()
return headers

@staticmethod
def get_soup(url, timeout=10):
def get_soup(url, timeout=10, markup='html.parser'):
"""
Realiza una solicitud HTTP GET al URL proporcionado, utilizando headers aleatorios,
y devuelve un objeto BeautifulSoup si la respuesta es exitosa. Si hay un error
o se supera el tiempo de espera, lanza HTTPRequestException.
Performs an HTTP GET request to the provided URL, using random headers, and returns a BeautifulSoup
object if the response is successful. If there is an error or timeout, it throws HTTPRequestException.
"""
headers = HTTPRequester.get_headers()
try:
response = requests.get(url, headers=headers, timeout=timeout)
response.raise_for_status()
return BeautifulSoup(response.content, 'html.parser')
return BeautifulSoup(response.content, markup)
except Timeout as e:
raise HTTPRequestException(f"La solicitud HTTP excedió el tiempo de espera: {e}")
raise HTTPRequestException(f"HTTP request timed out: {e}")
except requests.RequestException as e:
raise HTTPRequestException(f"Error al realizar la solicitud HTTP: {e}")
raise HTTPRequestException(f"HTTP request failed: {e}")
