From a89e20f92c6fc78e1f1b090057911f0d4bd8fcaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Thu, 29 Feb 2024 12:53:20 +0100 Subject: [PATCH] chg: Improve debug logging to trace a capture --- playwrightcapture/capture.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/playwrightcapture/capture.py b/playwrightcapture/capture.py index c2549f6..5fc5865 100644 --- a/playwrightcapture/capture.py +++ b/playwrightcapture/capture.py @@ -90,7 +90,7 @@ class Capture(): def __init__(self, browser: BROWSER | None=None, device_name: str | None=None, proxy: str | dict[str, str] | None=None, - general_timeout_in_sec: int | None = None, loglevel: str='INFO'): + general_timeout_in_sec: int | None = None, loglevel: str | int='INFO'): """Captures a page with Playwright. :param browser: The browser to use for the capture. @@ -480,7 +480,6 @@ async def handle_download(download: Download) -> None: async def store_request(request: Request) -> None: # This method is called on each request, to store the body (if it is an image) in a dict indexed by URL try: - self.logger.debug(f'Storing request: {request.url}') if response := await request.response(): if response.ok: try: @@ -491,9 +490,9 @@ async def store_request(request: Request) -> None: self._requests[request.url] = body except PureError: # unable to identify the mimetype - self.logger.debug(f'Unable to identify the mimetype for {request.url}') - except Exception as e: - self.logger.debug(f'Unable to get body for {request.url}: {e}') + pass + except Exception: + pass except Exception as e: self.logger.warning(f'Unable to store request: {e}') @@ -550,9 +549,11 @@ async def store_request(request: Request) -> None: raise initial_error else: await page.bring_to_front() + self.logger.debug('Page moved to front.') # page instrumentation await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded + self.logger.debug('Start instrumentation.') # ==== recaptcha # Same technique as: https://github.com/NikolaiT/uncaptcha3 @@ -573,12 +574,16 @@ async def store_request(request: Request) -> None: # ====== # NOTE: testing # await self.__cloudflare_bypass_attempt(page) + self.logger.debug('Done with captcha.') # check if we have anything on the page. If we don't, the page is not working properly. if await self._failsafe_get_content(page): + self.logger.debug('Got rendered content') # move mouse await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500)) + self.logger.debug('Moved mouse.') await self._safe_wait(page) + self.logger.debug('Keep going after moving mouse.') if parsed_url.fragment: # We got a fragment, make sure we go to it and scroll only a little bit. @@ -587,6 +592,7 @@ async def store_request(request: Request) -> None: await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=5000) await self._safe_wait(page) await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0) + self.logger.debug('Jumped to fragment.') except PlaywrightTimeoutError as e: self.logger.info(f'Unable to go to fragment "{fragment}" (timeout): {e}') except TargetClosedError as e: @@ -598,20 +604,27 @@ async def store_request(request: Request) -> None: try: # NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0) + self.logger.debug('Scrolled down.') except Error as e: self.logger.debug(f'Unable to scroll: {e}') await self._safe_wait(page) + self.logger.debug('Keep going after moving on page.') + try: await page.keyboard.press('PageUp') + self.logger.debug('PageUp on keyboard') await self._safe_wait(page) await page.keyboard.press('PageDown') + self.logger.debug('PageDown on keyboard') except Error as e: self.logger.debug(f'Unable to use keyboard: {e}') + self.logger.debug('Done with instrumentation, waiting for network idle.') await self._safe_wait(page) await self._wait_for_random_timeout(page, 5) # Wait 5 sec after network idle await self._safe_wait(page) + self.logger.debug('Done with instrumentation, done with waiting.') if content := await self._failsafe_get_content(page): to_return['html'] = content