From d3955bac65cd97c8baefe62ebded99f7ba17d396 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Sat, 12 Oct 2024 02:14:18 +0100 Subject: [PATCH] Update Tekstowo backend to fetch lyrics directly - Refactored Tekstowo backend to fetch lyrics directly from song pages. - Added `encode` method to convert artist and title to their URL format, where non-alphanumeric characters are replaced with underscores. - Removed the now redundant search functionality and associated tests. - Simplified `extract_lyrics` method to directly parse lyrics without any checks. --- beetsplug/lyrics.py | 90 +-- docs/changelog.rst | 3 + test/plugins/test_lyrics.py | 61 +- .../szukajwykonawcaagfdgjatytulagfdgafg.txt | 537 ---------------- ...ukajwykonawcajuicewrldtytulluciddreams.txt | 584 ------------------ 5 files changed, 23 insertions(+), 1252 deletions(-) delete mode 100755 test/rsrc/lyrics/tekstowopl/szukajwykonawcaagfdgjatytulagfdgafg.txt delete mode 100755 test/rsrc/lyrics/tekstowopl/szukajwykonawcajuicewrldtytulluciddreams.txt diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 537bfe5150..899bf39921 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -14,6 +14,8 @@ """Fetches, embeds, and displays lyrics.""" +from __future__ import annotations + import difflib import errno import itertools @@ -23,6 +25,7 @@ import struct import unicodedata import warnings +from functools import partial from typing import ClassVar from urllib.parse import quote, urlencode @@ -47,7 +50,6 @@ import beets from beets import plugins, ui -from beets.autotag.hooks import string_dist DIV_RE = re.compile(r"<(/?)div>?", re.I) COMMENT_RE = re.compile(r"", re.S) @@ -288,7 +290,7 @@ class DirectBackend(Backend): @classmethod def encode(cls, text: str) -> str: """Encode the string for inclusion in a URL.""" - return quote(unidecode(text)) + raise NotImplementedError @classmethod def build_url(cls, *args: str) -> str: @@ -312,7 +314,7 @@ def encode(cls, text: str) -> str: for old, new in cls.REPLACEMENTS.items(): text = re.sub(old, new, text) - return super().encode(text) + return quote(unidecode(text)) def fetch(self, artist, title, album=None, length=None): url = self.build_url(artist, title) @@ -485,86 +487,30 @@ class Tekstowo(DirectBackend): """Fetch lyrics from Tekstowo.pl.""" REQUIRES_BS = True - BASE_URL = "http://www.tekstowo.pl" - URL_TEMPLATE = BASE_URL + "/wyszukaj.html?search-title={}&search-artist={}" - - def fetch(self, artist, title, album=None, length=None): - url = self.build_url(title, artist) - search_results = self.fetch_url(url) - if not search_results: - return None - - song_page_url = self.parse_search_results(search_results) - if not song_page_url: - return None - - song_page_html = self.fetch_url(song_page_url) - if not song_page_html: - return None - - return self.extract_lyrics(song_page_html, artist, title) + URL_TEMPLATE = "https://www.tekstowo.pl/piosenka,{},{}.html" - def parse_search_results(self, html): - html = _scrape_strip_cruft(html) - html = _scrape_merge_paragraphs(html) - - soup = try_parse_html(html) - if not soup: - return None - - content_div = soup.find("div", class_="content") - if not content_div: - return None - - card_div = content_div.find("div", class_="card") - if not card_div: - return None + non_alpha_to_underscore = partial(re.compile(r"\W").sub, "_") - song_rows = card_div.find_all("div", class_="box-przeboje") - if not song_rows: - return None - - song_row = song_rows[0] - if not song_row: - return None + @classmethod + def encode(cls, text: str) -> str: + return cls.non_alpha_to_underscore(unidecode(text.lower())) - link = song_row.find("a") - if not link: - return None + def fetch(self, artist, title, album=None, length=None): + if html := self.fetch_url(self.build_url(artist, title)): + return self.extract_lyrics(html) - return self.BASE_URL + link.get("href") + return None - def extract_lyrics(self, html, artist, title): + def extract_lyrics(self, html: str) -> str | None: html = _scrape_strip_cruft(html) html = _scrape_merge_paragraphs(html) soup = try_parse_html(html) - if not soup: - return None - info_div = soup.find("div", class_="col-auto") - if not info_div: - return None - - info_elements = info_div.find_all("a") - if not info_elements: - return None - - html_title = info_elements[-1].get_text() - html_artist = info_elements[-2].get_text() - - title_dist = string_dist(html_title, title) - artist_dist = string_dist(html_artist, artist) + if lyrics_div := soup.select_one("div.song-text > div.inner-text"): + return lyrics_div.get_text() - thresh = self.config["dist_thresh"].get(float) - if title_dist > thresh or artist_dist > thresh: - return None - - lyrics_div = soup.select("div.song-text > div.inner-text") - if not lyrics_div: - return None - - return lyrics_div[0].get_text() + return None def remove_credits(text): diff --git a/docs/changelog.rst b/docs/changelog.rst index a9ed03e422..a2e15dd317 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -44,6 +44,9 @@ Bug fixes: * :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description. * Remove single quotes from all SQL queries :bug:`4709` +* :doc:`plugins/lyrics`: Update ``tekstowo`` backend to fetch lyrics directly + since recent updates to their website made it unsearchable. + :bug:`5456` For packagers: diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 937e0a3cb1..a65ae84827 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -564,10 +564,7 @@ def test_good_lyrics(self): """Ensure we are able to scrape a page with lyrics""" url = "https://www.tekstowo.pl/piosenka,24kgoldn,city_of_angels_1.html" mock = MockFetchUrl() - assert ( - tekstowo.extract_lyrics(mock(url), "24kGoldn", "City of Angels") - is not None - ) + assert tekstowo.extract_lyrics(mock(url)) def test_no_lyrics(self): """Ensure we don't crash when the scraping the html for a Tekstowo page @@ -578,61 +575,7 @@ def test_no_lyrics(self): "beethoven_piano_sonata_17_tempest_the_3rd_movement.html" ) mock = MockFetchUrl() - assert ( - tekstowo.extract_lyrics( - mock(url), - "Beethoven", - "Beethoven Piano Sonata 17" "Tempest The 3rd Movement", - ) - is None - ) - - def test_song_no_match(self): - """Ensure we return None when a song does not match the search query""" - # https://github.com/beetbox/beets/issues/4406 - # expected return value None - url = ( - "https://www.tekstowo.pl/piosenka,bailey_bigger" - ",black_eyed_susan.html" - ) - mock = MockFetchUrl() - assert ( - tekstowo.extract_lyrics( - mock(url), "Kelly Bailey", "Black Mesa Inbound" - ) - is None - ) - - -class TekstowoParseSearchResultsTest(TekstowoBaseTest): - """tests Tekstowo.parse_search_results()""" - - def setUp(self): - """Set up configuration""" - TekstowoBaseTest.setUp(self) - self.plugin = lyrics.LyricsPlugin() - - def test_multiple_results(self): - """Ensure we are able to scrape a page with multiple search results""" - url = ( - "https://www.tekstowo.pl/szukaj,wykonawca,juice+wrld" - ",tytul,lucid+dreams.html" - ) - mock = MockFetchUrl() - assert ( - tekstowo.parse_search_results(mock(url)) - == "http://www.tekstowo.pl/piosenka,juice_wrld," - "lucid_dreams__remix__ft__lil_uzi_vert.html" - ) - - def test_no_results(self): - """Ensure we are able to scrape a page with no search results""" - url = ( - "https://www.tekstowo.pl/szukaj,wykonawca," - "agfdgja,tytul,agfdgafg.html" - ) - mock = MockFetchUrl() - assert tekstowo.parse_search_results(mock(url)) is None + assert not tekstowo.extract_lyrics(mock(url)) class TekstowoIntegrationTest(TekstowoBaseTest, LyricsAssertions): diff --git a/test/rsrc/lyrics/tekstowopl/szukajwykonawcaagfdgjatytulagfdgafg.txt b/test/rsrc/lyrics/tekstowopl/szukajwykonawcaagfdgjatytulagfdgafg.txt deleted file mode 100755 index a137059de4..0000000000 --- a/test/rsrc/lyrics/tekstowopl/szukajwykonawcaagfdgjatytulagfdgafg.txt +++ /dev/null @@ -1,537 +0,0 @@ - - - - - - - - - Wyszukiwarka - teksty piosenek, tłumaczenia piosenek, teledyski na Tekstowo.pl - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- -
-
-
- - - - - - - - -
- -
- - -
-
-
-
- -
- 2 170 872 tekstów, 20 217 poszukiwanych i 376 oczekujących -
-
- -
- - -
- -
- -
- -
- - - - -
- -
-
-
- - -

Znalezione utwory:

-
-
- brak wyników wyszukiwania -
-
- - - - - - -
-
-
- -
-
- -
- -
-
- - -
-
- -
- -
- - -
-
-
- -
-
- 2 170 872 tekstów, 20 217 poszukiwanych i 376 oczekujących -
-
-
-

Największy serwis z tekstami piosenek w Polsce. Każdy może znaleźć u nas teksty piosenek, teledyski oraz tłumaczenia swoich ulubionych utworów.
Zachęcamy wszystkich użytkowników do dodawania nowych tekstów, tłumaczeń i teledysków!

-
- Reklama | - Kontakt | - FAQ - Polityka prywatności -
-
-
- -
- - - -
- -
-
- - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/test/rsrc/lyrics/tekstowopl/szukajwykonawcajuicewrldtytulluciddreams.txt b/test/rsrc/lyrics/tekstowopl/szukajwykonawcajuicewrldtytulluciddreams.txt deleted file mode 100755 index 40e8fa3cb0..0000000000 --- a/test/rsrc/lyrics/tekstowopl/szukajwykonawcajuicewrldtytulluciddreams.txt +++ /dev/null @@ -1,584 +0,0 @@ - - - - - - - - - Wyszukiwarka - teksty piosenek, tłumaczenia piosenek, teledyski na Tekstowo.pl - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- -
-
-
- - - - - - - - -
- -
- - -
-
-
-
- -
- 2 170 872 tekstów, 20 217 poszukiwanych i 376 oczekujących -
-
- -
- - -
- -
- -
- -
- - - - -
- - - -
-
- -
- -
-
- - -
-
- -
- -
- - -
-
-
- -
-
- 2 170 872 tekstów, 20 217 poszukiwanych i 376 oczekujących -
-
-
-

Największy serwis z tekstami piosenek w Polsce. Każdy może znaleźć u nas teksty piosenek, teledyski oraz tłumaczenia swoich ulubionych utworów.
Zachęcamy wszystkich użytkowników do dodawania nowych tekstów, tłumaczeń i teledysków!

-
- Reklama | - Kontakt | - FAQ - Polityka prywatności -
-
-
- -
- - - -
- -
-
- - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file