Skip to content

Commit

Permalink
Update Tekstowo backend to fetch lyrics directly
Browse files Browse the repository at this point in the history
- Refactored Tekstowo backend to fetch lyrics directly from song pages.
- Added `encode` method to convert artist and title to their URL format,
  where non-alphanumeric characters are replaced with underscores.
- Removed the now redundant search functionality and associated tests.
- Simplified `extract_lyrics` method to parse lyrics directly, dropping
  the artist/title distance-threshold checks.
  • Loading branch information
snejus committed Oct 12, 2024
1 parent 9d2b34d commit d3955ba
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 1,252 deletions.
90 changes: 18 additions & 72 deletions beetsplug/lyrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

"""Fetches, embeds, and displays lyrics."""

from __future__ import annotations

import difflib
import errno
import itertools
Expand All @@ -23,6 +25,7 @@
import struct
import unicodedata
import warnings
from functools import partial
from typing import ClassVar
from urllib.parse import quote, urlencode

Expand All @@ -47,7 +50,6 @@

import beets
from beets import plugins, ui
from beets.autotag.hooks import string_dist

DIV_RE = re.compile(r"<(/?)div>?", re.I)
COMMENT_RE = re.compile(r"<!--.*-->", re.S)
Expand Down Expand Up @@ -288,7 +290,7 @@ class DirectBackend(Backend):
@classmethod
def encode(cls, text: str) -> str:
"""Encode the string for inclusion in a URL."""
return quote(unidecode(text))
raise NotImplementedError

@classmethod
def build_url(cls, *args: str) -> str:
Expand All @@ -312,7 +314,7 @@ def encode(cls, text: str) -> str:
for old, new in cls.REPLACEMENTS.items():
text = re.sub(old, new, text)

return super().encode(text)
return quote(unidecode(text))

def fetch(self, artist, title, album=None, length=None):
url = self.build_url(artist, title)
Expand Down Expand Up @@ -485,86 +487,30 @@ class Tekstowo(DirectBackend):
"""Fetch lyrics from Tekstowo.pl."""

REQUIRES_BS = True
BASE_URL = "http://www.tekstowo.pl"
URL_TEMPLATE = BASE_URL + "/wyszukaj.html?search-title={}&search-artist={}"

def fetch(self, artist, title, album=None, length=None):
url = self.build_url(title, artist)
search_results = self.fetch_url(url)
if not search_results:
return None

song_page_url = self.parse_search_results(search_results)
if not song_page_url:
return None

song_page_html = self.fetch_url(song_page_url)
if not song_page_html:
return None

return self.extract_lyrics(song_page_html, artist, title)
URL_TEMPLATE = "https://www.tekstowo.pl/piosenka,{},{}.html"

def parse_search_results(self, html):
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)

soup = try_parse_html(html)
if not soup:
return None

content_div = soup.find("div", class_="content")
if not content_div:
return None

card_div = content_div.find("div", class_="card")
if not card_div:
return None
non_alpha_to_underscore = partial(re.compile(r"\W").sub, "_")

song_rows = card_div.find_all("div", class_="box-przeboje")
if not song_rows:
return None

song_row = song_rows[0]
if not song_row:
return None
# Build the URL slug used in URL_TEMPLATE: lowercase the text, pass it
# through unidecode, then replace every character matched by the regex
# ``\W`` (see non_alpha_to_underscore above) with an underscore.
@classmethod
def encode(cls, text: str) -> str:
return cls.non_alpha_to_underscore(unidecode(text.lower()))

link = song_row.find("a")
if not link:
return None
def fetch(self, artist, title, album=None, length=None):
if html := self.fetch_url(self.build_url(artist, title)):
return self.extract_lyrics(html)

return self.BASE_URL + link.get("href")
return None

def extract_lyrics(self, html, artist, title):
def extract_lyrics(self, html: str) -> str | None:
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)

soup = try_parse_html(html)
if not soup:
return None

info_div = soup.find("div", class_="col-auto")
if not info_div:
return None

info_elements = info_div.find_all("a")
if not info_elements:
return None

html_title = info_elements[-1].get_text()
html_artist = info_elements[-2].get_text()

title_dist = string_dist(html_title, title)
artist_dist = string_dist(html_artist, artist)
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
return lyrics_div.get_text()

thresh = self.config["dist_thresh"].get(float)
if title_dist > thresh or artist_dist > thresh:
return None

lyrics_div = soup.select("div.song-text > div.inner-text")
if not lyrics_div:
return None

return lyrics_div[0].get_text()
return None


def remove_credits(text):
Expand Down
3 changes: 3 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ Bug fixes:
* :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description.
* Remove single quotes from all SQL queries
:bug:`4709`
* :doc:`plugins/lyrics`: Update ``tekstowo`` backend to fetch lyrics directly
since recent updates to their website made it unsearchable.
:bug:`5456`

For packagers:

Expand Down
61 changes: 2 additions & 59 deletions test/plugins/test_lyrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,10 +564,7 @@ def test_good_lyrics(self):
"""Ensure we are able to scrape a page with lyrics"""
url = "https://www.tekstowo.pl/piosenka,24kgoldn,city_of_angels_1.html"
mock = MockFetchUrl()
assert (
tekstowo.extract_lyrics(mock(url), "24kGoldn", "City of Angels")
is not None
)
assert tekstowo.extract_lyrics(mock(url))

def test_no_lyrics(self):
"""Ensure we don't crash when scraping the html for a Tekstowo page
Expand All @@ -578,61 +575,7 @@ def test_no_lyrics(self):
"beethoven_piano_sonata_17_tempest_the_3rd_movement.html"
)
mock = MockFetchUrl()
assert (
tekstowo.extract_lyrics(
mock(url),
"Beethoven",
"Beethoven Piano Sonata 17" "Tempest The 3rd Movement",
)
is None
)

def test_song_no_match(self):
"""Ensure we return None when a song does not match the search query"""
# https://github.com/beetbox/beets/issues/4406
# expected return value None
url = (
"https://www.tekstowo.pl/piosenka,bailey_bigger"
",black_eyed_susan.html"
)
mock = MockFetchUrl()
assert (
tekstowo.extract_lyrics(
mock(url), "Kelly Bailey", "Black Mesa Inbound"
)
is None
)


class TekstowoParseSearchResultsTest(TekstowoBaseTest):
"""tests Tekstowo.parse_search_results()"""

def setUp(self):
"""Set up configuration"""
TekstowoBaseTest.setUp(self)
self.plugin = lyrics.LyricsPlugin()

def test_multiple_results(self):
"""Ensure we are able to scrape a page with multiple search results"""
url = (
"https://www.tekstowo.pl/szukaj,wykonawca,juice+wrld"
",tytul,lucid+dreams.html"
)
mock = MockFetchUrl()
assert (
tekstowo.parse_search_results(mock(url))
== "http://www.tekstowo.pl/piosenka,juice_wrld,"
"lucid_dreams__remix__ft__lil_uzi_vert.html"
)

def test_no_results(self):
"""Ensure we are able to scrape a page with no search results"""
url = (
"https://www.tekstowo.pl/szukaj,wykonawca,"
"agfdgja,tytul,agfdgafg.html"
)
mock = MockFetchUrl()
assert tekstowo.parse_search_results(mock(url)) is None
assert not tekstowo.extract_lyrics(mock(url))


class TekstowoIntegrationTest(TekstowoBaseTest, LyricsAssertions):
Expand Down
Loading

0 comments on commit d3955ba

Please sign in to comment.