diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 3c99bfb8a4..e6ab217c5b 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -25,11 +25,15 @@ import struct import unicodedata import warnings -from functools import partial -from typing import TYPE_CHECKING, ClassVar +from contextlib import suppress +from dataclasses import dataclass +from functools import cached_property, partial, total_ordering +from http import HTTPStatus +from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator from urllib.parse import quote, urlencode import requests +from typing_extensions import TypedDict from unidecode import unidecode import beets @@ -59,6 +63,7 @@ TAG_RE = re.compile(r"<[^>]*>") BREAK_RE = re.compile(r"\n?\s*]*)*>\s*\n?", re.I) USER_AGENT = f"beets/{beets.__version__}" +INSTRUMENTAL_LYRICS = "[Instrumental]" # The content for the base index.rst generated in ReST mode. REST_INDEX_TEMPLATE = """Lyrics @@ -96,6 +101,10 @@ """ +class NotFoundError(requests.exceptions.HTTPError): + pass + + # Utilities. @@ -266,37 +275,151 @@ def fetch( raise NotImplementedError +class LRCLibItem(TypedDict): + """Lyrics data item returned by the LRCLib API.""" + + id: int + name: str + trackName: str + artistName: str + albumName: str + duration: float | None + instrumental: bool + plainLyrics: str + syncedLyrics: str | None + + +@dataclass +@total_ordering +class LRCLyrics: + #: Percentage tolerance for max duration difference between lyrics and item. + DURATION_DIFF_TOLERANCE = 0.05 + + target_duration: float + duration: float + instrumental: bool + plain: str + synced: str | None + + def __le__(self, other: LRCLyrics) -> bool: + """Compare two lyrics items by their score.""" + return self.dist < other.dist + + @classmethod + def make(cls, candidate: LRCLibItem, target_duration: float) -> LRCLyrics: + return cls( + target_duration, + candidate["duration"] or 0.0, + candidate["instrumental"], + candidate["plainLyrics"], + candidate["syncedLyrics"], + ) + + @cached_property + def duration_dist(self) -> float: + """Return the absolute difference between lyrics and target duration.""" + return abs(self.duration - self.target_duration) + + @cached_property + def is_valid(self) -> bool: + """Return whether the lyrics item is valid. + Lyrics duration must be within the tolerance defined by + :attr:`DURATION_DIFF_TOLERANCE`. + """ + return ( + self.duration_dist + <= self.target_duration * self.DURATION_DIFF_TOLERANCE + ) + + @cached_property + def dist(self) -> tuple[float, bool]: + """Distance/score of the given lyrics item. + + Return a tuple with the following values: + 1. Absolute difference between lyrics and target duration + 2. Boolean telling whether synced lyrics are available. + + Best lyrics match is the one that has the closest duration to + ``target_duration`` and has synced lyrics available. + """ + return self.duration_dist, not self.synced + + def get_text(self, want_synced: bool) -> str: + if self.instrumental: + return INSTRUMENTAL_LYRICS + + if want_synced and self.synced: + return "\n".join(map(str.strip, self.synced.splitlines())) + + return self.plain + + class LRCLib(Backend): - base_url = "https://lrclib.net/api/get" + """Fetch lyrics from the LRCLib API.""" - def fetch( + BASE_URL = "https://lrclib.net/api" + GET_URL = f"{BASE_URL}/get" + SEARCH_URL = f"{BASE_URL}/search" + + def warn(self, message: str, *args) -> None: + """Log a warning message with the class name.""" + self._log.warning(f"{self.__class__.__name__}: {message}", *args) + + def fetch_json(self, *args, **kwargs): + """Wrap the request method to raise an exception on HTTP errors.""" + kwargs.setdefault("timeout", 10) + kwargs.setdefault("headers", {"User-Agent": USER_AGENT}) + r = requests.get(*args, **kwargs) + if r.status_code == HTTPStatus.NOT_FOUND: + raise NotFoundError("HTTP Error: Not Found", response=r) + r.raise_for_status() + + return r.json() + + def fetch_candidates( self, artist: str, title: str, album: str, length: int - ) -> str | None: - params: dict[str, str | int] = { - "artist_name": artist, - "track_name": title, - } + ) -> Iterator[list[LRCLibItem]]: + """Yield lyrics candidates for the given song data. + + Firstly, attempt to GET lyrics directly, and then search the API if + lyrics are not found or the duration does not match. + + Return an iterator over lists of candidates. + """ + base_params = {"artist_name": artist, "track_name": title} + get_params = {**base_params, "duration": length} if album: - params["album_name"] = album + get_params["album_name"] = album - if length: - params["duration"] = length + with suppress(NotFoundError): + yield [self.fetch_json(self.GET_URL, params=get_params)] - try: - response = requests.get( - self.base_url, - params=params, - timeout=10, - ) - data = response.json() - except (requests.RequestException, json.decoder.JSONDecodeError) as exc: - self._log.debug("LRCLib API request failed: {0}", exc) - return None + yield self.fetch_json(self.SEARCH_URL, params=base_params) - if self.config["synced"]: - return data.get("syncedLyrics") + @classmethod + def pick_best_match(cls, lyrics: Iterable[LRCLyrics]) -> LRCLyrics | None: + """Return best matching lyrics item from the given list.""" + return min((li for li in lyrics if li.is_valid), default=None) - return data.get("plainLyrics") + def fetch( + self, artist: str, title: str, album: str, length: int + ) -> str | None: + """Fetch lyrics text for the given song data.""" + fetch = partial(self.fetch_candidates, artist, title, album, length) + make = partial(LRCLyrics.make, target_duration=length) + pick = self.pick_best_match + try: + return next( + filter(None, map(pick, (map(make, x) for x in fetch()))) + ).get_text(self.config["synced"]) + except StopIteration: + pass + except requests.JSONDecodeError: + self.warn("Could not decode response JSON data") + except requests.RequestException as exc: + self.warn("Request error: {}", exc) + + return None class DirectBackend(Backend): @@ -478,7 +601,7 @@ def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup): string="This song is an instrumental", ): self._log.debug("Detected instrumental") - return "[Instrumental]" + return INSTRUMENTAL_LYRICS else: self._log.debug("Couldn't scrape page using known layouts") return None @@ -725,7 +848,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None: class LyricsPlugin(plugins.BeetsPlugin): - SOURCES = ["google", "musixmatch", "genius", "tekstowo", "lrclib"] + SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"] SOURCE_BACKENDS = { "google": Google, "musixmatch": MusiXmatch, diff --git a/docs/changelog.rst b/docs/changelog.rst index a9f4bc2169..edc051a7b3 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -58,6 +58,13 @@ Bug fixes: * :doc:`plugins/lyrics`: Do not attempt to search for lyrics if either the artist or title is missing and ignore ``artist_sort`` value if it is empty. :bug:`2635` +* :doc:`plugins/lyrics`: Fix fetching lyrics from ``lrclib`` source. If we + cannot find lyrics for a specific album, artist, title combination, the + plugin now tries to search for the artist and title and picks the most + relevant result. Update the default ``sources`` configuration to prioritize + ``lrclib`` over other sources since it returns reliable results quicker than + others. + :bug:`5102` For packagers: diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst index 4939476938..d1f434d70f 100644 --- a/docs/plugins/lyrics.rst +++ b/docs/plugins/lyrics.rst @@ -56,7 +56,7 @@ configuration file. The available options are: sources known to be scrapeable. - **sources**: List of sources to search for lyrics. An asterisk ``*`` expands to all available sources. - Default: ``google genius tekstowo lrclib``, i.e., all the available sources. The + Default: ``lrclib google genius tekstowo``, i.e., all the available sources. The ``google`` source will be automatically deactivated if no ``google_API_key`` is setup. The ``google``, ``genius``, and ``tekstowo`` sources will only be enabled if diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 99e6f8a4e8..0dee427ec3 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -14,7 +14,9 @@ """Tests for the 'lyrics' plugin.""" +import re from functools import partial +from http import HTTPStatus import pytest @@ -228,7 +230,7 @@ def _patch_google_search(self, requests_mock, lyrics_page): def test_backend_source(self, lyrics_plugin, lyrics_page: LyricsPage): """Test parsed lyrics from each of the configured lyrics pages.""" lyrics = lyrics_plugin.get_lyrics( - lyrics_page.artist, lyrics_page.track_title, "", 0 + lyrics_page.artist, lyrics_page.track_title, "", 186 ) assert lyrics @@ -340,32 +342,41 @@ def test_scrape(self, backend, lyrics_html, expecting_lyrics): assert bool(backend.extract_lyrics(lyrics_html)) == expecting_lyrics +LYRICS_DURATION = 950 + + +def lyrics_match(**overrides): + return { + "instrumental": False, + "duration": LYRICS_DURATION, + "syncedLyrics": "synced", + "plainLyrics": "plain", + **overrides, + } + + class TestLRCLibLyrics(LyricsBackendTest): + ITEM_DURATION = 999 + @pytest.fixture(scope="class") def backend_name(self): return "lrclib" @pytest.fixture - def fetch_lyrics(self, backend, requests_mock, response_data): - requests_mock.get(lyrics.LRCLib.base_url, json=response_data) + def request_kwargs(self, response_data): + return {"json": response_data} - return partial(backend.fetch, "la", "la", "la", 0) + @pytest.fixture + def fetch_lyrics(self, backend, requests_mock, request_kwargs): + requests_mock.get(backend.GET_URL, status_code=HTTPStatus.NOT_FOUND) + requests_mock.get(backend.SEARCH_URL, **request_kwargs) - @pytest.mark.parametrize( - "response_data", - [ - { - "syncedLyrics": "[00:00.00] la la la", - "plainLyrics": "la la la", - } - ], - ) + return partial(backend.fetch, "la", "la", "la", self.ITEM_DURATION) + + @pytest.mark.parametrize("response_data", [[lyrics_match()]]) @pytest.mark.parametrize( "plugin_config, expected_lyrics", - [ - ({"synced": True}, "[00:00.00] la la la"), - ({"synced": False}, "la la la"), - ], + [({"synced": True}, "synced"), ({"synced": False}, "plain")], ) def test_synced_config_option(self, fetch_lyrics, expected_lyrics): assert fetch_lyrics() == expected_lyrics @@ -373,21 +384,62 @@ def test_synced_config_option(self, fetch_lyrics, expected_lyrics): @pytest.mark.parametrize( "response_data, expected_lyrics", [ + pytest.param([], None, id="handle non-matching lyrics"), pytest.param( - {"syncedLyrics": "", "plainLyrics": "la la la"}, - "la la la", - id="pick plain lyrics", + [lyrics_match()], + "synced", + id="synced when available", ), pytest.param( - { - "statusCode": 404, - "error": "Not Found", - "message": "Failed to find specified track", - }, + [lyrics_match(duration=1)], None, - id="not found", + id="none: duration too short", + ), + pytest.param( + [lyrics_match(instrumental=True)], + "[Instrumental]", + id="instrumental track", + ), + pytest.param( + [lyrics_match(syncedLyrics=None)], + "plain", + id="plain by default", + ), + pytest.param( + [ + lyrics_match( + duration=ITEM_DURATION, + syncedLyrics=None, + plainLyrics="plain with closer duration", + ), + lyrics_match(syncedLyrics="synced", plainLyrics="plain 2"), + ], + "plain with closer duration", + id="prefer closer duration", + ), + pytest.param( + [lyrics_match(syncedLyrics=None), lyrics_match()], + "synced", + id="prefer match with synced lyrics", ), ], ) + @pytest.mark.parametrize("plugin_config", [{"synced": True}]) def test_fetch_lyrics(self, fetch_lyrics, expected_lyrics): assert fetch_lyrics() == expected_lyrics + + @pytest.mark.parametrize( + "request_kwargs, expected_log_match", + [ + ( + {"status_code": HTTPStatus.BAD_GATEWAY}, + r"LRCLib: Request error: 502", + ), + ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"), + ], + ) + def test_error(self, caplog, fetch_lyrics, expected_log_match): + assert fetch_lyrics() is None + assert caplog.messages + assert (last_log := caplog.messages[-1]) + assert re.search(expected_log_match, last_log, re.I)