From 0292091b238e338b69cc43f7365506fd252f925e Mon Sep 17 00:00:00 2001 From: Marcel van der Veldt Date: Thu, 17 Oct 2024 12:43:10 +0200 Subject: [PATCH 1/3] fix encoding detection --- music_assistant/server/helpers/audio.py | 12 +++++++----- music_assistant/server/helpers/util.py | 9 +++++++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/music_assistant/server/helpers/audio.py b/music_assistant/server/helpers/audio.py index eda1fbb49..30a05ecb3 100644 --- a/music_assistant/server/helpers/audio.py +++ b/music_assistant/server/helpers/audio.py @@ -46,7 +46,7 @@ from .process import AsyncProcess, check_output, communicate from .tags import parse_tags from .throttle_retry import BYPASS_THROTTLER -from .util import TimedAsyncGenerator, create_tempfile +from .util import TimedAsyncGenerator, create_tempfile, detect_charset if TYPE_CHECKING: from music_assistant.common.models.player_queue import QueueItem @@ -603,8 +603,9 @@ async def get_hls_radio_stream( substream_url, headers=HTTP_HEADERS, timeout=timeout ) as resp: resp.raise_for_status() - charset = resp.charset or "utf-8" - substream_m3u_data = await resp.text(charset) + raw_data = await resp.read() + encoding = resp.charset or await detect_charset(raw_data) + substream_m3u_data = raw_data.decode(encoding) # get chunk-parts from the substream hls_chunks = parse_m3u(substream_m3u_data) chunk_seconds = 0 @@ -679,8 +680,9 @@ async def get_hls_substream( url, allow_redirects=True, headers=HTTP_HEADERS, timeout=timeout ) as resp: resp.raise_for_status() - charset = resp.charset or "utf-8" - master_m3u_data = await resp.text(charset) + raw_data = await resp.read() + encoding = resp.charset or await detect_charset(raw_data) + master_m3u_data = raw_data.decode(encoding) substreams = parse_m3u(master_m3u_data) if any(x for x in substreams if x.length and not x.key): # this is already a substream! diff --git a/music_assistant/server/helpers/util.py b/music_assistant/server/helpers/util.py index 95896f6d9..6cdde467b 100644 --- a/music_assistant/server/helpers/util.py +++ b/music_assistant/server/helpers/util.py @@ -19,6 +19,7 @@ from types import TracebackType from typing import TYPE_CHECKING, Any, ParamSpec, Self, TypeVar +import cchardet as chardet import ifaddr import memory_tempfile from zeroconf import IPVersion @@ -181,6 +182,14 @@ async def close_async_generator(agen: AsyncGenerator[Any, None]) -> None: await agen.aclose() +async def detect_charset(data: bytes, fallback="utf-8") -> str: + """Detect charset of raw data.""" + try: + return (await asyncio.to_thread(chardet.detect, data))["encoding"] + except (ImportError, AttributeError): + return fallback + + class TaskManager: """ Helper class to run many tasks at once. From 20cc3da09073ea209b6507f7357f2c768b692725 Mon Sep 17 00:00:00 2001 From: Marcel van der Veldt Date: Thu, 17 Oct 2024 12:54:59 +0200 Subject: [PATCH 2/3] small tweak to encoding detect --- music_assistant/server/helpers/util.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/music_assistant/server/helpers/util.py b/music_assistant/server/helpers/util.py index 6cdde467b..34ef1610d 100644 --- a/music_assistant/server/helpers/util.py +++ b/music_assistant/server/helpers/util.py @@ -185,9 +185,12 @@ async def close_async_generator(agen: AsyncGenerator[Any, None]) -> None: async def detect_charset(data: bytes, fallback="utf-8") -> str: """Detect charset of raw data.""" try: - return (await asyncio.to_thread(chardet.detect, data))["encoding"] - except (ImportError, AttributeError): - return fallback + detected = await asyncio.to_thread(chardet.detect, data) + if detected and detected["encoding"] and detected["confidence"] > 0.75: + return detected["encoding"] + except Exception as err: + LOGGER.debug("Failed to detect charset: %s", err) + return fallback class TaskManager: From 04c063a474fc7ffef6047afab33f05d33da1a86a Mon Sep 17 00:00:00 2001 From: Marcel van der Veldt Date: Thu, 17 Oct 2024 15:58:29 +0200 Subject: [PATCH 3/3] Always detect charset for m3u playlists --- music_assistant/server/helpers/audio.py | 6 ++++-- music_assistant/server/helpers/playlists.py | 9 ++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/music_assistant/server/helpers/audio.py b/music_assistant/server/helpers/audio.py index 30a05ecb3..cce84fafc 100644 --- a/music_assistant/server/helpers/audio.py +++ b/music_assistant/server/helpers/audio.py @@ -604,7 +604,8 @@ async def get_hls_radio_stream( ) as resp: resp.raise_for_status() raw_data = await resp.read() - encoding = resp.charset or await detect_charset(raw_data) + # NOTE: using resp.charset is not reliable, we need to detect it ourselves + encoding = await detect_charset(raw_data) substream_m3u_data = raw_data.decode(encoding) # get chunk-parts from the substream hls_chunks = parse_m3u(substream_m3u_data) @@ -681,7 +682,8 @@ async def get_hls_substream( ) as resp: resp.raise_for_status() raw_data = await resp.read() - encoding = resp.charset or await detect_charset(raw_data) + # NOTE: using resp.charset is not reliable, we need to detect it ourselves + encoding = await detect_charset(raw_data) master_m3u_data = raw_data.decode(encoding) substreams = parse_m3u(master_m3u_data) if any(x for x in substreams if x.length and not x.key): diff --git a/music_assistant/server/helpers/playlists.py b/music_assistant/server/helpers/playlists.py index 8986de7fe..a4b14acde 100644 --- a/music_assistant/server/helpers/playlists.py +++ b/music_assistant/server/helpers/playlists.py @@ -11,6 +11,7 @@ from aiohttp import client_exceptions from music_assistant.common.models.errors import InvalidDataError +from music_assistant.server.helpers.util import detect_charset if TYPE_CHECKING: from music_assistant.server import MusicAssistant @@ -146,10 +147,12 @@ async def fetch_playlist(mass: MusicAssistant, url: str) -> list[PlaylistItem]: """Parse an online m3u or pls playlist.""" try: async with mass.http_session.get(url, allow_redirects=True, timeout=5) as resp: - charset = resp.charset or "utf-8" try: - playlist_data = (await resp.content.read(64 * 1024)).decode(charset) - except ValueError as err: + raw_data = await resp.content.read(64 * 1024) + # NOTE: using resp.charset is not reliable, we need to detect it ourselves + encoding = await detect_charset(raw_data) + playlist_data = raw_data.decode(encoding, errors="replace") + except (ValueError, UnicodeDecodeError) as err: msg = f"Could not decode playlist {url}" raise InvalidDataError(msg) from err except TimeoutError as err: