From 0f2edcfb68e603773047e30d0fc60b85ec714be1 Mon Sep 17 00:00:00 2001 From: krakow10 Date: Thu, 26 Sep 2024 14:36:57 -0700 Subject: [PATCH] Fix "stream did not contain valid UTF-8" using String::from_utf8_lossy (#380) Co-authored-by: Kenneth Loeffler --- rbx_binary/CHANGELOG.md | 1 + rbx_binary/src/deserializer/state.rs | 34 +++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/rbx_binary/CHANGELOG.md b/rbx_binary/CHANGELOG.md index 15cebfe0d..f237324e7 100644 --- a/rbx_binary/CHANGELOG.md +++ b/rbx_binary/CHANGELOG.md @@ -3,6 +3,7 @@ ## Unreleased * Added the ability to specify what type of compression to use for serializing. This takes the form of `Serializer::compression_type`. ([#446]) * Added support for ZSTD compressed files ([#446]) +* Implicit lossy conversion of non-UTF-8 `Instance.Name` and `*Script.Source` properties when decoding. The previous behaviour was returning an error. ([#380]) [#446]: https://github.com/rojo-rbx/rbx-dom/pull/446 diff --git a/rbx_binary/src/deserializer/state.rs b/rbx_binary/src/deserializer/state.rs index c2876606c..19089bd34 100644 --- a/rbx_binary/src/deserializer/state.rs +++ b/rbx_binary/src/deserializer/state.rs @@ -1,4 +1,5 @@ use std::{ + borrow::Cow, collections::{HashMap, HashSet, VecDeque}, convert::TryInto, io::Read, @@ -374,7 +375,20 @@ impl<'db, R: Read> DeserializerState<'db, R> { for referent in &type_info.referents { let instance = self.instances_by_ref.get_mut(referent).unwrap(); - let value = chunk.read_string()?; + let binary_string = chunk.read_binary_string()?; + let value = match std::str::from_utf8(&binary_string) { + Ok(value) => Cow::Borrowed(value), + Err(_) => { + log::warn!( + "Performing lossy string conversion on property {}.{} because it did not contain UTF-8. +This may cause unexpected or broken behavior in your final results if you rely on this property being non UTF-8.", + type_info.type_name, + prop_name + ); + + String::from_utf8_lossy(binary_string.as_ref()) + } + }; instance.builder.set_name(value); } @@ -399,8 +413,22 @@ impl<'db, R: Read> DeserializerState<'db, R> { VariantType::String => { for referent in &type_info.referents { let instance = self.instances_by_ref.get_mut(referent).unwrap(); - let value = chunk.read_string()?; - add_property(instance, &property, value.into()); + let binary_string = chunk.read_binary_string()?; + let value = match std::str::from_utf8(&binary_string) { + Ok(value) => Cow::Borrowed(value), + Err(_) => { + log::warn!( + "Performing lossy string conversion on property {}.{} because it did not contain UTF-8. +This may cause unexpected or broken behavior in your final results if you rely on this property being non UTF-8.", + type_info.type_name, + property.name + ); + + String::from_utf8_lossy(&binary_string) + } + }; + + add_property(instance, &property, value.as_ref().into()); } } VariantType::Content => {