From d055b4530ec27b01d46c7b74d92d0a8d1c48717f Mon Sep 17 00:00:00 2001 From: Ben Wiederhake Date: Fri, 24 May 2024 00:02:09 +0200 Subject: [PATCH] Correctly split three-or-more byte sequences of UTF-8 (#2123) --- bridge/helper/helper.go | 14 ++++++++++++-- bridge/helper/helper_test.go | 9 +++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/bridge/helper/helper.go b/bridge/helper/helper.go index 0208dff1bd..d6488af66a 100644 --- a/bridge/helper/helper.go +++ b/bridge/helper/helper.go @@ -211,8 +211,18 @@ func ClipMessage(text string, length int, clippingMessage string) string { if len(text) > length { text = text[:length-len(clippingMessage)] - if r, size := utf8.DecodeLastRuneInString(text); r == utf8.RuneError { - text = text[:len(text)-size] + for len(text) > 0 { + if r, _ := utf8.DecodeLastRuneInString(text); r == utf8.RuneError { + text = text[:len(text)-1] + // Note: for an invalid encoding, DecodeLastRuneInString always reports a + // size of 1, so only one byte is dropped here. We do not yet know whether + // the last rune is now valid. Example: "€" is 0xE2 0x82 0xAC. If we happen + // to split the string just before 0xAC and go back only one byte, that + // leaves a string ending in the byte 0xE2, which is still not a valid + // rune, so we need to try again. 
+ } else { + break + } } text += clippingMessage } diff --git a/bridge/helper/helper_test.go b/bridge/helper/helper_test.go index 76e548e487..f21a4bda8e 100644 --- a/bridge/helper/helper_test.go +++ b/bridge/helper/helper_test.go @@ -88,6 +88,15 @@ var lineSplittingTestCases = map[string]struct { }, nonSplitOutput: []string{"不布人個我此而及單石業喜資富下我河下日沒一我臺空達的常景便物沒為……子大我別名解成?生賣的全直黑,我自我結毛分洲了世當,是政福那是東;斯說"}, }, + "Long message, clip three-byte rune after two bytes": { + input: "x 人人生而自由,在尊嚴和權利上一律平等。 他們都具有理性和良知,應該以兄弟情誼的精神對待彼此。", + splitOutput: []string{ + "x 人人生而自由,在尊嚴和權利上 ", + "一律平等。 他們都具有理性和良知 ", + ",應該以兄弟情誼的精神對待彼此。", + }, + nonSplitOutput: []string{"x 人人生而自由,在尊嚴和權利上一律平等。 他們都具有理性和良知,應該以兄弟情誼的精神對待彼此。"}, + }, } func TestGetSubLines(t *testing.T) {