Skip to content

Commit

Permalink
Correctly split three-or-more byte sequences of UTF-8 (#2123)
Browse files Browse the repository at this point in the history
  • Loading branch information
BenWiederhake authored May 23, 2024
1 parent 6b528ff commit d055b45
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 2 deletions.
14 changes: 12 additions & 2 deletions bridge/helper/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,18 @@ func ClipMessage(text string, length int, clippingMessage string) string {

if len(text) > length {
text = text[:length-len(clippingMessage)]
if r, size := utf8.DecodeLastRuneInString(text); r == utf8.RuneError {
text = text[:len(text)-size]
for len(text) > 0 {
if r, _ := utf8.DecodeLastRuneInString(text); r == utf8.RuneError {
text = text[:len(text)-1]
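// Note: for an invalid encoding, DecodeLastRuneInString always reports a
// size of 1, no matter how many bytes of a truncated sequence remain. That
// is why only a single byte is dropped per iteration, and why we do not yet
// know whether the last rune is now actually valid. Example: "€" is
// 0xE2 0x82 0xAC. If we happen to split the string just before 0xAC, and go
// back only one byte, that would leave us with a string that ends in the
// byte 0xE2, which is not a valid rune, so we need to try again.
// NOTE(review): a correctly encoded U+FFFD also decodes to utf8.RuneError
// (with size 3), so this check strips a genuine trailing U+FFFD as well --
// confirm that this is acceptable.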
} else {
break
}
}
text += clippingMessage
}
Expand Down
9 changes: 9 additions & 0 deletions bridge/helper/helper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,15 @@ var lineSplittingTestCases = map[string]struct {
},
nonSplitOutput: []string{"不布人個我此而及單石業喜資富下我河下日沒一我臺空達的常景便物沒為……子大我別名解成?生賣的全直黑,我自我結毛分洲了世當,是政福那是東;斯說"},
},
"Long message, clip three-byte rune after two bytes": {
input: "x 人人生而自由,在尊嚴和權利上一律平等。 他們都具有理性和良知,應該以兄弟情誼的精神對待彼此。",
splitOutput: []string{
"x 人人生而自由,在尊嚴和權利上 <clipped message>",
"一律平等。 他們都具有理性和良知 <clipped message>",
",應該以兄弟情誼的精神對待彼此。",
},
nonSplitOutput: []string{"x 人人生而自由,在尊嚴和權利上一律平等。 他們都具有理性和良知,應該以兄弟情誼的精神對待彼此。"},
},
}

func TestGetSubLines(t *testing.T) {
Expand Down

0 comments on commit d055b45

Please sign in to comment.