From b16213cb38ae6dad7f79969fcf6c4f886d6f0776 Mon Sep 17 00:00:00 2001
From: Sergio Salvatore <sergio@vimeo.com>
Date: Fri, 4 Oct 2024 16:51:54 -0400
Subject: [PATCH] Improve Go Case Conversion

- provide the ability to augment or replace the set of initialisms that
  are used
- provide an EncodeCasingFunc for the Go style
- introduce the concept of "atoms" so that Go will not attempt to break
  up names that we want to be kept together.
---
 tagformat/caseconversion/case_conversion.go   | 307 +++++++++++++-----
 .../caseconversion/case_conversion_test.go    | 112 ++++++-
 2 files changed, 319 insertions(+), 100 deletions(-)

diff --git a/tagformat/caseconversion/case_conversion.go b/tagformat/caseconversion/case_conversion.go
index 5525653..571d0d1 100644
--- a/tagformat/caseconversion/case_conversion.go
+++ b/tagformat/caseconversion/case_conversion.go
@@ -3,6 +3,7 @@ package caseconversion
 import (
 	"fmt"
 	"go/token"
+	"sort"
 	"strings"
 	"unicode"
 	"unicode/utf8"
@@ -26,6 +27,224 @@ type EncodeCasingFunc func(DecodedIdentifier) string
 // EncodeLowerCamelCase, "testString").
 type DecodedIdentifier []string
 
+// GoCaseConverter is a case converter that can decode and encode Go-style
+// identifiers.
+type GoCaseConverter struct {
+	initialisms  []string // sorted by SetInitialisms; binary-searched when decoding and encoding
+	atoms        []string // names never split into words; sorted by SetAtoms
+	loweredAtoms []string // loweredAtoms[i] is strings.ToLower(atoms[i])
+}
+
+// NewGoCaseConverter creates a new GoCaseConverter seeded with the package's
+// commonInitialisms and no atoms.  See the `AddInitialism`, `SetInitialisms`,
+// and `SetAtoms` methods for how to customize encoding and decoding behavior.
+func NewGoCaseConverter() *GoCaseConverter {
+	g := &GoCaseConverter{}
+	g.SetInitialisms(commonInitialisms)
+	return g
+}
+
+var defaultGoCaseConverter = NewGoCaseConverter()
+
+// SetInitialisms replaces the set of initialisms used by the GoCaseConverter
+// with a sorted copy of the argument; the caller's slice is left untouched.
+// An initialism less than two characters long will cause a panic.
+func (g *GoCaseConverter) SetInitialisms(initialisms []string) {
+	g.initialisms = append([]string(nil), initialisms...)
+	sort.Strings(g.initialisms)
+	for _, initialism := range g.initialisms {
+		if len(initialism) < 2 {
+			panic(fmt.Sprintf("initialisms must be at least two characters long; %q is not valid", initialism))
+		}
+	}
+}
+
+// AddInitialism adds the passed initialisms to the set of initialisms.
+// Attempting to add an initialism less than two characters long will cause a
+// panic.
+func (g *GoCaseConverter) AddInitialism(initialism ...string) {
+	g.SetInitialisms(append(g.initialisms, initialism...))
+}
+
+// SetAtoms replaces the set of atoms used by the GoCaseConverter with a sorted
+// copy of the argument.  Atoms will specifically not be split at word boundaries
+// and should be provided in the exported-name format as in "ABTest".  An atom
+// less than two characters long panics and leaves the converter unchanged.
+func (g *GoCaseConverter) SetAtoms(atoms []string) {
+	sorted := append([]string(nil), atoms...)
+	sort.Strings(sorted)
+	lowered := make([]string, len(sorted))
+	for i, atom := range sorted {
+		if len(atom) < 2 {
+			panic(fmt.Sprintf("atoms must be at least two characters long; %q is not valid", atom))
+		}
+		lowered[i] = strings.ToLower(atom)
+	}
+	g.atoms, g.loweredAtoms = sorted, lowered
+}
+
+// Decode implements DecodeCasingFunc for Go-style identifiers, splitting on '_',
+// initialisms, and atoms.  It errors when s fails token.IsIdentifier.
+func (g *GoCaseConverter) Decode(s string) (DecodedIdentifier, error) {
+	if !token.IsIdentifier(s) {
+		return nil, fmt.Errorf("only characters of the Letter category or '_' can appear in strings")
+	}
+	return g.decodeGoCamelCase(s, func(r rune) bool {
+		return r == '_'
+	})
+}
+
+// DecodeGoTags decodes CamelCase, snake_case, and kebab-case strings with fully
+// capitalized acronyms into lower-cased words; s is not validated as an identifier.
+func (g *GoCaseConverter) DecodeGoTags(s string) (DecodedIdentifier, error) {
+	return g.decodeGoCamelCase(s, func(r rune) bool {
+		return r == '_' || r == '-'
+	})
+}
+
+// decodeGoCamelCase splits s into lower-cased words at initialism edges, at
+// runes accepted by isWordBoundary (which are dropped), and around atoms, which
+// are kept whole.  All offsets are byte offsets into s; the error is always nil.
+func (g *GoCaseConverter) decodeGoCamelCase(s string, isWordBoundary func(rune) bool) (DecodedIdentifier, error) {
+	words := []string{}
+
+	buf := strings.Builder{}
+
+	for i, size := 0, 0; i < len(s); i += size {
+		var char rune
+		char, size = utf8.DecodeRuneInString(s[i:])
+		if buf.Len() > 0 && (firstCharOfInitialism(s, i) || firstCharAfterInitialism(s, i) || isWordBoundary(char)) {
+			// We think we're at a word boundary, but we need to check if this is a prefix for an atom.
+			// We're looking for the longest matching atom, so this requires some iteration.
+			offset := sort.SearchStrings(g.atoms, buf.String())
+			bestMatch := -1
+			bestMatchLenDiff := 0
+			for ; offset < len(g.atoms) && strings.HasPrefix(g.atoms[offset], buf.String()); offset++ {
+				str := buf.String()
+				candidate := g.atoms[offset]
+
+				lenDiff := len(candidate) - len(str)
+				if lenDiff > 0 && i+lenDiff <= len(s) {
+					// enough input remains: pull off the bytes the atom still needs
+					str += s[i : i+lenDiff]
+				}
+
+				if strings.EqualFold(candidate, str) {
+					// we found an atom that matches exactly, so we should hold
+					// on to that before we look for something potentially
+					// better
+					bestMatch = offset
+					bestMatchLenDiff = lenDiff
+				}
+			}
+
+			if bestMatch >= 0 {
+				// we found a match with an atom, so step over its remaining bytes
+				words = append(words, g.atoms[bestMatch])
+				buf.Reset()
+				size = bestMatchLenDiff
+				continue
+			}
+
+			words = append(words, buf.String())
+			buf.Reset()
+
+			if isWordBoundary(char) {
+				// if we're on a word boundary, just advance past it
+				continue
+			}
+		}
+		buf.WriteRune(char)
+	}
+
+	if buf.Len() > 0 {
+		// write whatever is left over in the buffer
+		words = append(words, buf.String())
+	}
+
+	lowerCased := make([]string, 0, len(words))
+
+	// see if any of the initialisms are actually a combination of two ("JSONAPI" or something...)
+	for _, word := range words {
+		if strings.ToUpper(word) != word {
+			// it's not an initialism because it's not all uppercase
+			lowerCased = append(lowerCased, strings.ToLower(word))
+			continue
+		}
+
+		offset := sort.SearchStrings(g.initialisms, word)
+		// offset is the position where we would insert this new word, so we
+		// should check the word before it to see if it's a prefix (or possibly
+		// an exact match)
+		for offset > 0 && strings.HasPrefix(word, g.initialisms[offset-1]) {
+			lowerCased = append(lowerCased, strings.ToLower(word[:len(g.initialisms[offset-1])]))
+			word = word[len(g.initialisms[offset-1]):]
+			offset = sort.SearchStrings(g.initialisms, word)
+		}
+
+		// if there's anything left to the word, add it
+		if len(word) > 0 {
+			lowerCased = append(lowerCased, strings.ToLower(word))
+		}
+	}
+	return lowerCased, nil
+}
+
+// Encode implements a EncodeCasingFunc for Go-style identifiers and returns an
+// exported-style name (with an initial uppercase character).
+func (g *GoCaseConverter) Encode(words DecodedIdentifier) string {
+	b := strings.Builder{}
+	b.Grow(aggregateStringLen(words))
+	for _, w := range words {
+		maybeInitialism := strings.ToUpper(w)
+		initialismOffset := sort.SearchStrings(g.initialisms, maybeInitialism)
+		// loweredAtoms mirrors atoms' order, so it may be unsorted: scan linearly.
+		atomOffset := 0
+		for atomOffset < len(g.loweredAtoms) && g.loweredAtoms[atomOffset] != w {
+			atomOffset++
+		}
+		// atoms win over initialisms; anything else is treated as a normal name.
+		if atomOffset < len(g.loweredAtoms) {
+			b.WriteString(g.atoms[atomOffset])
+		} else if initialismOffset < len(g.initialisms) && g.initialisms[initialismOffset] == maybeInitialism {
+			b.WriteString(maybeInitialism)
+		} else {
+			b.WriteString(cases.Title(language.English, cases.NoLower).String(w))
+		}
+	}
+	return b.String()
+}
+
+// EncodeUnexported is like Encode, but returns an unexported name (with an
+// initial lowercase character) still adhering to the rules of initialisms and
+// atoms.
+func (g *GoCaseConverter) EncodeUnexported(words DecodedIdentifier) string {
+	b := strings.Builder{}
+	b.Grow(aggregateStringLen(words))
+	for i, w := range words {
+		maybeInitialism := strings.ToUpper(w)
+		initialismOffset := sort.SearchStrings(g.initialisms, maybeInitialism)
+		// loweredAtoms mirrors atoms' order, so it may be unsorted: scan linearly.
+		atomOffset := 0
+		for atomOffset < len(g.loweredAtoms) && g.loweredAtoms[atomOffset] != w {
+			atomOffset++
+		}
+		// the first word stays lower-cased; later words follow the Encode rules.
+		switch {
+		case atomOffset < len(g.loweredAtoms) && i == 0:
+			b.WriteString(g.loweredAtoms[atomOffset])
+		case atomOffset < len(g.loweredAtoms):
+			b.WriteString(g.atoms[atomOffset])
+		case i == 0:
+			b.WriteString(w)
+		case initialismOffset < len(g.initialisms) && g.initialisms[initialismOffset] == maybeInitialism:
+			b.WriteString(maybeInitialism)
+		default:
+			b.WriteString(cases.Title(language.English, cases.NoLower).String(w))
+		}
+	}
+	return b.String()
+}
+
 func decodeCamelCase(typeName, s string) (DecodedIdentifier, error) {
 	// ignore the size of the rune
 	r, _ := utf8.DecodeRuneInString(s)
@@ -98,61 +317,6 @@ func firstCharAfterInitialism(s string, i int) bool {
 	return i+rl1+rl2 < len(s) && unicode.IsUpper(r1) && unicode.IsLower(r2)
 }
 
-// lastCharOfInitialismAtEOS, as used in DecodeGoCamelCase, attempts to
-// detect when the indexed rune is the last character of an initialism at the
-// end of a string (e.g., jsonAP*I*).
-func lastCharOfInitialismAtEOS(s string, i int) bool {
-	s1 := s[i:]
-	r, rl := utf8.DecodeRuneInString(s1)
-
-	return i+rl == len(s) && unicode.IsUpper(r)
-}
-
-// decodeGoCamelCase splits up a string in a slice of lower cased sub-string by
-// splitting after fully capitalized acronyms and after the characters that
-// signal word boundaries as specified in the passed isWordBoundary function
-func decodeGoCamelCase(s string, isWordBoundary func(rune) bool) (DecodedIdentifier, error) {
-	words := []string{}
-	lastBoundary := 0
-	for i, char := range s {
-		if firstCharOfInitialism(s, i) || firstCharAfterInitialism(s, i) || isWordBoundary(char) {
-			if lastBoundary < i {
-				word := s[lastBoundary:i]
-				if word == strings.ToUpper(word) {
-					words = append(words, extractInitialisms(word)...)
-				} else {
-					words = append(words, strings.ToLower(word))
-				}
-			}
-			switch {
-			case isWordBoundary(char):
-				lastBoundary = i + 1
-			default:
-				lastBoundary = i
-			}
-
-		} else if lastCharOfInitialismAtEOS(s, i) {
-			if lastBoundary < i {
-				word := s[lastBoundary:]
-				if word == strings.ToUpper(word) {
-					words = append(words, extractInitialisms(word)...)
-					return words, nil
-				}
-			}
-			lastBoundary = i
-		}
-	}
-
-	if last := strings.ToLower(s[lastBoundary:]); len(last) > 0 {
-		words = append(words, strings.ToLower(s[lastBoundary:]))
-	}
-
-	return words, nil
-}
-
-// TODO: Add EncodeGoCamelCase function and set as default name encoder in
-// FlattenMangler
-
 // DecodeGoCamelCase decodes UpperCamelCase and lowerCamelCase strings with
 // fully capitalized acronyms (e.g., "jsonAPIDocs") into a slice of lower-cased
 // sub-strings.
@@ -160,47 +324,18 @@ func DecodeGoCamelCase(s string) (DecodedIdentifier, error) {
 	if !token.IsIdentifier(s) {
 		return nil, fmt.Errorf("only characters of the Letter category or '_' can appear in strings")
 	}
-	return decodeGoCamelCase(s, func(r rune) bool {
-		return r == '_'
-	})
+	return defaultGoCaseConverter.Decode(s)
 }
 
 // DecodeGoTags decodes CamelCase, snake_case, and kebab-case strings with fully
 // capitalized acronyms into a slice of lower cased strings.
 func DecodeGoTags(s string) (DecodedIdentifier, error) {
-	return decodeGoCamelCase(s, func(r rune) bool {
-		return r == '_' || r == '-'
-	})
+	return defaultGoCaseConverter.DecodeGoTags(s)
 }
 
 // List from https://github.com/golang/lint/blob/master/lint.go
 var commonInitialisms = []string{"ACL", "API", "ASCII", "CPU", "CSS", "DNS", "EOF", "GUID", "HTML", "HTTP", "HTTPS", "ID", "IP", "JSON", "LHS", "QPS", "RAM", "RHS", "RPC", "SLA", "SMTP", "SQL", "SSH", "TCP", "TLS", "TTL", "UDP", "UI", "UID", "UUID", "URI", "URL", "UTF8", "VM", "XML", "XMPP", "XSRF", "XSS"}
 
-// Given an entirely uppercase string, extract any initialisms sequentially from the start of the string and return them with the remainder of the string
-func extractInitialisms(s string) []string {
-	words := []string{}
-
-	for {
-		initialismFound := false
-		for _, initialism := range commonInitialisms {
-			if len(s) >= len(initialism) && initialism == s[:len(initialism)] {
-				initialismFound = true
-				words = append(words, strings.ToLower(initialism))
-				s = s[len(initialism):]
-			}
-		}
-		if !initialismFound {
-			break
-		}
-	}
-
-	if len(s) > 0 {
-		words = append(words, strings.ToLower(s))
-	}
-
-	return words
-}
-
 func decodeLowerCaseWithSplitChar(splitChar rune, typeName, s string) (DecodedIdentifier, error) {
 	// ignore the size of the rune
 	r, _ := utf8.DecodeRuneInString(s)
diff --git a/tagformat/caseconversion/case_conversion_test.go b/tagformat/caseconversion/case_conversion_test.go
index 5677e2f..00ba90d 100644
--- a/tagformat/caseconversion/case_conversion_test.go
+++ b/tagformat/caseconversion/case_conversion_test.go
@@ -120,21 +120,105 @@ func TestEncode(t *testing.T) {
 	}
 }
 
-var initialismCases = []struct {
-	original string
-	returned []string
-}{
-	{"JSONAPI", []string{"json", "api"}},
-	{"JSONAPIA", []string{"json", "api", "a"}},
-	{"XMLJSONAPI", []string{"xml", "json", "api"}},
-}
+func TestGoCaseConverter(t *testing.T) {
+	g := NewGoCaseConverter()
+	g.SetAtoms([]string{"RaNsoMNoTe", "ZZTop", "ABTests", "ABTest"})
+
+	for testName, tbl := range map[string]struct {
+		original   string
+		exported   string
+		unexported string
+		expected   []string
+	}{
+		"TwoInitialisms": {
+			original:   "JSONAPI",
+			unexported: "jsonAPI",
+			expected:   []string{"json", "api"},
+		},
+		"SingleInitialism": {
+			original:   "API",
+			unexported: "api",
+			expected:   []string{"api"},
+		},
+		"NotKnownInitialism": {
+			original:   "XSLTransformation",
+			exported:   "XslTransformation",
+			unexported: "xslTransformation",
+			expected:   []string{"xsl", "transformation"},
+		},
+		"TwoInitialismsWithSuffix": {
+			original:   "JSONAPIA",
+			unexported: "jsonAPIA",
+			expected:   []string{"json", "api", "a"},
+		},
+		"ThreeInitialisms": {
+			original:   "XMLJSONAPI",
+			unexported: "xmlJSONAPI",
+			expected:   []string{"xml", "json", "api"},
+		},
+		"TestLongConcatted": {
+			original:   "TestJSONAPI",
+			unexported: "testJSONAPI",
+			expected:   []string{"test", "json", "api"},
+		},
+		"TestLongConcattedWithSuffix": {
+			original:   "TestJSONAPIAddress",
+			unexported: "testJSONAPIAddress",
+			expected:   []string{"test", "json", "api", "address"},
+		},
+		"TestAtomAlone": {
+			original:   "ABTest",
+			unexported: "abtest",
+			expected:   []string{"abtest"},
+		},
+		"TestAtomLongerString": {
+			original:   "ABTestsGroup",
+			unexported: "abtestsGroup",
+			expected:   []string{"abtests", "group"},
+		},
+		"TestAtomWithInitialismSuffix": {
+			original:   "ABTestID",
+			unexported: "abtestID",
+			expected:   []string{"abtest", "id"},
+		},
+		"TestAtomWithPrefix": {
+			original:   "TheRaNsoMNoTe",
+			unexported: "theRaNsoMNoTe",
+			expected:   []string{"the", "ransomnote"},
+		},
+		"TwoAtoms": {
+			original:   "ABTestZZTop",
+			unexported: "abtestZZTop",
+			expected:   []string{"abtest", "zztop"},
+		},
+	} {
+		t.Run(testName, func(t *testing.T) {
+			words, err := g.Decode(tbl.original)
+			require.NoError(t, err)
+			assert.Equal(t, DecodedIdentifier(tbl.expected), words)
 
-func TestExtractInitialisms(t *testing.T) {
-	for _, initialismCase := range initialismCases {
-		t.Run(initialismCase.original, func(t *testing.T) {
-			is := extractInitialisms(initialismCase.original)
-			ought := initialismCase.returned
-			assert.Equal(t, ought, is)
+			if tbl.exported == "" {
+				tbl.exported = tbl.original
+			}
+			encoded := g.Encode(DecodedIdentifier(tbl.expected))
+			assert.Equal(t, tbl.exported, encoded)
+
+			encodedUnexported := g.EncodeUnexported(DecodedIdentifier(tbl.expected))
+			assert.Equal(t, tbl.unexported, encodedUnexported)
 		})
 	}
+
+	t.Run("TestInitialismPanic", func(t *testing.T) {
+		assert.Panics(t, func() {
+			g := NewGoCaseConverter()
+			g.SetInitialisms([]string{"A"})
+		})
+	})
+
+	t.Run("TestAtomPanic", func(t *testing.T) {
+		assert.Panics(t, func() {
+			g := NewGoCaseConverter()
+			g.SetAtoms([]string{"A"})
+		})
+	})
 }