From 5262c5df0adab6ef1eb3f54675fc4dbd2b0883c5 Mon Sep 17 00:00:00 2001
From: Marti Maria <marti.maria@littlecms.com>
Date: Fri, 3 Nov 2023 12:03:36 +0100
Subject: [PATCH] Added new functions to deal with UTF8

In preparation of new release, I'm adding the new UTF8 handling functions that hopefully would make  localization better.
---
 include/lcms2.h      |  18 +++-
 src/cmsio1.c         |  10 ++
 src/cmsnamed.c       | 230 ++++++++++++++++++++++++++++++++++++++++---
 src/cmstypes.c       |  17 ++--
 src/lcms2.def        |   3 +
 src/lcms2_internal.h |   1 +
 testbed/testcms2.c   |  25 +++++
 7 files changed, 283 insertions(+), 21 deletions(-)

diff --git a/include/lcms2.h b/include/lcms2.h
index 3aa9bb31..10202bae 100644
--- a/include/lcms2.h
+++ b/include/lcms2.h
@@ -1334,8 +1334,11 @@ CMSAPI cmsBool           CMSEXPORT cmsSliceSpaceFloat(cmsUInt32Number nInputs, c
 
 typedef struct _cms_MLU_struct cmsMLU;
 
-#define  cmsNoLanguage "\0\0"
-#define  cmsNoCountry  "\0\0"
+#define  cmsNoLanguage    "\0\0"
+#define  cmsNoCountry     "\0\0"
+
+// Special language/country to retrieve unicode field for description in V2 profiles. Use with care.
+#define  cmsV2Unicode     "\xff\xff"
 
 CMSAPI cmsMLU*           CMSEXPORT cmsMLUalloc(cmsContext ContextID, cmsUInt32Number nItems);
 CMSAPI void              CMSEXPORT cmsMLUfree(cmsMLU* mlu);
@@ -1347,6 +1350,9 @@ CMSAPI cmsBool           CMSEXPORT cmsMLUsetASCII(cmsMLU* mlu,
 CMSAPI cmsBool           CMSEXPORT cmsMLUsetWide(cmsMLU* mlu,
                                                   const char LanguageCode[3], const char CountryCode[3],
                                                   const wchar_t* WideString);
+CMSAPI cmsBool           CMSEXPORT cmsMLUsetUTF8(cmsMLU* mlu,
+                                                  const char LanguageCode[3], const char CountryCode[3],
+                                                  const char* UTF8String);
 
 CMSAPI cmsUInt32Number   CMSEXPORT cmsMLUgetASCII(const cmsMLU* mlu,
                                                   const char LanguageCode[3], const char CountryCode[3],
@@ -1355,6 +1361,10 @@ CMSAPI cmsUInt32Number   CMSEXPORT cmsMLUgetASCII(const cmsMLU* mlu,
 CMSAPI cmsUInt32Number   CMSEXPORT cmsMLUgetWide(const cmsMLU* mlu,
                                                  const char LanguageCode[3], const char CountryCode[3],
                                                  wchar_t* Buffer, cmsUInt32Number BufferSize);
+CMSAPI cmsUInt32Number   CMSEXPORT cmsMLUgetUTF8(const cmsMLU* mlu,
+                                                 const char LanguageCode[3], const char CountryCode[3],
+                                                 char* Buffer, cmsUInt32Number BufferSize);
+
 
 CMSAPI cmsBool           CMSEXPORT cmsMLUgetTranslation(const cmsMLU* mlu,
                                                          const char LanguageCode[3], const char CountryCode[3],
@@ -1579,6 +1589,10 @@ CMSAPI cmsUInt32Number   CMSEXPORT cmsGetProfileInfoASCII(cmsHPROFILE hProfile,
                                                             const char LanguageCode[3], const char CountryCode[3],
                                                             char* Buffer, cmsUInt32Number BufferSize);
 
+CMSAPI cmsUInt32Number  CMSEXPORT cmsGetProfileInfoUTF8(cmsHPROFILE hProfile, cmsInfoType Info,
+                                                            const char LanguageCode[3], const char CountryCode[3],
+                                                            char* Buffer, cmsUInt32Number BufferSize);
+
 // IO handlers ----------------------------------------------------------------------------------------------------------
 
 typedef struct _cms_io_handler cmsIOHANDLER;
diff --git a/src/cmsio1.c b/src/cmsio1.c
index 5b57dd2c..c75b454c 100644
--- a/src/cmsio1.c
+++ b/src/cmsio1.c
@@ -1027,3 +1027,13 @@ cmsUInt32Number  CMSEXPORT cmsGetProfileInfoASCII(cmsHPROFILE hProfile, cmsInfoT
 
     return cmsMLUgetASCII(mlu, LanguageCode, CountryCode, Buffer, BufferSize);
 }
+
+cmsUInt32Number  CMSEXPORT cmsGetProfileInfoUTF8(cmsHPROFILE hProfile, cmsInfoType Info,
+                                                          const char LanguageCode[3], const char CountryCode[3],
+                                                          char* Buffer, cmsUInt32Number BufferSize)
+{
+    const cmsMLU* mlu = GetInfo(hProfile, Info);
+    if (mlu == NULL) return 0;
+
+    return cmsMLUgetUTF8(mlu, LanguageCode, CountryCode, Buffer, BufferSize);
+}
diff --git a/src/cmsnamed.c b/src/cmsnamed.c
index ccf040ab..380cb8d2 100644
--- a/src/cmsnamed.c
+++ b/src/cmsnamed.c
@@ -200,17 +200,145 @@ void strFrom16(char str[3], cmsUInt16Number n)
     str[0] = (char)(n >> 8);
     str[1] = (char)n;
     str[2] = (char)0;
+}
+
+
+// Convert from UTF8 to wchar, returns len.
+static
+cmsUInt32Number decodeUTF8(wchar_t* out, const char* in)
+{    
+    cmsUInt32Number codepoint = 0;    
+    cmsUInt32Number size = 0;
+
+    while (*in)
+    {
+        cmsUInt8Number ch = (cmsUInt8Number) *in;
+
+        if (ch <= 0x7f)
+        {
+            codepoint = ch;
+        }
+        else if (ch <= 0xbf)
+        {
+            codepoint = (codepoint << 6) | (ch & 0x3f);
+        }
+        else if (ch <= 0xdf)
+        {
+            codepoint = ch & 0x1f;
+        }
+        else if (ch <= 0xef)
+        {
+            codepoint = ch & 0x0f;
+        }
+        else
+        {
+            codepoint = ch & 0x07;
+        }
+
+        in++; 
+
+        if (((*in & 0xc0) != 0x80) && (codepoint <= 0x10ffff))
+        {
+            if (sizeof(wchar_t) > 2)
+            {
+                if (out) *out++ = (wchar_t) codepoint;
+                size++;
+            }
+            else
+                if (codepoint > 0xffff)
+                {
+                    if (out)
+                    {
+                        *out++ = (wchar_t)(0xd800 + (codepoint >> 10));
+                        *out++ = (wchar_t)(0xdc00 + (codepoint & 0x03ff));
+                        size += 2;
+                    }
+                }
+                else
+                    if (codepoint < 0xd800 || codepoint >= 0xe000)
+                    {
+                        if (out) *out++ = (wchar_t) codepoint;
+                        size++;
+                    }
+        }
+    }   
+    
+    return size;
+}
+
+// Convert from wchar_t to UTF8 
+static 
+cmsUInt32Number encodeUTF8(char* out, const wchar_t* in, cmsUInt32Number max_wchars, cmsUInt32Number max_chars)
+{       
+    cmsUInt32Number codepoint = 0;
+    cmsUInt32Number size = 0;
+    cmsUInt32Number len_w = 0;
+        
+    while (*in && len_w < max_wchars)
+    {
+        if (*in >= 0xd800 && *in <= 0xdbff)
+            codepoint = ((*in - 0xd800) << 10) + 0x10000;
+        else
+        {
+            if (*in >= 0xdc00 && *in <= 0xdfff)
+                codepoint |= *in - 0xdc00;
+            else
+                codepoint = *in;
+
+            if (codepoint <= 0x7f)
+            {
+                if (out && (size + 1 < max_chars)) *out++ = (char)codepoint;
+                size++;
+            }
+
+            else if (codepoint <= 0x7ff)
+            {
+                if (out && (max_chars > 0) && (size + 2 < max_chars))
+                {
+                    *out++ = (char)(cmsUInt32Number)(0xc0 | ((codepoint >> 6) & 0x1f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | (codepoint & 0x3f));                    
+                }
+                size += 2;
+            }
+            else if (codepoint <= 0xffff)
+            {
+                if (out && (max_chars > 0) && (size + 3 < max_chars))
+                {
+                    *out++ = (char)(cmsUInt32Number)(0xe0 | ((codepoint >> 12) & 0x0f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | ((codepoint >> 6) & 0x3f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | (codepoint & 0x3f));
+                }
+                size += 3;
+            }
+            else
+            {
+                if (out && (max_chars > 0) && (size + 4 < max_chars))
+                {
+                    *out++ = (char)(cmsUInt32Number)(0xf0 | ((codepoint >> 18) & 0x07));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | ((codepoint >> 12) & 0x3f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | ((codepoint >> 6) & 0x3f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | (codepoint & 0x3f));
+                }
+                size += 4;
+            }
 
+            codepoint = 0;
+        }
+
+        in++; len_w++;         
+    }
+
+    return size;
 }
 
 // Add an ASCII entry. Do not add any \0 termination (ICC1v43_2010-12.pdf page 61)
 // In the case the user explicitly sets an empty string, we force a \0
 cmsBool CMSEXPORT cmsMLUsetASCII(cmsMLU* mlu, const char LanguageCode[3], const char CountryCode[3], const char* ASCIIString)
 {
-    cmsUInt32Number i, len = (cmsUInt32Number) strlen(ASCIIString);
+    cmsUInt32Number i, len = (cmsUInt32Number)strlen(ASCIIString);
     wchar_t* WStr;
     cmsBool  rc;
-    cmsUInt16Number Lang  = strTo16(LanguageCode);
+    cmsUInt16Number Lang = strTo16(LanguageCode);
     cmsUInt16Number Cntry = strTo16(CountryCode);
 
     if (mlu == NULL) return FALSE;
@@ -218,22 +346,56 @@ cmsBool CMSEXPORT cmsMLUsetASCII(cmsMLU* mlu, const char LanguageCode[3], const
     // len == 0 would prevent operation, so we set a empty string pointing to zero
     if (len == 0)
     {
-        len = 1;
+        wchar_t empty = 0;
+        return AddMLUBlock(mlu, sizeof(wchar_t), &empty, Lang, Cntry);
     }
 
-    WStr = (wchar_t*) _cmsCalloc(mlu ->ContextID, len,  sizeof(wchar_t));
+    WStr = (wchar_t*)_cmsCalloc(mlu->ContextID, len, sizeof(wchar_t));
     if (WStr == NULL) return FALSE;
 
-    for (i=0; i < len; i++)
-        WStr[i] = (wchar_t) ASCIIString[i];
+    for (i = 0; i < len; i++)
+        WStr[i] = (wchar_t)ASCIIString[i];
 
-    rc = AddMLUBlock(mlu, len  * sizeof(wchar_t), WStr, Lang, Cntry);
+    rc = AddMLUBlock(mlu, len * sizeof(wchar_t), WStr, Lang, Cntry);
 
-    _cmsFree(mlu ->ContextID, WStr);
+    _cmsFree(mlu->ContextID, WStr);
     return rc;
 
 }
 
+// Add an UTF8 entry. Do not add any \0 termination (ICC1v43_2010-12.pdf page 61)
+// In the case the user explicitly sets an empty string, we force a \0
+cmsBool CMSEXPORT cmsMLUsetUTF8(cmsMLU* mlu, const char LanguageCode[3], const char CountryCode[3], const char* UTF8String)
+{
+    cmsUInt32Number UTF8len;
+    wchar_t* WStr;
+    cmsBool  rc;
+    cmsUInt16Number Lang  = strTo16(LanguageCode);
+    cmsUInt16Number Cntry = strTo16(CountryCode);
+
+    if (mlu == NULL) return FALSE;
+
+    if (*UTF8String == '\0')
+    {
+        wchar_t empty = 0;
+        return AddMLUBlock(mlu, sizeof(wchar_t), &empty, Lang, Cntry);
+    }
+    
+    // Len excluding terminator 0
+    UTF8len = decodeUTF8(NULL, UTF8String);
+    
+    // Get space for dest
+    WStr = (wchar_t*) _cmsCalloc(mlu ->ContextID, UTF8len,  sizeof(wchar_t));
+    if (WStr == NULL) return FALSE;
+
+    decodeUTF8(WStr, UTF8String);
+    
+    rc = AddMLUBlock(mlu, UTF8len  * sizeof(wchar_t), WStr, Lang, Cntry);
+
+    _cmsFree(mlu ->ContextID, WStr);
+    return rc;
+}
+
 // We don't need any wcs support library
 static
 cmsUInt32Number mywcslen(const wchar_t *s)
@@ -372,7 +534,7 @@ const wchar_t* _cmsMLUgetWide(const cmsMLU* mlu,
 
     if (v->StrW + v->Len > mlu->PoolSize) return NULL;
 
-    return(wchar_t*) ((cmsUInt8Number*) mlu ->MemPool + v ->StrW);
+    return (wchar_t*) ((cmsUInt8Number*) mlu ->MemPool + v ->StrW);
 }
 
 
@@ -410,10 +572,12 @@ cmsUInt32Number CMSEXPORT cmsMLUgetASCII(const cmsMLU* mlu,
     // Precess each character
     for (i=0; i < ASCIIlen; i++) {
 
-        if (Wide[i] == 0)
-            Buffer[i] = 0;
+        wchar_t wc = Wide[i];
+
+        if (wc < 0xff)
+            Buffer[i] = (char)wc;
         else
-            Buffer[i] = (char) Wide[i];
+            Buffer[i] = '?';
     }
 
     // We put a termination "\0"
@@ -421,6 +585,46 @@ cmsUInt32Number CMSEXPORT cmsMLUgetASCII(const cmsMLU* mlu,
     return ASCIIlen + 1;
 }
 
+
+// Obtain a UTF8 representation of the wide string. Setting buffer to NULL returns the len
+cmsUInt32Number CMSEXPORT cmsMLUgetUTF8(const cmsMLU* mlu,
+                                       const char LanguageCode[3], const char CountryCode[3],
+                                       char* Buffer, cmsUInt32Number BufferSize)
+{
+    const wchar_t *Wide;
+    cmsUInt32Number  StrLen = 0;
+    cmsUInt32Number UTF8len;
+
+    cmsUInt16Number Lang  = strTo16(LanguageCode);
+    cmsUInt16Number Cntry = strTo16(CountryCode);
+
+    // Sanitize
+    if (mlu == NULL) return 0;
+
+    // Get WideChar
+    Wide = _cmsMLUgetWide(mlu, &StrLen, Lang, Cntry, NULL, NULL);
+    if (Wide == NULL) return 0;
+
+    UTF8len = encodeUTF8(NULL, Wide, StrLen / sizeof(wchar_t), BufferSize);
+       
+    // Maybe we want only to know the len?
+    if (Buffer == NULL) return UTF8len + 1; // Note the zero at the end
+
+    // No buffer size means no data
+    if (BufferSize <= 0) return 0;
+
+    // Some clipping may be required
+    if (BufferSize < UTF8len + 1)
+        UTF8len = BufferSize - 1;
+
+    // Process it
+    encodeUTF8(Buffer, Wide, StrLen / sizeof(wchar_t), BufferSize);    
+
+    // We put a termination "\0"
+    Buffer[UTF8len] = 0;
+    return UTF8len + 1;
+}
+
 // Obtain a wide representation of the MLU, on depending on current locale settings
 cmsUInt32Number CMSEXPORT cmsMLUgetWide(const cmsMLU* mlu,
                                       const char LanguageCode[3], const char CountryCode[3],
@@ -441,7 +645,7 @@ cmsUInt32Number CMSEXPORT cmsMLUgetWide(const cmsMLU* mlu,
     // Maybe we want only to know the len?
     if (Buffer == NULL) return StrLen + sizeof(wchar_t);
 
-  // No buffer size means no data
+    // No buffer size means no data
     if (BufferSize <= 0) return 0;
 
     // Some clipping may be required
diff --git a/src/cmstypes.c b/src/cmstypes.c
index b0e8c7ec..4f2d2eec 100644
--- a/src/cmstypes.c
+++ b/src/cmstypes.c
@@ -925,6 +925,7 @@ static
 void *Type_Text_Description_Read(struct _cms_typehandler_struct* self, cmsIOHANDLER* io, cmsUInt32Number* nItems, cmsUInt32Number SizeOfTag)
 {
     char* Text = NULL;
+    wchar_t* UnicodeString = NULL;
     cmsMLU* mlu = NULL;
     cmsUInt32Number  AsciiCount;
     cmsUInt32Number  i, UnicodeCode, UnicodeCount;
@@ -944,7 +945,7 @@ void *Type_Text_Description_Read(struct _cms_typehandler_struct* self, cmsIOHAND
     if (SizeOfTag < AsciiCount) return NULL;
 
     // All seems Ok, allocate the container
-    mlu = cmsMLUalloc(self ->ContextID, 1);
+    mlu = cmsMLUalloc(self ->ContextID, 2);
     if (mlu == NULL) return NULL;
 
     // As many memory as size of tag
@@ -971,13 +972,17 @@ void *Type_Text_Description_Read(struct _cms_typehandler_struct* self, cmsIOHAND
 
     if (SizeOfTag < UnicodeCount*sizeof(cmsUInt16Number)) goto Done;
 
-    for (i=0; i < UnicodeCount; i++) {
-        if (!io ->Read(io, &Dummy, sizeof(cmsUInt16Number), 1)) goto Done;
-    }
+    UnicodeString = (wchar_t*)_cmsMalloc(self->ContextID, UnicodeCount * sizeof(wchar_t));
+    if (UnicodeString == NULL) goto Done;
+
+    if (!_cmsReadWCharArray(io, UnicodeCount, UnicodeString)) goto Done;
+    if (!cmsMLUsetWide(mlu, cmsV2Unicode, cmsV2Unicode, UnicodeString)) goto Done;
+    _cmsFree(self->ContextID, (void*)UnicodeString);
+
     SizeOfTag -= UnicodeCount*sizeof(cmsUInt16Number);
 
     // Skip ScriptCode code if present. Some buggy profiles does have less
-    // data that stricttly required. We need to skip it as this type may come
+    // data that strictly required. We need to skip it as this type may come
     // embedded in other types.
 
     if (SizeOfTag >= sizeof(cmsUInt16Number) + sizeof(cmsUInt8Number) + 67) {
@@ -1049,7 +1054,7 @@ cmsBool  Type_Text_Description_Write(struct _cms_typehandler_struct* self, cmsIO
 
         // Get both representations.
         cmsMLUgetASCII(mlu, cmsNoLanguage, cmsNoCountry,  Text, len * sizeof(char));
-        cmsMLUgetWide(mlu,  cmsNoLanguage, cmsNoCountry,  Wide, len * sizeof(wchar_t));
+        cmsMLUgetWide(mlu,  cmsV2Unicode,  cmsV2Unicode,  Wide, len * sizeof(wchar_t));
     }
 
     // Tell the real text len including the null terminator and padding
diff --git a/src/lcms2.def b/src/lcms2.def
index 5bd26783..7853eb6f 100644
--- a/src/lcms2.def
+++ b/src/lcms2.def
@@ -114,6 +114,7 @@ cmsGetPostScriptCRD                      =    cmsGetPostScriptCRD
 cmsGetPostScriptCSA                      =    cmsGetPostScriptCSA
 cmsGetProfileInfo                        =    cmsGetProfileInfo
 cmsGetProfileInfoASCII                   =    cmsGetProfileInfoASCII
+cmsGetProfileInfoUTF8                    =    cmsGetProfileInfoUTF8
 cmsGetProfileContextID                   =    cmsGetProfileContextID
 cmsGetProfileVersion                     =    cmsGetProfileVersion
 cmsGetSupportedIntents                   =    cmsGetSupportedIntents
@@ -209,8 +210,10 @@ cmsMLUfree                               =    cmsMLUfree
 cmsMLUgetASCII                           =    cmsMLUgetASCII
 cmsMLUgetTranslation                     =    cmsMLUgetTranslation
 cmsMLUgetWide                            =    cmsMLUgetWide
+cmsMLUgetUTF8						     =    cmsMLUgetUTF8
 cmsMLUsetASCII                           =    cmsMLUsetASCII
 cmsMLUsetWide                            =    cmsMLUsetWide
+cmsMLUsetUTF8                            =    cmsMLUsetUTF8
 cmsStageAllocCLut16bit                   =    cmsStageAllocCLut16bit
 cmsStageAllocCLut16bitGranular           =    cmsStageAllocCLut16bitGranular
 cmsStageAllocCLutFloat                   =    cmsStageAllocCLutFloat
diff --git a/src/lcms2_internal.h b/src/lcms2_internal.h
index 84a62fbf..8e78a71e 100644
--- a/src/lcms2_internal.h
+++ b/src/lcms2_internal.h
@@ -260,6 +260,7 @@ typedef CRITICAL_SECTION _cmsMutex;
 #ifdef _MSC_VER
 #    if (_MSC_VER >= 1800)
 #          pragma warning(disable : 26135)
+#          pragma warning(disable : 4127)
 #    endif
 #endif
 
diff --git a/testbed/testcms2.c b/testbed/testcms2.c
index f536e3ce..0f84445d 100644
--- a/testbed/testcms2.c
+++ b/testbed/testcms2.c
@@ -3676,6 +3676,30 @@ cmsInt32Number CheckMLU(void)
 }
 
 
+// Check UTF8 encoding
+static
+cmsInt32Number CheckMLU_UTF8(void)
+{
+    cmsMLU* mlu;
+    char Buffer[256];
+    cmsInt32Number rc = 1;
+        
+    mlu = cmsMLUalloc(DbgThread(), 0);
+    
+    cmsMLUsetWide(mlu, "en", "US", L"\x3b2\x14b");
+
+    cmsMLUgetUTF8(mlu, "en", "US", Buffer, 256);
+    if (strcmp(Buffer, "\xce\xb2\xc5\x8b") != 0) rc = 0;
+
+    if (rc == 0)
+        Fail("Unexpected string '%s'", Buffer);
+
+    cmsMLUfree(mlu);
+    return rc;
+}
+
+
+
 // A lightweight test of named color structures.
 static
 cmsInt32Number CheckNamedColorList(void)
@@ -9597,6 +9621,7 @@ int main(int argc, char* argv[])
 
     // MLU
     Check("Multilocalized Unicode", CheckMLU);
+    Check("Multilocalized Unicode (II)", CheckMLU_UTF8);
 
     // Named color
     Check("Named color lists", CheckNamedColorList);