Added new functions to deal with UTF8

In preparation of new release, I'm adding the new UTF8 handling functions that hopefully would make localization better.
mm2 · Nov 3, 2023 · 5262c5d · 5262c5d
1 parent 282c96e
commit 5262c5d
Show file tree

Hide file tree

Showing 7 changed files with 283 additions and 21 deletions.
diff --git a/include/lcms2.h b/include/lcms2.h
@@ -1334,8 +1334,11 @@ CMSAPI cmsBool           CMSEXPORT cmsSliceSpaceFloat(cmsUInt32Number nInputs, c
 
 typedef struct _cms_MLU_struct cmsMLU;
 
-#define  cmsNoLanguage "\0\0"
-#define  cmsNoCountry  "\0\0"
+#define  cmsNoLanguage    "\0\0"
+#define  cmsNoCountry     "\0\0"
+
+// Special language/country to retrieve unicode field for description in V2 profiles. Use with care.
+#define  cmsV2Unicode     "\xff\xff"
 
 CMSAPI cmsMLU*           CMSEXPORT cmsMLUalloc(cmsContext ContextID, cmsUInt32Number nItems);
 CMSAPI void              CMSEXPORT cmsMLUfree(cmsMLU* mlu);
@@ -1347,6 +1350,9 @@ CMSAPI cmsBool           CMSEXPORT cmsMLUsetASCII(cmsMLU* mlu,
 CMSAPI cmsBool           CMSEXPORT cmsMLUsetWide(cmsMLU* mlu,
                                                   const char LanguageCode[3], const char CountryCode[3],
                                                   const wchar_t* WideString);
+CMSAPI cmsBool           CMSEXPORT cmsMLUsetUTF8(cmsMLU* mlu,
+                                                  const char LanguageCode[3], const char CountryCode[3],
+                                                  const char* UTF8String);
 
 CMSAPI cmsUInt32Number   CMSEXPORT cmsMLUgetASCII(const cmsMLU* mlu,
                                                   const char LanguageCode[3], const char CountryCode[3],
@@ -1355,6 +1361,10 @@ CMSAPI cmsUInt32Number   CMSEXPORT cmsMLUgetASCII(const cmsMLU* mlu,
 CMSAPI cmsUInt32Number   CMSEXPORT cmsMLUgetWide(const cmsMLU* mlu,
                                                  const char LanguageCode[3], const char CountryCode[3],
                                                  wchar_t* Buffer, cmsUInt32Number BufferSize);
+CMSAPI cmsUInt32Number   CMSEXPORT cmsMLUgetUTF8(const cmsMLU* mlu,
+                                                 const char LanguageCode[3], const char CountryCode[3],
+                                                 char* Buffer, cmsUInt32Number BufferSize);
+
 
 CMSAPI cmsBool           CMSEXPORT cmsMLUgetTranslation(const cmsMLU* mlu,
                                                          const char LanguageCode[3], const char CountryCode[3],
@@ -1579,6 +1589,10 @@ CMSAPI cmsUInt32Number   CMSEXPORT cmsGetProfileInfoASCII(cmsHPROFILE hProfile,
                                                             const char LanguageCode[3], const char CountryCode[3],
                                                             char* Buffer, cmsUInt32Number BufferSize);
 
+CMSAPI cmsUInt32Number  CMSEXPORT cmsGetProfileInfoUTF8(cmsHPROFILE hProfile, cmsInfoType Info,
+                                                            const char LanguageCode[3], const char CountryCode[3],
+                                                            char* Buffer, cmsUInt32Number BufferSize);
+
 // IO handlers ----------------------------------------------------------------------------------------------------------
 
 typedef struct _cms_io_handler cmsIOHANDLER;

diff --git a/src/cmsio1.c b/src/cmsio1.c
@@ -1027,3 +1027,13 @@ cmsUInt32Number  CMSEXPORT cmsGetProfileInfoASCII(cmsHPROFILE hProfile, cmsInfoT
 
     return cmsMLUgetASCII(mlu, LanguageCode, CountryCode, Buffer, BufferSize);
 }
+
+cmsUInt32Number  CMSEXPORT cmsGetProfileInfoUTF8(cmsHPROFILE hProfile, cmsInfoType Info,
+                                                          const char LanguageCode[3], const char CountryCode[3],
+                                                          char* Buffer, cmsUInt32Number BufferSize)
+{
+    const cmsMLU* mlu = GetInfo(hProfile, Info);
+    if (mlu == NULL) return 0;
+
+    return cmsMLUgetUTF8(mlu, LanguageCode, CountryCode, Buffer, BufferSize);
+}
diff --git a/src/cmsnamed.c b/src/cmsnamed.c
@@ -200,40 +200,202 @@ void strFrom16(char str[3], cmsUInt16Number n)
     str[0] = (char)(n >> 8);
     str[1] = (char)n;
     str[2] = (char)0;
+}
+
+
+// Convert from UTF8 to wchar, returns len.
+static
+cmsUInt32Number decodeUTF8(wchar_t* out, const char* in)
+{    
+    cmsUInt32Number codepoint = 0;    
+    cmsUInt32Number size = 0;
+
+    while (*in)
+    {
+        cmsUInt8Number ch = (cmsUInt8Number) *in;
+
+        if (ch <= 0x7f)
+        {
+            codepoint = ch;
+        }
+        else if (ch <= 0xbf)
+        {
+            codepoint = (codepoint << 6) | (ch & 0x3f);
+        }
+        else if (ch <= 0xdf)
+        {
+            codepoint = ch & 0x1f;
+        }
+        else if (ch <= 0xef)
+        {
+            codepoint = ch & 0x0f;
+        }
+        else
+        {
+            codepoint = ch & 0x07;
+        }
+
+        in++; 
+
+        if (((*in & 0xc0) != 0x80) && (codepoint <= 0x10ffff))
+        {
+            if (sizeof(wchar_t) > 2)
+            {
+                if (out) *out++ = (wchar_t) codepoint;
+                size++;
+            }
+            else
+                if (codepoint > 0xffff)
+                {
+                    if (out)
+                    {
+                        *out++ = (wchar_t)(0xd800 + (codepoint >> 10));
+                        *out++ = (wchar_t)(0xdc00 + (codepoint & 0x03ff));
+                        size += 2;
+                    }
+                }
+                else
+                    if (codepoint < 0xd800 || codepoint >= 0xe000)
+                    {
+                        if (out) *out++ = (wchar_t) codepoint;
+                        size++;
+                    }
+        }
+    }   
+
+    return size;
+}
+
+// Convert from wchar_t to UTF8 
+static 
+cmsUInt32Number encodeUTF8(char* out, const wchar_t* in, cmsUInt32Number max_wchars, cmsUInt32Number max_chars)
+{       
+    cmsUInt32Number codepoint = 0;
+    cmsUInt32Number size = 0;
+    cmsUInt32Number len_w = 0;
+
+    while (*in && len_w < max_wchars)
+    {
+        if (*in >= 0xd800 && *in <= 0xdbff)
+            codepoint = ((*in - 0xd800) << 10) + 0x10000;
+        else
+        {
+            if (*in >= 0xdc00 && *in <= 0xdfff)
+                codepoint |= *in - 0xdc00;
+            else
+                codepoint = *in;
+
+            if (codepoint <= 0x7f)
+            {
+                if (out && (size + 1 < max_chars)) *out++ = (char)codepoint;
+                size++;
+            }
+
+            else if (codepoint <= 0x7ff)
+            {
+                if (out && (max_chars > 0) && (size + 2 < max_chars))
+                {
+                    *out++ = (char)(cmsUInt32Number)(0xc0 | ((codepoint >> 6) & 0x1f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | (codepoint & 0x3f));                    
+                }
+                size += 2;
+            }
+            else if (codepoint <= 0xffff)
+            {
+                if (out && (max_chars > 0) && (size + 3 < max_chars))
+                {
+                    *out++ = (char)(cmsUInt32Number)(0xe0 | ((codepoint >> 12) & 0x0f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | ((codepoint >> 6) & 0x3f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | (codepoint & 0x3f));
+                }
+                size += 3;
+            }
+            else
+            {
+                if (out && (max_chars > 0) && (size + 4 < max_chars))
+                {
+                    *out++ = (char)(cmsUInt32Number)(0xf0 | ((codepoint >> 18) & 0x07));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | ((codepoint >> 12) & 0x3f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | ((codepoint >> 6) & 0x3f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | (codepoint & 0x3f));
+                }
+                size += 4;
+            }
 
+            codepoint = 0;
+        }
+
+        in++; len_w++;         
+    }
+
+    return size;
 }
 
 // Add an ASCII entry. Do not add any \0 termination (ICC1v43_2010-12.pdf page 61)
 // In the case the user explicitly sets an empty string, we force a \0
 cmsBool CMSEXPORT cmsMLUsetASCII(cmsMLU* mlu, const char LanguageCode[3], const char CountryCode[3], const char* ASCIIString)
 {
-    cmsUInt32Number i, len = (cmsUInt32Number) strlen(ASCIIString);
+    cmsUInt32Number i, len = (cmsUInt32Number)strlen(ASCIIString);
     wchar_t* WStr;
     cmsBool  rc;
-    cmsUInt16Number Lang  = strTo16(LanguageCode);
+    cmsUInt16Number Lang = strTo16(LanguageCode);
     cmsUInt16Number Cntry = strTo16(CountryCode);
 
     if (mlu == NULL) return FALSE;
 
     // len == 0 would prevent operation, so we set a empty string pointing to zero
     if (len == 0)
     {
-        len = 1;
+        wchar_t empty = 0;
+        return AddMLUBlock(mlu, sizeof(wchar_t), &empty, Lang, Cntry);
     }
 
-    WStr = (wchar_t*) _cmsCalloc(mlu ->ContextID, len,  sizeof(wchar_t));
+    WStr = (wchar_t*)_cmsCalloc(mlu->ContextID, len, sizeof(wchar_t));
     if (WStr == NULL) return FALSE;
 
-    for (i=0; i < len; i++)
-        WStr[i] = (wchar_t) ASCIIString[i];
+    for (i = 0; i < len; i++)
+        WStr[i] = (wchar_t)ASCIIString[i];
 
-    rc = AddMLUBlock(mlu, len  * sizeof(wchar_t), WStr, Lang, Cntry);
+    rc = AddMLUBlock(mlu, len * sizeof(wchar_t), WStr, Lang, Cntry);
 
-    _cmsFree(mlu ->ContextID, WStr);
+    _cmsFree(mlu->ContextID, WStr);
     return rc;
 
 }
 
+// Add an UTF8 entry. Do not add any \0 termination (ICC1v43_2010-12.pdf page 61)
+// In the case the user explicitly sets an empty string, we force a \0
+cmsBool CMSEXPORT cmsMLUsetUTF8(cmsMLU* mlu, const char LanguageCode[3], const char CountryCode[3], const char* UTF8String)
+{
+    cmsUInt32Number UTF8len;
+    wchar_t* WStr;
+    cmsBool  rc;
+    cmsUInt16Number Lang  = strTo16(LanguageCode);
+    cmsUInt16Number Cntry = strTo16(CountryCode);
+
+    if (mlu == NULL) return FALSE;
+
+    if (*UTF8String == '\0')
+    {
+        wchar_t empty = 0;
+        return AddMLUBlock(mlu, sizeof(wchar_t), &empty, Lang, Cntry);
+    }
+
+    // Len excluding terminator 0
+    UTF8len = decodeUTF8(NULL, UTF8String);
+
+    // Get space for dest
+    WStr = (wchar_t*) _cmsCalloc(mlu ->ContextID, UTF8len,  sizeof(wchar_t));
+    if (WStr == NULL) return FALSE;
+
+    decodeUTF8(WStr, UTF8String);
+
+    rc = AddMLUBlock(mlu, UTF8len  * sizeof(wchar_t), WStr, Lang, Cntry);
+
+    _cmsFree(mlu ->ContextID, WStr);
+    return rc;
+}
+
 // We don't need any wcs support library
 static
 cmsUInt32Number mywcslen(const wchar_t *s)
@@ -372,7 +534,7 @@ const wchar_t* _cmsMLUgetWide(const cmsMLU* mlu,
 
     if (v->StrW + v->Len > mlu->PoolSize) return NULL;
 
-    return(wchar_t*) ((cmsUInt8Number*) mlu ->MemPool + v ->StrW);
+    return (wchar_t*) ((cmsUInt8Number*) mlu ->MemPool + v ->StrW);
 }
 
 
@@ -410,17 +572,59 @@ cmsUInt32Number CMSEXPORT cmsMLUgetASCII(const cmsMLU* mlu,
     // Precess each character
     for (i=0; i < ASCIIlen; i++) {
 
-        if (Wide[i] == 0)
-            Buffer[i] = 0;
+        wchar_t wc = Wide[i];
+
+        if (wc < 0xff)
+            Buffer[i] = (char)wc;
         else
-            Buffer[i] = (char) Wide[i];
+            Buffer[i] = '?';
     }
 
     // We put a termination "\0"
     Buffer[ASCIIlen] = 0;
     return ASCIIlen + 1;
 }
 
+
+// Obtain a UTF8 representation of the wide string. Setting buffer to NULL returns the len
+cmsUInt32Number CMSEXPORT cmsMLUgetUTF8(const cmsMLU* mlu,
+                                       const char LanguageCode[3], const char CountryCode[3],
+                                       char* Buffer, cmsUInt32Number BufferSize)
+{
+    const wchar_t *Wide;
+    cmsUInt32Number  StrLen = 0;
+    cmsUInt32Number UTF8len;
+
+    cmsUInt16Number Lang  = strTo16(LanguageCode);
+    cmsUInt16Number Cntry = strTo16(CountryCode);
+
+    // Sanitize
+    if (mlu == NULL) return 0;
+
+    // Get WideChar
+    Wide = _cmsMLUgetWide(mlu, &StrLen, Lang, Cntry, NULL, NULL);
+    if (Wide == NULL) return 0;
+
+    UTF8len = encodeUTF8(NULL, Wide, StrLen / sizeof(wchar_t), BufferSize);
+
+    // Maybe we want only to know the len?
+    if (Buffer == NULL) return UTF8len + 1; // Note the zero at the end
+
+    // No buffer size means no data
+    if (BufferSize <= 0) return 0;
+
+    // Some clipping may be required
+    if (BufferSize < UTF8len + 1)
+        UTF8len = BufferSize - 1;
+
+    // Process it
+    encodeUTF8(Buffer, Wide, StrLen / sizeof(wchar_t), BufferSize);    
+
+    // We put a termination "\0"
+    Buffer[UTF8len] = 0;
+    return UTF8len + 1;
+}
+
 // Obtain a wide representation of the MLU, on depending on current locale settings
 cmsUInt32Number CMSEXPORT cmsMLUgetWide(const cmsMLU* mlu,
                                       const char LanguageCode[3], const char CountryCode[3],
@@ -441,7 +645,7 @@ cmsUInt32Number CMSEXPORT cmsMLUgetWide(const cmsMLU* mlu,
     // Maybe we want only to know the len?
     if (Buffer == NULL) return StrLen + sizeof(wchar_t);
 
-  // No buffer size means no data
+    // No buffer size means no data
     if (BufferSize <= 0) return 0;
 
     // Some clipping may be required

diff --git a/src/cmstypes.c b/src/cmstypes.c
@@ -925,6 +925,7 @@ static
 void *Type_Text_Description_Read(struct _cms_typehandler_struct* self, cmsIOHANDLER* io, cmsUInt32Number* nItems, cmsUInt32Number SizeOfTag)
 {
     char* Text = NULL;
+    wchar_t* UnicodeString = NULL;
     cmsMLU* mlu = NULL;
     cmsUInt32Number  AsciiCount;
     cmsUInt32Number  i, UnicodeCode, UnicodeCount;
@@ -944,7 +945,7 @@ void *Type_Text_Description_Read(struct _cms_typehandler_struct* self, cmsIOHAND
     if (SizeOfTag < AsciiCount) return NULL;
 
     // All seems Ok, allocate the container
-    mlu = cmsMLUalloc(self ->ContextID, 1);
+    mlu = cmsMLUalloc(self ->ContextID, 2);
     if (mlu == NULL) return NULL;
 
     // As many memory as size of tag
@@ -971,13 +972,17 @@ void *Type_Text_Description_Read(struct _cms_typehandler_struct* self, cmsIOHAND
 
     if (SizeOfTag < UnicodeCount*sizeof(cmsUInt16Number)) goto Done;
 
-    for (i=0; i < UnicodeCount; i++) {
-        if (!io ->Read(io, &Dummy, sizeof(cmsUInt16Number), 1)) goto Done;
-    }
+    UnicodeString = (wchar_t*)_cmsMalloc(self->ContextID, UnicodeCount * sizeof(wchar_t));
+    if (UnicodeString == NULL) goto Done;
+
+    if (!_cmsReadWCharArray(io, UnicodeCount, UnicodeString)) goto Done;
+    if (!cmsMLUsetWide(mlu, cmsV2Unicode, cmsV2Unicode, UnicodeString)) goto Done;
+    _cmsFree(self->ContextID, (void*)UnicodeString);
+
     SizeOfTag -= UnicodeCount*sizeof(cmsUInt16Number);
 
     // Skip ScriptCode code if present. Some buggy profiles does have less
-    // data that stricttly required. We need to skip it as this type may come
+    // data that strictly required. We need to skip it as this type may come
     // embedded in other types.
 
     if (SizeOfTag >= sizeof(cmsUInt16Number) + sizeof(cmsUInt8Number) + 67) {
@@ -1049,7 +1054,7 @@ cmsBool  Type_Text_Description_Write(struct _cms_typehandler_struct* self, cmsIO
 
         // Get both representations.
         cmsMLUgetASCII(mlu, cmsNoLanguage, cmsNoCountry,  Text, len * sizeof(char));
-        cmsMLUgetWide(mlu,  cmsNoLanguage, cmsNoCountry,  Wide, len * sizeof(wchar_t));
+        cmsMLUgetWide(mlu,  cmsV2Unicode,  cmsV2Unicode,  Wide, len * sizeof(wchar_t));
     }
 
     // Tell the real text len including the null terminator and padding