From 5262c5df0adab6ef1eb3f54675fc4dbd2b0883c5 Mon Sep 17 00:00:00 2001
From: Marti Maria
Date: Fri, 3 Nov 2023 12:03:36 +0100
Subject: [PATCH] Added new functions to deal with UTF8

In preparation of new release, I'm adding the new UTF8 handling functions that hopefully would make localization better.
---
 include/lcms2.h      |  18 +++-
 src/cmsio1.c         |  11 ++
 src/cmsnamed.c       | 239 ++++++++++++++++++++++++++++++++++++++++---
 src/cmstypes.c       |  24 +++--
 src/lcms2.def        |   3 +
 src/lcms2_internal.h |   1 +
 testbed/testcms2.c   |  31 ++++++
 7 files changed, 306 insertions(+), 21 deletions(-)

diff --git a/include/lcms2.h b/include/lcms2.h
index 3aa9bb31..10202bae 100644
--- a/include/lcms2.h
+++ b/include/lcms2.h
@@ -1334,8 +1334,11 @@ CMSAPI cmsBool CMSEXPORT cmsSliceSpaceFloat(cmsUInt32Number nInputs, c
 
 typedef struct _cms_MLU_struct cmsMLU;
 
-#define  cmsNoLanguage "\0\0"
-#define  cmsNoCountry  "\0\0"
+#define  cmsNoLanguage    "\0\0"
+#define  cmsNoCountry     "\0\0"
+
+// Special language/country to retrieve unicode field for description in V2 profiles. Use with care.
+#define  cmsV2Unicode     "\xff\xff"
 
 CMSAPI cmsMLU*           CMSEXPORT cmsMLUalloc(cmsContext ContextID, cmsUInt32Number nItems);
 CMSAPI void              CMSEXPORT cmsMLUfree(cmsMLU* mlu);
@@ -1347,6 +1350,9 @@ CMSAPI cmsBool CMSEXPORT cmsMLUsetASCII(cmsMLU* mlu,
 CMSAPI cmsBool           CMSEXPORT cmsMLUsetWide(cmsMLU* mlu,
                                                   const char LanguageCode[3], const char CountryCode[3],
                                                   const wchar_t* WideString);
+CMSAPI cmsBool           CMSEXPORT cmsMLUsetUTF8(cmsMLU* mlu,
+                                                  const char LanguageCode[3], const char CountryCode[3],
+                                                  const char* UTF8String);
 
 CMSAPI cmsUInt32Number   CMSEXPORT cmsMLUgetASCII(const cmsMLU* mlu,
                                                   const char LanguageCode[3], const char CountryCode[3],
@@ -1355,6 +1361,10 @@ CMSAPI cmsUInt32Number CMSEXPORT cmsMLUgetASCII(const cmsMLU* mlu,
 CMSAPI cmsUInt32Number   CMSEXPORT cmsMLUgetWide(const cmsMLU* mlu,
                                                  const char LanguageCode[3], const char CountryCode[3],
                                                  wchar_t* Buffer, cmsUInt32Number BufferSize);
+CMSAPI cmsUInt32Number   CMSEXPORT cmsMLUgetUTF8(const cmsMLU* mlu,
+                                                 const char LanguageCode[3], const char CountryCode[3],
+                                                 char* Buffer, cmsUInt32Number BufferSize);
+
 
 CMSAPI cmsBool           CMSEXPORT cmsMLUgetTranslation(const cmsMLU* mlu,
                                                          const char LanguageCode[3], const char CountryCode[3],
@@ -1579,6 +1589,10 @@ CMSAPI cmsUInt32Number CMSEXPORT cmsGetProfileInfoASCII(cmsHPROFILE hProfile,
                                                             const char LanguageCode[3], const char CountryCode[3],
                                                             char* Buffer, cmsUInt32Number BufferSize);
 
+CMSAPI cmsUInt32Number   CMSEXPORT cmsGetProfileInfoUTF8(cmsHPROFILE hProfile, cmsInfoType Info,
+                                                            const char LanguageCode[3], const char CountryCode[3],
+                                                            char* Buffer, cmsUInt32Number BufferSize);
+
 // IO handlers ----------------------------------------------------------------------------------------------------------
 
 typedef struct _cms_io_handler cmsIOHANDLER;
diff --git a/src/cmsio1.c b/src/cmsio1.c
index 5b57dd2c..c75b454c 100644
--- a/src/cmsio1.c
+++ b/src/cmsio1.c
@@ -1027,3 +1027,14 @@ cmsUInt32Number CMSEXPORT cmsGetProfileInfoASCII(cmsHPROFILE hProfile, cmsInfoT
 
     return cmsMLUgetASCII(mlu, LanguageCode, CountryCode, Buffer, BufferSize);
 }
+
+// UTF8 flavour of cmsGetProfileInfoASCII. Returns 0 when GetInfo() gives no MLU for this field.
+cmsUInt32Number  CMSEXPORT cmsGetProfileInfoUTF8(cmsHPROFILE hProfile, cmsInfoType Info,
+                                                          const char LanguageCode[3], const char CountryCode[3],
+                                                          char* Buffer, cmsUInt32Number BufferSize)
+{
+    const cmsMLU* mlu = GetInfo(hProfile, Info);
+    if (mlu == NULL) return 0;
+
+    return cmsMLUgetUTF8(mlu, LanguageCode, CountryCode, Buffer, BufferSize);
+}
diff --git a/src/cmsnamed.c b/src/cmsnamed.c
index ccf040ab..380cb8d2 100644
--- a/src/cmsnamed.c
+++ b/src/cmsnamed.c
@@ -200,17 +200,150 @@ void strFrom16(char str[3], cmsUInt16Number n)
     str[0] = (char)(n >> 8);
     str[1] = (char)n;
     str[2] = (char)0;
+}
+
+
+// Convert a zero-terminated UTF8 string to wchar_t and return the number of wchar_t produced,
+// terminator excluded. With out == NULL nothing is stored and only the length is computed,
+// so the length-only pass and the storing pass have to count exactly the same elements.
+static
+cmsUInt32Number decodeUTF8(wchar_t* out, const char* in)
+{
+    cmsUInt32Number codepoint = 0;
+    cmsUInt32Number size = 0;
+
+    while (*in)
+    {
+        cmsUInt8Number ch = (cmsUInt8Number) *in;
+
+        if (ch <= 0x7f)
+        {
+            codepoint = ch;
+        }
+        else if (ch <= 0xbf)
+        {
+            codepoint = (codepoint << 6) | (ch & 0x3f);
+        }
+        else if (ch <= 0xdf)
+        {
+            codepoint = ch & 0x1f;
+        }
+        else if (ch <= 0xef)
+        {
+            codepoint = ch & 0x0f;
+        }
+        else
+        {
+            codepoint = ch & 0x07;
+        }
+
+        in++;
+
+        if (((*in & 0xc0) != 0x80) && (codepoint <= 0x10ffff))
+        {
+            if (sizeof(wchar_t) > 2)
+            {
+                if (out) *out++ = (wchar_t) codepoint;
+                size++;
+            }
+            else
+            if (codepoint > 0xffff)
+            {
+                // 16 bits wchar_t: a surrogate pair is needed, and it counts even when not storing
+                if (out)
+                {
+                    *out++ = (wchar_t)(0xd800 + ((codepoint - 0x10000) >> 10));
+                    *out++ = (wchar_t)(0xdc00 + (codepoint & 0x03ff));
+                }
+                size += 2;
+            }
+            else
+            if (codepoint < 0xd800 || codepoint >= 0xe000)
+            {
+                if (out) *out++ = (wchar_t) codepoint;
+                size++;
+            }
+        }
+    }
+
+    return size;
+}
+
+// Convert from wchar_t to UTF8. Reads up to max_wchars elements, stopping earlier at a zero, and
+// returns the bytes the whole conversion takes, terminator excluded, no matter how many were stored.
+// Bytes are stored only while they fit in max_chars with room for a terminator; a character that does not fit is skipped.
+static
+cmsUInt32Number encodeUTF8(char* out, const wchar_t* in, cmsUInt32Number max_wchars, cmsUInt32Number max_chars)
+{
+    cmsUInt32Number codepoint = 0;
+    cmsUInt32Number size = 0;
+    cmsUInt32Number len_w = 0;
+
+    while (*in && len_w < max_wchars)
+    {
+        if (*in >= 0xd800 && *in <= 0xdbff)
+            codepoint = ((*in - 0xd800) << 10) + 0x10000;
+        else
+        {
+            if (*in >= 0xdc00 && *in <= 0xdfff)
+                codepoint |= *in - 0xdc00;
+            else
+                codepoint = *in;
+
+            if (codepoint <= 0x7f)
+            {
+                if (out && (size + 1 < max_chars)) *out++ = (char)codepoint;
+                size++;
+            }
+
+            else if (codepoint <= 0x7ff)
+            {
+                if (out && (max_chars > 0) && (size + 2 < max_chars))
+                {
+                    *out++ = (char)(cmsUInt32Number)(0xc0 | ((codepoint >> 6) & 0x1f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | (codepoint & 0x3f));
+                }
+                size += 2;
+            }
+            else if (codepoint <= 0xffff)
+            {
+                if (out && (max_chars > 0) && (size + 3 < max_chars))
+                {
+                    *out++ = (char)(cmsUInt32Number)(0xe0 | ((codepoint >> 12) & 0x0f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | ((codepoint >> 6) & 0x3f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | (codepoint & 0x3f));
+                }
+                size += 3;
+            }
+            else
+            {
+                if (out && (max_chars > 0) && (size + 4 < max_chars))
+                {
+                    *out++ = (char)(cmsUInt32Number)(0xf0 | ((codepoint >> 18) & 0x07));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | ((codepoint >> 12) & 0x3f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | ((codepoint >> 6) & 0x3f));
+                    *out++ = (char)(cmsUInt32Number)(0x80 | (codepoint & 0x3f));
+                }
+                size += 4;
+            }
+            codepoint = 0;
+        }
+
+        in++; len_w++;
+    }
+
+    return size;
 
 }
 
 // Add an ASCII entry. Do not add any \0 termination (ICC1v43_2010-12.pdf page 61)
 // In the case the user explicitly sets an empty string, we force a \0
 cmsBool CMSEXPORT cmsMLUsetASCII(cmsMLU* mlu, const char LanguageCode[3], const char CountryCode[3], const char* ASCIIString)
 {
-    cmsUInt32Number i, len = (cmsUInt32Number) strlen(ASCIIString);
+    cmsUInt32Number i, len = (cmsUInt32Number)strlen(ASCIIString);
     wchar_t* WStr;
     cmsBool  rc;
-    cmsUInt16Number Lang  = strTo16(LanguageCode);
+    cmsUInt16Number Lang = strTo16(LanguageCode);
     cmsUInt16Number Cntry = strTo16(CountryCode);
 
     if (mlu == NULL) return FALSE;
@@ -218,22 +351,56 @@ cmsBool CMSEXPORT cmsMLUsetASCII(cmsMLU* mlu, const char LanguageCode[3], const
     // len == 0 would prevent operation, so we set a empty string pointing to zero
     if (len == 0)
     {
-        len = 1;
+        wchar_t empty = 0;
+        return AddMLUBlock(mlu, sizeof(wchar_t), &empty, Lang, Cntry);
     }
 
-    WStr = (wchar_t*) _cmsCalloc(mlu ->ContextID, len,  sizeof(wchar_t));
+    WStr = (wchar_t*)_cmsCalloc(mlu->ContextID, len, sizeof(wchar_t));
     if (WStr == NULL) return FALSE;
 
-    for (i=0; i < len; i++)
-        WStr[i] = (wchar_t) ASCIIString[i];
+    for (i = 0; i < len; i++)
+        WStr[i] = (wchar_t)ASCIIString[i];
 
-    rc = AddMLUBlock(mlu, len  * sizeof(wchar_t), WStr, Lang, Cntry);
+    rc = AddMLUBlock(mlu, len * sizeof(wchar_t), WStr, Lang, Cntry);
 
-    _cmsFree(mlu ->ContextID, WStr);
+    _cmsFree(mlu->ContextID, WStr);
     return rc;
 
 }
 
+// Add an UTF8 entry. Do not add any \0 termination (ICC1v43_2010-12.pdf page 61)
+// In the case the user explicitly sets an empty string, we force a \0
+cmsBool CMSEXPORT cmsMLUsetUTF8(cmsMLU* mlu, const char LanguageCode[3], const char CountryCode[3], const char* UTF8String)
+{
+    cmsUInt32Number UTF8len;
+    wchar_t* WStr;
+    cmsBool  rc;
+    cmsUInt16Number Lang  = strTo16(LanguageCode);
+    cmsUInt16Number Cntry = strTo16(CountryCode);
+
+    if (mlu == NULL || UTF8String == NULL) return FALSE;
+
+    if (*UTF8String == '\0')
+    {
+        wchar_t empty = 0;
+        return AddMLUBlock(mlu, sizeof(wchar_t), &empty, Lang, Cntry);
+    }
+
+    // Len excluding terminator 0
+    UTF8len = decodeUTF8(NULL, UTF8String);
+
+    // Get space for dest
+    WStr = (wchar_t*) _cmsCalloc(mlu ->ContextID, UTF8len, sizeof(wchar_t));
+    if (WStr == NULL) return FALSE;
+
+    decodeUTF8(WStr, UTF8String);
+
+    rc = AddMLUBlock(mlu, UTF8len * sizeof(wchar_t), WStr, Lang, Cntry);
+
+    _cmsFree(mlu ->ContextID, WStr);
+    return rc;
+}
+
 // We don't need any wcs support library
 static
 cmsUInt32Number mywcslen(const wchar_t *s)
@@ -372,7 +539,7 @@ const wchar_t* _cmsMLUgetWide(const cmsMLU* mlu,
 
     if (v->StrW + v->Len > mlu->PoolSize) return NULL;
 
-    return(wchar_t*) ((cmsUInt8Number*) mlu ->MemPool + v ->StrW);
+    return (wchar_t*) ((cmsUInt8Number*) mlu ->MemPool + v ->StrW);
 }
 
 
@@ -410,10 +577,12 @@ cmsUInt32Number CMSEXPORT cmsMLUgetASCII(const cmsMLU* mlu,
     // Precess each character
     for (i=0; i < ASCIIlen; i++) {
 
-        if (Wide[i] == 0)
-            Buffer[i] = 0;
+        wchar_t wc = Wide[i];
+
+        if (wc < 0xff)
+            Buffer[i] = (char)wc;
         else
-            Buffer[i] = (char) Wide[i];
+            Buffer[i] = '?';
     }
 
     // We put a termination "\0"
@@ -421,6 +590,50 @@ cmsUInt32Number CMSEXPORT cmsMLUgetASCII(const cmsMLU* mlu,
     return ASCIIlen + 1;
 }
 
+
+// Obtain a UTF8 representation of the wide string. Setting buffer to NULL returns the len.
+// Returns the number of bytes needed or used, terminating zero included. The result is 0
+// when mlu is NULL, when no wide string can be retrieved or when BufferSize is 0.
+cmsUInt32Number CMSEXPORT cmsMLUgetUTF8(const cmsMLU* mlu,
+                                       const char LanguageCode[3], const char CountryCode[3],
+                                       char* Buffer, cmsUInt32Number BufferSize)
+{
+    const wchar_t *Wide;
+    cmsUInt32Number  StrLen = 0;
+    cmsUInt32Number UTF8len;
+
+    cmsUInt16Number Lang  = strTo16(LanguageCode);
+    cmsUInt16Number Cntry = strTo16(CountryCode);
+
+    // Sanitize
+    if (mlu == NULL) return 0;
+
+    // Get WideChar
+    Wide = _cmsMLUgetWide(mlu, &StrLen, Lang, Cntry, NULL, NULL);
+    if (Wide == NULL) return 0;
+
+    UTF8len = encodeUTF8(NULL, Wide, StrLen / sizeof(wchar_t), BufferSize);
+
+    // Maybe we want only to know the len?
+    if (Buffer == NULL) return UTF8len + 1; // Note the zero at the end
+
+    // No buffer size means no data
+    if (BufferSize <= 0) return 0;
+
+    // Some clipping may be required
+    if (BufferSize < UTF8len + 1)
+        UTF8len = BufferSize - 1;
+
+    // A character that does not fit is dropped as a whole, so less than UTF8len bytes may get
+    // stored. Clear the buffer first so the text ends right after the last complete character.
+    memset(Buffer, 0, BufferSize);
+    encodeUTF8(Buffer, Wide, StrLen / sizeof(wchar_t), BufferSize);
+
+    // We put a termination "\0"
+    Buffer[UTF8len] = 0;
+    return UTF8len + 1;
+}
+
 // Obtain a wide representation of the MLU, on depending on current locale settings
 cmsUInt32Number CMSEXPORT cmsMLUgetWide(const cmsMLU* mlu,
                                       const char LanguageCode[3], const char CountryCode[3],
@@ -441,7 +654,7 @@ cmsUInt32Number CMSEXPORT cmsMLUgetWide(const cmsMLU* mlu,
     // Maybe we want only to know the len?
     if (Buffer == NULL) return StrLen + sizeof(wchar_t);
 
-  // No buffer size means no data
+    // No buffer size means no data
     if (BufferSize <= 0) return 0;
 
     // Some clipping may be required
diff --git a/src/cmstypes.c b/src/cmstypes.c
index b0e8c7ec..4f2d2eec 100644
--- a/src/cmstypes.c
+++ b/src/cmstypes.c
@@ -925,6 +925,7 @@ static
 void *Type_Text_Description_Read(struct _cms_typehandler_struct* self, cmsIOHANDLER* io, cmsUInt32Number* nItems, cmsUInt32Number SizeOfTag)
 {
     char* Text = NULL;
+    wchar_t* UnicodeString = NULL;
     cmsMLU* mlu = NULL;
     cmsUInt32Number  AsciiCount;
     cmsUInt32Number  i, UnicodeCode, UnicodeCount;
@@ -944,7 +945,7 @@ void *Type_Text_Description_Read(struct _cms_typehandler_struct* self, cmsIOHAND
     if (SizeOfTag < AsciiCount) return NULL;
 
     // All seems Ok, allocate the container
-    mlu = cmsMLUalloc(self ->ContextID, 1);
+    mlu = cmsMLUalloc(self ->ContextID, 2);
     if (mlu == NULL) return NULL;
 
     // As many memory as size of tag
@@ -971,13 +972,24 @@ void *Type_Text_Description_Read(struct _cms_typehandler_struct* self, cmsIOHAND
 
     if (SizeOfTag < UnicodeCount*sizeof(cmsUInt16Number)) goto Done;
 
-    for (i=0; i < UnicodeCount; i++) {
-        if (!io ->Read(io, &Dummy, sizeof(cmsUInt16Number), 1)) goto Done;
-    }
+    // One extra zeroed element, because cmsMLUsetWide() is given no length and needs a terminator
+    UnicodeString = (wchar_t*)_cmsCalloc(self->ContextID, UnicodeCount + 1, sizeof(wchar_t));
+    if (UnicodeString == NULL) goto Done;
+
+    if (!_cmsReadWCharArray(io, UnicodeCount, UnicodeString) ||
+        !cmsMLUsetWide(mlu, cmsV2Unicode, cmsV2Unicode, UnicodeString)) {
+
+        // Do not leak the temporary string when bailing out
+        _cmsFree(self->ContextID, (void*)UnicodeString);
+        goto Done;
+    }
+
+    _cmsFree(self->ContextID, (void*)UnicodeString);
+
     SizeOfTag -= UnicodeCount*sizeof(cmsUInt16Number);
 
     // Skip ScriptCode code if present. Some buggy profiles does have less
-    // data that stricttly required. We need to skip it as this type may come
+    // data that strictly required. We need to skip it as this type may come
     // embedded in other types.
 
     if (SizeOfTag >= sizeof(cmsUInt16Number) + sizeof(cmsUInt8Number) + 67) {
@@ -1049,7 +1061,7 @@ cmsBool Type_Text_Description_Write(struct _cms_typehandler_struct* self, cmsIO
 
         // Get both representations.
         cmsMLUgetASCII(mlu, cmsNoLanguage, cmsNoCountry,  Text, len * sizeof(char));
-        cmsMLUgetWide(mlu,  cmsNoLanguage, cmsNoCountry,  Wide, len * sizeof(wchar_t));
+        cmsMLUgetWide(mlu,  cmsV2Unicode,  cmsV2Unicode,  Wide, len * sizeof(wchar_t));
     }
 
     // Tell the real text len including the null terminator and padding
diff --git a/src/lcms2.def b/src/lcms2.def
index 5bd26783..7853eb6f 100644
--- a/src/lcms2.def
+++ b/src/lcms2.def
@@ -114,6 +114,7 @@ cmsGetPostScriptCRD = cmsGetPostScriptCRD
 cmsGetPostScriptCSA                = cmsGetPostScriptCSA
 cmsGetProfileInfo                  = cmsGetProfileInfo
 cmsGetProfileInfoASCII             = cmsGetProfileInfoASCII
+cmsGetProfileInfoUTF8              = cmsGetProfileInfoUTF8
 cmsGetProfileContextID             = cmsGetProfileContextID
 cmsGetProfileVersion               = cmsGetProfileVersion
 cmsGetSupportedIntents             = cmsGetSupportedIntents
@@ -209,8 +210,10 @@ cmsMLUfree = cmsMLUfree
 cmsMLUgetASCII                     = cmsMLUgetASCII
 cmsMLUgetTranslation               = cmsMLUgetTranslation
 cmsMLUgetWide                      = cmsMLUgetWide
+cmsMLUgetUTF8                      = cmsMLUgetUTF8
 cmsMLUsetASCII                     = cmsMLUsetASCII
 cmsMLUsetWide                      = cmsMLUsetWide
+cmsMLUsetUTF8                      = cmsMLUsetUTF8
 cmsStageAllocCLut16bit             = cmsStageAllocCLut16bit
 cmsStageAllocCLut16bitGranular     = cmsStageAllocCLut16bitGranular
 cmsStageAllocCLutFloat             = cmsStageAllocCLutFloat
diff --git a/src/lcms2_internal.h b/src/lcms2_internal.h
index 84a62fbf..8e78a71e 100644
--- a/src/lcms2_internal.h
+++ b/src/lcms2_internal.h
@@ -260,6 +260,7 @@ typedef CRITICAL_SECTION _cmsMutex;
 #ifdef _MSC_VER
 #    if (_MSC_VER >= 1800)
 #          pragma warning(disable : 26135)
+#          pragma warning(disable : 4127)
 #    endif
 #endif
 
diff --git a/testbed/testcms2.c b/testbed/testcms2.c
index f536e3ce..0f84445d 100644
--- a/testbed/testcms2.c
+++ b/testbed/testcms2.c
@@ -3676,6 +3676,36 @@ cmsInt32Number CheckMLU(void)
 
 }
 
+// Check UTF8 encoding
+static
+cmsInt32Number CheckMLU_UTF8(void)
+{
+    cmsMLU* mlu;
+    char Buffer[256] = "";
+    cmsInt32Number rc = 1;
+
+    mlu = cmsMLUalloc(DbgThread(), 0);
+
+    cmsMLUsetWide(mlu, "en", "US", L"\x3b2\x14b");
+
+    cmsMLUgetUTF8(mlu, "en", "US", Buffer, 256);
+    if (strcmp(Buffer, "\xce\xb2\xc5\x8b") != 0) rc = 0;
+
+    // A buffer too short for both characters has to drop the second one as a whole
+    if (rc != 0) {
+        cmsMLUgetUTF8(mlu, "en", "US", Buffer, 4);
+        if (strcmp(Buffer, "\xce\xb2") != 0) rc = 0;
+    }
+
+    if (rc == 0)
+        Fail("Unexpected string '%s'", Buffer);
+
+    cmsMLUfree(mlu);
+    return rc;
+}
+
+
+
 // A lightweight test of named color structures.
 static
 cmsInt32Number CheckNamedColorList(void)
@@ -9597,6 +9627,7 @@ int main(int argc, char* argv[])
 
     // MLU
     Check("Multilocalized Unicode", CheckMLU);
+    Check("Multilocalized Unicode (II)", CheckMLU_UTF8);
 
     // Named color
     Check("Named color lists", CheckNamedColorList);