Skip to content

Commit

Permalink
Added new functions to deal with UTF8
Browse files Browse the repository at this point in the history
In preparation for the new release, I'm adding new UTF-8 handling functions that should hopefully make localization better.
  • Loading branch information
mm2 committed Nov 3, 2023
1 parent 282c96e commit 5262c5d
Show file tree
Hide file tree
Showing 7 changed files with 283 additions and 21 deletions.
18 changes: 16 additions & 2 deletions include/lcms2.h
Original file line number Diff line number Diff line change
Expand Up @@ -1334,8 +1334,11 @@ CMSAPI cmsBool CMSEXPORT cmsSliceSpaceFloat(cmsUInt32Number nInputs, c

typedef struct _cms_MLU_struct cmsMLU;

#define cmsNoLanguage "\0\0"
#define cmsNoCountry "\0\0"
#define cmsNoLanguage "\0\0"
#define cmsNoCountry "\0\0"

// Special language/country to retrieve unicode field for description in V2 profiles. Use with care.
#define cmsV2Unicode "\xff\xff"

CMSAPI cmsMLU* CMSEXPORT cmsMLUalloc(cmsContext ContextID, cmsUInt32Number nItems);
CMSAPI void CMSEXPORT cmsMLUfree(cmsMLU* mlu);
Expand All @@ -1347,6 +1350,9 @@ CMSAPI cmsBool CMSEXPORT cmsMLUsetASCII(cmsMLU* mlu,
CMSAPI cmsBool CMSEXPORT cmsMLUsetWide(cmsMLU* mlu,
const char LanguageCode[3], const char CountryCode[3],
const wchar_t* WideString);
// Adds (or sets) the entry for the given language/country from a 0-terminated UTF-8 string.
// The text is converted to wchar_t before being stored. Returns FALSE if mlu is NULL,
// if the temporary buffer cannot be allocated, or if the block cannot be added.
CMSAPI cmsBool CMSEXPORT cmsMLUsetUTF8(cmsMLU* mlu,
const char LanguageCode[3], const char CountryCode[3],
const char* UTF8String);

CMSAPI cmsUInt32Number CMSEXPORT cmsMLUgetASCII(const cmsMLU* mlu,
const char LanguageCode[3], const char CountryCode[3],
Expand All @@ -1355,6 +1361,10 @@ CMSAPI cmsUInt32Number CMSEXPORT cmsMLUgetASCII(const cmsMLU* mlu,
CMSAPI cmsUInt32Number CMSEXPORT cmsMLUgetWide(const cmsMLU* mlu,
const char LanguageCode[3], const char CountryCode[3],
wchar_t* Buffer, cmsUInt32Number BufferSize);
// Retrieves the entry for the given language/country as a 0-terminated UTF-8 string.
// Passing Buffer == NULL returns the number of bytes needed, terminator included.
// Otherwise the text is clipped to BufferSize bytes and the number of bytes used
// (terminator included) is returned. Returns 0 if mlu is NULL, if no wide string
// can be obtained for the entry, or if BufferSize is 0.
CMSAPI cmsUInt32Number CMSEXPORT cmsMLUgetUTF8(const cmsMLU* mlu,
const char LanguageCode[3], const char CountryCode[3],
char* Buffer, cmsUInt32Number BufferSize);


CMSAPI cmsBool CMSEXPORT cmsMLUgetTranslation(const cmsMLU* mlu,
const char LanguageCode[3], const char CountryCode[3],
Expand Down Expand Up @@ -1579,6 +1589,10 @@ CMSAPI cmsUInt32Number CMSEXPORT cmsGetProfileInfoASCII(cmsHPROFILE hProfile,
const char LanguageCode[3], const char CountryCode[3],
char* Buffer, cmsUInt32Number BufferSize);

// UTF-8 counterpart of cmsGetProfileInfoASCII: looks up the requested info of the profile
// and returns it through cmsMLUgetUTF8 (same Buffer/BufferSize conventions).
// Returns 0 when the requested info is not available.
CMSAPI cmsUInt32Number CMSEXPORT cmsGetProfileInfoUTF8(cmsHPROFILE hProfile, cmsInfoType Info,
const char LanguageCode[3], const char CountryCode[3],
char* Buffer, cmsUInt32Number BufferSize);

// IO handlers ----------------------------------------------------------------------------------------------------------

typedef struct _cms_io_handler cmsIOHANDLER;
Expand Down
10 changes: 10 additions & 0 deletions src/cmsio1.c
Original file line number Diff line number Diff line change
Expand Up @@ -1027,3 +1027,13 @@ cmsUInt32Number CMSEXPORT cmsGetProfileInfoASCII(cmsHPROFILE hProfile, cmsInfoT

return cmsMLUgetASCII(mlu, LanguageCode, CountryCode, Buffer, BufferSize);
}

// Get the requested profile info as UTF-8. The info is looked up with GetInfo() and
// converted by cmsMLUgetUTF8(), so Buffer/BufferSize follow the conventions of that
// function. A missing info yields 0.
cmsUInt32Number CMSEXPORT cmsGetProfileInfoUTF8(cmsHPROFILE hProfile, cmsInfoType Info,
                                                const char LanguageCode[3], const char CountryCode[3],
                                                char* Buffer, cmsUInt32Number BufferSize)
{
    const cmsMLU* InfoMLU = GetInfo(hProfile, Info);

    return (InfoMLU != NULL)
               ? cmsMLUgetUTF8(InfoMLU, LanguageCode, CountryCode, Buffer, BufferSize)
               : 0;
}
230 changes: 217 additions & 13 deletions src/cmsnamed.c
Original file line number Diff line number Diff line change
Expand Up @@ -200,40 +200,202 @@ void strFrom16(char str[3], cmsUInt16Number n)
str[0] = (char)(n >> 8);
str[1] = (char)n;
str[2] = (char)0;
}


// Convert a 0-terminated UTF-8 string to wchar_t units.
//
//   out : destination, or NULL to only count the units that would be produced.
//   in  : 0-terminated UTF-8 input.
//
// Returns the number of wchar_t units produced (no terminator is written or counted).
// The count is the same whether or not 'out' is NULL, so a first call with NULL can be
// used to size the buffer for a second call.
//
// When wchar_t is wider than 16 bits each code point takes one unit. Otherwise code
// points above 0xFFFF are written as a UTF-16 surrogate pair and code points that fall
// in the surrogate range are dropped. Values above 0x10FFFF are always dropped.
// No other validation of malformed input is done.
static
cmsUInt32Number decodeUTF8(wchar_t* out, const char* in)
{
    cmsUInt32Number codepoint = 0;
    cmsUInt32Number size = 0;

    while (*in)
    {
        cmsUInt8Number ch = (cmsUInt8Number) *in;

        if (ch <= 0x7f)
        {
            // Single byte
            codepoint = ch;
        }
        else if (ch <= 0xbf)
        {
            // Continuation byte: append 6 more bits
            codepoint = (codepoint << 6) | (ch & 0x3f);
        }
        else if (ch <= 0xdf)
        {
            // Lead byte of a 2-byte sequence
            codepoint = ch & 0x1f;
        }
        else if (ch <= 0xef)
        {
            // Lead byte of a 3-byte sequence
            codepoint = ch & 0x0f;
        }
        else
        {
            // Lead byte of a 4-byte sequence
            codepoint = ch & 0x07;
        }

        in++;

        // The sequence is complete when the next byte is not a continuation byte
        if (((*in & 0xc0) != 0x80) && (codepoint <= 0x10ffff))
        {
            if (sizeof(wchar_t) > 2)
            {
                if (out) *out++ = (wchar_t) codepoint;
                size++;
            }
            else if (codepoint > 0xffff)
            {
                // UTF-16 surrogates encode the offset from 0x10000, not the raw code point
                cmsUInt32Number offset = codepoint - 0x10000;

                if (out)
                {
                    *out++ = (wchar_t)(0xd800 + (offset >> 10));
                    *out++ = (wchar_t)(0xdc00 + (offset & 0x03ff));
                }

                // Count both units even when only measuring (out == NULL)
                size += 2;
            }
            else if (codepoint < 0xd800 || codepoint >= 0xe000)
            {
                if (out) *out++ = (wchar_t) codepoint;
                size++;
            }
        }
    }

    return size;
}

// Convert wchar_t units to UTF-8.
//
//   out        : destination, or NULL to only compute the size.
//   in         : source units; conversion stops at a 0 unit or after max_wchars units,
//                whichever comes first. The source does not need to be 0-terminated.
//   max_wchars : maximum number of units to read from 'in'.
//   max_chars  : capacity of 'out' in bytes. A sequence is written only if it fits
//                completely and still leaves one byte free for a terminator.
//
// Returns the number of bytes the whole conversion needs (terminator excluded), even
// if fewer bytes were actually written. No terminator is written by this function.
// A high surrogate (0xD800..0xDBFF) is combined with the unit that follows it.
static
cmsUInt32Number encodeUTF8(char* out, const wchar_t* in, cmsUInt32Number max_wchars, cmsUInt32Number max_chars)
{
    cmsUInt32Number codepoint = 0;
    cmsUInt32Number size = 0;
    cmsUInt32Number len_w = 0;

    // Check the count before dereferencing: in[max_wchars] may be past the end of the source
    while (len_w < max_wchars && *in)
    {
        if (*in >= 0xd800 && *in <= 0xdbff)
        {
            // High surrogate: keep the upper bits and wait for the low half
            codepoint = ((*in - 0xd800) << 10) + 0x10000;
        }
        else
        {
            if (*in >= 0xdc00 && *in <= 0xdfff)
                codepoint |= *in - 0xdc00;
            else
                codepoint = *in;

            if (codepoint <= 0x7f)
            {
                if (out && (size + 1 < max_chars)) *out++ = (char)codepoint;
                size++;
            }
            else if (codepoint <= 0x7ff)
            {
                if (out && (size + 2 < max_chars))
                {
                    *out++ = (char)(cmsUInt32Number)(0xc0 | ((codepoint >> 6) & 0x1f));
                    *out++ = (char)(cmsUInt32Number)(0x80 | (codepoint & 0x3f));
                }
                size += 2;
            }
            else if (codepoint <= 0xffff)
            {
                if (out && (size + 3 < max_chars))
                {
                    *out++ = (char)(cmsUInt32Number)(0xe0 | ((codepoint >> 12) & 0x0f));
                    *out++ = (char)(cmsUInt32Number)(0x80 | ((codepoint >> 6) & 0x3f));
                    *out++ = (char)(cmsUInt32Number)(0x80 | (codepoint & 0x3f));
                }
                size += 3;
            }
            else
            {
                if (out && (size + 4 < max_chars))
                {
                    *out++ = (char)(cmsUInt32Number)(0xf0 | ((codepoint >> 18) & 0x07));
                    *out++ = (char)(cmsUInt32Number)(0x80 | ((codepoint >> 12) & 0x3f));
                    *out++ = (char)(cmsUInt32Number)(0x80 | ((codepoint >> 6) & 0x3f));
                    *out++ = (char)(cmsUInt32Number)(0x80 | (codepoint & 0x3f));
                }
                size += 4;
            }

            codepoint = 0;
        }

        in++; len_w++;
    }

    return size;
}

// Add an ASCII entry. Do not add any \0 termination (ICC1v43_2010-12.pdf page 61)
// In the case the user explicitly sets an empty string, we force a \0
//
// Each char of ASCIIString is widened to wchar_t and the result is stored under the
// given language/country. Returns FALSE if mlu or ASCIIString is NULL, if the
// temporary buffer cannot be allocated, or if the block cannot be added.
cmsBool CMSEXPORT cmsMLUsetASCII(cmsMLU* mlu, const char LanguageCode[3], const char CountryCode[3], const char* ASCIIString)
{
    cmsUInt32Number i, len;
    wchar_t* WStr;
    cmsBool rc;
    cmsUInt16Number Lang  = strTo16(LanguageCode);
    cmsUInt16Number Cntry = strTo16(CountryCode);

    if (mlu == NULL) return FALSE;
    if (ASCIIString == NULL) return FALSE;

    len = (cmsUInt32Number)strlen(ASCIIString);

    // len == 0 would prevent operation, so we set a empty string pointing to zero
    if (len == 0)
    {
        wchar_t empty = 0;
        return AddMLUBlock(mlu, sizeof(wchar_t), &empty, Lang, Cntry);
    }

    WStr = (wchar_t*)_cmsCalloc(mlu->ContextID, len, sizeof(wchar_t));
    if (WStr == NULL) return FALSE;

    for (i = 0; i < len; i++)
        WStr[i] = (wchar_t)ASCIIString[i];

    rc = AddMLUBlock(mlu, len * sizeof(wchar_t), WStr, Lang, Cntry);

    _cmsFree(mlu->ContextID, WStr);
    return rc;
}

// Add an UTF8 entry. Do not add any \0 termination (ICC1v43_2010-12.pdf page 61)
// In the case the user explicitly sets an empty string, we force a \0
//
// The UTF-8 text is decoded to wchar_t and stored under the given language/country.
// Returns FALSE if mlu or UTF8String is NULL, if a non-empty string decodes to
// nothing, if the temporary buffer cannot be allocated, or if the block cannot be added.
cmsBool CMSEXPORT cmsMLUsetUTF8(cmsMLU* mlu, const char LanguageCode[3], const char CountryCode[3], const char* UTF8String)
{
    cmsUInt32Number UTF8len;
    wchar_t* WStr;
    cmsBool rc;
    cmsUInt16Number Lang  = strTo16(LanguageCode);
    cmsUInt16Number Cntry = strTo16(CountryCode);

    if (mlu == NULL) return FALSE;
    if (UTF8String == NULL) return FALSE;

    if (*UTF8String == '\0')
    {
        wchar_t empty = 0;
        return AddMLUBlock(mlu, sizeof(wchar_t), &empty, Lang, Cntry);
    }

    // Len excluding terminator 0
    UTF8len = decodeUTF8(NULL, UTF8String);

    // Nothing could be decoded: do not try to allocate or store a zero-sized block
    if (UTF8len == 0) return FALSE;

    // Get space for dest
    WStr = (wchar_t*) _cmsCalloc(mlu ->ContextID, UTF8len, sizeof(wchar_t));
    if (WStr == NULL) return FALSE;

    decodeUTF8(WStr, UTF8String);

    rc = AddMLUBlock(mlu, UTF8len * sizeof(wchar_t), WStr, Lang, Cntry);

    _cmsFree(mlu ->ContextID, WStr);
    return rc;
}

// We don't need any wcs support library
static
cmsUInt32Number mywcslen(const wchar_t *s)
Expand Down Expand Up @@ -372,7 +534,7 @@ const wchar_t* _cmsMLUgetWide(const cmsMLU* mlu,

if (v->StrW + v->Len > mlu->PoolSize) return NULL;

return(wchar_t*) ((cmsUInt8Number*) mlu ->MemPool + v ->StrW);
return (wchar_t*) ((cmsUInt8Number*) mlu ->MemPool + v ->StrW);
}


Expand Down Expand Up @@ -410,17 +572,59 @@ cmsUInt32Number CMSEXPORT cmsMLUgetASCII(const cmsMLU* mlu,
// Process each character
for (i=0; i < ASCIIlen; i++) {

if (Wide[i] == 0)
Buffer[i] = 0;
wchar_t wc = Wide[i];

if (wc < 0xff)
Buffer[i] = (char)wc;
else
Buffer[i] = (char) Wide[i];
Buffer[i] = '?';
}

// We put a termination "\0"
Buffer[ASCIIlen] = 0;
return ASCIIlen + 1;
}


// Obtain a UTF8 representation of the wide string. Setting buffer to NULL returns the len
//
//   Buffer == NULL : returns the number of bytes needed, terminator included.
//   otherwise      : writes at most BufferSize bytes, always 0-terminated, and returns
//                    the number of bytes used (terminator included).
//
// Returns 0 if mlu is NULL, if no wide string can be obtained for the requested
// language/country, or if BufferSize is 0.
cmsUInt32Number CMSEXPORT cmsMLUgetUTF8(const cmsMLU* mlu,
                                        const char LanguageCode[3], const char CountryCode[3],
                                        char* Buffer, cmsUInt32Number BufferSize)
{
    const wchar_t *Wide;
    cmsUInt32Number StrLen = 0;
    cmsUInt32Number UTF8len;
    cmsUInt32Number i;

    cmsUInt16Number Lang  = strTo16(LanguageCode);
    cmsUInt16Number Cntry = strTo16(CountryCode);

    // Sanitize
    if (mlu == NULL) return 0;

    // Get WideChar
    Wide = _cmsMLUgetWide(mlu, &StrLen, Lang, Cntry, NULL, NULL);
    if (Wide == NULL) return 0;

    UTF8len = encodeUTF8(NULL, Wide, StrLen / sizeof(wchar_t), BufferSize);

    // Maybe we want only to know the len?
    if (Buffer == NULL) return UTF8len + 1; // Note the zero at the end

    // No buffer size means no data
    if (BufferSize == 0) return 0;

    // Some clipping may be required
    if (BufferSize < UTF8len + 1)
        UTF8len = BufferSize - 1;

    // The encoder skips a multi-byte sequence that does not fit completely, so on
    // clipping it may write fewer than UTF8len bytes. Clear the area first so the
    // result ends right after the last complete character instead of exposing
    // whatever the caller's buffer contained.
    for (i = 0; i <= UTF8len; i++)
        Buffer[i] = 0;

    // Process it
    encodeUTF8(Buffer, Wide, StrLen / sizeof(wchar_t), BufferSize);

    // We put a termination "\0"
    Buffer[UTF8len] = 0;
    return UTF8len + 1;
}

// Obtain a wide representation of the MLU, depending on current locale settings
cmsUInt32Number CMSEXPORT cmsMLUgetWide(const cmsMLU* mlu,
const char LanguageCode[3], const char CountryCode[3],
Expand All @@ -441,7 +645,7 @@ cmsUInt32Number CMSEXPORT cmsMLUgetWide(const cmsMLU* mlu,
// Maybe we want only to know the len?
if (Buffer == NULL) return StrLen + sizeof(wchar_t);

// No buffer size means no data
// No buffer size means no data
if (BufferSize <= 0) return 0;

// Some clipping may be required
Expand Down
17 changes: 11 additions & 6 deletions src/cmstypes.c
Original file line number Diff line number Diff line change
Expand Up @@ -925,6 +925,7 @@ static
void *Type_Text_Description_Read(struct _cms_typehandler_struct* self, cmsIOHANDLER* io, cmsUInt32Number* nItems, cmsUInt32Number SizeOfTag)
{
char* Text = NULL;
wchar_t* UnicodeString = NULL;
cmsMLU* mlu = NULL;
cmsUInt32Number AsciiCount;
cmsUInt32Number i, UnicodeCode, UnicodeCount;
Expand All @@ -944,7 +945,7 @@ void *Type_Text_Description_Read(struct _cms_typehandler_struct* self, cmsIOHAND
if (SizeOfTag < AsciiCount) return NULL;

// All seems Ok, allocate the container
mlu = cmsMLUalloc(self ->ContextID, 1);
mlu = cmsMLUalloc(self ->ContextID, 2);
if (mlu == NULL) return NULL;

// As many memory as size of tag
Expand All @@ -971,13 +972,17 @@ void *Type_Text_Description_Read(struct _cms_typehandler_struct* self, cmsIOHAND

if (SizeOfTag < UnicodeCount*sizeof(cmsUInt16Number)) goto Done;

for (i=0; i < UnicodeCount; i++) {
if (!io ->Read(io, &Dummy, sizeof(cmsUInt16Number), 1)) goto Done;
}
UnicodeString = (wchar_t*)_cmsMalloc(self->ContextID, UnicodeCount * sizeof(wchar_t));
if (UnicodeString == NULL) goto Done;

if (!_cmsReadWCharArray(io, UnicodeCount, UnicodeString)) goto Done;
if (!cmsMLUsetWide(mlu, cmsV2Unicode, cmsV2Unicode, UnicodeString)) goto Done;
_cmsFree(self->ContextID, (void*)UnicodeString);

SizeOfTag -= UnicodeCount*sizeof(cmsUInt16Number);

// Skip ScriptCode code if present. Some buggy profiles does have less
// data that stricttly required. We need to skip it as this type may come
// data that strictly required. We need to skip it as this type may come
// embedded in other types.

if (SizeOfTag >= sizeof(cmsUInt16Number) + sizeof(cmsUInt8Number) + 67) {
Expand Down Expand Up @@ -1049,7 +1054,7 @@ cmsBool Type_Text_Description_Write(struct _cms_typehandler_struct* self, cmsIO

// Get both representations.
cmsMLUgetASCII(mlu, cmsNoLanguage, cmsNoCountry, Text, len * sizeof(char));
cmsMLUgetWide(mlu, cmsNoLanguage, cmsNoCountry, Wide, len * sizeof(wchar_t));
cmsMLUgetWide(mlu, cmsV2Unicode, cmsV2Unicode, Wide, len * sizeof(wchar_t));
}

// Tell the real text len including the null terminator and padding
Expand Down
Loading

0 comments on commit 5262c5d

Please sign in to comment.