24.8 如何在GB2312与Unicode之间互相转换
Q:
在Windows上有WideCharToMultiByte()、MultiByteToWideChar()等函数可用,Unix上有类似的函数吗?
A: scz
请"man iconv_open",然后动用Google进行相关搜索。之所以没有建议"man iconv", 因为iconv(1)对应着一个命令,Linux上应该看iconv(3),Solaris上应该看iconv(3C), 其它系统各有变化,但"man iconv_open"总是一致的。
我们就iconv*()函数进行了简单测试:
/*
 * gcc -DLinux -Wall -pipe -O3 -s -o iconv_test iconv_test.c
 * gcc -DSparc -Wall -pipe -O3 -s -o iconv_test iconv_test.c
 */
#if 0
我的一台FreeBSD测试机上居然没有/usr/include/iconv.h
而另一台AIX测试机上则根本不支持GB2312,编译时需指定库
gcc -DAix -Wall -pipe -O3 -s -o iconv_test iconv_test.c -liconv
显然,在各个缺省安装的Unix系统上iconv*()被支持的程度不一,除非自行安装libiconv,否则这是一个高度不可移植的选择,决定放弃iconv*()。
本程序仅为测试目的存在,没有实用价值。
#endif
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <string.h>
#include <iconv.h>
/*
 * Decide whether a byte is echoed verbatim in the character column of the
 * dump; every other byte is shown as '.'.
 */
static int outputBinaryIsShown ( unsigned char byte )
{
#if 1
    /*
     * Let 0x80..0xFE through as well, so that multi-byte text in the dumped
     * buffer can still be read on a terminal using that encoding. Control
     * characters, 0x7F and 0xFF are masked.
     */
    return( ( byte >= ' ' ) && ( byte != 0x7F ) && ( byte < 0xFF ) );
#else
    /* Strict 7-bit printable ASCII only. */
    return( ( byte >= ' ' ) && ( byte < 0x7F ) );
#endif
}  /* end of outputBinaryIsShown */

/*
 * Write one dump row: the offset, up to 16 hex bytes (a '-' replaces the
 * blank in front of the 9th byte), then the same bytes as characters.
 *
 * row     first byte of the row
 * rowLen  number of bytes in the row, 1..16
 * offset  offset of the row inside the whole buffer, printed as 8 hex digits
 */
static void outputBinaryRow ( FILE *out, const unsigned char *row, size_t rowLen, size_t offset )
{
    size_t j;

    fprintf( out, "%08lX ", ( unsigned long )offset );
    for ( j = 0; j < rowLen; j++ )
    {
        fprintf( out, ( j == 8 ) ? "-%02X" : " %02X", ( unsigned int )row[j] );
    }
    /*
     * Every byte occupies three columns in the hex area, so a short row is
     * padded with three blanks per missing byte; this keeps the character
     * column aligned with the one of the full rows.
     */
    for ( j = rowLen; j < 16; j++ )
    {
        fprintf( out, "   " );
    }
    fprintf( out, " " );
    for ( j = 0; j < rowLen; j++ )
    {
        if ( outputBinaryIsShown( row[j] ) )
        {
            fprintf( out, "%c", row[j] );
        }
        else
        {
            fprintf( out, "." );
        }
    }
    fprintf( out, "\n" );
}  /* end of outputBinaryRow */

/*
 * Hex/character dump of a byte buffer.
 *
 * A header line "byteArray [ N bytes ] ->" is always written. It is followed
 * by one row per 16 bytes of input; the last row may be shorter. Nothing but
 * the header is written when byteArrayLen is 0.
 *
 * out           stream receiving the dump
 * byteArray     buffer to dump, only read
 * byteArrayLen  number of bytes to dump
 */
static void outputBinary ( FILE *out, const unsigned char *byteArray, size_t byteArrayLen )
{
    size_t offset    = 0;
    size_t remaining = byteArrayLen;

    fprintf( out, "byteArray [ %lu bytes ] ->\n", ( unsigned long )byteArrayLen );
    while ( remaining > 0 )
    {
        size_t rowLen = ( remaining < 16 ) ? remaining : 16;

        outputBinaryRow( out, byteArray + offset, rowLen, offset );
        offset    += rowLen;
        remaining -= rowLen;
    }
    return;
}  /* end of outputBinary */
#ifdef Linux
/*
 * Linux variant of the test driver.
 *
 * Converts the test string GB2312 -> UCS-2, then converts the result back
 * UCS-2 -> GB2312 into the original buffer, dumping every buffer to stderr
 * with outputBinary().
 *
 * Returns EXIT_SUCCESS when both conversions succeed, EXIT_FAILURE as soon
 * as iconv_open() or iconv() reports an error.
 *
 * NOTE(review): the bytes held by the string literal depend on the encoding
 * this source file is saved in; the conversion presumably needs the file to
 * be stored as GB2312 -- confirm before building.
 */
int main ( int argc, char * argv[] )
{
    int      ret    = EXIT_FAILURE;
    iconv_t  cd     = ( iconv_t )-1;
    char     in[]   = "GB2312与Unicode双向转换测试";
    size_t   inlen  = sizeof( in );
    char    *inp    = in;
    char     out[sizeof(in)*2];
    size_t   outlen = sizeof( out );
    char    *outp   = out;

    /* The command line is not used. */
    ( void )argc;
    ( void )argv;

    /*
     * Encoding names are case-insensitive here (original author's note).
     */
    if ( ( iconv_t )-1 == ( cd = iconv_open( "UCS-2", "GB2312" ) ) )
    {
        perror( "iconv_open() error" );
        goto main_exit;
    }
    /* outputBinary() takes unsigned bytes, the buffers are plain char. */
    outputBinary( stderr, ( unsigned char * )in, sizeof( in ) );
    /* iconv() returns size_t, so failure is (size_t)-1. */
    if ( ( size_t )-1 == iconv( cd, &inp, &inlen, &outp, &outlen ) )
    {
        perror( "iconv() error" );
        goto main_exit;
    }
    /* iconv() decreased outlen by the number of bytes it produced. */
    outputBinary( stderr, ( unsigned char * )out, sizeof( out ) - outlen );

    /*
     * Second pass: the converted bytes in out[] become the input and in[]
     * is reused as the output buffer.
     */
    outlen = sizeof( out ) - outlen;
    outp   = out;
    inlen  = sizeof( in );
    inp    = in;
    if ( ( iconv_t )-1 != cd )
    {
        iconv_close( cd );
        cd = ( iconv_t )-1;
    }
    if ( ( iconv_t )-1 == ( cd = iconv_open( "GB2312", "UCS-2" ) ) )
    {
        perror( "iconv_open() error" );
        goto main_exit;
    }
    if ( ( size_t )-1 == iconv( cd, &outp, &outlen, &inp, &inlen ) )
    {
        perror( "iconv() error" );
        goto main_exit;
    }
    outputBinary( stderr, ( unsigned char * )in, sizeof( in ) - inlen );
    ret = EXIT_SUCCESS;

main_exit:

    /* Single exit point: release the descriptor if one is still open. */
    if ( ( iconv_t )-1 != cd )
    {
        iconv_close( cd );
        cd = ( iconv_t )-1;
    }
    return( ret );
}  /* end of main */
#if 0
[scz@ /home/scz/src]> ./iconv_test
byteArray [ 28 bytes ] ->
00000000 47 42 32 33 31 32 D3 EB-55 6E 69 63 6F 64 65 CB GB2312与Unicode
00000010 AB CF F2 D7 AA BB BB B2-E2 CA D4 00
byteArray [ 42 bytes ] ->
00000000 47 00 42 00 32 00 33 00-31 00 32 00 0E 4E 55 00 G.B.2.3.1.2..NU.
00000010 6E 00 69 00 63 00 6F 00-64 00 65 00 CC 53 11 54 n.i.c.o.d.e.
00000020 6C 8F 62 63 4B 6D D5 8B-00 00
byteArray [ 28 bytes ] ->
00000000 47 42 32 33 31 32 D3 EB-55 6E 69 63 6F 64 65 CB GB2312与Unicode
00000010 AB CF F2 D7 AA BB BB B2-E2 CA D4 00
#endif
#elif defined(Sparc)
/*
 * SPARC/Solaris variant of the test driver.
 *
 * Runs the chain gb2312 -> UTF-8 -> UCS-2 -> UTF-8 -> gb2312 and dumps every
 * intermediate buffer to stderr with outputBinary(). in[] holds the gb2312
 * text, out[] the UTF-8 text and xxx[] the UCS-2 text.
 *
 * Returns EXIT_SUCCESS when all four conversions succeed, EXIT_FAILURE as
 * soon as iconv_open() or iconv() reports an error.
 *
 * The ( const char ** ) casts on the input pointer match the iconv()
 * prototype this variant was written against.
 *
 * NOTE(review): the bytes held by the string literal depend on the encoding
 * this source file is saved in; the conversion presumably needs the file to
 * be stored as GB2312 -- confirm before building.
 */
int main ( int argc, char * argv[] )
{
    int      ret    = EXIT_FAILURE;
    iconv_t  cd     = ( iconv_t )-1;
    char     in[]   = "GB2312与Unicode双向转换测试";
    size_t   inlen  = sizeof( in );
    char    *inp    = in;
    char     out[sizeof(in)*2];
    size_t   outlen = sizeof( out );
    char    *outp   = out;
    char     xxx[sizeof(out)];
    size_t   xxxlen = sizeof( xxx );
    char    *xxxp   = xxx;

    /* The command line is not used. */
    ( void )argc;
    ( void )argv;

    /*
     * Encoding names are case-sensitive, and on SPARC/Solaris there is no
     * direct conversion from "gb2312" to "UCS-2" (original author's note),
     * hence the detour through UTF-8.
     */

    /* Step 1: gb2312 (in) -> UTF-8 (out) */
    if ( ( iconv_t )-1 == ( cd = iconv_open( "UTF-8", "gb2312" ) ) )
    {
        perror( "iconv_open() error" );
        goto main_exit;
    }
    /* outputBinary() takes unsigned bytes, the buffers are plain char. */
    outputBinary( stderr, ( unsigned char * )in, sizeof( in ) );
    /* iconv() returns size_t, so failure is (size_t)-1. */
    if ( ( size_t )-1 == iconv( cd, ( const char ** )&inp, &inlen, &outp, &outlen ) )
    {
        perror( "iconv() error" );
        goto main_exit;
    }
    /* iconv() decreased outlen by the number of bytes it produced. */
    outputBinary( stderr, ( unsigned char * )out, sizeof( out ) - outlen );

    /* Step 2: UTF-8 (out) -> UCS-2 (xxx) */
    outlen = sizeof( out ) - outlen;
    outp   = out;
    if ( ( iconv_t )-1 != cd )
    {
        iconv_close( cd );
        cd = ( iconv_t )-1;
    }
    if ( ( iconv_t )-1 == ( cd = iconv_open( "UCS-2", "UTF-8" ) ) )
    {
        perror( "iconv_open() error" );
        goto main_exit;
    }
    if ( ( size_t )-1 == iconv( cd, ( const char ** )&outp, &outlen, &xxxp, &xxxlen ) )
    {
        perror( "iconv() error" );
        goto main_exit;
    }
    outputBinary( stderr, ( unsigned char * )xxx, sizeof( xxx ) - xxxlen );

    /* Step 3: UCS-2 (xxx) -> UTF-8 (out, reused as output) */
    xxxlen = sizeof( xxx ) - xxxlen;
    xxxp   = xxx;
    outlen = sizeof( out );
    outp   = out;
    if ( ( iconv_t )-1 != cd )
    {
        iconv_close( cd );
        cd = ( iconv_t )-1;
    }
    if ( ( iconv_t )-1 == ( cd = iconv_open( "UTF-8", "UCS-2" ) ) )
    {
        perror( "iconv_open() error" );
        goto main_exit;
    }
    if ( ( size_t )-1 == iconv( cd, ( const char ** )&xxxp, &xxxlen, &outp, &outlen ) )
    {
        perror( "iconv() error" );
        goto main_exit;
    }
    outputBinary( stderr, ( unsigned char * )out, sizeof( out ) - outlen );

    /* Step 4: UTF-8 (out) -> gb2312 (in, reused as output) */
    outlen = sizeof( out ) - outlen;
    outp   = out;
    inlen  = sizeof( in );
    inp    = in;
    if ( ( iconv_t )-1 != cd )
    {
        iconv_close( cd );
        cd = ( iconv_t )-1;
    }
    if ( ( iconv_t )-1 == ( cd = iconv_open( "gb2312", "UTF-8" ) ) )
    {
        perror( "iconv_open() error" );
        goto main_exit;
    }
    if ( ( size_t )-1 == iconv( cd, ( const char ** )&outp, &outlen, &inp, &inlen ) )
    {
        perror( "iconv() error" );
        goto main_exit;
    }
    outputBinary( stderr, ( unsigned char * )in, sizeof( in ) - inlen );
    ret = EXIT_SUCCESS;

main_exit:

    /* Single exit point: release the descriptor if one is still open. */
    if ( ( iconv_t )-1 != cd )
    {
        iconv_close( cd );
        cd = ( iconv_t )-1;
    }
    return( ret );
}  /* end of main */
#if 0
[scz@ /export/home/scz/src]> ./iconv_test
byteArray [ 28 bytes ] ->
00000000 47 42 32 33 31 32 D3 EB-55 6E 69 63 6F 64 65 CB GB2312与Unicode.
00000010 AB CF F2 D7 AA BB BB B2-E2 CA D4 00
byteArray [ 35 bytes ] ->
00000000 47 42 32 33 31 32 E4 B8-8E 55 6E 69 63 6F 64 65 GB2312
00000010 E5 8F 8C E5 90 91 E8 BD-AC E6 8D A2 E6 B5 8B E8
00000020 AF 95 00
byteArray [ 44 bytes ] ->
00000000 FE FF 00 47 00 42 00 32-00 33 00 31 00 32 4E 0E ..G.B.2.3.1.2N.
00000010 00 55 00 6E 00 69 00 63-00 6F 00 64 00 65 53 CC .U.n.i.c.o.d.eS.
00000020 54 11 8F 6C 63 62 6D 4B-8B D5 00 00 T.
byteArray [ 35 bytes ] ->
00000000 47 42 32 33 31 32 E4 B8-8E 55 6E 69 63 6F 64 65 GB2312
00000010 E5 8F 8C E5 90 91 E8 BD-AC E6 8D A2 E6 B5 8B E8
00000020 AF 95 00
byteArray [ 28 bytes ] ->
00000000 47 42 32 33 31 32 D3 EB-55 6E 69 63 6F 64 65 CB GB2312与Unicode.
00000010 AB CF F2 D7 AA BB BB B2-E2 CA D4 00
#endif
#endif
鉴于缺省情况下iconv*()不具有良好的可移植性,可以考虑其它方案。其中一种替代方案是在.c中预定义相关码表,自行完成查表转换工作。但这种方案要考虑字节序的问题,参看前面x86/Linux、SPARC/Solaris上的输出信息,也不具有良好可移植性。