#ifdef WIN32 #include #endif #if defined LINUX || defined ANDROID #include #ifndef GLTB_NO_ICU // use ICU for conversion #include #endif #endif #include "GLTB/exception.h" #include "GLTB/stringconvert.h" namespace gltb { #if !defined WIN32 && defined GLTB_NO_ICU size_t getUtf16EncodedLength(const char *input) { size_t encodedLength = 0; size_t length = strlen(input); size_t i = 0; do { if((input[i] & 0x80) == 0) { encodedLength++; } else if((input[i] & 0xC0) <= 0xB0) { i++; encodedLength++; } else if(input[i] <= 0xFFFF) { i += 2; encodedLength++; } else if(input[i] <= 0x1FFFFF) { i += 3; encodedLength++; } else if(input[i] <= 0x3FFFFFF) { i += 4; encodedLength++; } else { i += 5; encodedLength++; } i++; } while(i < length); return encodedLength; } #endif wchar_t *utf8ToUtf16WChar(const char *input) { #if defined WIN32 int bufferSize=MultiByteToWideChar(CP_UTF8,0,input,-1,NULL,0); if(bufferSize==0 && strlen(input)!=0) { wchar_t *windowsMessage; DWORD lastError = GetLastError(); FormatMessage( FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, lastError, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPTSTR) &windowsMessage, 0, NULL ); /* * Now this is some fine insanity here: having encountered a charset conversion that went wrong, * we now have to rely on another one on a totally unknown string in order to get an error * message onto the screen. */ char *utf8WindowsMessage=utf16ToUtf8Char(windowsMessage); throw Exception("UTF8 to UTF16 string conversion failed; Windows error message claims " + std::string(utf8WindowsMessage),"gltb::utf8ToUtf16WChar()"); } wchar_t *result=new wchar_t[bufferSize]; if(MultiByteToWideChar(CP_UTF8,0,input,-1,result,bufferSize)!=bufferSize) { throw Exception("UTF8 to UTF16 string conversion failed","gltb::utf8ToUtf16WChar()"); } return result; #elif (defined LINUX || defined ANDROID) && !defined GLTB_NO_ICU int bufferSize=0; UErrorCode errorCode=U_ZERO_ERROR; int numSubs; u_strFromUTF8WithSub(NULL,0,&bufferSize,input,-1,L' ',&numSubs,&errorCode); if(errorCode==U_STRING_NOT_TERMINATED_WARNING) { /* * This is an interesting wrinkle in the behaviour of ICU: the * library happily generates unterminated strings and indicates * that using the error code. So here we are, having fit the * converted string into 0 bytes without terminator. In other * words: the output is the empty string. */ wchar_t *result=new wchar_t[1]; result[0]=0; return result; } else if(errorCode!=U_BUFFER_OVERFLOW_ERROR) { throw Exception("UTF8 to UTF16 string conversion failed while trying to get temporary buffer size, error: " + std::string(u_errorName(errorCode)),"gltb::utf8ToUtf16WChar()"); } UChar *temp=new UChar[bufferSize+1]; memset(temp,0,(bufferSize+1)*sizeof(UChar)); errorCode=U_ZERO_ERROR; u_strFromUTF8WithSub(temp,bufferSize,&bufferSize,input,-1,L' ',&numSubs,&errorCode); if(errorCode>0) { throw Exception("UTF8 to UTF16 string conversion failed, error: " + std::string(u_errorName(errorCode)),"gltb::utf8ToUtf16WChar()"); } bufferSize=0; errorCode=U_ZERO_ERROR; u_strToWCS(NULL,0,&bufferSize,temp,-1,&errorCode); if(errorCode!=U_BUFFER_OVERFLOW_ERROR) { throw Exception("UTF8 to UTF16 string conversion failed while trying to get output buffer size, error: " + std::string(u_errorName(errorCode)),"gltb::utf8ToUtf16WChar()"); } wchar_t *result=new wchar_t[bufferSize+1]; memset(result,0,(bufferSize+1)*sizeof(wchar_t)); errorCode=U_ZERO_ERROR; u_strToWCS(result,bufferSize,&bufferSize,temp,-1,&errorCode); if(errorCode>0) { throw Exception("UTF8 to UTF16 string conversion failed, error: " + std::string(u_errorName(errorCode)),"gltb::utf8ToUtf16WChar()"); } delete[] temp; return result; #elif (defined LINUX || defined ANDROID) && defined GLTB_NO_ICU size_t inputLength = strlen(input); size_t resultLength = getUtf16EncodedLength(input); wchar_t *result = new wchar_t[resultLength + 1]; memset(result, 0, (resultLength + 1) * sizeof(wchar_t)); size_t inputIndex = 0; for(size_t i = 0; i < resultLength; i++) { // TODO error checking of the encodings - not all possible byte sequences are valid UTF-8 if((input[inputIndex] & 0x80) == 0) { // one byte sequence result[i] = input[inputIndex]; inputIndex++; } else if((input[inputIndex] & 0xE0) == 0xC0) { // two byte sequence result[i] = ((input[inputIndex] & 0x1F) << 6) + (input[inputIndex + 1] & 0x3F); inputIndex += 2; } else if((input[inputIndex] & 0xF0) == 0xE0) { // three byte sequence result[i] = ((input[inputIndex] & 0x0F) << 12) + ((input[inputIndex + 1] & 0x3F) << 6) + (input[inputIndex + 2] & 0x3F); inputIndex += 3; } else if((input[inputIndex] & 0xF8) == 0xF0) { // four byte sequence result[i] = ((input[inputIndex] & 0x07) << 18) + ((input[inputIndex + 1] & 0x3F) << 12) + ((input[inputIndex + 2] & 0x3F) << 6) + (input[inputIndex + 3] & 0x3F); inputIndex += 4; } else if((input[inputIndex] & 0xFC) == 0xF8) { // five byte sequence result[i] = ((input[inputIndex] & 0x03) << 24) + ((input[inputIndex + 1] & 0x3F) << 18) + ((input[inputIndex + 2] & 0x3F) << 12) + ((input[inputIndex + 3] & 0x3F) << 6) + (input[inputIndex + 4] & 0x3F); inputIndex += 5; } else if((input[inputIndex] & 0xFE) == 0xFC) { // six byte sequence result[i] = ((input[inputIndex] & 0x01) << 30) + ((input[inputIndex + 1] & 0x3F) << 24) + ((input[inputIndex + 2] & 0x3F) << 18) + ((input[inputIndex + 3] & 0x3F) << 12) + ((input[inputIndex + 4] & 0x3F) << 6) + (input[inputIndex + 5] & 0x3F); inputIndex += 6; } else { // getting into this branch means that the input is definitely not strictly valid delete[] result; throw Exception("UTF8 to UTF16 string conversion failed because input contains invalid byte sequences", "gltb::utf8ToUtf16WChar()"); } } return result; #else #error unimplemented function on this target platform #endif } #if !defined WIN32 && defined GLTB_NO_ICU size_t getUtf8EncodedLength(const wchar_t *input) { size_t encodedLength = 0; size_t length = wcslen(input); for(size_t i = 0; i < length; i++) { if(input[i] <= 0x7F) { encodedLength++; } else if(input[i] <= 0x07FF) { encodedLength += 2; } else if(input[i] <= 0xFFFF) { encodedLength += 3; } else if(input[i] <= 0x1FFFFF) { encodedLength += 4; } else if(input[i] <= 0x3FFFFFF) { encodedLength += 5; } else { encodedLength += 6; } } return encodedLength; } #endif char *utf16ToUtf8Char(const wchar_t *input) { #ifdef WIN32 int bufferSize=WideCharToMultiByte(CP_UTF8,0,input,-1,NULL,0,NULL,NULL); if(bufferSize==0 && wcslen(input)!=0) { wchar_t *windowsMessage; DWORD lastError = GetLastError(); FormatMessage( FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, lastError, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPTSTR) &windowsMessage, 0, NULL ); /* * Now this is some fine insanity here: having encountered a charset conversion that went wrong, * we now have to rely on another one on a totally unknown string in order to get an error * message onto the screen. */ char *utf8WindowsMessage=utf16ToUtf8Char(windowsMessage); throw Exception("UTF16 to UTF8 string conversion failed; Windows error message claims " + std::string(utf8WindowsMessage),"gltb::utf16ToUtf8Char()"); } char *result=new char[bufferSize]; if(WideCharToMultiByte(CP_UTF8,0,input,-1,result,bufferSize,NULL,NULL)!=bufferSize) { throw Exception("UTF16 to UTF8 string conversion failed","gltb::utf16ToUtf8Char()"); } return result; #elif (defined LINUX || defined ANDROID) && !defined GLTB_NO_ICU int bufferSize=0; UErrorCode errorCode=U_ZERO_ERROR; u_strFromWCS(NULL,0,&bufferSize,input,-1,&errorCode); if(errorCode==U_STRING_NOT_TERMINATED_WARNING) { /* * This is an interesting wrinkle in the behaviour of ICU: the * library happily generates unterminated strings and indicates * that using the error code. So here we are, having fit the * converted string into 0 bytes without terminator. In other * words: the output is the empty string. */ char *result=new char[1]; result[0]=0; return result; } else if(errorCode!=U_BUFFER_OVERFLOW_ERROR) { throw Exception("UTF16 to UTF8 string conversion failed while trying to get temporary buffer size, error: " + std::string(u_errorName(errorCode)),"gltb::utf16ToUtf8Char()"); } UChar *temp=new UChar[bufferSize+1]; memset(temp,0,(bufferSize+1)*sizeof(UChar)); errorCode=U_ZERO_ERROR; u_strFromWCS(temp,bufferSize,&bufferSize,input,-1,&errorCode); if(errorCode>0) { throw Exception("UTF16 to UTF8 string conversion failed, error: " + std::string(u_errorName(errorCode)),"gltb::utf16ToUtf8har()"); } bufferSize=0; errorCode=U_ZERO_ERROR; u_strToUTF8(NULL,0,&bufferSize,temp,-1,&errorCode); if(errorCode!=U_BUFFER_OVERFLOW_ERROR) { throw Exception("UTF16 to UTF8 string conversion failed while trying to get output buffer size, error: " + std::string(u_errorName(errorCode)),"gltb::utf16ToUtf8Char()"); } char *result=new char[bufferSize+1]; memset(result,0,bufferSize+1); errorCode=U_ZERO_ERROR; u_strToUTF8(result,bufferSize,&bufferSize,temp,-1,&errorCode); if(errorCode>0) { throw Exception("UTF16 to UTF8 string conversion failed, error: " + std::string(u_errorName(errorCode)),"gltb::utf16ToUtf8har()"); } delete[] temp; return result; #elif (defined LINUX || defined ANDROID) && defined GLTB_NO_ICU size_t inputLength = wcslen(input); size_t resultLength = getUtf8EncodedLength(input); char *result = new char[resultLength + 1]; memset(result, 0, resultLength + 1); int resultIndex = 0; for(size_t i = 0; i < inputLength; i++) { if(input[i] <= 0x7F) { result[i] = input[i]; resultIndex++; } else if(input[i] <= 0x07FF) { // 2 byte encoding wchar_t masked = input[i] & 0x07FF; result[i] = 0xC0 + (masked >> 6); result[i + 1] = 0x80 + (masked & 0x3F); resultIndex += 2; } else if(input[i] <= 0xFFFF) { // 3 byte encoding wchar_t masked = input[i] & 0xFFFF; result[i] = 0xE0 + (masked >> 12); result[i + 1] = 0x80 + ((masked >> 6) & 0x3F); result[i + 2] = 0x80 + (masked & 0x3F); resultIndex += 3; } else if(input[i] <= 0x1FFFFF) { // 4 byte encoding wchar_t masked = input[i] & 0x1FFFFF; result[i] = 0xF0 + (masked >> 18); result[i + 1] = 0x80 + ((masked >> 12) & 0x3F); result[i + 2] = 0x80 + ((masked >> 6) & 0x3F); result[i + 3] = 0x80 + (masked & 0x3F); resultIndex += 4; } else if(input[i] <= 0x3FFFFFF) { // 5 byte encoding wchar_t masked = input[i] & 0x3FFFFFF; result[i] = 0xF8 + (masked >> 24); result[i + 1] = 0x80 + ((masked >> 18) & 0x3F); result[i + 2] = 0x80 + ((masked >> 12) & 0x3F); result[i + 3] = 0x80 + ((masked >> 6) & 0x3F); result[i + 4] = 0x80 + (masked & 0x3F); resultIndex += 5; } else { // 6 byte encoding result[i] = 0xFC + (input[i] >> 30); result[i + 1] = 0x80 + ((input[i] >> 24) & 0x3F); result[i + 2] = 0x80 + ((input[i] >> 18) & 0x3F); result[i + 3] = 0x80 + ((input[i] >> 12) & 0x3F); result[i + 4] = 0x80 + ((input[i] >> 6) & 0x3F); result[i + 5] = 0x80 + (input[i] & 0x3F); resultIndex += 6; } } return result; #else #error unimplemented function on this target platform #endif } std::wstring utf8ToUtf16WString(std::string input) { wchar_t *result=utf8ToUtf16WChar(input.c_str()); std::wstring resultStr=result; delete[] result; return resultStr; } std::string utf16ToUtf8String(std::wstring input) { char *result=utf16ToUtf8Char(input.c_str()); std::string resultStr=result; delete[] result; return resultStr; } }