| 相信一定有不少的程序開(kāi)發(fā)人員時(shí)常會(huì )遇到字符編碼的問(wèn)題,而這個(gè)問(wèn)題也是非常讓人頭痛的。因為這些都是潛在的錯誤,要找出這些錯誤也得要有這方面的開(kāi)發(fā)經(jīng)驗才行。特別是在處理xml文檔時(shí) ,該問(wèn)題的出現就更加的頻繁了,有一次用java寫(xiě)服務(wù)器端程序,用vc寫(xiě)客戶(hù)端與之交互。交互的協(xié)議都是用xml寫(xiě)的。結果在通訊時(shí)老是發(fā)現數據接受不正確。納悶!于是用抓取網(wǎng)絡(luò )數據包工具抓取數據,后來(lái)才發(fā)現原來(lái)是java上x(chóng)ml的頭是這樣的<?xml version="1.0" encoding="UTF-8"?>,而vc上默認的是GB2312。所以一遇到漢字數據就不正確了。去網(wǎng)上找資料,這方面的文章好象特別少,針對像這樣的問(wèn)題,下面我介紹一下我自己寫(xiě)的一個(gè)轉換程序。當然,程序很簡(jiǎn)單。如果有畫(huà)蛇添足的地方,還望各位高手一笑了之。 如果您對UTF-8、Unicode、GB2312等還是很陌生的話(huà),請查看http://www.linuxforum.net/books/UTF-8-Unicode.html,我這里就不浪費口舌了。下面介紹一下WinAPI的兩個(gè)函數:WideCharToMultiByte、MultiByteToWideChar。 函數原型: int WideCharToMultiByte( UINT CodePage, // code page DWORD dwFlags, // performance and mapping flags LPCWSTR lpWideCharStr, // wide-character string int cchWideChar, // number of chars in string LPSTR lpMultiByteStr, // buffer for new string int cbMultiByte, // size of buffer LPCSTR lpDefaultChar, // default for unmappable chars LPBOOL lpUsedDefaultChar // set when default char used); //將寬字符轉換成多個(gè)窄字符int MultiByteToWideChar( UINT CodePage, // code page DWORD dwFlags, // character-type options LPCSTR lpMultiByteStr, // string to map int cbMultiByte, // number of bytes in string LPWSTR lpWideCharStr, // wide-character buffer int cchWideChar // size of buffer);//將多個(gè)窄字符轉換成寬字符需要用到的一些函數: CString CXmlProcess::HexToBin(CString string)//將16進(jìn)制數轉換成2進(jìn)制{ if( string == "0") return "0000"; if( string == "1") return "0001"; if( string == "2") return "0010"; if( string == "3") return "0011"; if( string == "4") return "0100"; if( string == "5") return "0101"; if( string == "6") return "0110"; if( string == "7") return "0111"; if( string == "8") return "1000"; if( string == "9") return "1001"; if( string == "a") return "1010"; if( string == "b") return "1011"; if( string == "c") return "1100"; if( string == "d") return "1101"; if( string == "e") return "1110"; if( string == "f") return "1111"; 
return "";}CString CXmlProcess::BinToHex(CString BinString)//將2進(jìn)制數轉換成16進(jìn)制{ if( BinString == "0000") return "0"; if( BinString == "0001") return "1"; if( BinString == "0010") return "2"; if( BinString == "0011") return "3"; if( BinString == "0100") return "4"; if( BinString == "0101") return "5"; if( BinString == "0110") return "6"; if( BinString == "0111") return "7"; if( BinString == "1000") return "8"; if( BinString == "1001") return "9"; if( BinString == "1010") return "a"; if( BinString == "1011") return "b"; if( BinString == "1100") return "c"; if( BinString == "1101") return "d"; if( BinString == "1110") return "e"; if( BinString == "1111") return "f"; return "";}int CXmlProcess::BinToInt(CString string)//2進(jìn)制字符數據轉換成10進(jìn)制整型{ int len =0; int tempInt = 0; int strInt = 0; for(int i =0 ;i < string.GetLength() ;i ++) { tempInt = 1; strInt = (int)string.GetAt(i)-48; for(int k =0 ;k < 7-i ; k++) { tempInt = 2*tempInt; } len += tempInt*strInt; } return len;} UTF-8轉換成GB2312先把UTF-8轉換成Unicode.然后再把Unicode通過(guò)函數WideCharToMultiByte轉換成GB2312 WCHAR* CXmlProcess::UTF_8ToUnicode(char *ustart) //把UTF-8轉換成Unicode{ char char_one; char char_two; char char_three; int Hchar; int Lchar; char uchar[2]; WCHAR *unicode; CString string_one; CString string_two; CString string_three; CString combiString; char_one = *ustart; char_two = *(ustart+1); char_three = *(ustart+2); string_one.Format("%x",char_one); string_two.Format("%x",char_two); string_three.Format("%x",char_three); string_three = string_three.Right(2); string_two = string_two.Right(2); string_one = string_one.Right(2); string_three = HexToBin(string_three.Left(1))+HexToBin(string_three.Right(1)); string_two = HexToBin(string_two.Left(1))+HexToBin(string_two.Right(1)); string_one = HexToBin(string_one.Left(1))+HexToBin(string_one.Right(1)); combiString = string_one +string_two +string_three; combiString = combiString.Right(20); combiString.Delete(4,2); combiString.Delete(10,2); Hchar = 
BinToInt(combiString.Left(8)); Lchar = BinToInt(combiString.Right(8)); uchar[1] = (char)Hchar; uchar[0] = (char)Lchar; unicode = (WCHAR *)uchar; return unicode;}char * CXmlProcess::UnicodeToGB2312(unsigned short uData) //把Unicode 轉換成 GB2312{ char *buffer ; buffer = new char[sizeof(WCHAR)]; WideCharToMultiByte(CP_ACP,NULL,&uData,1,buffer,sizeof(WCHAR),NULL,NULL); return buffer;} GB2312轉換成UTF-8:先把GB2312通過(guò)函數MultiByteToWideChar轉換成Unicode.然后再把Unicode通過(guò)拆開(kāi)Unicode后拼裝成UTF-8。 WCHAR * CXmlProcess::Gb2312ToUnicode(char *gbBuffer) //GB2312 轉換成 Unicode{ WCHAR *uniChar; uniChar = new WCHAR[1]; ::MultiByteToWideChar(CP_ACP,MB_PRECOMPOSED,gbBuffer,2,uniChar,1); return uniChar;}char * CXmlProcess::UnicodeToUTF_8(WCHAR *UniChar) // Unicode 轉換成UTF-8{ char *buffer; CString strOne; CString strTwo; CString strThree; CString strFour; CString strAnd; buffer = new char[3]; int hInt,lInt; hInt = (int)((*UniChar)/256); lInt = (*UniChar)%256; CString string ; string.Format("%x",hInt); strTwo = HexToBin(string.Right(1)); string = string.Left(string.GetLength() - 1); strOne = HexToBin(string.Right(1)); string.Format("%x",lInt); strFour = HexToBin(string.Right(1)); string = string.Left(string.GetLength() -1); strThree = HexToBin(string.Right(1)); strAnd = strOne +strTwo + strThree + strFour; strAnd.Insert(0,"1110"); strAnd.Insert(8,"10"); strAnd.Insert(16,"10"); strOne = strAnd.Left(8); strAnd = strAnd.Right(16); strTwo = strAnd.Left(8); strThree = strAnd.Right(8); *buffer = (char)BinToInt(strOne); buffer[1] = (char)BinToInt(strTwo); buffer[2] = (char)BinToInt(strThree); return buffer;} 例子:將GB2312轉換成UTF-8的調用: char * CXmlProcess::translateCharToUTF_8(char *xmlStream, int len) { int newCharLen =0 ; int oldCharLen = 0; int revCharLen = len; char* newCharBuffer; char* finalCharBuffer; char *buffer ; CString string; buffer = new char[sizeof(WCHAR)]; newCharBuffer = new char[int(1.5*revCharLen)];//設置最大的一個(gè)緩沖區 while(oldCharLen < revCharLen) { if( *(xmlStream + oldCharLen) >= 0) { 
*(newCharBuffer+newCharLen) = *(xmlStream +oldCharLen); newCharLen ++; oldCharLen ++; }//如果是英文直接復制就可以 else { WCHAR *pbuffer = this->Gb2312ToUnicode(xmlStream+oldCharLen); buffer = this->UnicodeToUTF_8(pbuffer); *(newCharBuffer+newCharLen) = *buffer; *(newCharBuffer +newCharLen +1) = *(buffer + 1); *(newCharBuffer +newCharLen +2) = *(buffer + 2); newCharLen += 3; oldCharLen += 2; } } newCharBuffer[newCharLen] = ''\0''; CString string1 ; string1.Format("%s",newCharBuffer); finalCharBuffer = new char[newCharLen+1]; memcpy(finalCharBuffer,newCharBuffer,newCharLen+1); return finalCharBuffer;} 程序都非常的簡(jiǎn)單,由于實(shí)在太窮。已經(jīng)吃了兩天的方便面。所以現在頭昏,程序的詳細說(shuō)明就不寫(xiě)了。程序員到了像我這樣的地步也真是少見(jiàn)。工資低沒(méi)有辦法。哎?。。?! |
![]() | ![]() ![]() ![]() |
![]() 字符串編碼轉換 GBK to UTF8 (ansi版) xmwen@126.com */ char *gbk2utf8(const char *strGBK){ int len; wchar_t *strUnicode; char *strUTF8; if (!strGBK){return NULL;} len = MultiByteToWideChar(CP_GBK, 0,strGBK, -1, NULL,0); if (len <1){return NULL;} strUnicode = (wchar_t *) malloc(sizeof(wchar_t) * len); if (!strUnicode){return NULL;} len = MultiByteToWideChar(CP_GBK, 0, strGBK, -1, strUnicode, len); if (len<1){free(strUnicode);return NULL;} len = WideCharToMultiByte(CP_UTF8, 0, strUnicode, -1, NULL, 0, NULL, NULL); if (len<1){free(strUnicode);return NULL;} strUTF8 = (char *) malloc(sizeof(char) * len); if (!strUTF8){free(strUnicode);return NULL;} len = WideCharToMultiByte (CP_UTF8, 0, strUnicode, -1, strUTF8, len, NULL,NULL); free(strUnicode); if (len<1){free(strUTF8);return NULL;} return strUTF8; } ( xmwen 發(fā)表于 2009-11-3 19:38:00) ![]() 搞笑,這種害人害己的文章還有這么多人訪(fǎng)問(wèn)。 作者光知道 WideCharToMultiByte 可以把 Unicode 轉成 GB2312 就不知道也可以把 Unicode 轉換為 UTF-8 嗎? 其實(shí)這是一個(gè)很簡(jiǎn)單的程序,都被作者搞復雜了。 要實(shí)現 GB2312 (其實(shí)是GBK)轉換為 UTF-8 其實(shí)很簡(jiǎn)單,先用 MultiByteToWideChar 把 GB2312 轉換為 Unicode,再用 WideCharToMultiByte 把 Unicode 轉換為 UTF-8 就可以了。 UTF-8 轉換為 GB2312 是個(gè)相反的過(guò)程,先用 MultiByteToWideChar 把 UTF-8 轉換為 Unicode,再用 WideCharToMultiByte 把 Unicode 轉換為 GB2312 就可以了。 ( 雁過(guò)留聲 發(fā)表于 2007-1-11 9:11:00) ![]() 請作者檢查一下, 如: "你是我的好朋友" 轉換成了;"浣犳槸鎴戠殑濂芥i脲弸鍚?" 正確的應是: "浣犳槸鎴戠殑濂芥湅鍙嬪悧" 對于有的編碼還能對... 交流一下:kudoo.aos@gmail.com ( kudoo 發(fā)表于 2006-8-20 19:46:00) ![]() buffersize = WideCharToMultiByte(CP_UTF8, MB_PRECOMPOSED, unicode, wide_size, NULL, 0, NULL, 0); buffer = new char[buffersize+1]; 但是,我在調試的時(shí)候發(fā)現:buffersize似乎已經(jīng)預先留了‘\0’的位置,或者是不是我出錯了 比如:“i love you,愛(ài)”GB2312是需要14個(gè)字節 UTF8是需要15個(gè)字節,返回時(shí)候就是這些了啊, 我的地址是:robin-fox@sohu.com, 誰(shuí)能回答以下,感謝??! ( robin_fox_nan 發(fā)表于 2006-3-19 20:20:00) ![]() 原文請看 http://www.kbadboy.com/viewfull.asp?id=33 ( 鬼龍之舞 發(fā)表于 2005-8-25 16:13:00) ![]() 感謝樓主!! 
```asm
; Converts a NUL-terminated UTF-8 string at lpszUTF8_IN to 16-bit code units
; written at lpszBuf_OUT, ending with a 16-bit zero.
; Every byte with the high bit set is treated as the start of a THREE-byte
; sequence; two- and four-byte sequences are not distinguished.
; No output size is passed in, so the caller must supply a large enough buffer.
UTF8toUnicode proc uses esi edi lpszBuf_OUT,lpszUTF8_IN
    mov esi,lpszUTF8_IN             ; esi = read pointer
    mov edi,lpszBuf_OUT             ; edi = write pointer
    .while TRUE
        mov al,[esi]
        .if sbyte ptr al <0         ; high bit set: 1110xxxx 10yyyyyy 10zzzzzz
            mov al,[esi]
            and al,00001111b        ; xxxx
            shl al,4
            mov [edi+1],al          ; high output byte = xxxx0000
            mov al,[esi+1]
            and al,00111100b        ; upper four of the six y bits
            shr al,2
            or [edi+1],al           ; high output byte = xxxxyyyy
            mov al,[esi+1]
            and al,11b              ; lower two y bits
            shl al,6
            mov [edi+0],al          ; low output byte = yy000000
            mov al,[esi+2]
            and al,00111111b        ; zzzzzz
            or [edi+0],al           ; low output byte = yyzzzzzz
            add edi,2
            add esi,3
        .elseif al                  ; non-zero byte with the high bit clear
            xor ah,ah               ; zero-extend al into ax
            stosw                   ; store ax at [edi]; assumes DF=0 so edi advances by 2
            inc esi
        .else                       ; zero byte: end of input
            mov WORD ptr [edi],0    ; write the 16-bit terminator
            .break
        .endif
    .endw
    ret
UTF8toUnicode endp
```

( 鬼龍之舞 發表于 2005-8-25 16:11:00)

![]()

```asm
; NOTE(review): the "UnicodetoUTF8 proc ..." header line of this listing is
; missing from the text; the body below is reproduced unchanged.
; It reads 16-bit units from esi and writes UTF-8 bytes at edi.
    mov esi,lpszUTF8_IN             ; NOTE(review): despite the name, read as 16-bit units below
    mov edi,lpBuf_OUT
    .while TRUE
        mov ax,[esi]
        .if ax==0                   ; 16-bit terminator
            stosw                   ; writes two zero bytes
            .break
        .elseif ah==0               ; unit is 0x0001..0x00FF
            add esi,2
            stosw                   ; NOTE(review): stores al AND a zero byte, which puts a NUL
                                    ; inside the UTF-8 output -- stosb looks intended; confirm
        .else                       ; emitted as a three-byte sequence
            mov al,[esi+1]          ; high byte
            shr al,4
            or al,11100000b
            mov [edi+0],al          ; 1110xxxx
            mov al,[esi+1]
            and al,00001111b
            shl al,2
            or al,10000000b
            mov ah,[esi+0]          ; low byte
            shr ah,6
            or al,ah
            mov [edi+1],al          ; 10yyyyyy
            mov al,[esi+0]
            and al,00111111b
            or al,10000000b
            mov [edi+2],al          ; 10zzzzzz
            add edi,3
            add esi,2
        .endif
    .endw
    ret
UnicodetoUTF8 endp
```

( 鬼龍之舞 發表于 2005-8-25 16:11:00)

![]() ![]()

```cpp
    UINT CodePage,            // code page
    DWORD dwFlags,            // performance and mapping flags
    LPCWSTR lpWideCharStr,    // wide-character string
    int cchWideChar,          // number of chars in string
    LPSTR lpMultiByteStr,     // buffer for new string
    int cbMultiByte,          // size of buffer
    LPCSTR lpDefaultChar,     // default for unmappable chars
    LPBOOL lpUsedDefaultChar  // set when default char used
);  // converts a wide-character string to a multibyte (narrow) string
```

這些只是函數原型,并沒有具體實現 ( zztop5384 發表于 2005-4-18 10:27:00)

![]()

```cpp
// Decodes the three-byte UTF-8 sequence at pText into one UTF-16 code unit
// using bit masks instead of the article's string-based bit assembly.
//
// pText   : pointer to three readable bytes (1110xxxx 10yyyyyy 10zzzzzz).
// returns : a heap-allocated array of one WCHAR; the caller must delete[] it.
//           U+0000 is stored for a NULL input.
//
// Fixes over the snippet as posted: it did not compile (stray ')' and three
// undeclared variables char_one/char_two/char_three that were never used), and
// it returned the address of a local array, which dangles after return.
WCHAR* CXmlProcess::UTF_8ToUnicode(char *pText)
{
    WCHAR *unicode = new WCHAR[1];
    unicode[0] = 0;
    if (pText == NULL)
        return unicode;

    const unsigned int b0 = static_cast<unsigned char>(pText[0]);
    const unsigned int b1 = static_cast<unsigned char>(pText[1]);
    const unsigned int b2 = static_cast<unsigned char>(pText[2]);

    // High byte = xxxxyyyy, low byte = yyzzzzzz.
    unicode[0] = static_cast<WCHAR>(((b0 & 0x0F) << 12) |
                                    ((b1 & 0x3F) << 6) |
                                    (b2 & 0x3F));
    return unicode;
}
```

 |
聯系客服