100 lines
2.6 KiB
C++
100 lines
2.6 KiB
C++
|
#include <stdio.h>
|
|||
|
#include <iostream>
|
|||
|
using namespace std;
|
|||
|
//void print_utf8_to_unicode(const char *str) {
|
|||
|
// while (*str) {
|
|||
|
// unsigned char c = *str;
|
|||
|
// unsigned int code = 0;
|
|||
|
//
|
|||
|
// // Determine the number of bytes in the UTF-8 sequence from its lead byte
|
|||
|
// if (c < 0x80) { // 1 byte
|
|||
|
// code = c;
|
|||
|
// str += 1;
|
|||
|
// } else if ((c & 0xE0) == 0xC0) { // 2 bytes
|
|||
|
// code = ((c & 0x1F) << 6) | (str[1] & 0x3F);
|
|||
|
// str += 2;
|
|||
|
// } else if ((c & 0xF0) == 0xE0) { // 3 bytes (the common case for Chinese text)
|
|||
|
// code = ((c & 0x0F) << 12) | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F);
|
|||
|
// str += 3;
|
|||
|
// } else if ((c & 0xF8) == 0xF0) { // 4 bytes
|
|||
|
// code = ((c & 0x07) << 18) | ((str[1] & 0x3F) << 12) | ((str[2] & 0x3F) << 6) | (str[3] & 0x3F);
|
|||
|
// str += 4;
|
|||
|
// } else {
|
|||
|
// // Handle an invalid byte
|
|||
|
// code = c; // Unicode replacement character
|
|||
|
// str++;
|
|||
|
// }
|
|||
|
//
|
|||
|
// printf("U+%04X ", code);
|
|||
|
// }
|
|||
|
//}
|
|||
|
|
|||
|
/**
 * Converts a NUL-terminated UTF-8 string into 16-bit code units stored
 * little-endian (low byte first), followed by a two-byte zero terminator.
 *
 * Only 1-, 2- and 3-byte UTF-8 sequences are handled; every decoded character
 * therefore produces exactly two output bytes.
 *
 * @param pInput  NUL-terminated UTF-8 input.
 * @param pOutput Destination buffer. The function writes two bytes per decoded
 *                character plus two terminator bytes, so the caller must
 *                provide at least 2 * strlen(pInput) + 2 bytes.
 * @return Number of bytes written, not counting the two terminator bytes, or
 *         -1 when either pointer is null, a continuation byte is malformed or
 *         missing, or the lead byte does not start a 1-3 byte sequence
 *         (4-byte sequences are rejected). On -1 the output is not terminated.
 */
int Utf82Unicode(char* pInput, char* pOutput)
{
    if (pInput == nullptr || pOutput == nullptr)
    {
        return -1;
    }

    // Work on unsigned bytes: shifting a negative (signed) char left is
    // undefined behaviour before C++20, and every lead byte >= 0x80 would be
    // negative wherever plain char is signed.
    const unsigned char* in = reinterpret_cast<const unsigned char*>(pInput);
    unsigned char* out = reinterpret_cast<unsigned char*>(pOutput);

    int outputSize = 0; // bytes written so far, terminator excluded

    *out = 0;
    while (*in != 0)
    {
        const unsigned char lead = in[0];
        unsigned int codeUnit = 0;

        if (lead <= 0x7F) // single-byte sequence (ASCII letters, digits, ...)
        {
            codeUnit = lead;
            in += 1;
        }
        else if ((lead & 0xE0) == 0xC0) // two-byte sequence
        {
            const unsigned char low = in[1];
            if ((low & 0xC0) != 0x80) // must be a 10xxxxxx continuation byte
            {
                return -1;
            }
            codeUnit = ((lead & 0x1Fu) << 6) | (low & 0x3Fu);
            in += 2;
        }
        else if ((lead & 0xF0) == 0xE0) // three-byte sequence
        {
            // Validate the second byte BEFORE touching the third one: if the
            // string ends right after the lead byte, in[1] is the terminator
            // and in[2] would lie beyond the end of the input.
            const unsigned char middle = in[1];
            if ((middle & 0xC0) != 0x80)
            {
                return -1;
            }
            const unsigned char low = in[2];
            if ((low & 0xC0) != 0x80)
            {
                return -1;
            }
            // 4 bits from the lead byte, 6 from each continuation byte.
            codeUnit = ((lead & 0x0Fu) << 12) | ((middle & 0x3Fu) << 6) | (low & 0x3Fu);
            in += 3;
        }
        else // any other lead byte (4-byte sequences, stray continuation bytes)
        {
            return -1;
        }

        // Little-endian: low byte at the lower address, high byte after it.
        out[0] = static_cast<unsigned char>(codeUnit & 0xFFu);
        out[1] = static_cast<unsigned char>((codeUnit >> 8) & 0xFFu);
        out += 2;
        outputSize += 2;
    }

    // The converted string is terminated by two zero bytes.
    out[0] = 0;
    out[1] = 0;
    return outputSize;
}
|
|||
|
|
|||
|
/**
 * Demo driver: prints the bytes of a UTF-8 encoded literal in hexadecimal so
 * the encoding actually stored in the binary can be inspected.
 *
 * @return 0 always.
 */
int main() {
    // Make sure this source file is saved with UTF-8 encoding.
    // NOTE(review): the literal below appears to have been damaged by an
    // encoding round-trip (it spells out replacement-character markers rather
    // than real text) - restore the intended characters.
    char text[] =u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD>";
    //// print_utf8_to_unicode(text);
    // char outs[1005];
    // Utf82Unicode(text,outs);

    // The original passed the array itself to "%04X", which expects an
    // unsigned int; a char* argument there is undefined behaviour. Dump the
    // string byte by byte instead, going through unsigned char so bytes
    // >= 0x80 are not sign-extended.
    printf("str:");
    for (const char* p = text; *p != '\0'; ++p) {
        printf("%02X ", static_cast<unsigned int>(static_cast<unsigned char>(*p)));
    }
    printf("\n");
    return 0;
}
|