100 lines
2.6 KiB
C++
100 lines
2.6 KiB
C++
#include <stdio.h>
|
||
#include <iostream>
|
||
using namespace std;
|
||
//void print_utf8_to_unicode(const char *str) {
//    while (*str) {
//        unsigned char c = *str;
//        unsigned int code = 0;
//
//        // Decode the lead byte according to the UTF-8 encoding rules
//        if (c < 0x80) { // 1-byte sequence
//            code = c;
//            str += 1;
//        } else if ((c & 0xE0) == 0xC0) { // 2-byte sequence
//            code = ((c & 0x1F) << 6) | (str[1] & 0x3F);
//            str += 2;
//        } else if ((c & 0xF0) == 0xE0) { // 3-byte sequence (typical for Chinese text)
//            code = ((c & 0x0F) << 12) | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F);
//            str += 3;
//        } else if ((c & 0xF8) == 0xF0) { // 4-byte sequence
//            code = ((c & 0x07) << 18) | ((str[1] & 0x3F) << 12) | ((str[2] & 0x3F) << 6) | (str[3] & 0x3F);
//            str += 4;
//        } else {
//            // Invalid lead byte
//            code = c; // emits the raw byte value (not the Unicode replacement character U+FFFD)
//            str++;
//        }
//
//        printf("U+%04X ", code);
//    }
//}
|
||
|
||
/**
 * @brief Converts a NUL-terminated UTF-8 string into 16-bit code units stored
 *        low byte first (little-endian).
 *
 * Only 1-, 2- and 3-byte UTF-8 sequences are accepted; 4-byte sequences and
 * stray or invalid lead bytes are reported as errors. Overlong forms and
 * surrogate code points are not rejected.
 *
 * @param pInput  NUL-terminated UTF-8 input. Must not be null.
 * @param pOutput Destination buffer. Must not be null and must hold at least
 *                2 * strlen(pInput) + 2 bytes, since every input byte can
 *                produce at most one 2-byte unit and two zero bytes are
 *                appended as the terminator.
 * @return Number of bytes written, excluding the two terminating zero bytes,
 *         or -1 if an argument is null or the input contains a sequence that
 *         is malformed or unsupported. On a decoding failure the units
 *         converted so far remain in pOutput, followed by the terminator.
 */
int Utf82Unicode(char* pInput, char* pOutput)
{
    if (pInput == nullptr || pOutput == nullptr)
    {
        return -1;
    }

    // Work on unsigned bytes: plain char may be signed, and shifting or
    // comparing negative values makes the bit arithmetic fragile.
    const unsigned char* src = reinterpret_cast<const unsigned char*>(pInput);
    unsigned char* dst = reinterpret_cast<unsigned char*>(pOutput);

    int outputSize = 0;   // bytes written so far, terminator excluded
    bool malformed = false;

    while (*src != 0)
    {
        const unsigned char lead = *src;
        unsigned int codeUnit = 0;

        if (lead <= 0x7F)
        {
            // Single-byte sequence (ASCII): the code point is the byte itself.
            codeUnit = lead;
            src += 1;
        }
        else if ((lead & 0xE0) == 0xC0)
        {
            // Two-byte sequence: 110xxxxx 10xxxxxx.
            const unsigned char trail = src[1];
            if ((trail & 0xC0) != 0x80)
            {
                malformed = true;
                break;
            }
            codeUnit = ((lead & 0x1Fu) << 6) | (trail & 0x3Fu);
            src += 2;
        }
        else if ((lead & 0xF0) == 0xE0)
        {
            // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
            // Validate the second byte before touching the third: if the
            // string ends right after the lead byte, src[1] is the NUL
            // terminator and src[2] would lie outside the string.
            const unsigned char middle = src[1];
            if ((middle & 0xC0) != 0x80)
            {
                malformed = true;
                break;
            }
            const unsigned char trail = src[2];
            if ((trail & 0xC0) != 0x80)
            {
                malformed = true;
                break;
            }
            codeUnit = ((lead & 0x0Fu) << 12) | ((middle & 0x3Fu) << 6) | (trail & 0x3Fu);
            src += 3;
        }
        else
        {
            // Four-byte sequences and invalid lead bytes are not supported.
            malformed = true;
            break;
        }

        // Store the unit low byte first, high byte second.
        dst[0] = static_cast<unsigned char>(codeUnit & 0xFFu);
        dst[1] = static_cast<unsigned char>((codeUnit >> 8) & 0xFFu);
        dst += 2;
        outputSize += 2;
    }

    // The output always ends with two zero bytes, on failure as well, so the
    // caller never sees an unterminated buffer.
    dst[0] = 0;
    dst[1] = 0;

    return malformed ? -1 : outputSize;
}
|
||
|
||
/**
 * @brief Demo entry point: prints the bytes of a sample UTF-8 string in
 *        hexadecimal after the prefix "str:".
 * @return 0 on completion.
 */
int main() {
    // The source file must be saved as UTF-8 so the literal holds the expected bytes.
    char text[] = u8"你好";
    // Earlier experiments, kept for reference:
    //// print_utf8_to_unicode(text);
    // char outs[1005];
    // Utf82Unicode(text,outs);

    // "%X" expects an unsigned int; passing the array itself (a char*) is
    // undefined behaviour and would print part of an address, not the text.
    // Dump each byte instead, via unsigned char so bytes >= 0x80 are not
    // sign-extended.
    printf("str:");
    for (const char* p = text; *p != '\0'; ++p) {
        printf("%02X ", static_cast<unsigned int>(static_cast<unsigned char>(*p)));
    }
    printf("\n");
    return 0;
}
|