SumProject/my_error/uni3.cpp
2025-03-22 22:38:52 +08:00

100 lines
2.6 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <stdio.h>
#include <iostream>
using namespace std;
//void print_utf8_to_unicode(const char *str) {
// while (*str) {
// unsigned char c = *str;
// unsigned int code = 0;
//
// // Decode the bytes according to the UTF-8 encoding rules
// if (c < 0x80) { // 1 byte
// code = c;
// str += 1;
// } else if ((c & 0xE0) == 0xC0) { // 2 bytes
// code = ((c & 0x1F) << 6) | (str[1] & 0x3F);
// str += 2;
// } else if ((c & 0xF0) == 0xE0) { // 3 bytes (the common case for Chinese)
// code = ((c & 0x0F) << 12) | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F);
// str += 3;
// } else if ((c & 0xF8) == 0xF0) { // 4 bytes
// code = ((c & 0x07) << 18) | ((str[1] & 0x3F) << 12) | ((str[2] & 0x3F) << 6) | (str[3] & 0x3F);
// str += 4;
// } else {
// // Handle an invalid byte
// code = c; // passes the raw byte through (note: this is not U+FFFD, the Unicode replacement character)
// str++;
// }
//
// printf("U+%04X ", code);
// }
//}
// Converts a NUL-terminated UTF-8 string into UTF-16 little-endian bytes.
//
// Only one-, two- and three-byte UTF-8 sequences are handled, so every
// converted character occupies exactly one 16-bit code unit (two bytes,
// low byte first).
//
// pInput   NUL-terminated UTF-8 text to convert.
// pOutput  Destination buffer. It must have room for two bytes per converted
//          character plus a two-byte 0x0000 terminator; the caller is
//          responsible for sizing it (2 * (strlen(pInput) + 1) always fits).
//
// Returns the number of bytes written, not counting the terminator, or -1 if
// either pointer is null, a continuation byte is missing or malformed, or the
// lead byte starts a sequence this function does not support (a stray
// continuation byte or a four-byte sequence). On -1 the output buffer holds
// the characters converted so far and is not terminated.
int Utf82Unicode(char* pInput, char* pOutput)
{
    if (pInput == nullptr || pOutput == nullptr)
    {
        return -1;
    }

    int outputSize = 0; // bytes written so far, terminator excluded
    *pOutput = 0;

    while (*pInput)
    {
        // Work on unsigned values: shifting a negative (signed) char left is
        // undefined behaviour before C++20.
        const unsigned char lead = static_cast<unsigned char>(*pInput);
        unsigned int codeUnit = 0;

        if (lead <= 0x7F) // one byte: ASCII
        {
            codeUnit = lead;
            pInput += 1;
        }
        else if ((lead & 0xE0) == 0xC0) // two bytes: 110xxxxx 10yyyyyy
        {
            const unsigned char low = static_cast<unsigned char>(pInput[1]);
            if ((low & 0xC0) != 0x80)
            {
                return -1;
            }
            codeUnit = ((lead & 0x1Fu) << 6) | (low & 0x3Fu);
            pInput += 2;
        }
        else if ((lead & 0xF0) == 0xE0) // three bytes: 1110xxxx 10yyyyyy 10zzzzzz
        {
            // Validate the middle byte before touching the next one. If the
            // string ends right after the lead byte, the middle byte is the
            // NUL terminator and pInput[2] would lie outside the string.
            const unsigned char middle = static_cast<unsigned char>(pInput[1]);
            if ((middle & 0xC0) != 0x80)
            {
                return -1;
            }
            const unsigned char low = static_cast<unsigned char>(pInput[2]);
            if ((low & 0xC0) != 0x80)
            {
                return -1;
            }
            codeUnit = ((lead & 0x0Fu) << 12) | ((middle & 0x3Fu) << 6) | (low & 0x3Fu);
            pInput += 3;
        }
        else // stray continuation byte or a sequence longer than three bytes
        {
            return -1;
        }

        // Little-endian: low byte at the lower address.
        pOutput[0] = static_cast<char>(codeUnit & 0xFFu);
        pOutput[1] = static_cast<char>((codeUnit >> 8) & 0xFFu);
        pOutput += 2;
        outputSize += 2;
    }

    // Terminate the UTF-16 string with a 16-bit zero.
    pOutput[0] = 0;
    pOutput[1] = 0;
    return outputSize;
}
int main() {
// 确保源代码文件以UTF-8编码保存
char text[] =u8"你好";
//// print_utf8_to_unicode(text);
// char outs[1005];
// Utf82Unicode(text,outs);
printf("str:%04X",text);
return 0;
}