SumProject/my_error/unicode.cpp
2025-03-22 22:38:52 +08:00

57 lines
1.3 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <stdio.h>
/**
 * Decode the first UTF-8 encoded character of a string.
 *
 * @param utf8    NUL-terminated byte string; decoding starts at utf8[0].
 * @param unicode Receives the decoded code point on success. It is left
 *                untouched when the function fails.
 * @return Number of bytes consumed (1 to 4) on success, or -1 when either
 *         pointer is NULL or the bytes are not well-formed UTF-8.
 *
 * An empty string decodes as the single byte 0x00 (returns 1, code point 0).
 * In addition to the lead/continuation bit patterns, the decoded value is
 * validated: overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and
 * values above U+10FFFF are rejected.
 */
int utf8ToUnicode(const char* utf8, int* unicode) {
    /* Smallest code point that genuinely needs N bytes, indexed by N. */
    static const int minCodePoint[5] = {0, 0x0000, 0x0080, 0x0800, 0x10000};

    if (utf8 == NULL || unicode == NULL) {
        return -1;
    }

    unsigned char byte = (unsigned char)utf8[0];
    int numBytes;
    int codePoint;

    if (byte < 0x80) {
        /* 1-byte sequence (ASCII): the byte is the code point. */
        numBytes = 1;
        codePoint = byte;
    } else if ((byte & 0xE0) == 0xC0) {
        /* 2-byte sequence: 110xxxxx */
        numBytes = 2;
        codePoint = byte & 0x1F;
    } else if ((byte & 0xF0) == 0xE0) {
        /* 3-byte sequence: 1110xxxx */
        numBytes = 3;
        codePoint = byte & 0x0F;
    } else if ((byte & 0xF8) == 0xF0) {
        /* 4-byte sequence: 11110xxx */
        numBytes = 4;
        codePoint = byte & 0x07;
    } else {
        /* Stray continuation byte or invalid lead byte. */
        return -1;
    }

    /*
     * Fold in the continuation bytes (10xxxxxx). A NUL terminator fails the
     * pattern check, so a truncated sequence is rejected before any byte
     * past the end of the string is read.
     */
    for (int i = 1; i < numBytes; ++i) {
        byte = (unsigned char)utf8[i];
        if ((byte & 0xC0) != 0x80) {
            return -1;
        }
        codePoint = (codePoint << 6) | (byte & 0x3F);
    }

    /* Overlong form: the value would have fit in a shorter sequence. */
    if (codePoint < minCodePoint[numBytes]) {
        return -1;
    }
    /* Surrogates are reserved for UTF-16 and are not valid scalar values. */
    if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
        return -1;
    }
    /* Beyond the last Unicode code point. */
    if (codePoint > 0x10FFFF) {
        return -1;
    }

    *unicode = codePoint;
    return numBytes;
}
/*
 * Demo driver: decodes the first character of a sample UTF-8 string and
 * prints the whole string together with that first character's code point.
 * Always returns 0.
 */
int main(void) {
    char utf8[] = "你好,世界!";
    int unicode = 0;
    int numBytes = utf8ToUnicode(utf8, &unicode);

    if (numBytes == -1) {
        printf("非法UTF-8编码\n");
    } else {
        printf("UTF-8编码: %s\n", utf8);
        /* %X requires an unsigned int argument; cast to match the format. */
        printf("对应的Unicode码点: U+%04X\n", (unsigned int)unicode);
    }
    return 0;
}