/* Demo: decode the first UTF-8 sequence of a string into its Unicode code point. */
#include <stdio.h>
|
||
|
||
/**
 * Decode the first UTF-8 encoded character of a NUL-terminated string.
 *
 * Only well-formed sequences are accepted. The following are rejected:
 *   - a stray continuation byte or an invalid lead byte (0x80..0xBF, 0xF8..0xFF),
 *   - a sequence that is cut short (a continuation byte is missing),
 *   - an overlong encoding (a code point stored in more bytes than needed),
 *   - a UTF-16 surrogate (U+D800..U+DFFF),
 *   - a value above U+10FFFF.
 *
 * An empty string decodes as the single byte 0x00 (code point 0, length 1).
 *
 * @param utf8    Input string; NULL is treated as invalid input.
 * @param unicode Receives the decoded code point on success. It is left
 *                untouched on failure. May be NULL when only the sequence
 *                length is wanted.
 * @return Number of bytes consumed (1..4) on success, -1 on invalid input.
 */
int utf8ToUnicode(const char *utf8, int *unicode) {
    if (utf8 == NULL) {
        return -1;
    }

    unsigned char byte = (unsigned char)utf8[0];
    int numBytes;
    int codePoint;
    int minCodePoint; /* smallest value that legitimately needs numBytes bytes */

    if (byte < 0x80) {
        /* 1-byte sequence: 0xxxxxxx (plain ASCII) */
        numBytes = 1;
        codePoint = byte;
        minCodePoint = 0x0;
    } else if ((byte & 0xE0) == 0xC0) {
        /* 2-byte sequence: 110xxxxx 10xxxxxx */
        numBytes = 2;
        codePoint = byte & 0x1F;
        minCodePoint = 0x80;
    } else if ((byte & 0xF0) == 0xE0) {
        /* 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
        numBytes = 3;
        codePoint = byte & 0x0F;
        minCodePoint = 0x800;
    } else if ((byte & 0xF8) == 0xF0) {
        /* 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        numBytes = 4;
        codePoint = byte & 0x07;
        minCodePoint = 0x10000;
    } else {
        /* Continuation byte in lead position, or an invalid lead byte. */
        return -1;
    }

    /*
     * Fold in the continuation bytes. A NUL terminator does not match the
     * 10xxxxxx pattern, so a truncated sequence fails here and the loop never
     * reads past the end of the string.
     */
    for (int i = 1; i < numBytes; ++i) {
        byte = (unsigned char)utf8[i];
        if ((byte & 0xC0) != 0x80) {
            return -1;
        }
        codePoint = (codePoint << 6) | (byte & 0x3F);
    }

    /* Overlong form: the value would have fit in a shorter sequence. */
    if (codePoint < minCodePoint) {
        return -1;
    }
    /* Surrogates are reserved for UTF-16 and are not valid scalar values. */
    if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
        return -1;
    }
    /* Beyond the last Unicode code point. */
    if (codePoint > 0x10FFFF) {
        return -1;
    }

    if (unicode != NULL) {
        *unicode = codePoint;
    }
    return numBytes;
}
|
||
|
||
/*
 * Demo entry point: decodes the first character of a sample UTF-8 string
 * with utf8ToUnicode() and prints the string together with that character's
 * code point, or an error message when the leading sequence is invalid.
 * Always returns 0.
 */
int main(void) {
    char utf8[] = "你好,世界!";
    int unicode = 0;

    int numBytes = utf8ToUnicode(utf8, &unicode);
    if (numBytes == -1) {
        printf("非法UTF-8编码!\n");
    } else {
        printf("UTF-8编码: %s\n", utf8);
        /* %X expects an unsigned int, so convert explicitly. */
        printf("对应的Unicode码点: U+%04X\n", (unsigned int)unicode);
    }

    return 0;
}
|
||
|