SumProject/my_error/unicode.cpp

57 lines
1.3 KiB
C++
Raw Normal View History

2025-03-22 22:38:52 +08:00
#include <stdio.h>
/*
 * Decode the first UTF-8 encoded character of a string.
 *
 * Parameters:
 *   utf8    - pointer to a NUL-terminated byte string; only the first
 *             encoded character is examined.
 *   unicode - receives the decoded code point on success; left untouched
 *             on failure.
 *
 * Returns the number of bytes consumed (1..4) on success, or -1 when either
 * pointer is NULL or the bytes do not form a well-formed UTF-8 sequence:
 * invalid lead byte, missing/invalid continuation byte, overlong encoding,
 * UTF-16 surrogate (U+D800..U+DFFF) or a value above U+10FFFF.
 *
 * An empty string decodes as U+0000 with a length of 1, because the NUL
 * terminator is itself a valid one-byte sequence.
 */
int utf8ToUnicode(const char* utf8, int* unicode) {
    /* Smallest code point that genuinely needs N bytes (indexed by N);
     * anything smaller encoded in N bytes is an overlong form. */
    static const int minCodePoint[5] = { 0, 0x00, 0x80, 0x800, 0x10000 };

    if (utf8 == NULL || unicode == NULL) {
        return -1;
    }

    unsigned char byte = (unsigned char)utf8[0];
    int numBytes;
    int codePoint;

    if (byte < 0x80) {
        /* 1-byte sequence (ASCII): the byte is the code point. */
        numBytes = 1;
        codePoint = byte;
    } else if ((byte & 0xE0) == 0xC0) {
        /* 2-byte sequence: 110xxxxx */
        numBytes = 2;
        codePoint = byte & 0x1F;
    } else if ((byte & 0xF0) == 0xE0) {
        /* 3-byte sequence: 1110xxxx */
        numBytes = 3;
        codePoint = byte & 0x0F;
    } else if ((byte & 0xF8) == 0xF0) {
        /* 4-byte sequence: 11110xxx */
        numBytes = 4;
        codePoint = byte & 0x07;
    } else {
        /* Stray continuation byte or a lead byte outside UTF-8. */
        return -1;
    }

    /* Fold in the continuation bytes (10xxxxxx), six payload bits each.
     * A NUL terminator fails the mask test, so a truncated sequence is
     * rejected before any byte past the end of the string is read. */
    for (int i = 1; i < numBytes; ++i) {
        byte = (unsigned char)utf8[i];
        if ((byte & 0xC0) != 0x80) {
            return -1;
        }
        codePoint = (codePoint << 6) | (byte & 0x3F);
    }

    /* Reject overlong encodings: each code point has exactly one valid,
     * shortest representation. */
    if (codePoint < minCodePoint[numBytes]) {
        return -1;
    }
    /* Surrogate halves are reserved for UTF-16 and are not valid here. */
    if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
        return -1;
    }
    /* Unicode ends at U+10FFFF even though 4 bytes could encode more. */
    if (codePoint > 0x10FFFF) {
        return -1;
    }

    *unicode = codePoint;
    return numBytes;
}
/*
 * Demo driver: decodes the first character of a sample UTF-8 string and
 * prints the whole string followed by that character's code point, or an
 * error message when the string does not start with valid UTF-8.
 *
 * NOTE(review): the literals in this function were mis-encoded in the
 * source (they look like GBK bytes read as UTF-8). The Chinese text below
 * is a reconstruction -- confirm the wording against the original file.
 */
int main() {
    /* "你好，世界" spelled as explicit UTF-8 bytes so the input stays valid
     * UTF-8 regardless of the encoding the source file is saved in. */
    char utf8[] = "\xE4\xBD\xA0\xE5\xA5\xBD\xEF\xBC\x8C\xE4\xB8\x96\xE7\x95\x8C";
    int unicode = 0;
    int numBytes = utf8ToUnicode(utf8, &unicode);

    if (numBytes == -1) {
        printf("非法UTF-8编码\n");
    } else {
        printf("UTF-8编码: %s\n", utf8);
        /* %X requires an unsigned int argument; a decoded code point is
         * never negative, so the cast is value-preserving. */
        printf("对应的Unicode码点: U+%04X\n", (unsigned int)unicode);
    }
    return 0;
}