/* UTF-8 to Unicode code point decoding demo. */
#include <stdio.h>
|
|||
|
|
|||
|
/**
 * Decode the first UTF-8 encoded character of a byte string.
 *
 * @param utf8    Pointer to the first byte of a UTF-8 sequence. The bytes
 *                are read one at a time and decoding stops at the first
 *                byte that is not a valid continuation byte, so a
 *                NUL-terminated string is never read past its terminator.
 * @param unicode Receives the decoded code point on success. It is left
 *                untouched on failure.
 * @return The number of bytes consumed (1-4) on success, or -1 if either
 *         pointer is NULL or the sequence is not well-formed UTF-8:
 *         invalid lead byte, missing/invalid continuation byte, overlong
 *         encoding, UTF-16 surrogate (U+D800..U+DFFF), or a value above
 *         U+10FFFF.
 *
 * Note: an empty string decodes its terminating NUL as U+0000 and
 * returns 1.
 */
int utf8ToUnicode(const char* utf8, int* unicode) {
    if (utf8 == NULL || unicode == NULL) {
        return -1;
    }

    unsigned char lead = (unsigned char)utf8[0];
    int numBytes;
    int codePoint;
    int minCodePoint;  // smallest value that legitimately needs numBytes bytes

    if (lead < 0x80) {
        // 1-byte sequence (ASCII): the byte is the code point.
        numBytes = 1;
        codePoint = lead;
        minCodePoint = 0x0;
    } else if ((lead & 0xE0) == 0xC0) {
        // 2-byte sequence: 110xxxxx 10xxxxxx
        numBytes = 2;
        codePoint = lead & 0x1F;
        minCodePoint = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {
        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
        numBytes = 3;
        codePoint = lead & 0x0F;
        minCodePoint = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {
        // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        numBytes = 4;
        codePoint = lead & 0x07;
        minCodePoint = 0x10000;
    } else {
        // Stray continuation byte (10xxxxxx) or an invalid lead (0xF8..0xFF).
        return -1;
    }

    // Fold in the continuation bytes, six payload bits each.
    for (int i = 1; i < numBytes; ++i) {
        unsigned char next = (unsigned char)utf8[i];
        if ((next & 0xC0) != 0x80) {
            // Not a continuation byte; this also catches a premature NUL.
            return -1;
        }
        codePoint = (codePoint << 6) | (next & 0x3F);
    }

    // Reject overlong forms: the value would have fit in a shorter sequence.
    if (codePoint < minCodePoint) {
        return -1;
    }
    // Surrogate halves are reserved for UTF-16 and are not valid scalars.
    if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
        return -1;
    }
    // Unicode ends at U+10FFFF even though 4 bytes can encode up to 0x1FFFFF.
    if (codePoint > 0x10FFFF) {
        return -1;
    }

    *unicode = codePoint;
    return numBytes;
}
|
|||
|
|
|||
|
/*
 * Demo driver: prints a sample UTF-8 string, then decodes it one character
 * at a time with utf8ToUnicode() and prints each code point.
 *
 * Returns 0 when the whole string decodes, 1 on the first invalid sequence.
 */
int main(void) {
    // Chinese "Hello, world!" (U+4F60 U+597D U+FF0C U+4E16 U+754C U+FF01),
    // written as explicit UTF-8 bytes so the literal does not depend on the
    // encoding this source file happens to be saved in.
    char utf8[] = "\xE4\xBD\xA0\xE5\xA5\xBD\xEF\xBC\x8C"
                  "\xE4\xB8\x96\xE7\x95\x8C\xEF\xBC\x81";

    printf("UTF-8 string: %s\n", utf8);

    // Walk the string, advancing by the byte length of each decoded character.
    const char *cursor = utf8;
    while (*cursor != '\0') {
        int unicode;
        int numBytes = utf8ToUnicode(cursor, &unicode);
        if (numBytes == -1) {
            printf("Invalid UTF-8 encoding!\n");
            return 1;
        }
        // %X expects an unsigned argument; the code point is never negative.
        printf("Unicode code point: U+%04X\n", (unsigned int)unicode);
        cursor += numBytes;
    }

    return 0;
}
|
|||
|
|