#include <stdio.h>

/**
 * Decode the first UTF-8 encoded character of a NUL-terminated string.
 *
 * @param utf8     NUL-terminated byte string; decoding starts at utf8[0].
 * @param unicode  Receives the decoded code point on success; left
 *                 untouched on failure.
 * @return Number of bytes consumed (1..4) on success, or -1 when either
 *         pointer is NULL or the leading sequence is not well-formed UTF-8
 *         (bad lead byte, missing/invalid continuation byte, overlong
 *         encoding, UTF-16 surrogate, or value above U+10FFFF).
 *
 * An empty string decodes its terminating NUL: the result is 1 with
 * *unicode set to 0.
 */
int utf8ToUnicode(const char *utf8, int *unicode)
{
    /* Smallest code point that legitimately needs N bytes, indexed by N. */
    static const int minCodePoint[] = { 0, 0x0000, 0x0080, 0x0800, 0x10000 };

    if (utf8 == NULL || unicode == NULL) {
        return -1;
    }

    unsigned char byte = (unsigned char)utf8[0];
    int numBytes;
    int codePoint;

    if (byte < 0x80) {
        /* 1-byte sequence: the byte is the code point. */
        numBytes = 1;
        codePoint = byte;
    } else if ((byte & 0xE0) == 0xC0) {
        /* 2-byte sequence: 110xxxxx */
        numBytes = 2;
        codePoint = byte & 0x1F;
    } else if ((byte & 0xF0) == 0xE0) {
        /* 3-byte sequence: 1110xxxx */
        numBytes = 3;
        codePoint = byte & 0x0F;
    } else if ((byte & 0xF8) == 0xF0) {
        /* 4-byte sequence: 11110xxx */
        numBytes = 4;
        codePoint = byte & 0x07;
    } else {
        /* Stray continuation byte or a lead byte of 0xF8 and above. */
        return -1;
    }

    /*
     * Fold in the continuation bytes (10xxxxxx). A NUL terminator is not a
     * continuation byte, so a truncated sequence fails here before any byte
     * past the end of the string is read.
     */
    for (int i = 1; i < numBytes; ++i) {
        byte = (unsigned char)utf8[i];
        if ((byte & 0xC0) != 0x80) {
            return -1;
        }
        codePoint = (codePoint << 6) | (byte & 0x3F);
    }

    /* Overlong form: the value would have fit in a shorter sequence. */
    if (codePoint < minCodePoint[numBytes]) {
        return -1;
    }
    /* UTF-16 surrogates are not valid scalar values in UTF-8. */
    if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
        return -1;
    }
    /* Unicode ends at U+10FFFF. */
    if (codePoint > 0x10FFFF) {
        return -1;
    }

    *unicode = codePoint;
    return numBytes;
}

/*
 * Demo entry point: decodes only the FIRST character of the sample string
 * and prints its code point. Define UTF8_TO_UNICODE_NO_MAIN to compile the
 * decoder without this main (e.g. when embedding it in another program or
 * a test harness).
 */
#ifndef UTF8_TO_UNICODE_NO_MAIN
int main(void)
{
    char utf8[] = "你好,世界!";
    int unicode;
    int numBytes = utf8ToUnicode(utf8, &unicode);

    if (numBytes == -1) {
        printf("非法UTF-8编码!\n");
    } else {
        printf("UTF-8编码: %s\n", utf8);
        /* %X expects an unsigned int argument. */
        printf("对应的Unicode码点: U+%04X\n", (unsigned int)unicode);
    }
    return 0;
}
#endif /* UTF8_TO_UNICODE_NO_MAIN */