// SumProject/my_error/unicode.cpp

#include <stdio.h>

// Decodes the first UTF-8 encoded character of `utf8` into a Unicode code point.
//
// Parameters:
//   utf8    - NUL-terminated byte string; only its leading character is read.
//   unicode - receives the decoded code point; left untouched on failure.
//
// Returns the number of bytes consumed (1-4), or -1 when either pointer is
// NULL or the leading bytes are not well-formed UTF-8 (invalid lead byte,
// missing continuation byte, overlong form, UTF-16 surrogate, or a value
// above U+10FFFF). An empty string decodes as U+0000 with a length of 1.
int utf8ToUnicode(const char* utf8, int* unicode) {
    if (utf8 == NULL || unicode == NULL) {
        return -1;
    }

    unsigned char byte = (unsigned char)utf8[0];
    int numBytes, codePoint, minCodePoint;

    // The lead byte determines the sequence length, contributes the high
    // payload bits, and fixes the smallest value that length may encode.
    if (byte < 0x80) {
        // 1-byte sequence (ASCII): the byte is the code point.
        numBytes = 1;
        codePoint = byte;
        minCodePoint = 0x0;
    } else if ((byte & 0xE0) == 0xC0) {
        // 2-byte sequence: 110xxxxx
        numBytes = 2;
        codePoint = byte & 0x1F;
        minCodePoint = 0x80;
    } else if ((byte & 0xF0) == 0xE0) {
        // 3-byte sequence: 1110xxxx
        numBytes = 3;
        codePoint = byte & 0x0F;
        minCodePoint = 0x800;
    } else if ((byte & 0xF8) == 0xF0) {
        // 4-byte sequence: 11110xxx
        numBytes = 4;
        codePoint = byte & 0x07;
        minCodePoint = 0x10000;
    } else {
        // Stray continuation byte or a lead byte that UTF-8 does not define.
        return -1;
    }

    // Fold in the continuation bytes (10xxxxxx), six payload bits each.
    // A NUL terminator is not a continuation byte, so a truncated sequence
    // is rejected here before any byte past the terminator is read.
    for (int i = 1; i < numBytes; ++i) {
        byte = (unsigned char)utf8[i];
        if ((byte & 0xC0) != 0x80) {
            return -1;
        }
        codePoint = (codePoint << 6) | (byte & 0x3F);
    }

    // Overlong form: the value would have fit in a shorter sequence.
    if (codePoint < minCodePoint) {
        return -1;
    }
    // UTF-16 surrogate halves are not valid scalar values in UTF-8.
    if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
        return -1;
    }
    // Unicode ends at U+10FFFF.
    if (codePoint > 0x10FFFF) {
        return -1;
    }

    *unicode = codePoint;
    return numBytes;
}

// Demo driver: prints a sample UTF-8 string, then the code point of every
// character in it. Returns 0 on success, 1 if the string holds a byte
// sequence that utf8ToUnicode rejects.
int main(void) {
    char utf8[] = "你好，世界！";
    const char* cursor = utf8;

    printf("UTF-8编码: %s\n", utf8);

    // Walk the string one encoded character at a time; utf8ToUnicode reports
    // how many bytes each character occupied so the cursor can skip past it.
    while (*cursor != '\0') {
        int unicode = 0;
        int numBytes = utf8ToUnicode(cursor, &unicode);
        if (numBytes == -1) {
            printf("非法UTF-8编码！\n");
            return 1;
        }
        // %X expects an unsigned int; a decoded code point is never negative.
        printf("对应的Unicode码点: U+%04X\n", (unsigned int)unicode);
        cursor += numBytes;
    }

    return 0;
}