Cygwin: mbrtowi: define replacement for mbrtowc, returning UTF-32 value

Since UTF-16 cannot represent every Unicode character in a single
wchar_t, we need a function that yields a wint_t holding the full
UTF-32 value for use by comparison functions.  Fortunately the important
wide character functions like towupper/towlower, isw<class>, iswctype,
etc., already take wint_t values, and newlib handles them as UTF-32.

If only we had switched wchar_t to 32 bit way back when... sigh.

Signed-off-by: Corinna Vinschen <corinna@vinschen.de>
This commit is contained in:
Corinna Vinschen 2023-02-14 12:20:20 +01:00
parent 210eca1b31
commit 60c25da90d
2 changed files with 36 additions and 0 deletions

View File

@ -39,6 +39,10 @@ extern wctomb_f __utf8_wctomb;
#define __WCTOMB (__get_current_locale ()->wctomb) #define __WCTOMB (__get_current_locale ()->wctomb)
/* Replacement function for mbrtowc which stores a wint_t representing
   a UTF-32 value through its first argument, combining a UTF-16
   surrogate pair into one value where necessary.  The remaining
   arguments (input string, maximum byte count, conversion state) are
   handed to mbrtowc, and the return value is the mbrtowc-style length
   or error code.  Defined in strfuncs.cc */
extern wint_t mbrtowi (wint_t *, const char *, size_t, mbstate_t *);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -112,6 +112,38 @@ transform_chars_af_unix (PWCHAR out, const char *path, __socklen_t len)
return out; return out;
} }
/* Replacement function for mbrtowc which stores a full UTF-32 value.

   The next multibyte character in S (at most N bytes) is converted with
   mbrtowc.  If the result is a UTF-16 high surrogate (0xd800-0xdbff),
   mbrtowc is called a second time to fetch the low surrogate
   (0xdc00-0xdfff) and both halves are combined into one UTF-32 value.

   PWI  receives the converted value.  May be NULL, in which case only
	the length is computed.
   S    multibyte input.
   N    maximum number of bytes of S to examine.
   PS   conversion state, handed through to mbrtowc.

   Returns the result of the first mbrtowc call if no surrogate pair is
   involved (including its (size_t) -1 and (size_t) -2 error codes), the
   summed byte count of both calls for a surrogate pair, or (size_t) -1
   with errno set to EILSEQ if a high surrogate is not followed by a
   valid low surrogate.

   NOTE(review): the return type is wint_t although the value is a
   size_t length or error code.  Where wint_t is narrower than size_t
   the error codes get truncated, so callers cannot compare against
   (size_t) -1 directly.  Consider switching the return type to size_t
   here and in the header declaration together.  */
extern "C" wint_t
mbrtowi (wint_t *pwi, const char *s, size_t n, mbstate_t *ps)
{
  wchar_t w1;
  size_t len = mbrtowc (&w1, s, n, ps);

  if (len == (size_t) -1 || len == (size_t) -2)
    return len;
  if (pwi)
    *pwi = w1;
  /* Anything but a high surrogate is already a complete value. */
  if (len == 0 || w1 < 0xd800 || w1 > 0xdbff)
    return len;

  /* Fetch the second half of the surrogate pair.  w2 is preset and the
     error codes are checked explicitly, because (size_t) -1 and
     (size_t) -2 are "greater than 0" and leave w2 untouched. */
  wchar_t w2 = 0;
  size_t len2 = mbrtowc (&w2, s + len, n - len, ps);

  if (len2 == (size_t) -1 || len2 == (size_t) -2 || len2 == 0
      || w2 < 0xdc00 || w2 > 0xdfff)
    {
      errno = EILSEQ;
      return (size_t) -1;
    }
  /* Convert surrogate pair to wint_t value */
  if (pwi)
    *pwi = (((w1 & 0x3ff) << 10) | (w2 & 0x3ff)) + 0x10000;
  return len + len2;
}
/* The SJIS, JIS and eucJP conversion in newlib does not use UTF as /* The SJIS, JIS and eucJP conversion in newlib does not use UTF as
wchar_t character representation. That's unfortunate for us since wchar_t character representation. That's unfortunate for us since
we require UTF for the OS. What we do here is to have our own we require UTF for the OS. What we do here is to have our own