* libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Allow CESU-8 surrogate
value encoding. * libc/stdlib/wctomb_r.c (__utf8_mbtowc): Allow CESU-8 surrogate value decoding.
This commit is contained in:
parent
9c47bbb6e9
commit
6ff28fc3b1
|
@ -1,3 +1,10 @@
|
|||
2009-10-03 Corinna Vinschen <corinna@vinschen.de>
|
||||
|
||||
* libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Allow CESU-8 surrogate
|
||||
value encoding.
|
||||
* libc/stdlib/wctomb_r.c (__utf8_mbtowc): Allow CESU-8 surrogate
|
||||
value decoding.
|
||||
|
||||
2009-09-29 Corinna Vinschen <corinna@vinschen.de>
|
||||
|
||||
* libc/locale/locale.c (loadlocale): Allow "C." same as "C-" as locale
|
||||
|
|
|
@ -295,12 +295,6 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state),
|
|||
tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12)
|
||||
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6)
|
||||
| (wchar_t)(ch & 0x3f);
|
||||
/* Check for invalid CESU-8 encoding of UTF-16 surrogate values. */
|
||||
if (tmp >= 0xd800 && tmp <= 0xdfff)
|
||||
{
|
||||
r->_errno = EILSEQ;
|
||||
return -1;
|
||||
}
|
||||
*pwc = tmp;
|
||||
return i;
|
||||
}
|
||||
|
|
|
@ -63,72 +63,75 @@ _DEFUN (__utf8_wctomb, (r, s, wchar, charset, state),
|
|||
mbstate_t *state)
|
||||
{
|
||||
wint_t wchar = _wchar;
|
||||
int ret = 0;
|
||||
|
||||
if (s == NULL)
|
||||
return 0; /* UTF-8 encoding is not state-dependent */
|
||||
|
||||
if (state->__count == -4 && (wchar < 0xdc00 || wchar >= 0xdfff))
|
||||
if (sizeof (wchar_t) == 2 && state->__count == -4
|
||||
&& (wchar < 0xdc00 || wchar >= 0xdfff))
|
||||
{
|
||||
/* At this point only the second half of a surrogate pair is valid. */
|
||||
r->_errno = EILSEQ;
|
||||
return -1;
|
||||
/* There's a leftover lone high surrogate. Write out the CESU-8 value
|
||||
of the surrogate and proceed to convert the given character. Note
|
||||
to return extra 3 bytes. */
|
||||
wchar_t tmp;
|
||||
tmp = (state->__value.__wchb[0] << 16 | state->__value.__wchb[1] << 8)
|
||||
- 0x10000 >> 10 | 0xd80d;
|
||||
*s++ = 0xe0 | ((tmp & 0xf000) >> 12);
|
||||
*s++ = 0x80 | ((tmp & 0xfc0) >> 6);
|
||||
*s++ = 0x80 | (tmp & 0x3f);
|
||||
state->__count = 0;
|
||||
ret = 3;
|
||||
}
|
||||
if (wchar <= 0x7f)
|
||||
{
|
||||
*s = wchar;
|
||||
return 1;
|
||||
return ret + 1;
|
||||
}
|
||||
if (wchar >= 0x80 && wchar <= 0x7ff)
|
||||
{
|
||||
*s++ = 0xc0 | ((wchar & 0x7c0) >> 6);
|
||||
*s = 0x80 | (wchar & 0x3f);
|
||||
return 2;
|
||||
return ret + 2;
|
||||
}
|
||||
if (wchar >= 0x800 && wchar <= 0xffff)
|
||||
{
|
||||
if (wchar >= 0xd800 && wchar <= 0xdfff)
|
||||
/* No UTF-16 surrogate handling in UCS-4 */
|
||||
if (sizeof (wchar_t) == 2 && wchar >= 0xd800 && wchar <= 0xdfff)
|
||||
{
|
||||
wint_t tmp;
|
||||
/* UTF-16 surrogates -- must not occur in normal UCS-4 data */
|
||||
if (sizeof (wchar_t) != 2)
|
||||
if (wchar <= 0xdbff)
|
||||
{
|
||||
r->_errno = EILSEQ;
|
||||
return -1;
|
||||
/* First half of a surrogate pair. Store the state and
|
||||
return ret + 0. */
|
||||
tmp = ((wchar & 0x3ff) << 10) + 0x10000;
|
||||
state->__value.__wchb[0] = (tmp >> 16) & 0xff;
|
||||
state->__value.__wchb[1] = (tmp >> 8) & 0xff;
|
||||
state->__count = -4;
|
||||
*s = (0xf0 | ((tmp & 0x1c0000) >> 18));
|
||||
return ret;
|
||||
}
|
||||
if (wchar >= 0xdc00)
|
||||
if (state->__count == -4)
|
||||
{
|
||||
/* Second half of a surrogate pair. It's not valid if
|
||||
we don't have already read a first half of a surrogate
|
||||
before. */
|
||||
if (state->__count != -4)
|
||||
{
|
||||
r->_errno = EILSEQ;
|
||||
return -1;
|
||||
}
|
||||
/* If it's valid, reconstruct the full Unicode value and
|
||||
return the trailing three bytes of the UTF-8 char. */
|
||||
/* Second half of a surrogate pair. Reconstruct the full
|
||||
Unicode value and return the trailing three bytes of the
|
||||
UTF-8 character. */
|
||||
tmp = (state->__value.__wchb[0] << 16)
|
||||
| (state->__value.__wchb[1] << 8)
|
||||
| (wchar & 0x3ff);
|
||||
state->__count = 0;
|
||||
*s++ = 0xf0 | ((tmp & 0x1c0000) >> 18);
|
||||
*s++ = 0x80 | ((tmp & 0x3f000) >> 12);
|
||||
*s++ = 0x80 | ((tmp & 0xfc0) >> 6);
|
||||
*s = 0x80 | (tmp & 0x3f);
|
||||
return 3;
|
||||
return 4;
|
||||
}
|
||||
/* First half of a surrogate pair. Store the state and return
|
||||
the first byte of the UTF-8 char. */
|
||||
tmp = ((wchar & 0x3ff) << 10) + 0x10000;
|
||||
state->__value.__wchb[0] = (tmp >> 16) & 0xff;
|
||||
state->__value.__wchb[1] = (tmp >> 8) & 0xff;
|
||||
state->__count = -4;
|
||||
*s = (0xf0 | ((tmp & 0x1c0000) >> 18));
|
||||
return 1;
|
||||
/* Otherwise translate into CESU-8 value. */
|
||||
}
|
||||
*s++ = 0xe0 | ((wchar & 0xf000) >> 12);
|
||||
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
|
||||
*s = 0x80 | (wchar & 0x3f);
|
||||
return 3;
|
||||
return ret + 3;
|
||||
}
|
||||
if (wchar >= 0x10000 && wchar <= 0x10ffff)
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue