mirror of
git://sourceware.org/git/newlib-cygwin.git
synced 2025-01-18 12:29:32 +08:00
9b42474f29
* libc/stdlib/wctomb_r.c (__utf8_wctomb): Fix check for handling a lone high surrogate. Signed-off-by: Corinna Vinschen <corinna@vinschen.de>
377 lines
8.5 KiB
C
377 lines
8.5 KiB
C
#include <errno.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <wchar.h>
|
|
#include <locale.h>
|
|
#include "mbctype.h"
|
|
#include "local.h"
|
|
|
|
int (*__wctomb) (struct _reent *, char *, wchar_t, const char *charset,
|
|
mbstate_t *)
|
|
#ifdef __CYGWIN__
|
|
/* Cygwin starts up in UTF-8 mode. */
|
|
= __utf8_wctomb;
|
|
#else
|
|
= __ascii_wctomb;
|
|
#endif
|
|
|
|
int
|
|
_DEFUN (_wctomb_r, (r, s, wchar, state),
|
|
struct _reent *r _AND
|
|
char *s _AND
|
|
wchar_t _wchar _AND
|
|
mbstate_t *state)
|
|
{
|
|
return __wctomb (r, s, _wchar, __locale_charset (), state);
|
|
}
|
|
|
|
int
|
|
_DEFUN (__ascii_wctomb, (r, s, wchar, charset, state),
|
|
struct _reent *r _AND
|
|
char *s _AND
|
|
wchar_t _wchar _AND
|
|
const char *charset _AND
|
|
mbstate_t *state)
|
|
{
|
|
/* Avoids compiler warnings about comparisons that are always false
|
|
due to limited range when sizeof(wchar_t) is 2 but sizeof(wint_t)
|
|
is 4, as is the case on cygwin. */
|
|
wint_t wchar = _wchar;
|
|
|
|
if (s == NULL)
|
|
return 0;
|
|
|
|
#ifdef __CYGWIN__
|
|
if ((size_t)wchar >= 0x80)
|
|
#else
|
|
if ((size_t)wchar >= 0x100)
|
|
#endif
|
|
{
|
|
r->_errno = EILSEQ;
|
|
return -1;
|
|
}
|
|
|
|
*s = (char) wchar;
|
|
return 1;
|
|
}
|
|
|
|
#ifdef _MB_CAPABLE
|
|
/* For some conversions, we use the __count field of mbstate_t as a place to store a state value. */
|
|
#define __state __count
|
|
|
|
int
|
|
_DEFUN (__utf8_wctomb, (r, s, wchar, charset, state),
|
|
struct _reent *r _AND
|
|
char *s _AND
|
|
wchar_t _wchar _AND
|
|
const char *charset _AND
|
|
mbstate_t *state)
|
|
{
|
|
wint_t wchar = _wchar;
|
|
int ret = 0;
|
|
|
|
if (s == NULL)
|
|
return 0; /* UTF-8 encoding is not state-dependent */
|
|
|
|
if (sizeof (wchar_t) == 2 && state->__count == -4
|
|
&& (wchar < 0xdc00 || wchar > 0xdfff))
|
|
{
|
|
/* There's a leftover lone high surrogate. Write out the CESU-8 value
|
|
of the surrogate and proceed to convert the given character. Note
|
|
to return extra 3 bytes. */
|
|
wchar_t tmp;
|
|
tmp = (state->__value.__wchb[0] << 16 | state->__value.__wchb[1] << 8)
|
|
- (0x10000 >> 10 | 0xd80d);
|
|
*s++ = 0xe0 | ((tmp & 0xf000) >> 12);
|
|
*s++ = 0x80 | ((tmp & 0xfc0) >> 6);
|
|
*s++ = 0x80 | (tmp & 0x3f);
|
|
state->__count = 0;
|
|
ret = 3;
|
|
}
|
|
if (wchar <= 0x7f)
|
|
{
|
|
*s = wchar;
|
|
return ret + 1;
|
|
}
|
|
if (wchar >= 0x80 && wchar <= 0x7ff)
|
|
{
|
|
*s++ = 0xc0 | ((wchar & 0x7c0) >> 6);
|
|
*s = 0x80 | (wchar & 0x3f);
|
|
return ret + 2;
|
|
}
|
|
if (wchar >= 0x800 && wchar <= 0xffff)
|
|
{
|
|
/* No UTF-16 surrogate handling in UCS-4 */
|
|
if (sizeof (wchar_t) == 2 && wchar >= 0xd800 && wchar <= 0xdfff)
|
|
{
|
|
wint_t tmp;
|
|
if (wchar <= 0xdbff)
|
|
{
|
|
/* First half of a surrogate pair. Store the state and
|
|
return ret + 0. */
|
|
tmp = ((wchar & 0x3ff) << 10) + 0x10000;
|
|
state->__value.__wchb[0] = (tmp >> 16) & 0xff;
|
|
state->__value.__wchb[1] = (tmp >> 8) & 0xff;
|
|
state->__count = -4;
|
|
*s = (0xf0 | ((tmp & 0x1c0000) >> 18));
|
|
return ret;
|
|
}
|
|
if (state->__count == -4)
|
|
{
|
|
/* Second half of a surrogate pair. Reconstruct the full
|
|
Unicode value and return the trailing three bytes of the
|
|
UTF-8 character. */
|
|
tmp = (state->__value.__wchb[0] << 16)
|
|
| (state->__value.__wchb[1] << 8)
|
|
| (wchar & 0x3ff);
|
|
state->__count = 0;
|
|
*s++ = 0xf0 | ((tmp & 0x1c0000) >> 18);
|
|
*s++ = 0x80 | ((tmp & 0x3f000) >> 12);
|
|
*s++ = 0x80 | ((tmp & 0xfc0) >> 6);
|
|
*s = 0x80 | (tmp & 0x3f);
|
|
return 4;
|
|
}
|
|
/* Otherwise translate into CESU-8 value. */
|
|
}
|
|
*s++ = 0xe0 | ((wchar & 0xf000) >> 12);
|
|
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
|
|
*s = 0x80 | (wchar & 0x3f);
|
|
return ret + 3;
|
|
}
|
|
if (wchar >= 0x10000 && wchar <= 0x10ffff)
|
|
{
|
|
*s++ = 0xf0 | ((wchar & 0x1c0000) >> 18);
|
|
*s++ = 0x80 | ((wchar & 0x3f000) >> 12);
|
|
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
|
|
*s = 0x80 | (wchar & 0x3f);
|
|
return 4;
|
|
}
|
|
|
|
r->_errno = EILSEQ;
|
|
return -1;
|
|
}
|
|
|
|
/* Cygwin defines its own doublebyte charset conversion functions
|
|
because the underlying OS requires wchar_t == UTF-16. */
|
|
#ifndef __CYGWIN__
|
|
int
|
|
_DEFUN (__sjis_wctomb, (r, s, wchar, charset, state),
|
|
struct _reent *r _AND
|
|
char *s _AND
|
|
wchar_t _wchar _AND
|
|
const char *charset _AND
|
|
mbstate_t *state)
|
|
{
|
|
wint_t wchar = _wchar;
|
|
|
|
unsigned char char2 = (unsigned char)wchar;
|
|
unsigned char char1 = (unsigned char)(wchar >> 8);
|
|
|
|
if (s == NULL)
|
|
return 0; /* not state-dependent */
|
|
|
|
if (char1 != 0x00)
|
|
{
|
|
/* first byte is non-zero..validate multi-byte char */
|
|
if (_issjis1(char1) && _issjis2(char2))
|
|
{
|
|
*s++ = (char)char1;
|
|
*s = (char)char2;
|
|
return 2;
|
|
}
|
|
else
|
|
{
|
|
r->_errno = EILSEQ;
|
|
return -1;
|
|
}
|
|
}
|
|
*s = (char) wchar;
|
|
return 1;
|
|
}
|
|
|
|
int
|
|
_DEFUN (__eucjp_wctomb, (r, s, wchar, charset, state),
|
|
struct _reent *r _AND
|
|
char *s _AND
|
|
wchar_t _wchar _AND
|
|
const char *charset _AND
|
|
mbstate_t *state)
|
|
{
|
|
wint_t wchar = _wchar;
|
|
unsigned char char2 = (unsigned char)wchar;
|
|
unsigned char char1 = (unsigned char)(wchar >> 8);
|
|
|
|
if (s == NULL)
|
|
return 0; /* not state-dependent */
|
|
|
|
if (char1 != 0x00)
|
|
{
|
|
/* first byte is non-zero..validate multi-byte char */
|
|
if (_iseucjp1 (char1) && _iseucjp2 (char2))
|
|
{
|
|
*s++ = (char)char1;
|
|
*s = (char)char2;
|
|
return 2;
|
|
}
|
|
else if (_iseucjp2 (char1) && _iseucjp2 (char2 | 0x80))
|
|
{
|
|
*s++ = (char)0x8f;
|
|
*s++ = (char)char1;
|
|
*s = (char)(char2 | 0x80);
|
|
return 3;
|
|
}
|
|
else
|
|
{
|
|
r->_errno = EILSEQ;
|
|
return -1;
|
|
}
|
|
}
|
|
*s = (char) wchar;
|
|
return 1;
|
|
}
|
|
|
|
int
|
|
_DEFUN (__jis_wctomb, (r, s, wchar, charset, state),
|
|
struct _reent *r _AND
|
|
char *s _AND
|
|
wchar_t _wchar _AND
|
|
const char *charset _AND
|
|
mbstate_t *state)
|
|
{
|
|
wint_t wchar = _wchar;
|
|
int cnt = 0;
|
|
unsigned char char2 = (unsigned char)wchar;
|
|
unsigned char char1 = (unsigned char)(wchar >> 8);
|
|
|
|
if (s == NULL)
|
|
return 1; /* state-dependent */
|
|
|
|
if (char1 != 0x00)
|
|
{
|
|
/* first byte is non-zero..validate multi-byte char */
|
|
if (_isjis (char1) && _isjis (char2))
|
|
{
|
|
if (state->__state == 0)
|
|
{
|
|
/* must switch from ASCII to JIS state */
|
|
state->__state = 1;
|
|
*s++ = ESC_CHAR;
|
|
*s++ = '$';
|
|
*s++ = 'B';
|
|
cnt = 3;
|
|
}
|
|
*s++ = (char)char1;
|
|
*s = (char)char2;
|
|
return cnt + 2;
|
|
}
|
|
r->_errno = EILSEQ;
|
|
return -1;
|
|
}
|
|
if (state->__state != 0)
|
|
{
|
|
/* must switch from JIS to ASCII state */
|
|
state->__state = 0;
|
|
*s++ = ESC_CHAR;
|
|
*s++ = '(';
|
|
*s++ = 'B';
|
|
cnt = 3;
|
|
}
|
|
*s = (char)char2;
|
|
return cnt + 1;
|
|
}
|
|
#endif /* !__CYGWIN__ */
|
|
|
|
#ifdef _MB_EXTENDED_CHARSETS_ISO
|
|
int
|
|
_DEFUN (__iso_wctomb, (r, s, wchar, charset, state),
|
|
struct _reent *r _AND
|
|
char *s _AND
|
|
wchar_t _wchar _AND
|
|
const char *charset _AND
|
|
mbstate_t *state)
|
|
{
|
|
wint_t wchar = _wchar;
|
|
|
|
if (s == NULL)
|
|
return 0;
|
|
|
|
/* wchars <= 0x9f translate to all ISO charsets directly. */
|
|
if (wchar >= 0xa0)
|
|
{
|
|
int iso_idx = __iso_8859_index (charset + 9);
|
|
if (iso_idx >= 0)
|
|
{
|
|
unsigned char mb;
|
|
|
|
if (s == NULL)
|
|
return 0;
|
|
|
|
for (mb = 0; mb < 0x60; ++mb)
|
|
if (__iso_8859_conv[iso_idx][mb] == wchar)
|
|
{
|
|
*s = (char) (mb + 0xa0);
|
|
return 1;
|
|
}
|
|
r->_errno = EILSEQ;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
if ((size_t)wchar >= 0x100)
|
|
{
|
|
r->_errno = EILSEQ;
|
|
return -1;
|
|
}
|
|
|
|
*s = (char) wchar;
|
|
return 1;
|
|
}
|
|
#endif /* _MB_EXTENDED_CHARSETS_ISO */
|
|
|
|
#ifdef _MB_EXTENDED_CHARSETS_WINDOWS
|
|
int
|
|
_DEFUN (__cp_wctomb, (r, s, wchar, charset, state),
|
|
struct _reent *r _AND
|
|
char *s _AND
|
|
wchar_t _wchar _AND
|
|
const char *charset _AND
|
|
mbstate_t *state)
|
|
{
|
|
wint_t wchar = _wchar;
|
|
|
|
if (s == NULL)
|
|
return 0;
|
|
|
|
if (wchar >= 0x80)
|
|
{
|
|
int cp_idx = __cp_index (charset + 2);
|
|
if (cp_idx >= 0)
|
|
{
|
|
unsigned char mb;
|
|
|
|
if (s == NULL)
|
|
return 0;
|
|
|
|
for (mb = 0; mb < 0x80; ++mb)
|
|
if (__cp_conv[cp_idx][mb] == wchar)
|
|
{
|
|
*s = (char) (mb + 0x80);
|
|
return 1;
|
|
}
|
|
r->_errno = EILSEQ;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
if ((size_t)wchar >= 0x100)
|
|
{
|
|
r->_errno = EILSEQ;
|
|
return -1;
|
|
}
|
|
|
|
*s = (char) wchar;
|
|
return 1;
|
|
}
|
|
#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
|
|
#endif /* _MB_CAPABLE */
|