4
0
mirror of git://sourceware.org/git/newlib-cygwin.git synced 2025-02-11 19:49:15 +08:00
Corinna Vinschen 28186e81d9 * libc/ctype/iswalpha.c: Handle all wchar_t as unicode on
_MB_CAPABLE systems.
	* libc/ctype/iswblank.c: Ditto.
	* libc/ctype/iswcntrl.c: Ditto.
	* libc/ctype/iswprint.c: Ditto.
	* libc/ctype/iswpunct.c: Ditto.
	* libc/ctype/iswspace.c: Ditto.
	* libc/ctype/jp2uc.c (__jp2uc): On Cygwin, just return c.
	Explain why.
	* libc/ctype/towlower.c: Ditto.
	* libc/ctype/towupper.c: Ditto.
	* libc/include/sys/config.h: Define _MB_EXTENDED_CHARSETS_ISO
	and _MB_EXTENDED_CHARSETS_WINDOWS if _MB_EXTENDED_CHARSETS_ALL is
	defined.  Define _MB_EXTENDED_CHARSETS_ALL on Cygwin only for now.
	* libc/include/sys/reent.h (struct _reent): Mark _current_category
	and _current_locale as unused.
	* libc/locale/locale.c: Add new charset support to documentation.
	Include ../stdio/local.h from here.
	(lc_ctype_charset): Set to "ASCII" by default.
	(lc_message_charset): Ditto.
	(_setlocale_r): Don't set _current_category and _current_locale.
	(loadlocale): Add Cygwin codepage support.  On _MB_CAPABLE
	systems, set __mbtowc and __wctomb function pointers to function
	corresponding with current charset.  Don't allow non-existant
	ISO-8859-12 charset.  Add support for Windows singlebyte codepages.
	On Cygwin, add support for GBK, CP949, and BIG5.  On Cygwin,
	call __set_ctype() in case the catorgy is LC_CTYPE.  Don't set
	_current_category and _current_locale.
	* libc/stdlib/Makefile.am (GENERAL_SOURCES): Add sb_charsets.c.
	* libc/stdlib/Makefile.in: Regenerate.
	* libc/stdlib/local.h: Add prototype for __locale_charset.
	Add prototypes for __mbtowc and __wctomb pointers.
	Add prototypes for charset-specific _wctomb_r and _mbtowc_r
	functions.
	Declare tables and functions from sb_charsets.c.
	* libc/stdlib/mbtowc_r.c (__mbtowc): Define.  Set to __ascii_mbtowc
	by default.
	(_mbtowc_r): Just call __mbtowc from here.
	(__ascii_mbtowc): New function.
	(__iso_mbtowc): New function.
	(__cp_mbtowc): New function.
	(__utf8_mbtowc): New function.
	(__sjis_mbtowc): New function.  Disable on Cygwin.
	(__eucjp_mbtowc): New function.  Disable on Cygwin.
	(__jis_mbtowc): New function.  Disable on Cygwin.
	* libc/stdlib/sb_charsets.c: New file, adding singlebyte to UTF
	conversion tables for all ISO and CP charsets.
	(__iso_8859_index): New function.
	(__cp_index): New function.
	* libc/stdlib/wctomb_r.c (__wctomb): Define.  Set to __ascii_wctomb
	by default.
	(_wctomb_r): Just call __wctomb from here.
	(__ascii_wctomb): New function.
	(__utf8_wctomb): New function.
	(__sjis_wctomb): New function.  Disable on Cygwin.
	(__eucjp_wctomb): New function.  Disable on Cygwin.
	(__jis_wctomb): New function.  Disable on Cygwin.
	(__iso_wctomb): New function.
	(__cp_wctomb): New function.
2009-03-24 10:13:27 +00:00

358 lines
7.9 KiB
C

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <locale.h>
#include "mbctype.h"
#include "local.h"
int (*__wctomb) (struct _reent *, char *, wchar_t, const char *charset,
mbstate_t *)
= __ascii_wctomb;
int
_DEFUN (_wctomb_r, (r, s, wchar, state),
struct _reent *r _AND
char *s _AND
wchar_t _wchar _AND
mbstate_t *state)
{
return __wctomb (r, s, _wchar, __locale_charset (), state);
}
int
_DEFUN (__ascii_wctomb, (r, s, wchar, charset, state),
struct _reent *r _AND
char *s _AND
wchar_t _wchar _AND
const char *charset _AND
mbstate_t *state)
{
/* Avoids compiler warnings about comparisons that are always false
due to limited range when sizeof(wchar_t) is 2 but sizeof(wint_t)
is 4, as is the case on cygwin. */
wint_t wchar = _wchar;
if (s == NULL)
return 0;
if ((size_t)wchar >= 0x100)
{
r->_errno = EILSEQ;
return -1;
}
*s = (char) wchar;
return 1;
}
#ifdef _MB_CAPABLE
/* for some conversions, we use the __count field as a place to store a state value */
#define __state __count
int
_DEFUN (__utf8_wctomb, (r, s, wchar, charset, state),
struct _reent *r _AND
char *s _AND
wchar_t _wchar _AND
const char *charset _AND
mbstate_t *state)
{
wint_t wchar = _wchar;
if (s == NULL)
return 0; /* UTF-8 encoding is not state-dependent */
if (state->__count == -4 && (wchar < 0xdc00 || wchar >= 0xdfff))
{
/* At this point only the second half of a surrogate pair is valid. */
r->_errno = EILSEQ;
return -1;
}
if (wchar <= 0x7f)
{
*s = wchar;
return 1;
}
if (wchar >= 0x80 && wchar <= 0x7ff)
{
*s++ = 0xc0 | ((wchar & 0x7c0) >> 6);
*s = 0x80 | (wchar & 0x3f);
return 2;
}
if (wchar >= 0x800 && wchar <= 0xffff)
{
if (wchar >= 0xd800 && wchar <= 0xdfff)
{
wint_t tmp;
/* UTF-16 surrogates -- must not occur in normal UCS-4 data */
if (sizeof (wchar_t) != 2)
{
r->_errno = EILSEQ;
return -1;
}
if (wchar >= 0xdc00)
{
/* Second half of a surrogate pair. It's not valid if
we don't have already read a first half of a surrogate
before. */
if (state->__count != -4)
{
r->_errno = EILSEQ;
return -1;
}
/* If it's valid, reconstruct the full Unicode value and
return the trailing three bytes of the UTF-8 char. */
tmp = (state->__value.__wchb[0] << 16)
| (state->__value.__wchb[1] << 8)
| (wchar & 0x3ff);
state->__count = 0;
*s++ = 0x80 | ((tmp & 0x3f000) >> 12);
*s++ = 0x80 | ((tmp & 0xfc0) >> 6);
*s = 0x80 | (tmp & 0x3f);
return 3;
}
/* First half of a surrogate pair. Store the state and return
the first byte of the UTF-8 char. */
tmp = ((wchar & 0x3ff) << 10) + 0x10000;
state->__value.__wchb[0] = (tmp >> 16) & 0xff;
state->__value.__wchb[1] = (tmp >> 8) & 0xff;
state->__count = -4;
*s = (0xf0 | ((tmp & 0x1c0000) >> 18));
return 1;
}
*s++ = 0xe0 | ((wchar & 0xf000) >> 12);
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
*s = 0x80 | (wchar & 0x3f);
return 3;
}
if (wchar >= 0x10000 && wchar <= 0x10ffff)
{
*s++ = 0xf0 | ((wchar & 0x1c0000) >> 18);
*s++ = 0x80 | ((wchar & 0x3f000) >> 12);
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
*s = 0x80 | (wchar & 0x3f);
return 4;
}
r->_errno = EILSEQ;
return -1;
}
/* Cygwin defines its own doublebyte charset conversion functions
because the underlying OS requires wchar_t == UTF-16. */
#ifndef __CYGWIN__
int
_DEFUN (__sjis_wctomb, (r, s, wchar, charset, state),
struct _reent *r _AND
char *s _AND
wchar_t _wchar _AND
const char *charset _AND
mbstate_t *state)
{
wint_t wchar = _wchar;
unsigned char char2 = (unsigned char)wchar;
unsigned char char1 = (unsigned char)(wchar >> 8);
if (s == NULL)
return 0; /* not state-dependent */
if (char1 != 0x00)
{
/* first byte is non-zero..validate multi-byte char */
if (_issjis1(char1) && _issjis2(char2))
{
*s++ = (char)char1;
*s = (char)char2;
return 2;
}
else
{
r->_errno = EILSEQ;
return -1;
}
}
*s = (char) wchar;
return 1;
}
int
_DEFUN (__eucjp_wctomb, (r, s, wchar, charset, state),
struct _reent *r _AND
char *s _AND
wchar_t _wchar _AND
const char *charset _AND
mbstate_t *state)
{
wint_t wchar = _wchar;
unsigned char char2 = (unsigned char)wchar;
unsigned char char1 = (unsigned char)(wchar >> 8);
if (s == NULL)
return 0; /* not state-dependent */
if (char1 != 0x00)
{
/* first byte is non-zero..validate multi-byte char */
if (_iseucjp (char1) && _iseucjp (char2))
{
*s++ = (char)char1;
*s = (char)char2;
return 2;
}
else
{
r->_errno = EILSEQ;
return -1;
}
}
*s = (char) wchar;
return 1;
}
int
_DEFUN (__jis_wctomb, (r, s, wchar, charset, state),
struct _reent *r _AND
char *s _AND
wchar_t _wchar _AND
const char *charset _AND
mbstate_t *state)
{
wint_t wchar = _wchar;
int cnt = 0;
unsigned char char2 = (unsigned char)wchar;
unsigned char char1 = (unsigned char)(wchar >> 8);
if (s == NULL)
return 1; /* state-dependent */
if (char1 != 0x00)
{
/* first byte is non-zero..validate multi-byte char */
if (_isjis (char1) && _isjis (char2))
{
if (state->__state == 0)
{
/* must switch from ASCII to JIS state */
state->__state = 1;
*s++ = ESC_CHAR;
*s++ = '$';
*s++ = 'B';
cnt = 3;
}
*s++ = (char)char1;
*s = (char)char2;
return cnt + 2;
}
r->_errno = EILSEQ;
return -1;
}
if (state->__state != 0)
{
/* must switch from JIS to ASCII state */
state->__state = 0;
*s++ = ESC_CHAR;
*s++ = '(';
*s++ = 'B';
cnt = 3;
}
*s = (char)char2;
return cnt + 1;
}
#endif /* !__CYGWIN__ */
#ifdef _MB_EXTENDED_CHARSETS_ISO
int
_DEFUN (__iso_wctomb, (r, s, wchar, charset, state),
struct _reent *r _AND
char *s _AND
wchar_t _wchar _AND
const char *charset _AND
mbstate_t *state)
{
wint_t wchar = _wchar;
if (s == NULL)
return 0;
/* wchars <= 0x9f translate to all ISO charsets directly. */
if (wchar >= 0xa0)
{
int iso_idx = __iso_8859_index (charset + 9);
if (iso_idx >= 0)
{
unsigned char mb;
if (s == NULL)
return 0;
for (mb = 0; mb < 0x60; ++mb)
if (__iso_8859_conv[iso_idx][mb] == wchar)
{
*s = (char) (mb + 0xa0);
return 1;
}
r->_errno = EILSEQ;
return -1;
}
}
if ((size_t)wchar >= 0x100)
{
r->_errno = EILSEQ;
return -1;
}
*s = (char) wchar;
return 1;
}
#endif /* _MB_EXTENDED_CHARSETS_ISO */
#ifdef _MB_EXTENDED_CHARSETS_WINDOWS
int
_DEFUN (__cp_wctomb, (r, s, wchar, charset, state),
struct _reent *r _AND
char *s _AND
wchar_t _wchar _AND
const char *charset _AND
mbstate_t *state)
{
wint_t wchar = _wchar;
if (s == NULL)
return 0;
if (wchar >= 0x80)
{
int cp_idx = __cp_index (charset + 2);
if (cp_idx >= 0)
{
unsigned char mb;
if (s == NULL)
return 0;
for (mb = 0; mb < 0x80; ++mb)
if (__cp_conv[cp_idx][mb] == wchar)
{
*s = (char) (mb + 0x80);
return 1;
}
r->_errno = EILSEQ;
return -1;
}
}
if ((size_t)wchar >= 0x100)
{
r->_errno = EILSEQ;
return -1;
}
*s = (char) wchar;
return 1;
}
#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
#endif /* _MB_CAPABLE */