4
0
mirror of git://sourceware.org/git/newlib-cygwin.git synced 2025-01-29 10:30:50 +08:00
Corinna Vinschen 28186e81d9 * libc/ctype/iswalpha.c: Handle all wchar_t as unicode on
_MB_CAPABLE systems.
	* libc/ctype/iswblank.c: Ditto.
	* libc/ctype/iswcntrl.c: Ditto.
	* libc/ctype/iswprint.c: Ditto.
	* libc/ctype/iswpunct.c: Ditto.
	* libc/ctype/iswspace.c: Ditto.
	* libc/ctype/jp2uc.c (__jp2uc): On Cygwin, just return c.
	Explain why.
	* libc/ctype/towlower.c: Ditto.
	* libc/ctype/towupper.c: Ditto.
	* libc/include/sys/config.h: Define _MB_EXTENDED_CHARSETS_ISO
	and _MB_EXTENDED_CHARSETS_WINDOWS if _MB_EXTENDED_CHARSETS_ALL is
	defined.  Define _MB_EXTENDED_CHARSETS_ALL on Cygwin only for now.
	* libc/include/sys/reent.h (struct _reent): Mark _current_category
	and _current_locale as unused.
	* libc/locale/locale.c: Add new charset support to documentation.
	Include ../stdio/local.h from here.
	(lc_ctype_charset): Set to "ASCII" by default.
	(lc_message_charset): Ditto.
	(_setlocale_r): Don't set _current_category and _current_locale.
	(loadlocale): Add Cygwin codepage support.  On _MB_CAPABLE
	systems, set __mbtowc and __wctomb function pointers to function
	corresponding with current charset.  Don't allow non-existant
	ISO-8859-12 charset.  Add support for Windows singlebyte codepages.
	On Cygwin, add support for GBK, CP949, and BIG5.  On Cygwin,
	call __set_ctype() in case the catorgy is LC_CTYPE.  Don't set
	_current_category and _current_locale.
	* libc/stdlib/Makefile.am (GENERAL_SOURCES): Add sb_charsets.c.
	* libc/stdlib/Makefile.in: Regenerate.
	* libc/stdlib/local.h: Add prototype for __locale_charset.
	Add prototypes for __mbtowc and __wctomb pointers.
	Add prototypes for charset-specific _wctomb_r and _mbtowc_r
	functions.
	Declare tables and functions from sb_charsets.c.
	* libc/stdlib/mbtowc_r.c (__mbtowc): Define.  Set to __ascii_mbtowc
	by default.
	(_mbtowc_r): Just call __mbtowc from here.
	(__ascii_mbtowc): New function.
	(__iso_mbtowc): New function.
	(__cp_mbtowc): New function.
	(__utf8_mbtowc): New function.
	(__sjis_mbtowc): New function.  Disable on Cygwin.
	(__eucjp_mbtowc): New function.  Disable on Cygwin.
	(__jis_mbtowc): New function.  Disable on Cygwin.
	* libc/stdlib/sb_charsets.c: New file, adding singlebyte to UTF
	conversion tables for all ISO and CP charsets.
	(__iso_8859_index): New function.
	(__cp_index): New function.
	* libc/stdlib/wctomb_r.c (__wctomb): Define.  Set to __ascii_wctomb
	by default.
	(_wctomb_r): Just call __wctomb from here.
	(__ascii_wctomb): New function.
	(__utf8_wctomb): New function.
	(__sjis_wctomb): New function.  Disable on Cygwin.
	(__eucjp_wctomb): New function.  Disable on Cygwin.
	(__jis_wctomb): New function.  Disable on Cygwin.
	(__iso_wctomb): New function.
	(__cp_wctomb): New function.
2009-03-24 10:13:27 +00:00

609 lines
14 KiB
C

#include <newlib.h>
#include <stdlib.h>
#include <locale.h>
#include "mbctype.h"
#include <wchar.h>
#include <string.h>
#include <errno.h>
#include "local.h"
int (*__mbtowc) (struct _reent *, wchar_t *, const char *, size_t,
const char *, mbstate_t *)
= __ascii_mbtowc;
int
_DEFUN (_mbtowc_r, (r, pwc, s, n, state),
struct _reent *r _AND
wchar_t *pwc _AND
const char *s _AND
size_t n _AND
mbstate_t *state)
{
return __mbtowc (r, pwc, s, n, __locale_charset (), state);
}
int
_DEFUN (__ascii_mbtowc, (r, pwc, s, n, charset, state),
struct _reent *r _AND
wchar_t *pwc _AND
const char *s _AND
size_t n _AND
const char *charset _AND
mbstate_t *state)
{
wchar_t dummy;
unsigned char *t = (unsigned char *)s;
if (pwc == NULL)
pwc = &dummy;
if (s == NULL)
return 0;
if (n == 0)
return -2;
*pwc = (wchar_t)*t;
if (*t == '\0')
return 0;
return 1;
}
#ifdef _MB_CAPABLE
typedef enum { ESCAPE, DOLLAR, BRACKET, AT, B, J,
NUL, JIS_CHAR, OTHER, JIS_C_NUM } JIS_CHAR_TYPE;
typedef enum { ASCII, JIS, A_ESC, A_ESC_DL, JIS_1, J_ESC, J_ESC_BR,
INV, JIS_S_NUM } JIS_STATE;
typedef enum { COPY_A, COPY_J1, COPY_J2, MAKE_A, NOOP, EMPTY, ERROR } JIS_ACTION;
/**************************************************************************************
* state/action tables for processing JIS encoding
* Where possible, switches to JIS are grouped with proceding JIS characters and switches
* to ASCII are grouped with preceding JIS characters. Thus, maximum returned length
* is 2 (switch to JIS) + 2 (JIS characters) + 2 (switch back to ASCII) = 6.
*************************************************************************************/
static JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = {
/* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTHER */
/* ASCII */ { A_ESC, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII },
/* JIS */ { J_ESC, JIS_1, JIS_1, JIS_1, JIS_1, JIS_1, INV, JIS_1, INV },
/* A_ESC */ { ASCII, A_ESC_DL, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII },
/* A_ESC_DL */{ ASCII, ASCII, ASCII, JIS, JIS, ASCII, ASCII, ASCII, ASCII },
/* JIS_1 */ { INV, JIS, JIS, JIS, JIS, JIS, INV, JIS, INV },
/* J_ESC */ { INV, INV, J_ESC_BR, INV, INV, INV, INV, INV, INV },
/* J_ESC_BR */{ INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV },
};
static JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = {
/* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTHER */
/* ASCII */ { NOOP, COPY_A, COPY_A, COPY_A, COPY_A, COPY_A, EMPTY, COPY_A, COPY_A},
/* JIS */ { NOOP, COPY_J1, COPY_J1, COPY_J1, COPY_J1, COPY_J1, ERROR, COPY_J1, ERROR },
/* A_ESC */ { COPY_A, NOOP, COPY_A, COPY_A, COPY_A, COPY_A, COPY_A, COPY_A, COPY_A},
/* A_ESC_DL */{ COPY_A, COPY_A, COPY_A, NOOP, NOOP, COPY_A, COPY_A, COPY_A, COPY_A},
/* JIS_1 */ { ERROR, COPY_J2, COPY_J2, COPY_J2, COPY_J2, COPY_J2, ERROR, COPY_J2, ERROR },
/* J_ESC */ { ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR },
/* J_ESC_BR */{ ERROR, ERROR, ERROR, ERROR, MAKE_A, MAKE_A, ERROR, ERROR, ERROR },
};
/* we override the mbstate_t __count field for more complex encodings and use it store a state value */
#define __state __count
#ifdef _MB_EXTENDED_CHARSETS_ISO
int
_DEFUN (__iso_mbtowc, (r, pwc, s, n, charset, state),
struct _reent *r _AND
wchar_t *pwc _AND
const char *s _AND
size_t n _AND
const char *charset _AND
mbstate_t *state)
{
wchar_t dummy;
unsigned char *t = (unsigned char *)s;
if (pwc == NULL)
pwc = &dummy;
if (s == NULL)
return 0;
if (n == 0)
return -2;
if (*t >= 0xa0)
{
int iso_idx = __iso_8859_index (charset + 9);
if (iso_idx >= 0)
{
*pwc = __iso_8859_conv[iso_idx][*t - 0xa0];
if (*pwc == 0) /* Invalid character */
{
r->_errno = EILSEQ;
return -1;
}
return 1;
}
}
*pwc = (wchar_t) *t;
if (*t == '\0')
return 0;
return 1;
}
#endif /* _MB_EXTENDED_CHARSETS_ISO */
#ifdef _MB_EXTENDED_CHARSETS_WINDOWS
int
_DEFUN (__cp_mbtowc, (r, pwc, s, n, charset, state),
struct _reent *r _AND
wchar_t *pwc _AND
const char *s _AND
size_t n _AND
const char *charset _AND
mbstate_t *state)
{
wchar_t dummy;
unsigned char *t = (unsigned char *)s;
if (pwc == NULL)
pwc = &dummy;
if (s == NULL)
return 0;
if (n == 0)
return -2;
if (*t >= 0x80)
{
int cp_idx = __cp_index (charset + 2);
if (cp_idx >= 0)
{
*pwc = __cp_conv[cp_idx][*t - 0x80];
if (*pwc == 0) /* Invalid character */
{
r->_errno = EILSEQ;
return -1;
}
return 1;
}
}
*pwc = (wchar_t)*t;
if (*t == '\0')
return 0;
return 1;
}
#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
int
_DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state),
struct _reent *r _AND
wchar_t *pwc _AND
const char *s _AND
size_t n _AND
const char *charset _AND
mbstate_t *state)
{
wchar_t dummy;
unsigned char *t = (unsigned char *)s;
int ch;
int i = 0;
if (pwc == NULL)
pwc = &dummy;
if (s == NULL)
return 0;
if (n == 0)
return -2;
if (state->__count == 4)
{
/* Create the second half of the surrogate pair. For a description
see the comment below. */
wint_t tmp = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18)
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12)
| (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6)
| (wchar_t)(state->__value.__wchb[3] & 0x3f);
state->__count = 0;
*pwc = 0xdc00 | ((tmp - 0x10000) & 0x3ff);
return 2;
}
if (state->__count == 0)
ch = t[i++];
else
{
if (n < (size_t)-1)
++n;
ch = state->__value.__wchb[0];
}
if (ch == '\0')
{
*pwc = 0;
state->__count = 0;
return 0; /* s points to the null character */
}
if (ch >= 0x0 && ch <= 0x7f)
{
/* single-byte sequence */
state->__count = 0;
*pwc = ch;
return 1;
}
if (ch >= 0xc0 && ch <= 0xdf)
{
/* two-byte sequence */
state->__value.__wchb[0] = ch;
state->__count = 1;
if (n < 2)
return -2;
ch = t[i++];
if (ch < 0x80 || ch > 0xbf)
{
r->_errno = EILSEQ;
return -1;
}
if (state->__value.__wchb[0] < 0xc2)
{
/* overlong UTF-8 sequence */
r->_errno = EILSEQ;
return -1;
}
state->__count = 0;
*pwc = (wchar_t)((state->__value.__wchb[0] & 0x1f) << 6)
| (wchar_t)(ch & 0x3f);
return i;
}
if (ch >= 0xe0 && ch <= 0xef)
{
/* three-byte sequence */
wchar_t tmp;
state->__value.__wchb[0] = ch;
if (state->__count == 0)
state->__count = 1;
else if (n < (size_t)-1)
++n;
if (n < 2)
return -2;
ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
if (state->__value.__wchb[0] == 0xe0 && ch < 0xa0)
{
/* overlong UTF-8 sequence */
r->_errno = EILSEQ;
return -1;
}
if (ch < 0x80 || ch > 0xbf)
{
r->_errno = EILSEQ;
return -1;
}
state->__value.__wchb[1] = ch;
state->__count = 2;
if (n < 3)
return -2;
ch = t[i++];
if (ch < 0x80 || ch > 0xbf)
{
r->_errno = EILSEQ;
return -1;
}
state->__count = 0;
tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12)
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6)
| (wchar_t)(ch & 0x3f);
if (tmp >= 0xd800 && tmp <= 0xdfff)
{
r->_errno = EILSEQ;
return -1;
}
*pwc = tmp;
return i;
}
if (ch >= 0xf0 && ch <= 0xf7)
{
/* four-byte sequence */
wint_t tmp;
state->__value.__wchb[0] = ch;
if (state->__count == 0)
state->__count = 1;
else if (n < (size_t)-1)
++n;
if (n < 2)
return -2;
ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
if (state->__value.__wchb[0] == 0xf0 && ch < 0x90)
{
/* overlong UTF-8 sequence */
r->_errno = EILSEQ;
return -1;
}
if (ch < 0x80 || ch > 0xbf)
{
r->_errno = EILSEQ;
return -1;
}
state->__value.__wchb[1] = ch;
if (state->__count == 1)
state->__count = 2;
else if (n < (size_t)-1)
++n;
if (n < 3)
return -2;
ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
if (ch < 0x80 || ch > 0xbf)
{
r->_errno = EILSEQ;
return -1;
}
state->__value.__wchb[2] = ch;
state->__count = 3;
if (n < 4)
return -2;
ch = t[i++];
if (ch < 0x80 || ch > 0xbf)
{
r->_errno = EILSEQ;
return -1;
}
tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
| (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
| (wint_t)((state->__value.__wchb[2] & 0x3f) << 6)
| (wint_t)(ch & 0x3f);
if (tmp > 0xffff && sizeof(wchar_t) == 2)
{
/* On systems which have wchar_t being UTF-16 values, the value
doesn't fit into a single wchar_t in this case. So what we
do here is to store the state with a special value of __count
and return the first half of a surrogate pair. As return
value we choose to return the half of the actual UTF-8 char.
The second half is returned in case we recognize the special
__count value above. */
state->__value.__wchb[3] = ch;
state->__count = 4;
*pwc = 0xd800 | (((tmp - 0x10000) >> 10) & 0x3ff);
return 2;
}
*pwc = tmp;
state->__count = 0;
return i;
}
r->_errno = EILSEQ;
return -1;
}
/* Cygwin defines its own doublebyte charset conversion functions
because the underlying OS requires wchar_t == UTF-16. */
#ifndef __CYGWIN__
int
_DEFUN (__sjis_mbtowc, (r, pwc, s, n, charset, state),
struct _reent *r _AND
wchar_t *pwc _AND
const char *s _AND
size_t n _AND
const char *charset _AND
mbstate_t *state)
{
wchar_t dummy;
unsigned char *t = (unsigned char *)s;
int ch;
int i = 0;
if (pwc == NULL)
pwc = &dummy;
if (s == NULL)
return 0; /* not state-dependent */
if (n == 0)
return -2;
ch = t[i++];
if (state->__count == 0)
{
if (_issjis1 (ch))
{
state->__value.__wchb[0] = ch;
state->__count = 1;
if (n <= 1)
return -2;
ch = t[i++];
}
}
if (state->__count == 1)
{
if (_issjis2 (ch))
{
*pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch;
state->__count = 0;
return i;
}
else
{
r->_errno = EILSEQ;
return -1;
}
}
*pwc = (wchar_t)*t;
if (*t == '\0')
return 0;
return 1;
}
int
_DEFUN (__eucjp_mbtowc, (r, pwc, s, n, charset, state),
struct _reent *r _AND
wchar_t *pwc _AND
const char *s _AND
size_t n _AND
const char *charset _AND
mbstate_t *state)
{
wchar_t dummy;
unsigned char *t = (unsigned char *)s;
int ch;
int i = 0;
if (pwc == NULL)
pwc = &dummy;
if (s == NULL)
return 0;
if (n == 0)
return -2;
ch = t[i++];
if (state->__count == 0)
{
if (_iseucjp (ch))
{
state->__value.__wchb[0] = ch;
state->__count = 1;
if (n <= 1)
return -2;
ch = t[i++];
}
}
if (state->__count == 1)
{
if (_iseucjp (ch))
{
*pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch;
state->__count = 0;
return i;
}
else
{
r->_errno = EILSEQ;
return -1;
}
}
*pwc = (wchar_t)*t;
if (*t == '\0')
return 0;
return 1;
}
int
_DEFUN (__jis_mbtowc, (r, pwc, s, n, charset, state),
struct _reent *r _AND
wchar_t *pwc _AND
const char *s _AND
size_t n _AND
const char *charset _AND
mbstate_t *state)
{
wchar_t dummy;
unsigned char *t = (unsigned char *)s;
JIS_STATE curr_state;
JIS_ACTION action;
JIS_CHAR_TYPE ch;
unsigned char *ptr;
unsigned int i;
int curr_ch;
if (pwc == NULL)
pwc = &dummy;
if (s == NULL)
{
state->__state = ASCII;
return 1; /* state-dependent */
}
if (n == 0)
return -2;
curr_state = state->__state;
ptr = t;
for (i = 0; i < n; ++i)
{
curr_ch = t[i];
switch (curr_ch)
{
case ESC_CHAR:
ch = ESCAPE;
break;
case '$':
ch = DOLLAR;
break;
case '@':
ch = AT;
break;
case '(':
ch = BRACKET;
break;
case 'B':
ch = B;
break;
case 'J':
ch = J;
break;
case '\0':
ch = NUL;
break;
default:
if (_isjis (curr_ch))
ch = JIS_CHAR;
else
ch = OTHER;
}
action = JIS_action_table[curr_state][ch];
curr_state = JIS_state_table[curr_state][ch];
switch (action)
{
case NOOP:
break;
case EMPTY:
state->__state = ASCII;
*pwc = (wchar_t)0;
return 0;
case COPY_A:
state->__state = ASCII;
*pwc = (wchar_t)*ptr;
return (i + 1);
case COPY_J1:
state->__value.__wchb[0] = t[i];
break;
case COPY_J2:
state->__state = JIS;
*pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)(t[i]);
return (i + 1);
case MAKE_A:
ptr = (unsigned char *)(t + i + 1);
break;
case ERROR:
default:
r->_errno = EILSEQ;
return -1;
}
}
state->__state = curr_state;
return -2; /* n < bytes needed */
}
#endif /* !__CYGWIN__*/
#endif /* _MB_CAPABLE */