* strfuncs.cc: Add comment to explain why we can't support JIS
for now. (__db_wctomb): Alwaus use WC_NO_BEST_FIT_CHARS. (__jis_wctomb): Just call __ascii_wctomb from here. (__eucjp_wctomb): Convert to standalone implementation to fix up the difference between eucJP and CP 20932 affecting JIS-X-0212 characters. Explain. (__kr_wctomb): Use codepage 949. (__db_mbtowc): Reorder code slightly. Always use MB_ERR_INVALID_CHARS in call to MultiByteToWideChar. Fix a problem with singlebyte sequences. Fix a bug in '\0' handling. Reset state->__count on successful return from non-zero state. (__jis_mbtowc): Just call __ascii_mbtowc from here. (__eucjp_mbtowc): Convert to standalone implementation to fix up the difference between eucJP and CP 20932 affecting JIS-X-0212 characters. (__kr_mbtowc): Use codepage 949. (__set_charset_from_codepage): Handle codepage 20932 as eucJP.
This commit is contained in:
parent
2f1769f431
commit
2f9ee8fc90
|
@ -1,3 +1,23 @@
|
|||
2009-04-06 Corinna Vinschen <corinna@vinschen.de>
|
||||
|
||||
* strfuncs.cc: Add comment to explain why we can't support JIS
|
||||
for now.
|
||||
(__db_wctomb): Alwaus use WC_NO_BEST_FIT_CHARS.
|
||||
(__jis_wctomb): Just call __ascii_wctomb from here.
|
||||
(__eucjp_wctomb): Convert to standalone implementation to fix up the
|
||||
difference between eucJP and CP 20932 affecting JIS-X-0212 characters.
|
||||
Explain.
|
||||
(__kr_wctomb): Use codepage 949.
|
||||
(__db_mbtowc): Reorder code slightly. Always use MB_ERR_INVALID_CHARS
|
||||
in call to MultiByteToWideChar. Fix a problem with singlebyte
|
||||
sequences. Fix a bug in '\0' handling. Reset state->__count on
|
||||
successful return from non-zero state.
|
||||
(__jis_mbtowc): Just call __ascii_mbtowc from here.
|
||||
(__eucjp_mbtowc): Convert to standalone implementation to fix up the
|
||||
difference between eucJP and CP 20932 affecting JIS-X-0212 characters.
|
||||
(__kr_mbtowc): Use codepage 949.
|
||||
(__set_charset_from_codepage): Handle codepage 20932 as eucJP.
|
||||
|
||||
2009-04-05 Christopher Faylor <me+cygwin@cgf.cx>
|
||||
|
||||
* Makefile.in: Use all compile options when calculating magic values.
|
||||
|
|
|
@ -22,12 +22,18 @@ details. */
|
|||
#include "cygheap.h"
|
||||
#include "tls_pbuf.h"
|
||||
|
||||
/* The SJIS, JIS and EUCJP conversion in newlib does not use UTF as
|
||||
/* The SJIS, JIS and eucJP conversion in newlib does not use UTF as
|
||||
wchar_t character representation. That's unfortunate for us since
|
||||
we require UTF for the OS. What we do here is to have our own
|
||||
implementation of the base functions for the conversion using
|
||||
the MulitByteToWideChar/WideCharToMultiByte functions. */
|
||||
|
||||
/* FIXME: We can't support JIS (ISO-2022-JP) at all right now. It's a
|
||||
stateful charset encoding. The translation from mbtowc to
|
||||
MulitByteToWideChar is quite complex. Given that we support SJIS and
|
||||
eucJP, the both most used Japanese charset encodings, this shouldn't
|
||||
be such a big problem. */
|
||||
|
||||
/* GBK, eucKR, and Big5 conversions are not available so far in newlib. */
|
||||
|
||||
static int
|
||||
|
@ -43,8 +49,8 @@ __db_wctomb (struct _reent *r, char *s, wchar_t wchar, UINT cp)
|
|||
}
|
||||
|
||||
BOOL def_used = false;
|
||||
int ret = WideCharToMultiByte (cp, cp > 50000 ? 0 : WC_NO_BEST_FIT_CHARS,
|
||||
&wchar, 1, s, MB_CUR_MAX, NULL, &def_used);
|
||||
int ret = WideCharToMultiByte (cp, WC_NO_BEST_FIT_CHARS, &wchar, 1, s,
|
||||
MB_CUR_MAX, NULL, &def_used);
|
||||
if (ret > 0 && !def_used)
|
||||
return ret;
|
||||
|
||||
|
@ -59,18 +65,59 @@ __sjis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
|
|||
return __db_wctomb (r,s, wchar, 932);
|
||||
}
|
||||
|
||||
extern "C" int __ascii_wctomb (struct _reent *, char *, wchar_t, const char *,
|
||||
mbstate_t *);
|
||||
extern "C" int
|
||||
__jis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
|
||||
mbstate_t *state)
|
||||
{
|
||||
return __db_wctomb (r,s, wchar, 50220);
|
||||
/* FIXME: See comment at start of file. */
|
||||
return __ascii_wctomb (r, s, wchar, charset, state);
|
||||
}
|
||||
|
||||
extern "C" int
|
||||
__eucjp_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
|
||||
mbstate_t *state)
|
||||
{
|
||||
return __db_wctomb (r,s, wchar, 51932);
|
||||
/* Unfortunately, the Windows eucJP codepage 20932 is not really 100%
|
||||
compatible to eucJP. It's a cute approximation which makes it a
|
||||
doublebyte codepage.
|
||||
The JIS-X-0212 three byte codes (0x8f,0xa1-0xfe,0xa1-0xfe) are folded
|
||||
into two byte codes as follows: The 0x8f is stripped, the next byte is
|
||||
taken as is, the third byte is mapped into the lower 7-bit area by
|
||||
masking it with 0x7f. So, for instance, the eucJP code 0x8f,0xdd,0xf8
|
||||
becomes 0xdd,0x78 in CP 20932.
|
||||
|
||||
To be really eucJP compatible, we have to map the JIS-X-0212 characters
|
||||
between CP 20932 and eucJP ourselves. */
|
||||
if (s == NULL)
|
||||
return 0;
|
||||
|
||||
if (wchar < 0x80)
|
||||
{
|
||||
*s = (char) wchar;
|
||||
return 1;
|
||||
}
|
||||
|
||||
BOOL def_used = false;
|
||||
int ret = WideCharToMultiByte (20932, WC_NO_BEST_FIT_CHARS, &wchar, 1, s,
|
||||
MB_CUR_MAX, NULL, &def_used);
|
||||
if (ret > 0 && !def_used)
|
||||
{
|
||||
/* CP20932 representation of JIS-X-0212 character? */
|
||||
if (ret == 2 && (unsigned char) s[1] <= 0x7f)
|
||||
{
|
||||
/* Yes, convert to eucJP three byte sequence */
|
||||
s[2] = s[1] | 0x80;
|
||||
s[1] = s[0];
|
||||
s[0] = 0x8f;
|
||||
++ret;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
r->_errno = EILSEQ;
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern "C" int
|
||||
|
@ -84,7 +131,7 @@ extern "C" int
|
|||
__kr_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
|
||||
mbstate_t *state)
|
||||
{
|
||||
return __db_wctomb (r,s, wchar, 51949);
|
||||
return __db_wctomb (r,s, wchar, 949);
|
||||
}
|
||||
|
||||
extern "C" int
|
||||
|
@ -95,21 +142,20 @@ __big5_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
|
|||
}
|
||||
|
||||
static int
|
||||
__db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
|
||||
UINT cp, mbstate_t *state)
|
||||
__db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, UINT cp,
|
||||
mbstate_t *state)
|
||||
{
|
||||
wchar_t dummy;
|
||||
char buf[2];
|
||||
int ret;
|
||||
|
||||
if (pwc == NULL)
|
||||
pwc = &dummy;
|
||||
|
||||
if (s == NULL)
|
||||
return 0; /* not state-dependent */
|
||||
|
||||
if (n == 0)
|
||||
return -2;
|
||||
|
||||
if (pwc == NULL)
|
||||
pwc = &dummy;
|
||||
|
||||
if (state->__count == 0)
|
||||
{
|
||||
|
@ -118,44 +164,35 @@ __db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
|
|||
*pwc = *(unsigned char *) s;
|
||||
return *s ? 1 : 0;
|
||||
}
|
||||
ret = MultiByteToWideChar (cp, cp > 50000 ? 0 : MB_ERR_INVALID_CHARS,
|
||||
s, 2, pwc, 1);
|
||||
size_t cnt = min (n, 2);
|
||||
ret = MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, s, cnt, pwc, 1);
|
||||
if (ret)
|
||||
return *s ? 2 : 0;
|
||||
return cnt;
|
||||
if (n == 1)
|
||||
{
|
||||
state->__count = 1;
|
||||
state->__count = n;
|
||||
state->__value.__wchb[0] = *s;
|
||||
return -2;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* These Win32 functions are really crappy. Assuming n is 2
|
||||
but the first byte is a singlebyte charcode, the function
|
||||
does not convert that byte and return 1, rather it just
|
||||
returns 0. So, what we do here is to check if the first
|
||||
byte returns a valid value... */
|
||||
ret = MultiByteToWideChar (cp,
|
||||
cp > 50000 ? 0 : MB_ERR_INVALID_CHARS,
|
||||
s, 1, pwc, 1);
|
||||
if (ret)
|
||||
return *s ? 1 : 0;
|
||||
}
|
||||
/* These Win32 functions are really crappy. Assuming n is 2 but the
|
||||
first byte is a singlebyte charcode, the function does not convert
|
||||
that byte and return 1, rather it just returns 0. So, what we do
|
||||
here is to check if the first byte returns a valid value... */
|
||||
else if (MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, s, 1, pwc, 1))
|
||||
return 1;
|
||||
r->_errno = EILSEQ;
|
||||
return -1;
|
||||
}
|
||||
if (!*s)
|
||||
return -2;
|
||||
buf[0] = state->__value.__wchb[0];
|
||||
buf[1] = *s;
|
||||
ret = MultiByteToWideChar (cp, cp > 50000 ? 0 : MB_ERR_INVALID_CHARS,
|
||||
buf, 2, pwc, 1);
|
||||
state->__value.__wchb[state->__count] = *s;
|
||||
ret = MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS,
|
||||
(const char *) state->__value.__wchb, 2, pwc, 1);
|
||||
if (!ret)
|
||||
{
|
||||
r->_errno = EILSEQ;
|
||||
return -1;
|
||||
}
|
||||
return ret;
|
||||
state->__count = 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
extern "C" int
|
||||
|
@ -169,14 +206,85 @@ extern "C" int
|
|||
__jis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
|
||||
const char *charset, mbstate_t *state)
|
||||
{
|
||||
return __db_mbtowc (r, pwc, s, n, 50220, state);
|
||||
/* FIXME: See comment at start of file. */
|
||||
return __ascii_mbtowc (r, pwc, s, n, charset, state);
|
||||
}
|
||||
|
||||
extern "C" int
|
||||
__eucjp_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
|
||||
const char *charset, mbstate_t *state)
|
||||
const char *charset, mbstate_t *state)
|
||||
{
|
||||
return __db_mbtowc (r, pwc, s, n, 51932, state);
|
||||
/* See comment in __eucjp_wctomb above. */
|
||||
wchar_t dummy;
|
||||
int ret = 0;
|
||||
|
||||
if (s == NULL)
|
||||
return 0; /* not state-dependent */
|
||||
|
||||
if (n == 0)
|
||||
return -2;
|
||||
|
||||
if (pwc == NULL)
|
||||
pwc = &dummy;
|
||||
|
||||
if (state->__count == 0)
|
||||
{
|
||||
if (*(unsigned char *) s < 0x80)
|
||||
{
|
||||
*pwc = *(unsigned char *) s;
|
||||
return *s ? 1 : 0;
|
||||
}
|
||||
if (*(unsigned char *) s == 0x8f) /* JIS-X-0212 lead byte? */
|
||||
{
|
||||
/* Yes. Store sequence in mbstate and handle in the __count != 0
|
||||
case at the end of the function. */
|
||||
size_t i;
|
||||
for (i = 0; i < 3 && i < n; i++)
|
||||
state->__value.__wchb[i] = s[i];
|
||||
if ((state->__count = i) < 3) /* Incomplete sequence? */
|
||||
return -2;
|
||||
ret = 3;
|
||||
goto jis_x_0212;
|
||||
}
|
||||
size_t cnt = min (n, 2);
|
||||
if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, s, cnt, pwc, 1))
|
||||
return cnt;
|
||||
if (n == 1)
|
||||
{
|
||||
state->__count = 1;
|
||||
state->__value.__wchb[0] = *s;
|
||||
return -2;
|
||||
}
|
||||
else if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, s, 1, pwc, 1))
|
||||
return 1;
|
||||
r->_errno = EILSEQ;
|
||||
return -1;
|
||||
}
|
||||
state->__value.__wchb[state->__count++] = *s;
|
||||
ret = 1;
|
||||
jis_x_0212:
|
||||
if (state->__value.__wchb[0] == 0x8f)
|
||||
{
|
||||
if (state->__count == 2)
|
||||
{
|
||||
if (n == 1)
|
||||
return -2;
|
||||
state->__value.__wchb[state->__count] = s[1];
|
||||
ret = 2;
|
||||
}
|
||||
/* Ok, we have a full JIS-X-0212 sequence in mbstate. Convert it
|
||||
to the CP 20932 representation and feed it to MultiByteToWideChar. */
|
||||
state->__value.__wchb[0] = state->__value.__wchb[1];
|
||||
state->__value.__wchb[1] = state->__value.__wchb[2] & 0x7f;
|
||||
}
|
||||
if (!MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS,
|
||||
(const char *) state->__value.__wchb, 2, pwc, 1))
|
||||
{
|
||||
r->_errno = EILSEQ;
|
||||
return -1;
|
||||
}
|
||||
state->__count = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
extern "C" int
|
||||
|
@ -190,7 +298,7 @@ extern "C" int
|
|||
__kr_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
|
||||
const char *charset, mbstate_t *state)
|
||||
{
|
||||
return __db_mbtowc (r, pwc, s, n, 51949, state);
|
||||
return __db_mbtowc (r, pwc, s, n, 949, state);
|
||||
}
|
||||
|
||||
extern "C" int
|
||||
|
@ -265,6 +373,7 @@ __set_charset_from_codepage (UINT cp, char *charset)
|
|||
case 50220:
|
||||
strcpy (charset, "JIS");
|
||||
return __jis_mbtowc;
|
||||
case 20932:
|
||||
case 51932:
|
||||
strcpy (charset, "EUCJP");
|
||||
return __eucjp_mbtowc;
|
||||
|
|
Loading…
Reference in New Issue