* strfuncs.cc: Add comment to explain why we can't support JIS

for now. (__db_wctomb): Alwaus use WC_NO_BEST_FIT_CHARS. (__jis_wctomb): Just call __ascii_wctomb from here. (__eucjp_wctomb): Convert to standalone implementation to fix up the difference between eucJP and CP 20932 affecting JIS-X-0212 characters. Explain. (__kr_wctomb): Use codepage 949. (__db_mbtowc): Reorder code slightly. Always use MB_ERR_INVALID_CHARS in call to MultiByteToWideChar. Fix a problem with singlebyte sequences. Fix a bug in '\0' handling. Reset state->__count on successful return from non-zero state. (__jis_mbtowc): Just call __ascii_mbtowc from here. (__eucjp_mbtowc): Convert to standalone implementation to fix up the difference between eucJP and CP 20932 affecting JIS-X-0212 characters. (__kr_mbtowc): Use codepage 949. (__set_charset_from_codepage): Handle codepage 20932 as eucJP.
2009-04-06 10:50:11 +00:00 · 2009-04-06 10:50:11 +00:00 · 2f9ee8fc90
parent 2f1769f431
commit 2f9ee8fc90
2 changed files with 169 additions and 40 deletions
--- a/winsup/cygwin/ChangeLog
+++ b/winsup/cygwin/ChangeLog
@ -1,3 +1,23 @@
+2009-04-06  Corinna Vinschen  <corinna@vinschen.de>
+
+	* strfuncs.cc: Add comment to explain why we can't support JIS
+	for now.
+	(__db_wctomb): Alwaus use WC_NO_BEST_FIT_CHARS.
+	(__jis_wctomb): Just call __ascii_wctomb from here.
+	(__eucjp_wctomb): Convert to standalone implementation to fix up the
+	difference between eucJP and CP 20932 affecting JIS-X-0212 characters.
+	Explain.
+	(__kr_wctomb): Use codepage 949.
+	(__db_mbtowc): Reorder code slightly.  Always use MB_ERR_INVALID_CHARS
+	in call to MultiByteToWideChar.  Fix a problem with singlebyte
+	sequences.  Fix a bug in '\0' handling.  Reset state->__count on
+	successful return from non-zero state.
+	(__jis_mbtowc): Just call __ascii_mbtowc from here.
+	(__eucjp_mbtowc): Convert to standalone implementation to fix up the
+	difference between eucJP and CP 20932 affecting JIS-X-0212 characters.
+	(__kr_mbtowc): Use codepage 949.
+	(__set_charset_from_codepage): Handle codepage 20932 as eucJP.
+
 2009-04-05  Christopher Faylor  <me+cygwin@cgf.cx>

 	* Makefile.in: Use all compile options when calculating magic values.
--- a/winsup/cygwin/strfuncs.cc
+++ b/winsup/cygwin/strfuncs.cc
@ -22,12 +22,18 @@ details. */
 #include "cygheap.h"
 #include "tls_pbuf.h"

-/* The SJIS, JIS and EUCJP conversion in newlib does not use UTF as
+/* The SJIS, JIS and eucJP conversion in newlib does not use UTF as
   wchar_t character representation.  That's unfortunate for us since
   we require UTF for the OS.  What we do here is to have our own
   implementation of the base functions for the conversion using
   the MulitByteToWideChar/WideCharToMultiByte functions. */

+/* FIXME: We can't support JIS (ISO-2022-JP) at all right now.  It's a
+   stateful charset encoding.  The translation from mbtowc to
+   MulitByteToWideChar is quite complex.  Given that we support SJIS and
+   eucJP, the both most used Japanese charset encodings, this shouldn't
+   be such a big problem. */
+
 /* GBK, eucKR, and Big5 conversions are not available so far in newlib. */

 static int
@ -43,8 +49,8 @@ __db_wctomb (struct _reent *r, char *s, wchar_t wchar, UINT cp)
    }

  BOOL def_used = false;
-  int ret = WideCharToMultiByte (cp, cp > 50000 ? 0 : WC_NO_BEST_FIT_CHARS,
-				 &wchar, 1, s, MB_CUR_MAX, NULL, &def_used);
+  int ret = WideCharToMultiByte (cp, WC_NO_BEST_FIT_CHARS, &wchar, 1, s,
+				 MB_CUR_MAX, NULL, &def_used);
  if (ret > 0 && !def_used)
    return ret;

@ -59,18 +65,59 @@ __sjis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
  return __db_wctomb (r,s, wchar, 932);
 }

+extern "C" int __ascii_wctomb (struct _reent *, char *, wchar_t, const char *,
+			       mbstate_t *);
 extern "C" int
 __jis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 	       mbstate_t *state)
 {
-  return __db_wctomb (r,s, wchar, 50220);
+  /* FIXME: See comment at start of file. */
+  return __ascii_wctomb (r, s, wchar, charset, state);
 }

 extern "C" int
 __eucjp_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 	       mbstate_t *state)
 {
-  return __db_wctomb (r,s, wchar, 51932);
+  /* Unfortunately, the Windows eucJP codepage 20932 is not really 100%
+     compatible to eucJP.  It's a cute approximation which makes it a
+     doublebyte codepage.
+     The JIS-X-0212 three byte codes (0x8f,0xa1-0xfe,0xa1-0xfe) are folded
+     into two byte codes as follows: The 0x8f is stripped, the next byte is
+     taken as is, the third byte is mapped into the lower 7-bit area by
+     masking it with 0x7f.  So, for instance, the eucJP code 0x8f,0xdd,0xf8
+     becomes 0xdd,0x78 in CP 20932.
+
+     To be really eucJP compatible, we have to map the JIS-X-0212 characters
+     between CP 20932 and eucJP ourselves. */
+  if (s == NULL)
+    return 0;
+
+  if (wchar < 0x80)
+    {
+      *s = (char) wchar;
+      return 1;
+    }
+
+  BOOL def_used = false;
+  int ret = WideCharToMultiByte (20932, WC_NO_BEST_FIT_CHARS, &wchar, 1, s,
+				 MB_CUR_MAX, NULL, &def_used);
+  if (ret > 0 && !def_used)
+    {
+      /* CP20932 representation of JIS-X-0212 character? */
+      if (ret == 2 && (unsigned char) s[1] <= 0x7f)
+	{
+	  /* Yes, convert to eucJP three byte sequence */
+	  s[2] = s[1] | 0x80;
+	  s[1] = s[0];
+	  s[0] = 0x8f;
+	  ++ret;
+	}
+      return ret;
+    }
+
+  r->_errno = EILSEQ;
+  return -1;
 }

 extern "C" int
@ -84,7 +131,7 @@ extern "C" int
 __kr_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 	       mbstate_t *state)
 {
-  return __db_wctomb (r,s, wchar, 51949);
+  return __db_wctomb (r,s, wchar, 949);
 }

 extern "C" int
@ -95,21 +142,20 @@ __big5_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 }

 static int
-__db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
-	     UINT cp, mbstate_t *state)
+__db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, UINT cp,
+	     mbstate_t *state)
 {
  wchar_t dummy;
-  char buf[2];
  int ret;
-  
-  if (pwc == NULL)
-    pwc = &dummy;

  if (s == NULL)
    return 0;  /* not state-dependent */

  if (n == 0)
    return -2;
+  
+  if (pwc == NULL)
+    pwc = &dummy;

  if (state->__count == 0)
    {
@ -118,44 +164,35 @@ __db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
 	  *pwc = *(unsigned char *) s;
 	  return *s ? 1 : 0;
 	}
-      ret = MultiByteToWideChar (cp, cp > 50000 ? 0 : MB_ERR_INVALID_CHARS,
-				 s, 2, pwc, 1);
+      size_t cnt = min (n, 2);
+      ret = MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, s, cnt, pwc, 1);
      if (ret)
-	return *s ? 2 : 0;
+	return cnt;
      if (n == 1)
 	{
-	  state->__count = 1;
+	  state->__count = n;
 	  state->__value.__wchb[0] = *s;
 	  return -2;
 	}
-      else
-	{
-	  /* These Win32 functions are really crappy.  Assuming n is 2
-	     but the first byte is a singlebyte charcode, the function
-	     does not convert that byte and return 1, rather it just
-	     returns 0.  So, what we do here is to check if the first
-	     byte returns a valid value... */
-	  ret = MultiByteToWideChar (cp,
-				     cp > 50000 ? 0 : MB_ERR_INVALID_CHARS,
-				     s, 1, pwc, 1);
-	  if (ret)
-	    return *s ? 1 : 0;
-	}
+      /* These Win32 functions are really crappy.  Assuming n is 2 but the
+	 first byte is a singlebyte charcode, the function does not convert
+	 that byte and return 1, rather it just returns 0.  So, what we do
+	 here is to check if the first byte returns a valid value... */
+      else if (MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, s, 1, pwc, 1))
+	return 1;
      r->_errno = EILSEQ;
      return -1;
    }
-  if (!*s)
-    return -2;
-  buf[0] = state->__value.__wchb[0];
-  buf[1] = *s;
-  ret = MultiByteToWideChar (cp, cp > 50000 ? 0 : MB_ERR_INVALID_CHARS,
-			     buf, 2, pwc, 1);
+  state->__value.__wchb[state->__count] = *s;
+  ret = MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS,
+			     (const char *) state->__value.__wchb, 2, pwc, 1);
  if (!ret)
    {
      r->_errno = EILSEQ;
      return -1;
    }
-  return ret;
+  state->__count = 0;
+  return 1;
 }

 extern "C" int
@ -169,14 +206,85 @@ extern "C" int
 __jis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
 	       const char *charset, mbstate_t *state)
 {
-  return __db_mbtowc (r, pwc, s, n, 50220, state);
+  /* FIXME: See comment at start of file. */
+  return __ascii_mbtowc (r, pwc, s, n, charset, state);
 }

 extern "C" int
 __eucjp_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
-	       const char *charset, mbstate_t *state)
+		const char *charset, mbstate_t *state)
 {
-  return __db_mbtowc (r, pwc, s, n, 51932, state);
+  /* See comment in __eucjp_wctomb above. */
+  wchar_t dummy;
+  int ret = 0;
+
+  if (s == NULL)
+    return 0;  /* not state-dependent */
+
+  if (n == 0)
+    return -2;
+  
+  if (pwc == NULL)
+    pwc = &dummy;
+
+  if (state->__count == 0)
+    {
+      if (*(unsigned char *) s < 0x80)
+	{
+	  *pwc = *(unsigned char *) s;
+	  return *s ? 1 : 0;
+	}
+      if (*(unsigned char *) s == 0x8f)	/* JIS-X-0212 lead byte? */
+	{
+	  /* Yes.  Store sequence in mbstate and handle in the __count != 0
+	     case at the end of the function. */
+	  size_t i;
+	  for (i = 0; i < 3 && i < n; i++)
+	    state->__value.__wchb[i] = s[i];
+	  if ((state->__count = i) < 3)	/* Incomplete sequence? */
+	    return -2;
+	  ret = 3;
+	  goto jis_x_0212;
+	}
+      size_t cnt = min (n, 2);
+      if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, s, cnt, pwc, 1))
+	return cnt;
+      if (n == 1)
+	{
+	  state->__count = 1;
+	  state->__value.__wchb[0] = *s;
+	  return -2;
+	}
+      else if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, s, 1, pwc, 1))
+	return 1;
+      r->_errno = EILSEQ;
+      return -1;
+    }
+  state->__value.__wchb[state->__count++] = *s;
+  ret = 1;
+jis_x_0212:
+  if (state->__value.__wchb[0] == 0x8f)
+    {
+      if (state->__count == 2)
+	{
+	  if (n == 1)
+	    return -2;
+	  state->__value.__wchb[state->__count] = s[1];
+	  ret = 2;
+	}
+      /* Ok, we have a full JIS-X-0212 sequence in mbstate.  Convert it
+	 to the CP 20932 representation and feed it to MultiByteToWideChar. */
+      state->__value.__wchb[0] = state->__value.__wchb[1];
+      state->__value.__wchb[1] = state->__value.__wchb[2] & 0x7f;
+    }
+  if (!MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS,
+			    (const char *) state->__value.__wchb, 2, pwc, 1))
+    {
+      r->_errno = EILSEQ;
+      return -1;
+    }
+  state->__count = 0;
+  return ret;
 }

 extern "C" int
@ -190,7 +298,7 @@ extern "C" int
 __kr_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
 	       const char *charset, mbstate_t *state)
 {
-  return __db_mbtowc (r, pwc, s, n, 51949, state);
+  return __db_mbtowc (r, pwc, s, n, 949, state);
 }

 extern "C" int
@ -265,6 +373,7 @@ __set_charset_from_codepage (UINT cp, char *charset)
    case 50220:
      strcpy (charset, "JIS");
      return __jis_mbtowc;
+    case 20932:
    case 51932:
      strcpy (charset, "EUCJP");
      return __eucjp_mbtowc;