* libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Allow CESU-8 surrogate

value encoding. * libc/stdlib/wctomb_r.c (__utf8_mbtowc): Allow CESU-8 surrogate value decoding.
2009-10-03 08:51:07 +00:00 · 2009-10-03 08:51:07 +00:00 · 6ff28fc3b1
parent 9c47bbb6e9
commit 6ff28fc3b1
3 changed files with 42 additions and 38 deletions
--- a/newlib/ChangeLog
+++ b/newlib/ChangeLog
@ -1,3 +1,10 @@
+2009-10-03  Corinna Vinschen  <corinna@vinschen.de>
+
+	* libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Allow CESU-8 surrogate
+	value encoding.
+	* libc/stdlib/wctomb_r.c (__utf8_mbtowc): Allow CESU-8 surrogate
+	value decoding.
+
 2009-09-29  Corinna Vinschen  <corinna@vinschen.de>

 	* libc/locale/locale.c (loadlocale): Allow "C." same as "C-" as locale
--- a/newlib/libc/stdlib/mbtowc_r.c
+++ b/newlib/libc/stdlib/mbtowc_r.c
@ -295,12 +295,6 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state),
      tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12)
 	|    (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6)
 	|     (wchar_t)(ch & 0x3f);
-      /* Check for invalid CESU-8 encoding of UTF-16 surrogate values. */
-      if (tmp >= 0xd800 && tmp <= 0xdfff)
-	{
-	  r->_errno = EILSEQ;
-	  return -1;
-	}
      *pwc = tmp;
      return i;
    }
--- a/newlib/libc/stdlib/wctomb_r.c
+++ b/newlib/libc/stdlib/wctomb_r.c
@ -63,72 +63,75 @@ _DEFUN (__utf8_wctomb, (r, s, wchar, charset, state),
        mbstate_t     *state)
 {
  wint_t wchar = _wchar;
+  int ret = 0;

  if (s == NULL)
    return 0; /* UTF-8 encoding is not state-dependent */

-  if (state->__count == -4 && (wchar < 0xdc00 || wchar >= 0xdfff))
+  if (sizeof (wchar_t) == 2 && state->__count == -4
+      && (wchar < 0xdc00 || wchar >= 0xdfff))
    {
-      /* At this point only the second half of a surrogate pair is valid. */
-      r->_errno = EILSEQ;
-      return -1;
+      /* There's a leftover lone high surrogate.  Write out the CESU-8 value
+	 of the surrogate and proceed to convert the given character.  Note
+	 to return extra 3 bytes. */
+      wchar_t tmp;
+      tmp = (state->__value.__wchb[0] << 16 | state->__value.__wchb[1] << 8)
+	    - 0x10000 >> 10 | 0xd80d;
+      *s++ = 0xe0 | ((tmp & 0xf000) >> 12);
+      *s++ = 0x80 | ((tmp &  0xfc0) >> 6);
+      *s++ = 0x80 |  (tmp &   0x3f);
+      state->__count = 0;
+      ret = 3;
    }
  if (wchar <= 0x7f)
    {
      *s = wchar;
-      return 1;
+      return ret + 1;
    }
  if (wchar >= 0x80 && wchar <= 0x7ff)
    {
      *s++ = 0xc0 | ((wchar & 0x7c0) >> 6);
      *s   = 0x80 |  (wchar &  0x3f);
-      return 2;
+      return ret + 2;
    }
  if (wchar >= 0x800 && wchar <= 0xffff)
    {
-      if (wchar >= 0xd800 && wchar <= 0xdfff)
+      /* No UTF-16 surrogate handling in UCS-4 */
+      if (sizeof (wchar_t) == 2 && wchar >= 0xd800 && wchar <= 0xdfff)
 	{
 	  wint_t tmp;
-	  /* UTF-16 surrogates -- must not occur in normal UCS-4 data */
-	  if (sizeof (wchar_t) != 2)
+	  if (wchar <= 0xdbff)
 	    {
-	      r->_errno = EILSEQ;
-	      return -1;
+	      /* First half of a surrogate pair.  Store the state and
+	         return ret + 0. */
+	      tmp = ((wchar & 0x3ff) << 10) + 0x10000;
+	      state->__value.__wchb[0] = (tmp >> 16) & 0xff;
+	      state->__value.__wchb[1] = (tmp >> 8) & 0xff;
+	      state->__count = -4;
+	      *s = (0xf0 | ((tmp & 0x1c0000) >> 18));
+	      return ret;
 	    }
-	  if (wchar >= 0xdc00)
+	  if (state->__count == -4)
 	    {
-	      /* Second half of a surrogate pair. It's not valid if
-		 we don't have already read a first half of a surrogate
-		 before. */
-	      if (state->__count != -4)
-		{
-		  r->_errno = EILSEQ;
-		  return -1;
-		}
-	      /* If it's valid, reconstruct the full Unicode value and
-		 return the trailing three bytes of the UTF-8 char. */
+	      /* Second half of a surrogate pair.  Reconstruct the full
+		 Unicode value and return the trailing three bytes of the
+		 UTF-8 character. */
 	      tmp = (state->__value.__wchb[0] << 16)
 		    | (state->__value.__wchb[1] << 8)
 		    | (wchar & 0x3ff);
 	      state->__count = 0;
+	      *s++ = 0xf0 | ((tmp & 0x1c0000) >> 18);
 	      *s++ = 0x80 | ((tmp &  0x3f000) >> 12);
 	      *s++ = 0x80 | ((tmp &    0xfc0) >> 6);
 	      *s   = 0x80 |  (tmp &     0x3f);
-	      return 3;
+	      return 4;
 	    }
-	  /* First half of a surrogate pair.  Store the state and return
-	     the first byte of the UTF-8 char. */
-	  tmp = ((wchar & 0x3ff) << 10) + 0x10000;
-	  state->__value.__wchb[0] = (tmp >> 16) & 0xff;
-	  state->__value.__wchb[1] = (tmp >> 8) & 0xff;
-	  state->__count = -4;
-	  *s = (0xf0 | ((tmp & 0x1c0000) >> 18));
-	  return 1;
+	  /* Otherwise translate into CESU-8 value. */
 	}
      *s++ = 0xe0 | ((wchar & 0xf000) >> 12);
      *s++ = 0x80 | ((wchar &  0xfc0) >> 6);
      *s   = 0x80 |  (wchar &   0x3f);
-      return 3;
+      return ret + 3;
    }
  if (wchar >= 0x10000 && wchar <= 0x10ffff)
    {