From a657970571be50681055aa60289e35f312dea761 Mon Sep 17 00:00:00 2001
From: Corinna Vinschen <corinna@vinschen.de>
Date: Mon, 2 Nov 2009 11:42:04 +0000
Subject: [PATCH] 	* miscfuncs.h (transform_chars): Declare.  Define
 inline variation here. 	* mount.cc (mount_info::from_fstab): Remove
 extern declaration of 	transform_chars. 	* path.cc (tfx_chars): Move to
 strfuncs.cc. 	(transform_chars): Ditto. 	* strfunc.cc (tfx_chars):
 Moved here from path.cc. 	(transform_chars): Ditto. 
 (sys_cp_wcstombs): Make UNICODE private use area conversion roundtrip 	save
 for all characters. 	(sys_cp_mbstowcs): Ditto, by removing special case for
 UTF-8 sequences 	representing U+f0XX UNICODE chars.  Fix typo in
 comment.

---
 winsup/cygwin/ChangeLog   | 16 ++++++++-
 winsup/cygwin/miscfuncs.h |  8 +++++
 winsup/cygwin/mount.cc    |  1 -
 winsup/cygwin/path.cc     | 57 --------------------------------
 winsup/cygwin/strfuncs.cc | 69 ++++++++++++++++++++++++++++++++++-----
 5 files changed, 83 insertions(+), 68 deletions(-)

diff --git a/winsup/cygwin/ChangeLog b/winsup/cygwin/ChangeLog
index b5b93ac13..99fcb9dc0 100644
--- a/winsup/cygwin/ChangeLog
+++ b/winsup/cygwin/ChangeLog
@@ -1,3 +1,17 @@
+2009-11-02  Corinna Vinschen  <corinna@vinschen.de>
+
+	* miscfuncs.h (transform_chars): Declare.  Define inline variation here.
+	* mount.cc (mount_info::from_fstab): Remove extern declaration of
+	transform_chars.
+	* path.cc (tfx_chars): Move to strfuncs.cc.
+	(transform_chars): Ditto.
+	* strfuncs.cc (tfx_chars): Moved here from path.cc.
+	(transform_chars): Ditto.
+	(sys_cp_wcstombs): Make UNICODE private use area conversion roundtrip
+	safe for all characters.
+	(sys_cp_mbstowcs): Ditto, by removing special case for UTF-8 sequences
+	representing U+f0XX UNICODE chars.  Fix typo in comment.
+
 2009-11-02  Corinna Vinschen  <corinna@vinschen.de>
 
 	* path.cc (tfx_chars): Constify.
@@ -362,7 +376,7 @@
 	(fhandler_console::write_normal): Always use codepage 437 for alternate
 	charset.  Otherwise always default to the current internal locale.
 	Replace ASCII SO with ASCII CAN.
-	* strfuncs.cc: Tweka comments according to below changes.
+	* strfuncs.cc: Tweak comments according to below changes.
 	(sys_cp_wcstombs): Constify charset parameter.  Convert all wchar_t
 	values in the Unicode private use area U+F0xx to the singlebyte
 	counterpart.  Drop special handling creating ASCII SO sequence from
diff --git a/winsup/cygwin/miscfuncs.h b/winsup/cygwin/miscfuncs.h
index 4755d724a..489f8b745 100644
--- a/winsup/cygwin/miscfuncs.h
+++ b/winsup/cygwin/miscfuncs.h
@@ -25,6 +25,14 @@ void backslashify (const char *, char *, bool);
 void slashify (const char *, char *, bool);
 #define isslash(c) ((c) == '/')
 
+extern void transform_chars (PWCHAR, PWCHAR);
+inline void
+transform_chars (PUNICODE_STRING upath, USHORT start_idx)
+{
+  transform_chars (upath->Buffer + start_idx,
+		   upath->Buffer + upath->Length / sizeof (WCHAR) - 1);
+}
+
 /* Memory checking */
 int __stdcall check_invalid_virtual_addr (const void *s, unsigned sz) __attribute__ ((regparm(2)));
 
diff --git a/winsup/cygwin/mount.cc b/winsup/cygwin/mount.cc
index 8c9b72a2f..b99a9b811 100644
--- a/winsup/cygwin/mount.cc
+++ b/winsup/cygwin/mount.cc
@@ -997,7 +997,6 @@ mount_info::from_fstab (bool user, WCHAR fstab[], PWCHAR fstab_end)
 
   if (user)
     {
-      extern void transform_chars (PWCHAR, PWCHAR);
       PWCHAR username;
       sys_mbstowcs (username = wcpcpy (fstab_end, L".d\\"),
 		    NT_MAX_PATH - (fstab_end - fstab),
diff --git a/winsup/cygwin/path.cc b/winsup/cygwin/path.cc
index fdc42d372..1f95073b9 100644
--- a/winsup/cygwin/path.cc
+++ b/winsup/cygwin/path.cc
@@ -395,63 +395,6 @@ path_conv::set_normalized_path (const char *path_copy)
     }
 }
 
-/* Transform characters invalid for Windows filenames to the Unicode private
-   use area in the U+f0XX range.  The affected characters are all control
-   chars 1 <= c <= 31, as well as the characters " * : < > ? |.  The backslash
-   is affected as well, but we can't transform it as long as we accept Win32
-   paths as input.
-   The reverse functionality is in strfuncs.cc, function sys_cp_wcstombs. */
-static const WCHAR tfx_chars[] = {
-            0, 0xf000 |   1, 0xf000 |   2, 0xf000 |   3,
- 0xf000 |   4, 0xf000 |   5, 0xf000 |   6, 0xf000 |   7,
- 0xf000 |   8, 0xf000 |   9, 0xf000 |  10, 0xf000 |  11,
- 0xf000 |  12, 0xf000 |  13, 0xf000 |  14, 0xf000 |  15,
- 0xf000 |  16, 0xf000 |  17, 0xf000 |  18, 0xf000 |  19,
- 0xf000 |  20, 0xf000 |  21, 0xf000 |  22, 0xf000 |  23,
- 0xf000 |  24, 0xf000 |  25, 0xf000 |  26, 0xf000 |  27,
- 0xf000 |  28, 0xf000 |  29, 0xf000 |  30, 0xf000 |  31,
-          ' ',          '!', 0xf000 | '"',          '#',
-          '$',          '%',          '&',           39,
-          '(',          ')', 0xf000 | '*',          '+',
-          ',',          '-',          '.',          '\\',
-          '0',          '1',          '2',          '3',
-          '4',          '5',          '6',          '7',
-          '8',          '9', 0xf000 | ':',          ';',
- 0xf000 | '<',          '=', 0xf000 | '>', 0xf000 | '?',
-          '@',          'A',          'B',          'C',
-          'D',          'E',          'F',          'G',
-          'H',          'I',          'J',          'K',
-          'L',          'M',          'N',          'O',
-          'P',          'Q',          'R',          'S',
-          'T',          'U',          'V',          'W',
-          'X',          'Y',          'Z',          '[',
-          '\\',          ']',          '^',          '_',
-          '`',          'a',          'b',          'c',
-          'd',          'e',          'f',          'g',
-          'h',          'i',          'j',          'k',
-          'l',          'm',          'n',          'o',
-          'p',          'q',          'r',          's',
-          't',          'u',          'v',          'w',
-          'x',          'y',          'z',          '{',
- 0xf000 | '|',          '}',          '~',          127
-};
-
-void
-transform_chars (PWCHAR path, PWCHAR path_end)
-{
-  for (; path <= path_end; ++path)
-    if (*path < 128)
-      *path = tfx_chars[*path];
-}
-
-static inline
-void
-transform_chars (PUNICODE_STRING upath, USHORT start_idx)
-{
-  transform_chars (upath->Buffer + start_idx,
-		   upath->Buffer + upath->Length / sizeof (WCHAR) - 1);
-}
-
 static inline void
 str2uni_cat (UNICODE_STRING &tgt, const char *srcstr)
 {
diff --git a/winsup/cygwin/strfuncs.cc b/winsup/cygwin/strfuncs.cc
index 009af1769..61df6505a 100644
--- a/winsup/cygwin/strfuncs.cc
+++ b/winsup/cygwin/strfuncs.cc
@@ -22,6 +22,55 @@ details. */
 #include "cygheap.h"
 #include "tls_pbuf.h"
 
+/* Transform characters invalid for Windows filenames to the Unicode private
+   use area in the U+f0XX range.  The affected characters are all control
+   chars 1 <= c <= 31, as well as the characters " * : < > ? |.  The backslash
+   is affected as well, but we can't transform it as long as we accept Win32
+   paths as input.
+   The reverse functionality is in function sys_cp_wcstombs. */
+static const WCHAR tfx_chars[] = {
+            0, 0xf000 |   1, 0xf000 |   2, 0xf000 |   3,
+ 0xf000 |   4, 0xf000 |   5, 0xf000 |   6, 0xf000 |   7,
+ 0xf000 |   8, 0xf000 |   9, 0xf000 |  10, 0xf000 |  11,
+ 0xf000 |  12, 0xf000 |  13, 0xf000 |  14, 0xf000 |  15,
+ 0xf000 |  16, 0xf000 |  17, 0xf000 |  18, 0xf000 |  19,
+ 0xf000 |  20, 0xf000 |  21, 0xf000 |  22, 0xf000 |  23,
+ 0xf000 |  24, 0xf000 |  25, 0xf000 |  26, 0xf000 |  27,
+ 0xf000 |  28, 0xf000 |  29, 0xf000 |  30, 0xf000 |  31,
+          ' ',          '!', 0xf000 | '"',          '#',
+          '$',          '%',          '&',           39,
+          '(',          ')', 0xf000 | '*',          '+',
+          ',',          '-',          '.',          '\\',
+          '0',          '1',          '2',          '3',
+          '4',          '5',          '6',          '7',
+          '8',          '9', 0xf000 | ':',          ';',
+ 0xf000 | '<',          '=', 0xf000 | '>', 0xf000 | '?',
+          '@',          'A',          'B',          'C',
+          'D',          'E',          'F',          'G',
+          'H',          'I',          'J',          'K',
+          'L',          'M',          'N',          'O',
+          'P',          'Q',          'R',          'S',
+          'T',          'U',          'V',          'W',
+          'X',          'Y',          'Z',          '[',
+          '\\',          ']',          '^',          '_',
+          '`',          'a',          'b',          'c',
+          'd',          'e',          'f',          'g',
+          'h',          'i',          'j',          'k',
+          'l',          'm',          'n',          'o',
+          'p',          'q',          'r',          's',
+          't',          'u',          'v',          'w',
+          'x',          'y',          'z',          '{',
+ 0xf000 | '|',          '}',          '~',          127
+};
+
+void
+transform_chars (PWCHAR path, PWCHAR path_end)
+{
+  for (; path <= path_end; ++path)
+    if (*path < 128)
+      *path = tfx_chars[*path];
+}
+
 /* The SJIS, JIS and eucJP conversion in newlib does not use UTF as
    wchar_t character representation.  That's unfortunate for us since
    we require UTF for the OS.  What we do here is to have our own
@@ -426,16 +475,19 @@ sys_cp_wcstombs (wctomb_p f_wctomb, const char *charset, char *dst, size_t len,
     {
       wchar_t pw = *pwcs;
       int bytes;
+      unsigned char cwc;
 
       /* Convert UNICODE private use area.  Reverse functionality for the
-         ASCII area <= 0x7f (only for path names) is transform_chars in
-	 path.cc.  Reverse functionality for invalid bytes in a multibyte
-	 sequence is in sys_cp_mbstowcs. */
-      if ((pw & 0xff00) == 0xf000 && ((pw & 0xff) <= 0x7f || MB_CUR_MAX > 1))
+         ASCII area <= 0x7f (only for path names) is transform_chars above.
+	 Reverse functionality for invalid bytes in a multibyte sequence is
+	 in sys_cp_mbstowcs below. */
+      if ((pw & 0xff00) == 0xf000
+	  && (((cwc = (pw & 0xff)) <= 0x7f && tfx_chars[cwc] >= 0xf000)
+	      || (cwc >= 0x80 && MB_CUR_MAX > 1)))
 	{
-	  buf[0] = pw & 0xff;
+	  buf[0] = (char) cwc;
 	  bytes = 1;
-      	}
+	}
       else
 	{
 	  bytes = f_wctomb (_REENT, buf, pw, charset, &ps);
@@ -603,15 +655,14 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
 	    }
 	}
       else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
-				  charset, &ps)) < 0
-	       || (bytes == 3 && pmbs[0] == 0xef && (pmbs[1] & 0xf4) == 0x80))
+				  charset, &ps)) < 0)
 	{
 	  /* The technique is based on a discussion here:
 	     http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
 
 	     Invalid bytes in a multibyte secuence are converted to
 	     the private use area which is already used to store ASCII
-	     chars invalid in Windows filenames.  This techinque allows 
+	     chars invalid in Windows filenames.  This technique allows 
 	     to store them in a symmetric way. */
 	  bytes = 1;
 	  if (dst)