From a657970571be50681055aa60289e35f312dea761 Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Mon, 2 Nov 2009 11:42:04 +0000 Subject: [PATCH] * miscfuncs.h (transform_chars): Declare. Define inline variation here. * mount.cc (mount_info::from_fstab): Remove extern declaration of transform_chars. * path.cc (tfx_chars): Move to strfuncs.cc. (transform_chars): Ditto. * strfunc.cc (tfx_chars): Moved here from path.cc. (transform_chars): Ditto. (sys_cp_wcstombs): Make UNICODE private use area conversion roundtrip save for all characters. (sys_cp_mbstowcs): Ditto, by removing special case for UTF-8 sequences representing U+f0XX UNICODE chars. Fix typo in comment. --- winsup/cygwin/ChangeLog | 16 ++++++++- winsup/cygwin/miscfuncs.h | 8 +++++ winsup/cygwin/mount.cc | 1 - winsup/cygwin/path.cc | 57 -------------------------------- winsup/cygwin/strfuncs.cc | 69 ++++++++++++++++++++++++++++++++++----- 5 files changed, 83 insertions(+), 68 deletions(-) diff --git a/winsup/cygwin/ChangeLog b/winsup/cygwin/ChangeLog index b5b93ac13..99fcb9dc0 100644 --- a/winsup/cygwin/ChangeLog +++ b/winsup/cygwin/ChangeLog @@ -1,3 +1,17 @@ +2009-11-02 Corinna Vinschen + + * miscfuncs.h (transform_chars): Declare. Define inline variation here. + * mount.cc (mount_info::from_fstab): Remove extern declaration of + transform_chars. + * path.cc (tfx_chars): Move to strfuncs.cc. + (transform_chars): Ditto. + * strfunc.cc (tfx_chars): Moved here from path.cc. + (transform_chars): Ditto. + (sys_cp_wcstombs): Make UNICODE private use area conversion roundtrip + save for all characters. + (sys_cp_mbstowcs): Ditto, by removing special case for UTF-8 sequences + representing U+f0XX UNICODE chars. Fix typo in comment. + 2009-11-02 Corinna Vinschen * path.cc (tfx_chars): Constify. @@ -362,7 +376,7 @@ (fhandler_console::write_normal): Always use codepage 437 for alternate charset. Otherwise always default to the current internal locale. Replace ASCII SO with ASCII CAN. 
- * strfuncs.cc: Tweka comments according to below changes. + * strfuncs.cc: Tweak comments according to below changes. (sys_cp_wcstombs): Constify charset parameter. Convert all wchar_t values in the Unicode private use area U+F0xx to the singlebyte counterpart. Drop special handling creating ASCII SO sequence from diff --git a/winsup/cygwin/miscfuncs.h b/winsup/cygwin/miscfuncs.h index 4755d724a..489f8b745 100644 --- a/winsup/cygwin/miscfuncs.h +++ b/winsup/cygwin/miscfuncs.h @@ -25,6 +25,14 @@ void backslashify (const char *, char *, bool); void slashify (const char *, char *, bool); #define isslash(c) ((c) == '/') +extern void transform_chars (PWCHAR, PWCHAR); +inline void +transform_chars (PUNICODE_STRING upath, USHORT start_idx) +{ + transform_chars (upath->Buffer + start_idx, + upath->Buffer + upath->Length / sizeof (WCHAR) - 1); +} + /* Memory checking */ int __stdcall check_invalid_virtual_addr (const void *s, unsigned sz) __attribute__ ((regparm(2))); diff --git a/winsup/cygwin/mount.cc b/winsup/cygwin/mount.cc index 8c9b72a2f..b99a9b811 100644 --- a/winsup/cygwin/mount.cc +++ b/winsup/cygwin/mount.cc @@ -997,7 +997,6 @@ mount_info::from_fstab (bool user, WCHAR fstab[], PWCHAR fstab_end) if (user) { - extern void transform_chars (PWCHAR, PWCHAR); PWCHAR username; sys_mbstowcs (username = wcpcpy (fstab_end, L".d\\"), NT_MAX_PATH - (fstab_end - fstab), diff --git a/winsup/cygwin/path.cc b/winsup/cygwin/path.cc index fdc42d372..1f95073b9 100644 --- a/winsup/cygwin/path.cc +++ b/winsup/cygwin/path.cc @@ -395,63 +395,6 @@ path_conv::set_normalized_path (const char *path_copy) } } -/* Transform characters invalid for Windows filenames to the Unicode private - use area in the U+f0XX range. The affected characters are all control - chars 1 <= c <= 31, as well as the characters " * : < > ? |. The backslash - is affected as well, but we can't transform it as long as we accept Win32 - paths as input. 
- The reverse functionality is in strfuncs.cc, function sys_cp_wcstombs. */ -static const WCHAR tfx_chars[] = { - 0, 0xf000 | 1, 0xf000 | 2, 0xf000 | 3, - 0xf000 | 4, 0xf000 | 5, 0xf000 | 6, 0xf000 | 7, - 0xf000 | 8, 0xf000 | 9, 0xf000 | 10, 0xf000 | 11, - 0xf000 | 12, 0xf000 | 13, 0xf000 | 14, 0xf000 | 15, - 0xf000 | 16, 0xf000 | 17, 0xf000 | 18, 0xf000 | 19, - 0xf000 | 20, 0xf000 | 21, 0xf000 | 22, 0xf000 | 23, - 0xf000 | 24, 0xf000 | 25, 0xf000 | 26, 0xf000 | 27, - 0xf000 | 28, 0xf000 | 29, 0xf000 | 30, 0xf000 | 31, - ' ', '!', 0xf000 | '"', '#', - '$', '%', '&', 39, - '(', ')', 0xf000 | '*', '+', - ',', '-', '.', '\\', - '0', '1', '2', '3', - '4', '5', '6', '7', - '8', '9', 0xf000 | ':', ';', - 0xf000 | '<', '=', 0xf000 | '>', 0xf000 | '?', - '@', 'A', 'B', 'C', - 'D', 'E', 'F', 'G', - 'H', 'I', 'J', 'K', - 'L', 'M', 'N', 'O', - 'P', 'Q', 'R', 'S', - 'T', 'U', 'V', 'W', - 'X', 'Y', 'Z', '[', - '\\', ']', '^', '_', - '`', 'a', 'b', 'c', - 'd', 'e', 'f', 'g', - 'h', 'i', 'j', 'k', - 'l', 'm', 'n', 'o', - 'p', 'q', 'r', 's', - 't', 'u', 'v', 'w', - 'x', 'y', 'z', '{', - 0xf000 | '|', '}', '~', 127 -}; - -void -transform_chars (PWCHAR path, PWCHAR path_end) -{ - for (; path <= path_end; ++path) - if (*path < 128) - *path = tfx_chars[*path]; -} - -static inline -void -transform_chars (PUNICODE_STRING upath, USHORT start_idx) -{ - transform_chars (upath->Buffer + start_idx, - upath->Buffer + upath->Length / sizeof (WCHAR) - 1); -} - static inline void str2uni_cat (UNICODE_STRING &tgt, const char *srcstr) { diff --git a/winsup/cygwin/strfuncs.cc b/winsup/cygwin/strfuncs.cc index 009af1769..61df6505a 100644 --- a/winsup/cygwin/strfuncs.cc +++ b/winsup/cygwin/strfuncs.cc @@ -22,6 +22,55 @@ details. */ #include "cygheap.h" #include "tls_pbuf.h" +/* Transform characters invalid for Windows filenames to the Unicode private + use area in the U+f0XX range. The affected characters are all control + chars 1 <= c <= 31, as well as the characters " * : < > ? |. 
The backslash + is affected as well, but we can't transform it as long as we accept Win32 + paths as input. + The reverse functionality is in function sys_cp_wcstombs. */ +static const WCHAR tfx_chars[] = { + 0, 0xf000 | 1, 0xf000 | 2, 0xf000 | 3, + 0xf000 | 4, 0xf000 | 5, 0xf000 | 6, 0xf000 | 7, + 0xf000 | 8, 0xf000 | 9, 0xf000 | 10, 0xf000 | 11, + 0xf000 | 12, 0xf000 | 13, 0xf000 | 14, 0xf000 | 15, + 0xf000 | 16, 0xf000 | 17, 0xf000 | 18, 0xf000 | 19, + 0xf000 | 20, 0xf000 | 21, 0xf000 | 22, 0xf000 | 23, + 0xf000 | 24, 0xf000 | 25, 0xf000 | 26, 0xf000 | 27, + 0xf000 | 28, 0xf000 | 29, 0xf000 | 30, 0xf000 | 31, + ' ', '!', 0xf000 | '"', '#', + '$', '%', '&', 39, + '(', ')', 0xf000 | '*', '+', + ',', '-', '.', '\\', + '0', '1', '2', '3', + '4', '5', '6', '7', + '8', '9', 0xf000 | ':', ';', + 0xf000 | '<', '=', 0xf000 | '>', 0xf000 | '?', + '@', 'A', 'B', 'C', + 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', + 'L', 'M', 'N', 'O', + 'P', 'Q', 'R', 'S', + 'T', 'U', 'V', 'W', + 'X', 'Y', 'Z', '[', + '\\', ']', '^', '_', + '`', 'a', 'b', 'c', + 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', + 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', + 't', 'u', 'v', 'w', + 'x', 'y', 'z', '{', + 0xf000 | '|', '}', '~', 127 +}; + +void +transform_chars (PWCHAR path, PWCHAR path_end) +{ + for (; path <= path_end; ++path) + if (*path < 128) + *path = tfx_chars[*path]; +} + /* The SJIS, JIS and eucJP conversion in newlib does not use UTF as wchar_t character representation. That's unfortunate for us since we require UTF for the OS. What we do here is to have our own @@ -426,16 +475,19 @@ sys_cp_wcstombs (wctomb_p f_wctomb, const char *charset, char *dst, size_t len, { wchar_t pw = *pwcs; int bytes; + unsigned char cwc; /* Convert UNICODE private use area. Reverse functionality for the - ASCII area <= 0x7f (only for path names) is transform_chars in - path.cc. Reverse functionality for invalid bytes in a multibyte - sequence is in sys_cp_mbstowcs. 
*/ - if ((pw & 0xff00) == 0xf000 && ((pw & 0xff) <= 0x7f || MB_CUR_MAX > 1)) + ASCII area <= 0x7f (only for path names) is transform_chars above. + Reverse functionality for invalid bytes in a multibyte sequence is + in sys_cp_mbstowcs below. */ + if ((pw & 0xff00) == 0xf000 + && (((cwc = (pw & 0xff)) <= 0x7f && tfx_chars[cwc] >= 0xf000) + || (cwc >= 0x80 && MB_CUR_MAX > 1))) { - buf[0] = pw & 0xff; + buf[0] = (char) cwc; bytes = 1; - } + } else { bytes = f_wctomb (_REENT, buf, pw, charset, &ps); @@ -603,15 +655,14 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst, } } else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms, - charset, &ps)) < 0 - || (bytes == 3 && pmbs[0] == 0xef && (pmbs[1] & 0xf4) == 0x80)) + charset, &ps)) < 0) { /* The technique is based on a discussion here: http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html Invalid bytes in a multibyte secuence are converted to the private use area which is already used to store ASCII - chars invalid in Windows filenames. This techinque allows + chars invalid in Windows filenames. This technique allows to store them in a symmetric way. */ bytes = 1; if (dst)