2013-07-20 01:28:34 +08:00
|
|
|
/* strfuncs.cc: string functions
|
2007-08-02 22:21:53 +08:00
|
|
|
|
2013-01-21 12:38:31 +08:00
|
|
|
Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
|
2013-11-24 20:13:36 +08:00
|
|
|
2007, 2008, 2009, 2010, 2011, 2012, 2013 Red Hat, Inc.
|
2007-08-02 22:21:53 +08:00
|
|
|
|
|
|
|
This file is part of Cygwin.
|
|
|
|
|
|
|
|
This software is a copyrighted work licensed under the terms of the
|
|
|
|
Cygwin license. Please consult the file "CYGWIN_LICENSE" for
|
|
|
|
details. */
|
|
|
|
|
|
|
|
#include "winsup.h"
|
2008-02-01 04:26:01 +08:00
|
|
|
#include <stdlib.h>
|
2012-03-08 17:36:11 +08:00
|
|
|
#include <sys/param.h>
|
2008-05-14 18:21:22 +08:00
|
|
|
#include <wchar.h>
|
2007-08-12 20:48:02 +08:00
|
|
|
#include <ntdll.h>
|
2008-02-01 04:26:01 +08:00
|
|
|
#include "path.h"
|
|
|
|
#include "fhandler.h"
|
|
|
|
#include "dtable.h"
|
|
|
|
#include "cygheap.h"
|
2007-08-02 22:21:53 +08:00
|
|
|
|
2009-11-02 19:42:04 +08:00
|
|
|
/* Transform characters invalid for Windows filenames to the Unicode private
|
|
|
|
use area in the U+f0XX range. The affected characters are all control
|
|
|
|
chars 1 <= c <= 31, as well as the characters " * : < > ? |. The backslash
|
|
|
|
is affected as well, but we can't transform it as long as we accept Win32
|
2010-04-23 19:07:35 +08:00
|
|
|
paths as input. */
|
2009-11-02 19:42:04 +08:00
|
|
|
/* Forward lookup table used by transform_chars, indexed by the ASCII code
   (0..127).  Entries of the form 0xf000 | c move the character into the
   Unicode private use area; all other entries map the character to itself,
   except for index 0x2f ('/'), which holds a backslash.  */
static const WCHAR tfx_chars[] = {
  /* 0x00 */ 0,            0xf000 |  1,  0xf000 |  2,  0xf000 |  3,
             0xf000 |  4,  0xf000 |  5,  0xf000 |  6,  0xf000 |  7,
  /* 0x08 */ 0xf000 |  8,  0xf000 |  9,  0xf000 | 10,  0xf000 | 11,
             0xf000 | 12,  0xf000 | 13,  0xf000 | 14,  0xf000 | 15,
  /* 0x10 */ 0xf000 | 16,  0xf000 | 17,  0xf000 | 18,  0xf000 | 19,
             0xf000 | 20,  0xf000 | 21,  0xf000 | 22,  0xf000 | 23,
  /* 0x18 */ 0xf000 | 24,  0xf000 | 25,  0xf000 | 26,  0xf000 | 27,
             0xf000 | 28,  0xf000 | 29,  0xf000 | 30,  0xf000 | 31,
  /* 0x20 */ ' ',  '!',  0xf000 | '"',  '#',  '$',  '%',  '&',  39,
  /* 0x28 */ '(',  ')',  0xf000 | '*',  '+',  ',',  '-',  '.',  '\\',
  /* 0x30 */ '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
  /* 0x38 */ '8',  '9',  0xf000 | ':',  ';',
             0xf000 | '<',  '=',  0xf000 | '>',  0xf000 | '?',
  /* 0x40 */ '@',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
  /* 0x48 */ 'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
  /* 0x50 */ 'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
  /* 0x58 */ 'X',  'Y',  'Z',  '[',  '\\', ']',  '^',  '_',
  /* 0x60 */ '`',  'a',  'b',  'c',  'd',  'e',  'f',  'g',
  /* 0x68 */ 'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
  /* 0x70 */ 'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
  /* 0x78 */ 'x',  'y',  'z',  '{',  0xf000 | '|',  '}',  '~',  127
};
|
|
|
|
|
2015-12-18 19:42:40 +08:00
|
|
|
/* This is the table for the reverse functionality in sys_wcstombs.
|
2010-04-23 19:07:35 +08:00
|
|
|
It differs deliberately in two code places (space and dot) to allow
|
|
|
|
converting back space and dot on filesystems only supporting DOS
|
|
|
|
filenames. */
|
|
|
|
/* Lookup table consulted by sys_wcstombs, indexed by the low byte (0..127)
   of a private use area character.  An entry >= 0xf000 marks a character
   that sys_wcstombs converts back from the private use area to plain
   ASCII.  Unlike the forward table, space (0x20) and dot (0x2e) are marked
   as well.  */
static const WCHAR tfx_rev_chars[] = {
  /* 0x00 */ 0,            0xf000 |  1,  0xf000 |  2,  0xf000 |  3,
             0xf000 |  4,  0xf000 |  5,  0xf000 |  6,  0xf000 |  7,
  /* 0x08 */ 0xf000 |  8,  0xf000 |  9,  0xf000 | 10,  0xf000 | 11,
             0xf000 | 12,  0xf000 | 13,  0xf000 | 14,  0xf000 | 15,
  /* 0x10 */ 0xf000 | 16,  0xf000 | 17,  0xf000 | 18,  0xf000 | 19,
             0xf000 | 20,  0xf000 | 21,  0xf000 | 22,  0xf000 | 23,
  /* 0x18 */ 0xf000 | 24,  0xf000 | 25,  0xf000 | 26,  0xf000 | 27,
             0xf000 | 28,  0xf000 | 29,  0xf000 | 30,  0xf000 | 31,
  /* 0x20 */ 0xf000 | ' ',  '!',  0xf000 | '"',  '#',  '$',  '%',  '&',  39,
  /* 0x28 */ '(',  ')',  0xf000 | '*',  '+',  ',',  '-',  0xf000 | '.',  '\\',
  /* 0x30 */ '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
  /* 0x38 */ '8',  '9',  0xf000 | ':',  ';',
             0xf000 | '<',  '=',  0xf000 | '>',  0xf000 | '?',
  /* 0x40 */ '@',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
  /* 0x48 */ 'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
  /* 0x50 */ 'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
  /* 0x58 */ 'X',  'Y',  'Z',  '[',  '\\', ']',  '^',  '_',
  /* 0x60 */ '`',  'a',  'b',  'c',  'd',  'e',  'f',  'g',
  /* 0x68 */ 'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
  /* 0x70 */ 'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
  /* 0x78 */ 'x',  'y',  'z',  '{',  0xf000 | '|',  '}',  '~',  127
};
|
|
|
|
|
2009-11-02 19:42:04 +08:00
|
|
|
void
|
|
|
|
transform_chars (PWCHAR path, PWCHAR path_end)
|
|
|
|
{
|
|
|
|
for (; path <= path_end; ++path)
|
|
|
|
if (*path < 128)
|
|
|
|
*path = tfx_chars[*path];
|
|
|
|
}
|
|
|
|
|
2009-04-06 18:50:11 +08:00
|
|
|
/* The SJIS, JIS and eucJP conversion in newlib does not use UTF as
|
2009-03-24 20:18:34 +08:00
|
|
|
wchar_t character representation. That's unfortunate for us since
|
|
|
|
we require UTF for the OS. What we do here is to have our own
|
|
|
|
implementation of the base functions for the conversion using
|
|
|
|
the MultiByteToWideChar/WideCharToMultiByte functions. */
|
|
|
|
|
2009-04-06 18:50:11 +08:00
|
|
|
/* FIXME: We can't support JIS (ISO-2022-JP) at all right now. It's a
|
|
|
|
stateful charset encoding. The translation from mbtowc to
|
|
|
|
MultiByteToWideChar is quite complex. Given that we support SJIS and
|
|
|
|
eucJP, the two most used Japanese charset encodings, this shouldn't
|
|
|
|
be such a big problem. */
|
|
|
|
|
2009-03-26 18:26:57 +08:00
|
|
|
/* GBK, eucKR, and Big5 conversions are not available so far in newlib. */
|
2009-03-24 20:18:34 +08:00
|
|
|
|
|
|
|
static int
|
|
|
|
__db_wctomb (struct _reent *r, char *s, wchar_t wchar, UINT cp)
|
|
|
|
{
|
|
|
|
if (s == NULL)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (wchar < 0x80)
|
|
|
|
{
|
|
|
|
*s = (char) wchar;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
BOOL def_used = false;
|
2009-04-06 18:50:11 +08:00
|
|
|
int ret = WideCharToMultiByte (cp, WC_NO_BEST_FIT_CHARS, &wchar, 1, s,
|
2009-05-15 03:49:37 +08:00
|
|
|
2, NULL, &def_used);
|
2009-03-24 20:18:34 +08:00
|
|
|
if (ret > 0 && !def_used)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
r->_errno = EILSEQ;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
extern "C" int
|
|
|
|
__sjis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
|
|
|
|
mbstate_t *state)
|
2007-08-02 22:21:53 +08:00
|
|
|
{
|
2009-03-24 20:18:34 +08:00
|
|
|
return __db_wctomb (r,s, wchar, 932);
|
2008-02-06 01:37:10 +08:00
|
|
|
}
|
|
|
|
|
2009-03-24 20:18:34 +08:00
|
|
|
extern "C" int
|
|
|
|
__eucjp_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
|
|
|
|
mbstate_t *state)
|
|
|
|
{
|
2009-04-06 18:50:11 +08:00
|
|
|
/* Unfortunately, the Windows eucJP codepage 20932 is not really 100%
|
|
|
|
compatible to eucJP. It's a cute approximation which makes it a
|
|
|
|
doublebyte codepage.
|
|
|
|
The JIS-X-0212 three byte codes (0x8f,0xa1-0xfe,0xa1-0xfe) are folded
|
|
|
|
into two byte codes as follows: The 0x8f is stripped, the next byte is
|
|
|
|
taken as is, the third byte is mapped into the lower 7-bit area by
|
|
|
|
masking it with 0x7f. So, for instance, the eucJP code 0x8f,0xdd,0xf8
|
|
|
|
becomes 0xdd,0x78 in CP 20932.
|
|
|
|
|
|
|
|
To be really eucJP compatible, we have to map the JIS-X-0212 characters
|
|
|
|
between CP 20932 and eucJP ourselves. */
|
|
|
|
if (s == NULL)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (wchar < 0x80)
|
|
|
|
{
|
|
|
|
*s = (char) wchar;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
BOOL def_used = false;
|
|
|
|
int ret = WideCharToMultiByte (20932, WC_NO_BEST_FIT_CHARS, &wchar, 1, s,
|
2009-05-15 03:49:37 +08:00
|
|
|
3, NULL, &def_used);
|
2009-04-06 18:50:11 +08:00
|
|
|
if (ret > 0 && !def_used)
|
|
|
|
{
|
|
|
|
/* CP20932 representation of JIS-X-0212 character? */
|
|
|
|
if (ret == 2 && (unsigned char) s[1] <= 0x7f)
|
|
|
|
{
|
|
|
|
/* Yes, convert to eucJP three byte sequence */
|
|
|
|
s[2] = s[1] | 0x80;
|
|
|
|
s[1] = s[0];
|
|
|
|
s[0] = 0x8f;
|
|
|
|
++ret;
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
r->_errno = EILSEQ;
|
|
|
|
return -1;
|
2009-03-24 20:18:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
extern "C" int
|
|
|
|
__gbk_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
|
|
|
|
mbstate_t *state)
|
|
|
|
{
|
|
|
|
return __db_wctomb (r,s, wchar, 936);
|
|
|
|
}
|
|
|
|
|
|
|
|
extern "C" int
|
|
|
|
__kr_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
|
|
|
|
mbstate_t *state)
|
|
|
|
{
|
2009-04-06 18:50:11 +08:00
|
|
|
return __db_wctomb (r,s, wchar, 949);
|
2009-03-24 20:18:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
extern "C" int
|
|
|
|
__big5_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
|
|
|
|
mbstate_t *state)
|
|
|
|
{
|
|
|
|
return __db_wctomb (r,s, wchar, 950);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2009-04-06 18:50:11 +08:00
|
|
|
__db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, UINT cp,
|
|
|
|
mbstate_t *state)
|
2009-03-24 20:18:34 +08:00
|
|
|
{
|
|
|
|
wchar_t dummy;
|
2007-08-02 22:21:53 +08:00
|
|
|
int ret;
|
|
|
|
|
2009-03-24 20:18:34 +08:00
|
|
|
if (s == NULL)
|
|
|
|
return 0; /* not state-dependent */
|
|
|
|
|
|
|
|
if (n == 0)
|
|
|
|
return -2;
|
2009-07-01 05:18:44 +08:00
|
|
|
|
2009-04-06 18:50:11 +08:00
|
|
|
if (pwc == NULL)
|
|
|
|
pwc = &dummy;
|
2009-03-24 20:18:34 +08:00
|
|
|
|
|
|
|
if (state->__count == 0)
|
|
|
|
{
|
|
|
|
if (*(unsigned char *) s < 0x80)
|
|
|
|
{
|
|
|
|
*pwc = *(unsigned char *) s;
|
|
|
|
return *s ? 1 : 0;
|
|
|
|
}
|
2012-03-08 17:36:11 +08:00
|
|
|
size_t cnt = MIN (n, 2);
|
2009-04-06 18:50:11 +08:00
|
|
|
ret = MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, s, cnt, pwc, 1);
|
2009-03-24 20:18:34 +08:00
|
|
|
if (ret)
|
2009-04-06 18:50:11 +08:00
|
|
|
return cnt;
|
2009-03-24 20:18:34 +08:00
|
|
|
if (n == 1)
|
|
|
|
{
|
2009-04-06 18:50:11 +08:00
|
|
|
state->__count = n;
|
2009-03-24 20:18:34 +08:00
|
|
|
state->__value.__wchb[0] = *s;
|
|
|
|
return -2;
|
|
|
|
}
|
2009-04-06 18:50:11 +08:00
|
|
|
/* These Win32 functions are really crappy. Assuming n is 2 but the
|
|
|
|
first byte is a singlebyte charcode, the function does not convert
|
|
|
|
that byte and return 1, rather it just returns 0. So, what we do
|
|
|
|
here is to check if the first byte returns a valid value... */
|
|
|
|
else if (MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, s, 1, pwc, 1))
|
|
|
|
return 1;
|
2009-03-24 20:18:34 +08:00
|
|
|
r->_errno = EILSEQ;
|
|
|
|
return -1;
|
|
|
|
}
|
2009-04-06 18:50:11 +08:00
|
|
|
state->__value.__wchb[state->__count] = *s;
|
|
|
|
ret = MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS,
|
|
|
|
(const char *) state->__value.__wchb, 2, pwc, 1);
|
2009-03-24 20:18:34 +08:00
|
|
|
if (!ret)
|
2007-10-24 00:26:28 +08:00
|
|
|
{
|
2009-03-24 20:18:34 +08:00
|
|
|
r->_errno = EILSEQ;
|
|
|
|
return -1;
|
2007-10-24 00:26:28 +08:00
|
|
|
}
|
2009-04-06 18:50:11 +08:00
|
|
|
state->__count = 0;
|
|
|
|
return 1;
|
2007-08-02 22:21:53 +08:00
|
|
|
}
|
|
|
|
|
2009-03-24 20:18:34 +08:00
|
|
|
extern "C" int
|
|
|
|
__sjis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
|
|
|
|
const char *charset, mbstate_t *state)
|
|
|
|
{
|
2010-01-24 20:29:49 +08:00
|
|
|
return __db_mbtowc (r, pwc, s, n, 932, state);
|
2009-03-24 20:18:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
extern "C" int
|
|
|
|
__eucjp_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
|
2009-04-06 18:50:11 +08:00
|
|
|
const char *charset, mbstate_t *state)
|
2009-03-24 20:18:34 +08:00
|
|
|
{
|
2009-04-06 18:50:11 +08:00
|
|
|
/* See comment in __eucjp_wctomb above. */
|
|
|
|
wchar_t dummy;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
if (s == NULL)
|
|
|
|
return 0; /* not state-dependent */
|
|
|
|
|
|
|
|
if (n == 0)
|
|
|
|
return -2;
|
2009-07-01 05:18:44 +08:00
|
|
|
|
2009-04-06 18:50:11 +08:00
|
|
|
if (pwc == NULL)
|
|
|
|
pwc = &dummy;
|
|
|
|
|
|
|
|
if (state->__count == 0)
|
|
|
|
{
|
|
|
|
if (*(unsigned char *) s < 0x80)
|
|
|
|
{
|
|
|
|
*pwc = *(unsigned char *) s;
|
|
|
|
return *s ? 1 : 0;
|
|
|
|
}
|
|
|
|
if (*(unsigned char *) s == 0x8f) /* JIS-X-0212 lead byte? */
|
|
|
|
{
|
|
|
|
/* Yes. Store sequence in mbstate and handle in the __count != 0
|
|
|
|
case at the end of the function. */
|
|
|
|
size_t i;
|
|
|
|
for (i = 0; i < 3 && i < n; i++)
|
|
|
|
state->__value.__wchb[i] = s[i];
|
|
|
|
if ((state->__count = i) < 3) /* Incomplete sequence? */
|
|
|
|
return -2;
|
|
|
|
ret = 3;
|
|
|
|
goto jis_x_0212;
|
|
|
|
}
|
2012-03-08 17:36:11 +08:00
|
|
|
size_t cnt = MIN (n, 2);
|
2009-04-06 18:50:11 +08:00
|
|
|
if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, s, cnt, pwc, 1))
|
|
|
|
return cnt;
|
|
|
|
if (n == 1)
|
|
|
|
{
|
|
|
|
state->__count = 1;
|
|
|
|
state->__value.__wchb[0] = *s;
|
|
|
|
return -2;
|
|
|
|
}
|
|
|
|
else if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, s, 1, pwc, 1))
|
|
|
|
return 1;
|
|
|
|
r->_errno = EILSEQ;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
state->__value.__wchb[state->__count++] = *s;
|
|
|
|
ret = 1;
|
|
|
|
jis_x_0212:
|
|
|
|
if (state->__value.__wchb[0] == 0x8f)
|
|
|
|
{
|
|
|
|
if (state->__count == 2)
|
|
|
|
{
|
|
|
|
if (n == 1)
|
|
|
|
return -2;
|
|
|
|
state->__value.__wchb[state->__count] = s[1];
|
|
|
|
ret = 2;
|
|
|
|
}
|
|
|
|
/* Ok, we have a full JIS-X-0212 sequence in mbstate. Convert it
|
|
|
|
to the CP 20932 representation and feed it to MultiByteToWideChar. */
|
|
|
|
state->__value.__wchb[0] = state->__value.__wchb[1];
|
|
|
|
state->__value.__wchb[1] = state->__value.__wchb[2] & 0x7f;
|
|
|
|
}
|
|
|
|
if (!MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS,
|
|
|
|
(const char *) state->__value.__wchb, 2, pwc, 1))
|
|
|
|
{
|
|
|
|
r->_errno = EILSEQ;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
state->__count = 0;
|
|
|
|
return ret;
|
2009-03-24 20:18:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
extern "C" int
|
|
|
|
__gbk_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
|
|
|
|
const char *charset, mbstate_t *state)
|
|
|
|
{
|
|
|
|
return __db_mbtowc (r, pwc, s, n, 936, state);
|
|
|
|
}
|
|
|
|
|
|
|
|
extern "C" int
|
|
|
|
__kr_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
|
|
|
|
const char *charset, mbstate_t *state)
|
|
|
|
{
|
2009-04-06 18:50:11 +08:00
|
|
|
return __db_mbtowc (r, pwc, s, n, 949, state);
|
2009-03-24 20:18:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
extern "C" int
|
|
|
|
__big5_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
|
|
|
|
const char *charset, mbstate_t *state)
|
|
|
|
{
|
|
|
|
return __db_mbtowc (r, pwc, s, n, 950, state);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Our own sys_wcstombs/sys_mbstowcs functions differ from the
|
|
|
|
wcstombs/mbstowcs API in three ways:
|
|
|
|
|
|
|
|
- The UNICODE private use area is used in filenames to specify
|
|
|
|
characters not allowed in Windows filenames ('*', '?', etc).
|
|
|
|
The sys_wcstombs converts characters in the private use area
|
|
|
|
back to the corresponding ASCII chars.
|
|
|
|
|
|
|
|
- If a wide character in a filename has no representation in the current
|
|
|
|
multibyte charset, then usually you wouldn't be able to access the
|
|
|
|
file. To fix this problem, sys_wcstombs creates a replacement multibyte
|
|
|
|
sequence for the non-representable wide-char. The sequence starts with
|
2009-09-28 20:10:32 +08:00
|
|
|
an ASCII CAN (0x18, Ctrl-X), followed by the UTF-8 representation of the
|
|
|
|
character. The sys_(cp_)mbstowcs function detects ASCII CAN characters
|
2009-03-24 20:18:34 +08:00
|
|
|
in the input multibyte string and converts the following multibyte
|
|
|
|
sequence by treating it as a UTF-8 char. If that fails, the ASCII
|
2009-09-28 20:10:32 +08:00
|
|
|
CAN was probably standalone and it gets just copied over as ASCII CAN.
|
2009-03-24 20:18:34 +08:00
|
|
|
|
2012-07-03 04:17:27 +08:00
|
|
|
- Three cases have to be distinguished for the return value:
|
|
|
|
|
|
|
|
- dst == NULL; len is ignored, the return value is the number of bytes
|
|
|
|
required for the string without the trailing NUL, just like the return
|
|
|
|
value of the wcstombs function.
|
|
|
|
|
|
|
|
- dst != NULL, len == (size_t) -1; the return value is the size in bytes
|
|
|
|
of the destination string without the trailing NUL. If the incoming
|
|
|
|
wide char string was not NUL-terminated, the target string won't be
|
|
|
|
NUL-terminated either.
|
|
|
|
|
|
|
|
- dst != NULL; len != (size_t) -1; the return value is the size in bytes
|
|
|
|
of the destination string without the trailing NUL. The target string
|
|
|
|
will be NUL-terminated, no matter what. If the result is truncated due
|
|
|
|
to buffer size, it's a bug in Cygwin and the buffer in the calling
|
|
|
|
function should be raised.
|
|
|
|
*/
|
2013-05-01 09:20:37 +08:00
|
|
|
size_t __reg3
|
2015-12-18 19:42:40 +08:00
|
|
|
sys_wcstombs (char *dst, size_t len, const wchar_t *src, size_t nwc)
|
2009-03-24 20:18:34 +08:00
|
|
|
{
|
|
|
|
char buf[10];
|
|
|
|
char *ptr = dst;
|
|
|
|
wchar_t *pwcs = (wchar_t *) src;
|
|
|
|
size_t n = 0;
|
|
|
|
mbstate_t ps;
|
2009-05-09 04:28:20 +08:00
|
|
|
save_errno save;
|
2015-12-18 19:42:40 +08:00
|
|
|
wctomb_p f_wctomb = cygheap->locale.wctomb;
|
|
|
|
const char *charset = cygheap->locale.charset;
|
2009-03-24 20:18:34 +08:00
|
|
|
|
|
|
|
memset (&ps, 0, sizeof ps);
|
|
|
|
if (dst == NULL)
|
|
|
|
len = (size_t) -1;
|
|
|
|
while (n < len && nwc-- > 0)
|
|
|
|
{
|
|
|
|
wchar_t pw = *pwcs;
|
2009-09-28 20:10:32 +08:00
|
|
|
int bytes;
|
2009-11-02 19:42:04 +08:00
|
|
|
unsigned char cwc;
|
2009-09-28 20:10:32 +08:00
|
|
|
|
|
|
|
/* Convert UNICODE private use area. Reverse functionality for the
|
2011-06-06 13:02:13 +08:00
|
|
|
ASCII area <= 0x7f (only for path names) is transform_chars above.
|
2009-11-02 19:42:04 +08:00
|
|
|
Reverse functionality for invalid bytes in a multibyte sequence is
|
|
|
|
in sys_cp_mbstowcs below. */
|
|
|
|
if ((pw & 0xff00) == 0xf000
|
2010-04-23 19:07:35 +08:00
|
|
|
&& (((cwc = (pw & 0xff)) <= 0x7f && tfx_rev_chars[cwc] >= 0xf000)
|
2009-11-02 19:42:04 +08:00
|
|
|
|| (cwc >= 0x80 && MB_CUR_MAX > 1)))
|
2009-06-04 01:23:39 +08:00
|
|
|
{
|
2009-11-02 19:42:04 +08:00
|
|
|
buf[0] = (char) cwc;
|
2009-09-28 20:10:32 +08:00
|
|
|
bytes = 1;
|
2009-11-02 19:42:04 +08:00
|
|
|
}
|
2009-09-28 20:10:32 +08:00
|
|
|
else
|
2009-07-01 05:18:44 +08:00
|
|
|
{
|
2009-09-28 20:10:32 +08:00
|
|
|
bytes = f_wctomb (_REENT, buf, pw, charset, &ps);
|
|
|
|
if (bytes == -1 && *charset != 'U'/*TF-8*/)
|
2009-03-24 20:18:34 +08:00
|
|
|
{
|
2009-09-28 20:10:32 +08:00
|
|
|
/* Convert chars invalid in the current codepage to a sequence
|
|
|
|
ASCII CAN; UTF-8 representation of invalid char. */
|
|
|
|
buf[0] = 0x18; /* ASCII CAN */
|
|
|
|
bytes = __utf8_wctomb (_REENT, buf + 1, pw, charset, &ps);
|
|
|
|
if (bytes == -1)
|
2009-03-24 20:18:34 +08:00
|
|
|
{
|
|
|
|
++pwcs;
|
|
|
|
ps.__count = 0;
|
|
|
|
continue;
|
|
|
|
}
|
2009-09-28 20:10:32 +08:00
|
|
|
++bytes; /* Add the ASCII CAN to the byte count. */
|
|
|
|
if (ps.__count == -4 && nwc > 0)
|
|
|
|
{
|
|
|
|
/* First half of a surrogate pair. */
|
|
|
|
++pwcs;
|
|
|
|
if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */
|
|
|
|
{
|
|
|
|
++pwcs;
|
|
|
|
ps.__count = 0;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, charset,
|
|
|
|
&ps);
|
|
|
|
nwc--;
|
|
|
|
}
|
2009-03-24 20:18:34 +08:00
|
|
|
}
|
2009-07-01 05:18:44 +08:00
|
|
|
}
|
2009-03-24 20:18:34 +08:00
|
|
|
if (n + bytes <= len)
|
2009-07-01 05:18:44 +08:00
|
|
|
{
|
|
|
|
if (dst)
|
|
|
|
{
|
|
|
|
for (int i = 0; i < bytes; ++i)
|
|
|
|
*ptr++ = buf[i];
|
|
|
|
}
|
|
|
|
if (*pwcs++ == 0x00)
|
2009-03-24 20:18:34 +08:00
|
|
|
break;
|
2015-10-22 20:22:07 +08:00
|
|
|
n += bytes;
|
2009-07-01 05:18:44 +08:00
|
|
|
}
|
2009-03-24 20:18:34 +08:00
|
|
|
else
|
2009-07-01 05:18:44 +08:00
|
|
|
break;
|
2009-03-24 20:18:34 +08:00
|
|
|
}
|
2012-07-03 04:17:27 +08:00
|
|
|
if (n && dst && len != (size_t) -1)
|
2009-03-24 20:18:34 +08:00
|
|
|
{
|
|
|
|
n = (n < len) ? n : len - 1;
|
|
|
|
dst[n] = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
return n;
|
|
|
|
}
|
|
|
|
|
2008-02-01 04:26:01 +08:00
|
|
|
/* Allocate a buffer big enough for the string, always including the
|
2009-03-24 20:18:34 +08:00
|
|
|
terminating '\0'. The buffer pointer is returned in *dst_p, the return
|
2008-02-01 04:26:01 +08:00
|
|
|
value is the number of bytes written to the buffer, as usual.
|
|
|
|
The "type" argument determines where the resulting buffer is stored.
|
|
|
|
It's either one of the cygheap_types values, or it's "HEAP_NOTHEAP".
|
2008-02-04 20:00:19 +08:00
|
|
|
In the latter case the allocation uses simple calloc.
|
2008-02-16 01:53:11 +08:00
|
|
|
|
2008-02-04 20:00:19 +08:00
|
|
|
Note that this code is shared by cygserver (which requires it via
|
2008-02-16 01:53:11 +08:00
|
|
|
__small_vsprintf) and so when built there plain calloc is the
|
2008-02-04 20:00:19 +08:00
|
|
|
only choice. */
|
2013-05-01 09:20:37 +08:00
|
|
|
size_t __reg3
|
2009-04-08 00:22:55 +08:00
|
|
|
sys_wcstombs_alloc (char **dst_p, int type, const wchar_t *src, size_t nwc)
|
2008-02-01 04:26:01 +08:00
|
|
|
{
|
2009-03-24 20:18:34 +08:00
|
|
|
size_t ret;
|
2008-02-01 04:26:01 +08:00
|
|
|
|
2009-03-24 20:18:34 +08:00
|
|
|
ret = sys_wcstombs (NULL, (size_t) -1, src, nwc);
|
|
|
|
if (ret > 0)
|
2008-02-01 04:26:01 +08:00
|
|
|
{
|
2009-03-24 20:18:34 +08:00
|
|
|
size_t dlen = ret + 1;
|
2008-02-01 04:26:01 +08:00
|
|
|
|
|
|
|
if (type == HEAP_NOTHEAP)
|
2009-03-24 20:18:34 +08:00
|
|
|
*dst_p = (char *) calloc (dlen, sizeof (char));
|
2008-02-01 04:26:01 +08:00
|
|
|
else
|
2009-03-24 20:18:34 +08:00
|
|
|
*dst_p = (char *) ccalloc ((cygheap_types) type, dlen, sizeof (char));
|
|
|
|
if (!*dst_p)
|
2008-02-16 01:53:11 +08:00
|
|
|
return 0;
|
2009-03-24 20:18:34 +08:00
|
|
|
ret = sys_wcstombs (*dst_p, dlen, src, nwc);
|
2008-02-01 04:26:01 +08:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2009-03-24 20:18:34 +08:00
|
|
|
/* sys_cp_mbstowcs is actually most of the time called as sys_mbstowcs with
|
|
|
|
a 0 codepage. If cp is not 0, the codepage is evaluated and used for the
|
|
|
|
conversion. This is so that fhandler_console can switch to an alternate
|
|
|
|
charset, which is the charset returned by GetConsoleCP (). Most of the
|
|
|
|
time this is used for box and line drawing characters. */
|
2013-05-01 09:20:37 +08:00
|
|
|
size_t __reg3
|
2009-09-28 20:10:32 +08:00
|
|
|
sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
|
|
|
|
size_t dlen, const char *src, size_t nms)
|
2007-08-02 22:21:53 +08:00
|
|
|
{
|
2009-03-24 20:18:34 +08:00
|
|
|
wchar_t *ptr = dst;
|
2009-05-31 11:59:38 +08:00
|
|
|
unsigned const char *pmbs = (unsigned const char *) src;
|
2009-03-24 20:18:34 +08:00
|
|
|
size_t count = 0;
|
|
|
|
size_t len = dlen;
|
|
|
|
int bytes;
|
|
|
|
mbstate_t ps;
|
2009-05-09 04:28:20 +08:00
|
|
|
save_errno save;
|
2009-03-24 20:18:34 +08:00
|
|
|
|
|
|
|
memset (&ps, 0, sizeof ps);
|
|
|
|
if (dst == NULL)
|
|
|
|
len = (size_t)-1;
|
2009-03-25 00:42:36 +08:00
|
|
|
while (len > 0 && nms > 0)
|
2008-02-26 02:32:23 +08:00
|
|
|
{
|
2009-09-28 20:10:32 +08:00
|
|
|
/* ASCII CAN handling. */
|
|
|
|
if (*pmbs == 0x18)
|
2009-03-24 20:18:34 +08:00
|
|
|
{
|
2009-09-28 20:10:32 +08:00
|
|
|
/* Sanity check: If this is a lead CAN byte for a following UTF-8
|
2009-09-23 19:31:00 +08:00
|
|
|
sequence, there must be at least two more bytes left, and the
|
|
|
|
next byte must be a valid UTF-8 start byte. If the charset
|
|
|
|
isn't UTF-8 anyway, try to convert the following bytes as UTF-8
|
|
|
|
sequence. */
|
|
|
|
if (nms > 2 && pmbs[1] >= 0xc2 && pmbs[1] <= 0xf4 && *charset != 'U'/*TF-8*/)
|
2009-03-24 20:18:34 +08:00
|
|
|
{
|
2009-09-23 19:31:00 +08:00
|
|
|
bytes = __utf8_mbtowc (_REENT, ptr, (const char *) pmbs + 1,
|
|
|
|
nms - 1, charset, &ps);
|
|
|
|
if (bytes < 0)
|
|
|
|
{
|
2009-09-28 20:10:32 +08:00
|
|
|
/* Invalid UTF-8 sequence? Treat the ASCII CAN character as
|
|
|
|
stand-alone ASCII CAN char. */
|
2009-09-23 19:31:00 +08:00
|
|
|
bytes = 1;
|
|
|
|
if (dst)
|
2009-09-28 20:10:32 +08:00
|
|
|
*ptr = 0x18;
|
2009-09-23 19:31:00 +08:00
|
|
|
memset (&ps, 0, sizeof ps);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2009-09-28 20:10:32 +08:00
|
|
|
++bytes; /* Count CAN byte */
|
2009-09-23 19:31:00 +08:00
|
|
|
if (bytes > 1 && ps.__count == 4)
|
|
|
|
{
|
|
|
|
/* First half of a surrogate. */
|
|
|
|
wchar_t *ptr2 = dst ? ptr + 1 : NULL;
|
|
|
|
int bytes2 = __utf8_mbtowc (_REENT, ptr2,
|
|
|
|
(const char *) pmbs + bytes,
|
|
|
|
nms - bytes, charset, &ps);
|
|
|
|
if (bytes2 < 0)
|
|
|
|
memset (&ps, 0, sizeof ps);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
bytes += bytes2;
|
|
|
|
++count;
|
|
|
|
ptr = dst ? ptr + 1 : NULL;
|
|
|
|
--len;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2009-09-28 20:10:32 +08:00
|
|
|
/* Otherwise it's just a simple ASCII CAN. */
|
2009-09-23 19:31:00 +08:00
|
|
|
else
|
2009-03-24 20:18:34 +08:00
|
|
|
{
|
2009-09-23 19:31:00 +08:00
|
|
|
bytes = 1;
|
|
|
|
if (dst)
|
2009-09-28 20:10:32 +08:00
|
|
|
*ptr = 0x18;
|
2009-03-24 20:18:34 +08:00
|
|
|
}
|
|
|
|
}
|
2009-06-04 01:23:39 +08:00
|
|
|
else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
|
2009-11-02 19:42:04 +08:00
|
|
|
charset, &ps)) < 0)
|
2009-05-31 11:59:38 +08:00
|
|
|
{
|
2009-09-28 20:10:32 +08:00
|
|
|
/* The technique is based on a discussion here:
|
2009-05-31 11:59:38 +08:00
|
|
|
http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
|
|
|
|
|
2009-09-28 20:10:32 +08:00
|
|
|
Invalid bytes in a multibyte secuence are converted to
|
|
|
|
the private use area which is already used to store ASCII
|
2011-05-02 23:28:35 +08:00
|
|
|
chars invalid in Windows filenames. This technque allows
|
2009-09-28 20:10:32 +08:00
|
|
|
to store them in a symmetric way. */
|
2009-09-23 19:31:00 +08:00
|
|
|
bytes = 1;
|
2009-06-04 01:23:39 +08:00
|
|
|
if (dst)
|
2009-09-28 20:10:32 +08:00
|
|
|
*ptr = L'\xf000' | *pmbs;
|
2009-09-22 17:44:32 +08:00
|
|
|
memset (&ps, 0, sizeof ps);
|
2009-05-31 11:59:38 +08:00
|
|
|
}
|
|
|
|
|
2009-03-24 20:18:34 +08:00
|
|
|
if (bytes > 0)
|
2009-07-01 05:18:44 +08:00
|
|
|
{
|
|
|
|
pmbs += bytes;
|
|
|
|
nms -= bytes;
|
|
|
|
++count;
|
2009-03-24 20:18:34 +08:00
|
|
|
ptr = dst ? ptr + 1 : NULL;
|
2009-07-01 05:18:44 +08:00
|
|
|
--len;
|
|
|
|
}
|
2009-03-24 20:18:34 +08:00
|
|
|
else
|
|
|
|
{
|
|
|
|
if (bytes == 0)
|
|
|
|
++count;
|
|
|
|
break;
|
|
|
|
}
|
2008-02-26 02:32:23 +08:00
|
|
|
}
|
2009-03-24 20:18:34 +08:00
|
|
|
|
|
|
|
if (count && dst)
|
|
|
|
{
|
|
|
|
count = (count < dlen) ? count : dlen - 1;
|
|
|
|
dst[count] = L'\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
return count;
|
2007-08-02 22:21:53 +08:00
|
|
|
}
|
2007-08-12 20:48:02 +08:00
|
|
|
|
2013-05-01 09:20:37 +08:00
|
|
|
size_t __reg3
|
2009-05-15 03:49:37 +08:00
|
|
|
sys_mbstowcs (wchar_t * dst, size_t dlen, const char *src, size_t nms)
|
|
|
|
{
|
|
|
|
return sys_cp_mbstowcs (cygheap->locale.mbtowc, cygheap->locale.charset,
|
|
|
|
dst, dlen, src, nms);
|
|
|
|
}
|
|
|
|
|
2008-02-01 04:26:01 +08:00
|
|
|
/* Same as sys_wcstombs_alloc, just backwards. */
|
2013-05-01 09:20:37 +08:00
|
|
|
size_t __reg3
|
2009-04-08 00:22:55 +08:00
|
|
|
sys_mbstowcs_alloc (wchar_t **dst_p, int type, const char *src, size_t nms)
|
2008-02-01 04:26:01 +08:00
|
|
|
{
|
2009-03-24 20:18:34 +08:00
|
|
|
size_t ret;
|
2008-02-01 04:26:01 +08:00
|
|
|
|
2009-03-24 20:18:34 +08:00
|
|
|
ret = sys_mbstowcs (NULL, (size_t) -1, src, nms);
|
|
|
|
if (ret > 0)
|
2008-02-01 04:26:01 +08:00
|
|
|
{
|
2009-03-24 20:18:34 +08:00
|
|
|
size_t dlen = ret + 1;
|
2008-02-26 02:32:23 +08:00
|
|
|
|
2008-02-01 04:26:01 +08:00
|
|
|
if (type == HEAP_NOTHEAP)
|
2009-04-08 00:22:55 +08:00
|
|
|
*dst_p = (wchar_t *) calloc (dlen, sizeof (wchar_t));
|
2008-02-01 04:26:01 +08:00
|
|
|
else
|
2009-04-08 00:22:55 +08:00
|
|
|
*dst_p = (wchar_t *) ccalloc ((cygheap_types) type, dlen,
|
|
|
|
sizeof (wchar_t));
|
2009-03-24 20:18:34 +08:00
|
|
|
if (!*dst_p)
|
2008-02-16 01:53:11 +08:00
|
|
|
return 0;
|
2009-03-24 20:18:34 +08:00
|
|
|
ret = sys_mbstowcs (*dst_p, dlen, src, nms);
|
2008-02-01 04:26:01 +08:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-01-19 17:41:54 +08:00
|
|
|
/* Copy string, until c or <nul> is encountered.
|
|
|
|
NUL-terminate the destination string (s1).
|
|
|
|
Return pointer to terminating byte in dst string. */
|
|
|
|
char * __stdcall
|
2013-11-25 19:38:08 +08:00
|
|
|
strccpy (char *__restrict s1, const char **__restrict s2, char c)
|
2011-01-19 17:41:54 +08:00
|
|
|
{
|
|
|
|
while (**s2 && **s2 != c)
|
|
|
|
*s1++ = *((*s2)++);
|
|
|
|
*s1 = 0;
|
|
|
|
|
|
|
|
MALLOC_CHECK;
|
|
|
|
return s1;
|
|
|
|
}
|
|
|
|
|
2007-08-12 20:48:02 +08:00
|
|
|
static WCHAR hex_wchars[] = L"0123456789abcdef";
|
|
|
|
|
|
|
|
NTSTATUS NTAPI
|
|
|
|
RtlInt64ToHexUnicodeString (ULONGLONG value, PUNICODE_STRING dest,
|
|
|
|
BOOLEAN append)
|
|
|
|
{
|
|
|
|
USHORT len = append ? dest->Length : 0;
|
|
|
|
if (dest->MaximumLength - len < 16 * (int) sizeof (WCHAR))
|
|
|
|
return STATUS_BUFFER_OVERFLOW;
|
2009-04-08 00:22:55 +08:00
|
|
|
wchar_t *end = (PWCHAR) ((PBYTE) dest->Buffer + len);
|
2007-08-12 20:48:02 +08:00
|
|
|
register PWCHAR p = end + 16;
|
|
|
|
while (p-- > end)
|
|
|
|
{
|
|
|
|
*p = hex_wchars[value & 0xf];
|
|
|
|
value >>= 4;
|
|
|
|
}
|
|
|
|
dest->Length += 16 * sizeof (WCHAR);
|
|
|
|
return STATUS_SUCCESS;
|
|
|
|
}
|