mirror of
git://sourceware.org/git/newlib-cygwin.git
synced 2025-01-18 12:29:32 +08:00
* fhandler.h (fhandler_console::trunc_buf): Add to use as cache for
truncated multibyte characters on input. (fhandler_console::write_replacement_char): Declare new method. * fhandler_console.cc (CONVERT_LIMIT): Raise to 64K. (fhandler_console::fhandler_console): Initialize trunc_buf. (ERR): Define as independent value again. (fhandler_console::write_replacement_char): New method to print replacement chars. (fhandler_console::write_normal): Add handling for truncated multibyte sequences. Call next_char instead of pathetic CharNextExA function. Don't change src, rather just work with found later on. * miscfuncs.cc (is_cp_multibyte): Move here from strfuncs.cc. Don't call Windows function, restrict to well-known ANSI/OEM codepages and UTF-8. (next_char): Call CharNextExA only for doublebyte codepages. Implement for UTF-8 here. * strfuncs.cc (is_cp_multibyte): Move to miscfuncs.cc. * winsup.h (next_char): Declare. * include/limits.h (MB_LEN_MAX): Set to maximum value of MB_CUR_MAX as defined by newlib for now.
This commit is contained in:
parent
a7197550f3
commit
4b65f19045
@ -1,3 +1,26 @@
|
|||||||
|
2008-02-06 Corinna Vinschen <corinna@vinschen.de>
|
||||||
|
|
||||||
|
* fhandler.h (fhandler_console::trunc_buf): Add to use as cache for
|
||||||
|
truncated multibyte characters on input.
|
||||||
|
(fhandler_console::write_replacement_char): Declare new method.
|
||||||
|
* fhandler_console.cc (CONVERT_LIMIT): Raise to 64K.
|
||||||
|
(fhandler_console::fhandler_console): Initialize trunc_buf.
|
||||||
|
(ERR): Define as independent value again.
|
||||||
|
(fhandler_console::write_replacement_char): New method to print
|
||||||
|
replacement chars.
|
||||||
|
(fhandler_console::write_normal): Add handling for truncated multibyte
|
||||||
|
sequences. Call next_char instead of pathetic CharNextExA function.
|
||||||
|
Don't change src, rather just work with found later on.
|
||||||
|
* miscfuncs.cc (is_cp_multibyte): Move here from strfuncs.cc.
|
||||||
|
Don't call Windows function, restrict to well-known ANSI/OEM codepages
|
||||||
|
and UTF-8.
|
||||||
|
(next_char): Call CharNextExA only for doublebyte codepages.
|
||||||
|
Implement for UTF-8 here.
|
||||||
|
* strfuncs.cc (is_cp_multibyte): Move to miscfuncs.cc.
|
||||||
|
* winsup.h (next_char): Declare.
|
||||||
|
* include/limits.h (MB_LEN_MAX): Set to maximum value of MB_CUR_MAX
|
||||||
|
as defined by newlib for now.
|
||||||
|
|
||||||
2008-02-05 Corinna Vinschen <corinna@vinschen.de>
|
2008-02-05 Corinna Vinschen <corinna@vinschen.de>
|
||||||
|
|
||||||
* autoload.cc (CharToOemA): Remove.
|
* autoload.cc (CharToOemA): Remove.
|
||||||
|
@ -896,6 +896,13 @@ class fhandler_console: public fhandler_termios
|
|||||||
static dev_console *dev_state;
|
static dev_console *dev_state;
|
||||||
static bool invisible_console;
|
static bool invisible_console;
|
||||||
|
|
||||||
|
/* Used when we encounter a truncated multi-byte sequence. The
|
||||||
|
lead bytes are stored here and revisited in the next write call. */
|
||||||
|
struct {
|
||||||
|
int len;
|
||||||
|
unsigned char buf[4]; /* Max len of valid UTF-8 sequence. */
|
||||||
|
} trunc_buf;
|
||||||
|
|
||||||
/* Output calls */
|
/* Output calls */
|
||||||
void set_default_attr ();
|
void set_default_attr ();
|
||||||
|
|
||||||
@ -904,6 +911,7 @@ class fhandler_console: public fhandler_termios
|
|||||||
void cursor_set (bool, int, int);
|
void cursor_set (bool, int, int);
|
||||||
void cursor_get (int *, int *);
|
void cursor_get (int *, int *);
|
||||||
void cursor_rel (int, int);
|
void cursor_rel (int, int);
|
||||||
|
void write_replacement_char (const unsigned char *);
|
||||||
const unsigned char *write_normal (unsigned const char*, unsigned const char *);
|
const unsigned char *write_normal (unsigned const char*, unsigned const char *);
|
||||||
void char_command (char);
|
void char_command (char);
|
||||||
bool set_raw_win32_keyboard_mode (bool);
|
bool set_raw_win32_keyboard_mode (bool);
|
||||||
|
@ -33,7 +33,7 @@ details. */
|
|||||||
#include "cygtls.h"
|
#include "cygtls.h"
|
||||||
#include "registry.h"
|
#include "registry.h"
|
||||||
|
|
||||||
#define CONVERT_LIMIT 16384
|
#define CONVERT_LIMIT 65536
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Scroll the screen context.
|
* Scroll the screen context.
|
||||||
@ -895,7 +895,9 @@ fhandler_console::tcgetattr (struct termios *t)
|
|||||||
fhandler_console::fhandler_console () :
|
fhandler_console::fhandler_console () :
|
||||||
fhandler_termios ()
|
fhandler_termios ()
|
||||||
{
|
{
|
||||||
|
trunc_buf.len = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
dev_console::set_color (HANDLE h)
|
dev_console::set_color (HANDLE h)
|
||||||
{
|
{
|
||||||
@ -1037,7 +1039,7 @@ fhandler_console::cursor_get (int *x, int *y)
|
|||||||
#define ESC 2
|
#define ESC 2
|
||||||
#define NOR 0
|
#define NOR 0
|
||||||
#define IGN 4
|
#define IGN 4
|
||||||
#if 0
|
#if 1
|
||||||
#define ERR 5
|
#define ERR 5
|
||||||
#else
|
#else
|
||||||
#define ERR NOR
|
#define ERR NOR
|
||||||
@ -1425,41 +1427,86 @@ beep ()
|
|||||||
MessageBeep (MB_OK);
|
MessageBeep (MB_OK);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* This gets called when we found an invalid UTF-8 character. We try with
|
||||||
|
the default ANSI codepage. If that fails we just print a question mark.
|
||||||
|
Looks ugly but is a neat and alomst sane fallback for many languages. */
|
||||||
|
void
|
||||||
|
fhandler_console::write_replacement_char (const unsigned char *char_p)
|
||||||
|
{
|
||||||
|
int n;
|
||||||
|
WCHAR def_cp_chars[2];
|
||||||
|
DWORD done;
|
||||||
|
|
||||||
|
n = MultiByteToWideChar (GetACP (), 0, (const CHAR *) char_p, 1,
|
||||||
|
def_cp_chars, 2);
|
||||||
|
if (n)
|
||||||
|
WriteConsoleW (get_output_handle (), def_cp_chars, n, &done, 0);
|
||||||
|
else
|
||||||
|
WriteConsoleW (get_output_handle (), L"?", 1, &done, 0);
|
||||||
|
}
|
||||||
|
|
||||||
const unsigned char *
|
const unsigned char *
|
||||||
fhandler_console::write_normal (const unsigned char *src,
|
fhandler_console::write_normal (const unsigned char *src,
|
||||||
const unsigned char *end)
|
const unsigned char *end)
|
||||||
{
|
{
|
||||||
/* Scan forward to see what a char which needs special treatment */
|
/* Scan forward to see what a char which needs special treatment */
|
||||||
DWORD done;
|
DWORD done;
|
||||||
unsigned char *found = (unsigned char *) src;
|
DWORD buf_len;
|
||||||
|
const unsigned char *found = src;
|
||||||
|
const unsigned char *nfound;
|
||||||
UINT cp = dev_state->get_console_cp ();
|
UINT cp = dev_state->get_console_cp ();
|
||||||
bool mb = is_cp_multibyte (cp);
|
|
||||||
|
/* First check if we have cached lead bytes of a former try to write
|
||||||
|
a truncated multibyte sequence. If so, process it. */
|
||||||
|
if (trunc_buf.len)
|
||||||
|
{
|
||||||
|
int cp_len = min (end - src, 4 - trunc_buf.len);
|
||||||
|
memcpy (trunc_buf.buf + trunc_buf.len, src, cp_len);
|
||||||
|
nfound = next_char (cp, trunc_buf.buf,
|
||||||
|
trunc_buf.buf + trunc_buf.len + cp_len);
|
||||||
|
if (!nfound) /* Invalid multibyte sequence. */
|
||||||
|
{ /* Give up and print replacement chars. */
|
||||||
|
for (int i = 0; i < trunc_buf.len; ++i)
|
||||||
|
write_replacement_char (trunc_buf.buf + i);
|
||||||
|
}
|
||||||
|
else if (nfound == trunc_buf.buf)
|
||||||
|
{ /* Still truncated multibyte sequence. */
|
||||||
|
trunc_buf.len += cp_len;
|
||||||
|
return end;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Valid multibyte sequence. Process. */
|
||||||
|
WCHAR buf[2];
|
||||||
|
buf_len = dev_state->str_to_con (buf, (const char *) trunc_buf.buf,
|
||||||
|
nfound - trunc_buf.buf);
|
||||||
|
WriteConsoleW (get_output_handle (), buf, buf_len, &done, 0);
|
||||||
|
found = src + (nfound - trunc_buf.buf - trunc_buf.len);
|
||||||
|
}
|
||||||
|
/* Mark trunc_buf as unused. */
|
||||||
|
trunc_buf.len = 0;
|
||||||
|
}
|
||||||
|
|
||||||
while (found < end
|
while (found < end
|
||||||
&& found - src < CONVERT_LIMIT
|
&& found - src < CONVERT_LIMIT
|
||||||
&& base_chars[*found] == NOR)
|
&& base_chars[*found] == NOR)
|
||||||
{
|
{
|
||||||
if (mb && *found && *found >= 0x80)
|
nfound = next_char (cp, found, end);
|
||||||
{
|
if (!nfound) /* Invalid multibyte sequence. */
|
||||||
unsigned char *nfound = (unsigned char *)
|
break;
|
||||||
CharNextExA (cp, (const CHAR *) found, 0);
|
if (nfound == found) /* Truncated multibyte sequence. */
|
||||||
/* Sanity check for UTF-8 to workaround the problem in
|
{ /* Stick to it until the next write. */
|
||||||
MultiByteToWideChar, that it's not capable of using replacement
|
trunc_buf.len = end - found;
|
||||||
characters for invalid source chars in the given codepage. */
|
memcpy (trunc_buf.buf, found, trunc_buf.len);
|
||||||
if (nfound == found + 1 && cp == CP_UTF8)
|
return end;
|
||||||
*found++ = '?';
|
|
||||||
else
|
|
||||||
found = nfound;
|
|
||||||
}
|
}
|
||||||
else
|
found = nfound;
|
||||||
++found;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Print all the base ones out */
|
/* Print all the base ones out */
|
||||||
if (found != src)
|
if (found != src)
|
||||||
{
|
{
|
||||||
DWORD len = found - src;
|
DWORD len = found - src;
|
||||||
DWORD buf_len;
|
|
||||||
PWCHAR buf = (PWCHAR) alloca (CONVERT_LIMIT * sizeof (WCHAR));
|
PWCHAR buf = (PWCHAR) alloca (CONVERT_LIMIT * sizeof (WCHAR));
|
||||||
|
|
||||||
buf_len = dev_state->str_to_con (buf, (const char *) src, len);
|
buf_len = dev_state->str_to_con (buf, (const char *) src, len);
|
||||||
@ -1490,13 +1537,14 @@ fhandler_console::write_normal (const unsigned char *src,
|
|||||||
buf += done;
|
buf += done;
|
||||||
}
|
}
|
||||||
while (buf_len > 0);
|
while (buf_len > 0);
|
||||||
src = found;
|
if (len >= CONVERT_LIMIT)
|
||||||
|
return found;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (src < end)
|
if (found < end)
|
||||||
{
|
{
|
||||||
int x, y;
|
int x, y;
|
||||||
switch (base_chars[*src])
|
switch (base_chars[*found])
|
||||||
{
|
{
|
||||||
case BEL:
|
case BEL:
|
||||||
beep ();
|
beep ();
|
||||||
@ -1529,16 +1577,19 @@ fhandler_console::write_normal (const unsigned char *src,
|
|||||||
cursor_set (false, 0, y);
|
cursor_set (false, 0, y);
|
||||||
break;
|
break;
|
||||||
case ERR:
|
case ERR:
|
||||||
WriteFile (get_output_handle (), src, 1, &done, 0);
|
WriteFile (get_output_handle (), found, 1, &done, 0);
|
||||||
break;
|
break;
|
||||||
case TAB:
|
case TAB:
|
||||||
cursor_get (&x, &y);
|
cursor_get (&x, &y);
|
||||||
cursor_set (false, 8 * (x / 8 + 1), y);
|
cursor_set (false, 8 * (x / 8 + 1), y);
|
||||||
break;
|
break;
|
||||||
|
case NOR:
|
||||||
|
write_replacement_char (found);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
src ++;
|
found++;
|
||||||
}
|
}
|
||||||
return src;
|
return found;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
|
@ -28,7 +28,9 @@ details. */
|
|||||||
|
|
||||||
/* Maximum length of a multibyte character. */
|
/* Maximum length of a multibyte character. */
|
||||||
#ifndef MB_LEN_MAX
|
#ifndef MB_LEN_MAX
|
||||||
#define MB_LEN_MAX 1
|
/* TODO: This is newlib's max value. We should probably rather define our
|
||||||
|
own _mbtowc_r and _wctomb_r functions which are only codepage dependent. */
|
||||||
|
#define MB_LEN_MAX 8
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Minimum and maximum values a `signed char' can hold. */
|
/* Minimum and maximum values a `signed char' can hold. */
|
||||||
|
@ -17,7 +17,8 @@ details. */
|
|||||||
#include <alloca.h>
|
#include <alloca.h>
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include <wchar.h>
|
#include <wchar.h>
|
||||||
#include <winbase.h>
|
#include <wingdi.h>
|
||||||
|
#include <winuser.h>
|
||||||
#include <winnls.h>
|
#include <winnls.h>
|
||||||
#include "cygthread.h"
|
#include "cygthread.h"
|
||||||
#include "cygtls.h"
|
#include "cygtls.h"
|
||||||
@ -192,6 +193,118 @@ cygwin_strupr (char *string)
|
|||||||
return string;
|
return string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* FIXME? We only support standard ANSI/OEM codepages according to
|
||||||
|
http://www.microsoft.com/globaldev/reference/cphome.mspx as well
|
||||||
|
as UTF-8 and codepage 1361, which is also mentioned as valid
|
||||||
|
doublebyte codepage in MSDN man pages (e.g. IsDBCSLeadByteEx).
|
||||||
|
Everything else will be hosed. */
|
||||||
|
|
||||||
|
bool
|
||||||
|
is_cp_multibyte (UINT cp)
|
||||||
|
{
|
||||||
|
switch (cp)
|
||||||
|
{
|
||||||
|
case 932:
|
||||||
|
case 936:
|
||||||
|
case 949:
|
||||||
|
case 950:
|
||||||
|
case 1361:
|
||||||
|
case 65001:
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* OMYGOD! CharNextExA is not UTF-8 aware! It only works fine with
|
||||||
|
double byte charsets. So we have to do it ourselves for UTF-8.
|
||||||
|
|
||||||
|
While being at it, we do more. If a double-byte or multibyte
|
||||||
|
sequence is trucated due to an early end, we need a way to recognize
|
||||||
|
it. The reason is that multiple buffered write statements might
|
||||||
|
accidentally stop and start in the middle of a single character byte
|
||||||
|
sequence. If we have to interpret the byte sequences (as in
|
||||||
|
fhandler_console, we would print wrong output in these cases.
|
||||||
|
|
||||||
|
So we have four possible return values here:
|
||||||
|
|
||||||
|
ret = end if str >= end
|
||||||
|
ret = NULL if we encounter an invalid byte sequence
|
||||||
|
ret = str if we encounter the start byte of a truncated byte sequence
|
||||||
|
ret = str + n if we encounter a vaild byte sequence
|
||||||
|
*/
|
||||||
|
|
||||||
|
const unsigned char *
|
||||||
|
next_char (UINT cp, const unsigned char *str, const unsigned char *end)
|
||||||
|
{
|
||||||
|
const unsigned char *ret;
|
||||||
|
|
||||||
|
if (str >= end)
|
||||||
|
return end;
|
||||||
|
|
||||||
|
switch (cp)
|
||||||
|
{
|
||||||
|
case 932:
|
||||||
|
case 936:
|
||||||
|
case 949:
|
||||||
|
case 950:
|
||||||
|
case 1361:
|
||||||
|
if (*str <= 0x7f)
|
||||||
|
ret = str + 1;
|
||||||
|
else if (str == end - 1 && IsDBCSLeadByteEx (cp, *str))
|
||||||
|
ret = str;
|
||||||
|
else
|
||||||
|
ret = (const unsigned char *) CharNextExA (cp, (const CHAR *) str, 0);
|
||||||
|
break;
|
||||||
|
case CP_UTF8:
|
||||||
|
switch (str[0] >> 4)
|
||||||
|
{
|
||||||
|
case 0x0 ... 0x7: /* One byte character. */
|
||||||
|
ret = str + 1;
|
||||||
|
break;
|
||||||
|
case 0x8 ... 0xb: /* Followup byte. Invalid as first byte. */
|
||||||
|
ret = NULL;
|
||||||
|
break;
|
||||||
|
case 0xc ... 0xd: /* Two byte character. */
|
||||||
|
/* Check followup bytes for validity. */
|
||||||
|
if (str >= end - 1)
|
||||||
|
ret = str;
|
||||||
|
else if (str[1] <= 0xbf)
|
||||||
|
ret = str + 2;
|
||||||
|
else
|
||||||
|
ret = NULL;
|
||||||
|
break;
|
||||||
|
case 0xe: /* Three byte character. */
|
||||||
|
if (str >= end - 2)
|
||||||
|
ret = str;
|
||||||
|
else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80
|
||||||
|
&& (str[0] != 0xe0 || str[1] >= 0xa0)
|
||||||
|
&& (str[0] != 0xed || str[1] <= 0x9f))
|
||||||
|
ret = str + 3;
|
||||||
|
else
|
||||||
|
ret = NULL;
|
||||||
|
break;
|
||||||
|
case 0xf: /* Four byte character. */
|
||||||
|
if (str[0] >= 0xf8)
|
||||||
|
ret = NULL;
|
||||||
|
else if (str >= end - 3)
|
||||||
|
ret = str;
|
||||||
|
else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80
|
||||||
|
&& (str[3] & 0xc0) == 0x80
|
||||||
|
&& (str[0] == 0xf0 || str[1] >= 0x90)
|
||||||
|
&& (str[0] == 0xf4 || str[1] <= 0x8f))
|
||||||
|
ret = str + 4;
|
||||||
|
else
|
||||||
|
ret = NULL;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
ret = str + 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
int __stdcall
|
int __stdcall
|
||||||
check_invalid_virtual_addr (const void *s, unsigned sz)
|
check_invalid_virtual_addr (const void *s, unsigned sz)
|
||||||
{
|
{
|
||||||
|
@ -36,14 +36,6 @@ get_cp ()
|
|||||||
return active_codepage;
|
return active_codepage;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool
|
|
||||||
is_cp_multibyte (UINT cp)
|
|
||||||
{
|
|
||||||
CPINFO cpi;
|
|
||||||
GetCPInfo (cp, &cpi);
|
|
||||||
return cpi.MaxCharSize > 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* tlen is always treated as the maximum buffer size, including the '\0'
|
/* tlen is always treated as the maximum buffer size, including the '\0'
|
||||||
character. sys_wcstombs will always return a 0-terminated result, no
|
character. sys_wcstombs will always return a 0-terminated result, no
|
||||||
matter what. */
|
matter what. */
|
||||||
|
@ -110,6 +110,7 @@ extern const char case_folded_upper[];
|
|||||||
/* The one function we use from winuser.h most of the time */
|
/* The one function we use from winuser.h most of the time */
|
||||||
extern "C" DWORD WINAPI GetLastError (void);
|
extern "C" DWORD WINAPI GetLastError (void);
|
||||||
|
|
||||||
|
/* Codepage and multibyte string specific stuff. */
|
||||||
enum codepage_type {ansi_cp, oem_cp, utf8_cp};
|
enum codepage_type {ansi_cp, oem_cp, utf8_cp};
|
||||||
extern codepage_type current_codepage;
|
extern codepage_type current_codepage;
|
||||||
extern UINT active_codepage;
|
extern UINT active_codepage;
|
||||||
@ -117,6 +118,8 @@ extern UINT active_codepage;
|
|||||||
void codepage_init (const char *buf);
|
void codepage_init (const char *buf);
|
||||||
UINT get_cp ();
|
UINT get_cp ();
|
||||||
bool is_cp_multibyte (UINT cp);
|
bool is_cp_multibyte (UINT cp);
|
||||||
|
const unsigned char *next_char (UINT cp, const unsigned char *str,
|
||||||
|
const unsigned char *end);
|
||||||
|
|
||||||
/* Used as type by sys_wcstombs_alloc and sys_mbstowcs_alloc. For a
|
/* Used as type by sys_wcstombs_alloc and sys_mbstowcs_alloc. For a
|
||||||
description see there. */
|
description see there. */
|
||||||
|
Loading…
x
Reference in New Issue
Block a user