/* nlsfuncs.cc: NLS helper functions

   Copyright 2010 Red Hat, Inc.

This file is part of Cygwin.

This software is a copyrighted work licensed under the terms of the
Cygwin license.  Please consult the file "CYGWIN_LICENSE" for
details. */

#include "winsup.h"
/* NOTE(review): the names of the following three system headers were lost
   in the copy of this file under review (bare "#include" directives).
   Presumably <stdlib.h>, <locale.h> and <wchar.h> -- confirm against the
   repository before committing. */
#include <stdlib.h>
#include <locale.h>
#include <wchar.h>
#include "path.h"
#include "fhandler.h"
#include "dtable.h"
#include "cygheap.h"
#include "tls_pbuf.h"
/* Internal headers from newlib */
#include "../locale/timelocal.h"
#include "../locale/lnumeric.h"
#include "../locale/lmonetary.h"

/* Heap buffers backing the strings of the currently loaded LC_TIME,
   LC_NUMERIC and LC_MONETARY categories. */
static char *lc_time_buf;
static char *lc_numeric_buf;
static char *lc_monetary_buf;

/* Expands to two arguments: the address of the write pointer of category x
   and the number of bytes left up to the end of its buffer. */
#define _LC(x)  &lc_##x##_ptr,lc_##x##_end-lc_##x##_ptr

/* Convenience wrappers.  They rely on the local variables lcid, f_wctomb,
   charset, lc_<category>_ptr and lc_<category>_end of the calling
   function. */
#define getlocaleinfo(category,type) \
            __getlocaleinfo(lcid,(type),_LC(category),f_wctomb,charset)
#define eval_datetimefmt(type,force) \
            __eval_datetimefmt(lcid,(type),(force),&lc_time_ptr,\
                               lc_time_end-lc_time_ptr,f_wctomb, charset)

/* Vista and later.  Not defined in w32api yet. */
extern "C" {
WINBASEAPI LCID WINAPI LocaleNameToLCID (LPCWSTR, DWORD);
};

/* One-entry cache: the last locale name looked up and the result of that
   lookup. */
static char last_locale[ENCODING_LEN + 1];
static LCID last_lcid;

/* Fetch LCID from POSIX locale specifier.
   Return values:

     -1: Invalid locale
      0: C or POSIX
     >0: LCID

   The result is cached in last_locale/last_lcid, so that repeated lookups
   of the same name are cheap.

   NOTE(review): name is copied unchecked into ENCODING_LEN + 1 sized
   buffers; presumably the caller guarantees the length -- confirm. */
static LCID
__get_lcid_from_locale (const char *name)
{
  char locale[ENCODING_LEN + 1];
  char *c;
  LCID lcid;

  /* Same name as in the previous call?  Return the cached result. */
  if (!strcmp (name, last_locale))
    {
      debug_printf ("LCID=0x%04x", last_lcid);
      return last_lcid;
    }
  stpcpy (last_locale, name);
  stpcpy (locale, name);
  /* Drop charset and modifier */
  c = strchr (locale, '.');
  if (!c)
    c = strchr (locale, '@');
  if (c)
    *c = '\0';
  /* "POSIX" already converted to "C" in loadlocale.  The cached LCID must
     be reset here as well, otherwise the next lookup of the same name
     would hit the cache and return the LCID of the previous locale. */
  if (!strcmp (locale, "C"))
    return last_lcid = 0;
  /* Convert to form understood by LocaleNameToLCID */
  c = strchr (locale, '_');
  if (c)
    *c = '-';
  if (wincap.has_localenames ())
    {
      wchar_t wlocale[ENCODING_LEN + 1];
      mbstowcs (wlocale, locale, ENCODING_LEN + 1);
      lcid = LocaleNameToLCID (wlocale, 0);
      /* A zero result is mapped to the "invalid locale" value -1. */
      last_lcid = lcid ?: (LCID) -1;
      debug_printf ("LCID=0x%04x", last_lcid);
      return last_lcid;
    }
  /* Pre-Vista we have to loop through the LCID values and see if they
     match language and TERRITORY. */
  if (c)
    *c++ = '\0';
  /* locale now points to the language, c points to the TERRITORY */
  const char *language = locale;
  const char *territory = c;
  LCID lang, sublang;
  char iso[10];

  /* In theory the lang part takes 10 bits (0x3ff), but up to Windows 2003 R2
     the highest lang value is 0x81. */
  for (lang = 1; lang <= 0x81; ++lang)
    if (GetLocaleInfo (lang, LOCALE_SISO639LANGNAME, iso, 10)
        && !strcmp (language, iso))
      break;
  if (lang > 0x81)
    lcid = 0;
  else if (!territory)
    lcid = lang;
  else
    {
      /* In theory the sublang part takes 6 bits (0x3f), but up to
         Windows 2003 R2 the highest sublang value is 0x14. */
      for (sublang = 1; sublang <= 0x14; ++sublang)
        {
          lcid = (sublang << 10) | lang;
          if (GetLocaleInfo (lcid, LOCALE_SISO3166CTRYNAME, iso, 10)
              && !strcmp (territory, iso))
            break;
        }
      if (sublang > 0x14)
        lcid = 0;
    }
  /* No match is reported as "invalid locale". */
  last_lcid = lcid ?: (LCID) -1;
  debug_printf ("LCID=0x%04x", last_lcid);
  return last_lcid;
}

/* Never returns -1, *iff* s is not NULL.  Just skips invalid chars
   instead.  s==NULL returns -1 since it's used to recognize invalid strings
   in the used charset.
*/ static size_t lc_wcstombs (wctomb_p f_wctomb, const char *charset, char *s, const wchar_t *pwcs, size_t n) { char *ptr = s; size_t max = n; char buf[8]; size_t i, bytes, num_to_copy; mbstate_t state; memset (&state, 0, sizeof state); if (s == NULL) { size_t num_bytes = 0; while (*pwcs != 0) { bytes = f_wctomb (_REENT, buf, *pwcs++, charset, &state); if (bytes == (size_t) -1) return (size_t) -1; num_bytes += bytes; } return num_bytes; } while (n > 0) { bytes = f_wctomb (_REENT, buf, *pwcs, charset, &state); if (bytes == (size_t) -1) { memset (&state, 0, sizeof state); ++pwcs; continue; } num_to_copy = (n > bytes ? bytes : n); for (i = 0; i < num_to_copy; ++i) *ptr++ = buf[i]; if (*pwcs == 0x00) return ptr - s - (n >= bytes); ++pwcs; n -= num_to_copy; } return max; } /* Never returns -1. Invalid sequences are translated to replacement wide-chars. */ static size_t lc_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *pwcs, const char *s, size_t n) { size_t ret = 0; char *t = (char *) s; size_t bytes; mbstate_t state; memset (&state, 0, sizeof state); if (!pwcs) n = 1; while (n > 0) { bytes = f_mbtowc (_REENT, pwcs, t, MB_CUR_MAX, charset, &state); if (bytes == (size_t) -1) { state.__count = 0; bytes = 1; if (pwcs) *pwcs = L' '; } else if (bytes == 0) break; t += bytes; ++ret; if (pwcs) { ++pwcs; --n; } } return ret; } static char * __getlocaleinfo (LCID lcid, LCTYPE type, char **ptr, size_t size, wctomb_p f_wctomb, const char *charset) { wchar_t wbuf[80]; size_t num; char *ret; GetLocaleInfoW (lcid, type, wbuf, 80); num = lc_wcstombs (f_wctomb, charset, ret = *ptr, wbuf, size); *ptr += num + 1; return ret; } static UINT getlocaleint (LCID lcid, LCTYPE type) { UINT val; return GetLocaleInfoW (lcid, type | LOCALE_RETURN_NUMBER, (PWCHAR) &val, sizeof val) ? 
val : 0; } static char * __eval_datetimefmt (LCID lcid, LCTYPE type, int force, char **ptr, size_t size, wctomb_p f_wctomb, const char *charset) { wchar_t buf[80]; wchar_t fc; size_t num; mbstate_t mb; size_t idx; const char *day_str = "edaA"; const char *mon_str = "mmbB"; const char *year_str = "yyyY"; const char *hour12_str = "lI"; const char *hour24_str = "kH"; const char *t_str; char *ret = *ptr; char *p = *ptr; GetLocaleInfoW (lcid, type, buf, 80); memset (&mb, 0, sizeof mb); for (wchar_t *fmt = buf; *fmt; ++fmt) switch (fc = *fmt) { case L'\'': if (fmt[1] == L'\'') *p++ = '\''; else while (fmt[1] && *++fmt != L'\'') { num = f_wctomb (_REENT, p, *fmt, charset, &mb); if (num == (size_t) -1) memset (&mb, 0, sizeof mb); else p += num; } break; case L'd': case L'M': case L'y': t_str = (fc == L'd' ? day_str : fc == L'M' ? mon_str : year_str); if (fc == L'y') force = 0; for (idx = 1; fmt[1] == fc; ++idx, ++fmt); if (--idx > 3) idx = 3; if (force && idx == 3) idx = 2; *p++ = '%'; *p++ = t_str[idx]; break; case L'g': break; case L'h': case L'H': t_str = (fc == L'h' || force ? hour12_str : hour24_str); idx = 0; if (fmt[1] == fc) { ++fmt; idx = 1; } *p++ = '%'; *p++ = t_str[idx]; break; case L'm': case L's': case L't': if (fmt[1] == fc) ++fmt; *p++ = '%'; *p++ = (fc == L'm' ? 'M' : fc == L's' ? 'S' : 'p'); break; case L'\t': case L'\n': case L'%': *p++ = '%'; *p++ = (char) fc; break; default: num = f_wctomb (_REENT, p, *fmt, charset, &mb); if (num == (size_t) -1) memset (&mb, 0, sizeof mb); else p += num; break; } *p++ = '\0'; *ptr = p; return ret; } /* Convert Windows grouping format into POSIX grouping format. */ static char * conv_grouping (LCID lcid, LCTYPE type, char **lc_ptr) { char buf[10]; /* Per MSDN max size of LOCALE_SGROUPING element incl. NUL */ bool repeat = false; char *ptr = *lc_ptr; char *ret = ptr; GetLocaleInfoA (lcid, type, buf, 10); /* Convert Windows grouping format into POSIX grouping format. 
*/ for (char *c = buf; *c; ++c) { if (*c < '0' || *c > '9') continue; char val = *c - '0'; if (!val) { repeat = true; break; } *ptr++ = val; } if (!repeat) *ptr++ = CHAR_MAX; *ptr++ = '\0'; *lc_ptr = ptr; return ret; } /* Called from newlib's setlocale() via __time_load_locale() if category is LC_TIME. Returns LC_TIME values fetched from Windows locale data in the structure pointed to by _time_locale. This is subsequently accessed by functions like nl_langinfo, strftime, strptime. */ extern "C" int __set_lc_time_from_win (const char *name, struct lc_time_T *_time_locale, wctomb_p f_wctomb, const char *charset) { LCID lcid = __get_lcid_from_locale (name); if (!lcid || lcid == (LCID) -1) return lcid; char *new_lc_time_buf = (char *) malloc (4096); const char *lc_time_end = new_lc_time_buf + 4096; if (!new_lc_time_buf) return -1; char *lc_time_ptr = new_lc_time_buf; /* mon */ for (int i = 0; i < 12; ++i) _time_locale->mon[i] = getlocaleinfo (time, LOCALE_SABBREVMONTHNAME1 + i); /* month and alt_month */ for (int i = 0; i < 12; ++i) _time_locale->month[i] = _time_locale->alt_month[i] = getlocaleinfo (time, LOCALE_SMONTHNAME1 + i); /* wday */ _time_locale->wday[0] = getlocaleinfo (time, LOCALE_SABBREVDAYNAME7); for (int i = 0; i < 6; ++i) _time_locale->wday[i + 1] = getlocaleinfo (time, LOCALE_SABBREVDAYNAME1 + i); /* weekday */ _time_locale->weekday[0] = getlocaleinfo (time, LOCALE_SDAYNAME7); for (int i = 0; i < 6; ++i) _time_locale->weekday[i + 1] = getlocaleinfo (time, LOCALE_SDAYNAME1 + i); /* X_fmt */ _time_locale->X_fmt = eval_datetimefmt (LOCALE_STIMEFORMAT, 0); /* x_fmt */ _time_locale->x_fmt = eval_datetimefmt (LOCALE_SSHORTDATE, 0); /* c_fmt */ _time_locale->c_fmt = eval_datetimefmt (LOCALE_SLONGDATE, 1); --lc_time_ptr; *lc_time_ptr++ = ' '; eval_datetimefmt (LOCALE_STIMEFORMAT, 0); /* AM/PM */ _time_locale->am_pm[0] = getlocaleinfo (time, LOCALE_S1159); _time_locale->am_pm[1] = getlocaleinfo (time, LOCALE_S2359); /* date_fmt */ _time_locale->date_fmt = 
eval_datetimefmt (LOCALE_SLONGDATE, 1); --lc_time_ptr; *lc_time_ptr++ = ' '; eval_datetimefmt (LOCALE_STIMEFORMAT, 0); --lc_time_ptr; lc_time_ptr = stpcpy (lc_time_ptr, " %Z") + 1; /* md */ { wchar_t buf[80]; GetLocaleInfoW (lcid, LOCALE_IDATE, buf, 80); lc_time_ptr = stpcpy (lc_time_ptr, *buf == L'1' ? "dm" : "md") + 1; } /* ampm_fmt */ _time_locale->ampm_fmt = eval_datetimefmt (LOCALE_STIMEFORMAT, 1); char *tmp = (char *) realloc (new_lc_time_buf, lc_time_ptr - new_lc_time_buf); if (!tmp) { free (new_lc_time_buf); return -1; } if (lc_time_buf) free (lc_time_buf); lc_time_buf = tmp; return 1; } /* Called from newlib's setlocale() via __numeric_load_locale() if category is LC_NUMERIC. Returns LC_NUMERIC values fetched from Windows locale data in the structure pointed to by _numeric_locale. This is subsequently accessed by functions like nl_langinfo, localeconv, printf, etc. */ extern "C" int __set_lc_numeric_from_win (const char *name, struct lc_numeric_T *_numeric_locale, wctomb_p f_wctomb, const char *charset) { LCID lcid = __get_lcid_from_locale (name); if (!lcid || lcid == (LCID) -1) return lcid; char *new_lc_numeric_buf = (char *) malloc (48); const char *lc_numeric_end = new_lc_numeric_buf + 48; if (!new_lc_numeric_buf) return -1; char *lc_numeric_ptr = new_lc_numeric_buf; /* decimal_point */ _numeric_locale->decimal_point = getlocaleinfo (numeric, LOCALE_SDECIMAL); /* thousands_sep */ _numeric_locale->thousands_sep = getlocaleinfo (numeric, LOCALE_STHOUSAND); /* grouping */ _numeric_locale->grouping = conv_grouping (lcid, LOCALE_SGROUPING, &lc_numeric_ptr); char *tmp = (char *) realloc (new_lc_numeric_buf, lc_numeric_ptr - new_lc_numeric_buf); if (!tmp) { free (new_lc_numeric_buf); return -1; } if (lc_numeric_buf) free (lc_numeric_buf); lc_numeric_buf = tmp; return 1; } /* Called from newlib's setlocale() via __monetary_load_locale() if category is LC_MONETARY. 
Returns LC_MONETARY values fetched from Windows locale data in the structure pointed to by _monetary_locale. This is subsequently accessed by functions like nl_langinfo, localeconv, printf, etc. */ extern "C" int __set_lc_monetary_from_win (const char *name, struct lc_monetary_T *_monetary_locale, wctomb_p f_wctomb, const char *charset) { LCID lcid = __get_lcid_from_locale (name); if (!lcid || lcid == (LCID) -1) return lcid; char *new_lc_monetary_buf = (char *) malloc (256); const char *lc_monetary_end = new_lc_monetary_buf + 256; if (!new_lc_monetary_buf) return -1; char *lc_monetary_ptr = new_lc_monetary_buf; /* int_curr_symbol */ _monetary_locale->int_curr_symbol = getlocaleinfo (monetary, LOCALE_SINTLSYMBOL); /* No spacing char means space. */ if (!_monetary_locale->int_curr_symbol[3]) { lc_monetary_ptr[-1] = ' '; *lc_monetary_ptr++ = '\0'; } /* currency_symbol */ { /* As on Linux: If the currency_symbol can't be represented in the given charset, use int_curr_symbol. */ wchar_t wbuf[14]; GetLocaleInfoW (lcid, LOCALE_SCURRENCY, wbuf, 14); if (lc_wcstombs (f_wctomb, charset, NULL, wbuf, 0) == (size_t) -1) { _monetary_locale->currency_symbol = lc_monetary_ptr; lc_monetary_ptr = stpncpy (lc_monetary_ptr, _monetary_locale->int_curr_symbol, 3); *lc_monetary_ptr++ = '\0'; } else _monetary_locale->currency_symbol = getlocaleinfo (monetary, LOCALE_SCURRENCY); } /* mon_decimal_point */ _monetary_locale->mon_decimal_point = getlocaleinfo (monetary, LOCALE_SMONDECIMALSEP); /* mon_thousands_sep */ _monetary_locale->mon_thousands_sep = getlocaleinfo (monetary, LOCALE_SMONTHOUSANDSEP); /* mon_grouping */ _monetary_locale->mon_grouping = conv_grouping (lcid, LOCALE_SMONGROUPING, &lc_monetary_ptr); /* positive_sign */ _monetary_locale->positive_sign = getlocaleinfo (monetary, LOCALE_SPOSITIVESIGN); /* negative_sign */ _monetary_locale->negative_sign = getlocaleinfo (monetary, LOCALE_SNEGATIVESIGN); /* int_frac_digits */ *lc_monetary_ptr = (char) getlocaleint (lcid, 
LOCALE_IINTLCURRDIGITS); _monetary_locale->int_frac_digits = lc_monetary_ptr++; /* frac_digits */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_ICURRDIGITS); _monetary_locale->frac_digits = lc_monetary_ptr++; /* p_cs_precedes */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_IPOSSYMPRECEDES); _monetary_locale->p_cs_precedes = lc_monetary_ptr++; /* p_sep_by_space */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_IPOSSEPBYSPACE); _monetary_locale->p_sep_by_space = lc_monetary_ptr++; /* n_cs_precedes */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_INEGSYMPRECEDES); _monetary_locale->n_cs_precedes = lc_monetary_ptr++; /* n_sep_by_space */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_INEGSEPBYSPACE); _monetary_locale->n_sep_by_space = lc_monetary_ptr++; /* p_sign_posn */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_IPOSSIGNPOSN); _monetary_locale->p_sign_posn = lc_monetary_ptr++; /* p_sign_posn */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_INEGSIGNPOSN); _monetary_locale->n_sign_posn = lc_monetary_ptr++; char *tmp = (char *) realloc (new_lc_monetary_buf, lc_monetary_ptr - new_lc_monetary_buf); if (!tmp) { free (new_lc_monetary_buf); return -1; } if (lc_monetary_buf) free (lc_monetary_buf); lc_monetary_buf = tmp; return 1; } static LCID collate_lcid = 0; static mbtowc_p collate_mbtowc = __ascii_mbtowc; static char collate_charset[ENCODING_LEN + 1] = "ASCII"; /* Called from newlib's setlocale() if category is LC_COLLATE. Stores LC_COLLATE locale information. This is subsequently accessed by the below functions strcoll, strxfrm, wcscoll, wcsxfrm. */ extern "C" int __collate_load_locale (const char *name, mbtowc_p f_mbtowc, const char *charset) { LCID lcid = __get_lcid_from_locale (name); if (lcid == (LCID) -1) return -1; collate_lcid = lcid; collate_mbtowc = f_mbtowc; stpcpy (collate_charset, charset); return 0; } /* We use the Windows functions for locale-specific string comparison and transformation. 
The advantage is that we don't need any files with collation information. */ extern "C" int wcscoll (const wchar_t *ws1, const wchar_t *ws2) { int ret; if (!collate_lcid) return wcscmp (ws1, ws2); ret = CompareStringW (collate_lcid, 0, ws1, -1, ws2, -1); if (!ret) set_errno (EINVAL); return ret - CSTR_EQUAL; } extern "C" int strcoll (const char *s1, const char *s2) { size_t n1, n2; wchar_t *ws1, *ws2; tmp_pathbuf tp; int ret; if (!collate_lcid) return strcmp (s1, s2); /* The ANSI version of CompareString uses the default charset of the lcid, so we must use the Unicode version. */ n1 = lc_mbstowcs (collate_mbtowc, collate_charset, NULL, s1, 0) + 1; ws1 = (n1 > NT_MAX_PATH ? (wchar_t *) malloc (n1 * sizeof (wchar_t)) : tp.w_get ()); lc_mbstowcs (collate_mbtowc, collate_charset, ws1, s1, n1); n2 = lc_mbstowcs (collate_mbtowc, collate_charset, NULL, s2, 0) + 1; ws2 = (n2 > NT_MAX_PATH ? (wchar_t *) malloc (n2 * sizeof (wchar_t)) : tp.w_get ()); lc_mbstowcs (collate_mbtowc, collate_charset, ws2, s2, n2); ret = CompareStringW (collate_lcid, 0, ws1, -1, ws2, -1); if (n1 > NT_MAX_PATH) free (ws1); if (n2 > NT_MAX_PATH) free (ws2); if (!ret) set_errno (EINVAL); return ret - CSTR_EQUAL; } extern "C" size_t wcsxfrm (wchar_t *ws1, const wchar_t *ws2, size_t wsn) { size_t ret; if (!collate_lcid) return wcslcpy (ws1, ws2, wsn); ret = LCMapStringW (collate_lcid, LCMAP_SORTKEY | LCMAP_BYTEREV, ws2, -1, ws1, wsn * sizeof (wchar_t)); /* LCMapStringW returns byte count including the terminating NUL character, wcsxfrm is supposed to return length in wchar_t excluding the NUL. Since the array is only single byte NUL-terminated we must make sure the result is wchar_t-NUL terminated. 
*/ if (ret) { ret = (ret + 1) / sizeof (wchar_t); if (ret >= wsn) return wsn; ws1[ret] = L'\0'; return ret; } if (GetLastError () != ERROR_INSUFFICIENT_BUFFER) set_errno (EINVAL); return wsn; } extern "C" size_t strxfrm (char *s1, const char *s2, size_t sn) { size_t ret; size_t n2; wchar_t *ws2; tmp_pathbuf tp; if (!collate_lcid) return strlcpy (s1, s2, sn); /* The ANSI version of LCMapString uses the default charset of the lcid, so we must use the Unicode version. */ n2 = lc_mbstowcs (collate_mbtowc, collate_charset, NULL, s2, 0) + 1; ws2 = (n2 > NT_MAX_PATH ? (wchar_t *) malloc (n2 * sizeof (wchar_t)) : tp.w_get ()); lc_mbstowcs (collate_mbtowc, collate_charset, ws2, s2, n2); /* The sort key is a NUL-terminated byte string. */ ret = LCMapStringW (collate_lcid, LCMAP_SORTKEY, ws2, -1, (PWCHAR) s1, sn); if (n2 > NT_MAX_PATH) free (ws2); if (ret == 0) { if (GetLastError () != ERROR_INSUFFICIENT_BUFFER) set_errno (EINVAL); return sn; } /* LCMapStringW returns byte count including the terminating NUL character. strxfrm is supposed to return length excluding the NUL. */ return ret - 1; } /* Fetch default ANSI codepage from locale info and generate a setlocale compatible character set code. Called from newlib's setlocale(), if the charset isn't given explicitely in the POSIX compatible locale specifier. The function also returns a pointer to the corresponding _mbtowc_r function which is used subsequently. */ extern "C" void __set_charset_from_locale (const char *locale, char *charset) { UINT cp; LCID lcid = __get_lcid_from_locale (locale); /* "C" locale, or invalid locale? */ if (lcid == 0 || lcid == (LCID) -1) { __small_sprintf (charset, "ASCII"); return; } if (!GetLocaleInfoW (lcid, LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER, (PWCHAR) &cp, sizeof cp)) cp = 0; /* codepage to de-facto standard charset transition. 
*/ switch (cp) { case 874: __small_sprintf (charset, "CP%u", cp); break; case 932: strcpy (charset, "EUCJP"); break; case 936: strcpy (charset, "GBK"); break; case 949: strcpy (charset, "EUCKR"); break; case 950: strcpy (charset, "BIG5"); break; case 1250: strcpy (charset, "ISO-8859-2"); break; case 1251: strcpy (charset, "ISO-8859-5"); break; case 1252: strcpy (charset, "ISO-8859-1"); break; case 1253: strcpy (charset, "ISO-8859-7"); break; case 1254: strcpy (charset, "ISO-8859-9"); break; case 1255: strcpy (charset, "ISO-8859-8"); break; case 1256: strcpy (charset, "ISO-8859-6"); break; case 1257: strcpy (charset, "ISO-8859-13"); break; case 1258: default: strcpy (charset, "UTF-8"); break; } if (cp >= 1250 && cp <= 1257) { char *c = strchr (locale, '@'); if (c && !strcmp (c + 1, "euro")) strcpy (charset, "ISO-8859-15"); } }