* libc/locale/locale.c (loadlocale): Add handling of "@cjknarrow"

modifier on _MB_CAPABLE targets. Add comment to explain. Improve documentation.
2009-06-18 09:13:39 +00:00 · 2009-06-18 09:13:39 +00:00 · e53c92a80e
parent 339dde8fe5
commit e53c92a80e
2 changed files with 73 additions and 27 deletions
--- a/newlib/ChangeLog
+++ b/newlib/ChangeLog
@ -1,3 +1,9 @@
 2009-06-18  Corinna Vinschen  <corinna@vinschen.de>
 	* libc/locale/locale.c (loadlocale): Add handling of "@cjknarrow"
 	modifier on _MB_CAPABLE targets.  Add comment to explain.  Improve
 	documentation.
 2009-06-17  Michael Eager <eager@eagercon.com>
 	* libc/include/pthread.h: Support XMK (Xilinx) BSP, add RTEMS to
--- a/newlib/libc/locale/locale.c
+++ b/newlib/libc/locale/locale.c
@ -44,29 +44,49 @@ locale.
 This is a minimal implementation, supporting only the required <<"POSIX">>
 and <<"C">> values for <[locale]>; strings representing other locales are not
-honored unless _MB_CAPABLE is defined in which case POSIX locale strings
+honored unless _MB_CAPABLE is defined.
-are allowed, plus five extensions supported for backward compatibility with
+
-older implementations using newlib: <<"C-UTF-8">>, <<"C-JIS">>,
+If _MB_CAPABLE is defined, POSIX locale strings are allowed, following
-<<"C-EUCJP">>/<<"C-eucJP">>, <<"C-SJIS">>, <<"C-ISO-8859-x">> with
+the form
-1 <= x <= 15, or <<"C-CPxxx">> with xxx in [437, 720, 737, 775, 850, 852,
+
-855, 857, 858, 862, 866, 874, 1125, 1250, 1251, 1252, 1253, 1254, 1255, 1256,
+  language[_TERRITORY][.charset][@@modifier]
-1257, 1258].  Even when using POSIX locale strings, the only charsets allowed
+
-are <<"UTF-8">>, <<"JIS">>, <<"EUCJP">>/<<"eucJP">>, <<"SJIS">>,
+<<"language">> is a two character string per ISO 639.  <<"TERRITORY">> is a
-<<"ISO-8859-x">> with 1 <= x <= 15, or <<"CPxxx">> with xxx in [437, 720,
+country code per ISO 3166.  For <<"charset">> and <<"modifier">> see below.
-737, 775, 850, 852, 855, 857, 858, 862, 866, 874, 1125, 1250, 1251, 1252,
+
-1253, 1254, 1255, 1256, 1257, 1258]. 
+In addition to the POSIX specifier, five extensions are supported for
 backward compatibility with older implementations using newlib:
 <<"C-UTF-8">>, <<"C-JIS">>, <<"C-EUCJP">>/<<"C-eucJP">>, <<"C-SJIS">>,
 <<"C-ISO-8859-x">> with 1 <= x <= 15, or <<"C-CPxxx">> with xxx in [437,
 720, 737, 775, 850, 852, 855, 857, 858, 862, 866, 874, 1125, 1250, 1251,
 1252, 1253, 1254, 1255, 1256, 1257, 1258].
 Even when using POSIX locale strings, the only charsets allowed are
 <<"UTF-8">>, <<"JIS">>, <<"EUCJP">>/<<"eucJP">>, <<"SJIS">>, <<"ISO-8859-x">>
 with 1 <= x <= 15, or <<"CPxxx">> with xxx in [437, 720, 737, 775, 850,
 852, 855, 857, 858, 862, 866, 874, 1125, 1250, 1251, 1252, 1253, 1254,
 1255, 1256, 1257, 1258]. 
 (<<"">> is also accepted; if given, the settings are read from the
 corresponding LC_* environment variables and $LANG according to POSIX rules.)
 Under Cygwin, this implementation additionally supports the charsets
 <<"GBK">>, <<"eucKR">>, and <<"Big5">>.
-If you use <<NULL>> as the <[locale]> argument, <<setlocale>> returns
+This implementation also supports a single modifier, <<"cjknarrow">>.
-a pointer to the string representing the current locale (always
+Any other modifier is ignored.  <<"cjknarrow">>, in conjunction with one
-<<"C">> in this implementation).  The acceptable values for
+of the language specifiers <<"ja">>, <<"ko">>, and <<"zh">> specifies
-<[category]> are defined in `<<locale.h>>' as macros beginning with
+how the functions <<wcwidth>> and <<wcswidth>> handle characters from
-<<"LC_">>, but this implementation does not check the values you pass
+the "CJK Ambiguous Width" character class described in
-in the <[category]> argument.
+http://www.unicode.org/unicode/reports/tr11/.  Usually these characters
 have a width of 1, unless you specify one of the aforementioned
 languages, in which case these characters have a width of 2.  By
 specifying the <<"cjknarrow">> modifier, these characters will have a
 width of one in the languages <<"ja">>, <<"ko">>, and <<"zh">> as well.
 If you use <<NULL>> as the <[locale]> argument, <<setlocale>> returns a
 pointer to the string representing the current locale.  The acceptable
 values for <[category]> are defined in `<<locale.h>>' as macros
 beginning with <<"LC_">>.
 <<localeconv>> returns a pointer to a structure (also defined in
 `<<locale.h>>') describing the locale-specific conventions currently
@ -399,6 +419,9 @@ loadlocale(struct _reent *p, int category)
  int (*l_wctomb) (struct _reent *, char *, wchar_t, const char *, mbstate_t *);
  int (*l_mbtowc) (struct _reent *, wchar_t *, const char *, size_t,
 		   const char *, mbstate_t *);
 #ifdef _MB_CAPABLE
  int cjknarrow = 0;
 #endif
  /* "POSIX" is translated to "C", as on Linux. */
  if (!strcmp (locale, "POSIX"))
@ -429,10 +452,14 @@ loadlocale(struct _reent *p, int category)
      if (c[0] == '.')
 	{
 	  /* Charset */
-	  strcpy (charset, c + 1);
+	  char *chp;
-	  if ((c = strchr (charset, '@')))
+
 	  ++c;
 	  strcpy (charset, c);
 	  if ((chp = strchr (charset, '@')))
 	    /* Strip off modifier */
-	    *c = '\0';
+	    *chp = '\0';
 	  c += strlen (charset);
 	}
      else if (c[0] == '\0' || c[0] == '@')
 	/* End of string or just a modifier */
@ -444,6 +471,17 @@ loadlocale(struct _reent *p, int category)
      else
 	/* Invalid string */
      	return NULL;
 #ifdef _MB_CAPABLE
      if (c[0] == '@')
 	{
 	  /* Modifier */
 	  /* Only one modifier is recognized right now.  "cjknarrow" is used
 	     to modify the behaviour of wcwidth() for East Asian languages.
 	     For details see the comment at the end of this function. */
 	  if (!strcmp (c + 1, "cjknarrow"))
 	    cjknarrow = 1;
 	}
 #endif
    }
  /* We only support this subset of charsets. */
  switch (charset[0])
@ -606,13 +644,15 @@ loadlocale(struct _reent *p, int category)
      __mbtowc = l_mbtowc;
      __set_ctype (charset);
      /* Check for the language part of the locale specifier.  In case
-         of "ja", "ko", or "zh", assume the use of CJK fonts.  This is
+         of "ja", "ko", or "zh", assume the use of CJK fonts, unless the
-	 stored in lc_ctype_cjk_lang and tested in wcwidth() to figure
+	 "@cjknarrow" modifier has been specified.
-	 out the width to return (1 or 2) for the "CJK Ambiguous Width"
+	 The result is stored in lc_ctype_cjk_lang and tested in wcwidth()
-	 category of characters. */
+	 to figure out the width to return (1 or 2) for the "CJK Ambiguous
-      lc_ctype_cjk_lang = (strncmp (locale, "ja", 2) == 0
+	 Width" category of characters. */
-			   || strncmp (locale, "ko", 2) == 0
+      lc_ctype_cjk_lang = !cjknarrow
-			   || strncmp (locale, "zh", 2) == 0);
+			  && ((strncmp (locale, "ja", 2) == 0
 			      || strncmp (locale, "ko", 2) == 0
 			      || strncmp (locale, "zh", 2) == 0));
 #endif
    }
  else if (category == LC_MESSAGES)