Conversions to and from utf-8.
This commit is contained in:
351
src/utf8.c
Normal file
351
src/utf8.c
Normal file
@@ -0,0 +1,351 @@
|
||||
/* Charset handling for GNU tar.
|
||||
|
||||
Copyright (C) 2004 Free Software Foundation, Inc.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU General Public License as published by the
|
||||
Free Software Foundation; either version 2, or (at your option) any later
|
||||
version.
|
||||
|
||||
This program is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
|
||||
Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
|
||||
|
||||
#include "system.h"
|
||||
#include <quotearg.h>
|
||||
#include "common.h"
|
||||
#ifdef HAVE_ICONV_H
|
||||
# include <iconv.h>
|
||||
#endif
|
||||
|
||||
struct langtab
|
||||
{
|
||||
char *lang; /* Language code */
|
||||
char *terr; /* Territory code */
|
||||
char *charset; /* Corresponding charset */
|
||||
};
|
||||
|
||||
/* The list of language codes defined in ISO 639 with the corresponding
|
||||
default character sets.
|
||||
|
||||
NOTES:
|
||||
|
||||
1) The list must be ordered by:
|
||||
a) lang field in ascending order
|
||||
b) terr field in descending order.
|
||||
NULL fields are considered less than non-null ones.
|
||||
2) Many entries have NULL charset fields. Please help fill them!
|
||||
3) The "default" character set for a given language is a matter
|
||||
of preference. Possibly the table should contain a *list* of
|
||||
possible character sets.
|
||||
4) LC_ALL "modifier" field is not taken into account */
|
||||
|
||||
static struct langtab langtab[] = {
|
||||
{ "C", NULL, "ASCII"},
|
||||
{ "POSIX", NULL, "ASCII" },
|
||||
{ "aa", NULL, NULL}, /* Afar */
|
||||
{ "ab", NULL, NULL}, /* Abkhazian */
|
||||
{ "ae", NULL, NULL}, /* Avestan */
|
||||
{ "af", NULL, "iso-8859-1"}, /* Afrikaans */
|
||||
{ "am", NULL, "UTF-8"}, /* Amharic */
|
||||
{ "ar", NULL, "iso-8859-6"}, /* Arabic */
|
||||
{ "as", NULL, NULL}, /* Assamese */
|
||||
{ "ay", NULL, "iso-8859-1"}, /* Aymara */
|
||||
{ "az", NULL, NULL}, /* Azerbaijani */
|
||||
{ "ba", NULL, NULL}, /* Bashkir */
|
||||
{ "be", NULL, "UTF-8"}, /* Byelorussian; Belarusian */
|
||||
{ "bg", NULL, "iso-8859-5"}, /* Bulgarian */
|
||||
{ "bh", NULL, NULL}, /* Bihari */
|
||||
{ "bi", NULL, NULL}, /* Bislama */
|
||||
{ "bn", NULL, NULL}, /* Bengali; Bangla */
|
||||
{ "bo", NULL, NULL}, /* Tibetan */
|
||||
{ "br", NULL, "iso-8859-1"}, /* Breton: 1,5,8,9 */
|
||||
{ "bs", NULL, NULL}, /* Bosnian */
|
||||
{ "ca", NULL, "iso-8859-1"}, /* Catalan: 1,5,8,9 */
|
||||
{ "ce", NULL, NULL}, /* Chechen */
|
||||
{ "ch", NULL, NULL}, /* Chamorro */
|
||||
{ "co", NULL, "iso-8859-1"}, /* Corsican */
|
||||
{ "cs", NULL, "iso-8859-2"}, /* Czech */
|
||||
{ "cu", NULL, NULL }, /* Church Slavic */
|
||||
{ "cv", NULL, NULL}, /* Chuvash */
|
||||
{ "cy", NULL, "iso-8859-1"}, /* Welsh */
|
||||
{ "da", NULL, "iso-8859-1"}, /* Danish: 4-9 */
|
||||
{ "de", NULL, "iso-8859-1"}, /* German */
|
||||
{ "dz", NULL, NULL }, /* Dzongkha; Bhutani */
|
||||
{ "el", NULL, "iso-8859-7"}, /* Greek */
|
||||
{ "en", NULL, "iso-8859-1"}, /* English */
|
||||
{ "eo", NULL, "iso-8859-3"}, /* Esperanto */
|
||||
{ "es", NULL, "iso-8859-1"}, /* Spanish */
|
||||
{ "et", NULL, "iso-8859-15"}, /* Estonian: 6,7,9 */
|
||||
{ "eu", NULL, "iso-8859-1"}, /* Basque: 5,8,9 */
|
||||
{ "fa", NULL, "UTF-8"}, /* Persian */
|
||||
{ "fi", NULL, "iso-8859-15"}, /* Finnish */
|
||||
{ "fj", NULL, NULL }, /* Fijian; Fiji */
|
||||
{ "fo", NULL, "iso-8859-1"}, /* Faroese: 6,9 */
|
||||
{ "fr", NULL, "iso-8859-1"}, /* French */
|
||||
{ "fy", NULL, "iso-8859-1"}, /* Frisian */
|
||||
{ "ga", NULL, "iso-8859-14"}, /* Irish */
|
||||
{ "gd", NULL, "iso-8859-14" }, /* Scots; Gaelic */
|
||||
{ "gl", NULL, NULL }, /* Gallegan; Galician */
|
||||
{ "gn", NULL, NULL}, /* Guarani */
|
||||
{ "gu", NULL, NULL}, /* Gujarati */
|
||||
{ "gv", NULL, "iso-8859-14"}, /* Manx */
|
||||
{ "ha", NULL, NULL }, /* Hausa (?) */
|
||||
{ "he", NULL, "iso-8859-8" }, /* Hebrew */
|
||||
{ "hi", NULL, NULL}, /* Hindi */
|
||||
{ "ho", NULL, NULL}, /* Hiri Motu */
|
||||
{ "hr", NULL, "iso-8859-2"}, /* Croatian: 10 */
|
||||
{ "hu", NULL, "iso-8859-2"}, /* Hungarian */
|
||||
{ "hy", NULL, NULL}, /* Armenian */
|
||||
{ "hz", NULL, NULL}, /* Herero */
|
||||
{ "id", NULL, "iso-8859-1"}, /* Indonesian (formerly in) */
|
||||
{ "ia", NULL, NULL}, /* Interlingua */
|
||||
{ "ie", NULL, NULL}, /* Interlingue */
|
||||
{ "ik", NULL, NULL}, /* Inupiak */
|
||||
{ "io", NULL, NULL}, /* Ido */
|
||||
{ "is", NULL, "iso-8859-1"}, /* Icelandic */
|
||||
{ "it", NULL, "iso-8859-1"}, /* Italian */
|
||||
{ "iu", NULL, NULL}, /* Inuktitut */
|
||||
{ "ja", NULL, "EUC-JP"}, /* Japanese */
|
||||
{ "jv", NULL, NULL}, /* Javanese */
|
||||
{ "ka", NULL, NULL}, /* Georgian */
|
||||
{ "ki", NULL, NULL}, /* Kikuyu */
|
||||
{ "kj", NULL, NULL}, /* Kuanyama */
|
||||
{ "kk", NULL, NULL}, /* Kazakh */
|
||||
{ "kl", NULL, "iso-8859-1"}, /* Kalaallisut; Greenlandic */
|
||||
{ "km", NULL, NULL}, /* Khmer; Cambodian */
|
||||
{ "kn", NULL, NULL}, /* Kannada */
|
||||
{ "ko", NULL, "EUC-KR"}, /* Korean */
|
||||
{ "ks", NULL, NULL}, /* Kashmiri */
|
||||
{ "ku", NULL, NULL}, /* Kurdish */
|
||||
{ "kv", NULL, NULL}, /* Komi */
|
||||
{ "kw", NULL, "iso-8859-14"}, /* Cornish: 1,5,8 */
|
||||
{ "ky", NULL, NULL}, /* Kirghiz */
|
||||
{ "la", NULL, "iso-8859-1"}, /* Latin */
|
||||
{ "lb", NULL, "iso-8859-1"}, /* Letzeburgesch */
|
||||
{ "ln", NULL, NULL}, /* Lingala */
|
||||
{ "lo", NULL, NULL}, /* Lao; Laotian */
|
||||
{ "lt", NULL, "iso-8859-4"}, /* Lithuanian */
|
||||
{ "lv", NULL, "iso-8859-4"}, /* Latvian; Lettish */
|
||||
{ "mg", NULL, NULL}, /* Malagasy */
|
||||
{ "mh", NULL, NULL}, /* Marshall */
|
||||
{ "mi", NULL, NULL}, /* Maori */
|
||||
{ "mk", NULL, NULL}, /* Macedonian */
|
||||
{ "ml", NULL, NULL}, /* Malayalam */
|
||||
{ "mn", NULL, NULL}, /* Mongolian */
|
||||
{ "mo", NULL, "iso-8859-2"}, /* Moldavian */
|
||||
{ "mr", NULL, NULL}, /* Marathi */
|
||||
{ "ms", NULL, NULL}, /* Malay */
|
||||
{ "mt", NULL, "iso-8859-3"}, /* Maltese */
|
||||
{ "my", NULL, NULL}, /* Burmese */
|
||||
{ "na", NULL, NULL}, /* Nauru */
|
||||
{ "nb", NULL, "iso-8859-1"}, /* Norwegian Bokm<6B>l; Bokm@aa{}l */
|
||||
{ "nd", NULL, NULL}, /* Ndebele, North */
|
||||
{ "ne", NULL, NULL}, /* Nepali */
|
||||
{ "ng", NULL, NULL}, /* Ndonga */
|
||||
{ "nl", NULL, "iso-8859-1"}, /* Dutch: 5,9 */
|
||||
{ "nn", NULL, "iso-8859-1"}, /* Norwegian Nynorsk */
|
||||
{ "no", NULL, "iso-8859-1"}, /* Norwegian */
|
||||
{ "nr", NULL, NULL}, /* Ndebele, South */
|
||||
{ "nv", NULL, NULL}, /* Navajo */
|
||||
{ "ny", NULL, NULL}, /* Chichewa; Nyanja */
|
||||
{ "oc", NULL, NULL}, /* Occitan; Proven<65>al; Proven@,{c}al */
|
||||
{ "om", NULL, NULL}, /* (Afan) Oromo */
|
||||
{ "or", NULL, NULL}, /* Oriya */
|
||||
{ "os", NULL, NULL}, /* Ossetian; Ossetic */
|
||||
{ "pa", NULL, NULL}, /* Panjabi; Punjabi */
|
||||
{ "pi", NULL, NULL}, /* Pali */
|
||||
{ "pl", NULL, "iso-8859-2"}, /* Polish */
|
||||
{ "ps", NULL, NULL}, /* Pashto, Pushto */
|
||||
{ "pt", NULL, "iso-8859-1"}, /* Portuguese */
|
||||
{ "qu", NULL, "iso-8859-1"}, /* Quechua */
|
||||
{ "rm", NULL, "iso-8859-1"}, /* Rhaeto-Romance */
|
||||
{ "rn", NULL, NULL }, /* Rundi; Kirundi */
|
||||
{ "ro", NULL, "iso-8859-2"}, /* Romanian */
|
||||
{ "ru", NULL, "koi8-r"}, /* Russian */
|
||||
{ "rw", NULL, NULL}, /* Kinyarwanda */
|
||||
{ "sa", NULL, NULL}, /* Sanskrit */
|
||||
{ "sc", NULL, "iso-8859-1"}, /* Sardinian */
|
||||
{ "sd", NULL, NULL}, /* Sindhi */
|
||||
{ "se", NULL, "iso-8859-10"}, /* Northern Sami */
|
||||
{ "sg", NULL, NULL}, /* Sango; Sangro */
|
||||
{ "si", NULL, NULL}, /* Sinhalese */
|
||||
{ "sk", NULL, "iso-8859-2"}, /* Slovak */
|
||||
{ "sl", NULL, "iso-8859-1"}, /* Slovenian */
|
||||
{ "sm", NULL, NULL}, /* Samoan */
|
||||
{ "sn", NULL, NULL}, /* Shona */
|
||||
{ "so", NULL, NULL}, /* Somali */
|
||||
{ "sq", NULL, "iso-8859-1"}, /* Albanian: 2,5,8,9,10 */
|
||||
{ "sr", NULL, "iso-8859-2"}, /* Serbian */
|
||||
{ "ss", NULL, NULL}, /* Swati; Siswati */
|
||||
{ "st", NULL, NULL}, /* Sesotho; Sotho, Southern */
|
||||
{ "su", NULL, NULL}, /* Sundanese */
|
||||
{ "sv", NULL, "iso-8859-1"}, /* Swedish */
|
||||
{ "sw", NULL, NULL}, /* Swahili */
|
||||
{ "ta", NULL, NULL}, /* Tamil */
|
||||
{ "te", NULL, NULL}, /* Telugu */
|
||||
{ "tg", NULL, NULL}, /* Tajik */
|
||||
{ "th", NULL, "iso-8859-11"}, /* Thai */
|
||||
{ "ti", NULL, NULL}, /* Tigrinya */
|
||||
{ "tk", NULL, NULL}, /* Turkmen */
|
||||
{ "tl", NULL, "iso-8859-1"}, /* Tagalog */
|
||||
{ "tn", NULL, NULL}, /* Tswana; Setswana */
|
||||
{ "to", NULL, NULL}, /* Tonga (?) */
|
||||
{ "tr", NULL, "iso-8859-9"}, /* Turkish */
|
||||
{ "ts", NULL, NULL}, /* Tsonga */
|
||||
{ "tt", NULL, NULL}, /* Tatar */
|
||||
{ "tw", NULL, NULL}, /* Twi */
|
||||
{ "ty", NULL, NULL}, /* Tahitian */
|
||||
{ "ug", NULL, NULL}, /* Uighur */
|
||||
{ "uk", NULL, "koi8-u"}, /* Ukrainian */
|
||||
{ "ur", NULL, NULL}, /* Urdu */
|
||||
{ "uz", NULL, NULL}, /* Uzbek */
|
||||
{ "vi", NULL, NULL}, /* Vietnamese */
|
||||
{ "vo", NULL, NULL}, /* Volap<61>k; Volap@"{u}k; Volapuk */
|
||||
{ "wa", NULL, "iso-8859-1"}, /* Walloon */
|
||||
{ "wo", NULL, NULL}, /* Wolof */
|
||||
{ "xh", NULL, NULL}, /* Xhosa */
|
||||
{ "yi", NULL, "iso-8859-8"}, /* Yiddish (formerly ji) */
|
||||
{ "yo", NULL, NULL}, /* Yoruba */
|
||||
{ "za", NULL, NULL}, /* Zhuang */
|
||||
{ "zh", "TW", "big5"}, /* Chinese */
|
||||
{ "zh", NULL, "gb2312"}, /* Chinese */
|
||||
{ "zu", NULL, NULL}, /* Zulu */
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
/* Given the language and (optionally) territory code, return the
|
||||
default character set for that language. See notes above. */
|
||||
|
||||
const char *
|
||||
charset_lookup (char *lang, char *terr)
|
||||
{
|
||||
static struct langtab *p;
|
||||
|
||||
if (!lang)
|
||||
return NULL;
|
||||
for (p = langtab; p->lang; p++)
|
||||
if (strcasecmp (p->lang, lang) == 0
|
||||
&& (terr == NULL
|
||||
|| p->terr == NULL
|
||||
|| !strcasecmp (p->terr, terr) == 0))
|
||||
return p->charset;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static const char *
|
||||
get_input_charset ()
|
||||
{
|
||||
const char *charset = NULL;
|
||||
char *tmp;
|
||||
int rc;
|
||||
|
||||
/* Try to deduce the charset from LC_ALL or LANG variables */
|
||||
|
||||
tmp = getenv ("LC_ALL");
|
||||
if (!tmp)
|
||||
tmp = getenv ("LANG");
|
||||
|
||||
if (tmp)
|
||||
{
|
||||
char *lang;
|
||||
char *terr;
|
||||
|
||||
lang = strtok (tmp, "_");
|
||||
terr = strtok (NULL, ".");
|
||||
charset = strtok (NULL, "@");
|
||||
|
||||
if (!charset)
|
||||
charset = charset_lookup (lang, terr);
|
||||
}
|
||||
|
||||
if (!charset)
|
||||
charset = "iso-8859-1";
|
||||
return charset;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#ifndef HAVE_LIBICONV
|
||||
|
||||
iconv_t
|
||||
iconv_open (const char *tocode, const char *fromcode)
|
||||
{
|
||||
return (iconv_t)(-1);
|
||||
}
|
||||
|
||||
size_t
|
||||
iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t *inbytesleft,
|
||||
char **outbuf, size_t *outbytesleft)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
iconv_close (iconv_t cd)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* !HAVE_LIBICONV */
|
||||
|
||||
|
||||
|
||||
|
||||
static iconv_t conv_desc[2] = { (iconv_t) -1, (iconv_t) -1 };
|
||||
|
||||
static iconv_t
|
||||
utf8_init (bool to_utf)
|
||||
{
|
||||
if (conv_desc[(int) to_utf] == (iconv_t) -1)
|
||||
{
|
||||
if (to_utf)
|
||||
conv_desc[(int) to_utf] = iconv_open ("UTF-8", get_input_charset ());
|
||||
else
|
||||
conv_desc[(int) to_utf] = iconv_open (get_input_charset (), "UTF-8");
|
||||
}
|
||||
return conv_desc[(int) to_utf];
|
||||
}
|
||||
|
||||
bool
|
||||
utf8_convert(bool to_utf, char *input, char **output)
|
||||
{
|
||||
const char *ib;
|
||||
char *ob;
|
||||
size_t inlen;
|
||||
size_t outlen;
|
||||
size_t rc;
|
||||
iconv_t cd = utf8_init(to_utf);
|
||||
|
||||
if (cd == 0)
|
||||
{
|
||||
*output = xstrdup (input);
|
||||
return true;
|
||||
}
|
||||
else if (cd == (iconv_t)-1)
|
||||
return false;
|
||||
|
||||
inlen = strlen (input) + 1;
|
||||
outlen = inlen * MB_LEN_MAX + 1;
|
||||
ob = *output = xmalloc (outlen);
|
||||
ib = input;
|
||||
rc = iconv (cd, &ib, &inlen, &ob, &outlen);
|
||||
*ob = 0;
|
||||
return rc != -1;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
string_ascii_p (const char *str)
|
||||
{
|
||||
const unsigned char *p = (const unsigned char *)str;
|
||||
for (; *p; p++)
|
||||
if (*p > 127)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
Reference in New Issue
Block a user