/* * UTF-8 Utilities * Copyright * (C) 2004 Joseph H. Allen * * This file is part of JOE (Joe's Own Editor) */ #include "types.h" /* Under AmigaOS we have setlocale() but don't have langinfo.h and associated stuff, * so we have to disable the whole piece of code */ #ifdef __amigaos #undef HAVE_SETLOCALE #endif /* Cygwin has CODESET, but it's crummy */ #ifdef __CYGWIN__ #undef HAVE_SETLOCALE #endif /* If it looks old, forget it */ #ifndef CODESET #undef HAVE_SETLOCALE #endif #if defined(HAVE_LOCALE_H) && defined(HAVE_SETLOCALE) # include # include #endif /* nl_langinfo(CODESET) is broken on many systems. If HAVE_SETLOCALE is undefined, JOE uses a limited internal version instead */ /* UTF-8 Encoder * * c is unicode character. * buf is 7 byte buffer- utf-8 coded character is written to this followed by a 0 termination. * returns length (not including terminator). */ int utf8_encode(unsigned char *buf,int c) { if (c < 0x80) { buf[0] = c; buf[1] = 0; return 1; } else if(c < 0x800) { buf[0] = (0xc0|(c>>6)); buf[1] = (0x80|(c&0x3F)); buf[2] = 0; return 2; } else if(c < 0x10000) { buf[0] = (0xe0|(c>>12)); buf[1] = (0x80|((c>>6)&0x3f)); buf[2] = (0x80|((c)&0x3f)); buf[3] = 0; return 3; } else if(c < 0x200000) { buf[0] = (0xf0|(c>>18)); buf[1] = (0x80|((c>>12)&0x3f)); buf[2] = (0x80|((c>>6)&0x3f)); buf[3] = (0x80|((c)&0x3f)); buf[4] = 0; return 4; } else if(c < 0x4000000) { buf[0] = (0xf8|(c>>24)); buf[1] = (0x80|((c>>18)&0x3f)); buf[2] = (0x80|((c>>12)&0x3f)); buf[3] = (0x80|((c>>6)&0x3f)); buf[4] = (0x80|((c)&0x3f)); buf[5] = 0; return 5; } else { buf[0] = (0xfC|(c>>30)); buf[1] = (0x80|((c>>24)&0x3f)); buf[2] = (0x80|((c>>18)&0x3f)); buf[3] = (0x80|((c>>12)&0x3f)); buf[4] = (0x80|((c>>6)&0x3f)); buf[5] = (0x80|((c)&0x3f)); buf[6] = 0; return 6; } } /* UTF-8 Decoder * * Returns 0 - 7FFFFFFF: decoded character * -1: character accepted, nothing decoded yet. * -2: incomplete sequence * -3: no sequence started, but character is between 128 - 191, 254 or 255 */ int utf8_decode(struct utf8_sm *utf8_sm,unsigned char c) { if (utf8_sm->state) { if ((c&0xC0)==0x80) { utf8_sm->buf[utf8_sm->ptr++] = c; --utf8_sm->state; utf8_sm->accu = ((utf8_sm->accu<<6)|(c&0x3F)); if(!utf8_sm->state) return utf8_sm->accu; } else { utf8_sm->state = 0; return -2; } } else if ((c&0xE0)==0xC0) { /* 192 - 223 */ utf8_sm->buf[0] = c; utf8_sm->ptr = 1; utf8_sm->state = 1; utf8_sm->accu = (c&0x1F); } else if ((c&0xF0)==0xE0) { /* 224 - 239 */ utf8_sm->buf[0] = c; utf8_sm->ptr = 1; utf8_sm->state = 2; utf8_sm->accu = (c&0x0F); } else if ((c&0xF8)==0xF0) { /* 240 - 247 */ utf8_sm->buf[0] = c; utf8_sm->ptr = 1; utf8_sm->state = 3; utf8_sm->accu = (c&0x07); } else if ((c&0xFC)==0xF8) { /* 248 - 251 */ utf8_sm->buf[0] = c; utf8_sm->ptr = 1; utf8_sm->state = 4; utf8_sm->accu = (c&0x03); } else if ((c&0xFE)==0xFC) { /* 252 - 253 */ utf8_sm->buf[0] = c; utf8_sm->ptr = 1; utf8_sm->state = 5; utf8_sm->accu = (c&0x01); } else if ((c&0x80)==0x00) { /* 0 - 127 */ utf8_sm->buf[0] = c; utf8_sm->ptr = 1; utf8_sm->state = 0; return c; } else { /* 128 - 191, 254, 255 */ utf8_sm->ptr = 0; utf8_sm->state = 0; return -3; } return -1; } /* Initialize state machine */ void utf8_init(struct utf8_sm *utf8_sm) { utf8_sm->ptr = 0; utf8_sm->state = 0; } /* Decode an entire string */ int utf8_decode_string(unsigned char *s) { struct utf8_sm sm; int x; int c = -1; utf8_init(&sm); for(x=0;s[x];++x) c = utf8_decode(&sm,s[x]); return c; } /* Decode and advance * * Returns: 0 - 7FFFFFFF: decoded character * -2: incomplete sequence * -3: bad start of sequence found. * * p/plen are always advanced in such a way that repeated called to utf8_decode_fwrd do not cause * infinite loops. */ int utf8_decode_fwrd(unsigned char **p,int *plen) { struct utf8_sm sm; unsigned char *s = *p; int len; int c = -2; /* Return this on no more input. */ if (plen) len = *plen; else len = -1; utf8_init(&sm); while (len) { c = utf8_decode(&sm, *s); if (c >= 0) { /* We've got a character */ --len; ++s; break; } else if (c == -2) { /* Bad sequence detected. Caller should feed rest of string in again. */ break; } else if (c == -3) { /* Bad start of UTF-8 sequence. We need to eat this char to avoid infinite loops. */ --len; ++s; /* But we should tell the caller that something bad was found. */ break; } else { /* If c is -1, utf8_decode accepted the character, so we should get the next one. */ --len; ++s; } } if (plen) *plen = len; *p = s; return c; } /* For systems (BSD) with no nl_langinfo(CODESET) */ /* * This is a quick-and-dirty emulator of the nl_langinfo(CODESET) * function defined in the Single Unix Specification for those systems * (FreeBSD, etc.) that don't have one yet. It behaves as if it had * been called after setlocale(LC_CTYPE, ""), that is it looks at * the locale environment variables. * * http://www.opengroup.org/onlinepubs/7908799/xsh/langinfo.h.html * * Please extend it as needed and suggest improvements to the author. * This emulator will hopefully become redundant soon as * nl_langinfo(CODESET) becomes more widely implemented. * * Since the proposed Li18nux encoding name registry is still not mature, * the output follows the MIME registry where possible: * * http://www.iana.org/assignments/character-sets * * A possible autoconf test for the availability of nl_langinfo(CODESET) * can be found in * * http://www.cl.cam.ac.uk/~mgk25/unicode.html#activate * * Markus.Kuhn@cl.cam.ac.uk -- 2002-03-11 * Permission to use, copy, modify, and distribute this software * for any purpose and without fee is hereby granted. The author * disclaims all warranties with regard to this software. * * Latest version: * * http://www.cl.cam.ac.uk/~mgk25/ucs/langinfo.c */ unsigned char *joe_getcodeset(unsigned char *l) { static unsigned char buf[16]; unsigned char *p; if (l || ((l = (unsigned char *)getenv("LC_ALL")) && *l) || ((l = (unsigned char *)getenv("LC_CTYPE")) && *l) || ((l = (unsigned char *)getenv("LANG")) && *l)) { /* check standardized locales */ if (!zcmp(l, USTR "C") || !zcmp(l, USTR "POSIX")) return USTR "ascii"; /* check for encoding name fragment */ if (zstr(l, USTR "UTF") || zstr(l, USTR "utf")) return USTR "UTF-8"; if ((p = zstr(l, USTR "8859-"))) { memcpy((char *)buf, "ISO-8859-\0\0", 12); p += 5; if (*p >= '0' && *p <= '9') { buf[9] = *p++; if (*p >= '0' && *p <= '9') buf[10] = *p++; return buf; } } if (zstr(l, USTR "KOI8-R")) return USTR "KOI8-R"; if (zstr(l, USTR "KOI8-U")) return USTR "KOI8-U"; if (zstr(l, USTR "620")) return USTR "TIS-620"; if (zstr(l, USTR "2312")) return USTR "GB2312"; if (zstr(l, USTR "HKSCS")) return USTR "Big5HKSCS"; /* no MIME charset */ if (zstr(l, USTR "Big5") || zstr(l, USTR "BIG5")) return USTR "Big5"; if (zstr(l, USTR "GBK")) return USTR "GBK"; /* no MIME charset */ if (zstr(l, USTR "18030")) return USTR "GB18030"; /* no MIME charset */ if (zstr(l, USTR "Shift_JIS") || zstr(l, USTR "SJIS")) return USTR "Shift_JIS"; /* check for conclusive modifier */ if (zstr(l, USTR "euro")) return USTR "ISO-8859-15"; /* check for language (and perhaps country) codes */ if (zstr(l, USTR "zh_TW")) return USTR "Big5"; if (zstr(l, USTR "zh_HK")) return USTR "Big5HKSCS"; /* no MIME charset */ if (zstr(l, USTR "zh")) return USTR "GB2312"; if (zstr(l, USTR "ja")) return USTR "EUC-JP"; if (zstr(l, USTR "ko")) return USTR "EUC-KR"; if (zstr(l, USTR "ru")) return USTR "KOI8-R"; if (zstr(l, USTR "uk")) return USTR "KOI8-U"; if (zstr(l, USTR "pl") || zstr(l, USTR "hr") || zstr(l, USTR "hu") || zstr(l, USTR "cs") || zstr(l, USTR "sk") || zstr(l, USTR "sl")) return USTR "ISO-8859-2"; if (zstr(l, USTR "eo") || zstr(l, USTR "mt")) return USTR "ISO-8859-3"; if (zstr(l, USTR "el")) return USTR "ISO-8859-7"; if (zstr(l, USTR "he")) return USTR "ISO-8859-8"; if (zstr(l, USTR "tr")) return USTR "ISO-8859-9"; if (zstr(l, USTR "th")) return USTR "TIS-620"; /* or ISO-8859-11 */ if (zstr(l, USTR "lt")) return USTR "ISO-8859-13"; if (zstr(l, USTR "cy")) return USTR "ISO-8859-14"; if (zstr(l, USTR "ro")) return USTR "ISO-8859-2"; /* or ISO-8859-16 */ if (zstr(l, USTR "am") || zstr(l, USTR "vi")) return USTR "UTF-8"; /* Send me further rules if you like, but don't forget that we are * *only* interested in locale naming conventions on platforms * that do not already provide an nl_langinfo(CODESET) implementation. */ return USTR "ISO-8859-1"; /* should perhaps be "UTF-8" instead */ } return USTR "ascii"; } /* Initialize locale for JOE */ unsigned char *codeset; /* Codeset of terminal */ unsigned char *non_utf8_codeset; /* Codeset of local language non-UTF-8 */ unsigned char *locale_lang; /* Our local language */ unsigned char *locale_msgs; /* Language to use for editor messages */ struct charmap *locale_map; /* Character map of terminal, default map for new files */ struct charmap *locale_map_non_utf8; /* Old, non-utf8 version of locale */ void joe_locale() { unsigned char *s, *t, *u; s=(unsigned char *)getenv("LC_ALL"); if (!s || !*s) { s=(unsigned char *)getenv("LC_MESSAGES"); if (!s || !*s) { s=(unsigned char *)getenv("LANG"); } } if (s) s=zdup(s); else s=USTR "ascii"; if ((t=zrchr(s,'.'))) *t = 0; locale_msgs = s; s=(unsigned char *)getenv("LC_ALL"); if (!s || !*s) { s=(unsigned char *)getenv("LC_CTYPE"); if (!s || !*s) { s=(unsigned char *)getenv("LANG"); } } if (s) s=zdup(s); else s=USTR "ascii"; u = zdup(s); if ((t=zrchr(s,'.'))) *t = 0; locale_lang = s; #ifdef HAVE_SETLOCALE setlocale(LC_ALL,(char *)s); non_utf8_codeset = zdup((unsigned char *)nl_langinfo(CODESET)); #else non_utf8_codeset = joe_getcodeset(s); #endif /* printf("joe_locale\n"); */ #ifdef HAVE_SETLOCALE /* printf("set_locale\n"); */ setlocale(LC_ALL,""); #ifdef ENABLE_NLS /* Set up gettext() */ bindtextdomain(PACKAGE, LOCALEDIR); textdomain(PACKAGE); /* printf("%s %s %s\n",PACKAGE,LOCALEDIR,joe_gettext("New File")); */ #endif codeset = zdup((unsigned char *)nl_langinfo(CODESET)); #else codeset = joe_getcodeset(u); #endif locale_map = find_charmap(codeset); if (!locale_map) locale_map = find_charmap(USTR "ascii"); locale_map_non_utf8 = find_charmap(non_utf8_codeset); if (!locale_map_non_utf8) locale_map_non_utf8 = find_charmap(USTR "ascii"); fdefault.charmap = locale_map; pdefault.charmap = locale_map; /* printf("Character set is %s\n",locale_map->name); for(x=0;x!=128;++x) printf("%x space=%d blank=%d alpha=%d alnum=%d punct=%d print=%d\n", x,joe_isspace(locale_map,x), joe_isblank(locale_map,x), joe_isalpha_(locale_map,x), joe_isalnum_(locale_map,x), joe_ispunct(locale_map,x), joe_isprint(locale_map,x)); */ /* Use iconv() */ #ifdef junk to_utf = iconv_open("UTF-8", (char *)non_utf8_codeset); from_utf = iconv_open((char *)non_utf8_codeset, "UTF-8"); #endif /* For no locale support */ #ifdef junk locale_map = find_charmap("ascii"); fdefault.charmap = locale_map; pdefault.charmap = locale_map; #endif init_gettext(locale_msgs); } void to_utf8(struct charmap *map,unsigned char *s,int c) { int d = to_uni(map,c); if (d==-1) utf8_encode(s,'?'); else utf8_encode(s,d); #ifdef junk /* Iconv() way */ unsigned char buf[10]; unsigned char *ibufp = buf; unsigned char *obufp = s; int ibuf_sz=1; int obuf_sz= 10; buf[0]=c; buf[1]=0; if (to_utf==(iconv_t)-1 || iconv(to_utf,(char **)&ibufp,&ibuf_sz,(char **)&obufp,&obuf_sz)==(size_t)-1) { s[0]='?'; s[1]=0; } else { *obufp = 0; } #endif } int from_utf8(struct charmap *map,unsigned char *s) { int d = utf8_decode_string(s); int c = from_uni(map,d); if (c==-1) return '?'; else return c; #ifdef junk /* Iconv() way */ int ibuf_sz=zlen(s); unsigned char *ibufp=s; int obuf_sz=10; unsigned char obuf[10]; unsigned char *obufp = obuf; if (from_utf==(iconv_t)-1 || iconv(from_utf,(char **)&ibufp,&ibuf_sz,(char **)&obufp,&obuf_sz)==((size_t)-1)) return '?'; else return obuf[0]; #endif } void my_iconv(unsigned char *dest,struct charmap *dest_map, unsigned char *src,struct charmap *src_map) { if (dest_map == src_map) { zcpy (dest, src); return; } if (src_map->type) { /* src is UTF-8 */ if (dest_map->type) { /* UTF-8 to UTF-8? */ zcpy (dest, src); } else { /* UTF-8 to non-UTF-8 */ while (*src) { int len = -1; int c = utf8_decode_fwrd(&src, &len); if (c >= 0) { int d = from_uni(dest_map, c); if (d >= 0) *dest++ = d; else *dest++ = '?'; } else *dest++ = 'X'; } *dest = 0; } } else { /* src is not UTF-8 */ if (!dest_map->type) { /* Non UTF-8 to non-UTF-8 */ while (*src) { int c = to_uni(src_map, *src++); int d; if (c >= 0) { d = from_uni(dest_map, c); if (d >= 0) *dest++ = d; else *dest++ = '?'; } else *dest++ = '?'; } *dest = 0; } else { /* Non-UTF-8 to UTF-8 */ while (*src) { int c = to_uni(src_map, *src++); if (c >= 0) dest += utf8_encode(dest, c); else *dest++ = '?'; } *dest = 0; } } } /* Guess character set */ int guess_non_utf8; int guess_utf8; struct charmap *guess_map(unsigned char *buf, int len) { unsigned char *p; int plen; int c; int flag; /* No info? Use default */ if (!len || (!guess_non_utf8 && !guess_utf8)) return locale_map; /* Does it look like UTF-8? */ p = buf; plen = len; c = 0; flag = 0; while (plen) { /* Break if we could possibly run out of data in the middle of utf-8 sequence */ if (plen < 7) break; if (*p >= 128) flag = 1; c = utf8_decode_fwrd(&p, &plen); if (c < 0) break; } if (flag && c >= 0) { /* There are characters above 128, and there are no utf-8 errors */ if (locale_map->type || !guess_utf8) return locale_map; else return find_charmap(USTR "utf-8"); } if (!flag || !guess_non_utf8) { /* No characters above 128 */ return locale_map; } else { /* Choose non-utf8 version of locale */ return locale_map_non_utf8; } }