#include #include #include #include #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "iconv.h" #include "wv.h" extern TokenTable s_Tokens[]; int (*wvConvertUnicodeToEntity) (U16 char16) = NULL; /* i hate iconv - compilers treat its prototype differently */ #if !defined(WIN32) || !defined(_WIN32) #define wv_iconv(a,b,c,d,e) iconv(a, (ICONV_CONST char**)b,c,(char**)d,e) #else #define wv_iconv(a,b,c,d,e) iconv(a,b,c,(char**)d,e) #endif U16 wvnLocaleToLIDConverter (U8 nLocale) { switch (nLocale) { #if 0 /* case 0: */ /* ANSI_CHARSET */ /* case 1: */ /* DEFAULT_CHARSET */ /* case 2: */ /* SYMBOL_CHARSET */ #endif case 77: /* MAC_CHARSET */ return (0xFFF); /* This number is a hack */ case 128: /* SHIFTJIS_CHARSET */ return (0x411); /* Japanese */ case 129: /* HANGEUL_CHARSET */ return (0x412); /* Korean */ case 130: /* JOHAB_CHARSET */ return (0x812); /* Korean (Johab) */ case 134: /* GB2312_CHARSET - Chinese Simplified */ return (0x804); /* China PRC - And others!! */ case 136: /* CHINESEBIG5_CHARSET - Chinese Traditional */ return (0x404); /* Taiwan - And others!! */ case 161: /* GREEK_CHARSET */ return (0x408); /* Greek */ case 162: /* TURKISH_CHARSET */ return (0x41f); /* Turkish */ case 163: /* VIETNAMESE_CHARSET */ return (0x42a); /* Vietnamese */ case 177: /* HEBREW_CHARSET */ return (0x40d); /* Hebrew */ case 178: /* ARABIC_CHARSET */ return (0x01); /* Arabic */ case 186: /* BALTIC_CHARSET */ return (0x425); /* Estonian - And others!! */ case 204: /* RUSSIAN_CHARSET */ return (0x419); /* Russian - And others!! */ case 222: /* THAI_CHARSET */ return (0x41e); /* Thai */ case 238: /* EASTEUROPE_CHARSET */ return (0x405); /* Czech - And many others!! 
*/ #if 0 /* case 255: */ /* OEM_CHARSET */ #endif default: return (0x0); } return (0x0); } int wvOutputTextChar (U16 eachchar, U8 chartype, wvParseStruct * ps, CHP * achp) { U16 lid = 0; wvVersion v = wvQuerySupported (&ps->fib, NULL); /* testing adding a language */ /* For version <= WORD7, The charset used could * depend on the font's charset. */ if ((v <= WORD7) && (!ps->fib.fFarEast)) { FFN currentfont; if (ps->fonts.ffn == NULL) { lid = 0; } else { currentfont = ps->fonts.ffn[achp->ftc]; /* Return 0 if no match */ lid = wvnLocaleToLIDConverter (currentfont.chs); } } if (!lid) lid = achp->lidDefault; /* No lidDefault for ver < WORD6 */ if (lid == 0x400 || lid == 0) lid = ps->fib.lid; /* end testing adding a language */ if (achp->fSpec) { /* if the character is still one of the special ones then call this other handler instead */ if (ps->scharhandler) return ((*(ps->scharhandler)) (ps, eachchar, achp)); } else { /* Most Chars go through this baby */ if (ps->charhandler) { if (!((v == WORD7 || v == WORD6) && ps->fib.fFarEast)) if (v <= WORD7) { /* versions <= 7 do not use unicode. 
versions >= 8 always do */ /* versions 7 and 6 use unicode iff the far-east flag is set */ chartype = 1; } return ((*(ps->charhandler)) (ps, eachchar, chartype, lid)); } } wvError (("No CharHandler registered, programmer error\n")); return (0); } void wvOutputHtmlChar (U16 eachchar, U8 chartype, char *outputtype, U16 lid) { if (chartype) eachchar = wvHandleCodePage (eachchar, lid); wvOutputFromUnicode (eachchar, outputtype); } #define CPNAME_OR_FALLBACK(name,fallbackname) \ { \ static char* cpname = NULL; \ if (!cpname) \ { \ iconv_t cd = iconv_open(name,name); \ if (cd==(iconv_t)-1) \ { \ cpname = fallbackname; \ } \ else \ { \ cpname = name; \ iconv_close(cd); \ } \ }; \ return cpname; \ } typedef struct { const char * language_tag ; U16 lid ; } wvLanguageId ; static const wvLanguageId mLanguageIds[] = { { "-none-", 0x0000 }, /* none (language neutral) */ { "-none-", 0x0400 }, /* none */ { "af-ZA", 0x0436 }, /* Afrikaans */ { "am", 0x045e }, /* Amharic */ { "al-AL", 0x041c }, /* Albanian */ { "ar-SA", 0x0401 }, /* Arabic (Saudi) */ { "ar-IQ", 0x0801 }, /* Arabic (Iraq) */ { "ar-EG", 0x0c01 }, /* Arabic (Egypt) */ { "ar-LY", 0x1001 }, /* Arabic (Libya) */ { "ar-DZ", 0x1401 }, /* Arabic (Algeria) */ { "ar-MA", 0x1801 }, /* Arabic (Morocco) */ { "ar-TN", 0x1c01 }, /* Arabic (Tunisia) */ { "ar-OM", 0x2001 }, /* Arabic (Oman) */ { "ar-YE", 0x2401 }, /* Arabic (Yemen) */ { "ar-SY", 0x2801 }, /* Arabic (Syria) */ { "ar-JO", 0x2c01 }, /* Arabic (Jordan) */ { "ar-LB", 0x3001 }, /* Arabic (Lebanon) */ { "ar-KW", 0x3401 }, /* Arabic (Kuwait) */ { "ar-AE", 0x3801 }, /* Arabic (United Arab Emirates) */ { "ar-BH", 0x3c01 }, /* Arabic (Bahrain) */ { "ar-QA", 0x4001 }, /* Arabic (Qatar) */ { "as", 0x044d }, /* Assamese */ { "az", 0x042c }, /* Azerbaijani */ { "hy-AM", 0x042b }, /* Armenian */ { "az", 0x044c }, /* Azeri (Latin) az- */ { "az", 0x082c }, /* Azeri (Cyrillic) az- */ { "eu-ES", 0x042d }, /* Basque */ { "be-BY", 0x0423 }, /* Belarussian */ { "bn", 0x0445 }, /* Bengali 
bn- */
    { "bg-BG", 0x0402 }, /* Bulgarian */
    { "ca-ES", 0x0403 }, /* Catalan */
    { "zh-TW", 0x0404 }, /* Chinese (Taiwan) */
    { "zh-CN", 0x0804 }, /* Chinese (PRC) */
    { "zh-HK", 0x0c04 }, /* Chinese (Hong Kong) */
    { "zh-SG", 0x1004 }, /* Chinese (Singapore) */
    { "zh-MO", 0x1404 }, /* Chinese (Macau SAR); tag was misspelled "ch-MO" */
    { "hr-HR", 0x041a }, /* Croatian */
    { "cs-CZ", 0x0405 }, /* Czech */
    { "da-DK", 0x0406 }, /* Danish */
    { "div", 0x465 }, /* Divehi div-*/
    { "nl-NL", 0x0413 }, /* Dutch (Netherlands) */
    { "nl-BE", 0x0813 }, /* Dutch (Belgium) */
    { "en-US", 0x0409 }, /* English (USA) */
    { "en-GB", 0x0809 }, /* English (UK) */
    { "en-AU", 0x0c09 }, /* English (Australia) */
    { "en-CA", 0x1009 }, /* English (Canada) */
    { "en-NZ", 0x1409 }, /* English (New Zealand) */
    { "en-IE", 0x1809 }, /* English (Ireland) */
    { "en-ZA", 0x1c09 }, /* English (South Africa) */
    { "en-JM", 0x2009 }, /* English (Jamaica) */
    { "en", 0x2409 }, /* English (Caribbean) */
    { "en-BZ", 0x2809 }, /* English (Belize) */
    { "en-TT", 0x2c09 }, /* English (Trinidad) */
    { "en-ZW", 0x3009 }, /* English (Zimbabwe) */
    { "en-PH", 0x3409 }, /* English (Philippines) */
    { "et-EE", 0x0425 }, /* Estonian */
    { "fo", 0x0438 }, /* Faeroese fo- */
    { "fa-IR", 0x0429 }, /* Farsi */
    { "fi-FI", 0x040b }, /* Finnish */
    { "fr-FR", 0x040c }, /* French (France) */
    { "fr-BE", 0x080c }, /* French (Belgium) */
    { "fr-CA", 0x0c0c }, /* French (Canada) */
    { "fr-CH", 0x100c }, /* French (Switzerland) */
    { "fr-LU", 0x140c }, /* French (Luxembourg) */
    { "fr-MC", 0x180c }, /* French (Monaco) */
    { "gl", 0x0456 }, /* Galician gl- */
    { "ga-IE", 0x083c }, /* Irish Gaelic */
    { "gd-GB", 0x043c }, /* Scottish Gaelic; was 0x100c, which is the fr-CH LID two rows up */
    { "ka-GE", 0x0437 }, /* Georgian */
    { "de-DE", 0x0407 }, /* German (Germany) */
    { "de-CH", 0x0807 }, /* German (Switzerland) */
    { "de-AT", 0x0c07 }, /* German (Austria) */
    { "de-LU", 0x1007 }, /* German (Luxembourg) */
    { "de-LI", 0x1407 }, /* German (Liechtenstein) */
    { "el-GR", 0x0408 }, /* Greek */
    { "gu", 0x0447 }, /* Gujarati gu- */
    { "ha", 0x0468 }, /* Hausa */
    { "he-IL", 0x040d }, /* Hebrew */
    { "hi-IN", 0x0439 }, /* Hindi */
    { "hu-HU", 0x040e }, /* Hungarian */
    { "is-IS", 0x040f }, /* Icelandic */
    { "id-ID", 0x0421 }, /* Indonesian */
    { "iu", 0x045d }, /* Inuktitut */
    { "it-IT", 0x0410 }, /* Italian (Italy) */
    { "it-CH", 0x0810 }, /* Italian (Switzerland) */
    { "ja-JP", 0x0411}, /* Japanese */
    { "kn", 0x044b }, /* Kannada kn- */
    { "ks", 0x0860 }, /* Kashmiri (India) ks- */
    { "kk", 0x043f }, /* Kazakh kk- */
    { "kok", 0x0457 }, /* Konkani kok- */
    { "ko-KR", 0x0412 }, /* Korean */
    { "ko", 0x0812 }, /* Korean (Johab) ko- */
    { "kir", 0x0440 }, /* Kyrgyz */
    { "la", 0x0476 }, /* Latin */
    { "lo", 0x0454 }, /* Laothian */
    { "lv-LV", 0x0426 }, /* Latvian */
    { "lt-LT", 0x0427 }, /* Lithuanian */
    { "lt-LT", 0x0827 }, /* Lithuanian (Classic) */
    { "mk", 0x042f }, /* FYRO Macedonian */
    { "my-MY", 0x043e }, /* Malaysian */
    { "my-BN", 0x083e }, /* Malay Brunei Darussalam */
    { "ml", 0x044c }, /* Malayalam ml- */
    { "mr", 0x044e }, /* Marathi mr- */
    { "mt", 0x043a }, /* Maltese */
    { "mo", 0x0450 }, /* Mongolian */
    { "ne-NP", 0x0461 }, /* Nepali (Nepal) */
    { "ne-IN", 0x0861 }, /* Nepali (India) */
    { "nb-NO", 0x0414 }, /* Norwegian (Bokmal) */
    { "nn-NO", 0x0814 }, /* Norwegian (Nynorsk) */
    { "or", 0x0448 }, /* Oriya or- */
    { "om", 0x0472 }, /* Oromo (Afan, Galla) */
    { "pl-PL", 0x0415 }, /* Polish */
    { "pt-BR", 0x0416 }, /* Portuguese (Brazil) */
    { "pt-PT", 0x0816 }, /* Portuguese (Portugal) */
    { "pa", 0x0446 }, /* Punjabi pa- */
    { "ps", 0x0463 }, /* Pashto (Pushto) */
    { "rm", 0x0417 }, /* Rhaeto-Romanic rm- */
    { "ro-RO", 0x0418 }, /* Romanian */
    { "ro-MD", 0x0818 }, /* Romanian (Moldova) */
    { "ru-RU", 0x0419 }, /* Russian */
    { "ru-MD", 0x0819 }, /* Russian (Moldova) */
    { "se", 0x043b }, /* Sami (Lappish) se- */
    { "sa", 0x044f }, /* Sanskrit sa- */
    { "sr", 0x0c1a }, /* Serbian (Cyrillic) sr- */
    { "sr", 0x081a }, /* Serbian (Latin) sr- */
    { "sd", 0x0459 }, /* Sindhi sd- */
    { "sk-SK", 0x041b }, /* Slovak */
    { "sl-SI", 0x0424 }, /* Slovenian */
    { "wen", 0x042e }, /* Sorbian wen- */
    { "so", 0x0477 }, /* Somali */
    { "es-ES", 0x040a }, /* Spanish (Spain, Traditional) */
    { "es-MX", 0x080a }, /* Spanish (Mexico) */
    { "es-ES", 0x0c0a }, /* Spanish (Modern) */
    { "es-GT", 0x100a }, /* Spanish (Guatemala) */
    { "es-CR", 0x140a }, /* Spanish (Costa Rica) */
    { "es-PA", 0x180a }, /* Spanish (Panama) */
    { "es-DO", 0x1c0a }, /* Spanish (Dominican Republic) */
    { "es-VE", 0x200a }, /* Spanish (Venezuela) */
    { "es-CO", 0x240a }, /* Spanish (Colombia) */
    { "es-PE", 0x280a }, /* Spanish (Peru) */
    { "es-AR", 0x2c0a }, /* Spanish (Argentina) */
    { "es-EC", 0x300a }, /* Spanish (Ecuador) */
    { "es-CL", 0x340a }, /* Spanish (Chile) */
    { "es-UY", 0x380a }, /* Spanish (Uruguay) */
    { "es-PY", 0x3c0a }, /* Spanish (Paraguay) */
    { "es-BO", 0x400a }, /* Spanish (Bolivia) */
    { "es-SV", 0x440a }, /* Spanish (El Salvador) */
    { "es-HN", 0x480a }, /* Spanish (Honduras) */
    { "es-NI", 0x4c0a }, /* Spanish (Nicaragua) */
    { "es-PR", 0x500a }, /* Spanish (Puerto Rico) */
    { "sx", 0x0430 }, /* Sutu */
    { "sw", 0x0441 }, /* Swahili (Kiswahili/Kenya) */
    { "sv-SE", 0x041d }, /* Swedish */
    { "sv-FI", 0x081d }, /* Swedish (Finland) */
    { "ta", 0x0449 }, /* Tamil ta- */
    { "tt", 0x0444 }, /* Tatar (Tatarstan) tt- */
    { "te", 0x044a }, /* Telugu te- */
    { "th-TH", 0x041e }, /* Thai */
    { "ts", 0x0431 }, /* Tsonga ts- */
    { "tn", 0x0432 }, /* Tswana tn- */
    { "tr-TR", 0x041f }, /* Turkish */
    { "tl", 0x0464 }, /* Tagalog */
    { "tg", 0x0428 }, /* Tajik */
    { "bo", 0x0451 }, /* Tibetan */
    { "ti", 0x0473 }, /* Tigrinya */
    { "uk-UA", 0x0422 }, /* Ukrainian */
    { "ur-PK", 0x0420 }, /* Urdu (Pakistan) */
    { "ur-IN", 0x0820 }, /* Urdu (India) */
    { "uz", 0x0443 }, /* Uzbek (Latin) uz- */
    { "uz", 0x0843 }, /* Uzbek (Cyrillic) uz- */
    { "ven", 0x0433 }, /* Venda ven- */
    { "vi-VN", 0x042a }, /* Vietnamese */
    { "cy-GB", 0x0452 }, /* Welsh */
    { "xh", 0x0434 }, /* Xhosa xh */
    { "yi", 0x043d }, /* Yiddish yi- */
    { "yo", 0x046a }, /* Yoruba */
    { "zu", 0x0435
}, /* Zulu zu- */ { "en-US", 0x0800 } /* Default */ }; #define NrMappings (sizeof(mLanguageIds)/sizeof(mLanguageIds[0])) U16 wvLangToLIDConverter ( const char * lang ) { unsigned int i = 0 ; if (!lang) return 0x0400; /* return -none- */ for ( i = 0 ; i < NrMappings ; i++ ) if (!strcmp (lang, mLanguageIds[i].language_tag)) return mLanguageIds[i].lid ; return 0x0400 ; /* return -none- */ } const char * wvLIDToLangConverter (U16 lid) { unsigned int i = 0 ; if ( lid == 0 ) /* language netural */ return "-none-" ; for ( i = 0 ; i < NrMappings ; i++ ) if ( mLanguageIds[i].lid == lid ) return mLanguageIds[i].language_tag ; return "-none-"; /* default */ } const char * wvLIDToCodePageConverter (U16 lid) { if (lid == 0x0FFF) /*Macintosh Hack */ return ("MACINTOSH"); switch (lid & 0xff) { case 0x01: /*Arabic */ return ("CP1256"); case 0x02: /*Bulgarian */ return ("CP1251"); case 0x03: /*Catalan */ return ("CP1252"); case 0x04: /*Chinese */ switch (lid) { #if 0 case 0x1404: /*Chinese (Macau SAR) */ #endif case 0x0c04: /*Chinese (Hong Kong SAR, PRC) */ CPNAME_OR_FALLBACK ("CP950", "BIG5-HKSCS"); case 0x0804: /*Chinese (PRC) */ CPNAME_OR_FALLBACK ("CP936", "GBK"); #if 0 case 0x1004: /*Chinese (Singapore) */ #endif case 0x0404: /*Chinese (Taiwan) */ CPNAME_OR_FALLBACK ("CP950", "BIG5"); } case 0x05: /*Czech */ return ("CP1250"); case 0x06: /*Danish */ return ("CP1252"); case 0x07: /*German */ return ("CP1252"); case 0x08: /*Greek */ return ("CP1253"); case 0x09: /*English */ return ("CP1252"); case 0x0a: /*Spanish */ return ("CP1252"); case 0x0b: /*Finnish */ return ("CP1252"); case 0x0c: /*French */ return ("CP1252"); case 0x0d: /*Hebrew */ return ("CP1255"); case 0x0e: /*Hungarian */ return ("CP1250"); case 0x0f: /*Icelandic */ return ("CP1252"); case 0x10: /*Italian */ return ("CP1252"); case 0x11: /*Japanese */ return ("CP932"); case 0x12: /*Korean */ switch (lid) { case 0x0812: /*Korean (Johab) */ return ("CP1361"); case 0x0412: /*Korean */ return ("CP949"); } case 0x13: 
/*Dutch */ return ("CP1252"); case 0x14: /*Norwegian */ return ("CP1252"); case 0x15: /*Polish */ return ("CP1250"); case 0x16: /*Portuguese */ return ("CP1252"); case 0x17: /*Rhaeto-Romanic */ return ("CP1252"); case 0x18: /*Romanian */ return ("CP1250"); case 0x19: /*Russian */ return ("CP1251"); case 0x1a: /*Serbian, Croatian, (Bosnian?) */ switch (lid) { case 0x041a: /*Croatian */ return ("CP1252"); case 0x0c1a: /*Serbian (Cyrillic) */ return ("CP1251"); case 0x081a: /*Serbian (Latin) */ return ("CP1252"); } case 0x1b: /*Slovak */ return ("CP1250"); case 0x1c: /*Albanian */ return ("CP1251"); case 0x1d: /*Swedish */ return ("CP1252"); case 0x1e: /*Thai */ return ("CP874"); case 0x1f: /*Turkish */ return ("CP1254"); case 0x20: /*Urdu. This is Unicode only. */ return ("0"); case 0x21: /*Bahasa Indonesian */ return ("CP1252"); case 0x22: /*Ukrainian */ return ("CP1251"); case 0x23: /*Byelorussian / Belarusian */ return ("CP1251"); case 0x24: /*Slovenian */ return ("CP1250"); case 0x25: /*Estonian */ return ("CP1257"); case 0x26: /*Latvian */ return ("CP1257"); case 0x27: /*Lithuanian */ return ("CP1257"); case 0x29: /*Farsi / Persian. This is Unicode only. */ return ("0"); case 0x2a: /*Vietnamese */ return ("CP1258"); case 0x2b: /*Windows 2000: Armenian. This is Unicode only. */ return ("CP0"); case 0x2c: /*Azeri */ switch (lid) { case 0x082c: /*Azeri (Cyrillic) */ return ("CP1251"); #if 0 case 0x042c: /*Azeri (Latin) */ #endif } case 0x2d: /*Basque */ return ("CP1252"); case 0x2f: /*Macedonian */ return ("CP1251"); #if 0 case 0x30: /*Sutu */ #endif case 0x36: /*Afrikaans */ return ("CP1252"); case 0x37: /*Windows 2000: Georgian. This is Unicode only. */ return ("CP0"); case 0x38: /*Faeroese */ return ("CP1252"); case 0x39: /*Windows 2000: Hindi. This is Unicode only. 
*/ return ("CP0"); case 0x3E: /*Malaysian / Malay */ return ("CP1252"); #if 0 case 0x3f: /*Kazakh */ #endif case 0x41: /*Swahili */ return ("CP1252"); case 0x43: /*Uzbek */ switch (lid) { case 0x0843: /*Uzbek (Cyrillic) */ return ("CP1251"); #if 0 case 0x0443: /*Uzbek (Latin) */ #endif } #if 0 case 0x44: /*Tatar */ #endif case 0x45: /*Windows 2000: Bengali. This is Unicode only. */ case 0x46: /*Windows 2000: Punjabi. This is Unicode only. */ case 0x47: /*Windows 2000: Gujarati. This is Unicode only. */ case 0x48: /*Windows 2000: Oriya. This is Unicode only. */ case 0x49: /*Windows 2000: Tamil. This is Unicode only. */ case 0x4a: /*Windows 2000: Telugu. This is Unicode only. */ case 0x4b: /*Windows 2000: Kannada. This is Unicode only. */ case 0x4c: /*Windows 2000: Malayalam. This is Unicode only. */ case 0x4d: /*Windows 2000: Assamese. This is Unicode only. */ case 0x4e: /*Windows 2000: Marathi. This is Unicode only. */ case 0x4f: /*Windows 2000: Sanskrit. This is Unicode only. */ return ("CP0"); case 0x55: /*Myanmar / Burmese. This is Unicode only. */ return ("CP0"); case 0x57: /*Windows 2000: Konkani. This is Unicode only. */ return ("CP0"); #if 0 case 0x58: /*Manipuri */ case 0x59: /*Sindhi */ case 0x60: /*Kashmiri (India) */ #endif case 0x61: /*Windows 2000: Nepali (India). This is Unicode only. 
*/ return ("CP0"); }; /* TODO output a warning since this is a guess */ return ("CP1252"); } static U32 swap_iconv (U16 lid) { iconv_t handle = NULL; char f_code[33]; /* From CCSID */ char t_code[33]; /* To CCSID */ const char *codepage = NULL; size_t ibuflen, obuflen; U8 buffer[2]; U8 buffer2[2]; U8 *ibuf, *obuf; /* do a bit of caching */ static U16 lastlid = -1; static U32 ret = -1; /* shortcut */ if (ret != -1 && lastlid == lid) return ret; ibuf = buffer; obuf = buffer2; lastlid = lid; codepage = wvLIDToCodePageConverter (lid); memset (f_code, '\0', 33); memset (t_code, '\0', 33); strcpy (f_code, codepage); strcpy (t_code, "UCS-2"); handle = iconv_open (t_code, f_code); if (handle == (iconv_t)-1) return 0; buffer[0] = 0x20 & 0xff; buffer[1] = 0; ibuflen = obuflen = 2; wv_iconv (handle, &ibuf, &ibuflen, &obuf, &obuflen); iconv_close (handle); ret = *(U16 *) buffer2 != 0x20; return ret; } U16 wvHandleCodePage (U16 eachchar, U16 lid) { char f_code[33]; /* From CCSID */ char t_code[33]; /* To CCSID */ const char *codepage; iconv_t iconv_handle; /* Conversion Descriptor returned */ /* from iconv_open() function */ size_t ibuflen; /* Length of input buffer */ size_t obuflen; /* Length of output buffer */ U8 *ibuf; U8 *obuf; /* Buffer for converted characters */ U8 *p; U8 buffer[2]; U8 buffer2[2]; U16 rtn; if (eachchar > 0xff) { buffer[0] = (char) (eachchar >> 8); buffer[1] = (char) eachchar & 0xff; } else { buffer[0] = eachchar & 0xff; buffer[1] = 0; } ibuf = buffer; obuf = buffer2; codepage = wvLIDToCodePageConverter (lid); /* All reserved positions of from code (last 12 characters) and to code */ /* (last 19 characters) must be set to hexadecimal zeros. 
*/ memset (f_code, '\0', 33); memset (t_code, '\0', 33); strcpy (f_code, codepage); strcpy (t_code, "UCS-2"); iconv_handle = iconv_open (t_code, f_code); if (iconv_handle == (iconv_t) - 1) { wvError ( ("iconv_open fail: %d, cannot convert %s to unicode\n", errno, codepage)); return ('?'); } ibuflen = obuflen = 2; p = obuf; wv_iconv (iconv_handle, &ibuf, &ibuflen, &obuf, &obuflen); /* We might have double byte char here. */ if (swap_iconv (lid)) { rtn = (U16) buffer2[0] << 8; rtn |= (U16) buffer2[1]; } else { rtn = *(U16 *) buffer2; } iconv_close (iconv_handle); return (rtn); } void wvOutputFromUnicode (U16 eachchar, char *outputtype) { static char cached_outputtype[33]; /* Last outputtype */ static iconv_t iconv_handle = (iconv_t)-1; /* Cached iconv descriptor */ static int need_swapping; U8 *ibuf, *obuf; size_t ibuflen, obuflen, len, count, i; U8 buffer[2], buffer2[5]; if ((wvConvertUnicodeToEntity != NULL) && wvConvertUnicodeToEntity (eachchar)) return; if ((iconv_handle == (iconv_t)-1) || strcmp (cached_outputtype, outputtype) != 0) { if ((iconv_handle != (iconv_t)-1)) iconv_close (iconv_handle); iconv_handle = iconv_open (outputtype, "UCS-2"); if (iconv_handle == (iconv_t) - 1) { wvError ( ("iconv_open fail: %d, cannot convert %s to %s\n", errno, "UCS-2", outputtype)); printf ("?"); return; } /* safe to cache the output type here */ strcpy (cached_outputtype, outputtype); /* Determining if unicode biteorder is swapped (glibc < 2.2) */ need_swapping = 1; buffer[0] = 0x20; buffer[1] = 0; ibuf = buffer; obuf = buffer2; ibuflen = 2; obuflen = 5; count = wv_iconv (iconv_handle, &ibuf, &ibuflen, &obuf, &obuflen); if (count >= 0) need_swapping = buffer2[0] != 0x20; } if (need_swapping) { buffer[0] = (eachchar >> 8) & 0x00ff; buffer[1] = eachchar & 0x00ff; } else { buffer[0] = eachchar & 0x00ff; buffer[1] = (eachchar >> 8) & 0x00ff; } ibuf = buffer; obuf = buffer2; ibuflen = 2; len = obuflen = 5; count = wv_iconv (iconv_handle, &ibuf, &ibuflen, &obuf, &obuflen); if 
(count == (size_t) - 1) { wvError (("iconv failed errno: %d, char: 0x%X, %s -> %s\n", errno, eachchar, "UCS-2", outputtype)); /* I'm torn here - do i just announce the failure, continue, or copy over to the other buffer? */ /* errno is usually 84 (illegal byte sequence) should i reverse the bytes and try again? */ printf ("%c", ibuf[1]); } else { len = len - obuflen; for (i = 0; i < len; i++) printf ("%c", buffer2[i]); } } int wvHandleElement (wvParseStruct * ps, wvTag tag, void *props, int dirty) { if (ps->elehandler) return ((*(ps->elehandler)) (ps, tag, props, dirty)); wvError (("No element handler registered!!\n")); return (0); } int wvHandleDocument (wvParseStruct * ps, wvTag tag) { if (ps->dochandler) return ((*(ps->dochandler)) (ps, tag)); wvError (("No dochandler!!\n")); return (0); } void wvSetCharHandler (wvParseStruct * ps, int (*proc) (wvParseStruct *, U16, U8, U16)) { ps->charhandler = proc; } void wvSetSpecialCharHandler (wvParseStruct * ps, int (*proc) (wvParseStruct *, U16, CHP *)) { ps->scharhandler = proc; } void wvSetElementHandler (wvParseStruct * ps, int (*proc) (wvParseStruct *, wvTag, void *, int)) { ps->elehandler = proc; } void wvSetDocumentHandler (wvParseStruct * ps, int (*proc) (wvParseStruct *, wvTag)) { ps->dochandler = proc; } int wvConvertUnicodeToLaTeX (U16 char16) { /* german and scandinavian characters, MV 1.7.2000 See man iso_8859_1 This requires the inputencoding latin1 package, see latin1.def. Chars in range 160...255 are just put through as these are legal iso-8859-1 symbols. (see above) Best way to do it until LaTeX is Unicode enabled (Omega project). -- MV 4.7.2000 We use a separate if-statement here ... 
the 'case range' construct is gcc specific :-( -- MV 13/07/2000 */ if ((char16 >= 0xa0) && (char16 <= 0xff)) { switch (char16) { case 0xa0: printf ("\\ "); /* hard space */ return (1); /* Fix up these as math characters: */ case 0xb1: printf ("$\\pm$"); return (1); case 0xb2: printf ("$\\mathtwosuperior$"); return (1); case 0xb3: printf ("$\\maththreesuperior$"); return (1); case 0xb5: printf ("$\\mu$"); return (1); case 0xb9: printf ("$\\mathonesuperior$"); return (1); case 0xd7: printf ("$\\times$"); return (1); } printf ("%c", char16); return (1); } switch (char16) { case 37: printf ("\\%%"); return (1); case 10: case 11: printf ("\\\\\n"); return (1); case 31: /* non-required hyphen */ printf ("\\-"); return (1); case 30: /* non-breaking hyphen */ printf ("-"); return (1); /* case 45: minus/hyphen, pass through */ case 12: printf("\\newpage\n"); return (1); case 13: case 14: case 7: return (1); case 9: printf ("\\hfill{}"); /* tab -- horrible cludge */ return (1); case 0xf020: printf (" "); /* Mac specialty ? MV 10.10.2000 */ return (1); case 0xf02c: printf (","); /* Mac */ return (1); case 0xf028: printf ("("); /* Mac */ return (1); case 34: printf ("\""); return (1); case 35: printf ("\\#"); /* MV 14.8.2000 */ return (1); case 36: printf ("\\$"); /* MV 14.8.2000 */ return (1); case 38: printf ("\\&"); /* MV 1.7.2000 */ return (1); case 92: printf ("$\\backslash$"); /* MV 23.9.2000 */ return (1); case 94: printf ("\\^"); /* MV 13.9.2000 */ return (1); case 95: printf ("\\_"); /* MV 13.9.2000 */ return (1); case 60: printf ("<"); return (1); case 0xf03e: /* Mac */ case 62: printf (">"); return (1); case 0xF8E7: /* without this, things should work in theory, but not for me */ printf ("_"); return (1); /* Added some new Unicode characters. It's probably difficult to write these characters in AbiWord, though ... 
:( -- 2000-08-11 huftis@bigfoot.com */ case 0x0100: printf ("\\=A"); /* A with macron */ return (1); case 0x0101: printf ("\\=a"); /* a with macron */ return (1); case 0x0102: printf ("\\u{A}"); /* A with breve */ return (1); case 0x0103: printf ("\\u{a}"); /* a with breve */ return (1); case 0x0104: printf ("\\k{A}"); /* A with ogonek */ return (1); case 0x0105: printf ("\\k{a}"); /* a with ogonek */ return (1); case 0x0106: printf ("\\'C"); /* C with acute */ return (1); case 0x0107: printf ("\\'c"); /* c with acute */ return (1); case 0x0108: printf ("\\^C"); /* C with circumflex */ return (1); case 0x0109: printf ("\\^c"); /* c with circumflex */ return (1); case 0x010A: printf ("\\.C"); /* C with dot above */ return (1); case 0x010B: printf ("\\.c"); /* c with dot above */ return (1); case 0x010C: printf ("\\v{C}"); /* C with caron */ return (1); case 0x010D: printf ("\\v{c}"); /* c with caron */ return (1); case 0x010E: printf ("\\v{D}"); /* D with caron */ return (1); case 0x010F: printf ("\\v{d}"); /* d with caron */ return (1); case 0x0110: printf ("\\DJ{}"); /* D with stroke */ return (1); case 0x0111: printf ("\\dj{}"); /* d with stroke */ return (1); case 0x0112: printf ("\\=E"); /* E with macron */ return (1); case 0x0113: printf ("\\=e"); /* e with macron */ return (1); case 0x0114: printf ("\\u{E}"); /* E with breve */ return (1); case 0x0115: printf ("\\u{e}"); /* e with breve */ return (1); case 0x0116: printf ("\\.E"); /* E with dot above */ return (1); case 0x0117: printf ("\\.e"); /* e with dot above */ return (1); case 0x0118: printf ("\\k{E}"); /* E with ogonek */ return (1); case 0x0119: printf ("\\k{e}"); /* e with ogonek */ return (1); case 0x011A: printf ("\\v{E}"); /* E with caron */ return (1); case 0x011B: printf ("\\v{e}"); /* e with caron */ return (1); case 0x011C: printf ("\\^G"); /* G with circumflex */ return (1); case 0x011D: printf ("\\^g"); /* g with circumflex */ return (1); case 0x011E: printf ("\\u{G}"); /* G with breve */ 
return (1); case 0x011F: printf ("\\u{g}"); /* g with breve */ return (1); case 0x0120: printf ("\\.G"); /* G with dot above */ return (1); case 0x0121: printf ("\\u{g}"); /* g with dot above */ return (1); case 0x0122: printf ("^H"); /* H with circumflex */ return (1); case 0x0123: printf ("^h"); /* h with circumflex */ return (1); case 0x0128: printf ("\\~I"); /* I with tilde */ return (1); case 0x0129: printf ("\\~{\\i}"); /* i with tilde (dotless) */ return (1); case 0x012A: printf ("\\=I"); /* I with macron */ return (1); case 0x012B: printf ("\\={\\i}"); /* i with macron (dotless) */ return (1); case 0x012C: printf ("\\u{I}"); /* I with breve */ return (1); case 0x012D: printf ("\\u{\\i}"); /* i with breve */ return (1); case 0x0130: printf ("\\.I"); /* I with dot above */ return (1); case 0x0131: printf ("\\i{}"); /* dotless i */ return (1); case 0x0132: printf ("IJ"); /* IJ ligature */ return (1); case 0x0133: printf ("ij"); /* ij ligature */ return (1); case 0x0134: printf ("\\^J"); /* J with circumflex (dotless) */ return (1); case 0x0135: printf ("\\^{\\j}"); /* j with circumflex (dotless) */ return (1); case 0x0136: printf ("\\c{K}"); /* K with cedilla */ return (1); case 0x0137: printf ("\\c{k}"); /* k with cedilla */ return (1); case 0x0138: printf ("k"); /* NOTE: Not the correct character (kra), but similar */ return (1); case 0x0139: printf ("\\'L"); /* L with acute */ return (1); case 0x013A: printf ("\\'l"); /* l with acute */ return (1); case 0x013B: printf ("\\c{L}"); /* L with cedilla */ return (1); case 0x013C: printf ("\\c{l}"); /* l with cedilla */ return (1); case 0x013D: printf ("\\v{L}"); /* L with caron */ return (1); case 0x013E: printf ("\\v{l}"); /* l with caron */ return (1); case 0x0141: printf ("\\L{}"); /* L with stroke */ return (1); case 0x0142: printf ("\\l{}"); /* l with stroke */ return (1); case 0x0143: printf ("\\'N"); /* N with acute */ return (1); case 0x0144: printf ("\\'n"); /* n with acute */ return (1); case 0x0145: 
printf ("\\c{N}"); /* N with cedilla */ return (1); case 0x0146: printf ("\\c{n}"); /* n with cedilla */ return (1); case 0x0147: printf ("\\v{N}"); /* N with caron */ return (1); case 0x0148: printf ("\\v{n}"); /* n with caron */ return (1); case 0x0149: printf ("'n"); /* n preceed with apostroph */ return (1); case 0x014A: printf ("\\NG{}"); /* ENG character */ return (1); case 0x014B: printf ("\\ng{}"); /* eng character */ return (1); case 0x014C: printf ("\\=O"); /* O with macron */ return (1); case 0x014D: printf ("\\=o"); /* o with macron */ return (1); case 0x014E: printf ("\\u{O}"); /* O with breve */ return (1); case 0x014F: printf ("\\u{o}"); /* o with breve */ return (1); case 0x0150: printf ("\\H{O}"); /* O with double acute */ return (1); case 0x0151: printf ("\\H{o}"); /* o with double acute */ return (1); case 0x0152: printf ("\\OE{}"); /* OE ligature */ return (1); case 0x0153: printf ("\\oe{}"); /* oe ligature */ return (1); case 0x0154: printf ("\\'R"); /* R with acute */ return (1); case 0x0155: printf ("\\'r"); /* r with acute */ return (1); case 0x0156: printf ("\\c{R}"); /* R with cedilla */ return (1); case 0x0157: printf ("\\c{r}"); /* r with cedilla */ return (1); case 0x0158: printf ("\\v{R}"); /* R with caron */ return (1); case 0x0159: printf ("\\v{r}"); /* r with caron */ return (1); case 0x015A: printf ("\\'S"); /* S with acute */ return (1); case 0x015B: printf ("\\'s"); /* s with acute */ return (1); case 0x015C: printf ("\\^S"); /* S with circumflex */ return (1); case 0x015D: printf ("\\^s"); /* c with circumflex */ return (1); case 0x015E: printf ("\\c{S}"); /* S with cedilla */ return (1); case 0x015F: printf ("\\c{s}"); /* s with cedilla */ return (1); case 0x0160: printf ("\\v{S}"); /* S with caron */ return (1); case 0x0161: printf ("\\v{s}"); /* s with caron */ return (1); case 0x0162: printf ("\\c{T}"); /* T with cedilla */ return (1); case 0x0163: printf ("\\c{t}"); /* t with cedilla */ return (1); case 0x0164: printf 
("\\v{T}"); /* T with caron */ return (1); case 0x0165: printf ("\\v{t}"); /* t with caron */ return (1); case 0x0168: printf ("\\~U"); /* U with tilde */ return (1); case 0x0169: printf ("\\~u"); /* u with tilde */ return (1); case 0x016A: printf ("\\=U"); /* U with macron */ return (1); /* Greek (thanks Petr Vanicek!): */ case 0x0391: printf ("$A$"); return (1); case 0x0392: printf ("$B$"); return (1); case 0x0393: printf ("$\\Gamma$"); return (1); case 0xf044: /* Mac ? */ case 0x2206: /* Mac */ case 0x0394: printf ("$\\Delta$"); return (1); case 0x0395: printf ("$E$"); return (1); case 0x0396: printf ("$Z$"); return (1); case 0x0397: printf ("$H$"); return (1); case 0x0398: printf ("$\\Theta$"); return (1); case 0x0399: printf ("$I$"); return (1); case 0x039a: printf ("$K$"); return (1); case 0x039b: printf ("$\\Lambda$"); return (1); case 0xf04d: /* Mac? */ case 0x039c: printf ("$M$"); return (1); case 0x039d: printf ("$N$"); return (1); case 0x039e: printf ("$\\Xi$"); return (1); case 0x039f: printf ("$O$"); /* Omicron */ return (1); case 0x03a0: printf ("$\\Pi$"); return (1); case 0x03a1: printf ("$R$"); return (1); case 0x03a3: printf ("$\\Sigma$"); return (1); case 0x03a4: printf ("$T$"); return (1); case 0x03a5: printf ("$Y$"); return (1); case 0x03a6: printf ("$\\Phi$"); return (1); case 0x03a7: printf ("$X$"); /* Chi */ return (1); case 0x03a8: printf ("$\\Psi$"); return (1); case 0x2126: /* Mac */ case 0x03a9: printf ("$\\Omega$"); return (1); /* ...and lower case: */ case 0x03b1: printf ("$\\alpha$"); return (1); case 0x03b2: printf ("$\\beta$"); return (1); case 0xf067: /* Mac */ case 0x03b3: printf ("$\\gamma$"); return (1); case 0xf064: /* Mac */ case 0x03b4: printf ("$\\delta$"); return (1); case 0x03b5: printf ("$\\epsilon$"); return (1); case 0xf04e: /* Mac? variant? */ case 0xf07a: /* Mac? 
*/ case 0x03b6: printf ("$\\zeta$"); return (1); case 0x03b7: printf ("$\\eta$"); return (1); case 0x03b8: printf ("$\\theta$"); return (1); case 0x03b9: printf ("$\\iota$"); return (1); case 0x03ba: printf ("$\\kappa$"); return (1); case 0xf06c: /* Mac? */ case 0x03bb: printf ("$\\lambda$"); return (1); case 0x03bc: printf ("$\\mu$"); return (1); case 0x03bd: printf ("$\\nu$"); return (1); case 0x03be: printf ("$\\xi$"); return (1); case 0x03bf: printf ("$o$"); /* omicron */ return (1); case 0x03c0: printf ("$\\pi$"); return (1); case 0xf072: /* Mac */ printf ("$\\varrho$"); return (1); case 0x03c1: printf ("$\\rho$"); return (1); case 0xf073: /* Mac */ case 0x03c3: printf ("$\\sigma$"); return (1); case 0x03c4: printf ("$\\tau$"); return (1); case 0x03c5: printf ("$\\upsilon$"); return (1); case 0x03c6: printf ("$\\phi$"); return (1); case 0x03c7: printf ("$\\chi$"); return (1); case 0x03c8: printf ("$\\psi$"); return (1); case 0x03c9: printf ("$\\omega$"); return (1); case 0xf06a: /* Mac? */ case 0x03d5: printf ("$\\varphi$"); /* ? 
*/ return (1); /* More math, typical inline: */ case 0x2111: printf ("$\\Im$"); return (1); case 0x2118: printf ("$\\wp$"); /* Weierstrass p */ return (1); case 0x211c: printf ("$\\Re$"); return (1); case 0x2135: printf ("$\\aleph$"); return (1); case 0x2190: printf ("$\\leftarrow$"); return (1); case 0x2191: printf ("$\\uparrow$"); return (1); case 0xf0ae: /* Mac */ case 0x2192: printf ("$\\rightarrow$"); return (1); case 0x2193: printf ("$\\downarrow$"); return (1); case 0x21d0: printf ("$\\Leftarrow$"); return (1); case 0x21d1: printf ("$\\Uparrow$"); return (1); case 0x21d2: printf ("$\\Rightarrow$"); return (1); case 0x21d3: printf ("$\\Downarrow$"); return (1); case 0x21d4: printf ("$\\Leftrightarrow$"); return (1); case 0x2200: printf ("$\\forall$"); return (1); case 0xf0b6: /* Mac */ case 0x2202: printf ("$\\partial$"); return (1); case 0x2203: printf ("$\\exists$"); return (1); case 0x2205: printf ("$\\emptyset$"); return (1); case 0x2207: printf ("$\\nabla$"); return (1); case 0x2208: printf ("$\\in$"); /* element of */ return (1); case 0x2209: printf ("$\\notin$"); /* not an element of */ return (1); case 0x220b: printf ("$\\ni$"); /* contains as member */ return (1); case 0x221a: printf ("$\\surd$"); /* sq root */ return (1); case 0x2212: printf ("$-$"); /* minus */ return (1); case 0x221d: printf ("$\\propto$"); return (1); case 0x221e: printf ("$\\infty$"); return (1); case 0x2220: printf ("$\\angle$"); return (1); case 0x2227: printf ("$\\land$"); /* logical and */ return (1); case 0x2228: printf ("$\\lor$"); /* logical or */ return (1); case 0x2229: printf ("$\\cap$"); /* intersection */ return (1); case 0x222a: printf ("$\\cup$"); /* union */ return (1); case 0x223c: printf ("$\\sim$"); /* similar to */ return (1); case 0x2248: printf ("$\\approx$"); return (1); case 0x2261: printf ("$\\equiv$"); return (1); case 0x2260: printf ("$\\neq$"); return (1); case 0x2264: printf ("$\\leq$"); return (1); case 0xf0b3: /* Mac? 
*/ case 0x2265: printf ("$\\geq$"); return (1); case 0x2282: printf ("$\\subset$"); return (1); case 0x2283: printf ("$\\supset$"); return (1); case 0x2284: printf ("$\\notsubset$"); return (1); case 0x2286: printf ("$\\subseteq$"); return (1); case 0x2287: printf ("$\\supseteq$"); return (1); case 0x2295: printf ("$\\oplus$"); /* circled plus */ return (1); case 0x2297: printf ("$\\otimes$"); return (1); case 0x22a5: printf ("$\\perp$"); /* perpendicular */ return (1); case 0x2660: printf ("$\\spadesuit$"); return (1); case 0x2663: printf ("$\\clubsuit$"); return (1); case 0x2665: printf ("$\\heartsuit$"); return (1); case 0x2666: printf ("$\\diamondsuit$"); return (1); case 0x01C7: printf ("LJ"); /* the LJ letter */ return (1); case 0x01C8: printf ("Lj"); /* the Lj letter */ return (1); case 0x01C9: printf ("lj"); /* the lj letter */ return (1); case 0x01CA: printf ("NJ"); /* the NJ letter */ return (1); case 0x01CB: printf ("Nj"); /* the Nj letter */ return (1); case 0x01CC: printf ("nj"); /* the nj letter */ return (1); case 0x01CD: printf ("\\v{A}"); /* A with caron */ return (1); case 0x01CE: printf ("\\v{a}"); /* a with caron */ return (1); case 0x01CF: printf ("\\v{I}"); /* I with caron */ return (1); case 0x01D0: printf ("\\v{\\i}"); /* i with caron (dotless) */ return (1); case 0x01D1: printf ("\\v{O}"); /* O with caron */ return (1); case 0x01D2: printf ("\\v{o}"); /* o with caron */ return (1); case 0x01D3: printf ("\\v{U}"); /* U with caron */ return (1); case 0x01D4: printf ("\\v{u}"); /* u with caron */ return (1); case 0x01E6: printf ("\\v{G}"); /* G with caron */ return (1); case 0x01E7: printf ("\\v{g}"); /* g with caron */ return (1); case 0x01E8: printf ("\\v{K}"); /* K with caron */ return (1); case 0x01E9: printf ("\\v{k}"); /* k with caron */ return (1); case 0x01F0: printf ("\\v{\\j}"); /* j with caron (dotless) */ return (1); case 0x01F1: printf ("DZ"); /* the DZ letter */ return (1); case 0x01F2: printf ("Dz"); /* the Dz letter */ return 
(1); case 0x01F3: printf ("dz"); /* the dz letter */ return (1); case 0x01F4: printf ("\\'G"); /* G with acute */ return (1); case 0x01F5: printf ("\\'g"); /* g with acute */ return (1); case 0x01FA: printf ("\\'{\\AA}"); /* Å with acute */ return (1); case 0x01FB: printf ("\\'{\\aa}"); /* å with acute */ return (1); case 0x01FC: printf ("\\'{\\AE}"); /* Æ with acute */ return (1); case 0x01FD: printf ("\\'{\\ae}"); /* æ with acute */ return (1); case 0x01FE: printf ("\\'{\\O}"); /* Ø with acute */ return (1); case 0x01FF: printf ("\\'{\\o}"); /* ø with acute */ return (1); case 0x2010: printf ("-"); /* hyphen */ return (1); case 0x2011: printf ("-"); /* non-breaking hyphen (is there a way to get this in LaTeX?) */ return (1); case 0x2012: printf ("--"); /* figure dash (similar to en-dash) */ return (1); case 0x2013: /* soft-hyphen? Or en-dash? I find that making this a soft-hyphen works very well, but makes the occasional "hard" word-connection hyphen (like the "-" in roller-coaster) disappear. (Are these actually en-dashes? Dunno.) How does MS Word distinguish between the 0x2013's that signify soft hyphens and those that signify word-connection hyphens? wvware should be able to as well. -- MV 8.7.2000 U+2013 is the en-dash character and not a soft hyphen. Soft hyphen is U+00AD. Changing to "--". 
-- 2000-08-11 huftis@bigfoot.com */ printf ("--"); return (1); case 0x016B: printf ("\\=u"); /* u with macron */ return (1); case 0x016C: printf ("\\u{U}"); /* U with breve */ return (1); case 0x016D: printf ("\\u{u}"); /* u with breve */ return (1); case 0x016E: printf ("\\r{U}"); /* U with ring above */ return (1); case 0x016F: printf ("\\r{u}"); /* u with ring above */ return (1); case 0x0170: printf ("\\H{U}"); /* U with double acute */ return (1); case 0x0171: printf ("\\H{u}"); /* u with double acute */ return (1); case 0x0174: printf ("\\^W"); /* W with circumflex */ return (1); case 0x0175: printf ("\\^w"); /* w with circumflex */ return (1); case 0x0176: printf ("\\^Y"); /* Y with circumflex */ return (1); case 0x0177: printf ("\\^y"); /* y with circumflex */ return (1); case 0x0178: printf ("\\\"Y"); /* Y with diaeresis */ return (1); case 0x0179: printf ("\\'Z"); /* Z with acute */ return (1); case 0x017A: printf ("\\'z"); /* z with acute */ return (1); case 0x017B: printf ("\\.Z"); /* Z with dot above */ return (1); case 0x017C: printf ("\\.z"); /* z with dot above */ return (1); case 0x017D: printf ("\\v{Z}"); /* Z with caron */ return (1); case 0x017E: printf ("\\v{z}"); /* z with caron */ return (1); /* Windows specials (MV 4.7.2000). More could be added. See http://www.hut.fi/u/jkorpela/www/windows-chars.html */ case 0x2014: printf ("---"); /* em-dash */ return (1); case 0x2018: printf ("`"); /* left single quote, Win */ return (1); case 0x2019: printf ("'"); /* Right single quote, Win */ return (1); case 0x201A: printf ("\\quotesinglbase{}"); /* single low 99 quotation mark */ return (1); case 0x201C: printf ("``"); /* inverted double quotation mark */ return (1); case 0x201D: printf ("''"); /* double q.m. 
*/ return (1); case 0x201E: printf ("\\quotedblbase{}"); /* double low 99 quotation mark */ return (1); case 0x2020: printf ("\\dag{}"); /* dagger */ return (1); case 0x2021: printf ("\\ddag{}"); /* double dagger */ return (1); case 0x25cf: /* FilledCircle */ case 0x2022: printf ("$\\bullet$"); /* bullet */ return (1); case 0x2023: printf ("$\\bullet$"); /* NOTE: Not a real triangular bullet */ return (1); case 0x2024: printf ("."); /* One dot leader (for use in TOCs) */ return (1); case 0x2025: printf (".."); /* Two dot leader (for use in TOCs) */ return (1); case 0x2026: printf ("\\ldots"); /* ellipsis */ return (1); case 0x2039: printf ("\\guilsinglleft{}"); /* single left angle quotation mark */ return (1); case 0x203A: printf ("\\guilsinglright{}"); /* single right angle quotation mark */ return (1); case 0x203C: printf ("!!"); /* double exclamation mark */ return (1); case 0x2215: printf ("$/$"); /* Division slash */ return (1); case 0x2030: printf ("o/oo"); return (1); case 0x20ac: printf ("\\euro"); /* No known implementation ;-) Shouldn't we use the package 'eurofont'? 
-- 2000-08-15 huftis@bigfoot.com */ return (1); case 0x2160: printf ("I"); /* Roman numeral I */ return (1); case 0x2161: printf ("II"); /* Roman numeral II */ return (1); case 0x2162: printf ("III"); /* Roman numeral III */ return (1); case 0x2163: printf ("IV"); /* Roman numeral IV */ return (1); case 0x2164: printf ("V"); /* Roman numeral V */ return (1); case 0x2165: printf ("VI"); /* Roman numeral VI */ return (1); case 0x2166: printf ("VII"); /* Roman numeral VII */ return (1); case 0x2167: printf ("VIII"); /* Roman numeral VIII */ return (1); case 0x2168: printf ("IX"); /* Roman numeral IX */ return (1); case 0x2169: printf ("X"); /* Roman numeral X */ return (1); case 0x216A: printf ("XI"); /* Roman numeral XI */ return (1); case 0x216B: printf ("XII"); /* Roman numeral XII */ return (1); case 0x216C: printf ("L"); /* Roman numeral L */ return (1); case 0x216D: printf ("C"); /* Roman numeral C */ return (1); case 0x216E: printf ("D"); /* Roman numeral D */ return (1); case 0x216F: printf ("M"); /* Roman numeral M */ return (1); case 0x2170: printf ("i"); /* Roman numeral i */ return (1); case 0x2171: printf ("ii"); /* Roman numeral ii */ return (1); case 0x2172: printf ("iii"); /* Roman numeral iii */ return (1); case 0x2173: printf ("iv"); /* Roman numeral iv */ return (1); case 0x2174: printf ("v"); /* Roman numeral v */ return (1); case 0x2175: printf ("vi"); /* Roman numeral vi */ return (1); case 0x2176: printf ("vii"); /* Roman numeral vii */ return (1); case 0x2177: printf ("viii"); /* Roman numeral viii */ return (1); case 0x2178: printf ("ix"); /* Roman numeral ix */ return (1); case 0x2179: printf ("x"); /* Roman numeral x */ return (1); case 0x217A: printf ("xi"); /* Roman numeral xi */ return (1); case 0x217B: printf ("xiii"); /* Roman numeral xii */ return (1); case 0x217C: printf ("l"); /* Roman numeral l */ return (1); case 0x217D: printf ("c"); /* Roman numeral c */ return (1); case 0x217E: printf ("d"); /* Roman numeral d */ return (1); 
case 0x217F: printf ("m"); /* Roman numeral m */ return (1); } /* Debugging aid: */ if (char16 >= 0x80) { printf ("[%x]", char16); return (1); } return (0); } int wvConvertUnicodeToHtml (U16 char16) { switch (char16) { case 11: printf ("
"); return (1); case 31: /* non-required hyphen */ printf("­"); /*vladimir@lukianov.name HTML 4.01 spec*/ return (1); case 30: case 45: case 0x2013: printf ("-"); /* en-dash */ return (1); case 12: case 13: case 14: case 7: return (1); case 34: printf ("""); return (1); case 38: printf ("&"); return (1); case 60: printf ("<"); return (1); case 62: printf (">"); return (1); /* german characters, im assured that this is the right way to handle them by Markus Schulte As the output encoding for HTML was chosen as UTF-8, we don't need Ä etc. etc. I removed all but sz -- MV 6.4.2000 */ #ifndef _MSC_VER case 0xdf: printf ("ß"); return (1); #endif /* end german characters */ case 0x2026: #if 0 /* this just looks awful in netscape 4.5, so im going to do a very foolish thing and just put ... instead of this */ printf ("…"); /*is there a proper html name for ... &ellipse;? Yes, … -- MV */ #endif printf ("…"); return (1); case 0x2019: printf ("'"); return (1); case 0x2215: printf ("/"); return (1); case 0xF8E7: /* without this, things should work in theory, but not for me */ printf ("_"); return (1); case 0x2018: printf ("`"); return (1); /* Windows specials (MV): */ case 0x0160: printf ("Š"); return (1); case 0x0161: printf ("š"); return (1); case 0x2014: printf ("—"); return (1); case 0x201c: printf ("“"); /* inverted double quotation mark */ return (1); case 0x201d: printf ("”"); /* double q.m. */ return (1); case 0x201e: printf ("„"); /* below double q.m. 
*/ return (1); case 0x2020: printf ("†"); return (1); case 0x2021: printf ("‡"); return (1); case 0x2022: printf ("•"); return (1); case 0x0152: printf ("Œ"); return (1); case 0x0153: printf ("œ"); return (1); case 0x0178: printf ("Ÿ"); return (1); case 0x2030: printf ("‰"); return (1); case 0x20ac: printf ("€"); return (1); /* Mac specials (MV): */ case 0xf020: printf (" "); return (1); case 0xf02c: printf (","); return (1); case 0xf028: printf ("("); return (1); case 0xf03e: printf (">"); return (1); case 0xf067: printf ("γ"); return (1); case 0xf064: printf ("δ"); return (1); case 0xf072: printf ("ρ"); return (1); case 0xf073: printf ("σ"); return (1); case 0xf0ae: printf ("→"); /* right arrow */ return (1); case 0xf0b6: printf ("∂"); /* partial deriv. */ return (1); case 0xf0b3: printf ("≥"); return (1); } /* Debugging aid: */ /* if (char16 >= 0x100) printf("[%x]", char16); */ return (0); } int wvConvertUnicodeToXml (U16 char16) { switch (char16) { case 11: printf ("
"); return (1); case 30: case 31: case 12: case 13: case 14: case 7: return (1); /* Much simpler here, because XML supports only a few entities */ case 34: printf ("""); return (1); case 38: printf ("&"); return (1); case 39: printf ("'"); return (1); case 60: printf ("<"); return (1); case 62: printf (">"); return (1); } return (0); } char *str_copy(char *d, size_t n, char *s) { strncpy(d, s, n); d[n-1] = 0; return d; } char *str_append(char *d, size_t n, char *s) { int max = n - strlen(d) - 1; strncat(d, s, max); d[n-1] = 0; return d; } #define BUF_COPY(d,s) str_copy(d,sizeof(d),s) char * wvConvertStylename(char *stylename, char *outputtype) { static char cached_outputtype[36]; static iconv_t iconv_handle = (iconv_t)-1; /**FIXME: 100 is just the size of stylename[] from wv.h**/ static char buffer[100]; char *ibuf, *obuf; size_t ibuflen, obuflen, len; /* Destroy */ if(!outputtype) { if ((iconv_handle != (iconv_t)-1)) iconv_close(iconv_handle); return NULL; } /* Initialize */ if(!iconv_handle || strcmp(cached_outputtype, outputtype)) { if ((iconv_handle != (iconv_t)-1)) iconv_close(iconv_handle); /**FIXME: don´t know if ISO-8859-1 is really the correct **charset for style names with eg umlauts. **/ iconv_handle = iconv_open(outputtype, "ISO-8859-1"); if(iconv_handle == (iconv_t)-1) { wvError(("iconv_open fail: %d, cannot convert %s to %s\n", errno, "ISO-8859-1", outputtype)); return stylename; } BUF_COPY(cached_outputtype, outputtype); } /* Convert */ ibuf = stylename; ibuflen = strlen(stylename); obuf = buffer; obuflen = sizeof(buffer) - 1; len = wv_iconv (iconv_handle, &ibuf, &ibuflen, &obuf, &obuflen); *obuf = 0; if(len == -1) { wvError(("wvConfig.c: can´t iconv()\n")); return stylename; } return buffer; }