/*
 * translat.c:  Stuff for handling different encodings
 * and a digraph entry facility.  Support an international IRC!
 *
 * you start using utf-8 and
 * discard all legacy encodings
 *
 * Joel Yliluoma.
 */

#include "irc.h"
IRCII_RCSID("@(#)$eterna: translat.c,v 1.29 2006/07/06 00:05:53 mrg Exp $");

#ifdef HAVE_ICONV_H
#include <iconv.h>
#endif /* HAVE_ICONV_H */

#include "vars.h"
#include "translat.h"
#include "ircaux.h"
#include "window.h"
#include "screen.h"
#include "output.h"

static	u_char	my_getarg(u_char **);

/* Globals */

/* Set to 1 whenever the digraph tables are modified; read by save_digraphs(). */
char	digraph_changed = 0;

#ifdef HAVE_ICONV_OPEN
u_char	*irc_encoding = NULL;
u_char	*display_encoding = NULL;
u_char	*input_encoding = NULL;
#endif /* HAVE_ICONV_OPEN */

/*
 * dig_table_lo[] and dig_table_hi[] contain the character pair that
 * will result in the digraph in dig_table_di[].  To avoid searching
 * both tables, I take the lower character of the pair, and only
 * search dig_table_lo[].  Thus, dig_table_lo[] must always contain
 * the lower character of the pair.
 *
 * The digraph tables are based on those in the excellent editor Elvis,
 * with some additions for those, like me, who are used to VT320 or
 * VT420 terminals.
 */

#define	DiLo(x)	x,
#define	DiHi(x)
#define	DiDi(x)

/*
 * Digraph tables.  Note that, when adding a new digraph, the character
 * of the pair with the lowest value, *must* be in the DiLo column.
 * The higher of the pair goes in DiHi, and the digraph itself in DiDi.
 *
 * Each table is terminated by a 0 entry, so the tables can be walked
 * (and measured) like NUL-terminated strings.
 */

u_char	dig_table_lo[DIG_TABLE_SIZE] =
{
#include "digraph.inc"
	0
};

#undef	DiLo
#undef	DiHi
#undef	DiDi

#define	DiLo(x)
#define	DiHi(x)	x,
#define	DiDi(x)

u_char	dig_table_hi[DIG_TABLE_SIZE] =
{
#include "digraph.inc"
	0
};

#undef	DiLo
#undef	DiHi
#undef	DiDi

#define	DiLo(x)
#define	DiHi(x)
#define	DiDi(x)	x,

u_char	dig_table_di[DIG_TABLE_SIZE] =
{
#include "digraph.inc"
	0
};

/*
 * enter_digraph:  The BIND function ENTER_DIGRAPH.
 * Only raises current_screen->digraph_hit; neither argument is used here.
 */
void
enter_digraph(key, str)
	u_int	key;
	u_char	*str;
{
	current_screen->digraph_hit = 1;  /* Just stuff away first character. */
}

/*
 * get_digraph:  Called by edit_char() when a digraph entry is activated.
 * Looks up a digraph given u_char c1 (passed as ic1) and the global u_char
 * current_screen->digraph_first.  The pair is ordered so the lower
 * character is matched against dig_table_lo[].
 *
 * Side effect: resets current_screen->digraph_hit to 0.
 * Returns the digraph character, or 0 if the pair is not in the table.
 */
u_char
get_digraph(ic1)
	u_int	ic1;
{
	int	i = 0;
	u_char	c,
		c2 = current_screen->digraph_first,
		c1 = (u_char)ic1;

	current_screen->digraph_hit = 0;
	if (c1 > c2)	/* Make sure we have the lowest one in c1. */
		c = c1, c1 = c2, c2 = c;
	while (dig_table_lo[i])
	{	/* Find digraph and return it. */
		if ((dig_table_lo[i] == c1) && (dig_table_hi[i] == c2))
			return dig_table_di[i];
		i++;
	}
	return 0;		/* Failed lookup. */
}

/*
 * digraph:  The /DIGRAPH command with facilities.
 * This routine is *NOT* finished yet.
 *
 *   /DIGRAPH                      list the table
 *   /DIGRAPH -ADD c1 c2 c3 ...    add one or more (pair, digraph) triples
 *   /DIGRAPH -REMOVE c1 c2        remove the entry for the pair
 *   /DIGRAPH -CLEAR               empty the table
 *
 * Flags may be abbreviated; they are matched by prefix after lowercasing.
 * Character arguments are parsed by my_getarg().
 */
void
digraph(command, args, subargs)
	u_char	*command,
		*args,
		*subargs;
{
	u_char	*arg;
	u_char	c1,
		c2 = '\0',
		c3 = '\0',
		tmp;
	int	i;
	size_t	len;

	if ((arg = next_arg(args, &args)) && (*arg == '-'))
	{
		u_char	*cmd = (u_char *) 0;

		arg++;
		if ((len = my_strlen(arg)) == 0)
		{
			say("Unknown or missing flag.");
			return;
		}
		/*
		 * NOTE(review): cmd is filled in by malloc_strcpy() and is
		 * not released on any path out of this function - presumably
		 * a leak; confirm against malloc_strcpy()'s ownership rules.
		 */
		malloc_strcpy(&cmd, arg);
		lower(cmd);
		if (my_strncmp(cmd, "add", len) == 0)
		{
			/*
			 * Add a digraph to the table.
			 * I *know*.  This *is* a kludge.
			 */
			if ((i = my_strlen(dig_table_lo)) >= DIG_TABLE_SIZE - 1)
				say("Sorry, digraph table full.");
			else
			{
				while ((c1 = my_getarg(&args)) &&
				    (c2 = my_getarg(&args)) &&
				    (c3 = my_getarg(&args)))
				{
					/*
					 * Several triples may be given on one
					 * command, so the capacity has to be
					 * checked before every insertion: slot
					 * i + 1 must exist for the terminator.
					 */
					if (i >= DIG_TABLE_SIZE - 1)
					{
						say("Sorry, digraph table full.");
						break;
					}

					/* Pass c1 to get_digraph() */
					current_screen->digraph_first = c1;
					if (get_digraph(c2) == 0)
					{
						dig_table_di[i] = c3;
						/* Make sure c1 <= c2 */
						if (c1 > c2)
							tmp = c1, c1 = c2, c2 = tmp;
						dig_table_lo[i] = c1;
						dig_table_hi[i] = c2;
						i++;
						dig_table_lo[i] =
							dig_table_hi[i] =
							dig_table_di[i] =
							(u_char) 0;
						digraph_changed = 1;
						say("Digraph added to table.");
					}
					else
					{
						say("Digraph already defined in table.");
						break;
					}
				}
				/*
				 * c2/c3 are only zero here if a triple was
				 * started but not completed (or none given).
				 */
				if (!c2 || !c3)
					say("Unknown or missing argument.");
			}
		}
		else if (my_strncmp(cmd, "remove", len) == 0)
		{
			/* Remove a digraph from the table. */
			if (dig_table_lo[0] == (u_char) 0)
				say("Digraph table is already empty.");
			else if ((c1 = my_getarg(&args)) &&
			    (c2 = my_getarg(&args)))
			{
				if (c1 > c2)
					tmp = c1, c1 = c2, c2 = tmp;
				for (i = 0; dig_table_lo[i]; i++)
				{
					if ((dig_table_lo[i] != c1) ||
					    (dig_table_hi[i] != c2))
						continue;
					/*
					 * Close the gap by shifting every later
					 * entry (and the terminator) down one
					 * slot.  Done element by element because
					 * source and destination overlap.
					 */
					for (; dig_table_lo[i]; i++)
					{
						dig_table_lo[i] = dig_table_lo[i + 1];
						dig_table_hi[i] = dig_table_hi[i + 1];
						dig_table_di[i] = dig_table_di[i + 1];
					}
					digraph_changed = 1;
					put_it("Digraph removed from table.");
					return;
				}
				say("Digraph not found.");
			}
		}
		else if (my_strncmp(cmd, "clear", len) == 0)
		{
			/* Clear digraph table. */
			dig_table_lo[0] = dig_table_hi[0] = dig_table_di[0] =
				(u_char) 0;
			digraph_changed = 1;
			say("Digraph table cleared.");
		}
		else
			say("Unknown flag.");
	}
	else
	{
		/* Display digraph table, ten entries per output line. */
		u_char	buffer1[8];
		u_char	buffer2[192];

		say("Digraph table:");
		buffer2[0] = (u_char) 0;
		i = 0;
		while (dig_table_lo[i])
		{
			snprintf(CP(buffer1), sizeof buffer1, "%c%c %c ",
			    dig_table_lo[i],
			    dig_table_hi[i],
			    dig_table_di[i]);
			my_strcat(buffer2, buffer1);
			if ((++i % 10) == 0)
			{
				/*
				 * The line holds table characters, which may
				 * include '%': never use it as the format.
				 */
				put_it("%s", CP(buffer2));
				buffer2[0] = (u_char) 0;
			}
		}
		if (buffer2[0])
			put_it("%s", CP(buffer2));
		say("%d digraphs listed.", i);
	}
}

/*
 * my_getarg:  Pull the next word off *args and turn it into one character.
 *
 *   - a word starting with a digit is read as a decimal number
 *     (accumulated in a u_char, so it wraps modulo 256);
 *   - "!x" yields x with the high bit set;
 *   - anything else yields the first character of the word.
 *
 * Returns '\0' when there is no further argument.
 */
static	u_char
my_getarg(args)
	u_char	**args;
{
	u_char *arg;

	/* Check the handle itself before it is dereferenced below. */
	if (!args)
		return '\0';
	arg = (u_char *)next_arg(*args, args);
	if (!*args || !arg)
		return '\0';
	/* Don't trust isdigit() with 8 bits. */
	if ((*arg <= '9') && (*arg >= '0'))
	{
		u_char i = *arg & 0x0f;
		while ( *(++arg) )
			i = (i * 10) + (*arg & 0x0f);
		return i;
	}
	else if ( (*arg == '!') && (*(arg + 1)) )
		return *(arg + 1) | 0x80;
	return *arg;
}

/*
 * save_digraphs:  If the digraph tables were changed, write the commands
 * that rebuild them to fp: a "DIGRAPH -CLEAR" followed by "DIGRAPH -ADD"
 * lines carrying five lo/hi/digraph triples each, as decimal numbers.
 *
 * An empty table produces only the -CLEAR line; an "-ADD 0 0 0" line
 * would be rejected by digraph() when the file is loaded again.
 */
void
save_digraphs(fp)
	FILE	*fp;
{
	if (digraph_changed)
	{
		int	i;
		const char *command = "\nDIGRAPH -ADD ";

		fprintf(fp, "DIGRAPH -CLEAR");
		for (i = 0; dig_table_lo[i]; i++)
		{
			/* Start a fresh -ADD command every five entries. */
			if (!(i % 5))
				fprintf(fp, "%s", command);
			fprintf(fp, "%d %d %d ",
			    dig_table_lo[i],
			    dig_table_hi[i],
			    dig_table_di[i]);
		}
		fputc('\n', fp);
	}
}

/*
 * displayable_unival:  Decide whether the unicode value can be shown.
 * Returns 0 for control characters, and - when conv_out is set and iconv
 * is available - for characters iconv cannot convert from UTF-8 with that
 * descriptor.  Returns non-zero otherwise.
 */
char
displayable_unival(unsigned unival, iconv_t conv_out)
{
	/* First rule out control characters */
	if ((unival < 0x20) ||
	    (unival >= 0x80 && unival < 0xA0) ||
	    (unival == 0x7F) ||
	    (unival >= 0xFFF0 && unival <= 0xFFFF))
		return 0;

	/* Range 0x80..0x9F is used in some character sets (such as cp850),
	 * but they are assigned different positions in unicode.
	 * The univals we handle here are unicode positions.
	 * In unicode, 0x80..0x9F are not used because some
	 * american programs might still blindly assume
	 * 7-bitness and take those as control characters.
	 * 0x7F is delete/backspace.
	 * 0xFFF0..0xFFFF is the unicode control range.
	 * It contains a signature token, an illegal
	 * character token and so on.
	 */

#ifdef HAVE_ICONV_OPEN
	if (conv_out)
	{
		u_char utfbuf[8], *utfptr;
		u_char outbuf[256], *outptr;
		size_t utfspace, outspace;
		size_t retval;

		/* Now sequence the character to buffer
		 * and let iconv say whether it can be displayed.
		 */
		utf8_sequence(unival, utfbuf);

		utfptr = utfbuf;
		outptr = outbuf;
		utfspace = my_strlen(utfbuf);
		outspace = sizeof outbuf;

		/* reset the converter */
		iconv(conv_out, NULL, 0, NULL, 0);

		/* *outptr = '\0'; */
		retval = iconv(conv_out,
		               (iconv_const char**)(void *)&utfptr, &utfspace,
		               (char **)(void *)&outptr, &outspace);

		/*
		*outptr = '\0';
		fprintf(stderr, "CHK: '%s' -> '%s', retval=%d, errno=%d\n",
		    utfbuf, outbuf,
		    retval, errno);
		*/

		return retval != (size_t)-1;
	}
#endif /* HAVE_ICONV_OPEN */
	return 1;
}

/*
 * calc_unival_width:  Number of display columns assigned to a unicode
 * value: 2 for the 0x3000..0xFEFF range, 0 for the combining-mark ranges
 * listed below, 1 for everything else.  Tests are applied in the order
 * written, so the first matching range wins.
 */
unsigned
calc_unival_width(unsigned unival)
{
	/* FIXME: Should we use some kind of database here?
	 * FIXME: Combining marks support is completely untested
	 */

	/* chinese, japanese, korean */
	if (unival >= 0x3000 && unival < 0xFF00)
		return 2;
	/* combining diacritical marks */
	if (unival >= 0x0300 && unival < 0x0400)
		return 0;
	/* combining diacritical marks for symbols */
	if (unival >= 0x20D0 && unival < 0x2100)
		return 0;
	/* combining half-marks */
	if (unival >= 0xFE20 && unival < 0xFE30)
		return 0;
	/* everything else */
	return 1;
}

/*
 * calc_unival_length:  Returns the number of bytes taken by the utf-8
 * code starting at str, judged from the high nibble of its first byte.
 * Returns 0 for a byte that cannot begin a sequence.
 */
unsigned
calc_unival_length(const u_char* str)
{
	static const char sizes[16] =
	{ 1,1,1,1,1,1,1,1,
	  0,0,0,0,2,2,3,4 };

	/* 1-byte (0..7F):
	 *      0 1 2 3 4 5 6 7
	 * 2-byte (80..7FF):
	 *      C D
	 * 3-byte (800..FFFF):
	 *      E
	 * 4-byte (10000..1FFFFF):
	 *      F
	 * invalid:
	 *      8 9 A B (they can not begin a sequence)
	 *
	 * If utf8 is some day extended to use 5-byte
	 * codings, you need to double the sizes[] size
	 * and shift str by 3 instead of 4.
	 * You'd also need to modify
	 * utf8_sequence() and calc_unival().
	 *
	 * Today, it seems unlikely that these encodings
	 * will be needed in practical applications such as
	 * an irc client. Many programs (such as Microsoft IE)
	 * don't even support 4-byte encodings.
	 * 2-3 -byte encodings are in daily use everywhere.
	 */
	return sizes[*str >> 4];
}

/*
 * calc_unival:  Decode the utf-8 sequence at utfbuf into its unicode
 * value.  This function does the reverse of utf8_sequence().
 * A lead byte that calc_unival_length() reports as invalid (length 0)
 * is treated like a 1-byte sequence: its low 7 bits are returned.
 */
unsigned
calc_unival(const u_char *utfbuf)
{
	switch (calc_unival_length(utfbuf))
	{
		case 1:
		default:
			return ((utfbuf[0] & 127));
		case 2:
			return ((utfbuf[0] & 31) << 6)
			     | ((utfbuf[1] & 63));
		case 3:
			return ((utfbuf[0] & 15) << 12)
			     | ((utfbuf[1] & 63) << 6)
			     | ((utfbuf[2] & 63));
		case 4:
			return ((utfbuf[0] & 7) << 16)
			     | ((utfbuf[1] & 63) << 12)
			     | ((utfbuf[2] & 63) << 6)
			     | ((utfbuf[3] & 63));
	}
}

/*
 * utf8_sequence:  Encode unival as a NUL-terminated utf-8 sequence.
 * This function does the reverse of calc_unival().
 * The output buffer should have 5 bytes of space.
 */
void
utf8_sequence(unsigned unival, u_char* utfbuf)
{
	u_char *utfptr = utfbuf;

	if (unival < 0x80)                 /* <=7 bits */
		*utfptr++ = (u_char)unival;
	else
	{
		/*
		 * Emit the lead byte for the sequence length, then fall
		 * out through the shared continuation bytes below.
		 */
		if (unival < 0x800)            /* <=11 bits */
			*utfptr++ = (u_char)(0xC0 + (unival>>6));
		else
		{
			if (unival < 0x10000)      /* <=16 bits */
				*utfptr++ = (u_char)(0xE0 + (unival>>12));
			else                       /* <=21 bits */
			{
				*utfptr++ = (u_char)(0xF0 + (unival>>18));
				*utfptr++ = (u_char)(0x80 + ((unival>>12)&63));
			}
			*utfptr++ = (u_char)(0x80 + ((unival>>6)&63));
		}
		*utfptr++ = (u_char)(0x80 + (unival&63));
	}
	/* Last put a zero-terminator. */
	*utfptr = '\0';

	/*
	fprintf(stderr, "utf8-seq %X: %02X %02X (%s)\n",
	    unival, utfbuf[0], utfbuf[1], utfbuf);
	*/
}

/*
 * mbdata_init:  Zero *d and, when iconv is available and both enc and
 * display_encoding are set, open the two converters:
 *   conv_in:   enc   -> UTF-8
 *   conv_out:  UTF-8 -> display_encoding
 * A converter that could not be opened is left as NULL.
 */
void
mbdata_init(struct mb_data *d, const char *enc)
{
	bzero(d, sizeof(*d));
#ifdef HAVE_ICONV_OPEN
	d->enc = enc;

	/* Both descriptors are zero after the bzero(), so just open them. */
	if (d->enc && display_encoding)
	{
		d->conv_in = iconv_open("UTF-8", d->enc);
		d->conv_out = iconv_open(CP(display_encoding), "UTF-8");

		/*
		 * (iconv_t)-1 is the failure value of iconv_open(), not a
		 * descriptor: it must not be handed to iconv_close().
		 */
		if (d->conv_in == (iconv_t)(-1))
			d->conv_in = NULL;
		if (d->conv_out == (iconv_t)(-1))
			d->conv_out = NULL;
	}
#endif /* HAVE_ICONV_OPEN */
}

/*
 * mbdata_done:  Close whichever converters mbdata_init() opened and
 * zero *d again.
 */
void
mbdata_done(struct mb_data* d)
{
#ifdef HAVE_ICONV_OPEN
	if (d->conv_in)
		iconv_close(d->conv_in);
	if (d->conv_out)
		iconv_close(d->conv_out);
#endif /* HAVE_ICONV_OPEN */
	bzero(d, sizeof(*d));
}

/*
 * decode_mb:  Decode one character from ptr and report, in *data, how
 * many input bytes it consumed (input_bytes), how many bytes it produces
 * in utf-8 (output_bytes) and how many display columns it takes
 * (num_columns).  When dest is not NULL the output is also written there.
 *
 * Characters that cannot be displayed are written as their bytes,
 * masked to 7 bits and or'ed with 64, wrapped in a pair of REV_TOG.
 */
void
decode_mb(ptr, dest, data)
	u_char	*ptr;	/* Source, encoded in whatever */
	u_char	*dest;	/* Target, encoded in utf-8 - NULL is allowed */
	mb_data	*data;	/* Populated with data*/
{
#ifdef HAVE_ICONV_OPEN
	/* If iconv has now been initialized, use it. */
	if (data->conv_in && data->conv_out)
	{
		/* Task:
		 *    Eat input byte by byte
		 *    Until either
		 *      - the input is exhausted
		 *      - conv_in creates a character
		 *    When conv_in creates a character,
		 *      - feed the character to conv_out
		 *      - if conv_out rejects it
		 *        - we have an invalid character
		 *      - otherwise, analyze the unicode value
		 *        - For values 0000..001F: add 40, invert (invalid)
		 *        - For values 0080..009F: dec 40, invert (invalid)
		 *        - For values 3000..FEFF: (CJK) width=2
		 */
		u_char	utfbuf[8], *utfptr = utfbuf;
		size_t	utfspace = sizeof(utfbuf);
		unsigned unival;
		int	error = 0;
		size_t	retval = 0;

		data->input_bytes = 0;
		data->output_bytes = 0;
		data->num_columns = 0;

		*utfptr = '\0';
		while (*ptr != '\0')
		{
			unsigned gave;
			size_t	is = 1;

retry:
			gave = is;
			retval = iconv(data->conv_in,
			               (iconv_const char**)(void *)&ptr, &is,
			               (char **)(void *)&utfptr, &utfspace);

			/* Account for whatever iconv actually consumed. */
			data->input_bytes += gave-is;

			if (retval == (size_t)-1)
			{
				/* Any other errno falls out of the switch. */
				switch (errno)
				{
					case EINVAL:
						/* We didn't give enough bytes. Must give more */
						is = gave;
						if (ptr[is] != '\0')
						{
							++is;
							goto retry;
						}
						/* It's an undecodable input. */
						error = 1;
						data->input_bytes = 1;
						++ptr;
						goto endloop;
					case EILSEQ:
						/* Ignore invalid byte, continue loop. */
						error = 1;
						if (*ptr != '\0')
						{
							++ptr;
							++data->input_bytes;
						}
						continue;
				}
			}
			if (utfptr > utfbuf)
			{
				/* An UTF-8 character was created! */
				data->output_bytes += utfptr-utfbuf;
				*utfptr = '\0';
endloop:
				break;
			}
			break;
		}

		if (data->output_bytes == 0 && !error)
		{
			/* Nothing was produced, no errors. */
			return;
		}

		/* unival stays 0 (not displayable) if nothing was decoded. */
		unival = 0;
		if (data->output_bytes > 0)
		{
			/* Calculate the unicode value of the utf8 character */
			unival = calc_unival(utfbuf);
		}

		if (!displayable_unival(unival, data->conv_out))
		{
			/* The character could not be expressed in display encoding
			 * or would be a control character
			 */
			data->num_columns  = data->input_bytes;
			data->output_bytes = data->input_bytes;
			/* Two more bytes for the surrounding REV_TOG pair. */
			if (data->output_bytes > 0)
				data->output_bytes += 2;

			if (dest)
			{
				unsigned n = data->input_bytes;

				if (n > 0)
				{
					/* Rewind to the first consumed byte. */
					ptr -= n;
					*dest++ = REV_TOG;
					/* we assume ascii always works */
					while (n-- > 0)
						*dest++ = (*ptr++ & 127) | 64;
					*dest++ = REV_TOG;
				}
			}
			return;
		}

		data->num_columns = calc_unival_width(unival);
		if (dest)
		{
			memcpy(dest, utfbuf, data->output_bytes);
		}
		return;
	}
#endif /* HAVE_ICONV_OPEN */

	/* No usable iconv (maybe csets were invalid), assume ISO-8859-1 in */
	data->input_bytes = 1;
	data->num_columns = 1;
	if (!displayable_unival(*ptr, NULL))
	{
		data->output_bytes = 3;
		if (dest)
		{
			*dest++ = REV_TOG;
			*dest++ = (*ptr & 127) | 64;
			*dest++ = REV_TOG;
		}
	}
	else
	{
		unsigned unival = *ptr;

		if (unival < 0x80)
			data->output_bytes = 1;
		else if (unival < 0x800)
			data->output_bytes = 2;
		else if (unival < 0x10000)
			data->output_bytes = 3;
		else
			data->output_bytes = 4;
		if (dest)
			utf8_sequence(unival, dest);
	}
}

/*
 * set_irc_encoding:  Store enc in irc_encoding.  A trial iconv_open()
 * from enc to UTF-8 is made first; if it fails the user is warned, but
 * the value is stored regardless.
 */
void
set_irc_encoding(u_char *enc)
{
#ifdef HAVE_ICONV_OPEN
	iconv_t test = iconv_open("UTF-8", CP(enc));

	if (test != NULL && test != (iconv_t)(-1))
		iconv_close(test);
	else
		say("IRC_ENCODING value %s is not supported by this system", enc);
	malloc_strcpy(&irc_encoding, enc);
#else
	say("IRC_ENCODING has no effect - this version was compiled without iconv support");
#endif /* HAVE_ICONV_OPEN */
}

/*
 * set_display_encoding:  Store enc in display_encoding.  A trial
 * iconv_open() from UTF-8 to enc is made first; if it fails the user is
 * warned, but the value is stored regardless.
 */
void
set_display_encoding(u_char *enc)
{
#ifdef HAVE_ICONV_OPEN
	iconv_t test = iconv_open(CP(enc), "UTF-8");

	if (test != NULL && test != (iconv_t)(-1))
		iconv_close(test);
	else
		say("DISPLAY_ENCODING value %s is not supported by this system", enc);
	malloc_strcpy(&display_encoding, enc);
#else
	say("DISPLAY_ENCODING has no effect - this version was compiled without iconv support");
#endif /* HAVE_ICONV_OPEN */
}

/*
 * set_input_encoding:  Store enc in input_encoding.  A trial iconv_open()
 * from enc to UTF-8 is made first; if it fails the user is warned, but
 * the value is stored regardless.
 */
void
set_input_encoding(u_char *enc)
{
#ifdef HAVE_ICONV_OPEN
	iconv_t test = iconv_open("UTF-8", CP(enc));

	if (test != NULL && test != (iconv_t)(-1))
		iconv_close(test);
	else
		say("INPUT_ENCODING value %s is not supported by this system", enc);
	malloc_strcpy(&input_encoding, enc);
#else
	say("INPUT_ENCODING has no effect - this version was compiled without iconv support");
#endif /* HAVE_ICONV_OPEN */
}