static char rcsid[] = "@(#)$Id: unicode.c,v 1.20 2006/04/16 21:01:35 hurtta Exp $"; /****************************************************************************** * The Elm (ME+) Mail System - $Revision: 1.20 $ $State: Exp $ * * Author: Kari Hurtta (was hurtta+elm@ozone.FMI.FI) *****************************************************************************/ #include "headers.h" #include "s_me.h" #include "cs_imp.h" #include "unidata.h" DEBUG_VAR(Debug,__FILE__,"charset"); /* NOTE: UOP_printable returns 0 if character is not known to be printable UOP_noctrl returns 0 if character is known to be control character UOP_space return 0 if character is not known or is not space */ /* Returns 0 if char not OK, otherwise char or converted char */ uint16 unicode_ch(ch,op) unsigned int ch; enum unicode_op op; { struct unidata_mapped_data * XX = default_unidata(); if (XX) { static int flip = 0; if (!flip) { flip = 1; DPRINT(Debug,4,(&Debug, "unicode_ch called -- got unidata\n")); } if (ch <= 0xFFFF) { struct character_information info; uint16 dummy[1]; int Y ; /* bzero is defined on hdrs/defs.h */ bzero((void *)&info, sizeof info); info.character_type = 0; Y = unicode_lookup_character(XX,ch,&info,dummy,0); if (Y < 0) { /* database bad */ DPRINT(Debug,61,(&Debug, "unicode_ch: failed to look at %04X -- database bad\n", ch)); goto fail; } if (Y == 0) { /* failure (not valid character) */ DPRINT(Debug,61,(&Debug, "unicode_ch: failed to look at %04X -- not a valid character or control character\n", ch)); if (!info.character_type) { DPRINT(Debug,61,(&Debug, "unicode_ch-No character type information for %04X\n", ch)); goto bad; } } switch(op) { case UOP_none: break; case UOP_lowercase: if (info.lower) ch = info.lower; break; case UOP_noctrl: /* Other, Control */ if (info.character_type == CHARTYPE_Cc) ch = 0; /* Other, Not Assigned */ else if (info.character_type == CHARTYPE_Cn) ch = 0; break; case UOP_printable: /* Number */ if (info.character_type & CHARTYPE_Number) break; /* Separator, Space */ else if (info.character_type == CHARTYPE_Zs) break; /* Letter */ else if (info.character_type & CHARTYPE_Letter) break; /* Punctuation */ else if (info.character_type & CHARTYPE_Punctuation) break; /* Symbol */ else if (info.character_type & CHARTYPE_Symbol) break; else ch = 0; break; case UOP_space: /* Separator, Space */ if (info.character_type == CHARTYPE_Zs) break; else ch = 0; break; } } else { /* Invalid character */ bad: switch(op) { case UOP_printable: /* Invalid character is not printable */ case UOP_noctrl: /* Consider invalid character as control */ case UOP_space: /* Invalid character is not space */ ch = 0; } } } else { static int flip = 0; if (!flip) { flip = 1; DPRINT(Debug,4,(&Debug, "unicode_ch called -- no unidata\n")); } fail: switch(op) { case UOP_none: break; case UOP_lowercase: if (ch >= 0x0041 && ch <= 0x005A) /* ASCII range */ ch = ( ch - 0x0041) + 0x0061; else if (ch >= 0x00C0 && ch <= 0x00D6) /* LATIN1 range */ ch = ( ch - 0x00C0) + 0x00E0; else if (ch >= 0x00D8 && ch <= 0x00DE) /* LATIN1 range */ ch = ( ch - 0x00D8) + 0x00F8; break; case UOP_noctrl: if (ch <= 0x001F) /* ASCII ctrl range */ ch = 0; else if (ch >= 0x007F && ch <= 0x009F) ch = 0; break; case UOP_printable: if (ch >= 0x0020 && ch <= 0x007E) /* ASCII range */ break; else if (ch >= 0x00A0 && 0x00FF) /* Latin1 range */ break; else ch = 0; /* Not known if printable */ break; case UOP_space: if (ch == 0x0020) /* Ascii space */ break; else if (ch == 0x00A0) /* non breaking space */ break; else ch = 0; /* Not known if space */ break; } } return ch; } /* Returns 1 if compressed */ int compress_unicode(words,len) uint16 *words; int *len; { struct unidata_mapped_data * XX = default_unidata(); if (XX) { int have_comp = 0; int L = *len,i; uint16 *new_buffer = safe_malloc(L * sizeof (uint16)); int X = 0; for (i = 0; i < *len && X < L;) { int comp_type; int ret = unicode_compress_input(XX, &(new_buffer[X]), words + i, *len - i, &comp_type); if (ret < 1) { DPRINT(Debug,61,(&Debug, "unicode compress failes at [%d] %04X, ret=%d\n", i,words[i],ret)); /* Skip (copy) character */ new_buffer[X] = words[i]; X++; i++; continue; } if (comp_type != DECOMP_canonical) { int j; DPRINT(Debug,62,(&Debug, "unicode compress type %d discarded:", comp_type)); for (j = 0; j < ret; j++) { DPRINT(Debug,62,(&Debug," %04X", words[i+j])); } DPRINT(Debug,62,(&Debug," => %04X\n", new_buffer[X])); /* Discard compression */ new_buffer[X] = words[i]; ret = 1; } else have_comp = 1; if (ret != 1 || new_buffer[X] != words[i]) { int j; DPRINT(Debug,62,(&Debug, "unicode compress:")); for (j = 0; j < ret; j++) { DPRINT(Debug,62,(&Debug," %04X", words[i+j])); } DPRINT(Debug,62,(&Debug," => %04X\n", new_buffer[X])); } X++; /* ret characters consumed */ i += ret; } DPRINT(Debug,61,(&Debug, "unicode compress len %d => %d (%s)\n", *len,X, have_comp ? "processed" : "no compression")); for (i = 0; i < X; i++) { words[i] = new_buffer[i]; } *len = X; free(new_buffer); return have_comp; } return 0; } /* ---------------------------------------------------------------------- */ struct unidata_mapped_data * default_unidata() { static struct unidata_mapped_data * res = NULL; if (res) return res; if (0 != strcmp(unidata_path,"none")) { static int tried = 0; if (!tried) { DPRINT(Debug,4,(&Debug, "Loading UNIDATA information ... (file %s)\n", unidata_path)); if (!get_unidata(&res,unidata_path)) { DPRINT(Debug,4,(&Debug, "... loading of UNIDATA information failed\n")); } tried = 1; } } return res; } /* * Local Variables: * mode:c * c-basic-offset:4 * buffer-file-coding-system: iso-8859-1 * End: */