/* Revision identification string for this file: a "@(#)" marker followed by
 * an expanded RCS Id keyword.  It is not referenced by any code visible in
 * this file. */
static char rcsid[] = "@(#)$Id: unicode.c,v 1.20 2006/04/16 21:01:35 hurtta Exp $";
/******************************************************************************
* The Elm (ME+) Mail System - $Revision: 1.20 $ $State: Exp $
*
* Author: Kari Hurtta <hurtta+elm@posti.FMI.FI> (was hurtta+elm@ozone.FMI.FI)
*****************************************************************************/
#include "headers.h"
#include "s_me.h"
#include "cs_imp.h"
#include "unidata.h"
DEBUG_VAR(Debug,__FILE__,"charset");
/* NOTE:
   UOP_printable  returns 0 if the character is not known to be
                  printable
   UOP_noctrl     returns 0 if the character is known to be
                  a control character
   UOP_space      returns 0 if the character is not known or
                  is not a space
*/
/*
 * unicode_ch -- test or convert one Unicode code value.
 *
 *   ch   code value to examine
 *   op   operation to apply:
 *          UOP_none       no test, value is passed through
 *          UOP_lowercase  replace with the lowercase mapping when one is known
 *          UOP_noctrl     0 if the character is a control character
 *                         (unassigned and invalid characters count as control)
 *          UOP_printable  0 if the character is not known to be printable
 *          UOP_space      0 if the character is not known to be a space
 *
 * Returns 0 if char not OK, otherwise char or converted char.
 *
 * When default_unidata() supplies a database, the character type reported by
 * unicode_lookup_character() decides the result.  When there is no database,
 * or the lookup returns a negative value (database bad), built-in
 * ASCII / Latin-1 rules are used instead.
 *
 * NOTE(review): the return type is uint16 while ch is unsigned int, so a
 * value above 0xFFFF that is passed through unchanged is narrowed on
 * return -- confirm callers never depend on such values.
 */
uint16 unicode_ch(ch,op)
     unsigned int ch;
     enum unicode_op op;
{
    struct unidata_mapped_data * XX = default_unidata();

    if (XX) {
        static int flip = 0;

        /* Report only once that the database is in use */
        if (!flip) {
            flip = 1;
            DPRINT(Debug,4,(&Debug,
                            "unicode_ch called -- got unidata\n"));
        }

        if (ch <= 0xFFFF) {
            struct character_information info;
            uint16 dummy[1];
            int Y ;

            /* bzero is defined on hdrs/defs.h */
            bzero((void *)&info, sizeof info);
            info.character_type = 0;

            Y = unicode_lookup_character(XX,ch,&info,dummy,0);

            if (Y < 0) {   /* database bad */
                DPRINT(Debug,61,(&Debug,
                                 "unicode_ch: failed to look at %04X -- database bad\n",
                                 ch));
                /* Fall back to the built-in rules below */
                goto fail;
            }

            if (Y == 0) {  /* failure (not valid character) */
                DPRINT(Debug,61,(&Debug,
                                 "unicode_ch: failed to look at %04X -- not a valid character or control character\n",
                                 ch));

                /* Without a character type there is nothing to test;
                   treat the value as an invalid character. */
                if (!info.character_type) {
                    DPRINT(Debug,61,(&Debug,
                                     "unicode_ch-No character type information for %04X\n",
                                     ch));
                    goto bad;
                }
            }

            switch(op) {
            case UOP_none:
                break;
            case UOP_lowercase:
                if (info.lower)
                    ch = info.lower;
                break;
            case UOP_noctrl:
                /* Other, Control */
                if (info.character_type == CHARTYPE_Cc)
                    ch = 0;
                /* Other, Not Assigned */
                else if (info.character_type == CHARTYPE_Cn)
                    ch = 0;
                break;
            case UOP_printable:
                /* Number */
                if (info.character_type & CHARTYPE_Number)
                    break;
                /* Separator, Space */
                else if (info.character_type == CHARTYPE_Zs)
                    break;
                /* Letter */
                else if (info.character_type & CHARTYPE_Letter)
                    break;
                /* Punctuation */
                else if (info.character_type & CHARTYPE_Punctuation)
                    break;
                /* Symbol */
                else if (info.character_type & CHARTYPE_Symbol)
                    break;
                else
                    ch = 0;
                break;
            case UOP_space:
                /* Separator, Space */
                if (info.character_type == CHARTYPE_Zs)
                    break;
                else
                    ch = 0;
                break;
            }

        } else {   /* Invalid character */

        bad:
            switch(op) {
            case UOP_printable:  /* Invalid character is not printable */
            case UOP_noctrl:     /* Consider invalid character as control */
            case UOP_space:      /* Invalid character is not space */
                ch = 0;
                break;
            default:             /* Other operations leave the value as is */
                break;
            }
        }

    } else {
        static int flip = 0;

        /* Report only once that the database is missing */
        if (!flip) {
            flip = 1;
            DPRINT(Debug,4,(&Debug,
                            "unicode_ch called -- no unidata\n"));
        }

    fail:
        /* Built-in rules: only ASCII and Latin-1 are known here */
        switch(op) {
        case UOP_none:
            break;
        case UOP_lowercase:
            if (ch >= 0x0041 && ch <= 0x005A)        /* ASCII range */
                ch = ( ch - 0x0041) + 0x0061;
            else if (ch >= 0x00C0 && ch <= 0x00D6)   /* LATIN1 range */
                ch = ( ch - 0x00C0) + 0x00E0;
            else if (ch >= 0x00D8 && ch <= 0x00DE)   /* LATIN1 range */
                ch = ( ch - 0x00D8) + 0x00F8;
            break;
        case UOP_noctrl:
            if (ch <= 0x001F)                        /* ASCII ctrl range */
                ch = 0;
            else if (ch >= 0x007F && ch <= 0x009F)
                ch = 0;
            break;
        case UOP_printable:
            if (ch >= 0x0020 && ch <= 0x007E)        /* ASCII range */
                break;
            /* The upper bound must be compared against ch; a bare
               constant here would accept every value from 0x00A0 up. */
            else if (ch >= 0x00A0 && ch <= 0x00FF)   /* Latin1 range */
                break;
            else
                ch = 0;   /* Not known if printable */
            break;
        case UOP_space:
            if (ch == 0x0020)        /* Ascii space */
                break;
            else if (ch == 0x00A0)   /* non breaking space */
                break;
            else
                ch = 0;   /* Not known if space */
            break;
        }
    }

    return ch;
}
/*
 * compress_unicode -- run unicode_compress_input() over a buffer, in place.
 *
 *   words  buffer holding *len code values; overwritten with the result
 *   len    in:  number of values in words
 *          out: number of values left after processing
 *
 * The buffer is walked from the start.  At each position
 * unicode_compress_input() is asked to produce one output value from the
 * remaining input; its return value is taken as the number of input values
 * consumed.  If it returns less than 1, or reports a type other than
 * DECOMP_canonical, the current input value is copied unchanged and one
 * value is consumed.
 *
 * Returns 1 if compressed, i.e. at least one step reported
 * DECOMP_canonical; otherwise 0.  Without unidata, or when *len is not
 * positive, words and *len are left untouched and 0 is returned.
 */
int compress_unicode(words,len)
     uint16 *words;
     int *len;
{
    struct unidata_mapped_data * XX = default_unidata();

    if (XX) {
        int have_comp = 0;
        int L = *len,i;
        uint16 *new_buffer;
        int X = 0;

        /* Nothing to process; this also avoids asking the allocator
           for a zero-sized (or negative-sized) block. */
        if (L <= 0)
            return 0;

        /* Output never grows: each step writes one value and
           consumes at least one. */
        new_buffer = safe_malloc(L * sizeof (uint16));

        for (i = 0; i < *len && X < L;) {
            int comp_type;
            int ret = unicode_compress_input(XX,
                                             &(new_buffer[X]),
                                             words + i,
                                             *len - i,
                                             &comp_type);

            if (ret < 1) {
                DPRINT(Debug,61,(&Debug,
                                 "unicode compress failes at [%d] %04X, ret=%d\n",
                                 i,words[i],ret));

                /* Skip (copy) character */
                new_buffer[X] = words[i];
                X++;
                i++;
                continue;
            }

            if (comp_type != DECOMP_canonical) {
                int j;

                DPRINT(Debug,62,(&Debug,
                                 "unicode compress type %d discarded:",
                                 comp_type));
                for (j = 0; j < ret; j++) {
                    DPRINT(Debug,62,(&Debug," %04X",
                                     words[i+j]));
                }
                DPRINT(Debug,62,(&Debug," => %04X\n",
                                 new_buffer[X]));

                /* Discard compression */
                new_buffer[X] = words[i];
                ret = 1;
            } else
                have_comp = 1;

            /* Trace only the steps that changed something */
            if (ret != 1 || new_buffer[X] != words[i]) {
                int j;

                DPRINT(Debug,62,(&Debug,
                                 "unicode compress:"));
                for (j = 0; j < ret; j++) {
                    DPRINT(Debug,62,(&Debug," %04X",
                                     words[i+j]));
                }
                DPRINT(Debug,62,(&Debug," => %04X\n",
                                 new_buffer[X]));
            }

            X++;
            /* ret characters consumed */
            i += ret;
        }

        DPRINT(Debug,61,(&Debug,
                         "unicode compress len %d => %d (%s)\n",
                         *len,X,
                         have_comp ? "processed" : "no compression"));

        /* Copy the result back over the caller's buffer */
        for (i = 0; i < X; i++) {
            words[i] = new_buffer[i];
        }
        *len = X;

        free(new_buffer);

        return have_comp;
    }

    return 0;
}
/* ---------------------------------------------------------------------- */
/*
 * default_unidata -- return the unidata mapping, loading it on first use.
 *
 * The result of a successful get_unidata() call is kept in a function-local
 * static and returned on every later call.  Loading is attempted at most
 * once, and not at all while unidata_path is "none".  Returns NULL when no
 * mapping is available.
 */
struct unidata_mapped_data * default_unidata()
{
    static struct unidata_mapped_data * cached = NULL;
    static int attempted = 0;

    /* Already loaded */
    if (cached != NULL)
        return cached;

    /* Loading disabled, or already tried once */
    if (strcmp(unidata_path,"none") == 0 || attempted)
        return cached;

    DPRINT(Debug,4,(&Debug,
                    "Loading UNIDATA information ... (file %s)\n",
                    unidata_path));

    if (!get_unidata(&cached,unidata_path)) {
        DPRINT(Debug,4,(&Debug,
                        "... loading of UNIDATA information failed\n"));
    }

    /* Remember the attempt whether or not it succeeded */
    attempted = 1;

    return cached;
}
/*
* Local Variables:
* mode:c
* c-basic-offset:4
* buffer-file-coding-system: iso-8859-1
* End:
*/
/* syntax highlighted by Code2HTML, v. 0.9.1 */