/* ports//mail/elm+ME/work/elm2.4.ME+.124/lib/unicode.c */

/* Revision identification string: "@(#)" marker followed by the RCS $Id$
 * keyword.  Not referenced anywhere else in this file; presumably kept so
 * the revision can be identified from the compiled object.
 */
static char rcsid[] = "@(#)$Id: unicode.c,v 1.20 2006/04/16 21:01:35 hurtta Exp $";

/******************************************************************************
 *  The Elm (ME+) Mail System  -  $Revision: 1.20 $   $State: Exp $
 *
 *  Author: Kari Hurtta <hurtta+elm@posti.FMI.FI> (was hurtta+elm@ozone.FMI.FI)
 *****************************************************************************/

#include "headers.h"
#include "s_me.h"
#include "cs_imp.h"
#include "unidata.h"

/* Declares the debug handle 'Debug' that every DPRINT() call in this file
 * passes along with a numeric level.
 * NOTE(review): "charset" is presumably the debug class name -- confirm
 * against the definition of DEBUG_VAR.
 */
DEBUG_VAR(Debug,__FILE__,"charset");

/* NOTE:

              UOP_printable   returns 0 if character is not known to be
                              printable

              UOP_noctrl      returns 0 if character is known to be
                              control character

	      UOP_space       returns 0 if character is not known to be
	                      a space
*/

/* Apply operation 'op' to Unicode character 'ch'.
 *
 * Returns 0 if char not OK, otherwise char or converted char:
 *
 *   UOP_none        character is returned unchanged
 *   UOP_lowercase   lowercase mapping if one is known, otherwise the
 *                   character unchanged
 *   UOP_noctrl      0 if character is known to be a control character
 *                   (or is invalid), otherwise the character
 *   UOP_printable   0 if character is not known to be printable,
 *                   otherwise the character
 *   UOP_space       0 if character is not known to be a space,
 *                   otherwise the character
 *
 * When the UNIDATA database is available (default_unidata() != NULL)
 * the decision is based on the character type found there.  When
 * it is not available, or the lookup reports that the database is bad,
 * a built-in table covering only ASCII and Latin-1 is used instead.
 */
uint16 unicode_ch(ch,op)
     unsigned int ch;
     enum unicode_op op;
{
    struct unidata_mapped_data * XX = default_unidata();

    if (XX) {
	static int flip = 0;       /* log "got unidata" only once */

	if (!flip) {
	    flip = 1;
	    DPRINT(Debug,4,(&Debug,
			    "unicode_ch called -- got unidata\n"));
	}

	if (ch <= 0xFFFF) {
	    struct character_information info;
	    uint16 dummy[1];
	    int Y ;

	    /* bzero is defined on hdrs/defs.h */
	    bzero((void *)&info, sizeof info);

	    info.character_type = 0;

	    Y = unicode_lookup_character(XX,ch,&info,dummy,0);

	    if (Y < 0)  {       /* database bad */
		DPRINT(Debug,61,(&Debug,
				 "unicode_ch: failed to look at %04X -- database bad\n",
				 ch));
		/* Database unusable: use the built-in tables below */
		goto fail;
	    }
	    if (Y == 0) {       /* failure (not valid character) */
		DPRINT(Debug,61,(&Debug,
				 "unicode_ch: failed to look at %04X -- not a valid character or control character\n",
				 ch));
		if (!info.character_type) {
		    DPRINT(Debug,61,(&Debug,
				     "unicode_ch-No character type information for %04X\n",
				     ch));
		    /* Nothing known about it: treat as invalid character */
		    goto bad;
		}
		/* Lookup failed but the character type was filled in;
		   continue and classify by that type. */
	    }

	    switch(op) {
	    case UOP_none:
		break;
	    case UOP_lowercase:
		if (info.lower)
		    ch = info.lower;
		break;
	    case UOP_noctrl:

		/* Other, Control          */
		if (info.character_type == CHARTYPE_Cc)
		    ch = 0;

		/* Other, Not Assigned     */
		else if (info.character_type == CHARTYPE_Cn)
		    ch = 0;

		break;

	    case UOP_printable:

		/* Number                  */
		if (info.character_type & CHARTYPE_Number)
		    break;

		/* Separator, Space        */
		else if (info.character_type == CHARTYPE_Zs)
		    break;

		/* Letter                  */
		else if (info.character_type & CHARTYPE_Letter)
		    break;

		/* Punctuation             */
		else if (info.character_type & CHARTYPE_Punctuation)
		    break;

		/* Symbol                  */
		else if (info.character_type & CHARTYPE_Symbol)
		    break;

		else
		    ch = 0;
		break;

	    case UOP_space:

		/* Separator, Space        */
		if (info.character_type == CHARTYPE_Zs)
		    break;
		else
		    ch = 0;
		break;

	    default:                /* other operations: char unchanged */
		break;
	    }

	} else { /* Invalid character */

	bad:
	    /* Also reached by goto when the lookup gave no type information */
	    switch(op) {
	    case UOP_printable:     /* Invalid character is not printable */
	    case UOP_noctrl:        /* Consider invalid character as control */
	    case UOP_space:         /* Invalid character is not space */
		ch = 0;
		break;
	    default:                /* other operations: char unchanged */
		break;
	    }
	}

    } else {
	static int flip = 0;       /* log "no unidata" only once */

	if (!flip) {
	    flip = 1;
	    DPRINT(Debug,4,(&Debug,
			    "unicode_ch called -- no unidata\n"));
	}

    fail:
	/* Built-in ASCII / Latin-1 tables.  Also reached by goto when the
	   lookup reported that the database is bad. */

	switch(op) {
	case UOP_none:
	    break;
	case UOP_lowercase:
	    if (ch >= 0x0041 && ch <= 0x005A)               /* ASCII range  */
		ch = ( ch - 0x0041) + 0x0061;
	    else if (ch >= 0x00C0 && ch <= 0x00D6)          /* LATIN1 range */
		ch = ( ch - 0x00C0) + 0x00E0;
	    else if (ch >= 0x00D8 && ch <= 0x00DE)          /* LATIN1 range */
		ch = ( ch - 0x00D8) + 0x00F8;
	    break;
	case UOP_noctrl:
	    if (ch <= 0x001F)                              /* ASCII ctrl range */
		ch = 0;
	    else if (ch >= 0x007F && ch <= 0x009F)
		ch = 0;
	    break;
	case UOP_printable:
	    if (ch >= 0x0020 && ch <= 0x007E)              /* ASCII range */
		break;
	    /* The upper bound must be compared against ch; a bare
	       constant here is always true and would accept every
	       character from 0x00A0 upwards. */
	    else if (ch >= 0x00A0 && ch <= 0x00FF)         /* Latin1 range */
		break;
	    else
		ch = 0;                          /* Not known if printable */
	    break;
	case UOP_space:

	    if (ch == 0x0020)                               /* Ascii space */
		break;
	    else if (ch == 0x00A0)                /* non breaking space */
		break;
	    else
		ch = 0;                          /* Not known if space */
	    break;

	default:                    /* other operations: char unchanged */
	    break;
	}
    }

    return ch;
}

/* Rewrite the 'words' array (*len entries) in place using
 * unicode_compress_input() and store the resulting length in *len.
 *
 * Each step hands the remaining input to unicode_compress_input(),
 * which reports how many input words it consumed and writes one output
 * word.  A result is kept only when its type is DECOMP_canonical; for
 * any other type, and when the call fails, the current input word is
 * copied through unchanged.
 *
 * Returns 1 if compressed (at least one DECOMP_canonical result was
 * accepted), otherwise 0.  Without UNIDATA nothing is touched and 0 is
 * returned.
 */
int compress_unicode(words,len)
     uint16  *words;
     int *len;
{
    struct unidata_mapped_data * unidata = default_unidata();
    uint16 * out;
    int in_len;
    int in_pos  = 0;
    int out_len = 0;
    int compressed = 0;
    int k;

    if (!unidata)
	return 0;

    in_len = *len;
    out    = safe_malloc(in_len * sizeof (uint16));

    while (in_pos < in_len && out_len < in_len) {
	int kind;
	int used = unicode_compress_input(unidata,
					  out + out_len,
					  words + in_pos,
					  in_len - in_pos,
					  &kind);

	if (used < 1) {
	    DPRINT(Debug,61,(&Debug,
			     "unicode compress failes at [%d] %04X, ret=%d\n",
			     in_pos,words[in_pos],used));

	    /* Nothing usable here: pass the word through as is */
	    out[out_len] = words[in_pos];
	    out_len++;
	    in_pos++;
	    continue;
	}

	if (DECOMP_canonical == kind) {
	    compressed = 1;
	} else {
	    DPRINT(Debug,62,(&Debug,
			     "unicode compress type %d discarded:",
			     kind));
	    for (k = 0; k < used; k++) {
		DPRINT(Debug,62,(&Debug," %04X",
				 words[in_pos+k]));
	    }
	    DPRINT(Debug,62,(&Debug," => %04X\n",
			     out[out_len]));

	    /* Drop the result and consume a single unchanged word */
	    out[out_len] = words[in_pos];
	    used = 1;
	}

	/* Trace only the steps which really changed something */
	if (!(used == 1 && out[out_len] == words[in_pos])) {
	    DPRINT(Debug,62,(&Debug,
			     "unicode compress:"));
	    for (k = 0; k < used; k++) {
		DPRINT(Debug,62,(&Debug," %04X",
				 words[in_pos+k]));
	    }
	    DPRINT(Debug,62,(&Debug," => %04X\n",
			     out[out_len]));
	}

	/* One word produced, 'used' words consumed */
	out_len++;
	in_pos += used;
    }

    DPRINT(Debug,61,(&Debug,
		     "unicode compress  len %d => %d (%s)\n",
		     *len,out_len,
		     compressed ? "processed" : "no compression"));

    /* Hand the result back in the caller's buffer */
    for (k = 0; k < out_len; k++)
	words[k] = out[k];
    *len = out_len;

    free(out);

    return compressed;
}


/* ---------------------------------------------------------------------- */

/* Return the UNIDATA information loaded from 'unidata_path', or NULL
 * when none is available.
 *
 * A successfully loaded result is remembered and returned directly on
 * later calls.  Loading is attempted at most once; if unidata_path is
 * "none" no attempt is made at all.
 */
struct unidata_mapped_data * default_unidata()
{
    static struct unidata_mapped_data * cached = NULL;
    static int attempted = 0;

    /* Already loaded */
    if (cached)
	return cached;

    /* Use of UNIDATA is disabled */
    if (0 == strcmp(unidata_path,"none"))
	return cached;

    /* An earlier attempt has been made; do not retry */
    if (attempted)
	return cached;

    DPRINT(Debug,4,(&Debug,
		    "Loading UNIDATA information ... (file %s)\n",
		    unidata_path));

    if (!get_unidata(&cached,unidata_path)) {
	DPRINT(Debug,4,(&Debug,
			"... loading of UNIDATA information failed\n"));
    }
    attempted = 1;

    return cached;
}



/*
 * Local Variables:
 *  mode:c
 *  c-basic-offset:4
 *  buffer-file-coding-system: iso-8859-1
 * End:
 */
/* syntax highlighted by Code2HTML, v. 0.9.1 */