/* Version identification string ("@(#)" marker followed by the RCS $Id$
 * keyword) for this source file.
 */
static char rcsid[] = "@(#)$Id: url_element.c,v 1.6 2006/05/07 08:35:31 hurtta Exp $";

/******************************************************************************
 *  The Elm (ME+) Mail System  -  $Revision: 1.6 $   $State: Exp $
 *
 *  Author: Kari Hurtta <hurtta+elm@posti.FMI.FI>
 *      or  Kari Hurtta <elm@elmme-mailer.org>
 *****************************************************************************/

#include "def_url.h"
#include "s_me.h"

/* Debug handle of this file; it is passed as &Debug to the DPRINT calls
 * below.  NOTE(review): "url" is presumably the debug class name --
 * confirm against the definition of DEBUG_VAR.
 */
DEBUG_VAR(Debug,__FILE__,"url");

static unsigned char * s2us P_((char *str));
/* Gives the same pointer back typed as (unsigned char *), so that
 * char buffers can be handed to routines taking unsigned bytes
 * without a cast on every call.
 */
static unsigned char * s2us(str)
     char *str;
{
    unsigned char * bytes = (unsigned char *)str;

    return bytes;
}

/* RFC 1738:

   In addition, octets may be encoded by a character triplet consisting
   of the character "%" followed by the two hexadecimal digits (from
   "0123456789ABCDEF") which forming the hexadecimal value of the octet.
   (The characters "abcdef" may also be used in hexadecimal encodings.)

*/

static int unicode_hex P_((int ch));
/* Value of one hexadecimal digit.
 *
 * ch is an Unicode code point (not a character of local charset),
 * therefore digits are compared as code values.
 *
 * Returns 0 - 15, or -1 if ch is not one of 0-9, A-F, a-f.
 */
static int unicode_hex(ch)
     int ch;
{
    int value = -1;

    if (ch >= 0x0030 /* 0 */ && ch <= 0x0039 /* 9 */)
	value = ch - 0x0030 /* 0 */;
    else if (ch >= 0x0041 /* A */ && ch <= 0x0046 /* F */)
	value = 10 + (ch - 0x0041 /* A */);
    else if (ch >= 0x0061 /* a */ && ch <= 0x0066 /* f */)
	value = 10 + (ch - 0x0061 /* a */);

    return value;
}

/* NOTE:
     %XX     escapes are generally bytes in the charset of the remote
             protocol

     raw     the charset of the raw string is a local matter

     so these two charsets are not necessarily the same
*/
             

/* Decodes the %XX escapes of a raw URL element.
 *
 *   raw           URL element as it is written on the URL
 *   header_error  passed to process_header_error() when decoding fails
 *
 * Three candidate results are built side by side:
 *
 *   ret0   string with the charset of raw; returned when no escape
 *          decoded to a byte >= 0x80 and all additions succeeded
 *   ret1   UTF-8 string; returned when ret0 is not usable, at least one
 *          escape decoded to a byte >= 0x80 and all additions succeeded
 *   ret    RAW_BUFFER string (charset unknown); returned otherwise
 *
 * The two candidates which are not returned are freed here; the returned
 * string is new and belongs to the caller.
 *
 * Returns NULL (after reporting the error via header_error) if a '%' is
 * followed by two characters which are not both hexadecimal digits.
 *
 * NOTE(review): a '%' with fewer than two characters after it (at the
 * end of the element) is copied through literally and is NOT reported
 * as an error.
 */

struct string * raw_to_parsed(raw,header_error)
     struct string *raw;
     struct header_errors **header_error;
{
    charset_t utf8 = MIME_name_to_charset("UTF-8",0);
    struct string * ret0 = new_string(raw->string_type);
    struct string * ret = new_string(RAW_BUFFER);
    int ret0_ok = 1;
    struct string * ret1 = NULL;
    int ret1_ok = 1;

    int L = string_len(raw);
    int X;
    int flag_failure = 0;
    int encoded_8bit = 0;

    if (!utf8)
	panic("CHARSET PANIC",__FILE__,__LINE__,"raw_to_parsed",
	      "UTF-8 not found",0);
    ret1 = new_string(utf8);


    for (X = 0; X < L; X++) {
	uint16 ch = give_unicode_from_string(raw,X);

	if (0x0025 /* % */ == ch) {

	    /* Decode a run of consecutive %XX triplets. X is moved
	       past every triplet which is consumed.
	    */
	    while (X < L-2) {
		uint16 ch1;
		uint16 ch2;
		int v1, v2;
		unsigned char ch0;

		ch = give_unicode_from_string(raw,X);
		ch1 = give_unicode_from_string(raw,X+1);
		ch2 = give_unicode_from_string(raw,X+2);

		if (0x0025 /* % */ != ch) 
		    break;

		v1 = unicode_hex(ch1);
		v2 = unicode_hex(ch2);

		if (-1 == v1 || -1 == v2) {
		    DPRINT(Debug,10,(&Debug, 
				     "raw_to_parsed: Bad URL escape %C%C%C on URL element: %S\n",
				     ch,ch1,ch2,raw));

		    /* Result will be NULL, but the rest of element
		       is still walked through as literal text 
		    */
		    flag_failure++;
		    goto failure0;
		}

		ch0 = v1 * 16 + v2;

		/* Adding to RAW_BUFFER should never fail ... */
		if (!add_streambyte_to_string(ret,ch0))
		    panic("URL PANIC",__FILE__,__LINE__,"raw_to_parsed",
			  "add_streambyte_to_string failed",0);


		if (!add_streambyte_to_string(ret0,ch0))
		    ret0_ok = 0;


		if (!add_streambyte_to_string(ret1,ch0))
		    ret1_ok = 0;

		/* Charset of escaped 8-bit bytes is not known, so
		   charset of raw can not be used for result
		*/
		if (ch0 >= 0x80) {
		    ret0_ok = 0;
		    encoded_8bit = 1;
		}

		X += 3;
	    }

	    /* Whatever follows the triplets (ordinary characters, or
	       a '%' with fewer than two characters after it) is
	       copied by the literal branch below
	    */
	    if (X < L)
		goto failure0;

	} else {
	    /* Pick non-escaped sequence */
	    int pos;
	    struct string *temp;

	    char * buf;
	    int buflen;
	    int err0;
	    int err1;
	    
	    /* Entered also with goto from the branch above; all
	       variables of this block are assigned after the label
	    */
	failure0:
	    pos = X;
	    err0 = 0;
	    err1 = 0;

	    /* Character on X is always taken; advance X to the last 
	       character before next '%' (or to end of element)
	    */
	    while (X < L-1) {
		ch = give_unicode_from_string(raw,X+1);
		if (0x0025 /* % */ == ch) 
		    break;
		X++;
	    }

	    /* streamclip_from_string returns printable
	       characters only so it can not be used here 
	    */

	    temp = clip_from_string(raw,&pos,X-pos+1);

	    if (pos != X+1)
		panic("URL PANIC",__FILE__,__LINE__,"raw_to_parsed",
		      "Clipping Error",0);

	    bytestream_from_string(temp,&buf,&buflen);
	    
	    /* Adding to RAW_BUFFER should never fail ... */
	    if (buflen != add_streambytes_to_string(ret,buflen,s2us(buf),
						    NULL)) 
		panic("URL PANIC",__FILE__,__LINE__,"raw_to_parsed",
		      "add_streambytes_to_string failed",0);
	   
	    if (buflen != add_streambytes_to_string(ret0,buflen,s2us(buf),
						    &err0)) 
		ret0_ok = 0;
	    else if (err0 > 0)
		ret0_ok = 0;

	    if (buflen != add_streambytes_to_string(ret1,buflen,s2us(buf),
						    &err1)) 
		ret1_ok = 0;
	    else if (err1 > 0)
		ret1_ok = 0;

	    free(buf);
	    free_string(&temp);
	}
    }
        
    if (flag_failure) {
	DPRINT(Debug,2,(&Debug,
			"raw_to_parsed: Failed to decode URL element: %S\n",
			raw));
	
	process_header_error(header_error,
			     CATGETS(elm_msg_cat, MeSet,
				     MeFailedURLElement,
				     "Failed to decode URL element: %S"),
			     raw);

	free_string(&ret);
	free_string(&ret0);
	free_string(&ret1);
	return NULL;
    }


    if (ret0_ok) {

	DPRINT(Debug,50,(&Debug, 
			 "raw_to_parsed: %S => %S\n",
			 raw,ret0));

	free_string(&ret);
	free_string(&ret1);

	return ret0;
    } else if (ret1_ok && encoded_8bit) {

	DPRINT(Debug,50,(&Debug, 
			 "raw_to_parsed (UTF-8): %S => %S\n",
			 raw,ret1));

	free_string(&ret);
	free_string(&ret0);

	return ret1;
    } else {
	DPRINT(Debug,50,(&Debug, 
			 "raw_to_parsed (RAW BUFFER): %S => %S\n",
			 raw,ret));

	free_string(&ret0);
	free_string(&ret1);
	return ret;
    }
}

/* Does %XX escaping of a parsed URL element.
 *
 *   parsed    decoded URL element
 *
 * The string is converted to UTF-8 before escaping, except when its
 * type is RAW_BUFFER; then the bytes are escaped as they are.
 *
 * Only alphanumerics and the characters $-_.+!*'(), are copied
 * unescaped. Every other byte -- including a zero byte -- is written
 * as %XX.
 *
 * Returns US-ASCII string allocated with safe_malloc(); caller
 * releases it with free().
 */
char * parsed_to_raw(parsed)
     struct string *parsed;
{
    struct string * P = parsed;
    charset_t utf8 = MIME_name_to_charset("UTF-8",0);
    char * buf;
    int buflen;
    char *ret = NULL;
    int i,x = 0;

    if (!utf8)
	panic("CHARSET PANIC",__FILE__,__LINE__,"parsed_to_raw",
	      "UTF-8 not found",0);

    if (RAW_BUFFER != P->string_type)
	P = convert_string(utf8,parsed,0);

    /* RFC 1738 unsafe characters:

       space < > " # % { } | \ ^ ~ [ ] `

       RFC 1738 reserved characters:

       ; / ? : @ = &


       ( Is ` typo and should be ' ? )

       NO -- text says:

       Thus, only alphanumerics, the special characters "$-_.+!*'(),", and
       reserved characters used for their reserved purposes may be used
       unencoded within a URL.

    */

    bytestream_from_string(P,&buf,&buflen);

    /* Worst case: every byte expands to three characters */
    ret= safe_malloc(buflen*3+1);

    for (i = 0; i < buflen; i++) {
	unsigned char ch = buf[i];

	/* US-ASCII assumed ... 

	   strchr() also matches the terminating \0 of the searched
	   string, so zero byte must be excluded explicitly; 
	   otherwise it is copied unescaped and truncates the result.
	*/
	if ((ch >= '0' && ch <= '9') ||
	    (ch >= 'a' && ch <= 'z') ||
	    (ch >= 'A' && ch <= 'Z') ||
	    (ch != '\0' && NULL != strchr("$-_.+!*'(),",ch)))
	    ret[x++] = ch;
	else {
	    ret[x++] = '%';
	    ret[x++] = hexchars[ch / 16];
	    ret[x++] = hexchars[ch % 16];
	}

    }

    ret[x] = '\0';

    free(buf);

    /* P is a temporary copy only if conversion was done */
    if (P != parsed)
	free_string(&P);

    return ret;
}

/* ----------------------------------------------------------------------- */

/* Value of url_element.magic while the element is valid;
   free_url_element() resets the field to 0 
*/
#define URL_element_magic    0xEC02

/* One element of URL, kept on escaped (raw) and/or decoded (parsed)
   form. The missing form is NULL until it is asked for.
*/
struct url_element {
    unsigned short        magic;     /* URL_element_magic */

    struct string       * raw;       /* with %XX escapes, or NULL */
    struct string       * parsed;    /* %XX decoded, or NULL      */
};

static struct url_element *alloc_element P_((void));
/* Allocates an empty element: memory is cleared, magic is set and
 * both string pointers are NULL.
 */
static struct url_element *alloc_element()
{
    struct url_element * elem;

    elem = safe_malloc (sizeof (struct url_element));

    /* bzero is defined hdrs/defs.h */
    bzero((void *)elem,sizeof (struct url_element));

    /* Pointers are also set explicitly, not left to bzero only */
    elem->raw    = NULL;
    elem->parsed = NULL;
    elem->magic  = URL_element_magic;

    return elem;
}

/* Creates a new element from the escaped (raw) form. The argument is
 * duplicated; the parsed form is left NULL.
 */
struct url_element * element_from_raw(raw)
     struct string *raw;
{
    struct url_element * elem;

    elem      = alloc_element();
    elem->raw = dup_string(raw);

    return elem;
}

/* Creates a new element from the decoded (parsed) form. The argument
 * is duplicated; the raw form is left NULL.
 */
struct url_element  * element_from_parsed(parsed)
     struct string *parsed;
{
    struct url_element * elem;

    elem         = alloc_element();
    elem->parsed = dup_string(parsed);

    return elem;
}

/* Frees the element and both of its strings, and resets the caller's
 * pointer to NULL. Panics if the magic number of element is wrong.
 *
 * *ptr is dereferenced without a check, so it must not be NULL.
 */
void free_url_element(ptr)
     struct url_element **ptr;
{
    struct url_element * elem = *ptr;

    if (elem->magic != URL_element_magic)
	panic("URL PANIC",__FILE__,__LINE__,"free_url_element",
	      "bad magic number",0);

    if (NULL != elem->parsed)
	free_string(&(elem->parsed));
    if (NULL != elem->raw)
	free_string(&(elem->raw));

    /* Invalidate, so that use of stale pointer hits the magic check */
    elem->magic = 0;

    free (elem);
    *ptr = NULL;
}


/* Gives the decoded form of the element. If only the raw form is
 * available, it is decoded with raw_to_parsed() and the result is
 * stored on the element.
 *
 * Returned string is the one stored on elem. Returns NULL if the 
 * element has neither form or if raw_to_parsed() returned NULL.
 */
CONST struct string * parsed_from_element(elem,header_error)
     struct url_element *elem;
     struct header_errors **header_error;
{
    if (elem->magic != URL_element_magic)
	panic("URL PANIC",__FILE__,__LINE__,"parsed_from_element",
	      "bad magic number",0);

    /* Decode only when there is no parsed form yet and there is 
       something to decode 
    */
    if (!elem->parsed && elem->raw)
	elem->parsed = raw_to_parsed(elem->raw,header_error);

    return elem->parsed;
}

/* Gives the escaped (raw) form of the element. If only the parsed form
 * is available, it is escaped with parsed_to_raw() and the result is
 * stored on the element as US-ASCII string.
 *
 * Returned string is the one stored on elem. Returns NULL if the 
 * element has neither form or if parsed_to_raw() returned NULL.
 *
 * Panics if the magic number is wrong or if the US-ASCII charset is 
 * not found (that is checked even when the raw form already exists).
 */
CONST struct string * raw_from_element(elem)
     struct url_element *elem;
{
     charset_t ascii_ptr = MIME_name_to_charset("US-ASCII",0);
     char * buffer;

    if (URL_element_magic != elem->magic)
	panic("URL PANIC",__FILE__,__LINE__,"raw_from_element",
	      "bad magic number",0);

    if (!ascii_ptr)
	panic("CHARSET PANIC",__FILE__,__LINE__,"raw_from_element",
	      "US-ASCII not found",0);


    if (elem->raw)
	return elem->raw;

    if (!elem->parsed)
	return NULL;

    buffer = parsed_to_raw(elem->parsed);

    if (!buffer)
	return NULL;

    elem->raw = new_string2(ascii_ptr,s2us(buffer));

    /* buffer is released right after new_string2(); only elem->raw
       is kept
    */
    free(buffer);

    return elem->raw;
}

/* Makes a copy of the element. The forms (raw, parsed) which exist on
 * elem are duplicated; missing ones stay NULL on the copy.
 * Panics if the magic number of elem is wrong.
 */
struct url_element * dup_url_element(elem)
     struct url_element *elem;
{
    struct url_element * copy = alloc_element();

    if (elem->magic != URL_element_magic)
	panic("URL PANIC",__FILE__,__LINE__,"dup_url_element",
	      "bad magic number",0);

    copy->parsed = elem->parsed ? dup_string(elem->parsed) : NULL;
    copy->raw    = elem->raw    ? dup_string(elem->raw)    : NULL;

    return copy;
}


/*
 * Local Variables:
 *  mode:c
 *  c-basic-offset:4
 *  buffer-file-coding-system: iso-8859-1
 * End:
 */