/*************************************************************************
* text to html routine
* automagically inserts html code for links
*
* Copyright (c) 2000 Andy Kempling (aurikan@hotmail.com)
* Copyright (c) 2000-1 Colin Phipps <cphipps@doomworld.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*************************************************************************/
/* $Id: text2html.c,v 1.8 2001/07/21 10:18:18 cph Exp $ */
/*
This code is partially aimed to comply with the URI specs set down in
http://www.ietf.org/rfc/rfc2396.txt section 2, as applied to what characters
are 'legal' in URLs.
but i'm a cheater :-)
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "ircstats.h"
/* Bit flags recording which inline formatting tags are currently open.
   Each flag occupies its own bit so they can be OR-ed and XOR-ed together. */
enum html_flags_e {
  html_bold      = 1 << 0,  /* a <b> tag is open */
  html_underline = 1 << 1,  /* a <u> tag is open */
  html_colour    = 1 << 2   /* a <font> tag is open */
};
char * first_html_boundary(char * line)
{
  /* pre: line is null-terminated
     post: returns a pointer to the first character of line that needs
           special treatment in HTML output: < > newline " ( ) or one of
           the control codes 0x02, 0x03, 0x1f.  If none occurs, the
           returned pointer addresses the terminating \0.
  */
  /* note: ( and ) are not really html boundaries; they are listed so
     that brackets are never swallowed into a url-looking word.
  */
  static const char boundaries[] = "<>\n\"()\x02\x03\x1f";

  return line + strcspn(line, boundaries);
}
int lenword(char * start)
{
  /* pre: start is null terminated
     post: returns the number of characters before the first whitespace
           character (as judged by isspace) or the terminating \0,
           whichever comes first
  */
  const char * p = start;

  /* isspace() is only defined for EOF and unsigned char values, so the
     byte must be converted before the call; plain char may be negative
     for 8-bit text. */
  while (*p != '\0' && !isspace((unsigned char)*p))
    p++;

  return (int)(p - start);
}
char * backword(char * start, int max)
{
  /* pre: start -> line[max], i.e. at least max characters precede start
     post: returns a pointer to the beginning of the word containing
           start: the character just after the nearest preceding
           whitespace, or line[0] if no whitespace is found within max
           characters.  A max of zero or less returns start unchanged.
  */
  while (max-- > 0)
  {
    /* convert to unsigned char: isspace() is undefined for negative
       char values */
    if (isspace((unsigned char)start[-1]))
      return start;
    start--;
  }
  /* max characters examined without meeting whitespace: start -> line[0] */
  return start;
}
char * first_url_word(char * line)
{
  /* pre: line is null terminated
     post: returns a pointer to the first character of the first
           'url-looking' word in line, or to the terminating \0 if there
           is none.  A word is a run of non-whitespace characters; it is
           'url-looking' when it contains one of the strings in urlf.
  */
  /* TODO: add more urlf ? */
  static const char * const urlf [] =
  { "http://",
    "ftp://",
    "www.",
    ".com",
    ".org",
    ".net",
    ".ca",
    ".uk"
  };
  /* derived from the table so the count cannot drift out of step */
  enum { numsearch = sizeof urlf / sizeof urlf[0] };
  size_t len[numsearch];
  char * word = line;   /* start of the word currently being scanned */
  int x;

  for (x = 0; x < numsearch; x++)
    len[x] = strlen(urlf[x]);

  for (; *line; line++)
  {
    if (isspace((unsigned char)*line))
    {
      /* whitespace ends the current word; the next one starts after it */
      word = line + 1;
      continue;
    }
    /* Compare every flag string at every position.  A running
       per-pattern match counter would miss matches that overlap a failed
       partial match, e.g. "www." inside "wwww.example.de".  strncmp
       stops at the terminating \0, so it never reads past the line. */
    for (x = 0; x < numsearch; x++)
    {
      if (strncmp(line, urlf[x], len[x]) == 0)
        return word;
    }
  }
  return line;
}
char * s_a_r(char * replace)
{
  /* pre: replace is null terminated
     post: returns a malloced copy of replace in which every '&' has been
           replaced by the five-character HTML entity "&amp;".  The
           caller owns the result and must free() it.  Returns NULL if
           the allocation fails.
     note: this is easily extendable into a generic find-and-replace function
  */
  static const char entity[] = "&amp;";
  const size_t entity_len = sizeof entity - 1;
  const char * src;
  char * dest;
  char * out;
  size_t len = 0;
  size_t amps = 0;

  /* first pass: measure, so the result is allocated exactly once
     (no recursion, no intermediate copies) */
  for (src = replace; *src; src++)
  {
    if (*src == '&')
      amps++;
    len++;
  }

  /* each '&' grows from one byte to entity_len bytes */
  dest = malloc(len + amps * (entity_len - 1) + 1);
  if (!dest)
    return NULL;

  /* second pass: copy, expanding ampersands */
  out = dest;
  for (src = replace; *src; src++)
  {
    if (*src == '&')
    {
      memcpy(out, entity, entity_len);
      out += entity_len;
    }
    else
      *out++ = *src;
  }
  *out = '\0';
  return dest;
}
void url_inner(FILE * output, char * start)
{
  /* pre: start is null terminated
     post: prints the text to output, with every url-looking word (as
           found by first_url_word) wrapped in an <a href="..."> tag.
           A word containing '@' but no ':' is printed unchanged.
  */
  char * url = first_url_word(start);

  while (*url)
  {
    int x = lenword(url);
    int has_at = memchr(url, '@', x) != NULL;
    int has_colon = memchr(url, ':', x) != NULL;

    /* Text preceding the url word.  The precision argument of %.*s must
       be an int; a pointer difference is ptrdiff_t and is wider than int
       on many platforms, so convert explicitly. */
    fprintf(output, "%.*s", (int)(url - start), start);

    if (has_at && !has_colon)
    {
      /* '@' without ':' : emit as plain text, no link */
      fprintf(output, "%.*s", x, url);
    }
    else
    {
      fputs("<a href=\"", output);
      if (strncmp(url, "http://", 7) && strncmp(url, "ftp://", 6)) {
        /* cph - a little more logic here
         * No scheme given.  Default is http://, but words starting with
         * "ftp." or containing '@' (together with ':') get ftp:// */
        if (!strncmp(url, "ftp.", 4) || has_at)
          fputs("ftp://", output);
        else
          fputs("http://", output);
      }
      fprintf(output, "%.*s", x, url);
      fputs("\">", output);
      fprintf(output, "%.*s", x, url);
      fputs("</a>", output);
    }

    /* continue scanning after the word just handled */
    start = url + x;
    url = first_url_word(start);
  }
  /* whatever remains contains no url-looking word */
  fputs(start, output);
}
/* Formatting state shared by the output routines in this file: a bit mask
   of html_flags_e values recording which inline tags are currently open.
   output_content() clears it before each call to url_catcher(), which
   toggles the bits as it meets formatting control codes.

   FIXME: this was just imported from the old code.
   is it necessary? is there a better place for it to go? */
enum html_flags_e state;
/* HTML colour for each mIRC colour number.  The array is indexed directly
   by the number that follows a ^C (0x03) colour code, so it needs one
   entry for each of the sixteen standard colours, 0-15, in mIRC's order.
   Leaving out an entry (yellow and grey were missing) shifts every later
   colour onto the wrong index. */
const char* mirc_to_html_colour[] = {
  "#ffffff", /*  0 white */
  "#000000", /*  1 black */
  "#00007f", /*  2 blue */
  "#007f00", /*  3 green */
  "#7f0000", /*  4 red */
  "#7f7f00", /*  5 brown */
  "#7f007f", /*  6 purple */
  "#9f5f00", /*  7 orange */
  "#ffff00", /*  8 yellow */
  "#00ff00", /*  9 light green */
  "#009f9f", /* 10 teal */
  "#00ffff", /* 11 cyan */
  "#2020ff", /* 12 light blue */
  "#ff2020", /* 13 pink */
  "#7f7f7f", /* 14 grey */
  "#9f9f9f", /* 15 silver */
};
void url_catcher(FILE * output, char * line)
{
/* cph - eliminate tail recursion, saves stack thrashing */
while (*line) {
/* pre: line is null terminated
post: print to output the text with illegal characters
fixed and <a> tags inserted
*/
char * x = first_html_boundary(line);
if (x > line)
{
int len = (int)(x-line);
char * rec2 = (char *)malloc((len+1)*sizeof(char*));
strncpy(rec2, line, len);
rec2[len] = '\0';
url_inner(output, rec2);
free(rec2);
}
switch (*x)
{
case '\0':
return;
case '<':
fputs("<", output);
break;
case '>':
fputs(">", output);
break;
case '\n':
if (state & html_bold)
fputs("</b>", output);
if (state & html_underline)
fputs("</u>", output);
fputs("<br>", output);
state = 0;
break;
case '\"':
fputs(""", output);
break;
case 0x1f:
if (state & html_underline)
fputs("</u>", output);
else
fputs("<u>", output);
state ^= html_underline;
break;
case 0x02:
if (state & html_bold)
fputs("</b>", output);
else
fputs("<b>", output);
state ^= html_bold;
break;
case 0x03:
/* cph - mIRC's protocol-breaking colours
* See www.mirc.co.uk/help/color.txt
*/
if (state & html_colour)
fputs("</font>", output);
else {
int fgcl = atoi(++x);
if (*x) /* Skip past number */
x += strspn(x, "0123456789");
if (*x == ',') /* Ignore background for now */
x += 1 + strspn(x, "0123456789");
x--;
fprintf(output, "<font color=\"%s\">",
mirc_to_html_colour[fgcl]);
}
state ^= html_colour;
break;
default:
fputc(*x, output);
break;
}
/* cph - we already know *x, so only need check x[1]
* faster than strlen, since we read only 1 char */
line = x+1;
}
}
void output_content(FILE * output, char * line)
{
  /* a wrapper function for url_catcher
     pre: line is null terminated
     post: prints line to output as HTML, starting with no formatting
           tags open.  Ampersands are escaped first (s_a_r), then the
           result is passed to url_catcher.  Nothing is printed if the
           escaped copy could not be obtained.
  */
  char * buf = s_a_r(line);

  state = 0;
  if (!buf)
    return;  /* no buffer to convert; do not pass NULL on */
  url_catcher(output, buf);
  free(buf);
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */