/* ports//irc/gruftistats/work/gruftistats-0.2.4/text2html.c */

/*************************************************************************
 *  text to html routine   
 *  automagically inserts html code for links   
 *
 * Copyright (c) 2000    Andy Kempling (aurikan@hotmail.com)
 * Copyright (c) 2000-1  Colin Phipps <cphipps@doomworld.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *************************************************************************/

/* $Id: text2html.c,v 1.8 2001/07/21 10:18:18 cph Exp $ */


/*
  This code is partially aimed to comply with the URI specs set down in
  http://www.ietf.org/rfc/rfc2396.txt section 2, as applied to what characters
  are 'legal' in URLs.
  
  but i'm a cheater :-)
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "ircstats.h"

/* Bit flags, one per inline formatting tag; OR-ed together into a bitmask. */
enum html_flags_e {
  html_bold      = 1 << 0,
  html_underline = 1 << 1,
  html_colour    = 1 << 2
};

char * first_html_boundary(char * line)
{
  /* pre: line is null-terminated
     post: returns a pointer to the first character of line that is one of
     the boundary characters below, or to the terminating \0 when none
     of them occurs (so the result is always a valid pointer into line).
  */

  /* '(' and ')' are not really html boundaries, but they are treated as
     such here so that they split the text the same way the others do.
     0x2, 0x3 and 0x1f are control bytes that must also split the text.
  */
  static const char boundary [] =
  { '<', '>', '\n', '\"', '(', ')', 0x2, 0x3, 0x1f, '\0' };

  /* strcspn counts the leading characters that are NOT in boundary and
     stops at the terminator by itself. */
  return line + strcspn(line, boundary);
}

int lenword(char * start)
{
  /* pre: start is null terminated
     post: returns the number of characters before the first whitespace
     character (as defined by isspace) or the terminating \0,
     whichever comes first
  */
  int x = 0;

  /* isspace() is only defined for values representable as unsigned char
     (or EOF); plain char may be negative for 8-bit text, so convert. */
  while (start[x] != '\0' && !isspace((unsigned char)start[x]))
    x++;

  return x;
}

char * backword(char * start, int max)
{
  /* pre: start -> line[max], i.e. at least max characters precede start
     post: returns ptr to the beginning of the word containing start:
     the character just after the nearest preceding whitespace
     (as defined by isspace), or line[0] if there is none within
     max characters.
     note that the separations like < > \n " ( ) \0 have already been
     considered by the caller.
  */

  /* max <= 0 means there is nothing before start to look at; testing
     "> 0" also keeps a negative max from walking off the buffer. */
  while (max-- > 0)
    {
      --start;
      /* isspace() needs an unsigned char value; plain char may be
         negative for 8-bit text. */
      if (isspace((unsigned char)*start))
	return start+1;
    }
  /* ran out of characters: start -> line[0] */
  return start;
}

char * first_url_word(char * line)
{
  /* pre: line is null terminated
     post: returns the first character of the first 'url-looking' word,
     or a pointer to the terminating \0 if there is none.

     a 'url-looking' word is a 'word' (see backword) which contains one
     of the strings in urlf below.
  */

  /* TODO: add more urlf ? */

  static const char * const urlf [] =
  { "http://",
    "ftp://",
    "www.",
    ".com",
    ".org",
    ".net",
    ".ca",
    ".uk"
  };
  /* compile-time count, so the scratch array below is not a VLA */
  enum { numsearch = sizeof urlf / sizeof urlf[0] };
  size_t len[numsearch];
  int x, y;

  for (x = 0; x < numsearch; x++)
    len[x] = strlen(urlf[x]);

  /* For each position y, ask whether any flag string ENDS at line[y].
     Comparing the bytes directly (rather than tracking a running match
     count per string) cannot miss a match that overlaps a failed partial
     match, e.g. the "www." inside "wwww.". */
  for (y = 0; line[y]; y++)
    {
      for (x = 0; x < numsearch; x++)
	{
	  if ((size_t)y + 1 >= len[x]
	      && !memcmp(line + y + 1 - len[x], urlf[x], len[x]))
	    return backword(line + y, y);
	}
    }
  /* no match: line + y is the terminating \0 */
  return line + y;
}

char * s_a_r(char * replace)
{
  /* pre: replace is null terminated
     post: returns a malloced string with all '&' replaced by '&amp;'.
     The caller owns the result and must free() it.
     Returns NULL if the allocation fails.
     note: this is easily extendable into a generic find-and-replace function
  */
  static const char amp[] = "&amp;";
  const size_t amplen = sizeof amp - 1;
  const char * src;
  char * dest;
  char * out;
  size_t namp = 0;
  size_t srclen;

  /* first pass: measure the input and count the ampersands so the
     result can be allocated exactly once */
  for (src = replace; *src; src++)
    {
      if (*src == '&')
	namp++;
    }
  srclen = (size_t)(src - replace);

  /* each '&' grows by (amplen - 1) characters; +1 for the terminator */
  dest = malloc(srclen + namp * (amplen - 1) + 1);
  if (!dest)
    return NULL;

  /* second pass: copy, expanding as we go */
  out = dest;
  for (src = replace; *src; src++)
    {
      if (*src == '&')
	{
	  memcpy(out, amp, amplen);
	  out += amplen;
	}
      else
	*out++ = *src;
    }
  *out = '\0';

  return dest;
}

void url_inner(FILE * output, char * start)
{
  /* pre: start is null terminated
     post: prints to output the text, with <a> tags inserted around each
     'url-looking' word (see first_url_word)
  */

  char * url;
  int x;
  void * ftplogin;

  url = first_url_word(start);
  while (*url)
    {
      x = lenword(url);
      /* Plain text in front of the url word.  %.*s takes the maximum
       * number of characters to print from an int argument; url-start
       * is a ptrdiff_t, so it must be converted explicitly. */
      fprintf(output, "%.*s", (int)(url-start), start);

      /* A word with an '@' but no ':' (user@example.com) is printed
       * as plain text rather than linked. */
      if ((ftplogin = memchr(url, '@', x)) && !memchr(url, ':', x))
       fprintf(output, "%.*s", x, url);
      else
       {
       fputs("<a href=\"", output);
       if (strncmp(url, "http://", 7) && strncmp(url, "ftp://", 6)) {
 	/* cph - a little more logic here
	 * Default is http://, but ftp. urls are probably ftp://
	 * and so are words with both '@' and ':' (user:pass@host) */
	if (!strncmp(url, "ftp.", 4) || ftplogin)
	  fputs("ftp://", output);
	else
	  fputs("http://", output);
       }
       fprintf(output, "%.*s", x, url);
       fputs("\">", output);
       fprintf(output, "%.*s", x, url);
       fputs("</a>", output);
       }
      /* continue scanning after the word just handled */
      start = url + x;
      url = first_url_word(start);
    }
  /* whatever is left contains no url word */
  fputs(start, output); 
}

/* Bitmask of html_flags_e values for the formatting tags that are
   currently open in the output.  Reset to 0 by output_content() and
   at every newline handled in url_catcher().

   FIXME: this was just imported from the old code.
   is it necessary?  is there a better place for it to go? */
enum html_flags_e state;

/* HTML colour for each mIRC colour number; the array index is the number
   that follows the 0x03 colour byte.  mIRC numbers its basic palette
   0-15, with yellow at 8 and grey at 14; leaving those out shifts
   every later colour onto the wrong number. */
const char* mirc_to_html_colour[] = { 
  "#ffffff", /*  0 white */
  "#000000", /*  1 black */
  "#00007f", /*  2 blue  */
  "#007f00", /*  3 green */
  "#7f0000", /*  4 red   */
  "#7f7f00", /*  5 brown */
  "#7f007f", /*  6 purple*/
  "#9f5f00", /*  7 orange*/
  "#ffff00", /*  8 yellow */
  "#00ff00", /*  9 light green */
  "#009f9f", /* 10 teal? */
  "#00ffff", /* 11 cyan  */
  "#2020ff", /* 12 light blue  */
  "#ff2020", /* 13 pink  */
  "#7f7f7f", /* 14 grey  */
  "#9f9f9f", /* 15 silver */
};

void url_catcher(FILE * output, char * line)
{
  /* pre: line is null terminated
     post: print to output the text with illegal characters
     fixed, formatting control bytes turned into html tags,
     and <a> tags inserted

     Open formatting tags are tracked in the global 'state'.
  */
  const size_t ncolours =
    sizeof mirc_to_html_colour / sizeof mirc_to_html_colour[0];

 /* cph - eliminate tail recursion, saves stack thrashing */
 while (*line) {
  char * x = first_html_boundary(line);

  /* Text before the boundary: hand a null-terminated copy to
     url_inner so it can look for urls in it. */
  if (x > line)
    {
      size_t len = (size_t)(x-line);
      char * rec2 = malloc(len+1);
      if (rec2)
	{
	  memcpy(rec2, line, len);
	  rec2[len] = '\0';
	  url_inner(output, rec2);
	  free(rec2);
	}
      else
	{
	  /* out of memory: still emit the text, just without links */
	  fwrite(line, 1, len, output);
	}
    }

  switch (*x)
    {
    case '\0':
      return;
    case '<':
      fputs("&lt;", output);
      break;
    case '>':
      fputs("&gt;", output);
      break;
    case '\n':  
      /* close everything still open so tags never span lines */
      if (state & html_colour)
	fputs("</font>", output);
      if (state & html_bold)
	fputs("</b>", output);
      if (state & html_underline)
	fputs("</u>", output);
      fputs("<br>", output);
      state = 0;
      break;
    case '\"':
      fputs("&quot;", output);
      break;
    case 0x1f:
      if (state & html_underline)
	fputs("</u>", output);
      else
	fputs("<u>", output);
      state ^= html_underline;
      break;
    case 0x02:
      if (state & html_bold)
	fputs("</b>", output);
      else
	fputs("<b>", output);
      state ^= html_bold;
      break;
    case 0x03: 
      /* cph - mIRC's protocol-breaking colours 
       * See www.mirc.co.uk/help/color.txt 
       *
       * Format: 0x03 [N[N] [,M[M]]].  With no number the byte just
       * ends the current colour; with a number it selects a new
       * foreground (ending any previous one first).
       */
      {
	char * p = x + 1;
	int ndigits = 0;
	int fgcl = 0;

	if (state & html_colour)
	  {
	    fputs("</font>", output);
	    state &= ~html_colour;
	  }

	/* foreground: at most two digits */
	while (ndigits < 2 && isdigit((unsigned char)*p))
	  {
	    fgcl = fgcl * 10 + (*p++ - '0');
	    ndigits++;
	  }

	if (ndigits)
	  {
	    /* Ignore background for now, but do consume it so its
	     * digits are not printed as text.  A comma not followed
	     * by a digit is ordinary text and is left alone. */
	    if (*p == ',' && isdigit((unsigned char)p[1]))
	      {
		p += 2;
		if (isdigit((unsigned char)*p))
		  p++;
	      }
	    /* numbers outside the table get no tag rather than an
	     * out-of-bounds read */
	    if ((size_t)fgcl < ncolours)
	      {
		fprintf(output, "<font color=\"%s\">", 
			mirc_to_html_colour[fgcl]);
		state |= html_colour;
	      }
	  }
	/* leave x on the last byte consumed; line = x+1 below */
	x = p - 1;
      }
      break;
    default:
      /* '(' and ')' end up here: boundaries that need no escaping */
      fputc(*x, output);
      break;
    }
  
  /* resume just after the boundary character (and anything the
   * colour case consumed) */
  line = x+1;
 }
}

void output_content(FILE * output, char * line)
{
  /* a wrapper function for url_catcher:
     pre: line is null terminated
     post: resets the formatting state, escapes '&' in line and prints
     the html version of the result to output.  Prints nothing if the
     escaped copy could not be allocated.
  */
  char* buf;

  state = 0;

  buf = s_a_r(line);
  if (!buf)
    return; /* allocation failed; url_catcher must not see NULL */

  url_catcher(output, buf);
  free(buf);
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */