/*
tidy.c - HTML parser and pretty printer
EGENIX.COM COPYRIGHT NOTICE:
----------------------------
(c) 2001 eGenix.com Software GmbH, Langenfeld
This is a modified version of the original
HTML TIDY Software, Version 4aug00
The modifications applied to the source code which
were needed to make it thread safe and usable as C library are
made available under the eGenix.com Public License. Please see the
documentation for details or contact licenses@egenix.com.
The original HTML TIDY source code is covered by the following
license:
ORIGINAL HTML TIDY COPYRIGHT NOTICE:
------------------------------------
Copyright (c) 1998-2000 World Wide Web Consortium
(Massachusetts Institute of Technology, Institut National de
Recherche en Informatique et en Automatique, Keio University).
All Rights Reserved.
Contributing Author(s):
Dave Raggett
The contributing author(s) would like to thank all those who
helped with testing, bug fixes, and patience. This wouldn't
have been possible without all of you.
COPYRIGHT NOTICE:
This software and documentation is provided "as is," and
the copyright holders and contributing author(s) make no
representations or warranties, express or implied, including
but not limited to, warranties of merchantability or fitness
for any particular purpose or that the use of the software or
documentation will not infringe any third party patents,
copyrights, trademarks or other rights.
The copyright holders and contributing author(s) will not be
liable for any direct, indirect, special or consequential damages
arising out of any use of the software or documentation, even if
advised of the possibility of such damage.
Permission is hereby granted to use, copy, modify, and distribute
this source code, or portions hereof, documentation and executables,
for any purpose, without fee, subject to the following restrictions:
1. The origin of this source code must not be misrepresented.
2. Altered versions must be plainly marked as such and must
not be misrepresented as being the original source.
3. This Copyright notice may not be removed or altered from any
source or altered source distribution.
The copyright holders and contributing author(s) specifically
permit, without fee, and encourage the use of this source code
as a component for supporting the Hypertext Markup Language in
commercial products. If you use this source code in a product,
acknowledgment is not required but would be appreciated.
*/
/* Include HTML Tidy Header */
#include "htmltidy.h"
void InitTidy(void);
void DeInitTidy(void);
extern char *release_date;
/*
  Windows Western character set, bytes 128-159, mapped to Unicode.
  The table is indexed by (byte - 128). Slots holding 0 are the
  ones ReadChar() discards after looking them up.
*/
int Win2Unicode[32] =
{
    /* 128 - 131 */ 0x20AC, 0x0000, 0x201A, 0x0192,
    /* 132 - 135 */ 0x201E, 0x2026, 0x2020, 0x2021,
    /* 136 - 139 */ 0x02C6, 0x2030, 0x0160, 0x2039,
    /* 140 - 143 */ 0x0152, 0x0000, 0x017D, 0x0000,
    /* 144 - 147 */ 0x0000, 0x2018, 0x2019, 0x201C,
    /* 148 - 151 */ 0x201D, 0x2022, 0x2013, 0x2014,
    /* 152 - 155 */ 0x02DC, 0x2122, 0x0161, 0x203A,
    /* 156 - 159 */ 0x0153, 0x0000, 0x017E, 0x0178
};
/*
John Love-Jensen contributed this table for mapping MacRoman
character set to Unicode.
The table has 256 entries and is indexed directly by the MacRoman
byte value; ReadChar() performs this lookup when the input
encoding is MACROMAN. Entries 0x00-0x7F map each byte to itself,
entries 0x80-0xFF hold the Unicode code point for that byte.
*/
int Mac2Unicode[256] =
{
/* 0x00 - 0x7F: every byte maps to the same code point */
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F,
/* x7F = DEL */
/* 0x80 - 0xEF: MacRoman-specific characters start here */
0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
/* xF0 = Apple Logo */
/* 0xF0 - 0xFF */
0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
};
/*
  Print "Fatal error: <msg>" on stderr, call DeInitTidy() and
  terminate the process. This function does not return.
*/
void FatalError(char *msg)
{
    /* exit status 2 signifies a serious error */
    enum { SERIOUS_ERROR_STATUS = 2 };

    fprintf(stderr, "Fatal error: %s\n", msg);

    DeInitTidy();

    exit(SERIOUS_ERROR_STATUS);
}
/*
  Allocate size bytes with malloc().

  Returns a pointer to the new block. If the allocation fails,
  FatalError() is called, which terminates the process, so callers
  never see NULL.

  A request for 0 bytes is served with a 1-byte block: malloc(0) is
  allowed to return NULL, and that must not be reported as running
  out of memory.
*/
void *MemAlloc(unsigned int size)
{
    void *p;

    p = malloc(size != 0 ? size : 1);

    if (p == NULL)
        FatalError("Out of memory!");

    return p;
}
/*
  Resize the block mem to newsize bytes.

  mem may be NULL, in which case a fresh block is obtained through
  MemAlloc(). Returns the (possibly moved) block. If the
  reallocation fails, FatalError() is called, which terminates the
  process.

  A newsize of 0 is treated as 1: realloc(ptr, 0) may free the block
  and return NULL, which would otherwise be reported as running out
  of memory.
*/
void *MemRealloc(void *mem, unsigned int newsize)
{
    void *p;

    if (newsize == 0)
        newsize = 1;

    if (mem == NULL)
        return MemAlloc(newsize);

    p = realloc(mem, newsize);

    if (p == NULL)
        FatalError("Out of memory!");

    return p;
}
/*
  Release a block obtained from MemAlloc() or MemRealloc().
  Passing NULL is allowed and does nothing.
*/
void MemFree(void *mem)
{
    /* free() already ignores a null pointer */
    free(mem);
}
/*
  Set the first size bytes starting at mem to zero.
*/
void ClearMemory(void *mem, unsigned int size)
{
    (void)memset(mem, 0, (size_t)size);
}
/*
  Allocate an input stream with no file or buffer attached yet.
  The character encoding and tab size are taken from the arguments;
  the position starts at line 1, column 1.
*/
static
InputStream *NewInputStream(int CharEncoding,
                            int tabsize)
{
    InputStream *stream = (InputStream *)MemAlloc(sizeof *stream);

    /* settings chosen by the caller */
    stream->encoding = CharEncoding;
    stream->tabsize = tabsize;

    /* no source attached yet */
    stream->fp = NULL;
    stream->data = NULL;
    stream->datasize = 0;
    stream->datapos = 0;
    stream->lexer = NULL;

    /* reader state */
    stream->pushed = no;
    stream->c = '\0';
    stream->tabs = 0;
    stream->state = FSM_ASCII;

    /* position tracking */
    stream->curline = 1;
    stream->curcol = 1;
    stream->lastcol = 0;

    return stream;
}
/*
  Create an input stream that reads from the stdio stream fp.
  FreeInputStream() does not close fp.
*/
InputStream *InputStreamFromFile(FILE *fp,
                                 int CharEncoding,
                                 int tabsize)
{
    InputStream *stream = NewInputStream(CharEncoding, tabsize);

    stream->fp = fp;

    return stream;
}
/*
  Create an input stream that reads from a memory buffer.

  data      the buffer to read from
  datasize  number of bytes available in data
  datapos   index of the first byte to read

  FreeInputStream() does not free data.
*/
InputStream *InputStreamFromBuffer(char *data,
                                   int datasize,
                                   int datapos,
                                   int CharEncoding,
                                   int tabsize)
{
    InputStream *stream = NewInputStream(CharEncoding, tabsize);

    stream->datapos = datapos;
    stream->datasize = datasize;
    stream->data = data;

    return stream;
}
/*
  Release an input stream record.

  Only the InputStream structure itself is freed: in->fp is not
  closed and in->data is not freed here (the code that used to do
  so was compiled out).
*/
void FreeInputStream(InputStream *in)
{
    MemFree(in);
}
/*
  Tell whether the stream has run out of input.

  Returns the result of feof() for a file stream, 1 or 0 for a
  buffer stream depending on whether datapos has reached datasize,
  and -1 when the stream has neither a file nor a buffer.
*/
int StreamEOF(InputStream *in)
{
    if (in->fp == NULL && in->data == NULL)
        return -1;

    if (in->fp != NULL)
        return feof(in->fp);

    return (in->datapos >= in->datasize);
}
/*
  Fetch the next raw byte from the stream's file or buffer.

  Returns the byte as a value in the range 0..255, or -1 when the
  stream is exhausted or has no source attached. Buffer bytes are
  converted through unsigned char so that bytes >= 0x80 do not
  sign-extend into negative values.
*/
static int ReadByteFromStream(InputStream *in)
{
    int byte;

    if (StreamEOF(in))
        return -1;

    if (in->fp != NULL)
    {
        byte = getc(in->fp);
        return (byte == EOF ? -1 : byte);
    }

    if (in->data != NULL)
        return (unsigned char)in->data[in->datapos++];

    return -1;
}

/*
  Read one character from the stream.

  For ISO2022 input the escape-sequence state is tracked and the top
  bit is set on bytes of multibyte character sets. For UTF8 input a
  multi-byte sequence is decoded into a single code point. For every
  other encoding the raw byte is returned.

  Returns the character, or -1 at end of input (including when the
  input ends in the middle of a UTF-8 sequence).
*/
static int ReadCharFromStream(InputStream *in)
{
    unsigned int n, c, i, count;
    int byte;

    byte = ReadByteFromStream(in);

    if (byte < 0)
        return -1;

    c = (unsigned int)byte;

    /*
       A document in ISO-2022 based encoding uses some ESC sequences
       called "designator" to switch character sets. The designators
       defined and used in ISO-2022-JP are:

        "ESC" + "(" + ?     for ISO646 variants

        "ESC" + "$" + ?     and
        "ESC" + "$" + "(" + ?   for multibyte character sets

       Where ? stands for a single character used to indicate the
       character set for multibyte characters.

       Tidy handles this by preserving the escape sequence and
       setting the top bit of each byte for non-ascii chars. This
       bit is then cleared on output. The input stream keeps track
       of the state to determine when to set/clear the bit.
    */
    if (in->encoding == ISO2022)
    {
        if (c == 0x1b)  /* ESC */
        {
            in->state = FSM_ESC;
            return (int)c;
        }

        switch (in->state)
        {
        case FSM_ESC:
            if (c == '$')
                in->state = FSM_ESCD;
            else if (c == '(')
                in->state = FSM_ESCP;
            else
                in->state = FSM_ASCII;
            break;

        case FSM_ESCD:
            if (c == '(')
                in->state = FSM_ESCDP;
            else
                in->state = FSM_NONASCII;
            break;

        case FSM_ESCDP:
            in->state = FSM_NONASCII;
            break;

        case FSM_ESCP:
            in->state = FSM_ASCII;
            break;

        case FSM_NONASCII:
            c |= 0x80;
            break;

        default:
            /* plain ASCII state: byte passes through unchanged */
            break;
        }

        return (int)c;
    }

    if (in->encoding != UTF8)
        return (int)c;

    /* deal with UTF-8 encoded char */

    if ((c & 0xE0) == 0xC0)  /* 110X XXXX  two bytes */
    {
        n = c & 31;
        count = 1;
    }
    else if ((c & 0xF0) == 0xE0)  /* 1110 XXXX  three bytes */
    {
        n = c & 15;
        count = 2;
    }
    else if ((c & 0xF8) == 0xF0)  /* 1111 0XXX  four bytes */
    {
        n = c & 7;
        count = 3;
    }
    else if ((c & 0xFC) == 0xF8)  /* 1111 10XX  five bytes */
    {
        n = c & 3;
        count = 4;
    }
    else if ((c & 0xFE) == 0xFC)  /* 1111 110X  six bytes */
    {
        n = c & 1;
        count = 5;
    }
    else  /* 0XXX XXXX one byte */
        return (int)c;

    /* successor bytes should have the form 10XX XXXX */
    for (i = 1; i <= count; ++i)
    {
        byte = ReadByteFromStream(in);

        /* input ended inside the sequence */
        if (byte < 0)
            return -1;

        n = (n << 6) | ((unsigned int)byte & 0x3F);
    }

    return (int)n;
}
/*
  Return the next character of the input, maintaining the line and
  column counters of the stream.

  A character pushed back with UngetChar() is returned first. Tabs
  are expanded to spaces up to the next tab stop. Control characters
  in the range 1..31 other than newline, tab and Esc are dropped.
  For encodings other than RAW and ISO2022, MACROMAN bytes are mapped
  through Mac2Unicode and characters 128..159 are reported and mapped
  through Win2Unicode (unmapped ones are dropped).

  Returns EndOfStream when no more input is available.
*/
int ReadChar(InputStream *in)
{
    int ch;

    /* a pushed-back character takes priority over the stream */
    if (in->pushed)
    {
        in->pushed = no;
        ch = in->c;

        if (ch != '\n')
            in->curcol++;
        else
        {
            in->curcol = 1;
            in->curline++;
        }

        return ch;
    }

    in->lastcol = in->curcol;

    /* still handing out the spaces of a previously seen tab */
    if (in->tabs > 0)
    {
        in->tabs--;
        in->curcol++;
        return ' ';
    }

    while ((ch = ReadCharFromStream(in)) >= 0)
    {
        if (ch == '\n')
        {
            in->curline++;
            in->curcol = 1;
            return ch;
        }

        if (ch == '\t')
        {
            /* remember how many further spaces reach the next tab stop */
            in->tabs = in->tabsize - ((in->curcol - 1) % in->tabsize) - 1;
            in->curcol++;
            return ' ';
        }

        /* strip control characters, except for Esc */
        if (ch == '\033')
            return ch;

        if (0 < ch && ch < 32)
            continue;

        /* watch out for ISO2022 */
        if (in->encoding == RAW || in->encoding == ISO2022)
        {
            in->curcol++;
            return ch;
        }

        if (in->encoding == MACROMAN)
            ch = Mac2Unicode[ch];

        /* produced e.g. as a side-effect of smart quotes in Word;
           these are control characters in Latin-1 ! */
        if (127 < ch && ch < 160)
        {
            ReportEncodingError(in->lexer, WINDOWS_CHARS, ch);

            ch = Win2Unicode[ch - 128];

            if (ch == 0)
                continue;
        }

        in->curcol++;
        return ch;
    }

    return EndOfStream;
}
/*
  Push the character c back onto the stream so that the next
  ReadChar() returns it again. The column is restored from lastcol,
  and the line counter is stepped back when c is a newline.
*/
void UngetChar(int c, InputStream *in)
{
    in->c = c;
    in->pushed = yes;
    in->curcol = in->lastcol;

    if (c == '\n')
        in->curline--;
}
/*
  Duplicate a string into memory obtained from MemAlloc().
  Returns NULL when str is NULL; otherwise the caller owns the copy.
*/
char *wstrdup(char *str)
{
    char *copy;
    int n, i;

    if (str == NULL)
        return NULL;

    n = 0;
    while (str[n] != '\0')
        n++;

    copy = (char *)MemAlloc(sizeof(char)*(1+n));

    /* i == n copies the terminator as well */
    for (i = 0; i <= n; i++)
        copy[i] = str[i];

    return copy;
}
/*
  Duplicate at most len characters of str into memory obtained from
  MemAlloc(). The copy is always terminated. Returns NULL when str
  is NULL or len is negative.
*/
char *wstrndup(char *str, int len)
{
    char *copy;
    int i;

    if (str == NULL || len < 0)
        return NULL;

    copy = (char *)MemAlloc(sizeof(char)*(1+len));

    for (i = 0; i < len && str[i] != '\0'; i++)
        copy[i] = str[i];

    copy[i] = '\0';

    return copy;
}
/*
  Bounded string copy with strncpy semantics.

  For size >= 0, exactly size characters are written to s1: the
  characters of s2 up to (not beyond) its terminator, followed by
  '\0' padding when s2 is shorter than size. As with strncpy, s1 is
  not terminated when s2 has size or more characters.

  A negative size means "no limit": s2 is copied including its
  terminator.

  Nothing is done when either pointer is NULL.
*/
void wstrncpy(char *s1, char *s2, int size)
{
    if (s1 == NULL || s2 == NULL)
        return;

    if (size < 0)
    {
        while ((*s1++ = *s2++) != '\0')
            ;
        return;
    }

    /* never read past the end of the source string */
    while (size > 0 && *s2 != '\0')
    {
        *s1++ = *s2++;
        --size;
    }

    /* zero-fill whatever is left of the requested length */
    while (size > 0)
    {
        *s1++ = '\0';
        --size;
    }
}
/*
  Copy the string s2, including its terminator, to s1.
  s1 must have room for it.
*/
void wstrcpy(char *s1, char *s2)
{
    char ch;

    do
    {
        ch = *s2++;
        *s1++ = ch;
    }
    while (ch != '\0');
}
/*
  Append the string s2, including its terminator, to the end of
  the string in s1. s1 must have room for the result.
*/
void wstrcat(char *s1, char *s2)
{
    char *tail = s1;

    /* find the terminator of s1 */
    while (*tail != '\0')
        tail++;

    do
    {
        *tail = *s2++;
    }
    while (*tail++ != '\0');
}
/*
  Compare two strings character by character.
  Returns 0 when they are equal, otherwise 1 or -1 according to
  whether the first differing character of s1 is greater or smaller
  than that of s2 (compared as plain char).
*/
int wstrcmp(char *s1, char *s2)
{
    for (; *s1 == *s2; ++s1, ++s2)
    {
        if (*s1 == '\0')
            return 0;
    }

    if (*s1 > *s2)
        return 1;

    return -1;
}
/*
  Length of str in bytes (not characters), excluding the terminator.
*/
int wstrlen(char *str)
{
    char *end = str;

    while (*end != '\0')
        end++;

    return (int)(end - str);
}
/*
 MS C 4.2 doesn't include strcasecmp.
 Note that tolower and toupper won't
 work on chars > 127

 Case-insensitive string comparison using ToLower().
 Returns 0 when the strings are equal ignoring case, otherwise 1 or
 -1 according to the case-folded values of the first differing
 characters. The sign is taken from the folded values so that the
 ordering itself is case-insensitive (e.g. "a" sorts before "B").
*/
int wstrcasecmp(char *s1, char *s2)
{
    unsigned int c1, c2;

    for (;; ++s1, ++s2)
    {
        c1 = ToLower((unsigned int)(*s1));
        c2 = ToLower((unsigned int)(*s2));

        if (c1 != c2)
            break;

        if (*s1 == '\0')
            return 0;
    }

    return (c1 > c2 ? 1 : -1);
}
/*
  Compare at most n characters of two strings.

  Returns 0 when the first n characters are equal or both strings
  end, identically, before that; otherwise 1 or -1 according to the
  first differing character (compared as plain char).

  The limit is tested before each character is read, so nothing
  beyond the first n characters of either argument is accessed.
  A negative n is never reached by the countdown, so the strings are
  then compared up to their terminators.
*/
int wstrncmp(char *s1, char *s2, int n)
{
    for (; n != 0; --n, ++s1, ++s2)
    {
        if (*s1 != *s2)
            return (*s1 > *s2 ? 1 : -1);

        if (*s1 == '\0')
            return 0;
    }

    return 0;
}
/*
  Case-insensitive comparison of at most n characters.

  Returns 0 when the first n characters are equal ignoring case or
  both strings end, identically, before that; otherwise 1 or -1
  according to the lower-cased values of the first differing
  characters, so the ordering itself is case-insensitive.

  Characters are converted through unsigned char before being handed
  to tolower(), whose argument must be representable as unsigned char
  (or be EOF). The limit is tested before each character is read, so
  nothing beyond the first n characters of either argument is
  accessed. A negative n is never reached by the countdown, so the
  strings are then compared up to their terminators.
*/
int wstrncasecmp(char *s1, char *s2, int n)
{
    int c1, c2;

    for (; n != 0; --n, ++s1, ++s2)
    {
        c1 = tolower((unsigned char)*s1);
        c2 = tolower((unsigned char)*s2);

        if (c1 != c2)
            return (c1 > c2 ? 1 : -1);

        if (c1 == '\0')
            return 0;
    }

    return 0;
}
/*
  Tell whether s2 occurs somewhere in s1, ignoring case.
  Returns yes when it does, no otherwise.
*/
Bool wsubstr(char *s1, char *s2)
{
    int haystack = wstrlen(s1);
    int needle = wstrlen(s2);
    int last = haystack - needle;   /* last offset where s2 can still fit */
    int offset;

    for (offset = 0; offset <= last; ++offset)
    {
        if (wstrncasecmp(s1+offset, s2, needle) == 0)
            return yes;
    }

    return no;
}
/*
  Allocate an output stream with no file or buffer attached yet,
  using the given character encoding.
*/
static
OutputStream *NewOutputStream(int CharEncoding)
{
    OutputStream *stream = (OutputStream *)MemAlloc(sizeof *stream);

    stream->encoding = CharEncoding;
    stream->state = FSM_ASCII;

    /* no destination attached yet */
    stream->fp = NULL;
    stream->data = NULL;
    stream->datapos = 0;
    stream->datasize = 0;

    return stream;
}
/*
  Create an output stream that writes to the stdio stream fp.
  FreeOutputStream() does not close fp.
*/
OutputStream *OutputStreamFromFile(FILE *fp,
                                   int CharEncoding)
{
    OutputStream *stream = NewOutputStream(CharEncoding);

    stream->fp = fp;

    return stream;
}
/*
  Create an output stream that writes to a memory buffer.

  When data is not NULL the stream uses that buffer, with datasize
  as its capacity and datapos as the position of the next write.
  When data is NULL a new buffer of at least 1024 bytes (or datasize
  if that is larger) is allocated and writing starts at position 0.

  In both cases FreeOutputStream() frees the buffer with MemFree().
*/
OutputStream *OutputStreamFromBuffer(char *data,
                                     int datasize,
                                     int datapos,
                                     int CharEncoding)
{
    OutputStream *stream = NewOutputStream(CharEncoding);

    if (data == NULL)
    {
        stream->datasize = datasize > 1024 ? datasize : 1024;
        stream->data = (char *)MemAlloc(stream->datasize);
        stream->datapos = 0;
        return stream;
    }

    stream->data = data;
    stream->datasize = datasize;
    stream->datapos = datapos;

    return stream;
}
/*
  Release an output stream.

  The data buffer, if any, is freed together with the stream record.
  out->fp is not closed here (the code that used to do so was
  compiled out).
*/
void FreeOutputStream(OutputStream *out)
{
    if (out->data != NULL)
    {
        MemFree(out->data);
        out->data = NULL;
        out->datasize = 0;
        out->datapos = 0;
    }

    MemFree(out);
}
/*
  Append one byte to the output stream.

  For a file stream the byte is written with putc(). For a buffer
  stream the byte is stored at datapos; a 1024-byte buffer is
  allocated first if the stream has none. After the write the buffer
  is enlarged whenever it has become full, so that one free byte
  always remains behind the data.
*/
void WriteCharToStream(char c,
                       OutputStream *out)
{
    /* Write to file */
    if (out->fp != NULL)
    {
        putc(c, out->fp);
        return;
    }

    /* Write to buffer */
    if (out->data == NULL)
    {
        out->datasize = 1024;
        out->data = (char *)MemAlloc(out->datasize);
        out->datapos = 0;
    }

    out->data[out->datapos++] = c;

    if (out->datapos >= out->datasize)
    {
        out->datasize += out->datasize >> 1;

        /* growing by half adds nothing when the size is 0 or 1;
           make sure the buffer really extends past datapos */
        if (out->datasize <= out->datapos)
            out->datasize = out->datapos + 1;

        out->data = (char *)MemRealloc(out->data, out->datasize);
    }
}
/*
  Append size bytes starting at str to the output stream.

  For a file stream the bytes are written with fwrite(). For a
  buffer stream they are copied to the current write position
  (data + datapos) and datapos is advanced by size. The buffer is
  allocated on first use and enlarged as needed so that the new
  bytes fit and one free byte remains behind the data.

  A negative size is ignored for buffer streams.
*/
void WriteStringToStream(char *str,
                         int size,
                         OutputStream *out)
{
    /* Write to file */
    if (out->fp != NULL)
    {
        fwrite(str, size, 1, out->fp);
        return;
    }

    /* Write to buffer */
    if (size < 0)
        return;

    if (out->data == NULL)
    {
        out->datasize = size + 1024;
        out->data = (char *)MemAlloc(out->datasize);
        out->datapos = 0;
    }
    else if (out->datapos + size >= out->datasize)
    {
        out->datasize += out->datasize >> 1;

        /* one growth step may not be enough for a long string */
        if (out->datasize <= out->datapos + size)
            out->datasize = out->datapos + size + 1;

        out->data = (char *)MemRealloc(out->data, out->datasize);
    }

    /* append behind the data already in the buffer */
    if (size > 0)
    {
        memcpy(out->data + out->datapos, str, size);
        out->datapos += size;
    }
}
/* For mac users, should we map Unicode back to MacRoman? */
/*
  Write the character c to the output stream in the stream's
  encoding.

  UTF8:    c is written as a UTF-8 sequence of one to six bytes.
           Values above 0x7FFFFFFF cannot be represented; only their
           low 31 bits are written.
  ISO2022: the escape-sequence state is tracked and the top bit is
           cleared on bytes written while in a multibyte character
           set.
  other:   c is written as a single byte.
*/
void outc(unsigned int c, OutputStream *out)
{
    unsigned int ch;

    if (out->encoding == UTF8)
    {
        if (c < 128)
            WriteCharToStream((char)c, out);
        else if (c <= 0x7FF)
        {
            ch = (0xC0 | (c >> 6)); WriteCharToStream((char)ch, out);
            ch = (0x80 | (c & 0x3F)); WriteCharToStream((char)ch, out);
        }
        else if (c <= 0xFFFF)
        {
            ch = (0xE0 | (c >> 12)); WriteCharToStream((char)ch, out);
            ch = (0x80 | ((c >> 6) & 0x3F)); WriteCharToStream((char)ch, out);
            ch = (0x80 | (c & 0x3F)); WriteCharToStream((char)ch, out);
        }
        else if (c <= 0x1FFFFF)
        {
            ch = (0xF0 | (c >> 18)); WriteCharToStream((char)ch, out);
            ch = (0x80 | ((c >> 12) & 0x3F)); WriteCharToStream((char)ch, out);
            ch = (0x80 | ((c >> 6) & 0x3F)); WriteCharToStream((char)ch, out);
            ch = (0x80 | (c & 0x3F)); WriteCharToStream((char)ch, out);
        }
        else if (c <= 0x3FFFFFF)
        {
            ch = (0xF8 | (c >> 24)); WriteCharToStream((char)ch, out);
            ch = (0x80 | ((c >> 18) & 0x3F)); WriteCharToStream((char)ch, out);
            ch = (0x80 | ((c >> 12) & 0x3F)); WriteCharToStream((char)ch, out);
            ch = (0x80 | ((c >> 6) & 0x3F)); WriteCharToStream((char)ch, out);
            ch = (0x80 | (c & 0x3F)); WriteCharToStream((char)ch, out);
        }
        else
        {
            /* six bytes: the lead byte 1111 110X carries a single bit */
            ch = (0xFC | ((c >> 30) & 0x01)); WriteCharToStream((char)ch, out);
            ch = (0x80 | ((c >> 24) & 0x3F)); WriteCharToStream((char)ch, out);
            ch = (0x80 | ((c >> 18) & 0x3F)); WriteCharToStream((char)ch, out);
            ch = (0x80 | ((c >> 12) & 0x3F)); WriteCharToStream((char)ch, out);
            ch = (0x80 | ((c >> 6) & 0x3F)); WriteCharToStream((char)ch, out);
            ch = (0x80 | (c & 0x3F)); WriteCharToStream((char)ch, out);
        }
    }
    else if (out->encoding == ISO2022)
    {
        if (c == 0x1b)  /* ESC */
            out->state = FSM_ESC;
        else
        {
            switch (out->state)
            {
            case FSM_ESC:
                if (c == '$')
                    out->state = FSM_ESCD;
                else if (c == '(')
                    out->state = FSM_ESCP;
                else
                    out->state = FSM_ASCII;
                break;

            case FSM_ESCD:
                if (c == '(')
                    out->state = FSM_ESCDP;
                else
                    out->state = FSM_NONASCII;
                break;

            case FSM_ESCDP:
                out->state = FSM_NONASCII;
                break;

            case FSM_ESCP:
                out->state = FSM_ASCII;
                break;

            case FSM_NONASCII:
                /* undo the top bit that was set on input */
                c &= 0x7F;
                break;

            default:
                /* plain ASCII state: byte passes through unchanged */
                break;
            }
        }

        WriteCharToStream((char)c, out);
    }
    else
        WriteCharToStream((char)c, out);
}
/*
first time initialization which should
precede reading the command line.
Calls InitMap(), InitAttrs(), InitTags() and InitEntities(), in
that order. Takes no arguments and returns nothing.
NOTE(review): the four helpers are defined elsewhere; presumably
they build the lookup tables that DeInitTidy() releases, and the
call order may matter - confirm before reordering.
*/
void InitTidy(void)
{
InitMap();
InitAttrs();
InitTags();
InitEntities();
}
/*
call this when you have finished with tidy
to free the hash tables and other resources.
Calls FreeTags(), FreeAttrTable() and FreeEntities(), in that
order. It is also called by FatalError() before the process exits.
No call matching InitMap() is made here.
NOTE(review): whether InitMap() allocates anything that needs
releasing is not visible from this file - confirm.
*/
void DeInitTidy(void)
{
FreeTags();
FreeAttrTable();
FreeEntities();
}