/*
mxTidy -- Interface to HTML Tidy (HTML/XML beautifier)
Copyright (c) 2001-2002, eGenix.com Software GmbH; mailto:info@egenix.com
See the documentation for further copyright information or contact
the author (mailto:mal@lemburg.com).
*/
/* Debug defines: */
/*#define MAL_MEM_DEBUG*/
/*#define MAL_DEBUG*/
/*#define MAL_REF_DEBUG*/
/* Logging file used by debugging facility */
#ifndef MAL_DEBUG_OUTPUTFILE
# define MAL_DEBUG_OUTPUTFILE "mxTidy.log"
#endif
/* We want all our symbols to be exported */
#define MX_BUILDING_MXTIDY
#include "mx.h"
#include "mxTidy.h"
/* Include symbols from Tidy */
#include "htmltidy.h"
/* Version number: Major.Minor.Patchlevel */
#define VERSION "0.3.0"
/* --- module doc-string -------------------------------------------------- */
/* Doc-string passed to Py_InitModule4() in initmxTidy(). It is assembled
   at compile time from adjacent string literals together with the
   MXTIDY_MODULE and VERSION macros. */
static char *Module_docstring =
MXTIDY_MODULE" -- Interface to Tidy (HTML beautifier).\n\n"
"Version "VERSION"\n\n"
"Copyright (c) 2001-2002, eGenix.com Software GmbH; mailto:info@egenix.com\n\n"
" All Rights Reserved\n\n"
"See the documentation for further information on copyrights,\n"
"or contact the author."
;
/* --- module globals ----------------------------------------------------- */
/* Module exception object. It is created and stored by initmxTidy()
   (via insexc()) and used by mxTidy_RunTidy() to report a failed Tidy
   tree integrity check. */
static PyObject *mxTidy_Error; /* Error Exception object */
/* Flag telling us whether the module was initialized or not. It is set
   to 1 at the end of a successful initmxTidy() and reset to 0 by
   mxTidyModule_Cleanup(). */
static int mxTidy_Initialized = 0;
/* --- forward declarations ----------------------------------------------- */
/* --- internal macros ---------------------------------------------------- */
/* --- module helpers ----------------------------------------------------- */
/* Create an exception object, insert it into the module dictionary
   under the given name and return the object pointer.

   moddict -- module dictionary; its "__name__" entry (if it is a
              string) provides the module path used as prefix of the
              exception's full name, otherwise MXTIDY_MODULE is used.
   name    -- plain name of the exception, e.g. "Error".
   base    -- base object to be used by the exception object; it
              should be NULL otherwise.

   The full name is "<prefix>.<name>", where <prefix> consists of at
   most the first two dotted components of the module path.

   Returns the new exception object; the caller owns the reference
   obtained from PyErr_NewException(), the module dictionary holds its
   own. Returns NULL with a Python exception set in case an error
   occurred (including a full name that does not fit into the internal
   buffer). */
static
PyObject *insexc(PyObject *moddict,
                 char *name,
                 PyObject *base)
{
    PyObject *v;
    char fullname[256];
    char *modname;
    char *dot;
    size_t prefixlen;

    v = PyDict_GetItemString(moddict, "__name__");
    if (v == NULL)
        modname = NULL;
    else
        modname = PyString_AsString(v);
    if (modname == NULL) {
        /* Missing or non-string __name__: fall back to the built-in
           module name. */
        PyErr_Clear();
        modname = MXTIDY_MODULE;
    }

    /* We trim the module path to its first two components to not
       confuse the user with an overly long package path. */
    dot = strchr(modname, '.');
    if (dot)
        dot = strchr(dot + 1, '.');
    if (dot)
        prefixlen = (size_t)(dot - modname);
    else
        prefixlen = strlen(modname);

    /* prefix + '.' + name + terminating NUL have to fit into the
       buffer; refuse to build the name instead of overflowing it. */
    if (prefixlen + 1 + strlen(name) + 1 > sizeof(fullname)) {
        PyErr_SetString(PyExc_SystemError,
                        "exception name too long");
        return NULL;
    }
    memcpy(fullname, modname, prefixlen);
    fullname[prefixlen] = '.';
    strcpy(fullname + prefixlen + 1, name);

    v = PyErr_NewException(fullname, base, NULL);
    if (v == NULL)
        return NULL;
    if (PyDict_SetItemString(moddict, name, v)) {
        /* The dictionary did not take a reference: drop ours so the
           exception object is not leaked. */
        Py_DECREF(v);
        return NULL;
    }
    return v;
}
#if 0
/* Helper for adding integer constants: stores value as a Python integer
   under name in dict. Check for errors with PyErr_Occurred().

   This helper is currently compiled out by the surrounding #if 0.

   NOTE(review): the result of PyInt_FromLong() is passed to
   PyDict_SetItemString() without a NULL check; this should be guarded
   before the helper is re-enabled. */
static
void insint(PyObject *dict,
char *name,
int value)
{
PyObject *v = PyInt_FromLong((long)value);
PyDict_SetItemString(dict, name, v);
/* The dictionary keeps its own reference; release ours. */
Py_XDECREF(v);
}
#endif
#ifdef Py_NEWSTYLENUMBER
/* Slot filler for one-argument operations that are not supported:
   reports a TypeError ("operation not implemented") through the
   Py_Error macro from the mx headers and returns NULL. The argument is
   not inspected. */
static
PyObject *notimplemented1(PyObject *v)
{
Py_Error(PyExc_TypeError,
"operation not implemented");
onError:
return NULL;
}
/* Slot filler for two-argument operations that are not supported:
   reports a TypeError ("operation not implemented") through the
   Py_Error macro from the mx headers and returns NULL. The arguments
   are not inspected. */
static
PyObject *notimplemented2(PyObject *v, PyObject *w)
{
Py_Error(PyExc_TypeError,
"operation not implemented");
onError:
return NULL;
}
/* Slot filler for three-argument operations that are not supported:
   reports a TypeError ("operation not implemented") through the
   Py_Error macro from the mx headers and returns NULL. The arguments
   are not inspected. */
static
PyObject *notimplemented3(PyObject *u, PyObject *v, PyObject *w)
{
Py_Error(PyExc_TypeError,
"operation not implemented");
onError:
return NULL;
}
#endif
/* --- Tidy configuration helpers -------------------------------------- */
/* Option helper macros.

   All three macros expect to be expanded inside a function providing
     - PyObject *options: the (non-NULL) options dictionary,
     - PyObject *value:   scratch variable receiving the borrowed
                          dictionary item,
     - an onError label:  jumped to when an option value is rejected.
   name has to be a string literal. An option missing from the
   dictionary leaves the target variable untouched.

   Each macro expands to a single statement (do { ... } while (0)) so
   that it can safely be used in unbraced if/else bodies; terminate the
   invocation with a semicolon. */

/* Set variable to truevalue or falsevalue depending on the truth value
   of options[name]. A failing truth test (PyObject_IsTrue() returning
   -1 with an exception set) jumps to onError instead of being taken
   for "true". */
#define SET_BOOL_OPTION(name, variable, truevalue, falsevalue) \
    do { \
        if ((value = PyDict_GetItemString(options, name)) != NULL) { \
            int mx_istrue = PyObject_IsTrue(value); \
            if (mx_istrue < 0) \
                goto onError; \
            variable = mx_istrue ? truevalue : falsevalue; \
        } \
    } while (0)

/* Set variable to the non-negative integer options[name]. The range
   check is done on the long returned by PyInt_AsLong() and on the
   round-tripped target value, so that conversion errors (-1), negative
   values and values not fitting the target are rejected regardless of
   the signedness of the target variable. */
#define SET_INT_OPTION(name, variable) \
    do { \
        if ((value = PyDict_GetItemString(options, name)) != NULL) { \
            long mx_intvalue = PyInt_AsLong(value); \
            variable = mx_intvalue; \
            Py_Assert(mx_intvalue >= 0 && \
                      (long)(variable) == mx_intvalue, \
                      PyExc_TypeError, \
                      "'"name"' option value must be an integer and >= 0"); \
        } \
    } while (0)

/* Set variable to a copy (made with Tidy's wstrdup()) of the string
   options[name]; non-string values are rejected with a TypeError. */
#define SET_STRING_OPTION(name, variable) \
    do { \
        if ((value = PyDict_GetItemString(options, name)) != NULL) { \
            Py_Assert(PyString_Check(value), \
                      PyExc_TypeError, \
                      "'"name"' option value must be a string"); \
            variable = wstrdup(PyString_AS_STRING(value)); \
        } \
    } while (0)
/* Apply the 'indent' option to config.

   value is compared case-insensitively (wstrcasecmp):
     "yes" / "true"  -> indent content, no smart indentation
     "no" / "false"  -> no indentation at all
     "auto"          -> indent content with smart indentation

   Returns 0 on success; any other value raises a ValueError and
   returns -1. */
static
int mxTidy_SetIndent(tidyconfig *config, char *value)
{
    if (wstrcasecmp(value, "yes") == 0 ||
        wstrcasecmp(value, "true") == 0) {
        config->IndentContent = yes;
        config->SmartIndent = no;
        return 0;
    }

    if (wstrcasecmp(value, "no") == 0 ||
        wstrcasecmp(value, "false") == 0) {
        config->IndentContent = no;
        config->SmartIndent = no;
        return 0;
    }

    if (wstrcasecmp(value, "auto") == 0) {
        config->IndentContent = yes;
        config->SmartIndent = yes;
        return 0;
    }

    /* None of the known spellings matched */
    Py_Error(PyExc_ValueError,
             "unknown 'indent' value");

 onError:
    return -1;
}
/* Apply the 'char_encoding' option to config.

   value is matched case-insensitively (wstrcasecmp) against the known
   encoding names, in the order listed in the table below, and the
   corresponding Tidy encoding constant is stored in
   config->CharEncoding.

   Returns 0 on success; an unknown name raises a ValueError and
   returns -1. */
static
int mxTidy_SetEncoding(tidyconfig *config, char *value)
{
    /* Encoding names and the Tidy constants they select */
    static const struct {
        char *name;
        int encoding;
    } encodings[] = {
        {"ascii",   ASCII},
        {"latin1",  LATIN1},
        {"raw",     RAW},
        {"utf8",    UTF8},
        {"iso2022", ISO2022},
        {"mac",     MACROMAN}
    };
    size_t i;

    for (i = 0; i < sizeof(encodings) / sizeof(encodings[0]); i++) {
        if (wstrcasecmp(value, encodings[i].name) == 0) {
            config->CharEncoding = encodings[i].encoding;
            return 0;
        }
    }

    /* No table entry matched */
    Py_Error(PyExc_ValueError,
             "unknown 'char-encoding' value");

 onError:
    return -1;
}
/* --- Tidy Interface ------------------------------------------------- */
/* Set up the Tidy library by calling InitTidy(); called once from
   initmxTidy(). Always returns 0; callers treat a non-zero result as
   failure. */
static
int mxTidy_Initialize(void)
{
/* Initialize Tidy */
InitTidy();
return 0;
}
/* Counterpart of mxTidy_Initialize(): releases the Tidy library state
   by calling DeInitTidy(). Invoked from mxTidyModule_Cleanup(). Always
   returns 0; callers treat a non-zero result as failure. */
static
int mxTidy_Cleanup(void)
{
/* Called to free hash tables etc. */
DeInitTidy();
return 0;
}
/* Run Tidy on inputstream and return the results as a Python tuple.

   inputstream  -- Python file object or string holding the markup to
                   process; must not be NULL.
   outputstream -- Python file object receiving the generated markup,
                   or NULL to collect it in a memory buffer. Only
                   looked at if the 'output_markup' setting
                   (config->Output) is enabled.
   errorstream  -- Python file object receiving Tidy's messages, or
                   NULL to collect them in a memory buffer. Only looked
                   at if the 'output_errors' setting (config->Errors)
                   is enabled.
   options      -- dictionary of Tidy options (hyphens in the option
                   names replaced by underscores) or NULL; an empty
                   dictionary is treated like NULL.

   Returns a new 4-tuple built with the format "iiz#z#":
   (totalerrors, totalwarnings, outputdata, errordata). The two data
   items are taken from the data buffers of the output and error
   streams; they are None if the respective stream was not created.

   On failure, all Tidy structures allocated so far are freed and NULL
   is returned. The Py_Assert/Py_Error macros (from the mx headers) are
   expected to set the Python exception and jump to onError.

   NOTE(review): totalerrors/totalwarnings are only updated in the
   non-XML branch below, so they remain 0 for 'input_xml' runs.
   NOTE(review): inputstreamname and doctype are assigned, but only
   read by code that is currently disabled with #if 0. */
static
PyObject *mxTidy_RunTidy(PyObject *inputstream,
PyObject *outputstream,
PyObject *errorstream,
PyObject *options)
{
char *inputstreamname;
/* All Tidy resources start out as NULL so that the onError cleanup
   can tell which ones have been allocated. */
tidyconfig *config = NULL;
Node *document = NULL, *doctype;
Lexer *lexer = NULL;
InputStream *input = NULL;
OutputStream *output = NULL;
OutputStream *errout = NULL;
PyObject *value, *result;
int totalwarnings;
int totalerrors;
DPRINTF("starting tidy()...\n");
/* Check Python arguments */
if (options != NULL) {
Py_Assert(PyDict_Check(options),
PyExc_TypeError,
"options must be given as dictionary");
/* An empty dictionary carries no settings: skip option parsing */
if (PyDict_Size(options) == 0)
options = NULL;
}
/* Init Tidy globals */
config = NewConfig();
totalwarnings = 0;
totalerrors = 0;
DPRINTF("parsing options...\n");
/* Parse Tidy options.
These are the union of the command line options from tidy.c and
the config file options from config.c.
Note: All hyphens have to be converted to underscores to make
the option names compatible to Python identifiers.
Helpers:
RE: ^[^"]+\(".+"\), *{.*&\(.+\)}, +ParseBool},
-> SET_BOOL_OPTION(\1, \2, yes);
*/
if (options != NULL) {
/* Boolean options. Note that 'add_xml_decl' and 'add_xml_pi' both
   set config->XmlPi. */
SET_BOOL_OPTION("add_xml_decl", config->XmlPi, yes, no);
SET_BOOL_OPTION("add_xml_pi", config->XmlPi, yes, no);
SET_BOOL_OPTION("add_xml_space", config->XmlSpace, yes, no);
SET_BOOL_OPTION("assume_xml_procins", config->XmlPIs, yes, no);
SET_BOOL_OPTION("break_before_br", config->BreakBeforeBR, yes, no);
SET_BOOL_OPTION("clean", config->MakeClean, yes, no);
SET_BOOL_OPTION("drop_empty_paras", config->DropEmptyParas, yes, no);
SET_BOOL_OPTION("drop_font_tags", config->DropFontTags, yes, no);
SET_BOOL_OPTION("enclose_block_text", config->EncloseBlockText, yes, no);
SET_BOOL_OPTION("fix_backslash", config->FixBackslash, yes, no);
SET_BOOL_OPTION("fix_bad_comments", config->FixComments, yes, no);
SET_BOOL_OPTION("gnu_emacs", config->Emacs, yes, no);
SET_BOOL_OPTION("hide_endtags", config->HideEndTags, yes, no);
SET_BOOL_OPTION("indent_attributes", config->IndentAttributes, yes, no);
SET_BOOL_OPTION("input_xml", config->XmlTags, yes, no);
SET_BOOL_OPTION("literal_attributes", config->LiteralAttribs, yes, no);
SET_BOOL_OPTION("logical_emphasis", config->LogicalEmphasis, yes, no);
SET_BOOL_OPTION("numeric_entities", config->NumEntities, yes, no);
SET_BOOL_OPTION("output_errors", config->Errors, yes, no);
SET_BOOL_OPTION("output_markup", config->Output, yes, no);
SET_BOOL_OPTION("output_xhtml", config->xHTML, yes, no);
SET_BOOL_OPTION("output_xml", config->XmlOut, yes, no);
SET_BOOL_OPTION("quiet", config->Quiet, yes, no);
SET_BOOL_OPTION("quote_ampersand", config->QuoteAmpersand, yes, no);
SET_BOOL_OPTION("quote_marks", config->QuoteMarks, yes, no);
SET_BOOL_OPTION("quote_nbsp", config->QuoteNbsp, yes, no);
SET_BOOL_OPTION("raw", config->RawOut, yes, no);
SET_BOOL_OPTION("show_warnings", config->ShowWarnings, yes, no);
SET_BOOL_OPTION("tidy_mark", config->TidyMark, yes, no);
SET_BOOL_OPTION("uppercase_attributes", config->UpperCaseAttrs, yes, no);
SET_BOOL_OPTION("uppercase_tags", config->UpperCaseTags, yes, no);
SET_BOOL_OPTION("word_2000", config->Word2000, yes, no);
SET_BOOL_OPTION("wrap_asp", config->WrapAsp, yes, no);
SET_BOOL_OPTION("wrap_attributes", config->WrapAttVals, yes, no);
SET_BOOL_OPTION("wrap_jste", config->WrapJste, yes, no);
SET_BOOL_OPTION("wrap_php", config->WrapPhp, yes, no);
SET_BOOL_OPTION("wrap_script_literals", config->WrapScriptlets, yes, no);
SET_BOOL_OPTION("wrap_sections", config->WrapSection, yes, no);
/* Integer options */
SET_INT_OPTION("indent_spaces", config->spaces);
SET_INT_OPTION("tab_size", config->tabsize);
SET_INT_OPTION("wrap", config->wraplen);
/* String options */
SET_STRING_OPTION("alt_text", config->alt_text);
/* Indentation */
if ((value = PyDict_GetItemString(options, "indent")) != NULL) {
Py_Assert(PyString_Check(value),
PyExc_TypeError,
"'indent' option value must be a string");
if (mxTidy_SetIndent(config, PyString_AS_STRING(value)))
goto onError;
}
/* Character encoding */
if ((value = PyDict_GetItemString(options, "char_encoding")) != NULL) {
Py_Assert(PyString_Check(value),
PyExc_TypeError,
"'char_encoding' option value must be a string");
if (mxTidy_SetEncoding(config, PyString_AS_STRING(value)))
goto onError;
}
/* Not yet supported... use a config file to set these */
#if 0
{"new_inline_tags", {(int *)&inline_tags}, ParseTagNames},
{"new_blocklevel_tags", {(int *)&block_tags}, ParseTagNames},
{"new_empty_tags", {(int *)&empty_tags}, ParseTagNames},
{"new_pre_tags", {(int *)&pre_tags}, ParseTagNames},
{"doctype", {(int *)&doctype_str}, ParseDocType},
#endif
}
/* XXX Music:
ETAGE 3 - hotel costes; mixed by Stephane Pompougnac
*/
/* ensure config is self-consistent */
AdjustConfig(config);
/* Setup streams */
Py_Assert(inputstream != NULL,
PyExc_TypeError,
"missing inputstream");
/* Input: either read from the C FILE of a Python file object or
   directly from the buffer of a Python string */
if (PyFile_Check(inputstream)) {
input = InputStreamFromFile(PyFile_AsFile(inputstream),
config->CharEncoding,
config->tabsize);
inputstreamname = PyString_AS_STRING(PyFile_Name(inputstream));
}
else if (PyString_Check(inputstream)) {
input = InputStreamFromBuffer(PyString_AS_STRING(inputstream),
PyString_GET_SIZE(inputstream),
0,
config->CharEncoding,
config->tabsize);
inputstreamname = "";
}
else
Py_Error(PyExc_TypeError,
"inputstream must be a file object or string");
/* Output: none at all if markup output is disabled, a memory buffer
   if no outputstream was given, the given file object otherwise */
if (config->Output == no)
output = NULL;
else if (outputstream == NULL)
output = OutputStreamFromBuffer(NULL,
0,
0,
config->CharEncoding);
else if (PyFile_Check(outputstream))
output = OutputStreamFromFile(PyFile_AsFile(outputstream),
config->CharEncoding);
else
Py_Error(PyExc_TypeError,
"outputstream must be a file object or None");
/* Errors: same selection scheme as for the output stream, controlled
   by config->Errors */
if (config->Errors == no)
errout = NULL;
else if (errorstream == NULL)
errout = OutputStreamFromBuffer(NULL,
0,
0,
config->CharEncoding);
else if (PyFile_Check(errorstream))
errout = OutputStreamFromFile(PyFile_AsFile(errorstream),
config->CharEncoding);
else Py_Error(PyExc_TypeError,
"errorstream must be a file object or None");
DPRINTF("running lexer...\n");
/* Initialize Lexer */
lexer = NewLexer(config, input, errout);
DPRINTF("running parser...\n");
/* Tidy doesn't alter the doctype for generic XML docs */
if (config->XmlTags)
document = ParseXMLDocument(lexer);
else {
lexer->warnings = 0;
document = ParseDocument(lexer);
Py_Assert(CheckNodeIntegrity(document),
mxTidy_Error,
"Tidy tree has lost its integrity");
/* Simplifies nested emphasis markup such as <b><b> ... </b> ... </b> */
NestedEmphasis(lexer, document);
/* Cleans up indented text etc. */
List2BQ(lexer, document);
BQ2Div(lexer, document);
/* Replaces <i> by <em> and <b> by <strong> */
if (config->LogicalEmphasis)
EmFromI(lexer, document);
/* Fix Word generated HTML files */
if (config->Word2000 && IsWord2000(lexer, document)) {
/* prune Word2000's <![if ...]> ... <![endif]> sections */
DropSections(lexer, document);
/* drop style & class attributes and empty p, span elements */
CleanWord2000(lexer, document);
}
/* Replaces presentational markup by style rules */
if (config->MakeClean || config->DropFontTags)
CleanTree(lexer, document);
/* Re-check the tree after the clean-up passes above */
Py_Assert(CheckNodeIntegrity(document),
mxTidy_Error,
"Tidy tree has lost its integrity");
doctype = FindDocType(document);
if (document->content) {
if (config->xHTML)
SetXHTMLDocType(lexer, document);
else
FixDocType(lexer, document);
if (config->TidyMark)
AddGenerator(lexer, document);
}
/* Ensure presence of the initial <?xml version="1.0"?> declaration */
if (config->XmlOut && config->XmlPi)
FixXMLPI(lexer, document);
totalwarnings += lexer->warnings;
totalerrors += lexer->errors;
/* XXX Report errors & warnings... */
#if 0
if (!config->Quiet && document->content) {
ReportVersion(errout, lexer, inputstreamname, doctype);
ReportNumWarnings(errout, lexer);
}
#endif
}
/* XXX Report errors... */
if (lexer->errors > 0)
NeedsAuthorIntervention(errout);
DPRINTF("generating output...\n");
/* Generate output; this is skipped if the lexer reported errors */
if (output != NULL &&
config->Output &&
lexer->errors == 0) {
if (config->XmlTags)
PPrintXMLTree(lexer, output, 0, 0, document);
else
PPrintTree(lexer, output, 0, 0, document);
}
/* XXX Report errors & warnings... */
#if 0
ErrorSummary(lexer);
if (totalerrors + totalwarnings > 0)
GeneralInfo(errout);
#endif
/* Build result tuple; this has to happen before the streams are
   freed, since the stream buffers are copied into the tuple here.
   "z#" turns a NULL data pointer into None. */
result = Py_BuildValue("iiz#z#",
totalerrors, totalwarnings,
output ? output->data : NULL,
output ? output->datapos : 0,
errout ? errout->data : NULL,
errout ? errout->datapos : 0);
/* Free data structures */
FreeNode(document);
document = NULL;
FreeLexer(lexer);
lexer = NULL;
FreeConfig(config);
config = NULL;
/* Free streams */
FreeInputStream(input);
input = NULL;
if (output) {
FreeOutputStream(output);
output = NULL;
}
if (errout) {
FreeOutputStream(errout);
errout = NULL;
}
/* Return status; result is NULL (with an exception set) if building
   the tuple failed */
return result;
onError:
/* Error exit: release whatever has been allocated so far */
if (document)
FreeNode(document);
if (lexer)
FreeLexer(lexer);
if (config)
FreeConfig(config);
if (input)
FreeInputStream(input);
if (output)
FreeOutputStream(output);
if (errout)
FreeOutputStream(errout);
return NULL;
}
/* --- Module Functions --------------------------------------------- */
/* Python level entry point tidy(): unpacks the call arguments, maps
   omitted or None-valued optional arguments to NULL and hands
   everything over to mxTidy_RunTidy(), whose result is returned
   unchanged. */
Py_C_Function(mxTidy_tidy,
              "tidy(input, [output, errors=None, options={}])\n\n"
              "Filter input through Tidy and write to output.\n"
              "Tidy options must be given in the options dictionary."
              )
{
    PyObject *input;
    PyObject *output = NULL;
    PyObject *errors = NULL;
    PyObject *options = NULL;

    Py_Get4Args("O|OOO", input, output, errors, options);

    /* mxTidy_RunTidy() expects NULL rather than None for "not given" */
    return mxTidy_RunTidy(input,
                          (output == Py_None) ? NULL : output,
                          (errors == Py_None) ? NULL : errors,
                          (options == Py_None) ? NULL : options);

 onError:
    return NULL;
}
/* XXX Add API to query Tidy version */
#if 0
/* Disabled code (compiled out by the surrounding #if 0).

   NOTE(review): this function refers to identifiers that are not
   defined in the visible part of this file (mxTidyObject,
   _mxTidy_Check, mxTidy_FromString, mxTidy_FromJoiningTidys, RAW_Tidy)
   and looks like a leftover from a URL handling module -- confirm, and
   consider removing it.

   As written: both arguments are converted to objects (strings via
   mxTidy_FromString(), objects passing _mxTidy_Check() are used as-is
   with an extra reference), joined, and the joined object is returned.
   The temporary references are released on both the success and the
   error path. */
Py_C_Function( mxTidy_urljoin,
"urljoin(u,v)\n\n"
"Takes two Tidys or strings, joins them and returns the\n"
"result as Tidy object")
{
mxTidyObject *a = 0,*b = 0;
PyObject *u,*v;
PyObject *url;
Py_Get2Args("OO",u,v);
/* First argument: object or string */
if (_mxTidy_Check(u)) {
a = (mxTidyObject *)u;
Py_INCREF(u);
}
else if (PyString_Check(u)) {
a = mxTidy_FromString(PyString_AS_STRING(u),
RAW_Tidy);
if (!a)
goto onError;
}
else
Py_Error(PyExc_TypeError,
"arguments must be Tidys or strings");
/* Second argument: object or string */
if (_mxTidy_Check(v)) {
b = (mxTidyObject *)v;
Py_INCREF(v);
}
else if (PyString_Check(v)) {
b = mxTidy_FromString(PyString_AS_STRING(v),
RAW_Tidy);
if (!b)
goto onError;
}
else
Py_Error(PyExc_TypeError,
"arguments must be Tidys or strings");
url = (PyObject *)mxTidy_FromJoiningTidys((mxTidyObject*)a,
(mxTidyObject*)b);
if (!url)
goto onError;
DPRINTF(" urljoin() returning '%s'\n",
PyString_AS_STRING(((mxTidyObject *)url)->url));
/* Drop the references taken (or created) above */
Py_DECREF(a);
Py_DECREF(b);
Py_PRINT_REFCOUNT(url);
return url;
onError:
/* a and/or b may still be unset here, hence the X variants */
Py_XDECREF(a);
Py_XDECREF(b);
return NULL;
}
#endif
/* --- module init --------------------------------------------------------- */
/* Python Method Table: the functions exported by the module. It is
   passed to Py_InitModule4() in initmxTidy() and currently exposes
   tidy() only; the "setmimedict" entry is compiled out. The list is
   terminated by the {NULL,NULL} sentinel. */
static
PyMethodDef Module_methods[] =
{
Py_MethodListEntry("tidy",mxTidy_tidy),
#if 0
Py_MethodListEntrySingleArg("setmimedict",mxTidy_setmimedict),
#endif
{NULL,NULL} /* end of list */
};
/* Module cleanup hook, registered with Py_AtExit() by initmxTidy().

   Shuts down Tidy via mxTidy_Cleanup(); should that report a failure,
   any pending Python error is discarded. The module is marked as
   uninitialized afterwards in either case. */
static
void mxTidyModule_Cleanup(void)
{
    const int failed = mxTidy_Cleanup();

    if (failed != 0) {
        PyErr_Clear();
    }

    /* Reset mxTidy_Initialized flag */
    mxTidy_Initialized = 0;
}
/* Module initialization function, called by Python when the extension
   is imported.

   Creates the module object (registering the functions listed in
   Module_methods), initializes Tidy, adds __version__ and the Error
   exception to the module's dictionary and registers the cleanup
   function to be run at interpreter exit.

   Nothing is returned; if a Python error is pending at the end, it is
   reported through Py_ReportModuleInitError(). Initializing the module
   a second time is reported as SystemError. */
MX_EXPORT(void)
initmxTidy(void)
{
    PyObject *module, *moddict;
    PyObject *version = NULL;

    if (mxTidy_Initialized)
        Py_Error(PyExc_SystemError,
                 "can't initialize "MXTIDY_MODULE" more than once");

    /* Create module */
    module = Py_InitModule4(MXTIDY_MODULE, /* Module name */
                            Module_methods, /* Method list */
                            Module_docstring, /* Module doc-string */
                            (PyObject *)NULL, /* always pass this as *self */
                            PYTHON_API_VERSION); /* API Version */
    if (module == NULL)
        goto onError;

    /* Init Tidy */
    if (mxTidy_Initialize())
        goto onError;

    /* Add some constants to the module's dict */
    moddict = PyModule_GetDict(module);

    /* PyDict_SetItemString() does not take over our reference to the
       version string, so keep it in a variable and release it below
       instead of leaking it. */
    version = PyString_FromString(VERSION);
    if (version == NULL)
        goto onError;
    if (PyDict_SetItemString(moddict, "__version__", version))
        goto onError;

    /* Errors */
    if (!(mxTidy_Error = insexc(moddict, "Error", PyExc_StandardError)))
        goto onError;

#if 0
    /* Type objects */
    Py_INCREF(&mxTidy_Type);
    PyDict_SetItemString(moddict, "TidyType",
                         (PyObject *)&mxTidy_Type);
#endif

    /* Register cleanup function */
    if (Py_AtExit(mxTidyModule_Cleanup)) {
        /* XXX what to do if we can't register that function ??? */
        DPRINTF("* Failed to register mxTidy cleanup function\n");
    }

    /* We are now initialized */
    mxTidy_Initialized = 1;

 onError:
    /* Reached on success as well: drop our reference to the version
       string (the module dictionary holds its own) */
    Py_XDECREF(version);

    /* Check for errors and report them */
    if (PyErr_Occurred())
        Py_ReportModuleInitError(MXTIDY_MODULE);
    return;
}