/* mxTidy -- Interface to HTML Tidy (HTML/XML beautifier) Copyright (c) 2001-2002, eGenix.com Software GmbH; mailto:info@egenix.com See the documentation for further copyright information or contact the author (mailto:mal@lemburg.com). */ /* Debug defines: */ /*#define MAL_MEM_DEBUG*/ /*#define MAL_DEBUG*/ /*#define MAL_REF_DEBUG*/ /* Logging file used by debugging facility */ #ifndef MAL_DEBUG_OUTPUTFILE # define MAL_DEBUG_OUTPUTFILE "mxTidy.log" #endif /* We want all our symbols to be exported */ #define MX_BUILDING_MXTIDY #include "mx.h" #include "mxTidy.h" /* Include symbols from Tidy */ #include "htmltidy.h" /* Version number: Major.Minor.Patchlevel */ #define VERSION "0.3.0" /* --- module doc-string -------------------------------------------------- */ static char *Module_docstring = MXTIDY_MODULE" -- Interface to Tidy (HTML beautifier).\n\n" "Version "VERSION"\n\n" "Copyright (c) 2001-2002, eGenix.com Software GmbH; mailto:info@egenix.com\n\n" " All Rights Reserved\n\n" "See the documentation for further information on copyrights,\n" "or contact the author." ; /* --- module globals ----------------------------------------------------- */ static PyObject *mxTidy_Error; /* Error Exception object */ /* Flag telling us whether the module was initialized or not. */ static int mxTidy_Initialized = 0; /* --- forward declarations ----------------------------------------------- */ /* --- internal macros ---------------------------------------------------- */ /* --- module helpers ----------------------------------------------------- */ /* Create an exception object, insert it into the module dictionary under the given name and return the object pointer; this is NULL in case an error occurred. base can be given to indicate the base object to be used by the exception object. It should be NULL otherwise */ static PyObject *insexc(PyObject *moddict, char *name, PyObject *base) { PyObject *v; char fullname[256]; char *modname; char *dot; v = PyDict_GetItemString(moddict, "__name__"); if (v == NULL) modname = NULL; else modname = PyString_AsString(v); if (modname == NULL) { PyErr_Clear(); modname = MXTIDY_MODULE; } /* The symbols from this extension are imported into mx.. We trim the name to not confuse the user with an overly long package path. */ strcpy(fullname, modname); dot = strchr(fullname, '.'); if (dot) dot = strchr(dot+1, '.'); if (dot) strcpy(dot+1, name); else sprintf(fullname, "%s.%s", modname, name); v = PyErr_NewException(fullname, base, NULL); if (v == NULL) return NULL; if (PyDict_SetItemString(moddict,name,v)) return NULL; return v; } #if 0 /* Helper for adding integer constants. Check for errors with PyErr_Occurred() */ static void insint(PyObject *dict, char *name, int value) { PyObject *v = PyInt_FromLong((long)value); PyDict_SetItemString(dict, name, v); Py_XDECREF(v); } #endif #ifdef Py_NEWSTYLENUMBER static PyObject *notimplemented1(PyObject *v) { Py_Error(PyExc_TypeError, "operation not implemented"); onError: return NULL; } static PyObject *notimplemented2(PyObject *v, PyObject *w) { Py_Error(PyExc_TypeError, "operation not implemented"); onError: return NULL; } static PyObject *notimplemented3(PyObject *u, PyObject *v, PyObject *w) { Py_Error(PyExc_TypeError, "operation not implemented"); onError: return NULL; } #endif /* --- Tidy configuration helpers -------------------------------------- */ #define SET_BOOL_OPTION(name, variable, truevalue, falsevalue) \ if ((value = PyDict_GetItemString(options, name)) != NULL) \ variable = PyObject_IsTrue(value) ? truevalue : falsevalue; #define SET_INT_OPTION(name, variable) \ if ((value = PyDict_GetItemString(options, name)) != NULL) { \ variable = PyInt_AsLong(value); \ Py_Assert(variable >= 0, \ PyExc_TypeError, \ "'"name"' option value must be an integer and >= 0"); \ } #define SET_STRING_OPTION(name, variable) \ if ((value = PyDict_GetItemString(options, name)) != NULL) { \ Py_Assert(PyString_Check(value), \ PyExc_TypeError, \ "'"name"' option value must be a string"); \ variable = wstrdup(PyString_AS_STRING(value)); \ } static int mxTidy_SetIndent(tidyconfig *config, char *value) { if (wstrcasecmp(value, "yes") == 0) { config->IndentContent = yes; config->SmartIndent = no; } else if (wstrcasecmp(value, "true") == 0) { config->IndentContent = yes; config->SmartIndent = no; } else if (wstrcasecmp(value, "no") == 0) { config->IndentContent = no; config->SmartIndent = no; } else if (wstrcasecmp(value, "false") == 0) { config->IndentContent = no; config->SmartIndent = no; } else if (wstrcasecmp(value, "auto") == 0) { config->IndentContent = yes; config->SmartIndent = yes; } else Py_Error(PyExc_ValueError, "unknown 'indent' value"); return 0; onError: return -1; } static int mxTidy_SetEncoding(tidyconfig *config, char *value) { if (wstrcasecmp(value, "ascii") == 0) config->CharEncoding = ASCII; else if (wstrcasecmp(value, "latin1") == 0) config->CharEncoding = LATIN1; else if (wstrcasecmp(value, "raw") == 0) config->CharEncoding = RAW; else if (wstrcasecmp(value, "utf8") == 0) config->CharEncoding = UTF8; else if (wstrcasecmp(value, "iso2022") == 0) config->CharEncoding = ISO2022; else if (wstrcasecmp(value, "mac") == 0) config->CharEncoding = MACROMAN; else Py_Error(PyExc_ValueError, "unknown 'char-encoding' value"); return 0; onError: return -1; } /* --- Tidy Interface ------------------------------------------------- */ static int mxTidy_Initialize(void) { /* Initialize Tidy */ InitTidy(); return 0; } static int mxTidy_Cleanup(void) { /* Called to free hash tables etc. */ DeInitTidy(); return 0; } static PyObject *mxTidy_RunTidy(PyObject *inputstream, PyObject *outputstream, PyObject *errorstream, PyObject *options) { char *inputstreamname; tidyconfig *config = NULL; Node *document = NULL, *doctype; Lexer *lexer = NULL; InputStream *input = NULL; OutputStream *output = NULL; OutputStream *errout = NULL; PyObject *value, *result; int totalwarnings; int totalerrors; DPRINTF("starting tidy()...\n"); /* Check Python arguments */ if (options != NULL) { Py_Assert(PyDict_Check(options), PyExc_TypeError, "options must be given as dictionary"); if (PyDict_Size(options) == 0) options = NULL; } /* Init Tidy globals */ config = NewConfig(); totalwarnings = 0; totalerrors = 0; DPRINTF("parsing options...\n"); /* Parse Tidy options. These are the union of the command line options from tidy.c and the config file options from config.c. Note: All hyphens have to be converted to underscores to make the option names compatible to Python identifiers. Helpers: RE: ^[^"]+\(".+"\), *{.*&\(.+\)}, +ParseBool}, -> SET_BOOL_OPTION(\1, \2, yes); */ if (options != NULL) { SET_BOOL_OPTION("add_xml_decl", config->XmlPi, yes, no); SET_BOOL_OPTION("add_xml_pi", config->XmlPi, yes, no); SET_BOOL_OPTION("add_xml_space", config->XmlSpace, yes, no); SET_BOOL_OPTION("assume_xml_procins", config->XmlPIs, yes, no); SET_BOOL_OPTION("break_before_br", config->BreakBeforeBR, yes, no); SET_BOOL_OPTION("clean", config->MakeClean, yes, no); SET_BOOL_OPTION("drop_empty_paras", config->DropEmptyParas, yes, no); SET_BOOL_OPTION("drop_font_tags", config->DropFontTags, yes, no); SET_BOOL_OPTION("enclose_block_text", config->EncloseBlockText, yes, no); SET_BOOL_OPTION("fix_backslash", config->FixBackslash, yes, no); SET_BOOL_OPTION("fix_bad_comments", config->FixComments, yes, no); SET_BOOL_OPTION("gnu_emacs", config->Emacs, yes, no); SET_BOOL_OPTION("hide_endtags", config->HideEndTags, yes, no); SET_BOOL_OPTION("indent_attributes", config->IndentAttributes, yes, no); SET_BOOL_OPTION("input_xml", config->XmlTags, yes, no); SET_BOOL_OPTION("literal_attributes", config->LiteralAttribs, yes, no); SET_BOOL_OPTION("logical_emphasis", config->LogicalEmphasis, yes, no); SET_BOOL_OPTION("numeric_entities", config->NumEntities, yes, no); SET_BOOL_OPTION("output_errors", config->Errors, yes, no); SET_BOOL_OPTION("output_markup", config->Output, yes, no); SET_BOOL_OPTION("output_xhtml", config->xHTML, yes, no); SET_BOOL_OPTION("output_xml", config->XmlOut, yes, no); SET_BOOL_OPTION("quiet", config->Quiet, yes, no); SET_BOOL_OPTION("quote_ampersand", config->QuoteAmpersand, yes, no); SET_BOOL_OPTION("quote_marks", config->QuoteMarks, yes, no); SET_BOOL_OPTION("quote_nbsp", config->QuoteNbsp, yes, no); SET_BOOL_OPTION("raw", config->RawOut, yes, no); SET_BOOL_OPTION("show_warnings", config->ShowWarnings, yes, no); SET_BOOL_OPTION("tidy_mark", config->TidyMark, yes, no); SET_BOOL_OPTION("uppercase_attributes", config->UpperCaseAttrs, yes, no); SET_BOOL_OPTION("uppercase_tags", config->UpperCaseTags, yes, no); SET_BOOL_OPTION("word_2000", config->Word2000, yes, no); SET_BOOL_OPTION("wrap_asp", config->WrapAsp, yes, no); SET_BOOL_OPTION("wrap_attributes", config->WrapAttVals, yes, no); SET_BOOL_OPTION("wrap_jste", config->WrapJste, yes, no); SET_BOOL_OPTION("wrap_php", config->WrapPhp, yes, no); SET_BOOL_OPTION("wrap_script_literals", config->WrapScriptlets, yes, no); SET_BOOL_OPTION("wrap_sections", config->WrapSection, yes, no); SET_INT_OPTION("indent_spaces", config->spaces); SET_INT_OPTION("tab_size", config->tabsize); SET_INT_OPTION("wrap", config->wraplen); SET_STRING_OPTION("alt_text", config->alt_text); /* Indentation */ if ((value = PyDict_GetItemString(options, "indent")) != NULL) { Py_Assert(PyString_Check(value), PyExc_TypeError, "'indent' option value must be a string"); if (mxTidy_SetIndent(config, PyString_AS_STRING(value))) goto onError; } /* Character encoding */ if ((value = PyDict_GetItemString(options, "char_encoding")) != NULL) { Py_Assert(PyString_Check(value), PyExc_TypeError, "'char_encoding' option value must be a string"); if (mxTidy_SetEncoding(config, PyString_AS_STRING(value))) goto onError; } /* Not yet supported... use a config file to set these */ #if 0 {"new_inline_tags", {(int *)&inline_tags}, ParseTagNames}, {"new_blocklevel_tags", {(int *)&block_tags}, ParseTagNames}, {"new_empty_tags", {(int *)&empty_tags}, ParseTagNames}, {"new_pre_tags", {(int *)&pre_tags}, ParseTagNames}, {"doctype", {(int *)&doctype_str}, ParseDocType}, #endif } /* XXX Music: ETAGE 3 - hotel costes; mixed by Stephane Pompougnac */ /* ensure config is self-consistent */ AdjustConfig(config); /* Setup streams */ Py_Assert(inputstream != NULL, PyExc_TypeError, "missing inputstream"); if (PyFile_Check(inputstream)) { input = InputStreamFromFile(PyFile_AsFile(inputstream), config->CharEncoding, config->tabsize); inputstreamname = PyString_AS_STRING(PyFile_Name(inputstream)); } else if (PyString_Check(inputstream)) { input = InputStreamFromBuffer(PyString_AS_STRING(inputstream), PyString_GET_SIZE(inputstream), 0, config->CharEncoding, config->tabsize); inputstreamname = ""; } else Py_Error(PyExc_TypeError, "inputstream must be a file object or string"); if (config->Output == no) output = NULL; else if (outputstream == NULL) output = OutputStreamFromBuffer(NULL, 0, 0, config->CharEncoding); else if (PyFile_Check(outputstream)) output = OutputStreamFromFile(PyFile_AsFile(outputstream), config->CharEncoding); else Py_Error(PyExc_TypeError, "outputstream must be a file object or None"); if (config->Errors == no) errout = NULL; else if (errorstream == NULL) errout = OutputStreamFromBuffer(NULL, 0, 0, config->CharEncoding); else if (PyFile_Check(errorstream)) errout = OutputStreamFromFile(PyFile_AsFile(errorstream), config->CharEncoding); else Py_Error(PyExc_TypeError, "errorstream must be a file object or None"); DPRINTF("running lexer...\n"); /* Initialize Lexer */ lexer = NewLexer(config, input, errout); DPRINTF("running parser...\n"); /* Tidy doesn't alter the doctype for generic XML docs */ if (config->XmlTags) document = ParseXMLDocument(lexer); else { lexer->warnings = 0; document = ParseDocument(lexer); Py_Assert(CheckNodeIntegrity(document), mxTidy_Error, "Tidy tree has lost its integrity"); /* Simplifies ... ... etc. */ NestedEmphasis(lexer, document); /* Cleans up indented text etc. */ List2BQ(lexer, document); BQ2Div(lexer, document); /* Replaces by and by */ if (config->LogicalEmphasis) EmFromI(lexer, document); /* Fix Word generated HTML files */ if (config->Word2000 && IsWord2000(lexer, document)) { /* prune Word2000's ... */ DropSections(lexer, document); /* drop style & class attributes and empty p, span elements */ CleanWord2000(lexer, document); } /* Replaces presentational markup by style rules */ if (config->MakeClean || config->DropFontTags) CleanTree(lexer, document); Py_Assert(CheckNodeIntegrity(document), mxTidy_Error, "Tidy tree has lost its integrity"); doctype = FindDocType(document); if (document->content) { if (config->xHTML) SetXHTMLDocType(lexer, document); else FixDocType(lexer, document); if (config->TidyMark) AddGenerator(lexer, document); } /* Ensure presence of initial */ if (config->XmlOut && config->XmlPi) FixXMLPI(lexer, document); totalwarnings += lexer->warnings; totalerrors += lexer->errors; /* XXX Report errors & warnings... */ #if 0 if (!config->Quiet && document->content) { ReportVersion(errout, lexer, inputstreamname, doctype); ReportNumWarnings(errout, lexer); } #endif } /* XXX Report errors... */ if (lexer->errors > 0) NeedsAuthorIntervention(errout); DPRINTF("generating output...\n"); /* Generate output */ if (output != NULL && config->Output && lexer->errors == 0) { if (config->XmlTags) PPrintXMLTree(lexer, output, 0, 0, document); else PPrintTree(lexer, output, 0, 0, document); } /* XXX Report errors & warnings... */ #if 0 ErrorSummary(lexer); if (totalerrors + totalwarnings > 0) GeneralInfo(errout); #endif /* Build result tuple */ result = Py_BuildValue("iiz#z#", totalerrors, totalwarnings, output ? output->data : NULL, output ? output->datapos : 0, errout ? errout->data : NULL, errout ? errout->datapos : 0); /* Free data structures */ FreeNode(document); document = NULL; FreeLexer(lexer); lexer = NULL; FreeConfig(config); config = NULL; /* Free streams */ FreeInputStream(input); input = NULL; if (output) { FreeOutputStream(output); output = NULL; } if (errout) { FreeOutputStream(errout); errout = NULL; } /* Return status */ return result; onError: if (document) FreeNode(document); if (lexer) FreeLexer(lexer); if (config) FreeConfig(config); if (input) FreeInputStream(input); if (output) FreeOutputStream(output); if (errout) FreeOutputStream(errout); return NULL; } /* --- Module Functions --------------------------------------------- */ Py_C_Function(mxTidy_tidy, "tidy(input, [output, errors=None, options={}])\n\n" "Filter input through Tidy and write to output.\n" "Tidy options must be given in the options dictionary." ) { PyObject *input, *output=NULL, *errors=Py_None, *options=NULL; Py_Get4Args("O|OOO", input, output, errors, options); if (output == Py_None) output = NULL; if (errors == Py_None) errors = NULL; if (options == Py_None) options = NULL; return mxTidy_RunTidy(input, output, errors, options); onError: return NULL; } /* XXX Add API to query Tidy version */ #if 0 Py_C_Function( mxTidy_urljoin, "urljoin(u,v)\n\n" "Takes two Tidys or strings, joins them and returns the\n" "result as Tidy object") { mxTidyObject *a = 0,*b = 0; PyObject *u,*v; PyObject *url; Py_Get2Args("OO",u,v); if (_mxTidy_Check(u)) { a = (mxTidyObject *)u; Py_INCREF(u); } else if (PyString_Check(u)) { a = mxTidy_FromString(PyString_AS_STRING(u), RAW_Tidy); if (!a) goto onError; } else Py_Error(PyExc_TypeError, "arguments must be Tidys or strings"); if (_mxTidy_Check(v)) { b = (mxTidyObject *)v; Py_INCREF(v); } else if (PyString_Check(v)) { b = mxTidy_FromString(PyString_AS_STRING(v), RAW_Tidy); if (!b) goto onError; } else Py_Error(PyExc_TypeError, "arguments must be Tidys or strings"); url = (PyObject *)mxTidy_FromJoiningTidys((mxTidyObject*)a, (mxTidyObject*)b); if (!url) goto onError; DPRINTF(" urljoin() returning '%s'\n", PyString_AS_STRING(((mxTidyObject *)url)->url)); Py_DECREF(a); Py_DECREF(b); Py_PRINT_REFCOUNT(url); return url; onError: Py_XDECREF(a); Py_XDECREF(b); return NULL; } #endif /* --- module init --------------------------------------------------------- */ /* Python Method Table */ static PyMethodDef Module_methods[] = { Py_MethodListEntry("tidy",mxTidy_tidy), #if 0 Py_MethodListEntrySingleArg("setmimedict",mxTidy_setmimedict), #endif {NULL,NULL} /* end of list */ }; /* Cleanup function */ static void mxTidyModule_Cleanup(void) { if (mxTidy_Cleanup()) PyErr_Clear(); /* Reset mxTidy_Initialized flag */ mxTidy_Initialized = 0; } /* create PyMethodObjects and register them in the module's dict */ MX_EXPORT(void) initmxTidy(void) { PyObject *module, *moddict; if (mxTidy_Initialized) Py_Error(PyExc_SystemError, "can't initialize "MXTIDY_MODULE" more than once"); /* Create module */ module = Py_InitModule4(MXTIDY_MODULE, /* Module name */ Module_methods, /* Method list */ Module_docstring, /* Module doc-string */ (PyObject *)NULL, /* always pass this as *self */ PYTHON_API_VERSION); /* API Version */ if (module == NULL) goto onError; /* Init Tidy */ if (mxTidy_Initialize()) goto onError; /* Add some constants to the module's dict */ moddict = PyModule_GetDict(module); PyDict_SetItemString(moddict, "__version__", PyString_FromString(VERSION)); /* Errors */ if (!(mxTidy_Error = insexc(moddict, "Error", PyExc_StandardError))) goto onError; #if 0 /* Type objects */ Py_INCREF(&mxTidy_Type); PyDict_SetItemString(moddict, "TidyType", (PyObject *)&mxTidy_Type); #endif /* Register cleanup function */ if (Py_AtExit(mxTidyModule_Cleanup)) { /* XXX what to do if we can't register that function ??? */ DPRINTF("* Failed to register mxTidy cleanup function\n"); } /* We are now initialized */ mxTidy_Initialized = 1; onError: /* Check for errors and report them */ if (PyErr_Occurred()) Py_ReportModuleInitError(MXTIDY_MODULE); return; }