#noCase
// Entry point of the translation: a document is an <HTML> element holding a
// header and a body, followed by the end of the input.
HTML2LaTeX ::=
	// HTML blanks and comments are skipped between tokens from here on
	#ignore(HTML)
	// past this point a mismatch is reported as an error instead of a silent failure
	#continue
	'<' "HTML" '>'
	HTMLHeader
	HTMLBody
	'<' '/' "HTML" '>'
	// nothing may follow the closing tag
	#empty;
// Header section: its content is consumed without producing any output.
HTMLHeader ::=
	'<' #continue "HEAD" '>'
	// swallow everything up to (but not including) the closing </HEAD>
	[
		~['<' '/' "HEAD" '>']
	]*
	'<' '/' "HEAD" '>';
// Body section: the text between <BODY> and </BODY> is translated by HTMLText.
HTMLBody ::=
	'<' #continue "BODY" '>'
	HTMLText
	'<' '/' "BODY" '>';
// Translates a run of HTML text: escape sequences, plain characters, HTML
// comments and embedded tags, repeated until a closing tag ('<' blanks '/')
// or anything else that cannot be consumed is reached.
//note: blank characters are interesting, so we refuse to ignore HTML blanks and comments,
HTMLText ::= #!ignore
[
	//note: handling of HTML escape sequences, announced by character \textbf{'\&'},
	// the identifier read into sEscape selects the matching template clause HTMLEscape<...>
	'&' #continue #readIdentifier:sEscape HTMLEscape<sEscape> ';'
|
	//note: if not the beginning of a tag, the current character of the input stream
	//note: is put to the output stream,
	~'<':cChar => writeText(cChar);
|
	//note: token operator '!' doesn't move the position of the input stream, and it continues
	//note: in sequence only if the token expression that follows doesn't match; here, we check
	//note: whether we have reached an end of tag or not,
	!['<' blanks '/']
	[
		//note: we do not ignore comments anymore, so we have to do it by ourselves,
		// an HTML comment is consumed up to its terminator and produces no output
		"<!--" [~"-->"]* "-->"
	|
		//note: an embedded tag has been encountered,
		// the tag name read into sTag dispatches to the template clause HTMLNextOfTag<...>
		'<' #continue #ignore(HTML) #readIdentifier:sTag HTMLNextOfTag<sTag>
	]
]*;
//note: template clauses \samp{HTMLEscape<\textit{T}>} are always valid and just convert
//note: special characters to their LaTeX representation,
// Each instance consumes no input; it only writes the character named by the escape.
HTMLEscape<"lt"> ::=
	=> { writeText("<"); };
HTMLEscape<"gt"> ::=
	=> { writeText(">"); };
// Reads one element whose tag name is imposed by the caller.
//   sTag (value): expected tag name, e.g. "TD" or "LI".
// After '<', the input must spell sTag; the remainder of the element is then
// delegated to the template clause HTMLNextOfTag instantiated on that name.
HTMLTag(sTag : value) ::=
	'<'
	#readText(sTag)
	// once the tag name has matched, the rest of the element is mandatory
	#continue
	HTMLNextOfTag<sTag>;
//note: in the real life, HTML tag \textit{<H1>} could represent a chapter, but the LaTeX
//note: output file is intended to be included into the reference manual of \CodeWorker\ as
//note: an illustration ; it will be a part of a section, so chapters are translated as
//note: sub sections!
// The heading text is wrapped between "\subsection{" and "}".
HTMLNextOfTag<"H1"> ::=
	#continue
	'>'
	=> {@\subsection{@}
	HTMLText
	'<' '/' "H1" '>'
	=> {@}@};
//note: in the real life, HTML tag \textit{<H2>} could represent a section, but for the same
//note: reason as above, it will be translated as a sub-sub section,
// The heading text is wrapped between "\subsubsection{" and "}".
HTMLNextOfTag<"H2"> ::=
	#continue
	'>'
	=> {@\subsubsection{@}
	HTMLText
	'<' '/' "H2" '>'
	=> {@}@};
// Anchor: attributes are parsed and dropped; only the enclosed text is translated,
// no LaTeX markup is written for the link itself.
HTMLNextOfTag<"A"> ::=
	[HTMLAttribute]*
	#continue
	'>'
	HTMLText
	'<' '/' 'A' '>';
// Translates a <TABLE> element. The opening of the LaTeX environment is written
// as "\begin{table" + "}{" + "}{.5}", with two floating locations left in the
// gaps: "table PDF suffix" (right after "table") and "table columns" (between
// the first pair of braces). They are filled in later by HTMLTableTitle and
// HTMLTableCol, once the first row has revealed how many columns exist.
// NOTE: the text between '@' markers is emitted verbatim, so no whitespace or
// comment may be added inside those regions.
HTMLNextOfTag<"TABLE"> ::=
[HTMLAttribute]* #continue '>' => {
@\begin{table@
//note: with HTML, the number of columns the table expects is deduced later. However, a
//note: latex table (well-formed for a PDF conversion) must know explicitly of how many
//note: columns it is composed. So, a floating position is attached to the current position
//note: of the output file. While discovering columns, text will be inserted here and further.
newFloatingLocation("table PDF suffix");
@}{@
//note: the format of each column is specified at this place,
newFloatingLocation("table columns");
@}{.5}@
}
//note: we consider that the first line of the table gives the name of the columns, and we'll
//note: take the PDF table suffix ('ii' for 2 columns, 'iii' for 3 columns, ...) to write
//note: lines of the table correctly,
=> local sPDFTableSuffix;
// the title row fills sPDFTableSuffix (passed by reference) with one 'i' per column
HTMLTableTitle(sPDFTableSuffix)
//note: we translate as many lines of the table as we can read, knowing the PDF suffix,
[HTMLTableLine(sPDFTableSuffix)]*
// the closing environment name carries the same suffix; the emitted text ends with a newline
'<' '/' "TABLE" '>' => {@\end{table@sPDFTableSuffix@}
@};
// Reads the first row of a table, taken as the row of column titles.
//   sPDFTableSuffix (node): extended by HTMLTableCol with one "i" per column
//   found in this row.
// Once the row is closed, the accumulated suffix is inserted at the floating
// location "table PDF suffix" and an end of line is written.
HTMLTableTitle(sPDFTableSuffix : node) ::=
'<' "TR" [HTMLAttribute]*
#continue '>'
[HTMLTableCol(sPDFTableSuffix)]*
'<' '/' "TR" '>' => {
// the suffix is only complete here, after every title cell has been read
insertText(getFloatingLocation("table PDF suffix"), sPDFTableSuffix);
writeText(endl());
};
//note: the clause is intended to read the name of a column of a table, and to translate it
//note: to LaTeX, knowing that some text must be inserted into the declarative part of the
//note: LaTeX table,
//   sPDFTableSuffix (node): grows by one "i" for each title cell read.
// The title text is expected to be wrapped in <B>...</B> inside the <TD>; it is
// copied character by character between "{" and "}".
HTMLTableCol(sPDFTableSuffix : node) ::=
'<' "TD" [HTMLAttribute]* #continue '>' => {
@{@
// first column (suffix still empty) is declared "l", each following one "|l"
if !sPDFTableSuffix insertText(getFloatingLocation("table columns"), "l");
else insertText(getFloatingLocation("table columns"), "|l");
set sPDFTableSuffix += "i";
}
// blanks are significant inside the title, hence #!ignore around the copy loop
'<' 'B' '>' [#!ignore [~'<':cChar => writeText(cChar);]*] '<' '/' 'B' '>'
'<' '/' "TD" '>' => {@}@};
// Reads one ordinary row of a table.
//   sPDFTableSuffix (value): suffix computed from the title row; it is appended
//   to "\line" to name the LaTeX row command.
// Each cell is handled through HTMLTag("TD"); an end of line closes the row.
HTMLTableLine(sPDFTableSuffix : value) ::=
	'<' "TR"
	[HTMLAttribute]*
	#continue
	'>'
	=> {@\line@sPDFTableSuffix@@}
	[HTMLTag("TD")]*
	'<' '/' "TR" '>'
	=> { writeText(endl()); };
// Table cell: the translated cell content is written between "{" and "}".
HTMLNextOfTag<"TD"> ::=
	[HTMLAttribute]*
	#continue
	'>'
	=> { writeText("{"); }
	HTMLCellText
	'<' '/' "TD" '>'
	=> { writeText("}"); };
// Same job as HTMLText, specialised for table cells: end-of-line sequences are
// replaced by a single space instead of being copied to the output.
//note: the text into a cell of a table shouldn't contain paragraph jumps (empty line in LaTeX),
HTMLCellText ::= #!ignore
[
	// the identifier read into sEscape selects the matching template clause HTMLEscape<...>
	'&' #continue #readIdentifier:sEscape HTMLEscape<sEscape> ';'
|
	//note: the simplest way to avoid empty lines is to ignore end of lines, and to replace it to
	//note: a space,
	['\r']? ['\n'] => {@ @}
|
	// any other character that does not start a tag is copied as is
	~'<':cChar => writeText(cChar);
|
	// stop on a closing tag: '!' checks without consuming input
	!['<' blanks '/']
	[
		// an HTML comment is consumed up to its terminator and produces no output
		"<!--" [~"-->"]* "-->"
	|
		// an embedded tag: its name dispatches to the template clause HTMLNextOfTag<...>
		'<' #continue #ignore(HTML) #readIdentifier:sTag HTMLNextOfTag<sTag>
	]
]*;
// Unordered list: rendered as an itemize environment whose items are the <LI>
// elements. Both emitted texts end with a newline, so the closing '@}' of each
// template must stay at the very start of its line.
HTMLNextOfTag<"UL"> ::=
	[HTMLAttribute]*
	#continue
	'>'
	=> {@\begin{itemize}
@}
	[HTMLTag("LI")]*
	'<' '/' "UL" '>'
	=> {@\end{itemize}
@};
// List item: "\item " followed by the translated text, then an end of line.
HTMLNextOfTag<"LI"> ::=
	[HTMLAttribute]*
	#continue
	'>'
	=> {@\item @}
	HTMLText
	'<' '/' "LI" '>'
	=> { writeText(endl()); };
// Bold: the enclosed text is wrapped between "\textbf{" and "}".
HTMLNextOfTag<"B"> ::=
	#continue
	'>'
	=> {@\textbf{@}
	HTMLText
	'<' '/' "B" '>'
	=> {@}@};
// Italic: the enclosed text is wrapped between "\textit{" and "}".
// (The bold command \textbf is reserved for the <B> tag.)
HTMLNextOfTag<"I"> ::=
	#continue '>' => {@\textit{@}
	HTMLText
	'<' '/' "I" '>' => {@}@};
// Font change: attributes are parsed and dropped; the enclosed text is
// translated without any additional LaTeX markup.
HTMLNextOfTag<"FONT"> ::=
	[HTMLAttribute]*
	#continue
	'>'
	HTMLText
	'<' '/' "FONT" '>';
// Line break: accepts both <BR> and <BR/>, and writes an end of line.
HTMLNextOfTag<"BR"> ::=
	['/']?
	#continue
	'>'
	=> { writeText(endl()); };
// One tag attribute: a name, optionally followed by '=' and a quoted or bare
// value. Nothing is written to the output.
HTMLAttribute ::=
	#readIdentifier
	[
		'=' #continue
		[
			STRING_LITERAL
		|
			WORD_LITERAL
		]
	]?;
// Any run (possibly empty) of spaces, tabs, carriage returns and line feeds.
blanks ::=
	[
		' '
	|
		'\t'
	|
		'\r'
	|
		'\n'
	]*;
// Double-quoted attribute value; blanks inside the quotes belong to the value,
// so ignoring is switched off while it is read.
STRING_LITERAL ::=
	#!ignore
	'\"'
	[~'\"']*
	'\"';
// Unquoted attribute value: at least one character, ending before the first
// '>', '/', space or tab.
WORD_LITERAL ::=
	#!ignore
	[
		~[
			'>'
		|
			'/'
		|
			' '
		|
			'\t'
		]
	]+;