=begin
= HTML Repair Library
htmlrepair.rb
Version 1.0.1
Copyright (C) 2000 MoonWolf Development
MoonWolf
省略された終了タグを補う。
== 使い方
obj = HTMLSplit.new(html)
obj.repair
=end
require "htmlsplit"
class HTMLSplit
  # Maps a tag name to the names of the elements inside which a *nested*
  # occurrence of that tag is legitimate.
  #
  # #repair consults this table when it meets a start tag whose name is
  # already open: if one of the listed parents has been opened since the
  # previous occurrence, the new tag is treated as ordinary nesting;
  # otherwise the previous occurrence is assumed to have an omitted end tag.
  # Tags that do not appear here are never auto-closed by a sibling start tag.
  PARENTTAG = {
    'p' => %w(body table),
    'a' => %w(body),
    'thead' => %w(table),
    'tfoot' => %w(table),
    'tbody' => %w(table),
    # 'tfoot' was previously misspelled 'tfoor', so rows inside a <tfoot>
    # were handled differently from rows inside <thead>/<tbody>.
    'tr' => %w(table thead tfoot tbody),
    'td' => %w(tr),
    'th' => %w(tr),
    'li' => %w(ol ul),
    'dt' => %w(dl),
    'dd' => %w(dl),
    'col' => %w(colgroup),
    'param' => %w(applet),
    'area' => %w(map),
    'input' => %w(form),
    'textarea' => %w(form),
    'button' => %w(form),
    'select' => %w(form),
    'keygen' => %w(form),
    'label' => %w(form),
    'fieldset' => %w(form),
    'legend' => %w(fieldset),
    'option' => %w(select),
  }

  # Rebuilds @document with the omitted end tags filled in.
  #
  # Walks the tokens of @document in order while keeping a stack of the names
  # of the currently open elements:
  #
  # * EmptyElementTag tokens are copied through and never put on the stack.
  # * A StartTag listed in PARENTTAG whose name is already open, with none of
  #   its PARENTTAG parents opened since, closes every element down to and
  #   including the earlier occurrence before being opened itself.
  # * An EndTag whose name is open first closes every element opened after
  #   it. An EndTag whose name is not open is copied through unchanged.
  # * Every other token is copied through unchanged.
  #
  # Elements still open at the end of the input are closed innermost first.
  #
  # Replaces @document with the repaired token array and returns that array.
  def repair
    open_tags = []   # names of the elements currently open, outermost first
    repaired = []    # output token list

    @document.each do |e|
      case e
      when EmptyElementTag
        repaired.push e
      when StartTag
        parents = PARENTTAG[e.name]
        if parents && (a = open_tags.rindex(e.name))
          # Distinguish legitimate nesting from an omitted end tag: nesting
          # requires an allowed parent to have been opened after the previous
          # occurrence of this tag.
          nested = open_tags[a..-1].any? { |t| parents.include?(t) }
          unless nested
            # Emit the omitted end tags, down to and including the previous
            # occurrence of this tag.
            while (t = open_tags.pop)
              repaired.push EndTag.new(t)
              break if t == e.name
            end
          end
        end
        open_tags.push e.name
        repaired.push e
      when EndTag
        if open_tags.include?(e.name)
          # Close everything opened after the matching start tag; the
          # explicit end tag itself is pushed below.
          while (t = open_tags.pop)
            break if t == e.name
            repaired.push EndTag.new(t)
          end
        end
        repaired.push e
      else
        # Character data, declarations, comments and any other token kind
        # pass through untouched.
        repaired.push e
      end
    end

    # Close whatever is still open at the end of the document.
    while (t = open_tags.pop)
      repaired.push EndTag.new(t)
    end

    @document = repaired
  end
end