=begin
= HTML Repair Library

htmlrepair.rb

Version 1.0.1

Copyright (C) 2000 MoonWolf Development
MoonWolf

Supplies end tags that were omitted from the markup.

== Usage

  obj = HTMLSplit.new(html)
  obj.repair
=end

require "htmlsplit"

class HTMLSplit
  # For each element whose end tag may be omitted: the names of the elements
  # that may legitimately sit between two open occurrences of it.  When a new
  # start tag arrives while an element of the same name is still open, and
  # none of these names has been opened since, the earlier element is taken
  # to have had its end tag omitted.
  PARENTTAG = {
    'p'        => %w(body table),
    'a'        => %w(body),
    'thead'    => %w(table),
    'tfoot'    => %w(table),
    'tbody'    => %w(table),
    # Was "tfoor": the typo kept <tfoot> from ever counting as a row parent.
    'tr'       => %w(table thead tfoot tbody),
    'td'       => %w(tr),
    'th'       => %w(tr),
    'li'       => %w(ol ul),
    'dt'       => %w(dl),
    'dd'       => %w(dl),
    'col'      => %w(colgroup),
    'param'    => %w(applet),
    'area'     => %w(map),
    'input'    => %w(form),
    'textarea' => %w(form),
    'button'   => %w(form),
    'select'   => %w(form),
    'keygen'   => %w(form),
    'label'    => %w(form),
    'fieldset' => %w(form),
    'legend'   => %w(fieldset),
    'option'   => %w(select),
  }

  # Rebuilds @document with the omitted end tags inserted.
  #
  # Walks the elements of @document in order while keeping a stack of the
  # names of the start tags that are still open:
  #
  # * Start tag listed in PARENTTAG with a same-named element still open:
  #   unless one of its PARENTTAG names was opened after that element
  #   (a genuine nesting), every element down to and including the earlier
  #   one is closed with a synthesised EndTag first.
  # * End tag whose name is on the stack: every element opened after the
  #   matching start tag is closed first.  An end tag with no matching open
  #   element is passed through untouched.
  # * Anything else is copied through unchanged.
  #
  # Elements still open at the end of the document are closed innermost
  # first.
  #
  # Returns the new element array, which is also stored in @document.
  def repair
    open_tags = []   # names of start tags not yet closed, innermost last
    repaired  = []   # elements of the repaired document, in output order

    @document.each do |e|
      case e
      when EmptyElementTag
        # Never tracked on the stack.  Tested before StartTag, as in the
        # original, so an empty-element tag can never fall into that branch.
      when StartTag
        parents = PARENTTAG[e.name]
        if parents && (idx = open_tags.rindex(e.name))
          # Nesting, or an omitted end tag?  It is nesting only if one of
          # the allowed parents was opened since the previous occurrence.
          nested = open_tags[idx..-1].any? { |t| parents.include?(t) }
          unless nested
            # Emit the omitted end tags, up to and including the previous
            # occurrence of this element.
            while (t = open_tags.pop)
              repaired.push EndTag.new(t)
              break if t == e.name
            end
          end
        end
        open_tags.push e.name
      when EndTag
        if open_tags.include?(e.name)
          # Close whatever was opened after the matching start tag; the
          # matching entry itself is popped without emitting anything,
          # because the explicit end tag is pushed below.
          while (t = open_tags.pop)
            break if t == e.name
            repaired.push EndTag.new(t)
          end
        end
      end
      # Every input element is kept, whatever its kind.
      repaired.push e
    end

    # Close everything that is still open, innermost first.
    while (t = open_tags.pop)
      repaired.push EndTag.new(t)
    end

    @document = repaired
  end
end