# HTML parser
# by Yoann Guillot - 2004
# One token of a parsed HTML page: a tag, or a pseudo-element carrying text.
#
# * +type+  - the tag name; text runs use the HtmlElement::String marker and
#             keep their text in the 'content' attribute (see new_string).
# * +attr+  - Hash of attribute name => value, or nil while no attribute has
#             been set.
# * +empty+ - truthy when the tag is rendered self-closed (<br />).
#
# Elements compare by value (==, eql?, hash), so they can be used as Hash keys.
class HtmlElement
  # Marker stored in +type+ for text elements.
  # NOTE: inside this class body the bare name `String` resolves to this
  # constant, not to the core ::String class.
  String = 'String'.freeze

  # Builds a text element whose 'content' attribute holds +s+.
  def self.new_string(s)
    n = new
    n.type = String
    n['content'] = s
    n
  end

  attr_accessor :type, :attr, :empty

  def initialize
    @type = nil
    # Set explicitly so readers never hit an uninitialized instance variable.
    @attr = nil
    @empty = nil
  end

  # Returns the value of attribute +attrname+, or nil when it is not set
  # (or when the element has no attributes at all).
  def [](attrname)
    attr ? @attr[attrname] : nil
  end

  # Sets attribute +attrname+ to +val+, creating the attribute Hash on first use.
  def []=(attrname, val)
    @attr ||= {}
    @attr[attrname] = val
  end

  # Renders the element as an opening tag: <type k="v" ...> or <type ... />.
  # Attribute values are emitted verbatim (no escaping is performed).
  #
  # The result is built by interpolation instead of `'<' << type`: appending to
  # a string literal fails when literals are frozen, and `<<` raises TypeError
  # for a nil type. Interpolation always yields a new, mutable String, so
  # callers may keep appending to the returned value.
  def to_s
    attrs = (attr || {}).map { |k, v| " #{k}=\"#{v}\"" }.join
    closing = empty ? ' />' : '>'
    "<#{type}#{attrs}#{closing}"
  end

  # Two elements are equal when they have the same class, type, attributes
  # and empty flag.
  def ==(o)
    self.class == o.class && type == o.type && attr == o.attr && empty == o.empty
  end

  # Hash consistent with ==: equal elements always produce the same value
  # (+empty+ is deliberately left out; that only allows extra collisions).
  def hash
    type.hash ^ attr.hash
  end

  alias eql? ==
end
def parsehtml(page, nocache=false)
parse = Array.new unless block_given?
curelem = nil
curword = ''
curattrname = nil
state = :waitopen
laststate = state
# use nocache=true to avoid this if you intend to change some of the tags
class << parse
def <<(e)
# list of tags created, avoid duplicate objects (saves mem)
@cache ||= {}
super(@cache[e] ||= e)
end
# free mem used by the cache
def done; @cache = nil end
end if parse and not nocache
# 0: waitopen before tag/in string ''
# 1: tagtype in tag type '<'
# 2: tagattrname in tag attrname '
# state: 00000001111224552223446666522224775011111190
# tags type and attrname downcased
pg = page.gsub(/\s+/, ' ') # incl. newlines
pg.length.times { |pos|
c = pg[pos] # any other way to enumerate characters of the string portably (ruby 1.8 & 1.9) ?
case state
when :waitopen # string
case c
when ?<
if curword.length > 0
curelem = HtmlElement.new_string(curword.strip)
if parse
parse << curelem
else
yield curelem
end
end
curword = ''
curelem = HtmlElement.new
state = :waittagtype
when ?\ # space
curword << c if curword.length > 0
else
curword << c
end
when :waittagtype
case c
when ?\ # space
next
end
state = :tagtype
redo
when :tagtype # after tag start
case curword.downcase
when '!--' # html comment
curword = c.chr
state = :comment
next
when '![cdata[' # xml cdata
curword = c.chr
state = :cdata
next
end
case c
when ?>
curelem.type = curword.downcase
case curelem.type
when 'script', 'style'
curword = curelem.to_s
state = :script
next
end
if parse
parse << curelem
else
yield curelem
end
curword = ''
state = :waitopen
when ?/
if curword.length == 0
# / at the beginning of a tag
curword = c.chr
else
laststate = state
state = :tagend
end
when ?\ # space
if curword.length > 0
# < kikoospaces lol="mdr">
curelem.type = curword.downcase
curword = ''
state = :tagattrname
end
else
curword << c
end
when :tagattrname # tagattrname
case c
when ?>
curelem[curword.downcase] = '' if curword.length > 0
case curelem.type
when 'script', 'style'
curword = curelem.to_s
state = :script
next
end
if parse
parse << curelem
else
yield curelem
end
curword = ''
state = :waitopen
when ?/
laststate = state
state = :tagend
when ?\ # space
curattrname = curword.downcase
curword = ''
state = :tagattreql
when ?=
curattrname = curword.downcase
curword = ''
state = :tagattrval
else
curword << c
end
when :tagattreql # aftertagattrname
case c
when ?>
curelem[curattrname] = ''
case curelem.type
when 'script', 'style'
curword = curelem.to_s
state = :script
next
end
if parse
parse << curelem
else
yield curelem
end
state = :waitopen
when ?/
laststate = state
state = :tagend
when ?=
state = :tagattrval
else
curelem[curattrname] = ''
curword << c
state = :tagattrname
end
when :tagattrval # beforetagattrval
case c
when ?>
curelem[curattrname] = ''
case curelem.type
when 'script', 'style'
curword = curelem.to_s
state = :script
next
end
if parse
parse << curelem
else
yield curelem
end
state = :waitopen
when ?/
laststate = state
state = :tagend
when ?"
state = :tagattrvaldquot
when ?'
state = :tagattrvalsquot
when ?\ # space
# nop
else
curword << c
state = :tagattrvalraw
end
when :tagattrvalraw # attrval
case c
when ?>
curelem[curattrname] = curword
case curelem.type
when 'script', 'style'
curword = curelem.to_s
state = :script
next
end
if parse
parse << curelem
else
yield curelem
end
curword = ''
state = :waitopen
when ?/
laststate = state
state = :tagend
when ?\ # space
curelem[curattrname] = curword
curword = ''
state = :tagattrname
else
curword << c
end
when :tagattrvaldquot # attrval, doublequote
case c
when ?"
state = :tagattrvalraw
else
curword << c
end
when :tagattrvalsquot # attrval, singlequote
case c
when ?'
state = :tagattrvalraw
else
curword << c
end
when :comment, :cdata # comment
case c
when ?>
if (state == :comment and curword[-1] == ?- and curword[-2] == ?-) or (state == :cdata and curword[-1] == ?] and curword[-2] == ?])
curelem.type = state.to_s.capitalize
curelem['content'] = curword[0...-2]
if parse
parse << curelem
else
yield curelem
end
curword = ''
state = :waitopen
else
curword << c
end
else
curword << c
end
when :tagend # wait for end of tag
if (c != ?>)
curword << ?/
else
curelem.empty = true
end
state = laststate
redo
when :script #