|
| | ->*<-
table = []
donetable = false
@parse.each { |e|
case e.type
when 'table'
# est-ce que l'on suit une autre table ou non
if donetable
donetable = false
table.last[0] += 1
table.last[1] = table.last[2] = 0
else
table << [1, 0, 0]
end
when 'tr'
table.last[1] += 1
table.last[2] = 0
when 'td', 'th'
if donetable
table.pop
donetable = false
end
table.last[2] += 1
# not mandatory
when '/td', '/th'
when '/tr'
when '/table'
table.pop if donetable
donetable = true
else
donetable = table.pop if donetable
# cannot distinguish A and B in "AB"
yield table, e
if donetable
table << donetable
donetable = true
end
end
}
nil
end
# Renders the response as text: @answer, immediately followed by one
# "name: value" line per header (joined with CRLF), an empty line, and the
# body returned by +content+. No separator is inserted between @answer and
# the first header line.
def to_s
  header_block = @headers.map { |name, value| "#{name}: #{value}" }.join("\r\n")
  @answer + header_block + "\r\n\r\n" + content
end
# Debug representation: a fixed tag followed by the inspected hash of the
# answer line, the headers and the content.
def inspect
  fields = {'answer' => answer, 'headers' => headers, 'content' => content}
  '<#HttpAnswer:' + fields.inspect
end
# Separator that get_text inserts between two consecutive text runs.
# Lazily initialised to a single space.
def get_text_sep
  @get_text_sep ||= ' '
end
# Overrides the separator that get_text inserts between consecutive text runs.
def get_text_sep=(a)
  @get_text_sep = a
end
# Flattens the parsed page (@parse, built on demand with parsehtml) into text.
#
# Only elements located between 'body' and '/body', and outside
# 'noframes' ... '/noframes', are considered.
#
# onlyform:: when true, only elements located inside a form are considered
# onlystr::  when true (default), only text is emitted; when false, elements
#            that are neither text nor layout are appended as well (they are
#            stringified by the final join), each followed by a newline
#
# Returns the concatenated String.
def get_text(onlyform=false, onlystr=true)
  in_form = false
  in_body = false
  in_noframes = false
  # true once text was emitted and no line break has been written since
  sep_pending = false
  out = []
  newline = "\n"
  # elements that never produce any output
  silent_tags = ['b', '/b', 'td', '/td', 'span', '/span', 'font', '/font', 'Comment', 'Script', 'img', 'em', '/em']
  # elements that terminate the current line
  break_tags = ['br', 'p', '/p', 'table', '/table', 'tr', '/tr', 'tbody', '/tbody', 'div', '/div', '/option', 'li', '/li', 'ul', '/ul']

  @parse ||= parsehtml content
  @parse.each do |e|
    kind = e.type

    # first pass: keep track of where we are in the document
    case kind
    when 'body'
      in_body = true
      next
    when '/body'
      in_body = false
    when 'noframes'
      in_noframes = true
    when '/noframes'
      in_noframes = false
      next
    when 'form'
      in_form = true
    when '/form'
      # emitted here regardless of the body/noframes state
      out << e << newline unless onlystr
      in_form = false
    end

    next unless in_body
    next if in_noframes
    next if onlyform && !in_form

    # second pass: emit text for the element
    case kind
    when 'String'
      out << get_text_sep if sep_pending
      out << HttpServer.htmlentitiesdec(e['content'].gsub(/(?: |\s)+/, ' ').strip)
      sep_pending = true
    when 'optgroup'
      out << newline if sep_pending
      out << HttpServer.htmlentitiesdec(e['label'])
      out << newline
      sep_pending = false
    when *silent_tags
      nil
    when *break_tags
      out << newline if sep_pending
      sep_pending = false
    else
      # input select option textarea
      unless onlystr
        out << newline if sep_pending
        sep_pending = false
        out << e << newline
      end
    end
  end
  out.join
end
end
class HttpServer
# connection parameters; all of them except :socket are derived from the url
# by initialize, :socket is reset to nil there and filled in by connect_socket
attr_accessor :host, :port, :vhost, :vport, :loginpass, :proxyh, :proxyp, :proxylp, :use_ssl, :socket
# global defaults: initialize copies each of them into the instance variable
# of the same name, so changing one only affects instances created afterwards
# timeout is passed to Timeout.timeout by read_resp
@@timeout = 120
# values used by setup_request_headers for the User-Agent, Accept,
# Accept-Encoding and Accept-Language request headers when the caller did not
# provide them
@@hdr_useragent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:109.0) Gecko/20100101 Firefox/117.0 (.NET CLR 3.5.30729)'
@@hdr_accept = 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
@@hdr_encoding = 'gzip,deflate'
@@hdr_language = 'en'
# Generates a class-level reader and writer for each global default; they
# read and write the class variable of the same name (@@timeout, ...).
class << self
  %w[timeout hdr_useragent hdr_accept hdr_encoding hdr_language].each do |name|
    cvar = "@@#{name}"
    define_method(name) { class_variable_get cvar }
    define_method("#{name}=") { |value| class_variable_set cvar, value }
  end
end
# per-instance settings, initialised by initialize from the class-level
# defaults of the same name
attr_accessor :timeout, :hdr_useragent, :hdr_accept, :hdr_encoding, :hdr_language
# path part (starting at the first '/') of the url given to initialize
attr_accessor :urlpath
# Builds an instance with the given arguments, yields it to the block and
# closes it afterwards, even if the block raises.
#
# Returns the value of the block. Any exception raised by the constructor or
# by the block is propagated unchanged.
def self.open(*a)
  s = new(*a)
  yield s
ensure
  # s is still nil when the constructor itself raised: calling close on it
  # would replace the real error with a NoMethodError
  s.close if s
end
# Parses +url+ and stores the connection parameters.
#
# A url without '://' is treated as a bare host name and turned into
# "http://<url>/". Accepted form (see the regexp below):
#   [(http-proxy|socks)://[user:pass@]proxyhost[:port]/]http[s]://[user:pass@][vhost[:vport]@]host[:port]/path
# Hosts may be bracketed IPv6 literals.
#
# Raises RuntimeError ("Unparsed url ...") when the url does not match.
def initialize(url)
  url = "http://#{url}/" unless url.include? '://'

  hostre = '[\w.-]+|\[[a-fA-F0-9:]+\]'
  urlre = %r{^(?:(http-proxy|socks)://(\w*:[^@]*@)?(#{hostre})(:\d+)?/)?http(s)?://(\w*:[^@]*@)?(?:(#{hostre})(:\d+)?@)?(#{hostre})(:\d+)?(/.*)}
  md = urlre.match(url)
  raise "Unparsed url #{url.inspect}" unless md

  @proxytype, @proxylp, @proxyh, proxyp, @use_ssl, @loginpass, vhost, vport, @host, port, @urlpath = md.captures

  # proxy side: strip the brackets of an IPv6 literal, default port 3128
  @proxyh = @proxyh[1..-2] if @proxyh and @proxyh[0] == ?[
  @proxyp = proxyp ? proxyp[1..-1].to_i : 3128

  # target port: explicit, else 443 for https and 80 for http
  @port = port ? port[1..-1].to_i : (@use_ssl ? 443 : 80)

  # credentials are captured as "user:pass@": drop the '@' and turn them into
  # a Basic authorization value (base64 without line breaks)
  @proxylp = 'Basic '+[@proxylp.chop].pack('m').split.join if @proxylp
  @loginpass = nil if @loginpass == ':@'
  @loginpass = 'Basic '+[@loginpass.chop].pack('m').split.join if @loginpass

  # the virtual host defaults to the real host; it is taken before the
  # brackets of an IPv6 literal are stripped from @host
  @vhost = vhost ? vhost : @host
  @vport = vport ? vport[1..-1].to_i : @port
  @host = @host[1..-2] if @host[0] == ?[ # ipv6 numeric, vhost needs []

  @socket = nil

  # per-instance copies of the class-wide defaults
  @timeout = @@timeout
  @hdr_useragent = @@hdr_useragent
  @hdr_accept = @@hdr_accept
  @hdr_encoding = @@hdr_encoding
  @hdr_language = @@hdr_language
end
# Encodes +s+ (converted with to_s) for use in a query string or a
# form-urlencoded body: letters, digits, '_', '.', '/' and '-' are kept,
# spaces become '+', and every other byte becomes %XX (uppercase hex).
#
# The escaping works on bytes, so a multibyte character yields one %XX per
# byte instead of being truncated to its first byte.
def self.urlenc(s)
  s.to_s.gsub(/([^a-zA-Z0-9_.\/ -]+)/n) do
    run = $1
    run.unpack('C*').map { |byte| '%%%02X' % byte }.join
  end.tr(' ', '+')
end
# Builds the url of a GET form submission.
#
# url::  base url, returned untouched when +vars+ is empty
# vars:: name/value pairs; values are stringified and html-entity-decoded,
#        then names and values are url-encoded and appended as
#        "?k1=v1&k2=v2"
def self.get_form_url(url, vars)
  return url if vars.empty?

  pairs = vars.map do |key, value|
    "#{urlenc key}=#{urlenc(htmlentitiesdec(value.to_s))}"
  end
  url + '?' + pairs.join('&')
end
# This takes the long string up to EOE, matches it with scan, and builds a hash from the result
# (entities with a code above 255 are omitted)
HTMLENTITIES = Hash[*<
EOE
# Decodes html entities found in +s+ and returns the new string.
#
# * numeric references (&#65; or &#x41;) are replaced by the corresponding
#   single byte when the value is below 256, and left untouched otherwise
# * named references (&name;) are replaced when the name is a key of
#   HTMLENTITIES, and left untouched otherwise
#
# The numeric pass only matches text introduced by "&#" (plain "12;" is not an
# entity) and accepts the full hexadecimal digit range after 'x'.
# The named pass runs on the output of the numeric pass.
def self.htmlentitiesdec(s)
  s.gsub(/&#(x[0-9a-fA-F]+|\d+);/) {
    ref = $1
    v = ((ref[0] == ?x) ? ref[1..-1].to_i(16) : ref.to_i)
    (v < 256) ? v.chr : $&
  }.gsub(/&(\w+);/) { HTMLENTITIES[$1] ? HTMLENTITIES[$1].chr : $& }
end
# Replaces every character of +s+ whose code is a value of HTMLENTITIES by
# the named entity "&name;" and returns the new string. Other characters are
# kept as they are; 'hellip' is never used as a replacement.
#
# Newlines are not matched by the pattern and are therefore always kept.
def self.htmlentitiesenc(s)
  s.gsub(/(.)/) {
    ch = $1
    # HTMLENTITIES maps names to integer codes: compare against the code of
    # the character (String#[] only returns an integer on ruby 1.8)
    code = ch.respond_to?(:ord) ? ch.ord : ch[0]
    e = HTMLENTITIES.key code
    e = nil if e == 'hellip'
    e ? "&#{e};" : ch
  }
end
# Completes +headers+ (modified in place) for a request to this server.
#
# 'Host' is always overwritten with the virtual host name; every other header
# is only set when the caller did not provide a value. 'Authorization' and
# 'Proxy-Authorization' are only added when the matching credentials exist,
# and the proxy one is skipped for ssl connections.
def setup_request_headers(headers)
  headers['Host'] = @vhost
  #headers['Host'] += ":#@vport" if @vport != 80
  [
    ['User-Agent',      @hdr_useragent],
    ['Accept',          @hdr_accept],
    ['Connection',      'keep-alive'],
    ['Keep-Alive',      300],
    ['Accept-Charset',  'ISO-8859-1,utf-8;q=0.7,*;q=0.7'],
    ['Accept-Encoding', @hdr_encoding],
    ['Accept-Language', @hdr_language],
  ].each { |name, default| headers[name] ||= default }
  headers['Authorization'] ||= @loginpass if @loginpass
  headers['Proxy-Authorization'] ||= @proxylp if @proxylp and not @use_ssl
end
# Sends a HEAD request for +page+ and returns the response without a body.
#
# headers:: request headers; completed in place by setup_request_headers
#
# The 'Host' header is sent first. When a proxy host is configured the request
# line carries an absolute "http://host[:port]" url (the port is omitted when
# it is 80). A refused connection does not raise: a synthetic 503 response is
# returned instead.
def head(page, headers = Hash.new)
  setup_request_headers(headers)
  # sort headers (TODO better)
  others = headers.dup
  host_line = "Host: #{others.delete 'Host'}"
  header_lines = [host_line] + others.map { |name, value| "#{name}: #{value}" }
  prefix = @proxyh ? 'http://' + @host + (@port != 80 ? ":#@port" : '') : ''
  request = (["HEAD #{prefix}#{page} HTTP/1.1"] + header_lines).join("\r\n") + "\r\n\r\n"
  read_resp send_req(request), true
rescue Errno::ECONNREFUSED
  refused = HttpResp.new
  refused.answer.replace("HTTP/1.1 503 Connection refused")
  refused.content_raw << "The server refused the connection"
  refused
end
# Sends a GET request for +page+ and returns the response.
#
# headers:: request headers; completed in place by setup_request_headers
#
# The 'Host' header is sent first. When a proxy host is configured the request
# line carries an absolute "http://host[:port]" url (the port is omitted when
# it is 80). A refused connection does not raise: a synthetic 503 response is
# returned instead.
def get(page, headers = Hash.new)
  setup_request_headers(headers)
  # sort headers (TODO better)
  others = headers.dup
  host_line = "Host: #{others.delete 'Host'}"
  header_lines = [host_line] + others.map { |name, value| "#{name}: #{value}" }
  prefix = @proxyh ? 'http://' + @host + (@port != 80 ? ":#@port" : '') : ''
  request = (["GET #{prefix}#{page} HTTP/1.1"] + header_lines).join("\r\n") + "\r\n\r\n"
  read_resp send_req(request)
rescue Errno::ECONNREFUSED
  refused = HttpResp.new
  refused.answer.replace("HTTP/1.1 503 Connection refused")
  refused.content_raw << "The server refused the connection"
  refused
end
# Sends a POST request for +page+ with +postraw+ as the body, verbatim, and
# returns the response.
#
# headers:: request headers; completed in place by setup_request_headers.
#           'Content-type' defaults to application/octet-stream and
#           'Content-length' is always overwritten.
#
# When a proxy host is configured the request line carries an absolute
# "http://host[:port]" url (the port is omitted when it is 80). A refused
# connection does not raise: a synthetic 503 response is returned instead.
def post_raw(page, postraw, headers = Hash.new)
  setup_request_headers(headers)
  headers['Content-type'] ||= 'application/octet-stream'
  # Content-length counts bytes on the wire, not characters
  headers['Content-length'] = postraw.bytesize
  req = ["POST #{'http://' + (@host + (@port != 80 ? ":#@port" : '')) if @proxyh}#{page} HTTP/1.1"] + headers.map { |k, v| "#{k}: #{v}" }
  req = req.join("\r\n") + "\r\n\r\n" + postraw
  read_resp send_req(req)
rescue Errno::ECONNREFUSED
  resp = HttpResp.new
  resp.answer.replace("HTTP/1.1 503 Connection refused")
  resp.content_raw << "The server refused the connection"
  return resp
end
# Sends +postdata+ as a form-urlencoded POST to +page+ (through post_raw) and
# returns the response.
#
# postdata:: name => value pairs; an Array value sends the name once per
#            element: {a => [a1, a2], b => b1} gives 'a=a1&a=a2&b=b1'
# headers::  request headers; 'Content-type' defaults to
#            application/x-www-form-urlencoded
def post(page, postdata, headers = Hash.new)
  headers['Content-type'] ||= 'application/x-www-form-urlencoded'
  fields = postdata.map do |name, value|
    values = value.kind_of?(Array) ? value : [value]
    values.map { |item| "#{HttpServer.urlenc name}=#{HttpServer.urlenc item}" }.join('&')
  end
  post_raw(page, fields.join('&'), headers)
end
# Opens @socket to the target host, going through the configured proxy.
#
# * 'http-proxy': connects to the proxy; for ssl targets a CONNECT tunnel is
#   established first (with Proxy-Authorization when credentials exist)
# * 'socks': connects to the proxy and performs a SOCKS4a connect request
# * otherwise: connects directly to @host:@port
#
# For ssl targets the socket is then wrapped in an OpenSSL::SSL::SSLSocket
# using a default SSLContext.
#
# Raises RuntimeError when the proxy answers with something unexpected,
# refuses the request, or closes the connection early.
def connect_socket
  case @proxytype
  when 'http-proxy'
    @socket = TCPSocket.new @proxyh, @proxyp
    if @use_ssl
      rq = "CONNECT #@host:#@port HTTP/1.1\r\n"
      rq << "Proxy-Authorization: #{@proxylp}\r\n" if @proxylp
      rq << "\r\n"
      @socket.write rq
      buf = @socket.gets
      # buf is nil when the proxy closed the connection without answering
      raise "non http answer #{buf.to_s[1..100].inspect}" if buf !~ /^HTTP\/1.. (\d+) /
      raise "CONNECT bad response: #{buf.inspect}" if $1.to_i != 200
      # skip the rest of the proxy answer, up to the empty line
      loop do
        line = @socket.gets
        raise "CONNECT truncated response" if not line
        break if line.chomp.empty?
      end
    end
  when 'socks'
    @socket = TCPSocket.new @proxyh, @proxyp
    # socks_ver 1=connect/2=bind port dest/0.0.0.1=sock4adns creds_strz hostsocks4a_strz
    buf = [4, 1, @port, 1, '', @host].pack('CCnNa*xa*x')
    @socket.write buf
    bufa = @socket.read 8
    raise "socks: no answer #{bufa.inspect}" if not bufa or bufa.length < 2
    # the 2nd byte of the reply is the status, 0x5a (?Z) meaning success;
    # unpack works on every ruby version, unlike integer math on bufa[1]
    code = bufa.unpack('CC')[1] - 0x5a
    resp = %w[access_granted access_failed failed_noident failed_badindent][code] if code >= 0
    raise "socks: #{resp} #{bufa.inspect}" if resp != 'access_granted'
  else
    @socket = TCPSocket.new @host, @port
  end
  if @use_ssl
    @socket = OpenSSL::SSL::SSLSocket.new(@socket, OpenSSL::SSL::SSLContext.new)
    @socket.sync_close = true
    @socket.connect
  end
end
# Closes the current connection, if any, and forgets it. Never raises.
#
# Errors are ignored on purpose (the peer may already have gone away), but a
# failing shutdown no longer prevents the socket from being closed, and
# @socket is always reset so the next request reconnects.
def close
  return if not @socket
  begin
    # not every socket class provides shutdown (the original code tags this
    # line with "SSL")
    @socket.shutdown if @socket.respond_to? :shutdown
  rescue StandardError
    # already disconnected: still release the descriptor below
  end
  begin
    @socket.close
  rescue StandardError
    # best effort
  end
  nil
ensure
  @socket = nil
end
# Sends the raw request +req+ and returns the first line of the answer (the
# status line), or nil when the server closed the connection without
# answering.
#
# An already open connection is reused; when there is none, or when it
# yields no answer, a new one is opened and the request is sent again.
# Broken-pipe, connection-reset and IO errors are retried up to 3 times on a
# fresh connection, after which the error is propagated.
def send_req(req)
  s = nil
  retried = 0
  puts 'send_req:', req if $DEBUG
  begin
    if not @socket or !( @socket.write req ; s = @socket.gets )
      close
      connect_socket
      @socket.write req
      s = @socket.gets
    end
  rescue Errno::EPIPE, Errno::ECONNRESET, IOError
    raise if retried > 2
    retried += 1
    # release the broken socket instead of merely dropping the reference
    close
    retry
  end
  return s
end
# Reads the rest of a response from @socket and returns it as an HttpResp.
#
# status::  the status line already read by send_req (may be nil); it is
#           stored in the answer of the returned object
# no_body:: when true the body is not read (used by head)
#
# The whole read runs under Timeout.timeout(@timeout) and any error raised
# inside it, including the timeout itself, is swallowed by the trailing
# "rescue nil": the caller then gets whatever was read so far.
# The socket is kept open only when the body length was known (content-length
# or chunked encoding), the body was read completely, and the server did not
# send "connection: close"; in every other case it is closed.
def read_resp(status, no_body=false)
page = HttpResp.new
page.answer.replace(status||'')
# stays true unless a length-delimited body has been read to its end
close_sock = true
Timeout.timeout(@timeout, RuntimeError) {
puts 'read_resp_hdr:', status if $DEBUG
# parse the header returned by the server
while line = @socket.gets
if line =~ /^([^:]*):\s*(.*?)\r?$/
# header names are stored downcased
k, v = $1.downcase, $2
if (page.headers[k])
# a repeated header is merged into a single '; '-separated value
page.headers[k] += '; '+v
else
page.headers[k] = v
end
end
puts line if $DEBUG
# an empty line ends the header
break if line =~ /^\r?$/
end
if no_body
# nothing to read; close_sock is left true, so the connection is closed below
elsif page.headers['content-length']
# body of known size: read it in slices of at most 1024 bytes
contentlength = page.headers['content-length'].to_i
while contentlength > 1024
page.content_raw << @socket.read(1024)
contentlength -= 1024
end
page.content_raw << @socket.read(contentlength) if contentlength > 0
close_sock = false
elsif page.headers['transfer-encoding'] == 'chunked'
# chunked body: each chunk is a hexadecimal size line, the data and a
# 2-byte line terminator; a chunk of size 0 ends the body
chunksize = 1
while chunksize > 0
line = @socket.gets
chunksize = line.hex
chunk = @socket.read chunksize
page.content_raw << chunk
# skip the line terminator that follows the chunk data
@socket.read 2
end
close_sock = false
else
# otherwise read everything we can, until the server closes the connection
while not @socket.eof?
page.content_raw << @socket.read(1024)
end
end
} rescue nil
close if close_sock or page.headers['connection'] == 'close'
#puts page.content if $DEBUG
return page
end
end
|
|