#
# Low-level HTTP client library
#
# by Yoann Guillot - 2004
#

require 'socket'
require 'stringio'
require 'timeout'
require 'zlib'
begin
  require 'openssl'
rescue LoadError
  # OpenSSL is optional; it is only referenced when an https:// url is used.
end

# One HTTP response: status line, headers and (lazily decoded) body.
class HttpResp
  # answer      - raw status line as read from the socket
  # headers     - Hash of lower-cased header name => value
  # content_raw - undecoded body; set to nil once #content has decoded it
  attr_accessor :answer, :headers, :content_raw
  attr_writer :parse

  def initialize
    @answer = String.new
    @headers = Hash.new
    @content_raw = String.new
    @content = nil
    @parse = nil
  end

  # Returns the parsed element list, or only the elements whose #type equals
  # +type+ when one is given.
  def parse(type=nil)
    type ? @parse.find_all { |e| e.type == type } : @parse
  end

  # Returns the numeric status code of the status line, or nil when the
  # status line is not an HTTP/1.x one.
  def status
    $1.to_i if @answer =~ /^HTTP\/1\.\d+ (\d+)/
  end

  # Returns the body, decoded according to the Content-Encoding header.
  # The decoded body is cached and the raw body is dropped afterwards.
  def content
    unless @content
      raw = @content_raw
      @content =
        case @headers['content-encoding']
        when 'gzip'
          gunzip(raw)
        when 'deflate'
          puts "Content-encoding deflate !!!" if $DEBUG
          inflate(raw)
        else
          raw
        end
      @content_raw = nil
    end
    @content
  end

  # Walks the parsed elements and yields (table, element) for every element
  # that is not a table/tr/td/th tag.
  #
  # +table+ is an array of [tablenum, row, col] triples, outermost table first.
  # [[2, 1, 1], [1, 2, 3]] means: we are in cell [2, 3] of the first table
  # nested in cell [1, 1] of the second toplevel table.
  #
  # Raises RuntimeError when no parse is available. Returns nil.
  def each_table
    raise "No parse" unless @parse

    table = []
    donetable = false
    @parse.each { |e|
      case e.type
      when 'table'
        # does this table follow a sibling table that was just closed?
        if donetable
          donetable = false
          table.last[0] += 1
          table.last[1] = table.last[2] = 0
        else
          table << [1, 0, 0]
        end
      when 'tr'
        table.last[1] += 1
        table.last[2] = 0
      when 'td', 'th'
        if donetable
          table.pop
          donetable = false
        end
        table.last[2] += 1
      when '/td', '/th', '/tr'
        # closing tags are not mandatory: nothing to track
      when '/table'
        table.pop if donetable
        donetable = true
      else
        # Temporarily drop the entry of the table that was just closed, so
        # that the element is reported in its enclosing cell. Consequence:
        # elements located before and after a nested table yield the same path.
        donetable = table.pop if donetable
        yield table, e
        if donetable
          table << donetable
          donetable = true
        end
      end
    }
    nil
  end

  # Status line, headers and decoded body as a single string.
  def to_s
    @answer + @headers.map { |k, v| "#{k}: #{v}" }.join("\r\n") + "\r\n\r\n" + content
  end

  def inspect
    '<#HttpAnswer:' + {'answer' => answer, 'headers' => headers, 'content' => content}.inspect
  end

  # Separator inserted by #get_text between two consecutive text nodes.
  def get_text_sep; @get_text_sep ||= ' ' end
  def get_text_sep=(a) @get_text_sep = a end

  # Extracts the text of the <body> of the page (the content of <noframes>
  # is skipped).
  #
  # onlyform - when true, only what is inside a <form> is kept
  # onlystr  - when false, the form-related elements (input, select, ...)
  #            are included in the output in addition to the text nodes
  #
  # Uses the existing parse, or builds one with +parsehtml+ (defined
  # outside of this file).
  def get_text(onlyform=false, onlystr=true)
    inform = false
    inbody = false
    innoframes = false
    maynl = false
    txt = []
    nl = "\n"
    @parse ||= parsehtml content
    @parse.each { |e|
      case e.type
      when 'body'; inbody = true ; next
      when '/body'; inbody = false
      when 'noframes'; innoframes = true
      when '/noframes'; innoframes = false ; next
      when 'form'; inform = true
      when '/form'
        txt << e << nl unless onlystr
        inform = false
      end

      next if (onlyform and not inform) or not inbody or innoframes

      case e.type
      when 'String'
        txt << get_text_sep if maynl
        # NOTE(review): the source of this file showed "(?: |\s)+", which is
        # redundant; "&nbsp;" is assumed to have been lost in transit - confirm.
        txt << HttpServer.htmlentitiesdec(e['content'].gsub(/(?:&nbsp;|\s)+/, ' ').strip)
        maynl = true
      when 'optgroup'
        txt << nl if maynl
        txt << HttpServer.htmlentitiesdec(e['label'])
        txt << nl
        maynl = false
      when 'b', '/b', 'td', '/td', 'span', '/span', 'font', '/font', 'Comment', 'Script', 'img', 'em', '/em'
        nil
      when 'br', 'p', '/p', 'table', '/table', 'tr', '/tr', 'tbody', '/tbody', 'div', '/div', '/option', 'li', '/li', 'ul', '/ul'
        txt << nl if maynl
        maynl = false
      else
        # input select option textarea
        next if onlystr
        txt << nl if maynl
        maynl = false
        txt << e << nl
      end
    }
    txt.join
  end

  private

  # Decodes a gzip body in memory. Empty bodies (eg the answer to a HEAD
  # request) are returned untouched.
  def gunzip(raw)
    return raw if raw.nil? or raw.empty?
    gz = Zlib::GzipReader.new(StringIO.new(raw))
    begin
      gz.read
    ensure
      gz.close
    end
  end

  # Decodes a "deflate" body: zlib-wrapped first, then raw deflate, as
  # some servers send the latter.
  def inflate(raw)
    return raw if raw.nil? or raw.empty?
    Zlib::Inflate.inflate(raw)
  rescue Zlib::DataError
    z = Zlib::Inflate.new(-Zlib::MAX_WBITS)
    begin
      z.inflate(raw)
    ensure
      z.close
    end
  end
end

# A connection to one HTTP server, optionally through an http or socks proxy.
#
# Accepted urls:
#   host
#   http[s]://[login:pass@][vhost[:vport]@]host[:port]/path
#   http-proxy://[login:pass@]proxyhost[:port]/<http url>
#   socks://proxyhost[:port]/<http url>
class HttpServer
  attr_accessor :host, :port, :vhost, :vport, :loginpass, :proxyh, :proxyp, :proxylp, :use_ssl, :socket

  # global defaults
  @@timeout = 120
  @@hdr_useragent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3 (.NET CLR 3.5.30729)'
  @@hdr_accept = 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
  @@hdr_encoding = 'gzip,deflate'
  @@hdr_language = 'en'

  # class-level accessors for the global defaults (HttpServer.timeout = 10, ...)
  class << self
    %w[timeout hdr_useragent hdr_accept hdr_encoding hdr_language].each { |a|
      define_method(a) { class_variable_get "@@#{a}" }
      define_method(a+'=') { |v| class_variable_set "@@#{a}", v }
    }
  end

  # per-instance values, initialized from the global defaults
  attr_accessor :timeout, :hdr_useragent, :hdr_accept, :hdr_encoding, :hdr_language
  attr_accessor :urlpath

  # Builds a new server from +a+, yields it, and closes it afterwards.
  def self.open(*a)
    s = new(*a)
    yield s
  ensure
    # s is nil when the constructor itself raised
    s.close if s
  end

  # Parses +url+ (see the class comment for the accepted forms).
  # No connection is made here. Raises RuntimeError on an unparsable url.
  def initialize(url)
    if not url.include? '://'
      url = "http://#{url}/"
    end

    hostre = '[\w.-]+|\[[a-fA-F0-9:]+\]'
    raise "Unparsed url #{url.inspect}" unless md = %r{^(?:(http-proxy|socks)://(\w*:[^@]*@)?(#{hostre})(:\d+)?/)?http(s)?://(\w*:[^@]*@)?(?:([\w.-]+)(:\d+)?@)?(#{hostre})(:\d+)?(/.*)}.match(url)

    @proxytype, @proxylp, @proxyh, proxyp, @use_ssl, @loginpass, vhost, vport, @host, port, @urlpath = md.captures

    # strip the brackets of ipv6 literals
    @proxyh = @proxyh[1..-2] if @proxyh and @proxyh.start_with?('[')
    @host = @host[1..-2] if @host.start_with?('[')

    @proxyp = proxyp ? proxyp[1..-1].to_i : 3128
    @port = port ? port[1..-1].to_i : (@use_ssl ? 443 : 80)
    # chop removes the trailing '@'; split.join removes the base64 line feeds
    @proxylp = 'Basic '+[@proxylp.chop].pack('m').split.join if @proxylp
    @loginpass = nil if @loginpass == ':@'
    @loginpass = 'Basic '+[@loginpass.chop].pack('m').split.join if @loginpass
    @vhost = vhost ? vhost : @host
    @vport = vport ? vport[1..-1].to_i : @port

    @socket = nil

    @timeout = @@timeout
    @hdr_useragent = @@hdr_useragent
    @hdr_accept = @@hdr_accept
    @hdr_encoding = @@hdr_encoding
    @hdr_language = @@hdr_language
  end

  # Url-encodes +s+ (converted with to_s): every byte outside of
  # [a-zA-Z0-9_./ -] becomes %XX, then spaces become '+'.
  def self.urlenc(s)
    # work on bytes, so that multibyte characters are fully escaped
    s.to_s.b.gsub(/[^a-zA-Z0-9_.\/ -]/n) { |c| '%%%02X' % c.ord }.tr(' ', '+').force_encoding(Encoding::UTF_8)
  end

  # Appends +vars+ (name/value pairs) to +url+ as a query string.
  # Html entities in the values are decoded before url-encoding.
  def self.get_form_url(url, vars)
    vars.empty? ? url : (
      url + '?' + vars.map { |k, v| "#{urlenc k}=#{urlenc(htmlentitiesdec(v.to_s))}" }.join('&')
    )
  end

  # Named html entities => code point; entities > 255 omitted.
  # This is the HTML 4 set: the four markup characters and Latin-1 (160..255).
  HTMLENTITIES = { 'quot' => 34, 'amp' => 38, 'lt' => 60, 'gt' => 62 }
  %w[
    nbsp iexcl cent pound curren yen brvbar sect uml copy ordf laquo not shy reg macr
    deg plusmn sup2 sup3 acute micro para middot cedil sup1 ordm raquo frac14 frac12 frac34 iquest
    Agrave Aacute Acirc Atilde Auml Aring AElig Ccedil Egrave Eacute Ecirc Euml Igrave Iacute Icirc Iuml
    ETH Ntilde Ograve Oacute Ocirc Otilde Ouml times Oslash Ugrave Uacute Ucirc Uuml Yacute THORN szlig
    agrave aacute acirc atilde auml aring aelig ccedil egrave eacute ecirc euml igrave iacute icirc iuml
    eth ntilde ograve oacute ocirc otilde ouml divide oslash ugrave uacute ucirc uuml yacute thorn yuml
  ].each_with_index { |name, i| HTMLENTITIES[name] = 160 + i }

  # Replaces the numeric (&#233; &#xE9;) and named (&eacute;) entities of +s+
  # by the corresponding character, in the encoding of +s+. Entities that
  # are unknown or that cannot be represented are left as is.
  def self.htmlentitiesdec(s)
    # single pass, so that the output of a substitution is never decoded again
    s.gsub(/&(?:#([xX][0-9a-fA-F]+|\d+)|(\w+));/) {
      entity, num, name = $&, $1, $2
      code =
        if name
          HTMLENTITIES[name]
        elsif num.start_with?('x', 'X')
          num[1..-1].to_i(16)
        else
          num.to_i
        end
      code ? (entity_chr(code, s.encoding) || entity) : entity
    }
  end

  # Character for +code+ in +encoding+; falls back on the raw byte for
  # codes < 256, and on nil otherwise.
  def self.entity_chr(code, encoding)
    code.chr(encoding)
  rescue RangeError
    code < 256 ? code.chr : nil
  end
  private_class_method :entity_chr

  # Replaces every character of +s+ that has a named entity by that entity.
  def self.htmlentitiesenc(s)
    s.gsub(/./) { |c|
      e = HTMLENTITIES.key c.ord
      # NOTE(review): 'hellip' is not in the table above; the guard is kept
      # from the original code in case the entry is added back.
      e = nil if e == 'hellip'
      e ? "&#{e};" : c
    }
  end

  # Fills +headers+ (modified in place) with the default request headers.
  # Host is always overwritten, the others are only set when missing.
  def setup_request_headers(headers)
    headers['Host'] = @vhost
    headers['Host'] += ":#@vport" if @vport != 80
    headers['User-Agent'] ||= @hdr_useragent
    headers['Accept'] ||= @hdr_accept
    headers['Connection'] ||= 'keep-alive'
    headers['Keep-Alive'] ||= 300
    headers['Accept-Charset'] ||= 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'
    headers['Accept-Encoding'] ||= @hdr_encoding
    headers['Accept-Language'] ||= @hdr_language
    headers['Authorization'] ||= @loginpass if @loginpass
    headers['Proxy-Authorization'] ||= @proxylp if @proxylp and not @use_ssl
  end

  # Sends a HEAD request for +page+ and returns the HttpResp (without body).
  # A refused connection is reported as a synthetic 503 response.
  def head(page, headers = Hash.new)
    setup_request_headers(headers)
    read_resp send_req(build_request('HEAD', page, headers)), true
  rescue Errno::ECONNREFUSED
    refused_response
  end

  # Sends a GET request for +page+ and returns the HttpResp.
  # A refused connection is reported as a synthetic 503 response.
  def get(page, headers = Hash.new)
    setup_request_headers(headers)
    read_resp send_req(build_request('GET', page, headers))
  rescue Errno::ECONNREFUSED
    refused_response
  end

  # Sends a POST request for +page+ with +postraw+ as body, and returns the
  # HttpResp. A refused connection is reported as a synthetic 503 response.
  def post_raw(page, postraw, headers = Hash.new)
    setup_request_headers(headers)
    headers['Content-type'] ||= 'application/octet-stream'
    # Content-length counts bytes, not characters
    headers['Content-length'] = postraw.bytesize
    read_resp send_req(build_request('POST', page, headers, postraw))
  rescue Errno::ECONNREFUSED
    refused_response
  end

  # Sends +postdata+ (name => value or name => [values]) as an url-encoded form.
  def post(page, postdata, headers = Hash.new)
    headers['Content-type'] ||= 'application/x-www-form-urlencoded'
    post_raw(page, postdata.map { |k, v|
      # a => [a1, a2], b => b1  =>  'a=a1&a=a2&b=b1'
      ((v.kind_of? Array) ? v : [v]).map { |vi| "#{HttpServer.urlenc k}=#{HttpServer.urlenc vi}" }.join('&')
    }.join('&'), headers)
  end

  # Opens @socket to the server, going through the proxy if one is
  # configured, and starts the ssl session for https urls.
  # Raises RuntimeError when the proxy refuses the connection.
  def connect_socket
    case @proxytype
    when 'http-proxy'
      @socket = TCPSocket.new @proxyh, @proxyp
      if @use_ssl
        # ssl needs a raw tunnel through the proxy
        rq = "CONNECT #@host:#@port HTTP/1.1\r\n"
        rq << "Proxy-Authorization: #{@proxylp}\r\n" if @proxylp
        rq << "\r\n"
        @socket.write rq
        buf = @socket.gets.to_s
        raise "non http answer #{buf[1..100].inspect}" if buf !~ /^HTTP\/1.. (\d+) /
        raise "CONNECT bad response: #{buf.inspect}" if $1.to_i != 200
        # skip the headers of the proxy answer
        loop do
          line = @socket.gets or raise "CONNECT: connection closed by proxy"
          break if line.chomp.empty?
        end
      end
    when 'socks'
      @socket = TCPSocket.new @proxyh, @proxyp
      # socks4a request:
      # socks_ver  1=connect/2=bind  port  dest/0.0.0.1=socks4a dns  creds_strz  host_socks4a_strz
      buf = [4, 1, @port, 1, '', @host].pack('CCnNa*xa*x')
      @socket.write buf
      bufa = @socket.read 8
      # the 2nd byte of the reply is the status, 0x5a ('Z') and up
      code = bufa.to_s.getbyte(1)
      idx = code ? code - 0x5a : -1
      resp = %w[access_granted access_failed failed_noident failed_badindent][idx] if idx >= 0
      raise "socks: #{resp} #{bufa.inspect}" if resp != 'access_granted'
    else
      @socket = TCPSocket.new @host, @port
    end

    if @use_ssl
      @socket = OpenSSL::SSL::SSLSocket.new(@socket, OpenSSL::SSL::SSLContext.new)
      @socket.sync_close = true
      @socket.connect
    end
  end

  # Closes the connection, if any. Never raises; returns nil.
  def close
    sock, @socket = @socket, nil
    return if not sock
    begin
      sock.shutdown
    rescue StandardError
      # best effort: the socket may not support shutdown, or may be dead already
    end
    begin
      sock.close
    rescue StandardError
      # best effort
    end
    nil
  end

  # Writes +req+ on the socket and returns the first line of the answer.
  # (Re)connects when there is no socket or when the server has closed the
  # kept-alive connection. Retries up to 3 times on connection errors.
  def send_req(req)
    s = nil
    retried = 0
    puts 'send_req:', req if $DEBUG
    begin
      if @socket
        @socket.write req
        s = @socket.gets
      end
      if not s
        close
        connect_socket
        @socket.write req
        s = @socket.gets
      end
    rescue Errno::EPIPE, Errno::ECONNRESET, IOError
      raise if retried > 2
      retried += 1
      close
      retry
    end
    s
  end

  # Reads the headers and the body of the answer whose status line is
  # +status+, and returns the HttpResp. The body is not read when +no_body+
  # is true. Errors and timeouts while reading are swallowed: whatever was
  # read so far is returned, and the socket is closed.
  def read_resp(status, no_body=false)
    page = HttpResp.new
    page.answer.replace(status||'')
    # the connection can only be reused when the end of the body is known
    close_sock = true
    begin
      Timeout.timeout(@timeout, RuntimeError) {
        # parse the headers sent by the server
        while line = @socket.gets
          if line =~ /^([^:]*):\s*(.*?)\r?$/
            k, v = $1.downcase, $2
            if page.headers[k]
              page.headers[k] += '; '+v
            else
              page.headers[k] = v
            end
          end
          break if line =~ /^\r?$/
        end

        if no_body
          # nothing to read
        elsif page.headers['content-length']
          remaining = page.headers['content-length'].to_i
          while remaining > 0
            chunk = @socket.read([remaining, 1024].min) or raise EOFError
            page.content_raw << chunk
            remaining -= chunk.bytesize
          end
          close_sock = false
        elsif page.headers['transfer-encoding'] == 'chunked'
          loop do
            line = @socket.gets or raise EOFError
            chunksize = line.hex
            chunk = @socket.read(chunksize) or raise EOFError
            page.content_raw << chunk
            # skip the CRLF that follows the chunk
            @socket.read 2
            break if chunksize <= 0
          end
          close_sock = false
        else
          # otherwise read everything we can
          while not @socket.eof?
            page.content_raw << @socket.read(1024)
          end
        end
      }
    rescue StandardError
      # deliberate best effort: return the partial page, close_sock is still set
    end

    close if close_sock or page.headers['connection'] == 'close'

    return page
  end

  private

  # Builds the raw request: request line, Host header first, then the other
  # headers, a blank line and the optional body.
  def build_request(verb, page, headers, body = nil)
    hdrs = headers.dup
    lines = ["#{verb} #{proxy_prefix}#{page} HTTP/1.1", "Host: #{hdrs.delete 'Host'}"]
    hdrs.each { |k, v| lines << "#{k}: #{v}" }
    req = lines.join("\r\n") + "\r\n\r\n"
    body ? req + body : req
  end

  # Scheme and authority prepended to the path when going through a proxy.
  def proxy_prefix
    return '' if not @proxyh
    port = (@port != 80) ? ":#@port" : ''
    "http://#{@host}#{port}"
  end

  # Synthetic response returned when the connection is refused.
  def refused_response
    resp = HttpResp.new
    resp.answer.replace("HTTP/1.1 503 Connection refused")
    resp.content_raw << "The server refused the connection"
    resp
  end
end