require 'uri'
require 'net/http'
require 'time'
require 'timeout'

# kconv (String#toutf8) is backed by the nkf extension, which is not always
# installed on newer Rubies. Request#to_utf8 falls back to String#force_encoding
# when it is missing.
begin
  require 'kconv'
rescue LoadError
  nil
end

# $KCODE only has an effect on Ruby 1.8; later versions ignore it (with a warning).
$KCODE = 'UTF-8' if RUBY_VERSION < '1.9'

#
# = enokidu.rb
#
# Copyright; 2008 ODA Kaname [trashsuite@gmail.com]
# See also ; http://d.hatena.ne.jp/trashsuite/
#
# Web page update-check script.
#
module Enokidu
  # Identification constants for the HTTP client.
  class UserAgent
    NAME    = 'Enokidu::Antenna'
    VERSION = '0.1.5'
  end

  # Front end: checks one Page for updates and converts transport failures
  # into pseudo status codes stored on the page.
  class Antenna
    # Pseudo HTTP status codes written to Page#code when a check fails.
    ERROR_CODE = {
      :connerr  => 900,
      :noroute  => 901,
      :sockerr  => 902,
      :timeout  => 903,
      :redirect => 904,
      :unknown  => 999
    }.freeze

    # Upper bound on consecutive redirects followed by one #detective call.
    MAX_REDIRECTS = 10

    # options[:timeout] - per-request timeout in seconds (default 30)
    # options[:debug]   - print progress messages when true (default false)
    def initialize(options = {})
      @timeout = options[:timeout] || 30
      @debug   = options[:debug] || false
      @request = Request.new(:timeout => @timeout, :debug => @debug)
    end

    # Checks options[:page] for updates and returns the (mutated) page.
    #
    # Redirects are followed up to MAX_REDIRECTS times; after that the page
    # code is set to ERROR_CODE[:redirect]. A HEAD response that cannot be
    # used for change detection makes the check fall back to GET. Any other
    # StandardError or timeout is mapped to an ERROR_CODE value instead of
    # being raised.
    #
    # Raises ArgumentError when options[:page] is missing.
    def detective(options = {})
      page = options[:page]
      raise ArgumentError, ':page option is required' if page.nil?

      redirects = 0
      begin
        check(page)
      rescue Request::Redirect
        redirects += 1
        if redirects > MAX_REDIRECTS
          page.code = ERROR_CODE[:redirect]
          return page
        end
        retry
      rescue Request::InvalidHeadResponse
        page.method = 'GET'
        retry
      rescue Timeout::Error, StandardError => exception
        # Timeout::Error is listed explicitly because it is not a
        # StandardError on Ruby 1.8.
        page.code = error_code_for(exception)
        page
      end
    end

    private

    # Runs a single check attempt against the page.
    def check(page)
      page.body ||= ''

      # Fetch the title first when it is not known yet.
      if page.title.empty?
        @request.get(page)
        page.last_modified_at = nil
      end

      # On the first run, probe whether HEAD is usable.
      page.method = 'HEAD' if page.last_modified_at.nil?
      page.updated = false

      case page.method
      when 'GET'  then @request.get(page)
      when 'HEAD' then @request.head(page)
      else raise Request::InvalidMethod
      end
    end

    # Maps an exception to its ERROR_CODE value; unknown classes are printed.
    def error_code_for(exception)
      case exception
      when Errno::ECONNREFUSED then ERROR_CODE[:connerr]
      when Errno::EHOSTUNREACH then ERROR_CODE[:noroute]
      when Timeout::Error      then ERROR_CODE[:timeout]
      when SocketError         then ERROR_CODE[:sockerr]
      else
        puts exception.class
        ERROR_CODE[:unknown]
      end
    end
  end # Antenna

  # Performs the HTTP requests and updates the Page with what was observed.
  class Request
    # Raised after Page#uri has been rewritten to a redirect target.
    class Redirect < StandardError; end
    # Raised when a HEAD response cannot be used for change detection.
    class InvalidHeadResponse < StandardError; end
    # Raised by Antenna when Page#method is neither 'GET' nor 'HEAD'.
    class InvalidMethod < StandardError; end

    # Status codes whose Location header is followed.
    REDIRECT_CODES = [301, 302, 303, 307, 308].freeze
    # Captures the text of the first <title> element.
    TITLE_PATTERN  = /<title[^>]*>([^<]*)<\/title>/im
    # Tags and whitespace removed from the body before comparison.
    MARKUP_PATTERN = /<[^>]*>|\s/

    # options[:timeout] - per-request timeout in seconds (default 30)
    # options[:debug]   - print progress messages when true (default false)
    def initialize(options = {})
      @http_header = {'User-Agent' => Enokidu::UserAgent::NAME,
                      'Connection' => 'close'}
      @timeout = options[:timeout] || 30
      @debug   = options[:debug] || false
    end

    # Fetches the page with GET and records code, title, stripped body and
    # update state on it. Returns the page.
    #
    # Raises Redirect after pointing page.uri at the Location target of a
    # redirect response; the caller is expected to call again.
    def get(page)
      page.method = 'GET'
      debug_print "sync by GET method"
      debug_print "initial sync" if page.body.empty?
      debug_print "title #{page.title}" unless page.body.empty?

      uri     = URI.parse(page.uri)
      http    = http_instance(uri)
      headers = request_headers(page)
      res = Timeout.timeout(@timeout) { http.get(mkpath(uri), headers) }

      page.code = res.code
      debug_print "Return code #{res.code}"

      if REDIRECT_CODES.include?(page.code)
        follow_redirect(page, res['location'])
        # Only reached when the response carried no Location header.
        return page
      end

      # The server honoured the conditional request: nothing changed.
      if page.code == 304
        debug_print 'use If-Modified-Since'
        page.updated = false
        return page
      end

      body = to_utf8(res.body || '')

      if page.title.empty?
        debug_print "get page title"
        page.title = extract_title(body)
      end

      body = get_range(body, page.start_range, page.end_range) if page.has_range?
      body = body.gsub(MARKUP_PATTERN, '')

      # Update check. NOTE: only the sizes are compared, so an edit that
      # keeps the stripped length unchanged is not detected.
      debug_print "body size #{page.body.size} => #{body.size}"
      unless body.size == page.body.size
        page.last_modified_at = Time.now
        page.updated = true
      end

      page.body = body
      page
    end

    # Probes the page with HEAD and uses Last-Modified, or ETag when there is
    # no usable Last-Modified, to decide whether it changed. Returns the page.
    #
    # Raises InvalidHeadResponse when Last-Modified equals Date (this includes
    # both being absent) or when neither Last-Modified nor ETag is usable.
    def head(page)
      page.method = 'HEAD'
      debug_print "sync by HEAD method"

      uri  = URI.parse(page.uri)
      http = http_instance(uri)
      # HEAD is always unconditional, so send a pristine copy of the headers.
      headers = @http_header.dup
      res = Timeout.timeout(@timeout) { http.head(mkpath(uri), headers) }

      page.code = res.code
      debug_print "Return code #{res.code}"

      date = res['date']
      lm   = res['last-modified']
      etag = res['etag']

      # On the first run, start from the current time.
      page.last_modified_at = Time.now if page.last_modified_at.nil?

      # Update check via Last-Modified.
      raise InvalidHeadResponse if date == lm
      lm = parse_http_date(lm)
      if lm && lm != page.last_modified_at.localtime
        page.last_modified_at = lm
        page.updated = true
      end

      # Update check via ETag.
      if !lm && etag && page.etag != etag
        page.etag = etag
        page.last_modified_at = Time.now.localtime
        page.updated = true
      end

      # Neither Last-Modified nor ETag is usable.
      raise InvalidHeadResponse unless lm || etag
      page
    end

    private

    # Builds the headers for one GET. Conditional headers are added to a copy
    # so they never leak into requests for other pages.
    def request_headers(page)
      headers = @http_header.dup
      return headers if page.title.empty? || page.has_range?

      debug_print 'set If-Modified-Since'
      headers['If-Modified-Since'] = page.last_modified_at.httpdate if page.last_modified_at
      headers['If-None-Match'] = page.etag unless page.etag.empty?
      headers
    end

    # Points page.uri at the redirect target (resolved against the current
    # URI, so relative and https targets both work) and raises Redirect.
    # Does nothing when the response had no Location header.
    def follow_redirect(page, location)
      location = location.to_s.strip
      return if location.empty?

      page.uri = URI.join(page.uri, location).to_s
      raise Redirect
    end

    # Returns the stripped <title> text, or 'no title' when there is none.
    def extract_title(body)
      title = body[TITLE_PATTERN, 1].to_s.strip
      title.empty? ? 'no title' : title
    end

    # Converts text to UTF-8 with kconv when available; otherwise relabels
    # the bytes as UTF-8 and replaces invalid sequences with '?'.
    def to_utf8(text)
      return text.toutf8 if text.respond_to?(:toutf8)
      text.dup.force_encoding('UTF-8').scrub('?')
    end

    # Parses an HTTP date into local time; nil when absent or malformed.
    def parse_http_date(value)
      return nil if value.nil?
      Time.httpdate(value).localtime
    rescue ArgumentError
      nil
    end

    # Request target for the URI: path (or '/' when empty) plus the query.
    def mkpath(uri)
      path  = uri.path.to_s.empty? ? '/' : uri.path
      query = uri.query.to_s
      query.empty? ? path : [path, query].join('?')
    end

    # Net::HTTP client for the URI, with TLS enabled for https.
    def http_instance(uri)
      http = Net::HTTP.new(uri.host, uri.port)
      if uri.scheme.to_s.downcase == 'https'
        # Loaded lazily: Ruby 1.8 needs net/https for use_ssl=.
        require 'net/https'
        http.use_ssl = true
      end
      http
    end

    # Returns the text between start_range and end_range. Both markers are
    # interpolated into a regular expression, so they are treated as regex
    # source. Returns '' when nothing matches.
    def get_range(body, start_range, end_range)
      debug_print "use range"
      body.scan(/#{start_range}(.*)#{end_range}/im).flatten.join
    end

    def debug_print(content)
      puts content if @debug
    end
  end # Request

  # State of one monitored page.
  class Page
    PERMIT_OPTIONS = [:uri, :title, :method, :start_range, :end_range,
                      :last_modified_at, :etag, :code, :body].freeze

    # :code has a custom reader below, so it only gets a writer here.
    attr_accessor(*(PERMIT_OPTIONS - [:code]))
    attr_writer :code
    attr_accessor :updated

    # options accepts the PERMIT_OPTIONS keys; :uri is required.
    # Raises ArgumentError when :uri is missing or empty.
    def initialize(options = {})
      @uri              = options[:uri]         || ''
      @title            = options[:title]       || ''
      @method           = options[:method]      || 'GET'
      @start_range      = options[:start_range] || ''
      @end_range        = options[:end_range]   || ''
      @last_modified_at = options[:last_modified_at]
      @etag             = options[:etag]        || ''
      @code             = options[:code]        || 200
      @body             = options[:body]        || ''
      raise ArgumentError, ':uri is required' if @uri.empty?
    end

    # Status code as an Integer (responses supply it as a String).
    def code
      @code.to_i
    end

    # True when a start or end marker restricts the compared region.
    def has_range?
      !start_range.empty? || !end_range.empty?
    end

    def updated?
      @updated || false
    end

    def valid?
      code == 200
    end
  end # Page
end # Enokidu