require 'rubygems'
require 'hpricot'
require 'open-uri'

module Hatena
  module Haiku
    # A single post scraped from a Hatena Haiku listing page.
    #
    # All attributes are plain values taken from the options hash given to
    # the constructor; none of them is validated.
    class Entry
      # Base URI that #icon_uri appends the user-specific path to.
      ICON_SERVER_URI = 'http://www.hatena.ne.jp/users/'

      attr_accessor :id, :keyword, :permalink, :timestamp, :body, :source

      # @param options [Hash] attribute values keyed by :id, :keyword,
      #   :permalink, :timestamp, :body and :source; missing keys leave the
      #   corresponding attribute nil.
      def initialize(options = {})
        @id        = options[:id]
        @keyword   = options[:keyword]
        @permalink = options[:permalink]
        @timestamp = options[:timestamp]
        @body      = options[:body]
        @source    = options[:source]
      end

      # Builds the profile icon location for this entry's author:
      # ICON_SERVER_URI + first two characters of the id + id + "profile.gif".
      #
      # @return [String]
      # @raise [NoMethodError] if +id+ is nil.
      def icon_uri
        File.join(ICON_SERVER_URI, id[0..1], id, "profile.gif")
      end
    end

    # Walks the paginated listing at a given URI and turns every post it finds
    # into an Entry.
    class Fetcher
      # Root that relative permalinks found in the markup are joined onto.
      SERVER_ROOT_URI = 'http://h.hatena.ne.jp/'

      # @param options [Hash]
      #   :interval            seconds to sleep between page requests (default 3)
      #   :debug               print progress messages to stdout (default false)
      #   :max_entries_of_page upper index of the entry slice taken from each
      #                        page (default 20)
      def initialize(options = {})
        @interval            = options[:interval] || 3
        @debug               = options[:debug] || false
        @max_entries_of_page = options[:max_entries_of_page] || 20
        @entries             = []
        # Only entries strictly older than this moment are accepted; the value
        # moves backwards as entries are accepted (see #fetch_page).
        @latest_fetched_at   = Time.now
      end

      # Fetches page after page, starting at +start_page+, until a page
      # contributes no new entry.
      #
      # The accepted entries accumulate in the fetcher across calls, so the
      # returned array also contains entries from earlier #fetch calls on the
      # same instance.
      #
      # @param uri [String] listing URI; "?page=N" is appended to it.
      # @param start_page [Integer] first page number to request.
      # @yield [entry] each accepted Entry, in the order it is found.
      # @return [Array<Entry>] every entry accepted so far.
      def fetch(uri, start_page = 1, &block)
        page = start_page

        loop do
          page_entries = fetch_page(uri + "?page=#{page}", &block)
          break if page_entries.empty?

          debug_print "page #{page} was fetched."
          # += (not concat) so arrays returned by earlier calls are not mutated.
          @entries += page_entries
          page += 1
          sleep @interval
        end

        debug_print "Total; #{@entries.size} entries."
        @entries
      end

      private

      # Prints +content+ to stdout when the fetcher was built with :debug.
      def debug_print(content)
        puts content if @debug
      end

      # Downloads and parses one listing page.
      #
      # Download and HTML parsing errors propagate to the caller. Errors raised
      # while walking the entries (including errors raised by the caller's
      # block) abort the rest of that page only: the entries collected up to
      # that point are kept. This mirrors the previous best-effort behaviour,
      # but the error is now reported in debug mode instead of vanishing.
      #
      # @param page_uri [String] fully built page URI.
      # @return [Array<Entry>] entries accepted from this page (possibly empty).
      def fetch_page(page_uri)
        accepted = []
        document = Hpricot(open_page(page_uri))

        begin
          # Index 0 is skipped on purpose, as before -- presumably the first
          # div.entry is not a real post (TODO confirm against the markup).
          # Slicing past the end yields nil, which simply means "no entries".
          nodes = document.search('div.entries/div.entry')[1..@max_entries_of_page] || []

          nodes.each do |node|
            # Stop at the first block that mentions 'google_afc'.
            break if node.inner_html.include?('google_afc')

            entry = build_entry(node)
            # Accept only entries strictly older than the last accepted one.
            # NOTE(review): this also drops posts that share a timestamp with
            # the previously accepted post.
            next if entry.timestamp >= @latest_fetched_at

            @latest_fetched_at = entry.timestamp
            yield entry if block_given?
            accepted << entry
          end
        rescue StandardError => e
          debug_print "page aborted: #{e.class}: #{e.message}"
        end

        accepted
      end

      # Opens +page_uri+ through open-uri.
      #
      # URI.open is used when open-uri provides it, because Kernel#open lost
      # its URI support in Ruby 3.0; older interpreters fall back to
      # Kernel#open. Likewise #untaint is only called where it still exists
      # (it was removed in Ruby 3.2).
      def open_page(page_uri)
        target = page_uri.respond_to?(:untaint) ? page_uri.untaint : page_uri
        URI.respond_to?(:open) ? URI.open(target) : open(target)
      end

      # Extracts the fields of one div.entry node into an Entry.
      #
      # @param node [Hpricot::Elem] a single entry element.
      # @return [Entry]
      def build_entry(node)
        id        = node.search('span.username/a').inner_html
        keyword   = node.search('h2.title/a').last.inner_html
        time_link = node.search('span.timestamp/a').first
        permalink = File.join(SERVER_ROOT_URI, time_link[:href])
        # The link text is split on '-', ' ' and ':' and handed to Time.local;
        # assumes a "YYYY-MM-DD hh:mm:ss"-like text -- TODO confirm.
        timestamp = Time.local(*time_link.inner_html.split(/[- :]/))
        body      = node.search('div.body').inner_html.strip
        source    = node.search('span.source/a').inner_html

        Entry.new(
          :id        => id,
          :keyword   => keyword,
          :permalink => permalink,
          :timestamp => timestamp,
          :body      => body,
          :source    => source
        )
      end
    end # class Fetcher
  end # module Haiku
end # module Hatena

# Command-line driver: ruby <this file> [uri] [interval_seconds]
if __FILE__ == $0
  require 'pp'

  uri      = ARGV.shift || 'http://h.hatena.ne.jp/id/trashsuite/'
  interval = ARGV.shift || 3

  fetcher = Hatena::Haiku::Fetcher.new(
    :interval => interval.to_i,
    :debug    => true
  )

  entries = fetcher.fetch(uri) do |entry|
    print "#{entry.timestamp}::#{entry.source}:: "
    puts entry.body[0..100]
  end

  last_entry = entries.last
  pp last_entry
  # Nothing may have been fetched; avoid calling icon_uri on nil.
  pp last_entry.icon_uri if last_entry
end