#!/usr/bin/ruby
#
# This module provides methods to fetch, parse and enrich RSS feeds
#
# Change history
#
# 29.03.2010 erb classes and methods to ease feed parser writing
#
require 'net/http'
require "uri"
require 'timeout'
require 'gdbm'
#
# This class encapsulates DB caching and HTTP retrieval of article pages.
#
# You instantiate a FeedGrabber with at least a unique feed name (used to
# generate the DB cache filename). Then you call getURL with the URL of the
# article to fetch; if it is found in the cache, the cached copy is returned.
# After parsing your feed you should call cleanupDB to get rid of all unused
# entries in the cache. A usage sketch follows the class definition.
#
#module FeedGrabber
class FeedGrabber
# create a FeedGrabber instance (path defaults to the user's home directory)
def initialize(uniqueName, path = nil, retries = 4, depth = 5, timeout = 15)
path = File.expand_path("~") if path.nil?
@dbCacheName = "#{path}/.newsboat/#{uniqueName}.db" # generate db cache filename
@maxRetries = retries
@maxDepth = depth
@timeout = timeout
@usedURLs = Array.new # empty array to hold used URLs
end
#
# try to retrieve a web page, following up to @maxDepth redirects and retrying up to @maxRetries times on timeout
#
def getURL_uncached(url)
result = nil
retries = @maxRetries
begin
Timeout::timeout(@timeout) do
tempurl = url
depth = @maxDepth
while true
raise ArgumentError, "Followed more than #{@maxDepth} redirections. Stopping this nightmare now." if depth == 0
response = Net::HTTP.get_response(URI.parse(tempurl))
case response
when Net::HTTPSuccess then
result = response.body
break
when Net::HTTPRedirection then
tempurl = response['location']
depth -= 1
next # follow redirection
end
end
end
rescue Timeout::Error
retries -= 1
exit 1 if retries < 1
sleep 1
retry
rescue # maybe an ArgumentError or anything the network layer throws
# any other error shall not make any noise (perhaps we should produce a fake RSS item instead)
end
result
end
#
# get url, but create and use a DB cache for each feed
#
def getURL(url)
@usedURLs << url # remember, we used that URL for cleanup later
db = GDBM.new(@dbCacheName)
if db.has_key?(url)
data = db[url] # get cached data from DB
else
# not in DB? fetch it and store it in the DB
data = getURL_uncached(url)
db[url] = data unless data.nil? # GDBM cannot store nil, so don't cache failed fetches
end
db.close
data
end
#
# remove all URLs from the DB cache that were not requested in this run
#
def cleanupDB
toRemove = Array.new
db = GDBM.new(@dbCacheName)
db.each_key do |key|
toRemove << key unless @usedURLs.include?(key)
end
toRemove.each do |url|
db.delete(url)
end
db.close
end
end #class
#end # module
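#
# A minimal usage sketch (not part of the original module): the feed name
# "example" and the article URL below are hypothetical placeholders. It shows
# the call sequence described in the class comment: instantiate a FeedGrabber,
# fetch article pages through the cache, then prune unused cache entries.
#
if __FILE__ == $PROGRAM_NAME
  grabber = FeedGrabber.new("example")            # DB cache: ~/.newsboat/example.db
  html = grabber.getURL("http://www.example.com/article1.html")
  puts html unless html.nil?                      # nil means the fetch failed
  grabber.cleanupDB                               # drop cache entries not fetched this run
end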