#!/usr/bin/ruby
#
# This module provides methods to fetch, parse and enrich RSS feeds
#
# Change history
#
# 29.03.2010 erb classes and methods to ease feed parser writing
#
require 'net/http'
require "uri"
require 'timeout'
require 'gdbm'
#
# This class encapsulates DB caching and HTTP retrieval of article pages.
#
# You instantiate a FeedGrabber with at least a unique feed name (used to
# generate the DB cache filename). Then you call getURL with the URL of the
# article to fetch; if it is found in the cache, the cached copy is returned.
# After parsing your feed you should call cleanupDB to get rid of all unused
# entries in the cache. A usage sketch follows the class definition.
#
#module FeedGrabber
class FeedGrabber
# create a FeedGrabber instance (path defaults to the user's home directory)
def initialize(uniqueName, path = nil, retries = 4, depth = 5, timeout = 15)
path = File.expand_path("~") if path.nil?
@dbCacheName = "#{path}/.newsboat/#{uniqueName}.db" # generate db cache filename
@maxRetries = retries
@maxDepth = depth
@timeout = timeout
@usedURLs = Array.new # empty array to hold used URLs
end
#
# try to retrieve a web page, following up to @maxDepth redirects and retrying up to @maxRetries times on timeout
#
def getURL_uncached(url)
result = nil
retries = @maxRetries
begin
Timeout::timeout(@timeout) do
tempurl = url
depth = @maxDepth
while true
raise ArgumentError, "Followed more than #{@maxDepth} redirections. Stopping this nightmare now." if depth == 0
response = Net::HTTP.get_response(URI.parse(tempurl))
case response
when Net::HTTPSuccess then
result = response.body
break
when Net::HTTPRedirection then
tempurl = response['location']
depth -= 1
next # follow redirection
end
end
end
rescue Timeout::Error
retries -= 1
exit 1 if retries < 1
sleep 1
retry
rescue # maybe an ArgumentError or anything the network layer throws
# any other error shall not make any noise (perhaps we should produce a fake RSS item instead)
end
result
end
#
# get url, but create and use a DB cache for each feed
#
def getURL(url)
@usedURLs << url # remember, we used that URL for cleanup later
db = GDBM.new(@dbCacheName)
if db.has_key?(url)
data = db[url] # get cached data from DB
else
# not in DB? fetch it and store it in the DB
data = getURL_uncached(url)
db[url] = data unless data.nil? # GDBM cannot store nil, so don't cache failed fetches
end
db.close
data
end
#
# remove all URLs from the DB cache that were not requested in this run
#
def cleanupDB
toRemove = Array.new
db = GDBM.new(@dbCacheName)
db.each_key do |key|
toRemove << key unless @usedURLs.include?(key)
end
toRemove.each do |url|
db.delete(url)
end
db.close
end
end #class
#end # module
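#
# A minimal usage sketch (not part of the original module): the feed name
# "example" and the article URL below are hypothetical placeholders. It shows
# the call sequence described in the class comment: instantiate a FeedGrabber,
# fetch article pages through the cache, then prune unused cache entries.
#
if __FILE__ == $PROGRAM_NAME
  grabber = FeedGrabber.new("example")            # DB cache: ~/.newsboat/example.db
  html = grabber.getURL("http://www.example.com/article1.html")
  puts html unless html.nil?                      # nil means the fetch failed
  grabber.cleanupDB                               # drop cache entries not fetched this run
end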