forked from opensemanticsearch/open-semantic-etl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathetl_rss.py
executable file
·132 lines (90 loc) · 3.46 KB
/
etl_rss.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import json
import feedparser
import sys
import urllib
from etl_web import Connector_Web
import export_solr
class Connector_RSS(Connector_Web):
    """Connector that indexes the articles linked from an RSS feed.

    The feed is parsed with feedparser. Every entry whose link is not yet
    known to the Solr exporter is handed to Connector_Web.index() for
    download and indexing.
    """

    def __init__(self, verbose=False, quiet=True):
        """Set up the web connector and load the RSS related config files.

        Args:
            verbose: Passed on to Connector_Web.
            quiet: Passed on to Connector_Web and stored as self.quiet.
        """
        Connector_Web.__init__(self, verbose=verbose, quiet=quiet)

        self.quiet = quiet
        self.read_configfiles()

    def read_configfiles(self):
        """Read all known config files, Windows style paths first.

        Both the Windows and the Linux locations are always tried, in the
        order listed below.
        """
        # NOTE(review): presumably read_configfile() ignores files that do
        # not exist, since both path styles are tried on every platform
        # - confirm in Connector_Web.
        config_files = (
            # windows style filenames
            'conf\\opensemanticsearch-connector',
            'conf\\opensemanticsearch-enhancer-ocr',
            'conf\\opensemanticsearch-enhancer-rdf',
            'conf\\opensemanticsearch-connector-web',
            'conf\\opensemanticsearch-connector-rss',
            # linux style filenames
            '/etc/opensemanticsearch/etl',
            '/etc/opensemanticsearch/enhancer-ocr',
            '/etc/opensemanticsearch/enhancer-rdf',
            '/etc/opensemanticsearch/connector-web',
            '/etc/opensemanticsearch/connector-rss',
        )

        for filename in config_files:
            self.read_configfile(filename)

    def index(self, uri):
        """Import an RSS feed and index every article not yet in the index.

        An article is skipped when the exporter returns a truthy last
        modified value for its link. Articles already in the index are not
        checked for changes.

        Args:
            uri: Location of the feed, passed unchanged to feedparser.parse().

        Returns:
            True if every new article was indexed without a reported
            failure, False if Connector_Web.index() returned False or raised
            for at least one article.

        Raises:
            KeyboardInterrupt: Re-raised unchanged so the user can abort.
        """
        result = True

        exporter = export_solr.export_solr()
        feed = feedparser.parse(uri)

        for item in feed.entries:

            # An entry without a link can not be downloaded. Skip it instead
            # of letting the attribute access abort the rest of the feed.
            articleuri = getattr(item, 'link', None)
            if not articleuri:
                if self.verbose:
                    print("Feed entry without link, so skip: {}".format(uri))
                continue

            #
            # Is it a new article or was it indexed in former runs?
            #
            doc_mtime = exporter.get_lastmodified(docid=articleuri)

            if doc_mtime:
                if self.verbose:
                    print("Article indexed before, so skip new indexing: {}".format(articleuri))
                continue

            # Download and index the new uri
            if self.verbose:
                print("Article not in index: {}".format(articleuri))

            try:
                partresult = Connector_Web.index(self, uri=articleuri)
            except KeyboardInterrupt:
                # Bare raise keeps the original traceback.
                raise
            except BaseException as e:
                # Best effort: one broken article must not stop the
                # remaining articles, but the failure has to show up in the
                # return value.
                sys.stderr.write("Exception while getting {} : {}\n".format(articleuri, e))
                result = False
            else:
                # Only an explicit False counts as failure. A missing return
                # value (None) does not.
                if partresult == False:
                    result = False

        return result
#
# If run as a script (not imported as a module), read the parameters and start
#
if __name__ == "__main__":

    # Command line entry point: parse the options, build the connector,
    # apply the optional overrides and index the feed given as argument.

    # TODO: if no protocol is given, use http://

    # get uri or filename from args
    from optparse import OptionParser

    parser = OptionParser("etl-rss [options] uri")
    parser.add_option(
        "-q", "--quiet", dest="quiet", action="store_true", default=None,
        help="Dont print status (filenames) while indexing")
    parser.add_option(
        "-v", "--verbose", dest="verbose", action="store_true", default=None,
        help="Print debug messages")
    parser.add_option(
        "-c", "--config", dest="config", default=False,
        help="Config file")
    parser.add_option(
        "-p", "--plugins", dest="plugins", default=False,
        help="Plugins (comma separated)")
    parser.add_option(
        "-w", "--outputfile", dest="outputfile", default=False,
        help="Output file")

    options, args = parser.parse_args()

    if len(args) != 1:
        parser.error("No uri(s) given")

    connector = Connector_RSS()

    # add optional config parameters
    if options.config:
        connector.read_configfile(options.config)

    if options.outputfile:
        connector.config['outputfile'] = options.outputfile

    # set (or if config overwrite) plugin config
    if options.plugins:
        connector.config['plugins'] = options.plugins.split(',')

    # Both flags default to None, so the connector's own setting is only
    # overwritten when the flag was given on the command line.
    if options.verbose is not None:
        connector.verbose = options.verbose

    if options.quiet is not None:
        connector.quiet = options.quiet

    for feed_uri in args:
        connector.index(feed_uri)