-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathapp.py
59 lines (49 loc) · 1.9 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import datetime
import json
import time
import urllib
import urllib.error
import urllib.parse
from urllib.request import urlopen

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
# Load the MongoDB credentials from auth.json, which must provide the "user"
# and "password" keys. The context manager closes the file handle as soon as
# the contents have been parsed.
with open('auth.json', encoding='utf-8') as auth_file:
    auth = json.load(auth_file)
USERNAME = auth["user"]
PASSWORD = auth["password"]

# Solr select endpoint fetched on every poll; q=*%3A* is the URL-encoded
# form of the query "*:*".
# NOTE(review): no rows/start parameter is sent, so each poll presumably
# returns only Solr's default first page of results -- confirm this is intended.
URL = "http://solr:8983/solr/nutch/select?indent=true&q.op=OR&q=*%3A*&useParams="
DATABASE = "nutch"
COLLECTION = "nutch"

# Percent-escape the credentials so that reserved characters such as ":",
# "/" or "@" cannot corrupt the connection string. auth.json is therefore
# expected to hold the raw, unescaped values.
uri = (
    "mongodb+srv://"
    + urllib.parse.quote_plus(USERNAME)
    + ":"
    + urllib.parse.quote_plus(PASSWORD)
    + "@sandbox.zepml.mongodb.net/?retryWrites=true&w=majority"
)
client = MongoClient(uri, server_api=ServerApi('1'))
# Connectivity check: issue a "ping" command against the admin database and
# report the outcome. A failure is only printed, never re-raised, so the
# script carries on either way.
try:
    client.admin.command('ping')
except Exception as ping_error:
    print(ping_error)
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")
# Bind the working database and collection. When the database is not yet
# listed by the server, the collection is created explicitly; otherwise the
# existing database is reused as-is.
db = client[DATABASE]
database_is_new = DATABASE not in client.list_database_names()
if database_is_new:
    print("Creating database")
    db.create_collection(COLLECTION)
collection = db[COLLECTION]
if database_is_new:
    print("Database created")
else:
    print("Database already exists")
# strptime patterns for the timestamps returned by Solr, tried in order:
# with and without a fractional-seconds component.
_SOLR_TIMESTAMP_FORMATS = ('%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%SZ')


def _parse_solr_timestamp(raw):
    """Convert a Solr ``tstamp`` value into a naive ``datetime``.

    Args:
        raw: The timestamp string, or a list whose first item is that string.

    Returns:
        The parsed ``datetime.datetime`` (no tzinfo attached).

    Raises:
        ValueError: If the value matches none of the known formats.
    """
    candidate = raw[0] if isinstance(raw, list) and raw else raw
    for pattern in _SOLR_TIMESTAMP_FORMATS:
        try:
            return datetime.datetime.strptime(candidate, pattern)
        except (TypeError, ValueError):
            continue
    raise ValueError('Unrecognised Solr timestamp: {!r}'.format(raw))


def _normalise_document(document):
    """Return a copy of a Solr document prepared for MongoDB.

    The ``tstamp`` field (when present) is parsed into a ``datetime``; every
    other field holding a one-element list is replaced by that element. Field
    order is preserved.

    Raises:
        ValueError: If ``tstamp`` is present but cannot be parsed.
    """
    normalised = {}
    for field, value in document.items():
        if field == 'tstamp':
            normalised[field] = _parse_solr_timestamp(value)
        elif isinstance(value, list) and len(value) == 1:
            normalised[field] = value[0]
        else:
            normalised[field] = value
    return normalised


# Poll Solr forever and copy documents that are not yet stored into MongoDB.
while True:
    # The pause comes first, so the initial poll also waits 30 seconds.
    time.sleep(30)
    print('Trying to connect to Solr')
    try:
        # The context manager closes the HTTP response on every path, so a
        # response object is no longer left open after each poll.
        with urlopen(URL) as response:
            if response.getcode() != 200:
                continue
            print('Successfully connected to Solr')
            data_json = json.loads(response.read())
    except urllib.error.URLError as e:
        print(e.reason)
        continue

    for raw_document in data_json['response']['docs']:
        try:
            document = _normalise_document(raw_document)
        except ValueError as e:
            # One unparseable document must not stop the whole sync.
            print(e)
            continue
        # The whole document doubles as the query filter, so only exact
        # duplicates are skipped.
        if collection.count_documents(document, limit=1) == 0:
            collection.insert_one(document)
    print('Successfully inserted data into MongoDB')