Skip to content

Commit ecddeec

Browse files
author
Jan Dobiasovsky
committed
Initial commit
1 parent 1527f73 commit ecddeec

File tree

8 files changed

+206
-0
lines changed

8 files changed

+206
-0
lines changed

install/install.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/usr/bin/env bash
# Set up the working folders and seed them with the sample configuration and
# the sample data used for testing.
# All paths are relative to the current working directory
# (presumably the install/ folder -- TODO confirm).

# -p keeps the script re-runnable: existing folders are not an error.
mkdir -p configuration
mkdir -p data
cp ./../samples/kramerius_in_856_latest.csv ./data
# The sample configuration is shipped in samples/, next to the sample data.
cp ./../samples/config_sample.json ./configuration

metadator.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
"""Tool for matching system numbers with their uuids in digital library."""
2+
from modules import config, pull, reports
3+
import csv
4+
5+
6+
def process_data(configuration, counter):
    """Find indexed documents missing from the CSV and look up their sysno.

    The uuids already known are read from the second column of
    ``./data/kramerius_in_856_latest.csv``. The document index is pulled with
    ``pull.oai_index`` and, for every indexed ``PID`` that is not in the CSV,
    ``pull.fedora_record_identif`` is asked for the system number.

    Args:
        configuration: Mapping with a ``TARGETS`` section (``OAI``,
            ``FEDORA``) and an ``AUTH`` section (``FEDORA_USER``,
            ``FEDORA_PASS``).
        counter: Object providing ``add(kind)`` and ``report()``; it receives
            ``'total'``, ``'resolved'`` and ``'unresolved'``.

    Returns:
        dict: ``{'ok': {uuid: sysno}, 'unresolved': {uuid: None}}``.
    """
    # A set gives O(1) membership tests; a list is rescanned per document.
    known_uuids = set()
    # newline='' is what the csv module expects from files handed to it.
    with open('./data/kramerius_in_856_latest.csv', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            # Blank or truncated rows have no uuid column; skip them rather
            # than fail with IndexError.
            if len(row) > 1:
                known_uuids.add(row[1])

    results = {'ok': {}, 'unresolved': {}}

    oai_target = pull.Site(configuration['TARGETS']['OAI'],
                           user=None, passw=None)
    fedora_target = pull.Site(location=configuration['TARGETS']['FEDORA'],
                              user=configuration['AUTH']['FEDORA_USER'],
                              passw=configuration['AUTH']['FEDORA_PASS'])
    index = pull.oai_index(oai_target.location)

    for doc in index['response']["docs"]:
        pid = doc['PID']
        if pid in known_uuids:
            continue

        print('Lookup system number for ' + pid)
        sysno = pull.fedora_record_identif(fedora_target.location,
                                           fedora_target.user,
                                           fedora_target.passw,
                                           uuid=pid)
        if sysno is None:
            counter.add('unresolved')
            results['unresolved'][pid] = None
        else:
            counter.add('resolved')
            results['ok'][pid] = sysno

        # 'total' counts every missing document, resolved or not.
        counter.add('total')

    counter.report()
    return results
44+
45+
46+
def write_outfile(results):
    """Append two export lines per uuid to ./data/856_kramerius_export.txt.

    Entries of ``results['ok']`` (uuid -> system number) are written first,
    prefixed with their system number; entries of ``results['unresolved']``
    follow, prefixed with the literal placeholder ``SYSNO``.
    """
    link_head = ' 85640 L $$uhttps://kramerius.techlib.cz/search/handle/'
    link_tail = '$$yDigitalizovany dokument\n'
    base_line = ' BAS L di\n'

    def record(prefix, uuid):
        # The link line and the base line for one uuid, in output order.
        return [prefix + link_head + uuid + link_tail, prefix + base_line]

    with open('./data/856_kramerius_export.txt', 'a') as outfile:
        for uuid, sysno in results['ok'].items():
            outfile.writelines(record(str(sysno), uuid))
        for uuid, _unused in results['unresolved'].items():
            outfile.writelines(record('SYSNO', uuid))
60+
61+
62+
if __name__ == '__main__':
    # Script entry point: load the settings, run the matching with a fresh
    # counter, then write whatever was found to the export file.
    settings = config.load_config("./configuration/config.json")
    tally = reports.Counter()
    write_outfile(process_data(settings, tally))

modules/config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import json
2+
3+
4+
def load_config(path):
    """Read the JSON configuration file and return its contents.

    Args:
        path: Location of the JSON configuration file.

    Returns:
        The deserialized JSON document (a dictionary for an object at the
        top level).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the content is not valid UTF-8 or not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's
    # locale-dependent default encoding.
    with open(path, encoding='utf-8') as configfile:
        return json.load(configfile)

modules/pull.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""Utilities for pulling data."""
2+
import requests
3+
import xml.etree.ElementTree as ET
4+
import re
5+
6+
7+
class Site:
    """Address and login details of one remote site used by the tool."""

    def __init__(self, location, user, passw):
        """Keep the given location, user and password on the instance."""
        self.location, self.user, self.passw = location, user, passw
15+
16+
17+
def oai_index(target):
    """Pull the Solr index of monographs and periodicals as a dictionary.

    Two requests are made: the first only reads ``response.numFound``, the
    second asks for that many rows so the whole index comes back in one
    response.

    Args:
        target: URL of the Solr select handler. A trailing ``?`` or an
            existing query string is fine; the parameters are merged in by
            ``requests``.

    Returns:
        dict: The decoded JSON body of the second request.

    Raises:
        requests.HTTPError: If either request returns an error status.
    """
    # Passing the query through ``params`` builds both URLs the same way;
    # hand-concatenating a '?' in one request and not the other leaves one
    # of the two URLs malformed.
    query = {
        'fl': 'PID',
        'indent': 'on',
        'q': 'fedora.model:monograph OR fedora.model:periodical',
        'wt': 'json',
    }

    # Probe request: checks that the target is reachable and reads the
    # number of matching documents.
    probe = requests.get(target, params=query)
    probe.raise_for_status()
    num_found = probe.json()['response']['numFound']

    # Full request: same query, with rows raised to cover every match.
    index = requests.get(target, params=dict(query, rows=num_found))
    index.raise_for_status()
    return index.json()
26+
27+
28+
def fedora_record_identif(target, user, passw, uuid):
    """Request an object's XML by uuid and try to extract its system number.

    The XML is fetched from ``target + uuid + '/objectXML'`` using HTTP basic
    authentication. The first MODS ``recordIdentifier`` element whose text
    starts with nine digits is taken as the system number.

    Args:
        target: Base URL that the uuid is appended to.
        user: User name for HTTP basic authentication.
        passw: Password for HTTP basic authentication.
        uuid: Identifier of the object to look up.

    Returns:
        The text of the matching element, or None when the request fails,
        the XML cannot be parsed, or no matching identifier exists.
    """
    r = requests.get(target + uuid + '/objectXML',
                     auth=requests.auth.HTTPBasicAuth(user, passw))
    if r.status_code != 200:
        # An error response carries no object XML, so parsing is skipped.
        print("Something went wrong... (error code: ", r.status_code, ")")
    else:
        try:
            root = ET.fromstring(r.content)
        except ET.ParseError as err:
            print("Unable to parse xml...")
            print("Reason: \n", err)
        else:
            tag = '{http://www.loc.gov/mods/v3}recordIdentifier'
            for element in root.iter(tag):
                text = element.text
                # An empty element has text None, which re.match rejects
                # with TypeError; treat it as "no match".
                if text and re.match(r'\d{9}', text):
                    print("Sysno found: ", text)
                    return text
    print("Sysno not found...")
    return None

modules/reports.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
"""Things for generating output for user."""
2+
3+
4+
class Counter:
    """Keep three tallies (total, resolved, unresolved) and print them."""

    # Names accepted by add() and get(); each one is also the name of the
    # attribute that holds the tally.
    _KINDS = ('total', 'resolved', 'unresolved')

    def __init__(self):
        """Start all three tallies at zero."""
        self.total = self.resolved = self.unresolved = 0

    def add(self, type):
        """Increase the tally named by ``type`` by one; ignore other names."""
        if type in self._KINDS:
            setattr(self, type, getattr(self, type) + 1)

    def get(self, type):
        """Return the tally named by ``type``, or None for any other name."""
        return getattr(self, type) if type in self._KINDS else None

    def report(self):
        """Print the three tallies between two separator lines."""
        separator = '=' * 50
        print(separator)
        for label, value in (('Missing total: ', self.total),
                             ('Missing resolved: ', self.resolved),
                             ('Missing unresolved: ', self.unresolved)):
            print(label, value)
        print(separator)

samples/config_sample.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"AUTH": {
3+
"FEDORA_USER": "fedoraAdministrator",
4+
"FEDORA_PASS": "adminpass1234"
5+
},
6+
"TARGETS": {
7+
"FEDORA": "http://localhost:8080/fedora/objects/",
8+
"OAI": "http://localhost:8983/solr/kramerius/select?"
9+
}
10+
}

samples/export_file.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
000639762 85640 L $$uhttps://kramerius.techlib.cz/search/handle/uuid:fa767277-d519-448b-87fa-bc01f30e38dc$$yDigitalizovaný dokument
2+
000639762 BAS L di
3+
000912370 85640 L $$uhttps://kramerius.techlib.cz/search/handle/uuid:90efc712-93e8-40df-bcb7-ae705f061c56$$yDigitalizovaný dokument
4+
000912370 BAS L di
5+
000910984 85640 L $$uhttps://kramerius.techlib.cz/search/handle/uuid:7c04861c-c199-48ab-8d7b-1f1e74c42980$$yDigitalizovaný dokument
6+
000910984 BAS L di
7+
000666530 85640 L $$uhttps://kramerius.techlib.cz/search/handle/uuid:d26016b8-1d2b-416b-a1c1-328d8171cbdd$$yDigitalizovaný dokument
8+
000666530 BAS L di
9+
000912373 85640 L $$uhttps://kramerius.techlib.cz/search/handle/uuid:c9359e58-c786-41d3-8a27-1f81f8bdfa69$$yDigitalizovaný dokument
10+
000912373 BAS L di
11+
000910972 85640 L $$uhttps://kramerius.techlib.cz/search/handle/uuid:78fa4d61-383d-4e09-b131-74f107edb963$$yDigitalizovaný dokument
12+
000910972 BAS L di
samples/kramerius_in_856_latest.csv

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
000666395,uuid:105f4c40-faf4-11e0-be50-0800200c9a66
2+
000612507,uuid:2b2d5518-a5ff-4053-98c4-d812b0f4a165
3+
000003579,uuid:31eb3ce0-feea-11e4-b939-0800200c9a66
4+
000681797,uuid:5a03d779-bed8-11e1-a154-001b63bd97ba
5+
000631513,uuid:12fa6296-6340-11df-b3e5-0050568253d9
6+
000678226,uuid:42eb1a80-3e4f-11e4-916c-0800200c9a66
7+
000653579,uuid:4e0849d0-7ff4-11e2-9e96-0800200c9a66
8+
000649610,uuid:bb8d2ca0-abd4-43d5-8549-3c93667f69b8
9+
000654111,uuid:e9f1588c-2c5a-4690-8039-422fb2cd69d9
10+
000654303,uuid:ef9e0fc3-2d39-44f6-8550-a5f679a176ba
11+
000654128,uuid:001ced00-8708-11e2-9e96-0800200c9a66
12+
000051672,uuid:008d1630-3087-11e1-b86c-0800200c9a66
13+
000006952,uuid:00931210-02b6-11e5-b939-0800200c9a66
14+
000609658,uuid:0095bca0-614f-11e2-bcfd-0800200c9a66
15+
000682483,uuid:00ac6580-375a-11e4-8510-0800200c9a66
16+
000170553,uuid:00d50512-d56a-4053-a3f4-cca082a2293d
17+
000653515,uuid:016d5e70-92bb-11e1-b0c4-0800200c9a66
18+
000661270,uuid:018b2c90-7bea-11e5-a837-0800200c9a66
19+
000121146,uuid:018cb344-3355-48f1-9820-9217c6f2cd59
20+
000171926,uuid:01cf39cd-c5f9-43bc-9b67-8751cb87e771
21+
000074880,uuid:01d1e320-b042-11e4-ab27-0800200c9a66
22+
000239221,uuid:fc4d3050-7ad4-11e4-82f8-0800200c9a66

0 commit comments

Comments
 (0)