-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
157 lines (123 loc) · 5.38 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import os
import requests
import json
import xml.etree.ElementTree as ET
# REST endpoints of the Transkribus server used by the functions below.
login_url = 'https://transkribus.eu/TrpServer/rest/auth/login'
create_upload_url = 'https://transkribus.eu/TrpServer/rest/uploads'

# Account settings. Each value is read from the named environment variable
# when it is set, so real credentials do not have to be written into this
# file; otherwise the placeholder is used (change these in that case).
collection_id = os.environ.get("TRANSKRIBUS_COLLECTION_ID", "COLLECTION_ID")
username = os.environ.get("TRANSKRIBUS_USER", "USER_EMAIL")
password = os.environ.get("TRANSKRIBUS_PASSWORD", "USER_PWD")
def login_transkribus(username, password):
    """
    Logs in to the Transkribus API using the provided credentials.

    Args:
        username (str): Your Transkribus account username (email).
        password (str): Your Transkribus account password.

    Returns:
        str: The session ID upon successful login.

    Raises:
        Exception: If the server answers with a status other than 200, or if
            the response contains no (or an empty) ``sessionId`` element.
        requests.exceptions.RequestException: If the request itself cannot be
            completed, e.g. on a connection error or timeout.
    """
    # Without a timeout an unresponsive server would block the script forever.
    response = requests.post(
        login_url,
        data={'user': username, 'pw': password},
        timeout=30,
    )
    if response.status_code != 200:
        raise Exception(f"Login failed: {response.status_code} - {response.text}")

    # The response body is parsed as XML; findtext() returns None instead of
    # raising AttributeError when the <sessionId> child is absent.
    session_id = ET.fromstring(response.text).findtext('sessionId')
    if not session_id:
        raise Exception(f"Login failed: no sessionId in response - {response.text}")
    return session_id
def create_upload(session_id, collection_id, doc_name, pages):
    """
    Creates a new upload in the specified Transkribus collection.

    Args:
        session_id (str): The session ID from the login.
        collection_id (str): The ID of the collection to upload the document to.
        doc_name (str): The name of the document.
        pages (list): A list of pages (image files and metadata) to be uploaded.

    Returns:
        str: The ID of the created upload.

    Raises:
        Exception: If the server answers with a status other than 200, or if
            the response contains no (or an empty) ``uploadId`` element.
        requests.exceptions.RequestException: If the request itself cannot be
            completed, e.g. on a connection error or timeout.
    """
    headers = {'Cookie': f"JSESSIONID={session_id}", 'Content-Type': 'application/json'}
    body = {
        "md": {
            "title": doc_name
        },
        "pageList": {"pages": pages}
    }
    # The collection ID goes through `params` so it is URL-encoded by requests
    # rather than spliced into the URL by hand; the timeout keeps an
    # unresponsive server from blocking the script forever.
    response = requests.post(
        create_upload_url,
        params={'collId': collection_id},
        headers=headers,
        data=json.dumps(body),
        timeout=30,
    )
    if response.status_code != 200:
        raise Exception(f"Failed to create upload: {response.status_code}, {response.text}")

    # The response body is parsed as XML; findtext() returns None instead of
    # raising AttributeError when the <uploadId> child is absent.
    upload_id = ET.fromstring(response.text).findtext('uploadId')
    if not upload_id:
        raise Exception(f"Failed to create upload: no uploadId in response, {response.text}")
    return upload_id
def upload_page(session_id, upload_id, page_data, image_path, xml_path=None):
    """
    Uploads a single page (image plus its XML file) to the created upload.

    The outcome is reported on standard output; the function returns None in
    every case. A page whose XML file is not given or does not exist is
    skipped: nothing is sent to the server for it.

    Args:
        session_id (str): The session ID from the login.
        upload_id (str): The ID of the created upload.
        page_data (dict): Metadata about the page being uploaded; the keys
            'fileName', 'pageXmlName' and 'pageNr' are read.
        image_path (str): The path to the image file.
        xml_path (str, optional): The path to the XML file.

    Raises:
        OSError: If the image (or XML) file cannot be opened.
        requests.exceptions.RequestException: If the request itself cannot be
            completed, e.g. on a connection error or timeout.
    """
    headers = {'Cookie': f"JSESSIONID={session_id}"}

    # Context managers guarantee both handles are closed on every path,
    # including the early return below and a failing request.
    with open(image_path, 'rb') as image_file:
        if not (xml_path and os.path.exists(xml_path)):
            print(f"XML file not found: {xml_path}")
            return

        with open(xml_path, 'rb') as xml_file:
            files = {
                'img': (page_data['fileName'], image_file, 'application/octet-stream'),
                'xml': (page_data['pageXmlName'], xml_file, 'application/octet-stream'),
            }
            # Reuse the module-level uploads endpoint instead of repeating the
            # URL; (connect, read) timeout so a stalled server cannot hang us.
            response = requests.put(
                f'{create_upload_url}/{upload_id}',
                headers=headers,
                files=files,
                timeout=(10, 300),
            )

    if response.status_code == 200:
        print(f"Page {page_data['pageNr']} uploaded successfully.")
    else:
        print(f"Failed to upload page {page_data['pageNr']}: {response.status_code}, {response.text}")
def _collect_pages(filenames):
    """Builds the page metadata list for the '.jpg' files among *filenames*, in sorted order."""
    pages = []
    # Sort filenames to ensure proper page order. Matching on the '.jpg'
    # suffix (case-insensitive) already excludes '.done' marker files.
    for filename in sorted(name for name in filenames if name.lower().endswith('.jpg')):
        base_filename = os.path.splitext(filename)[0]
        pages.append({
            "fileName": filename,
            "pageNr": len(pages) + 1,
            "pageXmlName": f"{base_filename}.xml"
        })
    return pages


def process_directory(base_dir):
    """
    Processes a directory of documents and uploads their pages to Transkribus.

    Every directory below base_dir that contains at least one '.jpg' file is
    uploaded as one document named after the directory. Login credentials and
    the target collection are taken from the module-level `username`,
    `password` and `collection_id`; a fresh login is performed per document.

    Args:
        base_dir (str): The base directory containing documents to be uploaded.

    Directory Structure:
        base_dir/
        └── document_name/
            ├── image1.jpg
            ├── image2.jpg
            └── page/
                ├── image1.xml
                └── image2.xml
    """
    for dirpath, dirnames, filenames in os.walk(base_dir):
        if dirpath == base_dir:
            continue

        # A document's 'page' subfolder only holds the XML files; prune it so
        # the walk does not report it as a document of its own.
        if "page" in dirnames:
            dirnames.remove("page")

        doc_name = os.path.basename(dirpath)
        print(f"Processing directory {doc_name}...")

        pages = _collect_pages(filenames)
        if not pages:
            continue

        session_id = login_transkribus(username, password)
        upload_id = create_upload(session_id, collection_id, doc_name, pages)
        for page in pages:
            image_path = os.path.join(dirpath, page['fileName'])
            xml_path = os.path.join(dirpath, "page", page['pageXmlName'])
            upload_page(session_id, upload_id, page, image_path, xml_path)
if __name__ == "__main__":
    import sys

    # An optional first command-line argument names the directory to upload;
    # without it the placeholder path below is used (change it in that case).
    base_dir = sys.argv[1] if len(sys.argv) > 1 else 'PATH/TO/DIRECTORY'
    process_directory(base_dir)