-
Notifications
You must be signed in to change notification settings - Fork 6
/
ocrd-pagetopdf
executable file
·184 lines (157 loc) · 6.17 KB
/
ocrd-pagetopdf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env bash
# Strict mode: abort on unhandled errors, on unset variables, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# showing cmd execution on std (deactivated for productive use)
#set -x

# The ocrd CLI is required for everything below; use the builtin
# `command -v` for the lookup and report the problem on stderr.
command -v ocrd >/dev/null 2>&1 || {
  echo "ocrd not in \$PATH. Panicking" >&2
  exit 1
}

# Absolute path of the directory containing this script (ocrd-tool.json
# and the ptp/ helpers are looked up relative to it).
SHAREDIR="$(cd "$(dirname "$0")" >/dev/null && pwd)"
# Name this script was invoked as, without any leading directory.
SCRIPT_NAME="${0##*/}"
# Value of the MIMETYPE_PAGE constant as reported by the ocrd bashlib.
MIMETYPE_PAGE=$(ocrd bashlib constants MIMETYPE_PAGE)
# Only a unique path name is generated here (-u does not create anything);
# the named pipe itself is created with mkfifo in main.
FIFO=$(mktemp -u)
#######################################
# Abort handler (main installs it as the ERR trap): stop any background
# job, close the FIFO writer on file descriptor 3, remove the FIFO and
# terminate the script with status 1.
# Globals:   FIFO (read) - path of the named pipe to remove
# Arguments: none
# Returns:   never returns; always exits with status 1
#######################################
function backout {
  local pids
  pids=$(jobs -p) || true
  # `kill` without any PID is a usage error, which under `set -e` would end
  # this handler before the FIFO is cleaned up - so only signal real jobs.
  if [[ -n "$pids" ]]; then
    # One PID per word; a job that has already exited is not an error here.
    # shellcheck disable=SC2086
    kill $pids 2>/dev/null || true
  fi
  wait &>/dev/null
  # Close the write end of the FIFO (no-op if it was never opened).
  exec 3>&-
  rm -f -- "$FIFO"
  exit 1
}
#######################################
# Convert every PAGE input file (together with its image) into a PDF via
# ptp/PageToPdf.jar, register the resulting files in the METS through one
# background `ocrd workspace bulk-add` fed over a FIFO, and optionally
# build a multi-page PDF afterwards.
# Globals:   SHAREDIR, SCRIPT_NAME, MIMETYPE_PAGE, FIFO (read);
#            params, ocrd__argv, ocrd__files (read after ocrd__wrap -
#            presumably populated by the sourced bashlib; confirm there)
# Arguments: the command line of the processor, handed on to ocrd__wrap
# Returns:   pages that cannot be converted are logged and skipped;
#            an unhandled error triggers backout (exit 1); a failing
#            bulk-add makes the script exit 1
#######################################
main () {
  # Load ocrd bashlib functions
  # shellcheck source=../core/ocrd/bashlib/lib.bash
  source "$(ocrd bashlib filename)"
  ocrd__minversion 2.58.1

  # Describe calling script to lib.bash
  ocrd__wrap "$SHAREDIR/ocrd-tool.json" "$SCRIPT_NAME" "$@"

  # Parameters -> command line options shared by all PageToPdf calls
  local negative2zero output_extension
  declare -a parameters=()
  if [[ -z "${params['textequiv_level']:=}" ]]; then
    ocrd log warning "If you want to add a text layer, please set parameter 'textequiv_level' accordingly!"
  else
    # first letter is sufficient (case does not matter)
    parameters+=(-text-source "${params['textequiv_level']:0:1}")
  fi
  # Keep the font value as a single word, even if it contains spaces.
  if [[ -n "${params['font']:-}" ]]; then
    parameters+=(-font "${params['font']}")
  fi
  if [[ -n "${params['outlines']:=}" ]]; then
    parameters+=(-outlines "${params['outlines']:0:1}")
  fi
  case "${params['negative2zero']}" in
    [tT]rue|1)
      negative2zero=1
      ;;
    *)
      negative2zero=0
      ;;
  esac
  # script-args is split into separate words on purpose (presumably a
  # whitespace-separated list of extra PageToPdf options).
  # shellcheck disable=SC2206
  parameters+=(${params['script-args']})
  output_extension=${params['ext']}

  cd "${ocrd__argv[working_dir]}"
  local page_id in_file_grp img_file_grp out_file_grp mets
  local -a in_grps
  page_id=${ocrd__argv[page_id]:-}
  mets="$(basename "${ocrd__argv[mets_file]}")"
  # First input fileGrp: PAGE files; optional second one: images.
  IFS=',' read -ra in_grps <<< "${ocrd__argv[input_file_grp]}"
  in_file_grp="${in_grps[0]}"
  img_file_grp="${in_grps[1]:-}"
  if [[ -z "$img_file_grp" ]]; then
    ocrd log warning "Without a second input file group for images, the original imageFilename will be used"
  fi
  out_file_grp=${ocrd__argv[output_file_grp]}

  # Results are not added to the METS one by one; instead one line per PDF
  # ("grp page file url") is written to a FIFO that a single background
  # `ocrd workspace bulk-add` reads and parses with the regex below.
  mkfifo "$FIFO"
  trap backout ERR
  local -a bulk_options workspace_options
  bulk_options=( -r '(?P<grp>[^ ]+) (?P<page>[^ ]+) (?P<file>[^ ]+) (?P<url>.*)' )
  bulk_options+=( -G '{{ grp }}' -m application/pdf -g '{{ page }}' -i '{{ file }}' -S '{{ url }}' )
  if [[ "${ocrd__argv[overwrite]}" == true ]]; then
    bulk_options+=( --force )
  fi
  workspace_options=( -m "${mets}" )
  if [[ -n "${ocrd__argv[mets_server_url]}" ]]; then
    workspace_options+=( -U "${ocrd__argv[mets_server_url]}" )
  fi
  ocrd workspace "${workspace_options[@]}" bulk-add "${bulk_options[@]}" - <"$FIFO" &
  # Remember the reader so that its exit status can be checked at the end.
  local bulk_pid=$!
  # Keep the write end open on fd 3 for the whole loop.
  exec 3>"$FIFO"

  local n tmpfile failure output
  local xml_file xml_id xml_mime cur_page
  local img_file img_mime out_id out_file
  # Download the files and do the conversion
  for ((n = 0; n < ${#ocrd__files[*]}; n++)); do
    # The helper output is word-split into arrays (with several input
    # fileGrps it yields several values). As before, its exit status is not
    # checked here; a missing file is caught by the on-disk test below.
    # shellcheck disable=SC2207
    local -a in_file=($(ocrd__input_file "$n" local_filename))
    # shellcheck disable=SC2207
    local -a in_id=($(ocrd__input_file "$n" ID))
    # shellcheck disable=SC2207
    local -a in_mimetype=($(ocrd__input_file "$n" mimetype))
    # shellcheck disable=SC2207
    local -a in_pageId=($(ocrd__input_file "$n" pageId))
    # First entry of each array belongs to the PAGE fileGrp.
    xml_file=${in_file[0]:-}
    xml_id=${in_id[0]:-}
    xml_mime=${in_mimetype[0]:-}
    cur_page=${in_pageId[0]:-}
    # Copy the shared options word by word (no re-splitting); the
    # ${a[@]+...} form also works for an empty array under `set -u`.
    declare -a options=(${parameters[@]+"${parameters[@]}"})

    ocrd log info "processing page '${cur_page}'"
    if [[ ! -f "${xml_file#file://}" ]]; then
      ocrd log error "input file \"${xml_file#file://}\" (ID=${xml_id} pageId=${cur_page} MIME=${xml_mime}) is not on disk"
      continue
    fi
    mkdir -p "$out_file_grp"
    ocrd log info "found ${xml_mime} file '${xml_file}'"

    if [[ -n "$img_file_grp" ]]; then
      # multiple fileGrps: ocrd__input_file() is n-ary
      # (an absent second entry ends up in the "No image file" error below)
      img_file="${in_file[1]:-}"
      img_mime="${in_mimetype[1]:-}"
    else
      img_file="$xml_file"
      img_mime="$xml_mime"
    fi
    if [[ "$img_mime" == "$MIMETYPE_PAGE" ]]; then
      # we could use xsltproc or xmlstarlet for this
      # (but that would add another dependency)
      img_file=$(python3 "$SHAREDIR/ptp/extract-imagefilename.py" "$img_file")
    fi
    if [[ ! -f "$img_file" ]]; then
      ocrd log error "No image file '$img_file' for $xml_id (pageId=$cur_page)"
      continue
    fi

    # Rework coords in PAGE (done only once the page is known to be
    # convertible, so skipped pages never create a temporary file)
    tmpfile=
    if ((negative2zero)); then
      tmpfile=$(mktemp --tmpdir ocrd-pagetopdf.XXXXXX)
      python3 "$SHAREDIR/ptp/negative2zero.py" "$xml_file" "$tmpfile"
      xml_file=$tmpfile
    fi
    options+=(-xml "$xml_file" -image "$img_file")

    # Output filename: swap the fileGrp name inside the input ID, or fall
    # back to "<output fileGrp>_<zero-padded index>" if that changes nothing
    out_id="${xml_id//$in_file_grp/$out_file_grp}"
    if [[ "$out_id" == "$xml_id" ]]; then
      printf -v out_id '%s_%04d' "$out_file_grp" "$n"
    fi
    out_file="$out_file_grp/${out_id}$output_extension"
    options+=(-pdf "$out_file")

    ocrd log info "found image file '$img_file'"
    failure=
    if ! output=$(java -jar "$SHAREDIR/ptp/PageToPdf.jar" "${options[@]}" 2>&1); then
      failure="PageToPdf failed for ID $xml_id (pageId=$cur_page): $output"
    elif [[ ! -f "$out_file" || ! -s "$out_file" ]]; then
      failure="PageToPdf result is empty for ID $xml_id (pageId=$cur_page): $output"
    fi
    # The temporary PAGE file is no longer needed, whether or not the
    # conversion succeeded.
    if [[ -n "$tmpfile" ]]; then
      rm -f -- "$tmpfile"
    fi
    if [[ -n "$failure" ]]; then
      ocrd log error "$failure"
      continue
    fi

    ocrd log info "adding output PDF file '$out_file'"
    # Add the output file to METS: one line for the background bulk-add
    printf '%s %s %s %s\n' "$out_file_grp" "$cur_page" "$out_id" "$out_file" >&3
  done

  # Closing the writer signals end of input to bulk-add; the path can be
  # removed right away because the reader already has the pipe open.
  exec 3>&-
  rm -f -- "$FIFO"
  # A plain `wait` would discard the status of the background job.
  if ! wait "$bulk_pid"; then
    ocrd log error "adding the PDF files to '$mets' via 'ocrd workspace bulk-add' failed"
    exit 1
  fi

  if [[ -n "${params['multipage']:=}" ]]; then
    python3 "$SHAREDIR/ptp/multipagepdf.py" "$mets" "$out_file_grp" "$page_id" "${params['multipage']}" "${params['pagelabel']}" "${ocrd__argv['overwrite']}"
  fi
}

main "$@"