-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathMokuro2Pdf.rb
406 lines (405 loc) · 23.2 KB
/
Mokuro2Pdf.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
require 'prawn'
require 'json'
require 'mini_magick'
require 'fileutils'
require 'optparse'
require 'find'
# Command-line interface.
# Every recognised switch stores its value in `options` under a symbol key;
# switches that were not passed leave no key behind, so later code can use
# `options.key?` to tell "not given" from an explicit value.
options = {}
parser = OptionParser.new do |opt|
  opt.banner = "Usage: Mokuro2Pdf.rb [options]"
  opt.on("-i IMAGES", "--imageFolder IMAGES", "Folder containing all manga pages") do |dir|
    options[:imageFolder] = dir
  end
  opt.on("-n NAME", "--name NAME", "Filename the created pdf will have") do |name|
    options[:filename] = name
  end
  # Float coercion makes OptionParser reject non-numeric values instead of
  # letting String#to_f silently turn them into 0.0.
  opt.on("-g GAMMA", "--gamma GAMMA", Float, "Gamma value to be used on all images, default = 1") do |gamma|
    options[:gamma] = gamma
  end
  # The default shown here is a placeholder: the image folder is not known yet
  # while the option table is being built, so it cannot be interpolated.
  opt.on("-o OCRFOLDER", "--ocr OCRFOLDER", "Folder containing all manga pages's ocr data, default = _ocr/<IMAGES folder name>") do |dir|
    options[:ocrFolder] = dir
  end
  opt.on("-p PARENTimgFOLDER", "--parent_img_folder PARENTimgFOLDER", "Folder containing all volumes's images folders") do |dir|
    options[:parentImg] = dir
  end
  opt.on("-q PARENTocrFOLDER", "--parent_ocr_folder PARENTocrFOLDER", "Folder containing all volumes's ocr data folders") do |dir|
    options[:parentOcr] = dir
  end
  # The script falls back to 0 when -f is omitted, so the help text says 0.
  opt.on("-f FONT_TRANSPARENCY", "--font_transparency FONT_TRANSPARENCY", Float, "Selectable text's transparency, default = 0") do |alpha|
    options[:fontTransparency] = alpha
  end
  opt.on("-w OUTPUT_FOLDER", "--write_to OUTPUT_FOLDER", "Output folder") do |dir|
    options[:outputFolder] = dir
  end
  opt.on("-u", "--upscale_on", "Turn on image upscaling if image resolution < Kindle's resolution") do
    options[:upscale] = true
  end
  opt.on("-c", "--convert", "Convert all images to JPGs to reduce the generated pdf's file size") do
    options[:convert] = true
  end
  opt.on("-s", "--sort_natural", "Use natural sorting for filenames, might break 'properly' named files so use only when pdf pages are placed out of order") do
    options[:nsorting] = true
  end
end
begin
  parser.parse!
rescue OptionParser::ParseError => e
  # Unknown switches, missing arguments and non-numeric -g/-f values end up
  # here: show the problem and the usage text, then exit with a failure status.
  abort "#{e.message}\n\n#{parser.help}"
end
# Startup banner, then defaults for every option the user left out.
# Each setting is reported only when it was given explicitly, except gamma
# and font transparency, whose fallback value is reported as well.
puts ""
puts "Mokuro2Pdf"

gammaGiven = options.key?(:gamma)
puts(gammaGiven ? "Using the defined #{options[:gamma]} gamma value" : "Using the default(1) gamma value")
options[:gamma] = 1 unless gammaGiven

transparencyGiven = options.key?(:fontTransparency)
puts(transparencyGiven ? "Using the defined #{options[:fontTransparency]} font transparency" : "Using the default(0) font transparency")
options[:fontTransparency] = 0 unless transparencyGiven

if options.key?(:outputFolder)
  puts "Using the defined #{options[:outputFolder]} output folder"
  # The folder is later glued directly in front of the file name, so make
  # sure it ends with a path separator.
  options[:outputFolder] += '/' unless options[:outputFolder].match?(/[\\\/]$/)
else
  options[:outputFolder] = ""
end

# Boolean switches: announce the ones that are on, store false for the rest.
{
  upscale: "Upscale on",
  convert: "JPG conversion on",
  nsorting: "Using Natural Sorting"
}.each do |flag, notice|
  if options.key?(flag)
    puts notice
  else
    options[flag] = false
  end
end
# Discover the volumes to convert and describe each one in `folders`.
# Every entry is an array of four items:
#   [0] page image paths in reading order
#   [1] hash mapping a page key ("<volume folder>/.../<page name>", no
#       extension) to the path of its .json ocr file
#   [2] info hash (Title/Author) for the pdf
#   [3] name of the volume's image folder
# Without -p a single volume is read from -i; with -p every sub-folder of the
# parent image folder is treated as one volume.
folders = []

# File extensions accepted as page images (case-insensitive).
pageExtPattern = /\.(jpg|jpeg|jpe|jif|jfif|jfi|png|gif|webp|tiff|tif|psd|raw|arw|cr2|nrw|k25|bmp|dib|jp2|j2k|jpf|jpx|jpm|mj2|avif)$/i
# Digits that sit directly in front of the image extension.
pageNumberPattern = /\d+?(?=#{pageExtPattern.source})/i

# Last component of a path (a trailing separator is ignored); the path itself
# when it contains no separator in front of that component.
lastComponent = ->(path) { path[/(?<=\\|\/)[^\\\/]{1,}?(?=$|[\\\/]$)/] || path }

# Strips bracketed tags and "DLraw" prefixes from a folder name to get a title.
cleanTitle = lambda do |name|
  name.gsub(/^([\[\(【].*?[\]\)】])+|(DLraw.*?[\]\)】])+|(DLraw.*?[\-_])+|(?<=\s)([\[\(].*?[\]\)](?=\s|$))+|【.*】【.*?】/i, "").strip
end

# Every image file below `dir`, in traversal order. Raises when `dir` is missing.
collectPages = lambda do |dir|
  found = []
  Find.find(dir) { |entry| found << entry if entry.match?(pageExtPattern) }
  found
end

# Reading order: plain string sort, or (with -s) numeric sort on the number in
# front of the extension. Names without such a number sort as 0 instead of
# raising, and the full path breaks ties so the order is deterministic.
sortPages = lambda do |paths|
  if options[:nsorting]
    paths.sort_by { |entry| [entry[pageNumberPattern].to_i, entry] }
  else
    paths.sort
  end
end

if options.key?(:parentImg)
  volumesImg = Dir.glob("*", base: options[:parentImg]).sort
  volumesImg = volumesImg.select { |item| File.directory?("#{options[:parentImg]}/#{item}") }
  puts "#{volumesImg.length} folders found on '#{options[:parentImg]}/'\n"
  volumesImg.each do |volume|
    begin
      pages = collectPages.call("#{options[:parentImg]}/#{volume}")
      jsons = {}
      Find.find("#{options[:parentOcr]}/#{volume}") do |path|
        next unless path =~ /.*\.json$/i
        # Key = path from the volume folder name up to (not including) ".json".
        jsons[path.match(/#{Regexp.escape(volume)}.*?(?=\.json$)/i)[0]] = path
      end
      if pages.length > 0 && jsons.length > 0
        puts "\t#{volume} - #{pages.length} Pages found, #{jsons.length} Jsons found\n"
        puts "\t\tWARNING - Pages and Jsons numbers don't match" if pages.length != jsons.length
        info = { Title: cleanTitle.call(volume), Author: "MKR2PDF" }
        folders.append([sortPages.call(pages), jsons, info, volume])
      else
        puts "\t#{volume} - #{pages.length} Pages found, #{jsons.length} Jsons found. Skipping folder\n"
      end
    rescue StandardError => e
      # Typically a missing image/ocr folder; the reason is shown so the
      # user can tell which one.
      puts "\t#{volume} - No Pages/Jsons found, skipping folder (#{e.message})"
    end
  end
elsif !options.key?(:imageFolder)
  puts "No input folder given, use -i IMAGES or -p PARENTimgFOLDER"
else
  volumeImg = lastComponent.call(options[:imageFolder])
  options[:filename] = volumeImg if !options.key?(:filename) || options[:filename] == ''
  puts "Converting '#{options[:imageFolder]}/' to '#{options[:filename]} - MKR2PDF.pdf'"
  begin
    if options.key?(:ocrFolder)
      puts "Using the defined '#{options[:ocrFolder]}/' ocr folder path"
      volumeOcr = lastComponent.call(options[:ocrFolder])
    else
      puts "Using the default '_ocr/#{volumeImg}/' ocr folder path"
      options[:ocrFolder] = "_ocr/#{volumeImg}"
      volumeOcr = volumeImg
    end
    pages = collectPages.call(options[:imageFolder])
    jsons = {}
    Find.find(options[:ocrFolder]) do |path|
      next unless path =~ /.*\.json$/i
      key = path.match(/#{Regexp.escape(volumeOcr)}.*?(?=\.json$)/i)[0]
      # Swap the leading ocr folder name for the image folder name so the key
      # lines up with the keys derived from the page paths. The block form
      # keeps backslashes in the folder name from being read as
      # back-references.
      jsons[key.gsub(/^.*?(?=[\\\/])/) { volumeImg }] = path
    end
    puts "#{pages.length} Pages found"
    puts "#{jsons.length} Jsons found"
    puts "\tWARNING - Pages and Jsons numbers don't match" if pages.length != jsons.length
    info = { Title: cleanTitle.call(options[:filename]), Author: "MKR2PDF" }
    folders.append([sortPages.call(pages), jsons, info, volumeImg])
  rescue StandardError => e
    puts "No Pages/Jsons found (#{e.message})"
  end
end
# Render every discovered volume to "<output folder><title> - MKR2PDF.pdf".
# Each page image becomes one pdf page of the same size; when ocr data exists
# for the page, its text is drawn on top with the configured transparency so
# it can be selected. Failures are reported per volume and do not stop the
# remaining volumes.
#
# Ocr json keys read here: "img_width", "img_height" and "blocks"; per block
# "box", "vertical", "font_size", "lines" and "lines_coords". Each
# lines_coords entry is used as four [x, y] corners in the order top-left,
# top-right, bottom-right, bottom-left, with y measured from the top of the page.
#
# Text patterns accept both the ASCII and the full-width form of a character.

# File extensions accepted as page images (case-insensitive).
pageImageExt = /\.(jpg|jpeg|jpe|jif|jfif|jfi|png|gif|webp|tiff|tif|psd|raw|arw|cr2|nrw|k25|bmp|dib|jp2|j2k|jpf|jpx|jpm|mj2|avif)$/i
# Turns "..." into "…" and ".." into "‥", drops lone periods and removes all
# whitespace. Periods are matched through a character class on purpose: a
# bare `.` in a pattern matches any character and would blank out the line.
squashText = lambda do |text|
  text.gsub(/[.．]{3}/, "…").gsub(/[.．]{2}/, "‥").gsub(/[.．]/, "").gsub(/\s/, "")
end
# Trailing full stops / commas.
trailingStops = /[。\.．、,，]+$/
# Trailing punctuation that is never drawn.
trailingMarks = /[。\.．、,，…‥!！?？:：~～]+$/
# Punctuation that takes a full cell in vertical text without being drawn.
pauseMark = /[。\.．、,，…‥!！?？:：~～]/
# Brackets and quotes counted as 0.8 of a character when measuring vertical text.
measuredBrackets = /[《『「(\[\{（〔［｛〈【<＜≪”"“゛″〝〟＂≫>＞】〉｝］〕）\}\])」』》]/
# Brackets that take 0.8 of a cell (and are not drawn) in vertical text.
drawnBrackets = /[《『「\(\[\{（〔［｛〈【<＜≪≫>＞】〉｝］〕）\}\]\)」』》]/
# Top edge of a line: the smaller y of its two top corners.
lineTopOf = ->(quad) { [quad[0][1], quad[1][1]].min }

for folder in folders do
  begin
    # Set once a temporary image has been written, so that "tmp" is removed
    # exactly when this volume created files in it.
    tmpUsed = false
    pages = folder[0]
    jsons = folder[1]
    info = folder[2]
    puts "\nProcessing #{info[:Title]}..."
    if pages.empty?
      # Without pages no document is ever created, so there is nothing to render.
      puts "No pages to convert, skipping"
      next
    end
    for i in 0...pages.length do
      # Same key shape as the json hash: path from the volume folder name up
      # to (not including) the extension.
      imagePath = pages[i].match(/(#{Regexp.escape(folder[3])}).*?(?=#{pageImageExt.source})/i)[0]
      imageExt = pages[i].match(pageImageExt)
      # Any extension other than jpg/jpeg/png switches JPG conversion on; the
      # switch stays on for every following page and volume.
      options[:convert] = true if !imageExt[0].match?(/jpg|jpeg|png/i)
      if jsons.include? imagePath
        has_Json = true
        page = JSON.parse(File.read(jsons[imagePath]))
        pageWidth = page["img_width"]
        pageHeight = page["img_height"]
      else
        puts "!!No ocr data found for page #{i+1}!!"
        has_Json = false
        page = MiniMagick::Image.open(pages[i])
        pageWidth = page[:width]
        pageHeight = page[:height]
      end
      if options[:upscale] || options[:gamma] != 1 || options[:convert]
        # The image has to be modified: work on a copy written to tmp/.
        FileUtils.mkdir_p "tmp"
        tmpUsed = true
        pageBgMagick = MiniMagick::Image.open(pages[i])
        upscale = 1
        pageBgMagick.gamma options[:gamma] if options[:gamma] != 1
        if options[:upscale]
          pageRes = [pageBgMagick[:width], pageBgMagick[:height]]
          # Grow the factor in 0.25 steps until the page is at least 1016x1358.
          while (pageRes[0] * upscale).to_i < 1016 || (pageRes[1] * upscale).to_i < 1358
            upscale += 0.25
          end
          pageBgMagick.scale "#{(pageRes[0] * upscale).to_i}x#{(pageRes[1] * upscale).to_i}"
        end
        if options[:convert] && pageBgMagick[:type] != "JPEG"
          pageBgMagick.format "JPEG"
        end
        pageBgMagick.write "tmp/page-#{i}"
        pageBgMagickPath = "tmp/page-#{i}"
        pageWidth = pageBgMagick[:width]
        pageHeight = pageBgMagick[:height]
      else
        pageBgMagickPath = pages[i]
        upscale = 1
      end
      if i == 0
        pdf = Prawn::Document.new(page_size: [pageWidth, pageHeight], margin: [0, 0, 0, 0], info: info)
      else
        pdf.start_new_page(size: [pageWidth, pageHeight], margin: [0, 0, 0, 0])
      end
      pdf.image pageBgMagickPath, height: pageHeight, width: pageWidth, at: [0, pageHeight]
      next if !has_Json
      if upscale > 1
        # Scale the ocr geometry (in place) by the same factor as the image.
        pageText = page["blocks"].map { |block|
          block["box"].map! { |edge| (edge * upscale).to_i }
          block["lines_coords"].map! { |quad|
            quad.map! { |corner|
              corner.map! { |value| ((value * upscale).to_i).to_f }
            }
          }
          block["font_size"] = (block["font_size"] * upscale).to_i
          block
        }
      else
        pageText = page["blocks"]
      end
      pdf.transparent(options[:fontTransparency]) do
        # The font file is looked up by this relative name.
        pdf.font("ipaexg.ttf")
        for b in 0...pageText.length do
          # Two line tops closer than this are treated as the same level.
          heightTreshold = pageHeight * 0.0075
          isBoxVert = pageText[b]["vertical"]
          fontSize = 0
          if !isBoxVert
            # Horizontal block: one draw_text call per line.
            for l in 0...pageText[b]["lines"].length do
              line = squashText.call(pageText[b]["lines"][l]).gsub(trailingStops, "")
              next if line.empty?
              coords = pageText[b]["lines_coords"][l]
              lineLeft = coords[3][0]
              lineRight = coords[2][0]
              lineBottom = [coords[3][1], coords[2][1]].min
              lineTop = [coords[0][1], coords[1][1]].min
              lineWidth = lineRight - lineLeft
              lineHeight = lineBottom - lineTop
              # Fit the characters into the line width, capped at twice the height.
              fontSize = [lineWidth / line.length, lineHeight * 2].min
              # Skip lines that come out far smaller than the block's ocr font size.
              next if fontSize <= (pageText[b]["font_size"] * 0.15)
              line = squashText.call(pageText[b]["lines"][l].strip).gsub(trailingMarks, "")
              # Ocr y grows downward, hence the flip against the page height.
              pdf.draw_text line, size: fontSize, at: [lineLeft, pageHeight - lineBottom]
            end
          else
            # Vertical block. Lines whose tops are (almost) equal form one
            # "level"; each level is laid out as a row of columns.
            textLevels = pageText[b]["lines_coords"].map { |quad| lineTopOf.call(quad) }.sort.uniq
            # Merge levels closer than the threshold (mutates textLevels while
            # walking it, dropping the later of two near-equal values).
            textLevels.each_with_index do |y, idx|
              while idx + 1 < textLevels.length && (y >= (textLevels[idx + 1] - heightTreshold) && y <= (textLevels[idx + 1] + heightTreshold))
                textLevels.delete_at(idx + 1)
              end
            end
            levelLeft = []
            levelRight = []
            levelLine = {}
            # Horizontal extent of every level: leftmost and rightmost corner
            # of the lines that start on it.
            for level in 0...textLevels.length do
              onLevel = pageText[b]["lines_coords"].select do |quad|
                top = lineTopOf.call(quad)
                top >= (textLevels[level] - heightTreshold) && top <= (textLevels[level] + heightTreshold)
              end
              levelLeft << onLevel.reduce(pageWidth) { |leftmost, quad| [leftmost, quad[0][0], quad[3][0]].min }
              levelRight << onLevel.reduce(0) { |rightmost, quad| [rightmost, quad[1][0], quad[2][0]].max }
            end
            levelWidth = textLevels.map.with_index { |top, idx| [top, levelRight[idx], levelLeft[idx]] }
            # Measure every line and file it under the level it starts on.
            for l in 0...pageText[b]["lines"].length do
              coords = pageText[b]["lines_coords"][l]
              minTop = [coords[0][1], coords[1][1]].max
              minBottom = [coords[3][1], coords[2][1]].min
              widthTop = coords[1][0] - coords[0][0]
              widthBottom = coords[2][0] - coords[3][0]
              boxHeight = pageText[b]["box"][3] - pageText[b]["box"][1]
              ocrFSize = pageText[b]["font_size"]
              # Build a stand-in string whose length is the number of cells the
              # line needs: groups that are drawn in a single cell collapse to "!".
              lineTmp = squashText.call(pageText[b]["lines"][l])
              lineTmp = lineTmp.gsub(/[!！?？]+$/, "!")
              lineTmp = lineTmp.gsub(/(?<![0-9０-９])[0-9０-９]{2,3}(?![0-9０-９])/, "!")
              lineTmp = lineTmp.gsub(/[a-zA-Zａ-ｚＡ-Ｚ]{2,3}/, "!")
              scanPar = lineTmp.scan(measuredBrackets)
              scanPt = lineTmp.scan(trailingStops)
              lineTmp = lineTmp.gsub(measuredBrackets, "")
              lineTmp = lineTmp.gsub(trailingStops, "")
              lineLength = lineTmp.length + (scanPar.length > 0 ? scanPar.length * 0.8 : 0) + (scanPt.length > 0 ? scanPt.length * 0.5 : 0)
              lineHeight = minBottom - minTop
              lineHeight = boxHeight <= lineHeight ? boxHeight : lineHeight
              lineWidth = widthTop <= widthBottom ? widthTop : widthBottom
              next if lineLength == 0
              # Fit the cells into the line height, capped at 1.75x the line width.
              fontSize = [lineHeight / lineLength, lineWidth * 1.75].min
              for level in textLevels do
                levelLine[level] ||= []
                lineTop = lineTopOf.call(coords)
                if lineTop >= (level - heightTreshold) && lineTop <= (level + heightTreshold)
                  levelLine[level] << [pageText[b]["lines"][l], fontSize, ocrFSize]
                end
              end
            end
            # Draw level by level; columns run right to left, characters top to bottom.
            for level in 0...textLevels.length do
              columns = levelLine[textLevels[level]]
              next if columns.nil?
              boxWidth = levelWidth[level][1] - levelWidth[level][2]
              boxLength = columns.length
              # Smallest font size on the level, ignoring sizes at or below 30%
              # of the ocr font size.
              boxFSize = columns.reduce(99999) { |smallest, entry| (entry[1] < smallest) && (entry[1] > (entry[2] * 0.3)) ? entry[1] : smallest }
              lineSpace = 1.1
              if boxLength > 1
                # Spread the columns evenly over the width of the level.
                lineSpace = ((boxWidth - (boxLength * boxFSize)) / (boxLength - 1)) + boxFSize
              end
              boxLeft = levelWidth[level][2] + lineSpace * (boxLength - 1)
              for entry in columns do
                # A skipped column does not advance boxLeft.
                next if entry[1] <= (entry[2] * 0.5)
                fontSize = entry[1]
                text = squashText.call(entry[0].strip).gsub(trailingMarks, "")
                boxUp = (pageHeight - textLevels[level]) - fontSize
                numberComp = ''
                ponctComp = ''
                romComp = ''
                for char in 0...text.length do
                  # text[char + 1] is nil past the end and nil never matches,
                  # so the "run ends here" tests below also fire on the last
                  # character.
                  if drawnBrackets.match?(text[char])
                    boxUp -= fontSize * 0.8
                  elsif pauseMark.match?(text[char])
                    boxUp -= fontSize
                  elsif /[0-9０-９]/.match?(text[char])
                    # Collect a run of digits; 2- and 3-digit numbers are drawn
                    # side by side in one cell at a reduced size.
                    numberComp += text[char]
                    if char + 1 >= text.length || !/[0-9０-９]/.match?(text[char + 1])
                      if numberComp.length == 2
                        tmpFSize = fontSize * 0.5
                        pdf.draw_text numberComp, size: tmpFSize, at: [boxLeft, boxUp + (tmpFSize/2)]
                        boxUp -= fontSize
                      elsif numberComp.length == 3
                        tmpFSize = fontSize * 0.35
                        pdf.draw_text numberComp, size: tmpFSize, at: [boxLeft, boxUp + (tmpFSize/3)]
                        boxUp -= fontSize
                      else
                        for n in 0...numberComp.length
                          pdf.draw_text numberComp[n], size: fontSize, at: [boxLeft, boxUp]
                          boxUp -= fontSize
                        end
                      end
                      numberComp = ''
                    end
                  elsif /[!！?？]/.match?(text[char])
                    # NOTE(review): unreachable as written, because pauseMark
                    # above already matches these characters. Kept so that
                    # narrowing pauseMark later re-enables it.
                    ponctComp += text[char]
                    if char + 1 >= text.length || !/[!！?？]/.match?(text[char + 1])
                      tmpFSize = fontSize / ponctComp.length
                      pdf.draw_text ponctComp, size: tmpFSize, at: [boxLeft, boxUp]
                      boxUp -= fontSize
                      ponctComp = ''
                    end
                  elsif /[a-zA-Zａ-ｚＡ-Ｚ]/.match?(text[char])
                    # Runs of up to three letters share one cell; longer runs
                    # get one cell per letter.
                    romComp += text[char]
                    if char + 1 >= text.length || !/[a-zA-Zａ-ｚＡ-Ｚ]/.match?(text[char + 1])
                      if romComp.length <= 3
                        tmpFSize = fontSize / romComp.length
                        pdf.draw_text romComp, size: tmpFSize, at: [boxLeft, boxUp]
                        boxUp -= fontSize
                      else
                        for r in 0...romComp.length
                          pdf.draw_text romComp[r], size: fontSize, at: [boxLeft, boxUp]
                          boxUp -= fontSize
                        end
                      end
                      romComp = ''
                    end
                  else
                    pdf.draw_text text[char], size: fontSize, at: [boxLeft, boxUp]
                    boxUp -= fontSize
                  end
                end
                boxLeft -= lineSpace
              end
            end
          end
        end
      end
    end
    pdf.render_file("#{options[:outputFolder]}#{info[:Title]} - MKR2PDF.pdf")
    puts "Done!"
  rescue => e
    puts e.full_message
  ensure
    # Runs on success and on failure, after rendering, so temporary images
    # never outlive the volume. rm_rf does not raise if removal fails.
    FileUtils.rm_rf("tmp") if tmpUsed
  end
end