forked from nisaacson/pdf-text-extract
-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
108 lines (89 loc) · 3.19 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
var path = require('path')
var spawn = require('child_process').spawn
module.exports = function pdfTextExtract(filePath, options, cb) {
// options is optional
if (typeof(options) === 'function') {
cb = options
options = {}
}
filePath = path.resolve(filePath)
// default options
options.encoding = options.encoding || 'UTF-8'
options.layout = options.layout || 'layout'
options.splitPages = (options.splitPages !== false)
// Build args based on options
var args = []
// First and last page to convert
if (options.firstPage) { args.push('-f'); args.push(options.firstPage) }
if (options.lastPage) { args.push('-l'); args.push(options.lastPage) }
// Resolution, in dpi. (null is pdftotext default = 72)
if (options.resolution) { args.push('-r'); args.push(options.resolution) }
// If defined, should be an object { x:x, y:y, w:w, h:h }
if (typeof(options.crop) === 'object') {
if (options.crop.x) { args.push('-x'); args.push(options.crop.x) }
if (options.crop.y) { args.push('-y'); args.push(options.crop.y) }
if (options.crop.w) { args.push('-W'); args.push(options.crop.w) }
if (options.crop.h) { args.push('-H'); args.push(options.crop.h) }
}
// One of either 'layout', 'raw' or 'htmlmeta'
if (options.layout === 'layout') args.push('-layout')
if (options.layout === 'raw') args.push('-raw')
if (options.layout === 'htmlmeta') args.push('-htmlmeta')
// Output text encoding (UCS-2, ASCII7, Latin1, UTF-8, ZapfDingbats or Symbol)
if (options.encoding) { args.push('-enc'); args.push(options.encoding) }
// Output end of line convention (unix, dos or mac)
if (options.eol) { args.push('-eol'); args.push(options.eol) }
// Owner and User password (for encrypted files)
if (options.ownerPassword) { args.push('-opw'); args.push(options.ownerPassword) }
if (options.userPassword) { args.push('-upw'); args.push(options.userPassword) }
// finish up arguments
args.push(filePath)
args.push('-')
streamResults(args, options, options.splitPages ? splitPages : cb)
function splitPages(err, content) {
if (err) {
return cb(err)
}
var pages = content.split(/\f/)
if (!pages) {
return cb({
message: 'pdf-text-extract failed',
error: 'no text returned from the pdftotext command',
filePath: filePath,
stack: new Error().stack
})
}
// sometimes there can be an extract blank page on the end
var lastPage = pages[pages.length - 1]
if (!lastPage) {
pages.pop()
}
cb(null, pages)
}
}
/**
 * Spawns `pdftotext` with the given arguments and collects what it writes.
 *
 * @param {Array} args - command-line arguments passed to pdftotext.
 * @param {Object} options - passed through as the child_process.spawn options.
 * @param {Function} cb - called exactly once: cb(err) when the process could
 *   not be started or finished with a non-zero exit code, otherwise
 *   cb(null, output) with everything read from the child's stdout.
 */
function streamResults(args, options, cb) {
  var output = ''
  var stderr = ''
  var command = 'pdftotext'
  var finished = false
  var child = spawn(command, args, options)

  // Without an 'error' listener a failed spawn (e.g. pdftotext is not
  // installed) surfaces as an unhandled 'error' event and crashes the
  // process instead of reaching the callback.
  child.on('error', errorHandler)
  child.on('close', closeHandler)

  // The stdio streams can be missing (failed spawn, non-pipe stdio options).
  if (child.stdout) {
    child.stdout.setEncoding('utf8')
    child.stdout.on('data', stdoutHandler)
  }
  if (child.stderr) {
    child.stderr.setEncoding('utf8')
    child.stderr.on('data', stderrHandler)
  }

  // 'close' may still fire after 'error'; make sure cb only runs once.
  function finish(err, result) {
    if (finished) {
      return
    }
    finished = true
    cb(err, result)
  }

  function stdoutHandler(data) {
    output += data
  }

  function stderrHandler(data) {
    stderr += data
  }

  function errorHandler(err) {
    finish(err)
  }

  function closeHandler(code) {
    if (code !== 0) {
      return finish(new Error('pdf-text-extract command failed: ' + stderr))
    }
    finish(null, output)
  }
}