From 645f9b6f54cff49b0f920e30c30644fe5f078d34 Mon Sep 17 00:00:00 2001 From: Noah Isaacson Date: Wed, 26 Feb 2014 21:15:36 -0700 Subject: [PATCH] Fixes and add command line tool. --- README.md | 47 ++++++++++++++++++++++++++++++----- bin/pdf-text-extract.js | 19 ++++++++++++++ index.js | 55 +++++++++++++++++++++++++++-------------- package.json | 4 +++ test/extract-test.js | 7 +++--- 5 files changed, 104 insertions(+), 28 deletions(-) create mode 100755 bin/pdf-text-extract.js diff --git a/README.md b/README.md index dd2da65..3356cb4 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # PDF Text Extract -Extract text from pdfs that contain searchable pdf text. The module calls the pdftotext command to perform the actual extraction -[![build status](https://secure.travis-ci.org/nisaacson/pdf-text-extract)](http://travis-ci.org/nisaacson/pdf-text-extract) [![Dependency Status](https://david-dm.org/nisaacson/pdf-text-extract.png)](https://david-dm.org/nisaacson/pdf-text-extract) + +Extract text from pdfs that contain searchable pdf text. The module is a wrapper that calls the `pdftotext` command to perform the actual extraction + +[![Build Status](https://travis-ci.org/nisaacson/pdf-text-extract.png?branch=master)](https://travis-ci.org/nisaacson/pdf-text-extract) [![Dependency Status](https://david-dm.org/nisaacson/pdf-text-extract.png)](https://david-dm.org/nisaacson/pdf-text-extract) # Installation ```bash @@ -8,26 +10,59 @@ npm install pdf-text-extract ``` -You will need the **pdftotext** binary available on your path. There are packages available for many different operating systems +You will need the `pdftotext` binary available on your path. 
There are packages available for many different operating systems -> See [https://github.com/nisaacson/pdf-extract#osx](https://github.com/nisaacson/pdf-extract#osx) for how to install the `pdftotext` command +See [https://github.com/nisaacson/pdf-extract#osx](https://github.com/nisaacson/pdf-extract#osx) for how to install the `pdftotext` command # Usage + +## As a module + ```javascript -var filePath = path.join(__dirname, 'test/pdf') +var filePath = path.join(__dirname, 'test/data/multipage.pdf') var extract = require('pdf-text-extract') extract(filePath, function (err, pages) { if (err) { console.dir(err) return } - console.dir('extracted pages', pages) + console.dir(pages) }) ``` The output will be an array of where each entry is a page of text. If you want just a string of all pages you can do `pages.join(' ')` +If needed, you can pass an optional options object to the extract function. It is passed on as the options for the spawned `pdftotext` process + +```javascript +var filePath = path.join(__dirname, 'test/data/multipage.pdf') +var extract = require('pdf-text-extract') +var options = { + cwd: "./" +} +extract(filePath, options, function (err, pages) { + if (err) { + console.dir(err) + return + } + console.dir('extracted pages', pages) +}) +``` + +## As a command line tool + +```bash +npm install -g pdf-text-extract +``` + +Execute with the filePath as an argument. 
Output will be the array of pages, printed with `console.dir` + +```bash +pdf-text-extract ./test/data/multipage.pdf +# outputs +# ['', ''] +``` # Test diff --git a/bin/pdf-text-extract.js b/bin/pdf-text-extract.js new file mode 100755 index 0000000..db7408f --- /dev/null +++ b/bin/pdf-text-extract.js @@ -0,0 +1,19 @@ +#!/usr/bin/env node + +var extract = require('../index') + +var path = require('path') +var fileName = process.argv[2] +if (!fileName) { + throw new Error('file path must be specified as the argument like "pdf-text-extract /path/to/file"') +} +var filePath = path.resolve(fileName) +extract(filePath, cb) + +function cb(err, pages) { + if (err) { + throw err + } + console.dir(pages) +} + diff --git a/index.js b/index.js index bd7c1d1..7fd4666 100644 --- a/index.js +++ b/index.js @@ -1,37 +1,31 @@ var path = require('path') -var exec = require('child_process').exec +var spawn = require('child_process').spawn module.exports = function pdfTextExtract(filePath, options, cb) { if (typeof(options) === 'function') { cb = options options = {} } - var cmd = 'pdftotext' filePath = path.resolve(filePath) var args = [ '-layout', '-enc', 'UTF-8', - '"' + filePath + '"', + filePath, + //'"' + filePath + '"', '-' - ]; - var command = cmd + ' ' + args.join(' ') - var child = exec(command, options, function (err, stdout, stderr) { + ] + streamResults(args, options, splitPages) + + function splitPages(err, content) { if (err) { - return cb({ - message: 'pdf-text-extract failed', - error: err, - filePath: filePath, - command: command, - stack: new Error().stack - }) + return cb(err) } - var pages = stdout.split(/\f/); + var pages = content.split(/\f/) if (!pages) { return cb({ message: 'pdf-text-extract failed', error: 'no text returned from the pdftotext command', filePath: filePath, - command: command, stack: new Error().stack }) } @@ -40,9 +34,32 @@ module.exports = function pdfTextExtract(filePath, options, cb) { if (!lastPage) { pages.pop() } - if (!stderr || stderr === '') { - 
stderr = null + cb(null, pages) + } +} +function streamResults(args, options, cb) { /* Spawns pdftotext with the given args and spawn options, buffers its stdout and stderr as UTF-8 text, and calls cb(err, output) exactly once when the process has closed. */ + var output = '' + var stderr = '' + var command = 'pdftotext' + var child = spawn(command, args, options) + child.stdout.setEncoding('utf8') + child.stderr.setEncoding('utf8') + child.stdout.on('data', stdoutHandler) + child.stderr.on('data', stderrHandler) + child.on('close', exitHandler) /* 'close' fires only after the stdio streams have ended, so every chunk has been collected; 'exit' can fire earlier */ + + function stdoutHandler(data) { + output += data + } + + function stderrHandler(data) { + stderr += data + } + + function exitHandler(code) { + if (code !== 0) { + return cb(new Error('pdftextextract command failed: ' + stderr)) /* return so cb is not invoked a second time with the success result */ } - cb(stderr, pages); - }); + cb(null, output) + } } diff --git a/package.json b/package.json index 9e455e9..422e006 100644 --- a/package.json +++ b/package.json @@ -3,6 +3,7 @@ "version": "1.0.13", "description": "Extract text from pdfs that contain searchable pdf text", "main": "index.js", + "bin": "./bin/pdf-text-extract.js", "directories": { "test": "test" }, @@ -26,5 +27,8 @@ "devDependencies": { "mocha": "~1.8.2", "should": "~1.2.2" + }, + "dependencies": { + "yargs": "~1.1.3" } } diff --git a/test/extract-test.js b/test/extract-test.js index 93b55e1..68c3d64 100644 --- a/test/extract-test.js +++ b/test/extract-test.js @@ -35,9 +35,10 @@ describe('Pdf extract', function () { this.slow('4s') var filePath = path.join(__dirname, 'data', 'huge.pdf') - extract(filePath, { - maxBuffer: 5000 * 1024 - }, function (err, pages) { + var options = { + cwd: null + } + extract(filePath, options, function (err, pages) { should.not.exists(err) should.exists(pages)