Fixes and add command line tool.

harryrobbins · Feb 27, 2014 · 645f9b6 · 645f9b6
1 parent b6e4dac
commit 645f9b6
Show file tree

Hide file tree

Showing 5 changed files with 104 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -1,33 +1,68 @@
 # PDF Text Extract
-Extract text from pdfs that contain searchable pdf text. The module calls the pdftotext command to perform the actual extraction
-[![build status](https://secure.travis-ci.org/nisaacson/pdf-text-extract)](http://travis-ci.org/nisaacson/pdf-text-extract) [![Dependency Status](https://david-dm.org/nisaacson/pdf-text-extract.png)](https://david-dm.org/nisaacson/pdf-text-extract)
+
+Extract text from pdfs that contain searchable pdf text. The module is a wrapper that calls the `pdftotext` command to perform the actual extraction.
+
+[![Build Status](https://travis-ci.org/nisaacson/pdf-text-extract.png?branch=master)](https://travis-ci.org/nisaacson/pdf-text-extract) [![Dependency Status](https://david-dm.org/nisaacson/pdf-text-extract.png)](https://david-dm.org/nisaacson/pdf-text-extract)
 
 # Installation
 ```bash
 npm install pdf-text-extract
 ```
 
 
-You will need the **pdftotext** binary available on your path. There are packages available for many different operating systems
+You will need the `pdftotext` binary available on your path. There are packages available for many different operating systems
 
-> See [https://github.com/nisaacson/pdf-extract#osx](https://github.com/nisaacson/pdf-extract#osx) for how to install the `pdftotext` command
+See [https://github.com/nisaacson/pdf-extract#osx](https://github.com/nisaacson/pdf-extract#osx) for how to install the `pdftotext` command
 
 
 # Usage
+
+## As a module
+
 ```javascript
-var filePath = path.join(__dirname, 'test/pdf')
+var filePath = path.join(__dirname, 'test/data/multipage.pdf')
 var extract = require('pdf-text-extract')
 extract(filePath, function (err, pages) {
   if (err) {
     console.dir(err)
     return
   }
-  console.dir('extracted pages', pages)
+  console.dir(pages)
 })
 ```
 The output will be an array of where each entry is a page of text. If you want just a string of all pages you can do `pages.join(' ')`
 
 
+If needed, you can pass an optional `options` object as the second argument to the extract function. It is passed through to the child process call that runs the `pdftotext` command.
+
+```javascript
+var filePath = path.join(__dirname, 'test/data/multipage.pdf')
+var extract = require('pdf-text-extract')
+var options = {
+  cwd: "./"
+}
+extract(filePath, options, function (err, pages) {
+  if (err) {
+    console.dir(err)
+    return
+  }
+  console.dir(pages)
+})
+```
+
+## As a command line tool
+
+```bash
+npm install -g pdf-text-extract
+```
+
+Execute with the file path as an argument. The output will be a JSON-formatted array of pages.
+
+```bash
+pdf-text-extract ./test/data/multipage.pdf
+# outputs
+# ['<page 1 content...>', '<page 2 content...>']
+```
 
 # Test
 

diff --git a/bin/pdf-text-extract.js b/bin/pdf-text-extract.js
@@ -0,0 +1,19 @@
+#!/usr/bin/env node
+
+// Command line entry point: extracts the text of the pdf given as the first
+// argument and prints the pages to stdout as a JSON array of strings.
+var extract = require('../index')
+
+var path = require('path')
+var fileName = process.argv[2]
+if (!fileName) {
+  // A usage mistake is reported on stderr with a failing exit code rather
+  // than as an uncaught exception with a stack trace.
+  console.error('file path must be specified as the argument like "pdf-text-extract /path/to/file"')
+  process.exit(1)
+}
+var filePath = path.resolve(fileName)
+extract(filePath, cb)
+
+/**
+ * Print the extraction result, or report the failure and exit non-zero.
+ *
+ * @param {Error|Object|null} err - failure reported by the extract module
+ * @param {string[]} pages - text of each page, in document order
+ */
+function cb(err, pages) {
+  if (err) {
+    // The module may hand back a plain object, so fall back to the value
+    // itself when there is no message to show.
+    console.error(err.message || err)
+    process.exit(1)
+  }
+  // JSON.stringify emits real JSON; console.dir prints util.inspect output,
+  // which is not JSON and may elide entries of long arrays.
+  console.log(JSON.stringify(pages))
+}
+
diff --git a/index.js b/index.js
@@ -1,37 +1,31 @@
 var path = require('path')
-var exec = require('child_process').exec
+var spawn = require('child_process').spawn
 module.exports = function pdfTextExtract(filePath, options, cb) {
   if (typeof(options) === 'function') {
     cb = options
     options = {}
   }
-  var cmd = 'pdftotext'
   filePath = path.resolve(filePath)
   var args = [
     '-layout',
     '-enc',
     'UTF-8',
-    '"' + filePath + '"',
+    filePath,
+    //'"' + filePath + '"',
     '-'
-  ];
-  var command = cmd + ' ' + args.join(' ')
-  var child = exec(command, options, function (err, stdout, stderr) {
+  ]
+  streamResults(args, options, splitPages)
+
+  function splitPages(err, content) {
     if (err) {
-      return cb({
-        message: 'pdf-text-extract failed',
-        error: err,
-        filePath: filePath,
-        command: command,
-        stack: new Error().stack
-      })
+      return cb(err)
     }
-    var pages = stdout.split(/\f/);
+    var pages = content.split(/\f/)
     if (!pages) {
       return cb({
         message: 'pdf-text-extract failed',
         error: 'no text returned from the pdftotext command',
         filePath: filePath,
-        command: command,
         stack: new Error().stack
       })
     }
@@ -40,9 +34,32 @@ module.exports = function pdfTextExtract(filePath, options, cb) {
     if (!lastPage) {
       pages.pop()
     }
-    if (!stderr || stderr === '') {
-      stderr = null
+    cb(null, pages)
+  }
+}
+/**
+ * Run the `pdftotext` command and collect everything it writes to stdout.
+ *
+ * @param {string[]} args - command line arguments handed to `pdftotext`
+ * @param {Object} options - options forwarded unchanged to `spawn`
+ * @param {Function} cb - called exactly once, as `cb(err)` on failure or
+ *   `cb(null, output)` with the collected stdout text on success
+ */
+function streamResults(args, options, cb) {
+  var output = ''
+  var stderr = ''
+  var command = 'pdftotext'
+  var finished = false
+  var child = spawn(command, args, options)
+  child.stdout.setEncoding('utf8')
+  child.stderr.setEncoding('utf8')
+  child.stdout.on('data', stdoutHandler)
+  child.stderr.on('data', stderrHandler)
+  // 'error' is emitted when the process could not be started (for example
+  // when the binary is not on the path); unhandled it would crash the caller.
+  child.on('error', errorHandler)
+  // 'close' is emitted only after the stdio streams have ended, whereas
+  // 'exit' may fire while output is still buffered.
+  child.on('close', closeHandler)
+
+  function stdoutHandler(data) {
+    output += data
+  }
+
+  function stderrHandler(data) {
+    stderr += data
+  }
+
+  // Both 'error' and 'close' can fire for one child, so make sure the
+  // callback is only ever invoked once.
+  function done(err, result) {
+    if (finished) {
+      return
+    }
+    finished = true
+    if (err) {
+      return cb(err)
+    }
+    cb(null, result)
+  }
+
+  function errorHandler(err) {
+    done(err)
+  }
+
+  function closeHandler(code) {
+    if (code !== 0) {
+      // return so the success path below is not also reported
+      return done(new Error('pdftextextract command failed: ' + stderr))
     }
-    cb(stderr, pages);
-  });
+    done(null, output)
+  }
 }
diff --git a/package.json b/package.json
@@ -3,6 +3,7 @@
   "version": "1.0.13",
   "description": "Extract text from pdfs that contain searchable pdf text",
   "main": "index.js",
+  "bin": "./bin/pdf-text-extract.js",
   "directories": {
     "test": "test"
   },
@@ -26,5 +27,8 @@
   "devDependencies": {
     "mocha": "~1.8.2",
     "should": "~1.2.2"
+  },
+  "dependencies": {
+    "yargs": "~1.1.3"
   }
 }
diff --git a/test/extract-test.js b/test/extract-test.js
@@ -35,9 +35,10 @@ describe('Pdf extract', function () {
     this.slow('4s')
     var filePath = path.join(__dirname, 'data', 'huge.pdf')
 
-    extract(filePath, {
-      maxBuffer: 5000 * 1024
-    }, function (err, pages) {
+    var options = {
+      cwd: null
+    }
+    extract(filePath, options, function (err, pages) {
       should.not.exists(err)
       should.exists(pages)