Skip to content

Commit

Permalink
Fixes and add command line tool.
Browse files Browse the repository at this point in the history
  • Loading branch information
nisaacson committed Feb 27, 2014
1 parent b6e4dac commit 645f9b6
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 28 deletions.
47 changes: 41 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,33 +1,68 @@
# PDF Text Extract
Extract text from pdfs that contain searchable pdf text. The module calls the pdftotext command to perform the actual extraction
[![build status](https://secure.travis-ci.org/nisaacson/pdf-text-extract)](http://travis-ci.org/nisaacson/pdf-text-extract) [![Dependency Status](https://david-dm.org/nisaacson/pdf-text-extract.png)](https://david-dm.org/nisaacson/pdf-text-extract)

Extract text from pdfs that contain searchable pdf text. The module is a wrapper that calls the `pdftotext` command to perform the actual extraction.

[![Build Status](https://travis-ci.org/nisaacson/pdf-text-extract.png?branch=master)](https://travis-ci.org/nisaacson/pdf-text-extract) [![Dependency Status](https://david-dm.org/nisaacson/pdf-text-extract.png)](https://david-dm.org/nisaacson/pdf-text-extract)

# Installation
```bash
npm install pdf-text-extract
```


You will need the **pdftotext** binary available on your path. There are packages available for many different operating systems
You will need the `pdftotext` binary available on your path. There are packages available for many different operating systems

> See [https://github.com/nisaacson/pdf-extract#osx](https://github.com/nisaacson/pdf-extract#osx) for how to install the `pdftotext` command
See [https://github.com/nisaacson/pdf-extract#osx](https://github.com/nisaacson/pdf-extract#osx) for how to install the `pdftotext` command


# Usage

## As a module

```javascript
var filePath = path.join(__dirname, 'test/pdf')
var filePath = path.join(__dirname, 'test/data/multipage.pdf')
var extract = require('pdf-text-extract')
extract(filePath, function (err, pages) {
if (err) {
console.dir(err)
return
}
console.dir('extracted pages', pages)
console.dir(pages)
})
```
The output will be an array where each entry is a page of text. If you want just a string of all pages you can do `pages.join(' ')`


If needed, you can pass an optional `options` object to the extract function. It is passed along as the options used to run the `pdftotext` command.

```javascript
var filePath = path.join(__dirname, 'test/data/multipage.pdf')
var extract = require('pdf-text-extract')
var options = {
cwd: "./"
}
extract(filePath, options, function (err, pages) {
if (err) {
console.dir(err)
return
}
console.dir(pages)
})
```

## As a command line tool

```bash
npm install -g pdf-text-extract
```

Execute with the file path as an argument. The output will be a JSON-formatted array of pages.

```bash
pdf-text-extract ./test/data/multipage.pdf
# outputs
# ['<page 1 content...>', '<page 2 content...>']
```

# Test

Expand Down
19 changes: 19 additions & 0 deletions bin/pdf-text-extract.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env node

// Command line entry point: extracts the text of the pdf named by the first
// argument and prints the resulting array of pages to stdout as JSON.
var extract = require('../index')

var path = require('path')

// process.argv[0] is node and process.argv[1] is this script, so the first
// user-supplied argument lives at index 2.
var fileName = process.argv[2]
if (!fileName) {
  throw new Error('file path must be specified as the argument like "pdf-text-extract /path/to/file"')
}
// Resolve relative paths against the current working directory.
var filePath = path.resolve(fileName)
extract(filePath, cb)

/**
 * Handle the result of the extraction.
 *
 * @param {*} err - truthy when the extraction failed; it is re-thrown so the
 *   process terminates with a failure
 * @param {string[]} pages - extracted text, one entry per page
 */
function cb(err, pages) {
  if (err) {
    throw err
  }
  // Serialize explicitly instead of using console.dir: console.dir formats
  // through util.inspect, which is not JSON and abbreviates long arrays and
  // long strings, so large documents would be printed incompletely.
  console.log(JSON.stringify(pages))
}

55 changes: 36 additions & 19 deletions index.js
Original file line number Diff line number Diff line change
@@ -1,37 +1,31 @@
var path = require('path')
var exec = require('child_process').exec
var spawn = require('child_process').spawn
module.exports = function pdfTextExtract(filePath, options, cb) {
if (typeof(options) === 'function') {
cb = options
options = {}
}
var cmd = 'pdftotext'
filePath = path.resolve(filePath)
var args = [
'-layout',
'-enc',
'UTF-8',
'"' + filePath + '"',
filePath,
//'"' + filePath + '"',
'-'
];
var command = cmd + ' ' + args.join(' ')
var child = exec(command, options, function (err, stdout, stderr) {
]
streamResults(args, options, splitPages)

function splitPages(err, content) {
if (err) {
return cb({
message: 'pdf-text-extract failed',
error: err,
filePath: filePath,
command: command,
stack: new Error().stack
})
return cb(err)
}
var pages = stdout.split(/\f/);
var pages = content.split(/\f/)
if (!pages) {
return cb({
message: 'pdf-text-extract failed',
error: 'no text returned from the pdftotext command',
filePath: filePath,
command: command,
stack: new Error().stack
})
}
Expand All @@ -40,9 +34,32 @@ module.exports = function pdfTextExtract(filePath, options, cb) {
if (!lastPage) {
pages.pop()
}
if (!stderr || stderr === '') {
stderr = null
cb(null, pages)
}
}
/**
 * Run the `pdftotext` command and collect everything it writes to stdout.
 *
 * @param {string[]} args - command line arguments handed to `pdftotext`
 * @param {Object} options - passed unchanged as the options argument of
 *   `child_process.spawn`
 * @param {Function} cb - node-style callback, invoked exactly once: with an
 *   Error when the command cannot be started or finishes with a non-zero
 *   exit code, otherwise with `null` and the collected stdout text
 */
function streamResults(args, options, cb) {
  var output = ''
  var stderr = ''
  var command = 'pdftotext'
  // Guards the callback: a failed spawn can emit both 'error' and 'close'.
  var finished = false
  var child = spawn(command, args, options)
  child.stdout.setEncoding('utf8')
  child.stderr.setEncoding('utf8')
  child.stdout.on('data', stdoutHandler)
  child.stderr.on('data', stderrHandler)
  // Without an 'error' listener a missing pdftotext binary would surface as
  // an unhandled 'error' event and take the whole process down.
  child.on('error', errorHandler)
  // 'close' fires only after the stdio streams have ended, so `output` is
  // complete here; 'exit' can fire while data is still buffered.
  child.on('close', closeHandler)

  function stdoutHandler(data) {
    output += data
  }

  function stderrHandler(data) {
    stderr += data
  }

  function errorHandler(err) {
    finish(err)
  }

  function closeHandler(code) {
    if (code !== 0) {
      return finish(new Error('pdftextextract command failed: ' + stderr))
    }
    finish(null, output)
  }

  // Report the outcome to the caller at most once.
  function finish(err, content) {
    if (finished) {
      return
    }
    finished = true
    if (err) {
      return cb(err)
    }
    cb(null, content)
  }
}
4 changes: 4 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"version": "1.0.13",
"description": "Extract text from pdfs that contain searchable pdf text",
"main": "index.js",
"bin": "./bin/pdf-text-extract.js",
"directories": {
"test": "test"
},
Expand All @@ -26,5 +27,8 @@
"devDependencies": {
"mocha": "~1.8.2",
"should": "~1.2.2"
},
"dependencies": {
"yargs": "~1.1.3"
}
}
7 changes: 4 additions & 3 deletions test/extract-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ describe('Pdf extract', function () {
this.slow('4s')
var filePath = path.join(__dirname, 'data', 'huge.pdf')

extract(filePath, {
maxBuffer: 5000 * 1024
}, function (err, pages) {
var options = {
cwd: null
}
extract(filePath, options, function (err, pages) {
should.not.exists(err)
should.exists(pages)

Expand Down

0 comments on commit 645f9b6

Please sign in to comment.