-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdftohtml.go
230 lines (199 loc) · 6.44 KB
/
pdftohtml.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
// Package pdftohtml is a wrapper for Xpdf command line tool `pdftohtml`.
//
// What is `pdftohtml`?
//
// Pdftohtml converts Portable Document Format (PDF) files to HTML.
//
// Reference: https://www.xpdfreader.com/pdftohtml-man.html
package pdftohtml
import (
"context"
"os/exec"
"strconv"
)
// ----------------------------------------------------------------------------
// -- `pdftohtml`
// ----------------------------------------------------------------------------
// Command is a prepared invocation of the `pdftohtml` executable: the
// executable to run plus the option flags to pass to it.
type Command struct {
// path is the executable name or path; NewCommand replaces it with the
// result of exec.LookPath.
path string
// args holds the option flags in the order the options were applied.
args []string
}
// NewCommand builds a `pdftohtml` command configured by opts.
//
// Options are applied in the order given, starting from the default
// executable name "pdftohtml". The resulting name is then resolved with
// exec.LookPath; if that fails, the lookup error is returned and no Command
// is produced.
func NewCommand(opts ...option) (*Command, error) {
	prepared := &Command{path: "pdftohtml"}
	for i := range opts {
		opts[i](prepared)
	}

	// Resolve up front so a missing executable is reported here, not on Run.
	resolved, err := exec.LookPath(prepared.path)
	if err != nil {
		return nil, err
	}
	prepared.path = resolved
	return prepared, nil
}
// Run executes the prepared `pdftohtml` command, passing inpath and outdir as
// the final two arguments after the configured option flags.
//
// The process is created with exec.CommandContext using ctx, and the returned
// error is whatever (*exec.Cmd).Run reports.
func (c *Command) Run(ctx context.Context, inpath, outdir string) error {
	// Cap the slice at its length so append must copy. Appending to c.args
	// directly could write inpath/outdir into spare capacity of the shared
	// backing array, letting overlapping Run/String calls clobber each other.
	argv := append(c.args[:len(c.args):len(c.args)], inpath, outdir)
	return exec.CommandContext(ctx, c.path, argv...).Run()
}
// String returns a human-readable description of the command, with the
// placeholders "<inpath>" and "<outdir>" standing in for the arguments that
// Run appends.
func (c *Command) String() string {
	// Cap the slice at its length so append copies instead of writing the
	// placeholders into spare capacity of the backing array shared with Run.
	argv := append(c.args[:len(c.args):len(c.args)], "<inpath>", "<outdir>")
	return exec.Command(c.path, argv...).String()
}
// ----------------------------------------------------------------------------
// -- `pdftohtml` options
// ----------------------------------------------------------------------------
// option configures a Command under construction; NewCommand applies each
// option in the order given.
type option func(*Command)
// WithCustomPath sets a custom location for the `pdftohtml` executable.
func WithCustomPath(path string) option {
	setPath := func(cmd *Command) {
		cmd.path = path
	}
	return setPath
}
// WithCustomConfig makes pdftohtml read the config file at path in place of
// ~/.xpdfrc or the system-wide config file.
func WithCustomConfig(path string) option {
	flags := []string{"-cfg", path}
	return func(cmd *Command) {
		cmd.args = append(cmd.args, flags...)
	}
}
// WithOutdirOverwrite tells pdftohtml to overwrite an existing output
// directory.
//
// By default pdftohtml will not overwrite the output directory: if the
// directory already exists, pdftohtml exits with an error.
func WithOutdirOverwrite() option {
	const flag = "-overwrite"
	return func(cmd *Command) {
		cmd.args = append(cmd.args, flag)
	}
}
// WithPageFrom specifies the first page to convert.
func WithPageFrom(page uint64) option {
	first := strconv.FormatUint(page, 10)
	return func(cmd *Command) {
		cmd.args = append(cmd.args, "-f", first)
	}
}
// WithPageTo specifies the last page to convert.
func WithPageTo(page uint64) option {
	last := strconv.FormatUint(page, 10)
	return func(cmd *Command) {
		cmd.args = append(cmd.args, "-l", last)
	}
}
// WithPageRange specifies the range of pages to convert: from is the first
// page and to is the last.
//
// It is equivalent to applying WithPageFrom(from) followed by WithPageTo(to).
func WithPageRange(from, to uint64) option {
	first, last := WithPageFrom(from), WithPageTo(to)
	return func(c *Command) {
		// WithPageFrom/WithPageTo only build options; they change nothing
		// until the returned option is invoked on c.
		first(c)
		last(c)
	}
}
// WithInitialZoom specifies the initial zoom level.
//
// The default is 1.0, which means 72dpi, i.e., 1 point in the PDF file will
// be 1 pixel in the HTML.
//
// Using `-z 1.5`, for example, will make the initial view 50% larger.
func WithInitialZoom(zoom float64) option {
	return func(c *Command) {
		// 'f' with precision -1 emits the shortest decimal that round-trips
		// (e.g. "1.5"), so the caller's value is passed through unrounded.
		c.args = append(c.args, "-z", strconv.FormatFloat(zoom, 'f', -1, 64))
	}
}
// WithResolution specifies the resolution, in DPI, for background images.
// This controls the pixel size of the background image files.
//
// The initial zoom level is set separately by the WithInitialZoom option.
// Specifying a larger resolution will allow the viewer to zoom in farther
// without upscaling artifacts in the background.
func WithResolution(dpi uint64) option {
	resolution := strconv.FormatUint(dpi, 10)
	return func(cmd *Command) {
		cmd.args = append(cmd.args, "-r", resolution)
	}
}
// WithVerticalStretch specifies a vertical stretch factor.
//
// Setting this to a value greater than 1.0 will stretch each page vertically,
// spreading out the lines. This also stretches the background image to match.
func WithVerticalStretch(factor float64) option {
	return func(c *Command) {
		// 'f' with precision -1 emits the shortest decimal that round-trips
		// (e.g. "1.25"), so the caller's value is passed through unrounded.
		c.args = append(c.args, "-vstretch", strconv.FormatFloat(factor, 'f', -1, 64))
	}
}
// WithEmbedBackground embeds the background image as base64-encoded data
// directly in the HTML file, rather than storing it as a separate file.
func WithEmbedBackground() option {
	const flag = "-embedbackground"
	return func(cmd *Command) {
		cmd.args = append(cmd.args, flag)
	}
}
// WithNoFonts disables extraction of embedded fonts.
//
// By default, pdftohtml extracts TrueType and OpenType fonts. Disabling
// extraction can work around problems with buggy fonts.
func WithNoFonts() option {
	const flag = "-nofonts"
	return func(cmd *Command) {
		cmd.args = append(cmd.args, flag)
	}
}
// WithEmbedFonts embeds any extracted fonts as base64-encoded data directly
// in the HTML file, rather than storing them as separate files.
func WithEmbedFonts() option {
	const flag = "-embedfonts"
	return func(cmd *Command) {
		cmd.args = append(cmd.args, flag)
	}
}
// WithNoInvisibleText tells pdftohtml not to draw invisible text.
//
// By default, invisible text (commonly used in OCR'ed PDF files) is drawn as
// transparent (alpha=0) HTML text. This option tells pdftohtml to discard
// invisible text entirely.
func WithNoInvisibleText() option {
	const flag = "-skipinvisible"
	return func(cmd *Command) {
		cmd.args = append(cmd.args, flag)
	}
}
// WithAllInvisibleText tells pdftohtml to treat all text as invisible.
//
// By default, regular (non-invisible) text is not drawn in the background
// image, and is instead drawn with HTML on top of the image. This option tells
// pdftohtml to include the regular text in the background image, and then draw
// it as transparent (alpha=0) HTML text.
func WithAllInvisibleText() option {
	const flag = "-allinvisible"
	return func(cmd *Command) {
		cmd.args = append(cmd.args, flag)
	}
}
// WithEmbedFormFields converts AcroForm text and checkbox fields to HTML
// input elements.
//
// This also removes text (e.g., underscore characters) and erases background
// image content (e.g., lines or boxes) in the field areas.
func WithEmbedFormFields() option {
	const flag = "-formfields"
	return func(cmd *Command) {
		cmd.args = append(cmd.args, flag)
	}
}
// WithEmbedMetaTags includes PDF document metadata as 'meta' elements in the
// HTML header.
func WithEmbedMetaTags() option {
	const flag = "-meta"
	return func(cmd *Command) {
		cmd.args = append(cmd.args, flag)
	}
}
// WithModeTable uses table mode when performing the underlying text
// extraction.
//
// This will generally produce better output when the PDF content is a
// full-page table.
//
// Note: This does not generate HTML tables; it just changes the way text is
// split up.
func WithModeTable() option {
	const flag = "-table"
	return func(cmd *Command) {
		cmd.args = append(cmd.args, flag)
	}
}
// WithOwnerPassword specifies the owner password for the PDF file.
//
// Providing this will bypass all security restrictions.
func WithOwnerPassword(password string) option {
	flags := []string{"-opw", password}
	return func(cmd *Command) {
		cmd.args = append(cmd.args, flags...)
	}
}
// WithUserPassword specifies the user password for the PDF file.
func WithUserPassword(password string) option {
	flags := []string{"-upw", password}
	return func(cmd *Command) {
		cmd.args = append(cmd.args, flags...)
	}
}