@@ -3,6 +3,18 @@ import * as temp from 'temp';
33
44import { ExtractTextOptions , Ocr } from '../src/index' ;
55
// Shared `convertArgs` value reused by the ExtractTextOptions objects built in the tests below
// (the sample-low.jpg test spreads it and adds further keys on top).
// NOTE(review): presumably these keys map to ImageMagick `convert` flags, with '' marking a
// flag that takes no value — confirm against ../src/index.
6+ const convertArgs = {
7+ quality : '100' ,
8+ trim : '' ,
9+ depth : '8' ,
10+ strip : '' ,
11+ background : 'white' ,
12+ alpha : 'off'
13+ } ;
14+
// Shared `tesseractArgs` value passed by the sample.png and sample-low.jpg tests below.
// NOTE(review): the tslint directive that follows is presumably here because this file uses
// quoted keys ('-psm', and 'auto-level' further down) next to unquoted ones — confirm it is
// meant to stay disabled for the rest of the file.
15+ // tslint:disable object-literal-key-quotes
16+ const tesseractArgs = { '-psm' : 6 , c : 'preserve_interword_spaces=1' } ;
17+
618describe ( 'Extract Text Tests' , ( ) => {
719 it ( 'should be able to extract pdf text from single-page.pdf' , async ( done ) => {
820 jest . setTimeout ( 15 * 1000 ) ;
@@ -11,7 +23,10 @@ describe('Extract Text Tests', () => {
1123 const pdfPath = path . join ( __dirname , relativePath ) ;
1224
1325 try {
14- const result : string = await Ocr . extractText ( pdfPath ) ;
26+ const options : ExtractTextOptions = {
27+ convertArgs
28+ } ;
29+ const result : string = await Ocr . extractText ( pdfPath , options ) ;
1530 expect ( result ) . toBeDefined ( ) ;
1631 expect ( result ) . toContain ( '00001-001-0002' ) ;
1732 } catch ( error ) {
@@ -29,7 +44,10 @@ describe('Extract Text Tests', () => {
2944 const pdfPath = path . join ( __dirname , relativePath ) ;
3045
3146 try {
32- const result : string = await Ocr . extractText ( pdfPath ) ;
47+ const options : ExtractTextOptions = {
48+ convertArgs
49+ } ;
50+ const result : string = await Ocr . extractText ( pdfPath , options ) ;
3351 expect ( result ) . toBeDefined ( ) ;
3452 expect ( result ) . toContain ( '00001-001-0002' ) ;
3553 } catch ( error ) {
@@ -41,13 +59,16 @@ describe('Extract Text Tests', () => {
4159 } ) ;
4260
4361 it ( 'should be able to extract pdf text from multi-page.pdf' , async ( done ) => {
44- jest . setTimeout ( 15 * 1000 ) ;
62+ jest . setTimeout ( 25 * 1000 ) ;
4563 const fileName = 'multi-page.pdf' ;
4664 const relativePath = path . join ( 'sample' , fileName ) ;
4765 const pdfPath = path . join ( __dirname , relativePath ) ;
4866
4967 try {
50- const options : ExtractTextOptions = { convertDensity : 400 , convertArgs : { trim : '' } } ;
68+ const options : ExtractTextOptions = {
69+ convertDensity : 600 ,
70+ convertArgs
71+ } ;
5172 const result : string = await Ocr . extractText ( pdfPath , options ) ;
5273 expect ( result ) . toBeDefined ( ) ;
5374 expect ( result ) . toContain ( 'National Airspace System' ) ;
@@ -67,7 +88,11 @@ describe('Extract Text Tests', () => {
6788 const pdfPath = path . join ( __dirname , relativePath ) ;
6889
6990 try {
70- const options : ExtractTextOptions = { pdfToTextArgs : { f : 1 , l : 4 } } ;
91+ const options : ExtractTextOptions = {
92+ pdfToTextArgs : { f : 1 , l : 4 } ,
93+ convertDensity : 600 ,
94+ convertArgs
95+ } ;
7196 const result : string = await Ocr . extractText ( pdfPath , options ) ;
7297 expect ( result ) . toBeDefined ( ) ;
7398 expect ( result ) . toContain ( 'TraceMonkey' ) ;
@@ -81,17 +106,19 @@ describe('Extract Text Tests', () => {
81106 } ) ;
82107
83108 it ( 'should be able to extract text from sample.png' , async ( done ) => {
84- jest . setTimeout ( 15 * 1000 ) ;
109+ jest . setTimeout ( 25 * 1000 ) ;
85110 const fileName = 'sample.png' ;
86111 const relativePath = path . join ( 'sample' , fileName ) ;
87112 const pngPath = path . join ( __dirname , relativePath ) ;
88113
89114 try {
90115 const tmpDir = temp . mkdirSync ( 'tmp' ) ;
91- // tslint:disable object-literal-key-quotes
116+
92117 const options : ExtractTextOptions = {
118+ convertDensity : 600 ,
119+ convertArgs,
93120 tesseractLang : 'eng' ,
94- tesseractArgs : { '-psm' : 6 , c : 'preserve_interword_spaces=1' }
121+ tesseractArgs
95122 } ;
96123 const result : string = await Ocr . invokeImageOcr ( tmpDir , pngPath , options ) ;
97124 expect ( result ) . toBeDefined ( ) ;
@@ -105,17 +132,24 @@ describe('Extract Text Tests', () => {
105132 } ) ;
106133
107134 it ( 'should be able to extract text from sample-low.jpg' , async ( done ) => {
108- jest . setTimeout ( 15 * 1000 ) ;
135+ jest . setTimeout ( 25 * 1000 ) ;
109136 const fileName = 'sample-low.jpg' ;
110137 const relativePath = path . join ( 'sample' , fileName ) ;
111138 const jpgPath = path . join ( __dirname , relativePath ) ;
112139
113140 try {
114141 const options : ExtractTextOptions = {
115142 convertDensity : 600 ,
116- convertArgs : { trim : '' } ,
143+ convertArgs : {
144+ ...convertArgs ,
145+ verbose : '' ,
146+ flatten : '' ,
147+ contrast : '' ,
148+ 'auto-level' : '' ,
149+ sharpen : '0x4.0'
150+ } ,
117151 tesseractLang : 'eng' ,
118- tesseractArgs : { '-psm' : 6 }
152+ tesseractArgs
119153 } ;
120154 const result : string = await Ocr . extractText ( jpgPath , options ) ;
121155 expect ( result ) . toBeDefined ( ) ;
0 commit comments