Skip to content

Commit

Permalink
Merge pull request gali8#186 from ws233/master
Browse files Browse the repository at this point in the history
Added PDF output support. Fixes gali8#172. Fixes gali8#173.
  • Loading branch information
Kevin Conley committed Jul 1, 2015
2 parents 76767fd + 6dc1eda commit f24bb99
Show file tree
Hide file tree
Showing 40 changed files with 2,566 additions and 317 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,4 @@ crashlytics-build.properties
leptonica-*
tesseract-*

TesseractOCR/lib
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[submodule "TesseractOCR/tesseract-ocr"]
path = TesseractOCR/tesseract-ocr
url = https://github.com/tesseract-ocr/tesseract.git
[submodule "TesseractOCR/libtiff-ios"]
path = TesseractOCR/libtiff-ios
url = https://github.com/ashtons/libtiff-ios.git
2 changes: 1 addition & 1 deletion Podfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ DEPENDENCIES:
SPEC CHECKSUMS:
Kiwi: f038a6c61f7a9e4d7766bff5717aa3b3fdb75f55

COCOAPODS: 0.36.1
COCOAPODS: 0.36.4
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ Easy and fast.**
These are the current versions of the upstream bundled libraries within the framework that this repository provides:

* Tesseract 3.03-rc1 ([homepage](https://code.google.com/p/tesseract-ocr/))
* Leptonica 1.71 ([homepage](http://leptonica.org/))
* Leptonica 1.72 ([homepage](http://leptonica.org/))
* Libtiff 4.0.4 ([homepage](http://www.remotesensing.org/libtiff/))
* Libpng 1.6.17 ([homepage](http://www.libpng.org/pub/png/libpng.html))
* Libjpeg 9a ([homepage](http://libjpeg.sourceforge.net/))

Getting Started
=================
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,7 @@
OTHER_LDFLAGS = (
"-lstdc++",
"-ObjC",
"-lz",
);
PRODUCT_NAME = "$(TARGET_NAME)";
WRAPPER_EXTENSION = app;
Expand All @@ -337,6 +338,7 @@
OTHER_LDFLAGS = (
"-lstdc++",
"-ObjC",
"-lz",
);
PRODUCT_NAME = "$(TARGET_NAME)";
WRAPPER_EXTENSION = app;
Expand Down
1,448 changes: 1,216 additions & 232 deletions Tesseract OCR iOS.xcodeproj/project.pbxproj

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions TesseractOCR/G8Tesseract.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,14 @@ extern NSInteger const kG8MaxCredibleResolution;
*/
- (NSString *)recognizedHOCRForPageNumber:(int)pageNumber;

/**
* Recognizes the given images and produces a single PDF document from them.
*
* @param images An array of the input images being recognized and
* included into the output PDF file. Elements that are not
* `UIImage` instances are skipped.
* @return NSData representing the output PDF file, or `nil` if the PDF
* document could not be started, a page failed to be processed,
* or the document could not be finished.
*/
- (NSData *)recognizedPDFForImages:(NSArray*)images;

/**
* Run Tesseract's page analysis on the target image.
*
Expand Down
44 changes: 44 additions & 0 deletions TesseractOCR/G8Tesseract.mm
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#import "allheaders.h"
#import "genericvector.h"
#import "strngs.h"
#import "renderer.h"

NSInteger const kG8DefaultResolution = 72;
NSInteger const kG8MinCredibleResolution = 70;
Expand Down Expand Up @@ -644,6 +645,49 @@ - (NSString *)recognizedHOCRForPageNumber:(int)pageNumber {
return nil;
}

/**
 * Recognizes every UIImage in `images` and renders the results into one PDF.
 *
 * Each image is converted to a 1-bit-per-pixel Pix before being handed to
 * Tesseract. Elements of `images` that are not UIImage instances are skipped
 * (their index is still used as the page number of the following pages).
 *
 * @param images An array of UIImage objects to recognize, in page order.
 * @return The bytes of the resulting PDF, or nil if the document could not be
 *         started, an image could not be converted, a page failed to be
 *         processed, or the document could not be finished.
 */
- (NSData *)recognizedPDFForImages:(NSArray*)images {

    NSString *path = [self.absoluteDataPath stringByAppendingPathComponent:@"tessdata"];

    // The renderer is a heap-allocated C++ object that ARC knows nothing about,
    // so it has to be deleted explicitly on every exit path below.
    tesseract::TessPDFRenderer *renderer = new tesseract::TessPDFRenderer(path.fileSystemRepresentation);

    // Begin producing output
    const char *kUnknownTitle = "Unknown Title";
    if (!renderer->BeginDocument(kUnknownTitle)) {
        delete renderer;
        return nil;
    }

    bool result = true;
    const NSUInteger imageCount = images.count;
    for (NSUInteger index = 0; index < imageCount && result; index++) {
        id candidate = images[index];
        if (![candidate isKindOfClass:[UIImage class]]) {
            continue;
        }

        const int page = (int)index;

        Pix *pixs = [self pixForImage:(UIImage *)candidate];
        Pix *pix = (pixs != NULL) ? pixConvertTo1(pixs, UINT8_MAX / 2) : NULL;
        pixDestroy(&pixs);

        if (pix == NULL) {
            // The image could not be converted; do not hand a NULL Pix to Tesseract.
            result = false;
            break;
        }

        // Keep the NSString in a local so the C string stays valid for the call.
        NSString *pageTitle = [NSString stringWithFormat:@"page #%i", page];
        result = _tesseract->ProcessPage(pix, page, pageTitle.UTF8String, NULL, 0, renderer);
        pixDestroy(&pix);
    }

    // Finish producing output (only when every page succeeded)
    if (!result || !renderer->EndDocument()) {
        delete renderer;
        return nil;
    }

    const char *pdfData = NULL;
    int pdfDataLength = 0;
    renderer->GetOutput(&pdfData, &pdfDataLength);

    // dataWithBytes:length: copies the buffer, so the renderer that owns
    // `pdfData` can be released right after.
    NSData *data = [NSData dataWithBytes:pdfData length:(NSUInteger)pdfDataLength];
    delete renderer;

    return data;
}

- (UIImage *)imageWithBlocks:(NSArray *)blocks drawText:(BOOL)drawText thresholded:(BOOL)thresholded
{
UIImage *image = thresholded ? self.thresholdedImage : self.image;
Expand Down
10 changes: 2 additions & 8 deletions TesseractOCR/README_howto_compile_libaries.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# Compiling the .a library files your self


### Step 1 - Prerequisites
First you need to install these tools:
Expand All @@ -12,16 +10,12 @@ First you need to install these tools:


### Step 2 - Download and unzip
Download the source code for `leptonica-1.71` from the Internet here: http://leptonica.org/source/leptonica-1.71.tar.gz.
Unzip it in this folder.


Download the source code for `tesseract-3.03` from the Internet here: https://tesseract-ocr.googlecode.com/archive/3.03-rc1.tar.gz.
Download the source code for `leptonica-1.72` from the Internet here: http://leptonica.org/source/leptonica-1.72.tar.gz.
Unzip it in this folder.


### Step 3 - Compile
Run `build_dependencies.sh` from the terminal. This first compiles leptonica and then tesseract for every architecture iOS/simulator uses (arm7 arm7s arm64 i386 x86_64), and then combines the resulting libs into one library file. It does this for both leptonica and tesseract, so the final results of the script are "libtesseract_all.a", "liblept.a", and "include" directories for both leptonica and tesseract. Finally, the script copies these results into the "lib" and "include" directories inside this directory.
Run `build_dependencies.sh` from the terminal. This first compiles dependent libraries (png, jpeg, tiff, leptonica) and then tesseract for every architecture iOS/simulator uses (arm7 arm7s arm64 i386 x86_64), and then combines the resulting libs into one library file. It does this for both dependent libraries and tesseract, so the final results of the script are "libpng.a", "libpng16.a", "libjpeg.a", "libtiff.a", "libtiffxx.a", "libtesseract_all.a", "liblept.a", and "include" directories for both leptonica and tesseract. Finally, the script copies these results into the "lib" and "include" directories inside this directory.


### Step 4 - Build
Expand Down
34 changes: 28 additions & 6 deletions TesseractOCR/build_dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@

GLOBAL_OUTDIR="`pwd`/build"
LOCAL_OUTDIR="./outdir"
LEPTON_LIB="`pwd`/leptonica-1.71"
TESSERACT_LIB="`pwd`/tesseract-3.03"
TESSERACT_LIB="`pwd`/tesseract-ocr"
LEPTON_LIB="`pwd`/leptonica-1.72"
IMAGE_LIBS="`pwd`/libtiff-ios/dependencies"

IOS_BASE_SDK="7.0"
IOS_DEPLOY_TGT="7.0"
Expand Down Expand Up @@ -37,9 +38,10 @@ arch_names=(arm-apple-darwin7 arm-apple-darwin7s arm-apple-darwin64 i386-apple-d

setenv_all() {
# Add internal libs
export CFLAGS="$CFLAGS -I$GLOBAL_OUTDIR/include -L$GLOBAL_OUTDIR/lib -Qunused-arguments"
export LIBS="-lz -lpng -ljpeg -ltiff"
export CFLAGS="$CFLAGS -I$IMAGE_LIBS/include -L$IMAGE_LIBS/lib -I$GLOBAL_OUTDIR/include -L$GLOBAL_OUTDIR/lib -Qunused-arguments"

export LDFLAGS="-L$SDKROOT/usr/lib/"
export LDFLAGS="-L$SDKROOT/usr/lib/ -L$IMAGE_LIBS/lib/"

export CPPFLAGS=$CFLAGS
export CXXFLAGS=$CFLAGS
Expand Down Expand Up @@ -126,6 +128,25 @@ cd -

rm -rf $GLOBAL_OUTDIR lib include

#######################
# Download dependencies
#######################

git submodule init
git submodule update

#######################
# Build libtiff and all of its dependencies
#######################

cd libtiff-ios
./build-png.sh
./build-jpg.sh
./build-tiff.sh
cd ..

mkdir -p $GLOBAL_OUTDIR/lib && cp -rvf $IMAGE_LIBS/lib/lib*.a $GLOBAL_OUTDIR/lib

#######################
# LEPTONLIB
#######################
Expand All @@ -138,7 +159,8 @@ mkdir -p "$LOCAL_OUTDIR/${archs[$n]}"
make clean 2> /dev/null
make distclean 2> /dev/null
eval "setenv_${archs[$n]}"
./configure --host="${arch_names[$n]}" --enable-shared=no --disable-programs --without-zlib --without-libpng --without-jpeg --without-giflib --without-libtiff
env
./configure --host="${arch_names[$n]}" --enable-shared=no --disable-programs --with-zlib --with-libpng --with-jpeg --without-giflib --with-libtiff
make -j12
cp -rvf src/.libs/lib*.a "$LOCAL_OUTDIR/${archs[$n]}"
done
Expand Down Expand Up @@ -182,6 +204,6 @@ cd ..

cp -rf $GLOBAL_OUTDIR/include .
mkdir -p lib
cp -rf $GLOBAL_OUTDIR/lib/libtesseract_all.a $GLOBAL_OUTDIR/lib/liblept.a lib/
cp -rf $GLOBAL_OUTDIR/lib/lib*.a lib/

echo "Finished!"
Loading

0 comments on commit f24bb99

Please sign in to comment.