From 6700edd8bcdd2a4e86bca7727152f929ea660f21 Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Tue, 1 Mar 2016 13:41:19 -0500 Subject: [PATCH] Cleanup TSV renderer Remove all references to hocr, hocr.tsv, etc. Remove dead code for font info, input filename, HTML escapes. Improved comments. Fixed indentation. --- api/baseapi.cpp | 159 ++++++++++++++------------------------ api/baseapi.h | 6 +- api/renderer.cpp | 23 +++--- api/renderer.h | 8 +- api/tesseractmain.cpp | 4 +- ccmain/tesseractclass.cpp | 2 +- ccmain/tesseractclass.h | 2 +- tessdata/configs/hocrtsv | 2 - tessdata/configs/tsv | 2 + 9 files changed, 83 insertions(+), 125 deletions(-) delete mode 100644 tessdata/configs/hocrtsv create mode 100644 tessdata/configs/tsv diff --git a/api/baseapi.cpp b/api/baseapi.cpp index 3546466da5..ab3cb2fd56 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -1417,7 +1417,7 @@ static void AddBoxTohOCR(const ResultIterator *it, *hocr_str += "\">"; } -static void AddBoxTohOCRTSV(const PageIterator *it, +static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, STRING* hocr_str) { int left, top, right, bottom; @@ -1615,57 +1615,31 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) { } /** - * Make a TSV-formatted string with hOCR markup from the internal - * data structures. + * Make a TSV-formatted string from the internal data structures. * page_number is 0-based but will appear in the output as 1-based. - * Image name/input_file_ can be set by SetInputName before calling - * GetHOCRText - * STL removed from original patch submission and refactored by rays. */ -char* TessBaseAPI::GetHOCRTSVText(int page_number) { +char* TessBaseAPI::GetTSVText(int page_number) { if (tesseract_ == NULL || (page_res_ == NULL && Recognize(NULL) < 0)) return NULL; int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1; - int page_id = page_number + 1; // hOCR uses 1-based page numbers. - bool font_info = false; - GetBoolVariable("hocr_font_info", &font_info); - - STRING hocr_str(""); + int page_id = page_number + 1; // we use 1-based page numbers. - if (input_file_ == NULL) - SetInputName(NULL); - -#ifdef _WIN32 - // convert input name from ANSI encoding to utf-8 - int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, - NULL, NULL); - wchar_t *uni16_str = new WCHAR[str16_len]; - str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, - uni16_str, str16_len); - int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL, - NULL, NULL, NULL); - char *utf8_str = new char[utf8_len]; - WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, - utf8_len, NULL, NULL); - *input_file_ = utf8_str; - delete[] uni16_str; - delete[] utf8_str; -#endif + STRING tsv_str(""); int page_num = page_id, block_num = 0, par_num = 0, line_num = 0, word_num = 0; - hocr_str.add_str_int("1\t", page_num); - hocr_str.add_str_int("\t", block_num); - hocr_str.add_str_int("\t", par_num); - hocr_str.add_str_int("\t", line_num); - hocr_str.add_str_int("\t", word_num); - hocr_str.add_str_int("\t", rect_left_); - hocr_str.add_str_int("\t", rect_top_); - hocr_str.add_str_int("\t", rect_width_); - hocr_str.add_str_int("\t", rect_height_); - hocr_str += "\t-1\t\n"; + tsv_str.add_str_int("1\t", page_num); // level 1 - page + tsv_str.add_str_int("\t", block_num); + tsv_str.add_str_int("\t", par_num); + tsv_str.add_str_int("\t", line_num); + tsv_str.add_str_int("\t", word_num); + tsv_str.add_str_int("\t", rect_left_); + tsv_str.add_str_int("\t", rect_top_); + tsv_str.add_str_int("\t", rect_width_); + tsv_str.add_str_int("\t", rect_height_); + tsv_str += "\t-1\t\n"; ResultIterator *res_it = GetIterator(); while (!res_it->Empty(RIL_BLOCK)) { @@ -1674,36 +1648,36 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) { continue; } - // Open any new block/paragraph/textline. + // Add rows for any new block/paragraph/textline. if (res_it->IsAtBeginningOf(RIL_BLOCK)) { block_num++, par_num = 0, line_num = 0, word_num = 0; - hocr_str.add_str_int("2\t", page_num); - hocr_str.add_str_int("\t", block_num); - hocr_str.add_str_int("\t", par_num); - hocr_str.add_str_int("\t", line_num); - hocr_str.add_str_int("\t", word_num); - AddBoxTohOCRTSV(res_it, RIL_BLOCK, &hocr_str); - hocr_str += "\t-1\t\n"; + tsv_str.add_str_int("2\t", page_num); // level 2 - block + tsv_str.add_str_int("\t", block_num); + tsv_str.add_str_int("\t", par_num); + tsv_str.add_str_int("\t", line_num); + tsv_str.add_str_int("\t", word_num); + AddBoxToTSV(res_it, RIL_BLOCK, &tsv_str); + tsv_str += "\t-1\t\n"; // end of row for block } if (res_it->IsAtBeginningOf(RIL_PARA)) { par_num++, line_num = 0, word_num = 0; - hocr_str.add_str_int("3\t", page_num); - hocr_str.add_str_int("\t", block_num); - hocr_str.add_str_int("\t", par_num); - hocr_str.add_str_int("\t", line_num); - hocr_str.add_str_int("\t", word_num); - AddBoxTohOCRTSV(res_it, RIL_PARA, &hocr_str); - hocr_str += "\t-1\t\n"; + tsv_str.add_str_int("3\t", page_num); // level 3 - paragraph + tsv_str.add_str_int("\t", block_num); + tsv_str.add_str_int("\t", par_num); + tsv_str.add_str_int("\t", line_num); + tsv_str.add_str_int("\t", word_num); + AddBoxToTSV(res_it, RIL_PARA, &tsv_str); + tsv_str += "\t-1\t\n"; // end of row for para } if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { line_num++, word_num = 0; - hocr_str.add_str_int("4\t", page_num); - hocr_str.add_str_int("\t", block_num); - hocr_str.add_str_int("\t", par_num); - hocr_str.add_str_int("\t", line_num); - hocr_str.add_str_int("\t", word_num); - AddBoxTohOCRTSV(res_it, RIL_TEXTLINE, &hocr_str); - hocr_str += "\t-1\t\n"; + tsv_str.add_str_int("4\t", page_num); // level 4 - line + tsv_str.add_str_int("\t", block_num); + tsv_str.add_str_int("\t", par_num); + tsv_str.add_str_int("\t", line_num); + tsv_str.add_str_int("\t", word_num); + AddBoxToTSV(res_it, RIL_TEXTLINE, &tsv_str); + tsv_str += "\t-1\t\n"; // end of row for line } // Now, process the word... @@ -1715,49 +1689,34 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) { font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps, &pointsize, &font_id); - word_num++; - hocr_str.add_str_int("5\t", page_num); - hocr_str.add_str_int("\t", block_num); - hocr_str.add_str_int("\t", par_num); - hocr_str.add_str_int("\t", line_num); - hocr_str.add_str_int("\t", word_num); - hocr_str.add_str_int("\t", left); - hocr_str.add_str_int("\t", top); - hocr_str.add_str_int("\t", right - left + 1); - hocr_str.add_str_int("\t", bottom - top + 1); - hocr_str.add_str_int("\t", res_it->Confidence(RIL_WORD)); - bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); - bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); - bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); - hocr_str += "\t"; + word_num++; + tsv_str.add_str_int("5\t", page_num); // level 5 - word + tsv_str.add_str_int("\t", block_num); + tsv_str.add_str_int("\t", par_num); + tsv_str.add_str_int("\t", line_num); + tsv_str.add_str_int("\t", word_num); + tsv_str.add_str_int("\t", left); + tsv_str.add_str_int("\t", top); + tsv_str.add_str_int("\t", right - left + 1); + tsv_str.add_str_int("\t", bottom - top + 1); + tsv_str.add_str_int("\t", res_it->Confidence(RIL_WORD)); + tsv_str += "\t"; + + // Increment counts if at end of block/paragraph/textline. + if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) lcnt++; + if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) pcnt++; + if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++; + do { - const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL); -// if (grapheme && grapheme[0] != 0) { -// if (grapheme[1] == 0) { -// hocr_str += HOcrEscape(grapheme); -// } else { - hocr_str += grapheme; -// } -// } - delete []grapheme; + tsv_str += res_it->GetUTF8Text(RIL_SYMBOL); res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); - hocr_str += "\n"; + tsv_str += "\n"; // end of row wcnt++; - // Close any ending block/paragraph/textline. - if (last_word_in_line) { - lcnt++; - } - if (last_word_in_para) { - pcnt++; - } - if (last_word_in_block) { - bcnt++; - } } - char *ret = new char[hocr_str.length() + 1]; - strcpy(ret, hocr_str.string()); + char *ret = new char[tsv_str.length() + 1]; + strcpy(ret, tsv_str.string()); delete res_it; return ret; } diff --git a/api/baseapi.h b/api/baseapi.h index 4d231a8393..c84784a31c 100644 --- a/api/baseapi.h +++ b/api/baseapi.h @@ -603,12 +603,10 @@ class TESS_API TessBaseAPI { char* GetHOCRText(int page_number); /** - * Make a TSV-formatted string with hOCR markup from the internal - * data structures. + * Make a TSV-formatted string from the internal data structures. * page_number is 0-based but will appear in the output as 1-based. */ - char* GetHOCRTSVText(int page_number); - + char* GetTSVText(int page_number); /** * The recognized text is returned as a char* which is coded in the same diff --git a/api/renderer.cpp b/api/renderer.cpp index 127d20053f..172ef49a83 100644 --- a/api/renderer.cpp +++ b/api/renderer.cpp @@ -182,31 +182,32 @@ bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) { /********************************************************************** * HOcr Text Renderer interface implementation **********************************************************************/ -TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase) - : TessResultRenderer(outputbase, "hocr.tsv") { +TessTsvRenderer::TessTsvRenderer(const char *outputbase) + : TessResultRenderer(outputbase, "tsv") { font_info_ = false; } -TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase, bool font_info) - : TessResultRenderer(outputbase, "hocr.tsv") { +TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info) + : TessResultRenderer(outputbase, "tsv") { font_info_ = font_info; } -bool TessHOcrTsvRenderer::BeginDocumentHandler() { +bool TessTsvRenderer::BeginDocumentHandler() { + // Output TSV column headings AppendString("level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n"); return true; } -bool TessHOcrTsvRenderer::EndDocumentHandler() { +bool TessTsvRenderer::EndDocumentHandler() { return true; } -bool TessHOcrTsvRenderer::AddImageHandler(TessBaseAPI* api) { - char* hocrtsv = api->GetHOCRTSVText(imagenum()); - if (hocrtsv == NULL) return false; +bool TessTsvRenderer::AddImageHandler(TessBaseAPI* api) { + char* tsv = api->GetTSVText(imagenum()); + if (tsv == NULL) return false; - AppendString(hocrtsv); - delete[] hocrtsv; + AppendString(tsv); + delete[] tsv; return true; } diff --git a/api/renderer.h b/api/renderer.h index 0713f78e9a..6b47813f7b 100644 --- a/api/renderer.h +++ b/api/renderer.h @@ -163,12 +163,12 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer { }; /** - * Renders tesseract output into an hocr tsv string + * Renders Tesseract output into a TSV string */ -class TESS_API TessHOcrTsvRenderer : public TessResultRenderer { +class TESS_API TessTsvRenderer : public TessResultRenderer { public: - explicit TessHOcrTsvRenderer(const char *outputbase, bool font_info); - explicit TessHOcrTsvRenderer(const char *outputbase); + explicit TessTsvRenderer(const char *outputbase, bool font_info); + explicit TessTsvRenderer(const char *outputbase); protected: virtual bool BeginDocumentHandler(); diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp index 7c62ed5687..9405711350 100644 --- a/api/tesseractmain.cpp +++ b/api/tesseractmain.cpp @@ -299,12 +299,12 @@ void PreloadRenderers(tesseract::TessBaseAPI* api, new tesseract::TessHOcrRenderer(outputbase, font_info)); } - api->GetBoolVariable("tessedit_create_hocrtsv", &b); + api->GetBoolVariable("tessedit_create_tsv", &b); if (b) { bool font_info; api->GetBoolVariable("hocr_font_info", &font_info); renderers->push_back( - new tesseract::TessHOcrTsvRenderer(outputbase, font_info)); + new tesseract::TessTsvRenderer(outputbase, font_info)); } api->GetBoolVariable("tessedit_create_pdf", &b); diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp index 64e0465936..8db50fbd54 100644 --- a/ccmain/tesseractclass.cpp +++ b/ccmain/tesseractclass.cpp @@ -385,7 +385,7 @@ Tesseract::Tesseract() this->params()), BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params()), - BOOL_MEMBER(tessedit_create_hocrtsv, false, "Write .hocr.tsv TSV output file", + BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params()), BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", this->params()), diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index 048215b28b..91d25bc8ae 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -1003,7 +1003,7 @@ class Tesseract : public Wordrec { BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file"); BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file"); BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file"); - BOOL_VAR_H(tessedit_create_hocrtsv, false, "Write .hocr.tsv hOCR-tsv output file"); + BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file"); BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file"); STRING_VAR_H(unrecognised_char, "|", "Output char for unidentified blobs"); diff --git a/tessdata/configs/hocrtsv b/tessdata/configs/hocrtsv deleted file mode 100644 index 8d05478b15..0000000000 --- a/tessdata/configs/hocrtsv +++ /dev/null @@ -1,2 +0,0 @@ -tessedit_create_hocrtsv 1 -tessedit_pageseg_mode 1 diff --git a/tessdata/configs/tsv b/tessdata/configs/tsv new file mode 100644 index 0000000000..11cd6fc97a --- /dev/null +++ b/tessdata/configs/tsv @@ -0,0 +1,2 @@ +tessedit_create_tsv 1 +tessedit_pageseg_mode 1