Add an optional "lang" column to the TSV output.

A new bool parameter tsv_lang_info (default false) makes TessTsvRenderer emit
a "lang" column between "conf" and "text". Paragraph and word rows carry the
value of WordRecognitionLanguage(); page, block and line rows leave it empty.

diff --git a/include/tesseract/baseapi.h b/include/tesseract/baseapi.h
index 103ca7b1c9..afb5595ea5 100644
--- a/include/tesseract/baseapi.h
+++ b/include/tesseract/baseapi.h
@@ -566,6 +566,14 @@ class TESS_API TessBaseAPI {
    */
   char *GetTSVText(int page_number);
 
+  /**
+   * Make a TSV-formatted string from the internal data structures.
+   * If lang_info is true, a language column is added before the text column.
+   * page_number is 0-based but will appear in the output as 1-based.
+   * Returned string must be freed with the delete [] operator.
+   */
+  char *GetTSVText(int page_number, bool lang_info);
+
   /**
    * Make a box file for LSTM training from the internal data structures.
    * Constructs coordinates in the original image - not just the rectangle.
diff --git a/include/tesseract/renderer.h b/include/tesseract/renderer.h
index 6f40523335..f06ceab0ee 100644
--- a/include/tesseract/renderer.h
+++ b/include/tesseract/renderer.h
@@ -197,7 +197,7 @@ class TESS_API TessAltoRenderer : public TessResultRenderer {
  */
 class TESS_API TessTsvRenderer : public TessResultRenderer {
 public:
-  explicit TessTsvRenderer(const char *outputbase, bool font_info);
+  explicit TessTsvRenderer(const char *outputbase, bool lang_info);
   explicit TessTsvRenderer(const char *outputbase);
 
 protected:
@@ -206,7 +206,7 @@ class TESS_API TessTsvRenderer : public TessResultRenderer {
   bool EndDocumentHandler() override;
 
 private:
-  bool font_info_; // whether to print font information
+  bool lang_info_; // whether to print language information
 };
 
 /**
diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp
index a21798429a..c6ead6fc7b 100644
--- a/src/api/baseapi.cpp
+++ b/src/api/baseapi.cpp
@@ -1422,6 +1422,16 @@ static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::st
  * Returned string must be freed with the delete [] operator.
  */
 char *TessBaseAPI::GetTSVText(int page_number) {
+  return GetTSVText(page_number, false);
+}
+
+/**
+ * Make a TSV-formatted string from the internal data structures.
+ * If lang_info is true, a language column is added before the text column.
+ * page_number is 0-based but will appear in the output as 1-based.
+ * Returned string must be freed with the delete [] operator.
+ */
+char *TessBaseAPI::GetTSVText(int page_number, bool lang_info) {
   if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
     return nullptr;
   }
@@ -1434,6 +1444,7 @@ char *TessBaseAPI::GetTSVText(int page_number) {
   int par_num = 0;
   int line_num = 0;
   int word_num = 0;
+  std::string lang;
 
   std::string tsv_str;
   tsv_str += "1\t" + std::to_string(page_num); // level 1 - page
@@ -1445,7 +1456,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
   tsv_str += "\t" + std::to_string(rect_top_);
   tsv_str += "\t" + std::to_string(rect_width_);
   tsv_str += "\t" + std::to_string(rect_height_);
-  tsv_str += "\t-1\t\n";
+  tsv_str += "\t-1";
+  if (lang_info) {
+    tsv_str += "\t" + lang;
+  }
+  tsv_str += "\t\n";
 
   const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator());
   while (!res_it->Empty(RIL_BLOCK)) {
@@ -1466,9 +1481,18 @@ char *TessBaseAPI::GetTSVText(int page_number) {
       tsv_str += "\t" + std::to_string(line_num);
       tsv_str += "\t" + std::to_string(word_num);
       AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str);
-      tsv_str += "\t-1\t\n"; // end of row for block
+      tsv_str += "\t-1";
+      if (lang_info) {
+        tsv_str += "\t";
+      }
+      tsv_str += "\t\n"; // end of row for block
     }
     if (res_it->IsAtBeginningOf(RIL_PARA)) {
+      if (lang_info) {
+        // Guard against a null language pointer, as the word row below does.
+        const char *para_lang = res_it->WordRecognitionLanguage();
+        lang = para_lang ? para_lang : "";
+      }
       par_num++;
       line_num = 0;
       word_num = 0;
@@ -1478,7 +1502,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
       tsv_str += "\t" + std::to_string(line_num);
       tsv_str += "\t" + std::to_string(word_num);
       AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str);
-      tsv_str += "\t-1\t\n"; // end of row for para
+      tsv_str += "\t-1";
+      if (lang_info) {
+        tsv_str += "\t" + lang;
+      }
+      tsv_str += "\t\n"; // end of row for para
     }
     if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
       line_num++;
@@ -1489,7 +1517,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
       tsv_str += "\t" + std::to_string(line_num);
       tsv_str += "\t" + std::to_string(word_num);
       AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str);
-      tsv_str += "\t-1\t\n"; // end of row for line
+      tsv_str += "\t-1";
+      if (lang_info) {
+        tsv_str += "\t";
+      }
+      tsv_str += "\t\n"; // end of row for line
     }
 
     // Now, process the word...
@@ -1506,9 +1538,16 @@ char *TessBaseAPI::GetTSVText(int page_number) {
     tsv_str += "\t" + std::to_string(right - left);
     tsv_str += "\t" + std::to_string(bottom - top);
     tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD));
+    if (lang_info) {
+      const char *word_lang = res_it->WordRecognitionLanguage();
+      tsv_str += "\t";
+      if (word_lang) {
+        tsv_str += word_lang;
+      }
+    }
     tsv_str += "\t";
 
     // Increment counts if at end of block/paragraph/textline.
     if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) {
       lcnt++;
     }
diff --git a/src/api/renderer.cpp b/src/api/renderer.cpp
index 8d4f1adc1b..aa25d905d6 100644
--- a/src/api/renderer.cpp
+++ b/src/api/renderer.cpp
@@ -156,19 +156,23 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI *api) {
  * TSV Text Renderer interface implementation
  **********************************************************************/
 TessTsvRenderer::TessTsvRenderer(const char *outputbase) : TessResultRenderer(outputbase, "tsv") {
-  font_info_ = false;
+  lang_info_ = false;
 }
 
-TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
+TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool lang_info)
     : TessResultRenderer(outputbase, "tsv") {
-  font_info_ = font_info;
+  lang_info_ = lang_info;
 }
 
 bool TessTsvRenderer::BeginDocumentHandler() {
   // Output TSV column headings
   AppendString(
       "level\tpage_num\tblock_num\tpar_num\tline_num\tword_"
-      "num\tleft\ttop\twidth\theight\tconf\ttext\n");
+      "num\tleft\ttop\twidth\theight\tconf\t");
+  if (lang_info_) {
+    AppendString("lang\t");
+  }
+  AppendString("text\n");
   return true;
 }
 
@@ -177,7 +181,7 @@ bool TessTsvRenderer::EndDocumentHandler() {
 }
 
 bool TessTsvRenderer::AddImageHandler(TessBaseAPI *api) {
-  const std::unique_ptr<const char[]> tsv(api->GetTSVText(imagenum()));
+  const std::unique_ptr<const char[]> tsv(api->GetTSVText(imagenum(), lang_info_));
   if (tsv == nullptr) {
     return false;
   }
diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
index fd58ac8746..a34041fc4a 100644
--- a/src/ccmain/tesseractclass.cpp
+++ b/src/ccmain/tesseractclass.cpp
@@ -399,6 +399,7 @@ Tesseract::Tesseract()
                   this->params())
     , BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", this->params())
     , BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model", this->params())
+    , BOOL_MEMBER(tsv_lang_info, false, "Include language info in the .tsv output file", this->params())
     , BOOL_MEMBER(poly_allow_detailed_fx, false,
                   "Allow feature extractors to see the original outline", this->params())
     , BOOL_INIT_MEMBER(tessedit_init_config_only, false,
diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
index 732bb9e62e..345e5cab79 100644
--- a/src/ccmain/tesseractclass.h
+++ b/src/ccmain/tesseractclass.h
@@ -920,6 +920,7 @@ class TESS_API Tesseract : public Wordrec {
   BOOL_VAR_H(tessedit_flip_0O);
   double_VAR_H(tessedit_lower_flip_hyphen);
   double_VAR_H(tessedit_upper_flip_hyphen);
+  BOOL_VAR_H(tsv_lang_info);
   BOOL_VAR_H(rej_trust_doc_dawg);
   BOOL_VAR_H(rej_1Il_use_dict_word);
   BOOL_VAR_H(rej_1Il_trust_permuter_type);
diff --git a/src/tesseract.cpp b/src/tesseract.cpp
index 480815564c..25d1c6aaa4 100644
--- a/src/tesseract.cpp
+++ b/src/tesseract.cpp
@@ -533,9 +533,9 @@ static void PreloadRenderers(tesseract::TessBaseAPI &api,
 
     api.GetBoolVariable("tessedit_create_tsv", &b);
     if (b) {
-      bool font_info;
-      api.GetBoolVariable("hocr_font_info", &font_info);
-      auto renderer = std::make_unique<tesseract::TessTsvRenderer>(outputbase, font_info);
+      bool lang_info = false;
+      api.GetBoolVariable("tsv_lang_info", &lang_info);
+      auto renderer = std::make_unique<tesseract::TessTsvRenderer>(outputbase, lang_info);
       if (renderer->happy()) {
         renderers.push_back(std::move(renderer));
       } else {
diff --git a/tessdata/configs/tsv b/tessdata/configs/tsv
index dc52478177..84fd7adc43 100644
--- a/tessdata/configs/tsv
+++ b/tessdata/configs/tsv
@@ -1 +1,2 @@
 tessedit_create_tsv 1
+tsv_lang_info 0