TextLang: better linebreaks at em-dash (EN/ES/FR) (#365)

Requires libunibreak patch that adds lb_get_char_class().
koreader · Aug 17, 2020 · 44bb1fc · 44bb1fc
1 parent 1f9b2cc
commit 44bb1fc
Show file tree

Hide file tree

Showing 4 changed files with 104 additions and 8 deletions.
diff --git a/crengine/include/textlang.h b/crengine/include/textlang.h
@@ -93,7 +93,9 @@ class TextLangMan
 
 #define MAX_NB_LB_PROPS_ITEMS 10 // for our statically sized array (increase if needed)
 
-typedef lChar16 (*lb_char_sub_func_t)(const lChar16 * text, int pos, int next_usable);
+#if USE_LIBUNIBREAK==1
+typedef lChar16 (*lb_char_sub_func_t)(struct LineBreakContext *lbpCtx, const lChar16 * text, int pos, int next_usable);
+#endif
 
 class TextLangCfg
 {

diff --git a/crengine/src/lvrend.cpp b/crengine/src/lvrend.cpp
@@ -10086,7 +10086,7 @@ void getRenderedWidths(ldomNode * node, int &maxWidth, int &minWidth, int direct
                 lChar16 c = *(txt + start + i);
                 lChar16 next_c = *(txt + start + i + 1); // might be 0 at end of string
                 if ( lang_cfg->hasLBCharSubFunc() ) {
-                    next_c = lang_cfg->getLBCharSubFunc()(txt+start, i+1, len-1 - (i+1));
+                    next_c = lang_cfg->getLBCharSubFunc()(&lbCtx, txt+start, i+1, len-1 - (i+1));
                 }
                 int brk = lb_process_next_char(&lbCtx, (utf32_t)next_c);
                     // We don't really need to bother with consecutive spaces (that

diff --git a/crengine/src/lvtextfm.cpp b/crengine/src/lvtextfm.cpp
@@ -1400,7 +1400,7 @@ class LVFormatter {
                     if ( src->lang_cfg->hasLBCharSubFunc() ) {
                         // Lang specific function may want to substitute char (for
                         // libunibreak only) to tweak line breaking around it
-                        ch = src->lang_cfg->getLBCharSubFunc()(m_text, pos, len-1 - k);
+                        ch = src->lang_cfg->getLBCharSubFunc()(&lbCtx, m_text, pos, len-1 - k);
                     }
                     int brk = lb_process_next_char(&lbCtx, (utf32_t)ch);
                     if ( pos > 0 ) {

diff --git a/crengine/src/textlang.cpp b/crengine/src/textlang.cpp
@@ -436,7 +436,91 @@ static quotes_spec _quotes_spec_table[] = {
 static quotes_spec _quotes_spec_default = { "", L"\x201c", L"\x201d", L"\x2018", L"\x2019" };
 
 #if USE_LIBUNIBREAK==1
-lChar16 lb_char_sub_func_polish(const lChar16 * text, int pos, int next_usable) {
+lChar16 lb_char_sub_func_english(struct LineBreakContext *lbpCtx, const lChar16 * text, int pos, int next_usable) {
+    // https://github.com/koreader/crengine/issues/364
+    //
+    // Purpose: substitute the char fed to libunibreak for em-dashes (U+2014,
+    // U+2E3A, U+2E3B) so that a dash sitting at the start or end of a word
+    // does not get separated from that word by a line break.
+    //
+    // Parameters:
+    //   lbpCtx      - line break context, only forwarded to lb_get_char_class()
+    //   text        - text being processed
+    //   pos         - index in text of the char to (possibly) substitute
+    //   next_usable - number of chars after pos that may be examined
+    //                 (text[pos+next_usable] is the last one looked at)
+    // Returns text[pos] itself for anything but an em-dash, otherwise the
+    // em-dash or one of '{', '}', '"' as described below.
+    //
+    // Normally, line breaks are allowed at both sides of an em-dash.
+    // When an em-dash is at the "end of a word" (or beginning), we want to avoid separating it from its word,
+    // this is detected by looking for letters/numbers at both sides of the dash, if on any side a space
+    // is closer than any letter/number, treat it as a non-breakable dash.
+    // The current implementation does not allow examining the following characters beyond the current node,
+    // so the detection is not perfect and we replace the dash with "opening" or "closing" characters
+    // (or "ambiguous"), to play safer (note that "}" allows a break after, while ")" doesn't).
+    //
+    // The intent is the following:
+    //   blah—blah                     ->  —  (break before or after)
+    //   blah “—blah , <p>—blah        ->  {  (do not break after)
+    //   blah—” Blah , blah—”</p>      ->  }  (do not break before)
+    //   blah — blah , blah —<em>blah  ->  "  (break only at spaces)
+    const lChar16 ch = text[pos];
+    if ( ch != 0x2014 && ch != 0x2E3A && ch != 0x2E3B ) {
+        // Not an em dash, two-em dash or three-em dash: nothing to substitute
+        return ch;
+    }
+
+    // 1. Detect no-break on right (scan left of dash)
+    //    We assume "no break after" until a letter/number is met before any
+    //    space: this also covers a dash at the very beginning of text (pos == 0)
+    //    and a run of non-space, non-alphanumeric chars reaching the beginning.
+    bool no_break_after = true;
+    for ( int i = pos - 1; i >= 0; i-- ) {
+        const enum LineBreakClass lbc = lb_get_char_class(lbpCtx, text[i]);
+        if ( lbc == LBP_AL || lbc == LBP_NU ) {
+            // found word / number
+            no_break_after = false;
+            break;
+        }
+        if ( lbc == LBP_SP ) {
+            // found space
+            break;
+        }
+    }
+
+    // 2. Detect no-break on left (scan right of dash)
+    //    Same logic, bounded by the end of the current text node (there could
+    //    be letters beyond it, but we can't see them).
+    //    When next_usable <= 0 the loop does not run and we play safe by
+    //    forbidding the break. Previously only next_usable == 0 was handled,
+    //    so a negative value silently skipped this check.
+    //    NOTE(review): the lvrend.cpp caller passes len-1 - (i+1), which
+    //    presumably can be negative at the end of a segment - confirm.
+    bool no_break_before = true;
+    const int last_usable_pos = pos + next_usable;
+    for ( int i = pos + 1; i <= last_usable_pos; i++ ) {
+        const enum LineBreakClass lbc = lb_get_char_class(lbpCtx, text[i]);
+        if ( lbc == LBP_AL || lbc == LBP_NU ) {
+            // found word / number
+            no_break_before = false;
+            break;
+        }
+        if ( lbc == LBP_SP ) {
+            // found space
+            break;
+        }
+    }
+
+    // 3. Pick the substitute:
+    //      '"' if no-break on both sides,
+    //      '{' if no-break on right only,
+    //      '}' if no-break on left only,
+    //      the dash itself otherwise.
+    if ( no_break_after && no_break_before )
+        return '"';
+    if ( no_break_after )
+        return '{';
+    if ( no_break_before )
+        return '}';
+    return ch;
+}
+
+lChar16 lb_char_sub_func_polish(struct LineBreakContext *lbpCtx, const lChar16 * text, int pos, int next_usable) {
     // https://github.com/koreader/koreader/issues/5645#issuecomment-559193057
     // Letters aiouwzAIOUWS are prepositions that should not be left at the
     // end of a line.
@@ -466,7 +550,7 @@ lChar16 lb_char_sub_func_polish(const lChar16 * text, int pos, int next_usable)
     return text[pos];
 }
 
-lChar16 lb_char_sub_func_czech_slovak(const lChar16 * text, int pos, int next_usable) {
+lChar16 lb_char_sub_func_czech_slovak(struct LineBreakContext *lbpCtx, const lChar16 * text, int pos, int next_usable) {
     // Same for Czech and Slovak : AIiVvOoUuSsZzKk
     // https://tex.stackexchange.com/questions/27780/one-letter-word-at-the-end-of-line
     // https://github.com/michal-h21/luavlna
@@ -582,6 +666,10 @@ TextLangCfg::TextLangCfg( lString16 lang_tag ) {
     bool has_left_double_angle_quotation_mark_closing = false;
     bool has_right_double_angle_quotation_mark_opening = false;  // U+00BB »
     bool has_right_double_angle_quotation_mark_closing = false;
+    // Additional rule for treating em-dashes as e.g. "horizontal bar"
+    // This is appropriate for languages that typically have a space at a
+    // breakable side of the dash
+    bool has_em_dash_alphabetic = false; // U+2014 —, U+2E3A ⸺, U+2E3B ⸻
 
     // Note: these macros use 'lang_tag'.
     if ( LANG_STARTS_WITH(("en")) ) { // English
@@ -597,6 +685,7 @@ TextLangCfg::TextLangCfg( lString16 lang_tag ) {
         has_right_single_angle_quotation_mark_closing = true;
         has_left_double_angle_quotation_mark_opening = true;
         has_right_double_angle_quotation_mark_closing = true;
+        has_em_dash_alphabetic = true;
     }
     else if ( LANG_STARTS_WITH(("de")) ) { // German
         has_left_single_quotation_mark_closing = true;
@@ -631,6 +720,7 @@ TextLangCfg::TextLangCfg( lString16 lang_tag ) {
     _lb_props[n++] = { 0x00AD, 0x00AD, LBP_ZWJ };
     if ( has_right_double_angle_quotation_mark_opening ) _lb_props[n++] = { 0x00BB, 0x00BB, LBP_OP };
     if ( has_right_double_angle_quotation_mark_closing ) _lb_props[n++] = { 0x00BB, 0x00BB, LBP_CL };
+    if ( has_em_dash_alphabetic )                        _lb_props[n++] = { 0x2014, 0x2014, LBP_AL };
     if ( has_left_single_quotation_mark_opening )        _lb_props[n++] = { 0x2018, 0x2018, LBP_OP };
     if ( has_left_single_quotation_mark_closing )        _lb_props[n++] = { 0x2018, 0x2018, LBP_CL };
     if ( has_right_single_quotation_mark_opening )       _lb_props[n++] = { 0x2019, 0x2019, LBP_OP };
@@ -644,20 +734,24 @@ TextLangCfg::TextLangCfg( lString16 lang_tag ) {
     if ( has_left_single_angle_quotation_mark_closing )  _lb_props[n++] = { 0x2039, 0x2039, LBP_CL };
     if ( has_right_single_angle_quotation_mark_opening ) _lb_props[n++] = { 0x203A, 0x203A, LBP_OP };
     if ( has_right_single_angle_quotation_mark_closing ) _lb_props[n++] = { 0x203A, 0x203A, LBP_CL };
+    if ( has_em_dash_alphabetic )                        _lb_props[n++] = { 0x2E3A, 0x2E3B, LBP_AL };
     // End of list
     _lb_props[n++] = { 0, 0, LBP_Undefined };
     // Done with libunibreak per-language LineBreakProperties extensions
 
     // Other line breaking and text layout tweaks
     _lb_char_sub_func = NULL;
-    if ( LANG_STARTS_WITH(("pl")) ) { // Polish
+    if ( LANG_STARTS_WITH(("en")) ) { // English
+        _lb_char_sub_func = &lb_char_sub_func_english;
+    }
+    else if ( LANG_STARTS_WITH(("pl")) ) { // Polish
         _lb_char_sub_func = &lb_char_sub_func_polish;
         _duplicate_real_hyphen_on_next_line = true;
     }
-    if ( LANG_STARTS_WITH(("cs") ("sk")) ) { // Czech, Slovak
+    else if ( LANG_STARTS_WITH(("cs") ("sk")) ) { // Czech, Slovak
         _lb_char_sub_func = &lb_char_sub_func_czech_slovak;
     }
-    if ( LANG_STARTS_WITH(("pt")) ) { // Portuguese
+    else if ( LANG_STARTS_WITH(("pt")) ) { // Portuguese
         _duplicate_real_hyphen_on_next_line = true;
     }
 #endif