Skip to content

Commit

Permalink
TextLang: better linebreaks at em-dash (EN/ES/FR) (#365)
Browse files Browse the repository at this point in the history
Requires libunibreak patch that adds lb_get_char_class().
  • Loading branch information
Jellby authored Aug 17, 2020
1 parent 1f9b2cc commit 44bb1fc
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 8 deletions.
4 changes: 3 additions & 1 deletion crengine/include/textlang.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,9 @@ class TextLangMan

#define MAX_NB_LB_PROPS_ITEMS 10 // for our statically sized array (increase if needed)

typedef lChar16 (*lb_char_sub_func_t)(const lChar16 * text, int pos, int next_usable);
#if USE_LIBUNIBREAK==1
// Signature of a per-language "char substitution" callback used while feeding
// text to libunibreak: callers pass the returned lChar16 to
// lb_process_next_char() in place of text[pos], which lets a language tweak
// line breaking around specific characters.
//   lbpCtx      : the libunibreak line breaking context in use by the caller
//   text        : the text buffer being processed
//   pos         : index in text of the char that may be substituted
//   next_usable : number of chars following pos that the callback may inspect
typedef lChar16 (*lb_char_sub_func_t)(struct LineBreakContext *lbpCtx, const lChar16 * text, int pos, int next_usable);
#endif

class TextLangCfg
{
Expand Down
2 changes: 1 addition & 1 deletion crengine/src/lvrend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10086,7 +10086,7 @@ void getRenderedWidths(ldomNode * node, int &maxWidth, int &minWidth, int direct
lChar16 c = *(txt + start + i);
lChar16 next_c = *(txt + start + i + 1); // might be 0 at end of string
if ( lang_cfg->hasLBCharSubFunc() ) {
next_c = lang_cfg->getLBCharSubFunc()(txt+start, i+1, len-1 - (i+1));
next_c = lang_cfg->getLBCharSubFunc()(&lbCtx, txt+start, i+1, len-1 - (i+1));
}
int brk = lb_process_next_char(&lbCtx, (utf32_t)next_c);
// We don't really need to bother with consecutive spaces (that
Expand Down
2 changes: 1 addition & 1 deletion crengine/src/lvtextfm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1400,7 +1400,7 @@ class LVFormatter {
if ( src->lang_cfg->hasLBCharSubFunc() ) {
// Lang specific function may want to substitute char (for
// libunibreak only) to tweak line breaking around it
ch = src->lang_cfg->getLBCharSubFunc()(m_text, pos, len-1 - k);
ch = src->lang_cfg->getLBCharSubFunc()(&lbCtx, m_text, pos, len-1 - k);
}
int brk = lb_process_next_char(&lbCtx, (utf32_t)ch);
if ( pos > 0 ) {
Expand Down
104 changes: 99 additions & 5 deletions crengine/src/textlang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,91 @@ static quotes_spec _quotes_spec_table[] = {
static quotes_spec _quotes_spec_default = { "", L"\x201c", L"\x201d", L"\x2018", L"\x2019" };

#if USE_LIBUNIBREAK==1
lChar16 lb_char_sub_func_polish(const lChar16 * text, int pos, int next_usable) {
lChar16 lb_char_sub_func_english(struct LineBreakContext *lbpCtx, const lChar16 * text, int pos, int next_usable) {
    // Em-dash tweak for English: https://github.com/koreader/crengine/issues/364
    //
    // A line break is normally allowed on either side of an em-dash. When the
    // dash sits at the start or at the end of a word, we'd rather keep it
    // attached to that word. We find that out by looking, on each side of the
    // dash, for the nearest letter/number: if a space (or the edge of the
    // text we're given) is met first on one side, breaking on the *other*
    // side of the dash is prevented.
    // We can't look past the current text node, so this detection is not
    // perfect: to play it safe, the dash is substituted with an "opening",
    // "closing" or "ambiguous" character (note that "}" allows a break
    // after it, while ")" doesn't).
    //
    // Expected results:
    //   blah—blah                     -> —  (break before or after)
    //   blah “—blah , <p>—blah        -> {  (do not break after)
    //   blah—” Blah , blah—”</p>      -> }  (do not break before)
    //   blah — blah , blah —<em>blah  -> "  (break only at spaces)
    const lChar16 dash = text[pos];
    if ( dash != 0x2014 && dash != 0x2E3A && dash != 0x2E3B ) {
        // Neither an em dash, a two-em dash nor a three-em dash: keep as is
        return dash;
    }

    // Look backward: when a space or the start of the text comes before
    // any letter/number, the dash belongs to what follows it (no break
    // on its right).
    bool stick_to_next = ( pos == 0 );
    for ( int i = pos - 1; i >= 0; i-- ) {
        enum LineBreakClass cls = lb_get_char_class(lbpCtx, text[i]);
        if ( cls == LBP_AL || cls == LBP_NU ) {
            // A word or number precedes the dash
            break;
        }
        if ( cls == LBP_SP || i == 0 ) {
            stick_to_next = true;
            break;
        }
    }

    // Look forward: when a space or the end of the usable text comes
    // before any letter/number, the dash belongs to what precedes it (no
    // break on its left). Reaching the end only means the end of the
    // current text node: there could be letters beyond it.
    const int last = pos + next_usable;
    bool stick_to_prev = ( next_usable == 0 );
    for ( int i = pos + 1; i <= last; i++ ) {
        enum LineBreakClass cls = lb_get_char_class(lbpCtx, text[i]);
        if ( cls == LBP_AL || cls == LBP_NU ) {
            // A word or number follows the dash
            break;
        }
        if ( cls == LBP_SP || i == last ) {
            stick_to_prev = true;
            break;
        }
    }

    // Pick the substitute according to the side(s) where breaking is unwanted
    if ( stick_to_next && stick_to_prev )
        return '"'; // no break on either side
    if ( stick_to_next )
        return '{'; // no break on the right
    if ( stick_to_prev )
        return '}'; // no break on the left
    return dash;    // regular dash, breakable on both sides
}

lChar16 lb_char_sub_func_polish(struct LineBreakContext *lbpCtx, const lChar16 * text, int pos, int next_usable) {
// https://github.com/koreader/koreader/issues/5645#issuecomment-559193057
// Letters aiouwzAIOUWS are prepositions that should not be left at the
// end of a line.
Expand Down Expand Up @@ -466,7 +550,7 @@ lChar16 lb_char_sub_func_polish(const lChar16 * text, int pos, int next_usable)
return text[pos];
}

lChar16 lb_char_sub_func_czech_slovak(const lChar16 * text, int pos, int next_usable) {
lChar16 lb_char_sub_func_czech_slovak(struct LineBreakContext *lbpCtx, const lChar16 * text, int pos, int next_usable) {
// Same for Czech and Slovak : AIiVvOoUuSsZzKk
// https://tex.stackexchange.com/questions/27780/one-letter-word-at-the-end-of-line
// https://github.com/michal-h21/luavlna
Expand Down Expand Up @@ -582,6 +666,10 @@ TextLangCfg::TextLangCfg( lString16 lang_tag ) {
bool has_left_double_angle_quotation_mark_closing = false;
bool has_right_double_angle_quotation_mark_opening = false; // U+00BB »
bool has_right_double_angle_quotation_mark_closing = false;
// Additional rule for treating em-dashes as e.g. "horizontal bar"
// This is appropriate for languages that typically have a space at a
// breakable side of the dash
bool has_em_dash_alphabetic = false; // U+2014 —, U+2E3A ⸺, U+2E3B ⸻

// Note: these macros use 'lang_tag'.
if ( LANG_STARTS_WITH(("en")) ) { // English
Expand All @@ -597,6 +685,7 @@ TextLangCfg::TextLangCfg( lString16 lang_tag ) {
has_right_single_angle_quotation_mark_closing = true;
has_left_double_angle_quotation_mark_opening = true;
has_right_double_angle_quotation_mark_closing = true;
has_em_dash_alphabetic = true;
}
else if ( LANG_STARTS_WITH(("de")) ) { // German
has_left_single_quotation_mark_closing = true;
Expand Down Expand Up @@ -631,6 +720,7 @@ TextLangCfg::TextLangCfg( lString16 lang_tag ) {
_lb_props[n++] = { 0x00AD, 0x00AD, LBP_ZWJ };
if ( has_right_double_angle_quotation_mark_opening ) _lb_props[n++] = { 0x00BB, 0x00BB, LBP_OP };
if ( has_right_double_angle_quotation_mark_closing ) _lb_props[n++] = { 0x00BB, 0x00BB, LBP_CL };
if ( has_em_dash_alphabetic ) _lb_props[n++] = { 0x2014, 0x2014, LBP_AL };
if ( has_left_single_quotation_mark_opening ) _lb_props[n++] = { 0x2018, 0x2018, LBP_OP };
if ( has_left_single_quotation_mark_closing ) _lb_props[n++] = { 0x2018, 0x2018, LBP_CL };
if ( has_right_single_quotation_mark_opening ) _lb_props[n++] = { 0x2019, 0x2019, LBP_OP };
Expand All @@ -644,20 +734,24 @@ TextLangCfg::TextLangCfg( lString16 lang_tag ) {
if ( has_left_single_angle_quotation_mark_closing ) _lb_props[n++] = { 0x2039, 0x2039, LBP_CL };
if ( has_right_single_angle_quotation_mark_opening ) _lb_props[n++] = { 0x203A, 0x203A, LBP_OP };
if ( has_right_single_angle_quotation_mark_closing ) _lb_props[n++] = { 0x203A, 0x203A, LBP_CL };
if ( has_em_dash_alphabetic ) _lb_props[n++] = { 0x2E3A, 0x2E3B, LBP_AL };
// End of list
_lb_props[n++] = { 0, 0, LBP_Undefined };
// Done with libunibreak per-language LineBreakProperties extensions

// Other line breaking and text layout tweaks
_lb_char_sub_func = NULL;
if ( LANG_STARTS_WITH(("pl")) ) { // Polish
if ( LANG_STARTS_WITH(("en")) ) { // English
_lb_char_sub_func = &lb_char_sub_func_english;
}
else if ( LANG_STARTS_WITH(("pl")) ) { // Polish
_lb_char_sub_func = &lb_char_sub_func_polish;
_duplicate_real_hyphen_on_next_line = true;
}
if ( LANG_STARTS_WITH(("cs") ("sk")) ) { // Czech, Slovak
else if ( LANG_STARTS_WITH(("cs") ("sk")) ) { // Czech, Slovak
_lb_char_sub_func = &lb_char_sub_func_czech_slovak;
}
if ( LANG_STARTS_WITH(("pt")) ) { // Portuguese
else if ( LANG_STARTS_WITH(("pt")) ) { // Portuguese
_duplicate_real_hyphen_on_next_line = true;
}
#endif
Expand Down

0 comments on commit 44bb1fc

Please sign in to comment.