Skip to content

Commit

Permalink
Try to improve em-dash linebreaks
Browse files Browse the repository at this point in the history
  • Loading branch information
Jellby committed Aug 12, 2020
1 parent 8a15844 commit caa5e86
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 8 deletions.
4 changes: 3 additions & 1 deletion crengine/include/textlang.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,9 @@ class TextLangMan

#define MAX_NB_LB_PROPS_ITEMS 10 // for our statically sized array (increase if needed)

typedef lChar16 (*lb_char_sub_func_t)(const lChar16 * text, int pos, int next_usable);
#if USE_LIBUNIBREAK==1
// Per-language character substitution hook, used only with libunibreak.
// Callers invoke it just before lb_process_next_char() and feed the
// returned character to libunibreak in place of text[pos], which lets a
// language tweak the line breaking opportunities around that character.
//   lbpCtx      : the libunibreak context the caller is feeding
//   text        : the text being processed
//   pos         : index in 'text' of the character about to be processed
//   next_usable : number of characters following 'pos' that the hook may
//                 look at (text[pos+1] .. text[pos+next_usable])
// Returns text[pos] when no substitution is wanted.
typedef lChar16 (*lb_char_sub_func_t)(struct LineBreakContext *lbpCtx, const lChar16 * text, int pos, int next_usable);
#endif

class TextLangCfg
{
Expand Down
2 changes: 1 addition & 1 deletion crengine/src/lvrend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10086,7 +10086,7 @@ void getRenderedWidths(ldomNode * node, int &maxWidth, int &minWidth, int direct
lChar16 c = *(txt + start + i);
lChar16 next_c = *(txt + start + i + 1); // might be 0 at end of string
if ( lang_cfg->hasLBCharSubFunc() ) {
next_c = lang_cfg->getLBCharSubFunc()(txt+start, i+1, len-1 - (i+1));
next_c = lang_cfg->getLBCharSubFunc()(&lbCtx, txt+start, i+1, len-1 - (i+1));
}
int brk = lb_process_next_char(&lbCtx, (utf32_t)next_c);
// We don't really need to bother with consecutive spaces (that
Expand Down
2 changes: 1 addition & 1 deletion crengine/src/lvtextfm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1400,7 +1400,7 @@ class LVFormatter {
if ( src->lang_cfg->hasLBCharSubFunc() ) {
// Lang specific function may want to substitute char (for
// libunibreak only) to tweak line breaking around it
ch = src->lang_cfg->getLBCharSubFunc()(m_text, pos, len-1 - k);
ch = src->lang_cfg->getLBCharSubFunc()(&lbCtx, m_text, pos, len-1 - k);
}
int brk = lb_process_next_char(&lbCtx, (utf32_t)ch);
if ( pos > 0 ) {
Expand Down
84 changes: 79 additions & 5 deletions crengine/src/textlang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,71 @@ static quotes_spec _quotes_spec_table[] = {
static quotes_spec _quotes_spec_default = { "", L"\x201c", L"\x201d", L"\x2018", L"\x2019" };

#if USE_LIBUNIBREAK==1
lChar16 lb_char_sub_func_polish(const lChar16 * text, int pos, int next_usable) {
lChar16 lb_char_sub_func_english(struct LineBreakContext *lbpCtx, const lChar16 * text, int pos, int next_usable) {
    // https://github.com/koreader/crengine/issues/364
    // By default, a line break is allowed on both sides of an em-dash.
    // When the dash sits at the start or at the end of a word, we would
    // rather keep it attached to that word. To detect this, we look on each
    // side of the dash for the nearest letter/number: if a space (or the
    // edge of the text) is met before any letter/number, that side is
    // considered a word boundary and the dash is made non-breakable there.
    // We can't look at characters beyond the current text node, so the
    // detection is not perfect: to play it safe, the dash is substituted
    // with an "opening", a "closing" or an "ambiguous" character
    // (note that "}" allows a break after, while ")" doesn't).
    //
    // Expected results:
    //   blah—blah                        -> —  (break before or after)
    //   blah “—blah , <p>—blah           -> {  (do not break after)
    //   blah—” Blah , blah—”</p>         -> }  (do not break before)
    //   blah — blah , blah —<em>blah     -> "  (break only at spaces)
    const lChar16 dash = text[pos];
    const bool is_long_dash = dash == 0x2014    // em dash
                           || dash == 0x2E3A    // two-em dash
                           || dash == 0x2E3B;   // three-em dash
    if ( !is_long_dash ) {
        // Any other character is left untouched
        return dash;
    }

    // Being at the very start or very end of the text counts as a boundary
    bool boundary_before = ( pos == 0 );
    bool boundary_after = ( next_usable == 0 );

    // Walk backward until a letter/number, a space, or the start of text
    for ( int i = pos - 1; i >= 0; i-- ) {
        const enum LineBreakClass lbc = lb_get_char_class(lbpCtx, text[i]);
        if ( lbc == LBP_AL || lbc == LBP_NU ) {
            // A word or a number comes first: no boundary on this side
            break;
        }
        if ( lbc == LBP_SP || i == 0 ) {
            // A space or the start of text comes first
            boundary_before = true;
            break;
        }
    }

    // Walk forward until a letter/number, a space, or the last usable char
    const int last = pos + next_usable;
    for ( int i = pos + 1; i <= last; i++ ) {
        const enum LineBreakClass lbc = lb_get_char_class(lbpCtx, text[i]);
        if ( lbc == LBP_AL || lbc == LBP_NU ) {
            // A word or a number comes first: no boundary on this side
            break;
        }
        if ( lbc == LBP_SP || i == last ) {
            // A space or the end comes first (the end of the current text
            // node only: there could be letters beyond it)
            boundary_after = true;
            break;
        }
    }

    if ( boundary_before && boundary_after )
        return '"'; // ambiguous: break only at spaces
    if ( boundary_before )
        return '{'; // opening: do not break after
    if ( boundary_after )
        return '}'; // closing: do not break before
    return dash;    // inside a word: regular em-dash breaking
}

lChar16 lb_char_sub_func_polish(struct LineBreakContext *lbpCtx, const lChar16 * text, int pos, int next_usable) {
// https://github.com/koreader/koreader/issues/5645#issuecomment-559193057
// Letters aiouwzAIOUWS are prepositions that should not be left at the
// end of a line.
Expand Down Expand Up @@ -466,7 +530,7 @@ lChar16 lb_char_sub_func_polish(const lChar16 * text, int pos, int next_usable)
return text[pos];
}

lChar16 lb_char_sub_func_czech_slovak(const lChar16 * text, int pos, int next_usable) {
lChar16 lb_char_sub_func_czech_slovak(struct LineBreakContext *lbpCtx, const lChar16 * text, int pos, int next_usable) {
// Same for Czech and Slovak : AIiVvOoUuSsZzKk
// https://tex.stackexchange.com/questions/27780/one-letter-word-at-the-end-of-line
// https://github.com/michal-h21/luavlna
Expand Down Expand Up @@ -582,6 +646,10 @@ TextLangCfg::TextLangCfg( lString16 lang_tag ) {
bool has_left_double_angle_quotation_mark_closing = false;
bool has_right_double_angle_quotation_mark_opening = false; // U+00BB »
bool has_right_double_angle_quotation_mark_closing = false;
// Additional rule for treating em-dashes as e.g. "horizontal bar"
// This is appropriate for languages that typically have a space at a
// breakable side of the dash
bool has_em_dash_alphabetic = false; // U+2014 —, U+2E3A ⸺, U+2E3B ⸻

// Note: these macros use 'lang_tag'.
if ( LANG_STARTS_WITH(("en")) ) { // English
Expand All @@ -597,6 +665,7 @@ TextLangCfg::TextLangCfg( lString16 lang_tag ) {
has_right_single_angle_quotation_mark_closing = true;
has_left_double_angle_quotation_mark_opening = true;
has_right_double_angle_quotation_mark_closing = true;
has_em_dash_alphabetic = true;
}
else if ( LANG_STARTS_WITH(("de")) ) { // German
has_left_single_quotation_mark_closing = true;
Expand Down Expand Up @@ -631,6 +700,7 @@ TextLangCfg::TextLangCfg( lString16 lang_tag ) {
_lb_props[n++] = { 0x00AD, 0x00AD, LBP_ZWJ };
if ( has_right_double_angle_quotation_mark_opening ) _lb_props[n++] = { 0x00BB, 0x00BB, LBP_OP };
if ( has_right_double_angle_quotation_mark_closing ) _lb_props[n++] = { 0x00BB, 0x00BB, LBP_CL };
if ( has_em_dash_alphabetic ) _lb_props[n++] = { 0x2014, 0x2014, LBP_AL };
if ( has_left_single_quotation_mark_opening ) _lb_props[n++] = { 0x2018, 0x2018, LBP_OP };
if ( has_left_single_quotation_mark_closing ) _lb_props[n++] = { 0x2018, 0x2018, LBP_CL };
if ( has_right_single_quotation_mark_opening ) _lb_props[n++] = { 0x2019, 0x2019, LBP_OP };
Expand All @@ -644,20 +714,24 @@ TextLangCfg::TextLangCfg( lString16 lang_tag ) {
if ( has_left_single_angle_quotation_mark_closing ) _lb_props[n++] = { 0x2039, 0x2039, LBP_CL };
if ( has_right_single_angle_quotation_mark_opening ) _lb_props[n++] = { 0x203A, 0x203A, LBP_OP };
if ( has_right_single_angle_quotation_mark_closing ) _lb_props[n++] = { 0x203A, 0x203A, LBP_CL };
if ( has_em_dash_alphabetic ) _lb_props[n++] = { 0x2E3A, 0x2E3B, LBP_AL };
// End of list
_lb_props[n++] = { 0, 0, LBP_Undefined };
// Done with libunibreak per-language LineBreakProperties extensions

// Other line breaking and text layout tweaks
_lb_char_sub_func = NULL;
if ( LANG_STARTS_WITH(("pl")) ) { // Polish
if ( LANG_STARTS_WITH(("en")) ) { // English
_lb_char_sub_func = &lb_char_sub_func_english;
}
else if ( LANG_STARTS_WITH(("pl")) ) { // Polish
_lb_char_sub_func = &lb_char_sub_func_polish;
_duplicate_real_hyphen_on_next_line = true;
}
if ( LANG_STARTS_WITH(("cs") ("sk")) ) { // Czech, Slovak
else if ( LANG_STARTS_WITH(("cs") ("sk")) ) { // Czech, Slovak
_lb_char_sub_func = &lb_char_sub_func_czech_slovak;
}
if ( LANG_STARTS_WITH(("pt")) ) { // Portuguese
else if ( LANG_STARTS_WITH(("pt")) ) { // Portuguese
_duplicate_real_hyphen_on_next_line = true;
}
#endif
Expand Down

0 comments on commit caa5e86

Please sign in to comment.