Skip to content

Commit

Permalink
HTML parser: ensure foster parenting inside tables
Browse files Browse the repository at this point in the history
According to the specs, non-table elements met while
building a table (before we meet a TD/TH), should be
moved outside the table and added as the previous
sibling of the table.
Previously, these elements were kept there, and ignored
by the table rendering code.
  • Loading branch information
poire-z committed Aug 24, 2020
1 parent 1ccfdf1 commit e5f6794
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 20 deletions.
9 changes: 6 additions & 3 deletions crengine/include/lvtinydom.h
Original file line number Diff line number Diff line change
Expand Up @@ -1054,7 +1054,7 @@ struct ldomNode
/// inserts child text
ldomNode * insertChildText( const lString16 & value );
/// inserts child text
ldomNode * insertChildText(const lString8 & value);
ldomNode * insertChildText(const lString8 & value, bool before_last_child=false);
/// remove child
ldomNode * removeChild( lUInt32 index );

Expand Down Expand Up @@ -2597,11 +2597,11 @@ class ldomElementWriter
return _element;
}
lString16 getPath();
void onText( const lChar16 * text, int len, lUInt32 flags );
void onText( const lChar16 * text, int len, lUInt32 flags, bool insert_before_last_child=false );
void addAttribute( lUInt16 nsid, lUInt16 id, const wchar_t * value );
//lxmlElementWriter * pop( lUInt16 id );

ldomElementWriter(ldomDocument * document, lUInt16 nsid, lUInt16 id, ldomElementWriter * parent);
ldomElementWriter(ldomDocument * document, lUInt16 nsid, lUInt16 id, ldomElementWriter * parent, bool insert_before_last_child=false);
~ldomElementWriter();

friend class ldomDocumentWriter;
Expand Down Expand Up @@ -2698,10 +2698,13 @@ class ldomDocumentWriterFilter : public ldomDocumentWriter
bool _bodyTagSeen;
bool _curNodeIsSelfClosing;
bool _curTagIsIgnored;
ldomElementWriter * _curNodeBeforeFostering;
ldomElementWriter * _curFosteredNode;
ldomElementWriter * _lastP;
virtual void AutoClose( lUInt16 tag_id, bool open );
virtual bool AutoOpenClosePop( int step, lUInt16 tag_id );
virtual lUInt16 popUpTo( ldomElementWriter * target, lUInt16 target_id=0, int scope=0 );
virtual bool CheckAndEnsureFosterParenting(lUInt16 tag_id);
virtual void ElementCloseHandler( ldomNode * node ) { node->persist(); }
virtual void appendStyle( const lChar16 * style );
virtual void setClass( const lChar16 * className, bool overrideExisting=false );
Expand Down
7 changes: 7 additions & 0 deletions crengine/src/lvdocview.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4546,6 +4546,13 @@ bool LVDocView::ParseDocument() {
ldomDocumentWriter writer(m_doc);
ldomDocumentWriterFilter writerFilter(m_doc, false,
HTML_AUTOCLOSE_TABLE);
// Note: creating these 2 writers here, and using only one,
// will still have both their destructors called when
// leaving this scope. Each destructor call will have
// ldomDocumentWriter::~ldomDocumentWriter() called, and
// both will do the same work on m_doc. So, beware there
// that this causes no issue.
// We might want to refactor this section to avoid any issue.

LVFileFormatParser * parser = NULL;
if (m_stream->GetSize() >= 5) {
Expand Down
156 changes: 139 additions & 17 deletions crengine/src/lvtinydom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4993,7 +4993,7 @@ bool IsEmptySpace( const lChar16 * text, int len )

static bool IS_FIRST_BODY = false;

ldomElementWriter::ldomElementWriter(ldomDocument * document, lUInt16 nsid, lUInt16 id, ldomElementWriter * parent)
ldomElementWriter::ldomElementWriter(ldomDocument * document, lUInt16 nsid, lUInt16 id, ldomElementWriter * parent, bool insert_before_last_child)
: _parent(parent), _document(document), _tocItem(NULL), _isBlock(true), _isSection(false),
_stylesheetIsSet(false), _bodyEnterCalled(false), _pseudoElementAfterChildIndex(-1)
{
Expand Down Expand Up @@ -5022,8 +5022,12 @@ ldomElementWriter::ldomElementWriter(ldomDocument * document, lUInt16 nsid, lUIn
}
}

if (_parent)
_element = _parent->getElement()->insertChildElement( (lUInt32)-1, nsid, id );
if (_parent) {
lUInt32 index = _parent->getElement()->getChildCount();
if ( insert_before_last_child )
index--;
_element = _parent->getElement()->insertChildElement( index, nsid, id );
}
else
_element = _document->getRootNode(); //->insertChildElement( (lUInt32)-1, nsid, id );
if ( IS_FIRST_BODY && id==el_body ) {
Expand Down Expand Up @@ -7300,15 +7304,15 @@ void ldomElementWriter::onBodyExit()
#endif
}

void ldomElementWriter::onText( const lChar16 * text, int len, lUInt32 )
void ldomElementWriter::onText( const lChar16 * text, int len, lUInt32, bool insert_before_last_child )
{
//logfile << "{t";
{
// normal mode: store text copy
// add text node, if not first empty space string of block node
if ( !_isBlock || _element->getChildCount()!=0 || !IsEmptySpace( text, len ) || (_flags&TXTFLG_PRE) ) {
lString8 s8 = UnicodeToUtf8(text, len);
_element->insertChildText(s8);
_element->insertChildText(s8, insert_before_last_child);
} else {
//CRLog::trace("ldomElementWriter::onText: Ignoring first empty space of block item");
}
Expand Down Expand Up @@ -7591,7 +7595,9 @@ ldomDocumentWriter::~ldomDocumentWriter()
_document->dumpStatistics();
if ( _document->_nodeStylesInvalidIfLoading ) {
// Some pseudoclass like :last-child has been met which has set this flag
// (or, with the HTML parser, foster parenting of invalid element in tables)
printf("CRE: document loaded, but styles re-init needed (cause: peculiar CSS pseudoclasses met)\n");
_document->_nodeStylesInvalidIfLoading = false; // show this message only once
_document->forceReinitStyles();
}
if ( _document->hasRenderData() ) {
Expand Down Expand Up @@ -13010,12 +13016,17 @@ void ldomDocumentWriterFilter::AutoClose( lUInt16 tag_id, bool open )
// Note that a lot of rules and checks in the algorithm are for
// noticing "parser errors", with usually a fallback of ignoring
// it and going on.
// We ensure one tedious requirement: foster parenting of non-table
// elements met while building a table, mostly to not have mis-nested
// content simply ignored and not shown to the user.
// Other tedious requirements not ensured might just have some impact
// on the styling of the content, which should be a minor issue.
//
// It feels that we can simplify it to the following implementation,
// with possibly some cases not handled related to:
// - FORM and form elements (SELECT, INPUT, OPTION...)
// - TEMPLATE, APPLET, OBJECT, MARQUEE
// - Mis-nested HTML/BODY/HEAD
// - Foster parenting of non-table elements inside a table
// - Reconstructing the active formatting elements (B, I...) when
// mis-nested or "on hold" when entering block or table elements.
// - The "adoption agency algorithm" for mis-nested formatting
Expand Down Expand Up @@ -13077,6 +13088,12 @@ lUInt16 ldomDocumentWriterFilter::popUpTo( ldomElementWriter * target, lUInt16 t
}
if ( target_id && tmpId == target_id )
break;
if ( _curFosteredNode && tmp == _curFosteredNode ) {
// If fostering and we're not closing the fostered node itself,
// don't go at closing stuff above the fostered node
tmp = NULL;
break;
}
// Check scope stop tags
bool stop = false;
switch (scope) {
Expand Down Expand Up @@ -13208,9 +13225,19 @@ lUInt16 ldomDocumentWriterFilter::popUpTo( ldomElementWriter * target, lUInt16 t
}
if ( _lastP && _currNode == _lastP )
_lastP = NULL;
bool done = _currNode == target;
ldomElementWriter * tmp = _currNode;
_currNode = _currNode->_parent;
bool done = _currNode == target;
if ( _curFosteredNode && _currNode == _curFosteredNode ) {
// If we meet the fostered node, have it closed but don't
// go at closing above it
done = true;
_currNode = _curNodeBeforeFostering;
_curNodeBeforeFostering = NULL;
_curFosteredNode = NULL;
}
else {
_currNode = _currNode->_parent;
}
ElementCloseHandler( tmp->getElement() );
delete tmp;
if ( done )
Expand Down Expand Up @@ -13498,6 +13525,46 @@ bool ldomDocumentWriterFilter::AutoOpenClosePop( int step, lUInt16 tag_id )

return true;
}
bool ldomDocumentWriterFilter::CheckAndEnsureFosterParenting(lUInt16 tag_id)
{
if ( !_currNode )
return false;
lUInt16 curNodeId = _currNode->getElement()->getNodeId();
if ( curNodeId >= el_table && curNodeId <= el_tr && curNodeId != el_caption ) {
if ( tag_id < el_table || tag_id > el_td ) {
// Non table sub-element met as we expect only a table sub-element.
// Ensure foster parenting: this node (and its content) is to be
// inserted as a previous sibling of the table element we are in
_curNodeBeforeFostering = NULL;
// Look for the containing table element
ldomElementWriter * elem = _currNode;
while ( elem ) {
if ( elem->getElement()->getNodeId() == el_table ) {
break;
}
elem = elem->_parent;
}
if ( elem ) { // found it
_curNodeBeforeFostering = _currNode;
_currNode = elem->_parent; // parent of table
return true; // Insert the new element in _currNode (the parent of this
// table), before its last child (which is this table)
}
}
// We're in a table, and we see an expected sub-table element: all is fine
return false;
}
else if ( _curFosteredNode ) {
// We've been foster parenting: if we see a table sub-element,
// stop foster parenting and restore the original noce
if ( tag_id >= el_table && tag_id <= el_td ) {
popUpTo(_curFosteredNode);
// popUpTo() has restored _currNode to _curNodeBeforeFostering and
// reset _curFosteredNode and _curNodeBeforeFostering to NULL
}
}
return false;
}

ldomNode * ldomDocumentWriterFilter::OnTagOpen( const lChar16 * nsname, const lChar16 * tagname )
{
Expand Down Expand Up @@ -13603,10 +13670,29 @@ ldomNode * ldomDocumentWriterFilter::OnTagOpen( const lChar16 * nsname, const lC
}

bool tag_accepted = true;
bool insert_before_last_child = false;
if (gDOMVersionRequested >= 20200824) { // A little bit more HTML5 conformance
if ( id == el_image )
id = el_img;
tag_accepted = AutoOpenClosePop( PARSER_STEP_TAG_OPENING, id );
// If non-sub-table element opening while we're still
// inside sub-table non-TD/TH elements, we should
// do foster parenting: insert the node as the previous
// sibling of the TABLE element we're dealing with
// https://html.spec.whatwg.org/multipage/parsing.html#foster-parent
if ( CheckAndEnsureFosterParenting(id) ) {
insert_before_last_child = true;
// As we'll be inserting a node before the TABLE, which
// already had its style applied, some CSS selectors matches
// might no more be valid (i.e. :first-child, DIV + TABLE),
// so styles could change on the next re-rendering.
// We don't check if we actually had such selectors as that
// is complicated from here: we just set styles to be invalid
// so they are re-computed once the DOM is fully built.
_document->setNodeStylesInvalidIfLoading();
}
else {
tag_accepted = AutoOpenClosePop( PARSER_STEP_TAG_OPENING, id );
}
}
else {
AutoClose( id, true );
Expand All @@ -13632,8 +13718,13 @@ ldomNode * ldomDocumentWriterFilter::OnTagOpen( const lChar16 * nsname, const lC
return _currNode ? _currNode->getElement() : NULL;
}

_currNode = new ldomElementWriter( _document, nsid, id, _currNode );
_currNode = new ldomElementWriter( _document, nsid, id, _currNode, insert_before_last_child );
_flags = _currNode->getFlags();

if ( insert_before_last_child ) {
_curFosteredNode = _currNode;
}

if (gDOMVersionRequested >= 20200824 && id == el_p) {
// To avoid checking DOM ancestors with the numerous tags that close a P
_lastP = _currNode;
Expand Down Expand Up @@ -13931,16 +14022,35 @@ void ldomDocumentWriterFilter::OnText( const lChar16 * text, int len, lUInt32 fl
//logfile << "lxmlDocumentWriter::OnText() fpos=" << fpos;
if (_currNode)
{
lUInt16 curNodeId = _currNode->getElement()->getNodeId();
if (gDOMVersionRequested < 20200824) {
AutoClose( _currNode->_element->getNodeId(), false );
AutoClose( curNodeId, false );
}
if ( (_flags & XML_FLAG_NO_SPACE_TEXT)
&& IsEmptySpace(text, len) && !(flags & TXTFLG_PRE))
return;
if ( !_currNode->_allowText )
return;
bool insert_before_last_child = false;
if (gDOMVersionRequested >= 20200824) {
// If we're inserting text while in table sub-elements that
// don't accept text, have it foster parented
if ( curNodeId >= el_table && curNodeId <= el_tr && curNodeId != el_caption ) {
if ( !IsEmptySpace(text, len) ) {
if ( CheckAndEnsureFosterParenting(el_NULL) ) {
insert_before_last_child = true;
}
}
}
}
else {
// Previously, text in table sub-elements (only table elements and
// self-closing elements have _allowText=false) had any text in between
// table elements dropped (but not elements! with "<table>abc<div>def",
// "abc" was dropped, but not "def")
if ( !_currNode->_allowText )
return;
}
if ( !_libRuDocumentDetected ) {
_currNode->onText( text, len, flags );
_currNode->onText( text, len, flags, insert_before_last_child );
}
else { // Lib.ru text cleanup
if ( _libRuParagraphStart ) {
Expand Down Expand Up @@ -14001,11 +14111,18 @@ void ldomDocumentWriterFilter::OnText( const lChar16 * text, int len, lUInt32 fl
OnTagOpen( NULL, paraTag );
OnTagBody();
}
_currNode->onText( text, len, flags );
_currNode->onText( text, len, flags, insert_before_last_child );
if ( autoPara )
OnTagClose( NULL, paraTag );
}
}
if ( insert_before_last_child ) {
// We have no _curFosteredNode to pop, so just restore
// the previous table node
_currNode = _curNodeBeforeFostering;
_curNodeBeforeFostering = NULL;
_curFosteredNode = NULL;
}
}
//logfile << " !t!\n";
}
Expand All @@ -14024,6 +14141,8 @@ ldomDocumentWriterFilter::ldomDocumentWriterFilter(ldomDocument * document, bool
, _bodyTagSeen(false)
, _curNodeIsSelfClosing(false)
, _curTagIsIgnored(false)
, _curNodeBeforeFostering(NULL)
, _curFosteredNode(NULL)
, _lastP(NULL)
{
if (gDOMVersionRequested >= 20200824) {
Expand Down Expand Up @@ -17701,7 +17820,7 @@ ldomNode * ldomNode::insertChildText( const lString16 & value )
}

/// inserts child text
ldomNode * ldomNode::insertChildText(const lString8 & s8)
ldomNode * ldomNode::insertChildText(const lString8 & s8, bool before_last_child)
{
ASSERT_NODE_NOT_NULL;
if ( isElement() ) {
Expand All @@ -17715,7 +17834,10 @@ ldomNode * ldomNode::insertChildText(const lString8 & s8)
ldomNode * node = getDocument()->allocTinyNode( NT_PTEXT );
node->_data._ptext_addr = getDocument()->_textStorage.allocText( node->_handle._dataIndex, _handle._dataIndex, s8 );
#endif
me->_children.insert( me->_children.length(), node->getDataIndex() );
int index = me->_children.length();
if ( before_last_child && index > 0 )
index--;
me->_children.insert( index, node->getDataIndex() );
return node;
}
readOnlyError();
Expand Down

0 comments on commit e5f6794

Please sign in to comment.