<?php
/**
* An implementation of the tree building portion of the HTML5 parsing
* spec.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup Parser
* @since 1.27
* @author C. Scott Ananian, 2016
*/
namespace MediaWiki\Tidy;
use Wikimedia\Assert\Assert;
use Wikimedia\Assert\ParameterAssertionException;
use \ExplodeIterator;
use \IteratorAggregate;
use \ReverseArrayIterator;
use \Sanitizer;
// A note for future librarization[1] -- this file is a good candidate
// for splitting into an independent library, except that it is currently
// highly optimized for MediaWiki use. It only implements the portions
// of the HTML5 tree builder used by tags supported by MediaWiki, and
// does not contain a true tokenizer pass, instead relying on
// comment stripping, attribute normalization, and escaping done by
// the MediaWiki Sanitizer. It also deliberately avoids building
// a true DOM in memory, instead serializing elements to an output string
// as soon as possible (usually as soon as the tag is closed) to reduce
// its memory footprint.
// We've been gradually lifting some of these restrictions to handle
// non-sanitized output generated by extensions, but we shortcut the tokenizer
// for speed (primarily by splitting on `<`) and so rely on syntactic
// well-formedness.
// On the other hand, I've been pretty careful to note with comments in the
// code the places where this implementation omits features of the spec or
// depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
// implement the missing pieces and make this a standalone PHP HTML5 parser.
// In order to do so, some sort of MediaWiki-specific API will need
// to be added to (a) allow the Balancer to bypass the tokenizer,
// and (b) support on-the-fly flattening instead of DOM node creation.
// [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
/**
 * Constants and tag-name sets used by the HTML5 tree building code.
 *
 * Every set is an associative array keyed first by namespace URI and
 * then by lower-cased tag name; membership is tested with isset().
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceSets {
    /** Namespace URI for HTML elements. */
    const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
    /** Namespace URI for MathML elements. */
    const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
    /** Namespace URI for SVG elements. */
    const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';

    /**
     * HTML tags grouped as "unsupported". The code consuming this set
     * lives outside this class.
     */
    public static $unsupportedSet = [
        self::HTML_NAMESPACE => [
            'html' => true, 'head' => true, 'body' => true, 'frameset' => true,
            'frame' => true,
            'plaintext' => true, 'isindex' => true,
            'xmp' => true, 'iframe' => true, 'noembed' => true,
            'noscript' => true, 'script' => true,
            'title' => true
        ]
    ];

    /**
     * Elements which BalanceElement::__toString() serializes as a single
     * self-closing tag and which must have no children.
     */
    public static $emptyElementSet = [
        self::HTML_NAMESPACE => [
            'area' => true, 'base' => true, 'basefont' => true,
            'bgsound' => true, 'br' => true, 'col' => true, 'command' => true,
            'embed' => true, 'frame' => true, 'hr' => true, 'img' => true,
            'input' => true, 'keygen' => true, 'link' => true, 'meta' => true,
            'param' => true, 'source' => true, 'track' => true, 'wbr' => true
        ]
    ];

    /**
     * Elements for which a leading linefeed in the content is doubled
     * on serialization.
     */
    public static $extraLinefeedSet = [
        self::HTML_NAMESPACE => [
            'pre' => true, 'textarea' => true, 'listing' => true,
        ]
    ];

    /** The HTML heading elements. */
    public static $headingSet = [
        self::HTML_NAMESPACE => [
            'h1' => true, 'h2' => true, 'h3' => true,
            'h4' => true, 'h5' => true, 'h6' => true
        ]
    ];

    /**
     * Elements grouped as "special", across the HTML, SVG and MathML
     * namespaces.
     */
    public static $specialSet = [
        self::HTML_NAMESPACE => [
            'address' => true, 'applet' => true, 'area' => true,
            'article' => true, 'aside' => true, 'base' => true,
            'basefont' => true, 'bgsound' => true, 'blockquote' => true,
            'body' => true, 'br' => true, 'button' => true, 'caption' => true,
            'center' => true, 'col' => true, 'colgroup' => true, 'dd' => true,
            'details' => true, 'dir' => true, 'div' => true, 'dl' => true,
            'dt' => true, 'embed' => true, 'fieldset' => true,
            'figcaption' => true, 'figure' => true, 'footer' => true,
            'form' => true, 'frame' => true, 'frameset' => true, 'h1' => true,
            'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true,
            'h6' => true, 'head' => true, 'header' => true, 'hgroup' => true,
            'hr' => true, 'html' => true, 'iframe' => true, 'img' => true,
            'input' => true, 'isindex' => true, 'li' => true, 'link' => true,
            'listing' => true, 'main' => true, 'marquee' => true,
            'menu' => true, 'menuitem' => true, 'meta' => true, 'nav' => true,
            'noembed' => true, 'noframes' => true, 'noscript' => true,
            'object' => true, 'ol' => true, 'p' => true, 'param' => true,
            'plaintext' => true, 'pre' => true, 'script' => true,
            'section' => true, 'select' => true, 'source' => true,
            'style' => true, 'summary' => true, 'table' => true,
            'tbody' => true, 'td' => true, 'template' => true,
            'textarea' => true, 'tfoot' => true, 'th' => true, 'thead' => true,
            'title' => true, 'tr' => true, 'track' => true, 'ul' => true,
            'wbr' => true, 'xmp' => true
        ],
        self::SVG_NAMESPACE => [
            'foreignobject' => true, 'desc' => true, 'title' => true
        ],
        self::MATHML_NAMESPACE => [
            'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
            'mtext' => true, 'annotation-xml' => true
        ]
    ];

    /** The address, div and p elements. */
    public static $addressDivPSet = [
        self::HTML_NAMESPACE => [
            'address' => true, 'div' => true, 'p' => true
        ]
    ];

    /**
     * Table, table section and table row elements. When one of these is
     * the current node in foster parent mode, BalanceStack foster-parents
     * inserted content.
     */
    public static $tableSectionRowSet = [
        self::HTML_NAMESPACE => [
            'table' => true, 'thead' => true, 'tbody' => true,
            'tfoot' => true, 'tr' => true
        ]
    ];

    /**
     * Default set popped by BalanceStack::generateImpliedEndTags().
     */
    public static $impliedEndTagsSet = [
        self::HTML_NAMESPACE => [
            'dd' => true, 'dt' => true, 'li' => true, 'optgroup' => true,
            'option' => true, 'p' => true, 'rb' => true, 'rp' => true,
            'rt' => true, 'rtc' => true
        ]
    ];

    /**
     * Larger set popped by BalanceStack::generateImpliedEndTags() when
     * asked to generate end tags "thoroughly".
     */
    public static $thoroughImpliedEndTagsSet = [
        self::HTML_NAMESPACE => [
            'caption' => true, 'colgroup' => true, 'dd' => true, 'dt' => true,
            'li' => true, 'optgroup' => true, 'option' => true, 'p' => true,
            'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true,
            'tbody' => true, 'td' => true, 'tfoot' => true, 'th' => true,
            'thead' => true, 'tr' => true
        ]
    ];

    /** The table cell elements. */
    public static $tableCellSet = [
        self::HTML_NAMESPACE => [
            'td' => true, 'th' => true
        ]
    ];

    /** Elements that delimit a "table context". */
    public static $tableContextSet = [
        self::HTML_NAMESPACE => [
            'table' => true, 'template' => true, 'html' => true
        ]
    ];

    /** Elements that delimit a "table body context". */
    public static $tableBodyContextSet = [
        self::HTML_NAMESPACE => [
            'tbody' => true, 'tfoot' => true, 'thead' => true,
            'template' => true, 'html' => true
        ]
    ];

    /** Elements that delimit a "table row context". */
    public static $tableRowContextSet = [
        self::HTML_NAMESPACE => [
            'tr' => true, 'template' => true, 'html' => true
        ]
    ];

    // See https://html.spec.whatwg.org/multipage/forms.html#form-associated-element
    public static $formAssociatedSet = [
        self::HTML_NAMESPACE => [
            'button' => true, 'fieldset' => true, 'input' => true,
            'keygen' => true, 'object' => true, 'output' => true,
            'select' => true, 'textarea' => true, 'img' => true
        ]
    ];

    /**
     * Boundary elements for BalanceStack::inScope(). This is also the
     * base from which the list-item and button scope sets are derived.
     */
    public static $inScopeSet = [
        self::HTML_NAMESPACE => [
            'applet' => true, 'caption' => true, 'html' => true,
            'marquee' => true, 'object' => true,
            'table' => true, 'td' => true, 'template' => true,
            'th' => true
        ],
        self::SVG_NAMESPACE => [
            'foreignobject' => true, 'desc' => true, 'title' => true
        ],
        self::MATHML_NAMESPACE => [
            'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
            'mtext' => true, 'annotation-xml' => true
        ]
    ];

    /** Lazily built cache for inListItemScopeSet(). */
    private static $inListItemScopeSet = null;

    /**
     * Return $inScopeSet extended with the HTML 'ol' and 'ul' elements.
     * The set is built on first use and cached afterwards.
     * @return array
     */
    public static function inListItemScopeSet() {
        if ( self::$inListItemScopeSet !== null ) {
            return self::$inListItemScopeSet;
        }
        $scope = self::$inScopeSet;
        $scope[self::HTML_NAMESPACE]['ol'] = true;
        $scope[self::HTML_NAMESPACE]['ul'] = true;
        self::$inListItemScopeSet = $scope;
        return self::$inListItemScopeSet;
    }

    /** Lazily built cache for inButtonScopeSet(). */
    private static $inButtonScopeSet = null;

    /**
     * Return $inScopeSet extended with the HTML 'button' element.
     * The set is built on first use and cached afterwards.
     * @return array
     */
    public static function inButtonScopeSet() {
        if ( self::$inButtonScopeSet !== null ) {
            return self::$inButtonScopeSet;
        }
        $scope = self::$inScopeSet;
        $scope[self::HTML_NAMESPACE]['button'] = true;
        self::$inButtonScopeSet = $scope;
        return self::$inButtonScopeSet;
    }

    /** Boundary elements for BalanceStack::inTableScope(). */
    public static $inTableScopeSet = [
        self::HTML_NAMESPACE => [
            'html' => true, 'table' => true, 'template' => true
        ]
    ];

    /**
     * Used by BalanceStack::inSelectScope(), where the scope test is
     * inverted: any element NOT in this set ends the search.
     */
    public static $inInvertedSelectScopeSet = [
        self::HTML_NAMESPACE => [
            'option' => true, 'optgroup' => true
        ]
    ];

    /** Elements tested by BalanceElement::isMathmlTextIntegrationPoint(). */
    public static $mathmlTextIntegrationPointSet = [
        self::MATHML_NAMESPACE => [
            'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
            'mtext' => true
        ]
    ];

    /**
     * SVG elements that BalanceElement::isHtmlIntegrationPoint() accepts
     * unconditionally.
     */
    public static $htmlIntegrationPointSet = [
        self::SVG_NAMESPACE => [
            'foreignobject' => true,
            'desc' => true,
            'title' => true
        ]
    ];

    // For tidy compatibility.
    public static $tidyPWrapSet = [
        self::HTML_NAMESPACE => [
            'body' => true, 'blockquote' => true,
            // We parse with <body> as the fragment context, but the top-level
            // element on the stack is actually <html>. We could use the
            // "adjusted current node" everywhere to work around this, but it's
            // easier just to add <html> to the p-wrap set.
            'html' => true,
        ],
    ];

    /**
     * Elements that may be inserted while a tidy-compat 'mw:p-wrap'
     * wrapper is the current node without closing that wrapper.
     */
    public static $tidyInlineSet = [
        self::HTML_NAMESPACE => [
            'a' => true, 'abbr' => true, 'acronym' => true, 'applet' => true,
            'b' => true, 'basefont' => true, 'bdo' => true, 'big' => true,
            'br' => true, 'button' => true, 'cite' => true, 'code' => true,
            'dfn' => true, 'em' => true, 'font' => true, 'i' => true,
            'iframe' => true, 'img' => true, 'input' => true, 'kbd' => true,
            'label' => true, 'legend' => true, 'map' => true, 'object' => true,
            'param' => true, 'q' => true, 'rb' => true, 'rbc' => true,
            'rp' => true, 'rt' => true, 'rtc' => true, 'ruby' => true,
            's' => true, 'samp' => true, 'select' => true, 'small' => true,
            'span' => true, 'strike' => true, 'strong' => true, 'sub' => true,
            'sup' => true, 'textarea' => true, 'tt' => true, 'u' => true,
            'var' => true,
        ],
    ];
}
/**
 * A BalanceElement is a simplified version of a DOM Node. The main
 * difference is that we only keep BalanceElements around for nodes
 * currently on the BalanceStack of open elements. As soon as an
 * element is closed, with some minor exceptions relating to the
 * tree builder "adoption agency algorithm", the element and all its
 * children are serialized to a string using the flatten() method.
 * This keeps our memory usage low.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceElement {
    /**
     * The namespace of the element.
     * @var string $namespaceURI
     */
    public $namespaceURI;
    /**
     * The lower-cased name of the element.
     * @var string $localName
     */
    public $localName;
    /**
     * Attributes for the element, in array form
     * @var array $attribs
     */
    public $attribs;
    /**
     * Parent of this element, null if the element has no parent, or the
     * string "flat" if this element has already been flattened into its
     * parent.
     * @var BalanceElement|string|null $parent
     */
    public $parent;
    /**
     * An array of children of this element. Typically only the last
     * child will be an actual BalanceElement object; the rest will
     * be strings, representing either text nodes or flattened
     * BalanceElement objects.
     * @var array $children
     */
    public $children;
    /**
     * Set to the empty string by the constructor and not otherwise used
     * by this class. Declared explicitly so that constructing an element
     * does not create a dynamic property.
     * @var string $contents
     */
    public $contents;
    /**
     * A unique string identifier for Noah's Ark purposes, lazy initialized
     */
    private $noahKey;
    /**
     * The next active formatting element in the list, or null if this is the
     * end of the AFE list or if the element is not in the AFE list.
     */
    public $nextAFE;
    /**
     * The previous active formatting element in the list, or null if this is
     * the start of the list or if the element is not in the AFE list.
     */
    public $prevAFE;
    /**
     * The next element in the Noah's Ark species bucket.
     */
    public $nextNoah;

    /**
     * Make a new BalanceElement corresponding to the HTML DOM Element
     * with the given localname, namespace, and attributes.
     *
     * @param string $namespaceURI The namespace of the element.
     * @param string $localName The lowercased name of the tag.
     * @param array $attribs Attributes of the element
     */
    public function __construct( $namespaceURI, $localName, array $attribs ) {
        $this->localName = $localName;
        $this->namespaceURI = $namespaceURI;
        $this->attribs = $attribs;
        $this->contents = '';
        $this->parent = null;
        $this->children = [];
    }

    /**
     * Remove the given child from this element.
     * @param BalanceElement $elt
     */
    private function removeChild( BalanceElement $elt ) {
        // Keep the message static: interpolating $this here would run
        // __toString() (a full subtree serialization) on every call, even
        // when the precondition holds.
        Assert::precondition(
            $this->parent !== 'flat', "Can't removeChild after flattening."
        );
        Assert::parameter(
            $elt->parent === $this, '$elt', 'must have $this as a parent'
        );
        $idx = array_search( $elt, $this->children, true );
        Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
        $elt->parent = null;
        array_splice( $this->children, $idx, 1 );
    }

    /**
     * Find $a in the list of children and insert $b before it.
     * If $b is an element that currently has a parent, it is detached
     * from that parent first.
     * @param BalanceElement $a
     * @param BalanceElement|string $b
     */
    public function insertBefore( BalanceElement $a, $b ) {
        Assert::precondition(
            $this->parent !== 'flat', "Can't insertBefore after flattening."
        );
        $idx = array_search( $a, $this->children, true );
        Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
        if ( !is_string( $b ) ) {
            Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
            if ( $b === $a ) {
                // Inserting a node before itself leaves it where it is.
                return;
            }
            if ( $b->parent !== null ) {
                $wasSibling = $b->parent === $this;
                $b->parent->removeChild( $b );
                if ( $wasSibling ) {
                    // Detaching $b from our own child list may have shifted
                    // $a; look its position up again so that $b really
                    // ends up in front of it.
                    $idx = array_search( $a, $this->children, true );
                }
            }
            $b->parent = $this;
        }
        array_splice( $this->children, $idx, 0, [ $b ] );
    }

    /**
     * Append $elt to the end of the list of children.
     * If $elt is an element that currently has a parent, it is detached
     * from that parent first.
     * @param BalanceElement|string $elt
     */
    public function appendChild( $elt ) {
        Assert::precondition(
            $this->parent !== 'flat', "Can't appendChild after flattening."
        );
        if ( is_string( $elt ) ) {
            array_push( $this->children, $elt );
            return;
        }
        // Remove $elt from parent, if it had one.
        if ( $elt->parent !== null ) {
            $elt->parent->removeChild( $elt );
        }
        array_push( $this->children, $elt );
        $elt->parent = $this;
    }

    /**
     * Transfer all of the children of $elt to $this.
     * @param BalanceElement $elt
     */
    public function adoptChildren( BalanceElement $elt ) {
        Assert::precondition(
            $elt->parent !== 'flat', "Can't adoptChildren after flattening."
        );
        foreach ( $elt->children as $child ) {
            if ( !is_string( $child ) ) {
                // This is an optimization which avoids an O(n^2) set of
                // array_splice operations.
                $child->parent = null;
            }
            $this->appendChild( $child );
        }
        $elt->children = [];
    }

    /**
     * Flatten this node and all of its children into a string, as specified
     * by the HTML serialization specification, and replace this node
     * in its parent by that string.
     *
     * In tidy-compat mode a 'mw:p-wrap' element is renamed to 'p' and is
     * dropped (flattened to the empty string) when it contains only
     * whitespace; blank attribute-less 'tr' and 'li' elements receive
     * the 'mw-empty-elt' class.
     *
     * @param array $config Balancer configuration; see Balancer::__construct().
     * @return string The string which replaced this node in its parent.
     *
     * @see __toString()
     */
    public function flatten( array $config ) {
        Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
        Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
        $idx = array_search( $this, $this->parent->children, true );
        Assert::parameter(
            $idx !== false, '$this', 'must be a child of its parent'
        );
        $tidyCompat = $config['tidyCompat'];
        if ( $tidyCompat ) {
            $blank = true;
            foreach ( $this->children as $elt ) {
                if ( !is_string( $elt ) ) {
                    // Flattening replaces the child by its serialization
                    // in $this->children.
                    $elt = $elt->flatten( $config );
                }
                if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
                    $blank = false;
                }
            }
            if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
                $this->localName = 'p';
            } elseif ( $blank ) {
                // Add 'mw-empty-elt' class so elements can be hidden via CSS
                // for compatibility with legacy tidy.
                if ( !count( $this->attribs ) &&
                    ( $this->localName === 'tr' || $this->localName === 'li' )
                ) {
                    $this->attribs = [ 'class' => "mw-empty-elt" ];
                }
                // Only blank p-wrappers are dropped; other blank elements
                // are still serialized.
                $blank = false;
            }
            $flat = $blank ? '' : "{$this}";
        } else {
            $flat = "{$this}";
        }
        $this->parent->children[$idx] = $flat;
        $this->parent = 'flat'; // for assertion checking
        return $flat;
    }

    /**
     * Serialize this node and all of its children to a string, as specified
     * by the HTML serialization specification.
     *
     * @return string The serialization of the BalanceElement
     * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
     */
    public function __toString() {
        $encAttribs = '';
        foreach ( $this->attribs as $name => $value ) {
            $encValue = Sanitizer::encodeAttribute( $value );
            $encAttribs .= " $name=\"$encValue\"";
        }
        if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
            $out = "<{$this->localName}{$encAttribs}>";
            // Offset of the first byte following the start tag.
            $len = strlen( $out );
            // flatten children
            foreach ( $this->children as $elt ) {
                $out .= "{$elt}";
            }
            $out .= "</{$this->localName}>";
            if (
                $this->isA( BalanceSets::$extraLinefeedSet ) &&
                $out[$len] === "\n"
            ) {
                // Double the linefeed after pre/listing/textarea
                // according to the HTML5 fragment serialization algorithm.
                $out = substr( $out, 0, $len + 1 ) .
                    substr( $out, $len );
            }
        } else {
            $out = "<{$this->localName}{$encAttribs} />";
            Assert::invariant(
                count( $this->children ) === 0,
                "Empty elements shouldn't have children."
            );
        }
        return $out;
    }

    // Utility functions on BalanceElements.

    /**
     * Determine if $this represents a specific HTML tag, is a member of
     * a tag set, or is equal to another BalanceElement.
     *
     * @param BalanceElement|array|string $set The target BalanceElement,
     *   set (from the BalanceSets class), or string (HTML tag name).
     * @return bool
     */
    public function isA( $set ) {
        if ( $set instanceof BalanceElement ) {
            return $this === $set;
        } elseif ( is_array( $set ) ) {
            return isset( $set[$this->namespaceURI] ) &&
                isset( $set[$this->namespaceURI][$this->localName] );
        } else {
            // assume this is an HTML element name.
            return $this->isHtml() && $this->localName === $set;
        }
    }

    /**
     * Determine if this element is an HTML element with the specified name
     * @param string $tagName
     * @return bool
     */
    public function isHtmlNamed( $tagName ) {
        return $this->namespaceURI === BalanceSets::HTML_NAMESPACE
            && $this->localName === $tagName;
    }

    /**
     * Determine if $this represents an element in the HTML namespace.
     *
     * @return bool
     */
    public function isHtml() {
        return $this->namespaceURI === BalanceSets::HTML_NAMESPACE;
    }

    /**
     * Determine if $this represents a MathML text integration point,
     * as defined in the HTML5 specification.
     *
     * @return bool
     * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
     */
    public function isMathmlTextIntegrationPoint() {
        return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet );
    }

    /**
     * Determine if $this represents an HTML integration point,
     * as defined in the HTML5 specification.
     *
     * @return bool
     * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
     */
    public function isHtmlIntegrationPoint() {
        if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
            return true;
        }
        // A MathML annotation-xml element qualifies only when its encoding
        // attribute names an HTML media type (compared case-insensitively).
        if (
            $this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
            $this->localName === 'annotation-xml' &&
            isset( $this->attribs['encoding'] ) &&
            ( strcasecmp( $this->attribs['encoding'], 'text/html' ) === 0 ||
            strcasecmp( $this->attribs['encoding'], 'application/xhtml+xml' ) === 0 )
        ) {
            return true;
        }
        return false;
    }

    /**
     * Get a string key for the Noah's Ark algorithm. The key is built
     * from the namespace, the tag name and the key-sorted attributes,
     * and is cached after the first call.
     * @return string
     */
    public function getNoahKey() {
        if ( $this->noahKey === null ) {
            $attribs = $this->attribs;
            // Sort so that attribute order does not affect the key.
            ksort( $attribs );
            $this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $attribs ] );
        }
        return $this->noahKey;
    }
}
/**
* The "stack of open elements" as defined in the HTML5 tree builder
* spec. This contains methods to ensure that content (start tags, text)
* are inserted at the correct place in the output string, and to
* flatten BalanceElements as they are closed to avoid holding onto
* a complete DOM tree for the document in memory.
*
* The stack defines a PHP iterator to traverse it in "reverse order",
* that is, the most-recently-added element is visited first in a
* foreach loop.
*
* @ingroup Parser
* @since 1.27
* @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
*/
class BalanceStack implements IteratorAggregate {
/**
* Backing storage for the stack.
* @var array $elements
*/
private $elements = [];
/**
* Foster parent mode determines how nodes are inserted into the
* stack.
* @var bool $fosterParentMode
* @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
*/
public $fosterParentMode = false;
/**
* Configuration options governing flattening.
* @var array $config
* @see Balancer::__construct()
*/
private $config;
/**
* Reference to the current element
*/
public $currentNode;
/**
 * Create a new BalanceStack whose only entry is a BalanceElement
 * representing the root <html> node.
 * @param array $config Balancer configuration; see Balancer::__construct().
 */
public function __construct( array $config ) {
    $this->config = $config;
    // The root <html> element always sits at the bottom of the stack.
    $root = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] );
    $this->elements[] = $root;
    $this->currentNode = $root;
}
/**
 * Return the output of the tree builder as a string: the concatenated
 * children of the root <html> node, without the <html> wrapper itself.
 * Children that are still BalanceElements are flattened on the way.
 * @return string
 */
public function getOutput() {
    $pieces = [];
    foreach ( $this->elements[0]->children as $child ) {
        if ( is_string( $child ) ) {
            $pieces[] = $child;
        } else {
            $pieces[] = $child->flatten( $this->config );
        }
    }
    return implode( '', $pieces );
}
/**
 * Insert a comment at the appropriate place for inserting a node.
 * The comment is handed to insertText() as '<!--' . $value . '-->'
 * with the comment flag set, which exempts it from tidy p-wrapping there.
 * @param string $value Content of the comment.
 * @return mixed Whatever insertText() returns.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment
 */
public function insertComment( $value ) {
    $comment = '<!--' . $value . '-->';
    return $this->insertText( $comment, true );
}
/**
 * Insert text at the appropriate place for inserting a node.
 *
 * In foster parent mode, with a table/section/row element as the current
 * node, the text is foster-parented. Otherwise, in tidy-compat mode,
 * non-comment text directly inside an element from the p-wrap set first
 * gets a 'mw:p-wrap' wrapper element, and the text is inserted into that.
 *
 * @param string $value
 * @param bool $isComment True if $value is a serialized comment, which
 *   is never p-wrapped by this method.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertText( $value, $isComment = false ) {
    if (
        $this->fosterParentMode &&
        $this->currentNode->isA( BalanceSets::$tableSectionRowSet )
    ) {
        $this->fosterParent( $value );
    } elseif (
        $this->config['tidyCompat'] && !$isComment &&
        $this->currentNode->isA( BalanceSets::$tidyPWrapSet )
    ) {
        // The wrapper becomes the current node, so the recursive call
        // falls through to one of the other branches.
        $this->insertHTMLElement( 'mw:p-wrap', [] );
        return $this->insertText( $value );
    } else {
        $this->currentNode->appendChild( $value );
    }
}
/**
 * Create a BalanceElement in the given namespace, insert it at the
 * appropriate place, and push it on to the open elements stack.
 * @param string $namespaceURI The element namespace
 * @param string $tag The tag name
 * @param array $attribs Attributes, in the array form taken by the
 *   BalanceElement constructor.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
 */
public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
    $element = new BalanceElement( $namespaceURI, $tag, $attribs );
    return $this->insertElement( $element );
}
/**
 * Create an element in the HTML namespace, insert it at the appropriate
 * place, and push it on to the open elements stack.
 * @param string $tag The tag name
 * @param array $attribs Attributes, in the array form taken by the
 *   BalanceElement constructor.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
 */
public function insertHTMLElement( $tag, $attribs ) {
    $namespace = BalanceSets::HTML_NAMESPACE;
    return $this->insertForeignElement( $namespace, $tag, $attribs );
}
/**
 * Insert an element at the appropriate place and push it on to the
 * open elements stack.
 *
 * If the current node is a tidy-compat 'mw:p-wrap' wrapper and $elt is
 * not in the tidy inline set, the wrapper is popped first. In foster
 * parent mode the element actually pushed may be an existing p-wrapper
 * returned by fosterParent() rather than $elt itself.
 *
 * @param BalanceElement $elt
 * @return BalanceElement The element that was pushed on to the stack.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertElement( BalanceElement $elt ) {
    if (
        $this->currentNode->isHtmlNamed( 'mw:p-wrap' ) &&
        !$elt->isA( BalanceSets::$tidyInlineSet )
    ) {
        // Tidy compatibility.
        $this->pop();
    }
    if (
        $this->fosterParentMode &&
        $this->currentNode->isA( BalanceSets::$tableSectionRowSet )
    ) {
        $elt = $this->fosterParent( $elt );
    } else {
        $this->currentNode->appendChild( $elt );
    }
    // Name the element by tag only: interpolating $elt itself would
    // serialize it (and its subtree) for both messages on every
    // insertion, even though the invariants normally hold.
    Assert::invariant(
        $elt->parent !== null,
        "<{$elt->localName}> must be in tree"
    );
    Assert::invariant(
        $elt->parent !== 'flat',
        "<{$elt->localName}> must not have been previously flattened"
    );
    array_push( $this->elements, $elt );
    $this->currentNode = $elt;
    return $elt;
}
/**
 * Determine if the stack has $tag in scope, using
 * BalanceSets::$inScopeSet as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
 */
public function inScope( $tag ) {
    $boundary = BalanceSets::$inScopeSet;
    return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Determine if the stack has $tag in button scope, using
 * BalanceSets::inButtonScopeSet() as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
 */
public function inButtonScope( $tag ) {
    $boundary = BalanceSets::inButtonScopeSet();
    return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Determine if the stack has $tag in list item scope, using
 * BalanceSets::inListItemScopeSet() as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
 */
public function inListItemScope( $tag ) {
    $boundary = BalanceSets::inListItemScopeSet();
    return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Determine if the stack has $tag in table scope, using
 * BalanceSets::$inTableScopeSet as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
 */
public function inTableScope( $tag ) {
    $boundary = BalanceSets::$inTableScopeSet;
    return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Determine if the stack has $tag in select scope.
 *
 * The stack is walked from the current node downwards. Unlike the
 * other scopes, the boundary is inverted: the walk continues only
 * through elements in BalanceSets::$inInvertedSelectScopeSet and any
 * other non-matching element ends it. That is why inSpecificScope()
 * cannot be reused here.
 *
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope
 */
public function inSelectScope( $tag ) {
    $transparent = BalanceSets::$inInvertedSelectScopeSet;
    foreach ( $this as $node ) {
        if ( $node->isA( $tag ) ) {
            return true;
        }
        if ( $node->isA( $transparent ) ) {
            continue;
        }
        return false;
    }
    return false;
}
/**
 * Determine if the stack has $tag in a specific scope, $set.
 *
 * The stack is walked from the current node downwards; the first
 * element matching $tag yields true, while reaching an element that
 * matches $set first (or running out of elements) yields false.
 *
 * @param BalanceElement|array|string $tag
 * @param BalanceElement|array|string $set
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
 */
public function inSpecificScope( $tag, $set ) {
    $found = false;
    foreach ( $this as $node ) {
        // The target is tested before the boundary, so an element that
        // is both target and boundary counts as being in scope.
        if ( $node->isA( $tag ) ) {
            $found = true;
            break;
        }
        if ( $node->isA( $set ) ) {
            break;
        }
    }
    return $found;
}
/**
 * Generate implied end tags: pop the current node for as long as it
 * belongs to the implied-end-tag set and is not the excluded tag.
 * @param string|null $butnot Name of an HTML tag at which to stop, or
 *   null for no exclusion.
 * @param bool $thorough True if we should generate end tags thoroughly,
 *   i.e. use BalanceSets::$thoroughImpliedEndTagsSet.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
 */
public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
    $endTagSet = BalanceSets::$impliedEndTagsSet;
    if ( $thorough ) {
        $endTagSet = BalanceSets::$thoroughImpliedEndTagsSet;
    }
    while ( $this->currentNode ) {
        $excluded = $butnot !== null &&
            $this->currentNode->isHtmlNamed( $butnot );
        if ( $excluded || !$this->currentNode->isA( $endTagSet ) ) {
            return;
        }
        $this->pop();
    }
}
/**
 * Return the adjusted current node: the fragment context when one is
 * given and the stack holds exactly one element, otherwise the
 * current node.
 * @param mixed $fragmentContext The fragment context; any falsy value
 *   means there is none.
 * @return mixed
 */
public function adjustedCurrentNode( $fragmentContext ) {
    if ( $fragmentContext && count( $this->elements ) === 1 ) {
        return $fragmentContext;
    }
    return $this->currentNode;
}
/**
 * Return an iterator over this stack which visits the current node
 * first, and the root node last. The iterator is a ReverseArrayIterator
 * constructed over the stack's backing array.
 * @return \Iterator
 */
public function getIterator() {
    $iterator = new ReverseArrayIterator( $this->elements );
    return $iterator;
}
/**
 * Return the BalanceElement stored at stack position $idx.
 * Position 0 is the root element.
 * @param int $idx
 * @return BalanceElement
 */
public function node( $idx ) {
    $element = $this->elements[ $idx ];
    return $element;
}
/**
 * Replace the element at position $idx in the BalanceStack with $elt.
 * Neither the replaced element nor $elt may already be flattened. If
 * $idx is the top of the stack, $elt also becomes the current node.
 * @param int $idx
 * @param BalanceElement $elt
 */
public function replaceAt( $idx, BalanceElement $elt ) {
    $replaced = $this->elements[$idx];
    Assert::precondition(
        $replaced->parent !== 'flat',
        'Replaced element should not have already been flattened.'
    );
    Assert::precondition(
        $elt->parent !== 'flat',
        'New element should not have already been flattened.'
    );
    $top = count( $this->elements ) - 1;
    $this->elements[$idx] = $elt;
    if ( $idx === $top ) {
        $this->currentNode = $elt;
    }
}
/**
 * Return the position in the BalanceStack of the topmost element
 * matching the given BalanceElement, set, or HTML tag name string.
 * @param BalanceElement|array|string $tag
 * @return int The position, or -1 if no element matches.
 */
public function indexOf( $tag ) {
    // Walk from the top of the stack (current node) towards the root.
    $i = count( $this->elements );
    while ( $i-- > 0 ) {
        if ( $this->elements[$i]->isA( $tag ) ) {
            return $i;
        }
    }
    return -1;
}
/**
 * Return how many elements are currently on the BalanceStack.
 * @return int
 */
public function length() {
    $size = count( $this->elements );
    return $size;
}
/**
 * Remove the current node from the BalanceStack and make the element
 * below it (or null, if none is left) the current node. The removed
 * element is flattened unless it is a 'mw:p-wrap' wrapper.
 */
public function pop() {
    $popped = array_pop( $this->elements );
    $remaining = count( $this->elements );
    $this->currentNode = $remaining > 0 ?
        $this->elements[ $remaining - 1 ] : null;
    if ( $popped->isHtmlNamed( 'mw:p-wrap' ) ) {
        // p-wrappers are left unflattened here.
        return;
    }
    $popped->flatten( $this->config );
}
/**
 * Remove all nodes up to and including position $idx from the
 * BalanceStack, via pop(), so that $idx elements remain afterwards.
 * @param int $idx
 */
public function popTo( $idx ) {
    $toPop = count( $this->elements ) - $idx;
    while ( $toPop-- > 0 ) {
        $this->pop();
    }
}
/**
 * Pop elements off the stack up to and including the first
 * element with the specified HTML tagname (or matching the given
 * set or element). If nothing matches, the loop only ends once the
 * current node is empty.
 * @param BalanceElement|array|string $tag
 */
public function popTag( $tag ) {
    while ( $this->currentNode ) {
        // Test before popping: pop() changes the current node.
        $matched = $this->currentNode->isA( $tag );
        $this->pop();
        if ( $matched ) {
            break;
        }
    }
}
/**
 * Pop elements off the stack until the current node is in the
 * specified set; that element itself is *not* popped.
 * @param BalanceElement|array|string $set
 */
public function clearToContext( $set ) {
    // Never pop the bottom (<html>) element: at most count - 1 pops.
    $poppable = count( $this->elements ) - 1;
    while ( $poppable > 0 && !$this->currentNode->isA( $set ) ) {
        $this->pop();
        $poppable--;
    }
}
/**
 * Remove the given $elt from the BalanceStack, optionally
 * flattening it in the process. If $elt was the top of the stack, the
 * element below it (or null, if the stack is now empty) becomes the
 * current node.
 * @param BalanceElement $elt The element to remove.
 * @param bool $flatten Whether to flatten the removed element.
 */
public function removeElement( BalanceElement $elt, $flatten = true ) {
    Assert::parameter(
        $elt->parent !== 'flat',
        '$elt',
        '$elt should not already have been flattened.'
    );
    // Guard the dereference: a parentless element has no grandparent
    // to inspect.
    Assert::parameter(
        $elt->parent === null || $elt->parent->parent !== 'flat',
        '$elt',
        'The parent of $elt should not already have been flattened.'
    );
    $idx = array_search( $elt, $this->elements, true );
    Assert::parameter( $idx !== false, '$elt', 'must be in stack' );
    array_splice( $this->elements, $idx, 1 );
    if ( $idx === count( $this->elements ) ) {
        // $elt was the current node. Mirror pop(): fall back to null
        // when nothing is left instead of reading offset -1.
        $this->currentNode = $idx > 0 ? $this->elements[$idx - 1] : null;
    }
    if ( $flatten ) {
        // serialize $elt into its parent
        // otherwise, it will eventually serialize when the parent
        // is serialized, we just hold onto the memory for its
        // tree of objects a little longer.
        $elt->flatten( $this->config );
    }
    Assert::postcondition(
        array_search( $elt, $this->elements, true ) === false,
        '$elt should no longer be in open elements stack'
    );
}
/**
 * Find $a in the BalanceStack and insert $b after it. If $a is the top
 * of the stack, $b becomes the new current node.
 * @param BalanceElement $a An element that must already be on the stack.
 * @param BalanceElement $b The element to insert.
 */
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
    $idx = $this->indexOf( $a );
    // indexOf() reports "not found" as -1, never false. Without this
    // check a missing $a would silently splice $b in at the very bottom
    // of the stack.
    Assert::parameter( $idx >= 0, '$a', 'must be in stack' );
    if ( $idx === count( $this->elements ) - 1 ) {
        array_push( $this->elements, $b );
        $this->currentNode = $b;
    } else {
        array_splice( $this->elements, $idx + 1, 0, [ $b ] );
    }
}
// Fostering and adoption.
/**
 * Foster parent the given $elt in the stack of open elements.
 *
 * The foster parent is the topmost 'template' on the stack if it sits
 * above the topmost 'table'; otherwise the parent of the topmost
 * 'table' (with $elt placed just before that table); otherwise the
 * root element. In tidy-compat mode, fostered text may be routed into
 * a 'mw:p-wrap' wrapper, and a fostered 'mw:p-wrap' element may be
 * replaced by an existing wrapper that directly precedes the
 * insertion point.
 *
 * @param BalanceElement|string $elt
 * @return BalanceElement|string $elt, or the re-used p-wrapper.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
 */
private function fosterParent( $elt ) {
    $tableIdx = $this->indexOf( 'table' );
    $templateIdx = $this->indexOf( 'template' );
    $refNode = null;
    // indexOf() yields -1 for "absent", so a present template is above
    // the table exactly when its index is the greater one.
    if ( $templateIdx >= 0 && $templateIdx > $tableIdx ) {
        $fosterParent = $this->elements[$templateIdx];
    } elseif ( $tableIdx >= 0 ) {
        $refNode = $this->elements[$tableIdx];
        $fosterParent = $refNode->parent;
        // Assume all tables have parents, since we're not running scripts!
        Assert::invariant(
            $fosterParent !== null, "All tables should have parents"
        );
    } else {
        $fosterParent = $this->elements[0]; // the `html` element.
    }

    if ( $this->config['tidyCompat'] ) {
        if ( is_string( $elt ) ) {
            // We're fostering text: do we need a p-wrapper?
            if ( $fosterParent->isA( BalanceSets::$tidyPWrapSet ) ) {
                $this->insertHTMLElement( 'mw:p-wrap', [] );
                $this->insertText( $elt );
                return $elt;
            }
        } elseif ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
            // We're fostering a p-wrapper; can we merge it with the node
            // just in front of the insertion point?
            if ( $refNode ) {
                $insertAt = array_search( $refNode, $fosterParent->children, true );
            } else {
                $insertAt = count( $fosterParent->children );
            }
            $prevSibling = '';
            if ( $insertAt > 0 ) {
                $prevSibling = $fosterParent->children[$insertAt - 1];
            }
            if (
                $prevSibling instanceof BalanceElement &&
                $prevSibling->isHtmlNamed( 'mw:p-wrap' )
            ) {
                return $prevSibling; // Re-use existing p-wrapper.
            }
        }
    }

    if ( $refNode !== null ) {
        $fosterParent->insertBefore( $refNode, $elt );
    } else {
        $fosterParent->appendChild( $elt );
    }
    return $elt;
}
/**
* Run the "adoption agency algorithm" (AAA) for the given subject
* tag name.
* @param string $tag The subject tag name.
* @param BalanceActiveFormattingElements $afe The current
* active formatting elements list.
* @return true if the adoption agency algorithm "did something", false
* if more processing is required by the caller.
* @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
*/
public function adoptionAgency( $tag, $afe ) {
// Overview: handles an end tag for a formatting element that may be
// mis-nested. Each pass of the outer loop finds the formatting element
// for $tag in $afe, finds the first "special" element below it on this
// stack (the furthest block), re-creates the formatting elements in
// between, and re-parents the furthest block. The comments below quote
// the corresponding steps of the specification.
// If the current node is an HTML element whose tag name is subject,
// and the current node is not in the list of active formatting
// elements, then pop the current node off the stack of open
// elements and abort these steps.
if (
$this->currentNode->isHtmlNamed( $tag ) &&
!$afe->isInList( $this->currentNode )
) {
$this->pop();
return true; // no more handling required
}
// Outer loop: If outer loop counter is greater than or
// equal to eight, then abort these steps.
for ( $outer = 0; $outer < 8; $outer++ ) {
// Let the formatting element be the last element in the list
// of active formatting elements that: is between the end of
// the list and the last scope marker in the list, if any, or
// the start of the list otherwise, and has the same tag name
// as the token.
$fmtElt = $afe->findElementByTag( $tag );
// If there is no such node, then abort these steps and instead
// act as described in the "any other end tag" entry below.
if ( !$fmtElt ) {
return false; // false means handle by the default case
}
// Otherwise, if there is such a node, but that node is not in
// the stack of open elements, then this is a parse error;
// remove the element from the list, and abort these steps.
$index = $this->indexOf( $fmtElt );
if ( $index < 0 ) {
$afe->remove( $fmtElt );
return true; // true means no more handling required
}
// Otherwise, if there is such a node, and that node is also in
// the stack of open elements, but the element is not in scope,
// then this is a parse error; ignore the token, and abort
// these steps.
if ( !$this->inScope( $fmtElt ) ) {
return true;
}
// Let the furthest block be the topmost node in the stack of
// open elements that is lower in the stack than the formatting
// element, and is an element in the special category. There
// might not be one.
// (Here "lower in the stack" means a larger index: the scan walks
// upward from $index + 1 and stops at the first match.)
$furthestBlock = null;
$furthestBlockIndex = -1;
$stackLength = $this->length();
for ( $i = $index+1; $i < $stackLength; $i++ ) {
if ( $this->node( $i )->isA( BalanceSets::$specialSet ) ) {
$furthestBlock = $this->node( $i );
$furthestBlockIndex = $i;
break;
}
}
// If there is no furthest block, then the UA must skip the
// subsequent steps and instead just pop all the nodes from the
// bottom of the stack of open elements, from the current node
// up to and including the formatting element, and remove the
// formatting element from the list of active formatting
// elements.
if ( !$furthestBlock ) {
$this->popTag( $fmtElt );
$afe->remove( $fmtElt );
return true;
}
// Let the common ancestor be the element immediately above
// the formatting element in the stack of open elements.
$ancestor = $this->node( $index-1 );
// Let a bookmark note the position of the formatting
// element in the list of active formatting elements
// relative to the elements on either side of it in the
// list.
// The bookmark is a placeholder BalanceElement whose namespace and
// local name are both the literal '[bookmark]'; it is replaced by the
// real element at the end of this iteration.
$BOOKMARK = new BalanceElement( '[bookmark]', '[bookmark]', [] );
$afe->insertAfter( $fmtElt, $BOOKMARK );
// Let node and last node be the furthest block.
$node = $furthestBlock;
$lastNode = $furthestBlock;
$nodeIndex = $furthestBlockIndex;
$isAFE = false;
// Inner loop
// (No loop condition: the only exit is the `break` taken when the walk
// reaches the formatting element.)
for ( $inner = 1; true; $inner++ ) {
// Let node be the element immediately above node in
// the stack of open elements, or if node is no longer
// in the stack of open elements (e.g. because it got
// removed by this algorithm), the element that was
// immediately above node in the stack of open elements
// before node was removed.
$node = $this->node( --$nodeIndex );
// If node is the formatting element, then go
// to the next step in the overall algorithm.
if ( $node === $fmtElt ) break;
// If the inner loop counter is greater than three and node
// is in the list of active formatting elements, then remove
// node from the list of active formatting elements.
$isAFE = $afe->isInList( $node );
if ( $inner > 3 && $isAFE ) {
$afe->remove( $node );
$isAFE = false;
}
// If node is not in the list of active formatting
// elements, then remove node from the stack of open
// elements and then go back to the step labeled inner
// loop.
if ( !$isAFE ) {
// Don't flatten here, since we're about to relocate
// parts of this $node.
$this->removeElement( $node, false );
continue;
}
// Create an element for the token for which the
// element node was created with common ancestor as
// the intended parent, replace the entry for node
// in the list of active formatting elements with an
// entry for the new element, replace the entry for
// node in the stack of open elements with an entry for
// the new element, and let node be the new element.
$newElt = new BalanceElement(
$node->namespaceURI, $node->localName, $node->attribs );
$afe->replace( $node, $newElt );
$this->replaceAt( $nodeIndex, $newElt );
$node = $newElt;
// If last node is the furthest block, then move the
// aforementioned bookmark to be immediately after the
// new node in the list of active formatting elements.
if ( $lastNode === $furthestBlock ) {
$afe->remove( $BOOKMARK );
$afe->insertAfter( $newElt, $BOOKMARK );
}
// Insert last node into node, first removing it from
// its previous parent node if any.
$node->appendChild( $lastNode );
// Let last node be node.
$lastNode = $node;
}
// If the common ancestor node is a table, tbody, tfoot,
// thead, or tr element, then, foster parent whatever last
// node ended up being in the previous step, first removing
// it from its previous parent node if any.
// (Fostering is additionally gated on $this->fosterParentMode.)
if (
$this->fosterParentMode &&
$ancestor->isA( BalanceSets::$tableSectionRowSet )
) {
$this->fosterParent( $lastNode );
} else {
// Otherwise, append whatever last node ended up being in
// the previous step to the common ancestor node, first
// removing it from its previous parent node if any.
$ancestor->appendChild( $lastNode );
}
// Create an element for the token for which the
// formatting element was created, with furthest block
// as the intended parent.
$newElt2 = new BalanceElement(
$fmtElt->namespaceURI, $fmtElt->localName, $fmtElt->attribs );
// Take all of the child nodes of the furthest block and
// append them to the element created in the last step.
$newElt2->adoptChildren( $furthestBlock );
// Append that new element to the furthest block.
$furthestBlock->appendChild( $newElt2 );
// Remove the formatting element from the list of active
// formatting elements, and insert the new element into the
// list of active formatting elements at the position of
// the aforementioned bookmark.
$afe->remove( $fmtElt );
$afe->replace( $BOOKMARK, $newElt2 );
// Remove the formatting element from the stack of open
// elements, and insert the new element into the stack of
// open elements immediately below the position of the
// furthest block in that stack.
// (removeElement() is called with its default $flatten = true here.)
$this->removeElement( $fmtElt );
$this->insertAfter( $furthestBlock, $newElt2 );
}
// All eight outer iterations were used up; report the tag as handled.
return true;
}
/**
* Return the contents of the open elements stack as a string for
* debugging.
* @return string
*/
public function __toString() {
	// Collect the local name of every open element, in stack order.
	$names = [];
	foreach ( $this->elements as $elt ) {
		$names[] = $elt->localName;
	}
	// The separator must be the first argument: the legacy
	// implode( $array, $separator ) order was deprecated in PHP 7.4 and
	// throws a TypeError from PHP 8.0 on.
	return implode( ' ', $names );
}
}
/**
* A pseudo-element used as a marker in the list of active formatting elements
*
* @ingroup Parser
* @since 1.27
*/
class BalanceMarker {
	/**
	 * Entry that follows this marker in the list of active formatting
	 * elements, or null if the marker is the last entry.
	 */
	public $nextAFE = null;

	/**
	 * Entry that precedes this marker in the list of active formatting
	 * elements, or null if the marker is the first entry.
	 */
	public $prevAFE = null;
}
/**
* The list of active formatting elements, which is used to handle
* mis-nested formatting element tags in the HTML5 tree builder
* specification.
*
* @ingroup Parser
* @since 1.27
* @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
*/
class BalanceActiveFormattingElements {
	/** The last (most recent) element in the list */
	private $tail;

	/** The first (least recent) element in the list */
	private $head;

	/**
	 * An array of arrays representing the population of elements in each bucket
	 * according to the Noah's Ark clause. The outer array is stack-like, with each
	 * integer-indexed element representing a segment of the list, bounded by
	 * markers. The first element represents the segment of the list before the
	 * first marker.
	 *
	 * The inner arrays are indexed by "Noah key", which is a string which uniquely
	 * identifies each bucket according to the rules in the spec. The value in
	 * the inner array is the first (least recently inserted) element in the bucket,
	 * and subsequent members of the bucket can be found by iterating through the
	 * singly-linked list via $node->nextNoah.
	 *
	 * This is optimised for the most common case of inserting into a bucket
	 * with zero members, and deleting a bucket containing one member. In the
	 * worst case, iteration through the list is still O(1) in the document
	 * size, since each bucket can have at most 3 members.
	 */
	private $noahTableStack = [ [] ];

	/**
	 * Unlink every entry from its neighbours so that the objects do not
	 * keep each other alive through the list pointers.
	 */
	public function __destruct() {
		for ( $node = $this->head; $node; $node = $next ) {
			$next = $node->nextAFE;
			$node->prevAFE = $node->nextAFE = null;
			// Markers only declare prevAFE/nextAFE; writing nextNoah onto
			// one would create an undeclared (dynamic) property.
			if ( !( $node instanceof BalanceMarker ) ) {
				$node->nextNoah = null;
			}
		}
		$this->head = $this->tail = $this->noahTableStack = null;
	}

	/**
	 * Append a marker to the list and open a fresh Noah table for the
	 * segment that follows it.
	 */
	public function insertMarker() {
		$elt = new BalanceMarker;
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
		$this->noahTableStack[] = [];
	}

	/**
	 * Follow the steps required when the spec requires us to "push onto the
	 * list of active formatting elements".
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is already in the list
	 */
	public function push( BalanceElement $elt ) {
		// Must not be in the list already
		if ( $elt->prevAFE !== null || $this->head === $elt ) {
			throw new ParameterAssertionException( '$elt',
				'Cannot insert a node into the AFE list twice' );
		}

		// "Noah's Ark clause" -- if there are already three copies of
		// this element in the current segment (i.e. since the last
		// marker), drop the oldest of them before adding the new one.
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			// Walk the bucket to find its last member and its size.
			$count = 1;
			$head = $tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
				$count++;
			}
			if ( $count >= 3 ) {
				// remove() also advances the bucket head past $head.
				$this->remove( $head );
			}
			$tail->nextNoah = $elt;
		}

		// Add to the main AFE list
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
	}

	/**
	 * Follow the steps required when the spec asks us to "clear the list of
	 * active formatting elements up to the last marker".
	 */
	public function clearToMarker() {
		// Iterate back through the list starting from the tail
		$tail = $this->tail;
		while ( $tail && !( $tail instanceof BalanceMarker ) ) {
			// Unlink the element
			$prev = $tail->prevAFE;
			$tail->prevAFE = null;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail->nextNoah = null;
			$tail = $prev;
		}

		// If we finished on a marker, unlink it and pop it off the Noah table stack
		if ( $tail ) {
			$prev = $tail->prevAFE;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail = $prev;
			array_pop( $this->noahTableStack );
		} else {
			// No marker: wipe the top-level Noah table (which is the only one)
			$this->noahTableStack[0] = [];
		}

		// If we removed all the elements, clear the head pointer
		if ( !$tail ) {
			$this->head = null;
		}
		$this->tail = $tail;
	}

	/**
	 * Find and return the last element with the specified tag between the
	 * end of the list and the last marker on the list.
	 * Used when parsing <a> "in body mode".
	 * @param string $tag Local name to look for
	 * @return BalanceElement|null The matching entry, or null if none
	 */
	public function findElementByTag( $tag ) {
		$elt = $this->tail;
		while ( $elt && !( $elt instanceof BalanceMarker ) ) {
			if ( $elt->localName === $tag ) {
				return $elt;
			}
			$elt = $elt->prevAFE;
		}
		return null;
	}

	/**
	 * Determine whether an element is in the list of formatting elements.
	 * An entry is in the list if it is the head or has a predecessor.
	 * @param BalanceElement $elt
	 * @return bool
	 */
	public function isInList( BalanceElement $elt ) {
		return $this->head === $elt || $elt->prevAFE;
	}

	/**
	 * Find the element $elt in the list and remove it.
	 * Used when parsing <a> in body mode.
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is not in the list
	 */
	public function remove( BalanceElement $elt ) {
		if ( $this->head !== $elt && !$elt->prevAFE ) {
			throw new ParameterAssertionException( '$elt',
				"Attempted to remove an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $elt ) {
			$this->head = $elt->nextAFE;
		}
		if ( $this->tail === $elt ) {
			$this->tail = $elt->prevAFE;
		}
		// Update previous element
		if ( $elt->prevAFE ) {
			$elt->prevAFE->nextAFE = $elt->nextAFE;
		}
		// Update next element
		if ( $elt->nextAFE ) {
			$elt->nextAFE->prevAFE = $elt->prevAFE;
		}
		// Clear pointers so that isInList() etc. will work
		$elt->prevAFE = $elt->nextAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $elt );
	}

	/**
	 * Append $elt to the end of its bucket in the Noah table of the
	 * current (most recent) segment, creating the bucket if needed.
	 * Unlike push(), no three-member limit is enforced here.
	 * @param BalanceElement $elt
	 */
	private function addToNoahList( BalanceElement $elt ) {
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
			}
			$tail->nextNoah = $elt;
		}
	}

	/**
	 * Unlink $elt from its bucket in the Noah table of the current (most
	 * recent) segment. Does nothing if that table has no bucket for the
	 * element's key or the bucket does not contain $elt.
	 * @param BalanceElement $elt
	 */
	private function removeFromNoahList( BalanceElement $elt ) {
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		$key = $elt->getNoahKey();
		if ( !isset( $table[$key] ) ) {
			// No bucket for this key in the current segment. Previously
			// this read an undefined index and then dereferenced null.
			return;
		}
		$noahElt = $table[$key];
		if ( $noahElt === $elt ) {
			// $elt heads the bucket: promote its successor, or drop the
			// bucket when it was the only member.
			if ( $noahElt->nextNoah ) {
				$table[$key] = $noahElt->nextNoah;
				$noahElt->nextNoah = null;
			} else {
				unset( $table[$key] );
			}
		} else {
			do {
				$prevNoahElt = $noahElt;
				$noahElt = $prevNoahElt->nextNoah;
				if ( $noahElt === $elt ) {
					// Found it, unlink
					$prevNoahElt->nextNoah = $elt->nextNoah;
					$elt->nextNoah = null;
					break;
				}
			} while ( $noahElt );
		}
	}

	/**
	 * Find element $a in the list and replace it with element $b
	 * @param BalanceElement $a Entry currently in the list
	 * @param BalanceElement $b Entry that takes over $a's position
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function replace( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to replace an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $a ) {
			$this->head = $b;
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		// Update previous element
		if ( $a->prevAFE ) {
			$a->prevAFE->nextAFE = $b;
		}
		// Update next element
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->prevAFE = $a->prevAFE;
		$b->nextAFE = $a->nextAFE;
		$a->nextAFE = $a->prevAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $a );
		$this->addToNoahList( $b );
	}

	/**
	 * Find $a in the list and insert $b after it.
	 * @param BalanceElement $a Entry currently in the list
	 * @param BalanceElement $b Entry to link in directly after $a
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function insertAfter( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to insert after an element which is not in the AFE list" );
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->nextAFE = $a->nextAFE;
		$b->prevAFE = $a;
		$a->nextAFE = $b;
		$this->addToNoahList( $b );
	}

	// @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
	/**
	 * Reconstruct the active formatting elements.
	 * @param BalanceStack $stack The open elements stack
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
	 */
	// @codingStandardsIgnoreEnd
	public function reconstruct( $stack ) {
		$entry = $this->tail;
		// If there are no entries in the list of active formatting elements,
		// then there is nothing to reconstruct
		if ( !$entry ) {
			return;
		}
		// If the last is a marker, do nothing.
		if ( $entry instanceof BalanceMarker ) {
			return;
		}
		// Or if it is an open element, do nothing.
		if ( $stack->indexOf( $entry ) >= 0 ) {
			return;
		}

		// Loop backward through the list until we find a marker or an
		// open element
		$foundIt = false;
		while ( $entry->prevAFE ) {
			$entry = $entry->prevAFE;
			if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
				$foundIt = true;
				break;
			}
		}

		// Now loop forward, starting from the element after the current one (or
		// the first element if we didn't find a marker or open element),
		// recreating formatting elements and pushing them back onto the list
		// of open elements.
		if ( $foundIt ) {
			$entry = $entry->nextAFE;
		}
		do {
			$newElement = $stack->insertHTMLElement(
				$entry->localName,
				$entry->attribs );
			// The freshly inserted element takes over the old entry's slot.
			$this->replace( $entry, $newElement );
			$entry = $newElement->nextAFE;
		} while ( $entry );
	}

	/**
	 * Get a string representation of the AFE list, for debugging
	 * @return string One line per entry, plus consistency warnings
	 */
	public function __toString() {
		$prev = null;
		$s = '';
		for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) {
			if ( $node instanceof BalanceMarker ) {
				$s .= "MARKER\n";
				continue;
			}
			// Short per-object tag so distinct elements with the same name
			// can be told apart in the dump.
			$s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
			if ( $node->nextNoah ) {
				$s .= " (noah sibling: {$node->nextNoah->localName}#" .
					substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
					')';
			}
			if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
				$s .= " (reverse link is wrong!)";
			}
			$s .= "\n";
		}
		if ( $prev !== $this->tail ) {
			$s .= "(tail pointer is wrong!)\n";
		}
		return $s;
	}
}
/**
* An implementation of the tree building portion of the HTML5 parsing
* spec.
*
* This is used to balance and tidy output so that the result can
* always be cleanly serialized/deserialized by an HTML5 parser. It
* does *not* guarantee "conforming" output -- the HTML5 spec contains
* a number of constraints which are not enforced by the HTML5 parsing
* process. But the result will be free of gross errors: misnested or
* unclosed tags, for example, and will be unchanged by spec-compliant
* parsing followed by serialization.
*
* The tree building stage is structured as a state machine.
* When comparing the implementation to
* https://www.w3.org/TR/html5/syntax.html#tree-construction
* note that each state is implemented as a function with a
* name ending in `Mode` (because the HTML spec refers to them
* as insertion modes). The current insertion mode is held by
* the $parseMode property.
*
* The following simplifications have been made:
* - We handle body content only (ie, we start `in body`.)
* - The document is never in "quirks mode".
* - All occurrences of < and > have been entity escaped, so we
* can parse tags by simply splitting on those two characters.
* (This also simplifies the handling of < inside <textarea>.)
* The character < must not appear inside comments.
* Similarly, all attributes have been "cleaned" and are double-quoted
* and escaped.
* - All null characters are assumed to have been removed.
* - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
* <frame>, <plaintext>, <isindex>, <xmp>, <iframe>,
* <noembed>, <noscript>, <script>, <title>. As a result,
* further simplifications can be made:
* - `frameset-ok` is not tracked.
* - `head element pointer` is not tracked (but presumed non-null)
* - Tokenizer has only a single mode. (<textarea> wants RCDATA and
* <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
*
* We generally mark places where we omit cases from the spec due to
* disallowed elements with a comment: `// OMITTED: <element-name>`.
*
* The HTML spec keeps a flag during the parsing process to track
* whether or not a "parse error" has been encountered. We don't
* bother to track that flag, we just implement the error-handling
* process as specified.
*
* @ingroup Parser
* @since 1.27
* @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
*/
class Balancer {
// NOTE(review): balance() also assigns $this->processingCallback and
// $this->processingArgs, and resetInsertionMode() reads
// $this->templateInsertionModes; none of the three is declared in this
// list. Confirm whether they are declared elsewhere in the class.
/** @var string Name of the method that handles tokens in the current insertion mode */
private $parseMode;
/** @var ExplodeIterator|null Iterator over the input split on '<'; set by balance() */
private $bitsIterator;
/** @var array|null Copy of the 'allowedHtmlElements' config value */
private $allowedHtmlElements;
/** @var BalanceActiveFormattingElements|null Created afresh by balance() */
private $afe;
/** @var BalanceStack|null Open elements stack; created afresh by balance() */
private $stack;
/** @var bool Copy of the 'strict' config value */
private $strict;
/** @var bool Copy of the 'allowComments' config value */
private $allowComments;
/** @var array Configuration merged with defaults by the constructor */
private $config;
/** Reset to false at the start of balance() */
private $textIntegrationMode;
// Not assigned in the methods reviewed alongside this list; presumably
// buffers table text -- TODO confirm.
private $pendingTableText;
// Not assigned in the methods reviewed alongside this list; presumably
// the mode to return to after a temporary mode switch -- TODO confirm.
private $originalInsertionMode;
/** @var BalanceElement|null Context element for fragment parsing; balance() uses a <body> */
private $fragmentContext;
/** @var BalanceElement|null Set by balance() to a <form> ancestor of the context, if any */
private $formElementPointer;
/** @var bool When true, insertToken() drops one leading "\n" from the next text token */
private $ignoreLinefeed;
/** @var string|bool false, or the tag name whose end tag ends RCDATA emulation (see advance()) */
private $inRCDATA;
/** @var string|bool false, or the tag name whose end tag ends RAWTEXT emulation (see advance()) */
private $inRAWTEXT;
/**
* Valid HTML5 comments.
* Regex borrowed from Tim Starling's "remex-html" project.
*
* The pattern is applied to one piece of the input after it has been
* split on '<', which is why it starts at `!--` rather than `<!--`.
* Capture groups, as numbered inside the pattern:
* 1. the whole comment after `!--`
* 2. the comment contents
* 3. the closing sequence
* 4. a closing sequence that is only acceptable at end of input
* 5. the non-tag text following the comment
* advance() reads groups 2 and 5, and uses the offset of group 4 to
* decide whether an end-of-input check is needed.
*/
const VALID_COMMENT_REGEX = "~ !--
( # 1. Comment match detector
> | -> | # Invalid short close
( # 2. Comment contents
(?:
(?! --> )
(?! --!> )
(?! --! \z )
(?! -- \z )
(?! - \z )
.
)*+
)
( # 3. Comment close
--> | # Normal close
--!> | # Comment end bang
( # 4. Indicate matches requiring EOF
--! | # EOF in comment end bang state
-- | # EOF in comment end state
- | # EOF in comment end dash state
# EOF in comment state
)
)
)
([^<]*) \z # 5. Non-tag text after the comment
~xs";
/**
* Create a new Balancer.
* @param array $config Balancer configuration. Includes:
* 'strict' : boolean, defaults to false.
* When true, enforces syntactic constraints on input:
* all non-tag '<' must be escaped, all attributes must be
* separated by a single space and double-quoted. This is
* consistent with the output of the Sanitizer.
* 'allowedHtmlElements' : array, defaults to null.
* When present, the keys of this associative array give
* the acceptable HTML tag names. When not present, no
* tag sanitization is done.
* 'tidyCompat' : boolean, defaults to false.
* When true, the serialization algorithm is tweaked to
* provide historical compatibility with the old "tidy"
* program: <p>-wrapping is done to the children of
* <body> and <blockquote> elements, and empty elements
* are removed.
* 'allowComments': boolean, defaults to true.
* When true, allows HTML comments in the input.
* The Sanitizer generally strips all comments, so if you
* are running on sanitized output you can set this to
* false to get a bit more performance.
*/
public function __construct( array $config = [] ) {
	// Caller-supplied keys win; anything missing falls back to a default.
	$this->config = $config = $config + [
		'strict' => false,
		'allowedHtmlElements' => null,
		'tidyCompat' => false,
		'allowComments' => true,
	];
	$this->allowedHtmlElements = $config['allowedHtmlElements'];
	$this->strict = $config['strict'];
	$this->allowComments = $config['allowComments'];
	if ( $this->allowedHtmlElements !== null ) {
		// Sanity check! Refuse a whitelist that names any element the
		// balancer does not support. Only the keys (tag names) matter,
		// so intersect on keys and ignore the values.
		$bad = array_intersect_key(
			$this->allowedHtmlElements,
			BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
		);
		if ( count( $bad ) > 0 ) {
			// Separator first: the legacy implode( $array, $separator )
			// order throws a TypeError from PHP 8.0 on.
			$badstr = implode( ',', array_keys( $bad ) );
			throw new ParameterAssertionException(
				'$config',
				'Balance attempted with sanitization including ' .
				"unsupported elements: {$badstr}"
			);
		}
	}
}
/**
* Return a balanced HTML string for the HTML fragment given by $text,
* subject to the caveats listed in the class description. The result
* will typically be idempotent -- that is, rebalancing the output
* would result in no change.
*
* @param string $text The markup to be balanced
* @param callable $processingCallback Callback to do any variable or
* parameter replacements in HTML attributes values
* @param array|bool $processingArgs Arguments for the processing callback
* @return string The balanced markup
*/
public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
	// Fresh parser state for every run.
	$this->parseMode = 'inBodyMode';
	$this->bitsIterator = new ExplodeIterator( '<', $text );
	$this->afe = new BalanceActiveFormattingElements();
	$this->stack = new BalanceStack( $this->config );
	$this->processingCallback = $processingCallback;
	$this->processingArgs = $processingArgs;
	$this->textIntegrationMode =
		$this->ignoreLinefeed =
		$this->inRCDATA =
		$this->inRAWTEXT = false;

	// The stack is constructed with an <html> element already on it.
	// Set this up as a fragment parsed with <body> as the context.
	$this->fragmentContext =
		new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
	$this->resetInsertionMode();

	// Point the form element pointer at the nearest <form> among the
	// context element and its ancestors, if there is one.
	$this->formElementPointer = null;
	for ( $e = $this->fragmentContext; $e !== null; $e = $e->parent ) {
		if ( $e->isHtmlNamed( 'form' ) ) {
			$this->formElementPointer = $e;
			break;
		}
	}

	// First element is text not tag
	$x = $this->bitsIterator->current();
	$this->bitsIterator->next();
	// Escape any bare '>' so the emitted text cannot be mistaken for
	// markup. The previous call replaced '>' with itself and so did
	// nothing.
	$this->insertToken( 'text', str_replace( '>', '&gt;', $x ) );

	// Now process each tag.
	while ( $this->bitsIterator->valid() ) {
		$this->advance();
	}
	$this->insertToken( 'eof', null );
	$result = $this->stack->getOutput();

	// Free memory before returning.
	$this->bitsIterator = null;
	$this->afe = null;
	$this->stack = null;
	$this->fragmentContext = null;
	$this->formElementPointer = null;
	return $result;
}
/**
* Pass a token to the tree builder. The $token will be one of the
* strings "tag", "endtag", "text", "comment", or "eof".
*/
private function insertToken( $token, $value, $attribs = null, $selfClose = false ) {
	$isTagToken = ( $token === 'tag' || $token === 'endtag' );
	if ( $isTagToken ) {
		// validate tags against $unsupportedSet
		$unsupported = BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE];
		if ( isset( $unsupported[$value] ) ) {
			// As described in "simplifications" above, these tags are
			// not supported in the balancer.
			Assert::invariant(
				!$this->strict,
				"Unsupported $token <$value> found."
			);
			return false;
		}
	} elseif ( $token === 'text' && $value === '' ) {
		// Don't actually inject the empty string as a text token.
		return true;
	}

	// Support pre/listing/textarea by suppressing initial linefeed.
	// The flag is one-shot: it is cleared whatever the next token is.
	if ( $this->ignoreLinefeed ) {
		$this->ignoreLinefeed = false;
		if ( $token === 'text' && $value[0] === "\n" ) {
			if ( $value === "\n" ) {
				// Nothing would be left, don't inject the empty string.
				return true;
			}
			$value = substr( $value, 1 );
		}
	}

	// Decide whether the token follows the HTML insertion-mode rules or
	// the foreign-content rules. Only the first matching condition in
	// this chain is considered.
	$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );
	if (
		$this->stack->length() === 0 ||
		$adjusted->isHtml() ||
		$token === 'eof'
	) {
		$useHtmlRules = true;
	} elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
		$useHtmlRules = $token === 'text' || (
			$token === 'tag' &&
			$value !== 'mglyph' && $value !== 'malignmark'
		);
	} elseif (
		$adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
		$adjusted->localName === 'annotation-xml' &&
		$token === 'tag' && $value === 'svg'
	) {
		$useHtmlRules = true;
	} elseif (
		$adjusted->isHtmlIntegrationPoint() &&
		( $token === 'tag' || $token === 'text' )
	) {
		$useHtmlRules = true;
	} else {
		$useHtmlRules = false;
	}

	if ( !$useHtmlRules ) {
		return $this->insertForeignToken( $token, $value, $attribs, $selfClose );
	}
	// The current insertion mode is stored as a method name.
	$handler = $this->parseMode;
	return $this->$handler( $token, $value, $attribs, $selfClose );
}
private function insertForeignToken( $token, $value, $attribs = null, $selfClose = false ) {
	// Implements the rules for tokens in foreign (non-HTML) content.
	// Returns true when the token was consumed here; otherwise returns
	// whatever the HTML-mode handler it delegates to returns.
	// NOTE(review): a token kind other than text/tag/endtag (for example
	// 'comment') matches no branch below, so nothing is inserted and the
	// method returns null. Confirm that this is intended.
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'font':
			// Per the HTML specification, <font> is a "breakout" tag only
			// when it carries a color, face or size attribute; a plain
			// <font> is handled as any other foreign start tag. The test
			// used to be the other way round.
			if (
				!isset( $attribs['color'] ) &&
				!isset( $attribs['face'] ) &&
				!isset( $attribs['size'] )
			) {
				break;
			}
			// otherwise, fall through
		case 'b':
		case 'big':
		case 'blockquote':
		case 'body':
		case 'br':
		case 'center':
		case 'code':
		case 'dd':
		case 'div':
		case 'dl':
		case 'dt':
		case 'em':
		case 'embed':
		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
		case 'head':
		case 'hr':
		case 'i':
		case 'img':
		case 'li':
		case 'listing':
		case 'menu':
		case 'meta':
		case 'nobr':
		case 'ol':
		case 'p':
		case 'pre':
		case 'ruby':
		case 's':
		case 'small':
		case 'span':
		case 'strong':
		case 'strike':
		case 'sub':
		case 'sup':
		case 'table':
		case 'tt':
		case 'u':
		case 'ul':
		case 'var':
			if ( $this->fragmentContext ) {
				// Fragment case: treat as any other start tag.
				break;
			}
			// Pop until the current node is an integration point or an
			// HTML element, then reprocess the token from there.
			while ( true ) {
				$this->stack->pop();
				$node = $this->stack->currentNode;
				if (
					$node->isMathmlTextIntegrationPoint() ||
					$node->isHtmlIntegrationPoint() ||
					$node->isHtml()
				) {
					break;
				}
			}
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		}
		// "Any other start tag"
		// The new element inherits the namespace of the adjusted current
		// node (the fragment context when only the root is on the stack).
		$adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
			$this->fragmentContext : $this->stack->currentNode;
		$this->stack->insertForeignElement(
			$adjusted->namespaceURI, $value, $attribs
		);
		if ( $selfClose ) {
			$this->stack->pop();
		}
		return true;
	} elseif ( $token === 'endtag' ) {
		// Walk the stack from the current node towards the root.
		$first = true;
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtml() && !$first ) {
				// process the end tag as HTML
				$func = $this->parseMode;
				return $this->$func( $token, $value, $attribs, $selfClose );
			} elseif ( $i === 0 ) {
				// Reached the root without a match: ignore the end tag.
				return true;
			} elseif ( $node->localName === $value ) {
				$this->stack->popTag( $node );
				return true;
			}
			$first = false;
		}
	}
}
/**
* Grab the next "token" from $bitsIterator. This is either a open/close
* tag or text or a comment, depending on whether the Sanitizer approves.
*/
private function advance() {
	// Each piece from the iterator is the text that followed one '<' in
	// the input.
	$x = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$regs = [];

	// Handle comments. These won't be generated by mediawiki (they
	// are stripped in the Sanitizer) but may be generated by extensions.
	if (
		$this->allowComments &&
		!( $this->inRCDATA || $this->inRAWTEXT ) &&
		preg_match( self::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
		// verify EOF condition where necessary
		( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
	) {
		$contents = $regs[2][0];
		$rest = $regs[5][0];
		$this->insertToken( 'comment', $contents );
		// Escape bare '>' in the trailing text. (This call used to
		// replace '>' with itself.)
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
		return;
	}

	// $slash: Does the current element start with a '/'?
	// $t: Current element name
	// $attribStr: String between element name and >
	// $brace: Ending '>' or '/>'
	// $rest: Everything until the next element from the $bitsIterator
	if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
		list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
		$t = strtolower( $t );
		if ( $this->strict ) {
			// Verify that attributes are all properly double-quoted
			Assert::invariant(
				preg_match(
					'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
				),
				"Bad attribute string found"
			);
		}
	} else {
		Assert::invariant(
			!$this->strict, "< found which does not start a valid tag"
		);
		$slash = $t = $attribStr = $brace = $rest = null;
	}

	$goodTag = $t;
	if ( $this->inRCDATA ) {
		if ( $slash && $t === $this->inRCDATA ) {
			$this->inRCDATA = false;
		} else {
			// No tags allowed; this emulates the "rcdata" tokenizer mode.
			$goodTag = false;
		}
	}
	if ( $this->inRAWTEXT ) {
		if ( $slash && $t === $this->inRAWTEXT ) {
			$this->inRAWTEXT = false;
		} else {
			// No tags allowed, no entity-escaping done.
			$goodTag = false;
		}
	}

	$sanitize = $this->allowedHtmlElements !== null;
	if ( $sanitize ) {
		// Narrow, never widen: a tag already rejected by the RCDATA or
		// RAWTEXT emulation above must stay rejected even if its name is
		// on the whitelist. (The old assignment recomputed $goodTag from
		// $t alone and so discarded that rejection.)
		$goodTag = $goodTag && isset( $this->allowedHtmlElements[$t] );
	}
	if ( $goodTag ) {
		if ( is_callable( $this->processingCallback ) ) {
			// $attribStr is passed by reference so that the callback can
			// rewrite it in place.
			call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
		}
		if ( $sanitize ) {
			$goodTag = Sanitizer::validateTag( $attribStr, $t );
		}
	}
	if ( $goodTag ) {
		$attribs = Sanitizer::decodeTagAttributes( $attribStr );
		if ( $sanitize ) {
			$attribs = Sanitizer::validateTagAttributes( $attribs, $t );
		}
		// insertToken() returns false for tags the balancer rejects.
		$goodTag = $this->insertToken(
			$slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
		);
	}

	if ( $goodTag ) {
		// Escape bare '>' in the text that followed the tag.
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
	} elseif ( $this->inRAWTEXT ) {
		// Raw text is passed through untouched, '<' included.
		$this->insertToken( 'text', "<$x" );
	} else {
		// bad tag; serialize entire thing as text. Both the '<' that
		// introduced it and any '>' inside it are entity-escaped so the
		// output cannot be re-read as a tag.
		$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
	}
}
private function switchMode( $mode ) {
	// Make $mode the current insertion mode and hand back the mode that
	// was active before. Insertion modes are method names that end in
	// "Mode"; anything else is rejected.
	$hasModeSuffix = ( substr( $mode, -4 ) === 'Mode' );
	Assert::parameter( $hasModeSuffix, '$mode', 'should end in Mode' );

	$previous = $this->parseMode;
	$this->parseMode = $mode;
	return $previous;
}
/**
 * Switch to another insertion mode, then feed the given token back
 * through insertToken() so that it is handled again after the switch.
 *
 * @param string $mode Name of the mode handler; must end in "Mode".
 * @param string $token Token type.
 * @param mixed $value Token value, forwarded unchanged.
 * @param mixed $attribs Token attributes, forwarded unchanged.
 * @param bool $selfClose Self-closing flag, forwarded unchanged.
 * @return mixed Whatever insertToken() returns for the reprocessed token.
 */
private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfClose ) {
	$this->switchMode( $mode );
	$result = $this->insertToken( $token, $value, $attribs, $selfClose );
	return $result;
}
/**
 * Recompute the insertion mode from the stack of open elements (the
 * HTML5 "reset the insertion mode appropriately" algorithm).
 *
 * The stack is visited starting from the current node. The entry whose
 * index is 0 is the last one visited; if a fragment context is set, that
 * context element is examined in its place. The first element that
 * determines a mode wins; if none does, the mode becomes inBodyMode.
 */
private function resetInsertionMode() {
	$last = false;
	foreach ( $this->stack as $i => $node ) {
		if ( $i === 0 ) {
			$last = true;
			if ( $this->fragmentContext ) {
				$node = $this->fragmentContext;
			}
		}
		if ( $node->isHtml() ) {
			switch ( $node->localName ) {
			case 'select':
				// Examine the ancestors of the <select>, nearest first.
				// $i is the <select>'s own index in the stack and
				// node() takes the same bottom-based index, so the
				// ancestors live at $i - 1 down to 0. When $i is 0
				// (the "last" node) the loop body never runs, which is
				// what the spec asks for.
				for ( $j = $i - 1; $j >= 0; $j-- ) {
					$ancestor = $this->stack->node( $j );
					if ( $ancestor->isHtmlNamed( 'template' ) ) {
						// A <template> boundary hides any outer <table>.
						break;
					}
					if ( $ancestor->isHtmlNamed( 'table' ) ) {
						$this->switchMode( 'inSelectInTableMode' );
						return;
					}
				}
				$this->switchMode( 'inSelectMode' );
				return;
			case 'tr':
				$this->switchMode( 'inRowMode' );
				return;
			case 'tbody':
			case 'tfoot':
			case 'thead':
				$this->switchMode( 'inTableBodyMode' );
				return;
			case 'caption':
				$this->switchMode( 'inCaptionMode' );
				return;
			case 'colgroup':
				$this->switchMode( 'inColumnGroupMode' );
				return;
			case 'table':
				$this->switchMode( 'inTableMode' );
				return;
			case 'template':
				// Use the current (most recently pushed) template
				// insertion mode.
				$this->switchMode(
					array_slice( $this->templateInsertionModes, -1 )[0]
				);
				return;
			case 'body':
				$this->switchMode( 'inBodyMode' );
				return;
			// OMITTED: <frameset>
			// OMITTED: <html>
			// OMITTED: <head>
			default:
				if ( !$last ) {
					// OMITTED: <head>
					if ( $node->isA( BalanceSets::$tableCellSet ) ) {
						$this->switchMode( 'inCellMode' );
						return;
					}
				}
			}
		}
		if ( $last ) {
			// Nothing on the stack selected a mode.
			$this->switchMode( 'inBodyMode' );
			return;
		}
	}
}
/**
 * End the parse: drop the active formatting elements list and pop the
 * stack of open elements down to a single remaining entry.
 *
 * Of the spec's "stop parsing" steps only "pop all the nodes off the
 * stack of open elements" applies here, and even then the top-most
 * <html> element is deliberately left on the stack.
 */
private function stopParsing() {
	// The AFE list must be released before popping: otherwise the
	// element objects stay live during serialization, potentially using
	// O(N^2) memory. Popping the stack never causes the active
	// formatting elements to be reconstructed, so nothing needs the
	// list after this point.
	$this->afe = null;
	$this->stack->popTo( 1 );
}
/**
 * Open a raw-text element: insert it, flag the tokenizer emulation as
 * being inside that element, and switch to inTextMode while remembering
 * the mode to return to.
 *
 * @param string $value Tag name of the raw-text element.
 * @param mixed $attribs Attributes for the new element.
 * @return bool Always true.
 */
private function parseRawText( $value, $attribs = null ) {
	$this->stack->insertHTMLElement( $value, $attribs );
	$this->inRAWTEXT = $value;
	// switchMode() hands back the mode that was active until now.
	$modeToRestore = $this->switchMode( 'inTextMode' );
	$this->originalInsertionMode = $modeToRestore;
	return true;
}
/**
 * Token handler for the "text" insertion mode.
 *
 * Text is appended to the current node. An end tag or EOF pops the
 * current node and restores the saved original insertion mode; EOF is
 * additionally reprocessed under that mode. Any other token is ignored.
 *
 * @param string $token Token type.
 * @param mixed $value Token value.
 * @param mixed $attribs Token attributes.
 * @param bool $selfClose Self-closing flag.
 * @return mixed True, or the result of reprocessing an EOF token.
 */
private function inTextMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}
	if ( $token !== 'eof' && $token !== 'endtag' ) {
		// Nothing to do for other token types.
		return true;
	}
	// Both remaining cases close the element being read.
	$this->stack->pop();
	if ( $token === 'endtag' ) {
		$this->switchMode( $this->originalInsertionMode );
		return true;
	}
	return $this->switchModeAndReprocess(
		$this->originalInsertionMode, $token, $value, $attribs, $selfClose
	);
}
/**
 * Token handler implementing the "in head" rules that this balancer
 * supports. Other modes delegate head-level tags (and <template>) here.
 *
 * @param string $token Token type.
 * @param mixed $value Tag name, text, or comment content.
 * @param mixed $attribs Token attributes.
 * @param bool $selfClose Self-closing flag.
 * @return mixed True when handled here, otherwise the result of
 *   re-dispatching the token through insertToken().
 */
private function inHeadMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	if ( $token === 'text' ) {
		// Leading whitespace is inserted here; anything after it is
		// handled by the common code at the bottom.
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
			$this->stack->insertText( $leading[0] );
			$value = substr( $value, strlen( $leading[0] ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // Nothing but whitespace.
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		// OMITTED: <html>
		case 'base':
		case 'basefont':
		case 'bgsound':
		case 'link':
		case 'meta':
			// OMITTED: in a full HTML parser, <meta> might change the
			// encoding.
			// These elements are inserted and immediately popped.
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		// OMITTED: <title>
		// OMITTED: <noscript>
		case 'noframes':
		case 'style':
			return $this->parseRawText( $value, $attribs );
		// OMITTED: <script>
		case 'template':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->insertMarker();
			// OMITTED: frameset_ok
			$this->switchMode( 'inTemplateMode' );
			// Record the mode just entered as the current template
			// insertion mode.
			$this->templateInsertionModes[] = $this->parseMode;
			return true;
		// OMITTED: <head>
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		// OMITTED: <head>
		// OMITTED: <body>
		// OMITTED: <html>
		case 'template':
			if ( $this->stack->indexOf( $value ) < 0 ) {
				return true; // No open <template>; ignore.
			}
			$this->stack->generateImpliedEndTags( null, true /* thorough */ );
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			array_pop( $this->templateInsertionModes );
			$this->resetInsertionMode();
			return true;
		case 'br':
			// </br> is dealt with by the common code below.
			break;
		default:
			// Every other end tag is ignored.
			return true;
		}
	}
	// Not handled above: act as if </head> had been seen ...
	$this->inHeadMode( 'endtag', 'head' );
	// ... and then process the token again.
	return $this->insertToken( $token, $value, $attribs, $selfClose );
}
/**
 * Token handler for the "in body" insertion mode. Several other modes
 * delegate their "anything else" case to this method.
 *
 * @param string $token Token type: 'text', 'eof', 'tag', 'endtag' or
 *   'comment'; anything else fails an invariant assertion.
 * @param mixed $value Text for 'text', comment content for 'comment',
 *   tag name for 'tag'/'endtag'.
 * @param mixed $attribs Attributes of a start tag, passed on to the
 *   element-insertion calls.
 * @param bool $selfClose Whether a start tag was self-closing; only
 *   consulted here for <math> and <svg>.
 * @return mixed True when the token is handled (or ignored) here,
 *   otherwise the result of the handler the token was passed on to.
 */
private function inBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
if ( $token === 'text' ) {
$this->afe->reconstruct( $this->stack );
$this->stack->insertText( $value );
return true;
} elseif ( $token === 'eof' ) {
// While template insertion modes are pending, EOF is handled by
// inTemplateMode instead.
if ( !empty( $this->templateInsertionModes ) ) {
return $this->inTemplateMode( $token, $value, $attribs, $selfClose );
}
$this->stopParsing();
return true;
} elseif ( $token === 'tag' ) {
switch ( $value ) {
// OMITTED: <html>
case 'base':
case 'basefont':
case 'bgsound':
case 'link':
case 'meta':
case 'noframes':
// OMITTED: <script>
case 'style':
case 'template':
// OMITTED: <title>
return $this->inHeadMode( $token, $value, $attribs, $selfClose );
// OMITTED: <body>
// OMITTED: <frameset>
case 'address':
case 'article':
case 'aside':
case 'blockquote':
case 'center':
case 'details':
case 'dialog':
case 'dir':
case 'div':
case 'dl':
case 'fieldset':
case 'figcaption':
case 'figure':
case 'footer':
case 'header':
case 'hgroup':
case 'main':
case 'menu':
case 'nav':
case 'ol':
case 'p':
case 'section':
case 'summary':
case 'ul':
// These elements first close any <p> that is open in button scope.
if ( $this->stack->inButtonScope( 'p' ) ) {
$this->inBodyMode( 'endtag', 'p' );
}
$this->stack->insertHTMLElement( $value, $attribs );
return true;
case 'h1':
case 'h2':
case 'h3':
case 'h4':
case 'h5':
case 'h6':
if ( $this->stack->inButtonScope( 'p' ) ) {
$this->inBodyMode( 'endtag', 'p' );
}
// A heading directly inside another heading closes the outer one.
if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) {
$this->stack->pop();
}
$this->stack->insertHTMLElement( $value, $attribs );
return true;
case 'pre':
case 'listing':
if ( $this->stack->inButtonScope( 'p' ) ) {
$this->inBodyMode( 'endtag', 'p' );
}
$this->stack->insertHTMLElement( $value, $attribs );
$this->ignoreLinefeed = true;
// OMITTED: frameset_ok
return true;
case 'form':
if (
$this->formElementPointer &&
$this->stack->indexOf( 'template' ) < 0
) {
return true; // in a form, not in a template.
}
if ( $this->stack->inButtonScope( "p" ) ) {
$this->inBodyMode( 'endtag', 'p' );
}
$elt = $this->stack->insertHTMLElement( $value, $attribs );
// The form element pointer is only set outside of templates.
if ( $this->stack->indexOf( 'template' ) < 0 ) {
$this->formElementPointer = $elt;
}
return true;
case 'li':
// OMITTED: frameset_ok
// Walk the open elements: close an open <li> if one is found before
// hitting a special element other than those in $addressDivPSet.
foreach ( $this->stack as $node ) {
if ( $node->isHtmlNamed( 'li' ) ) {
$this->inBodyMode( 'endtag', 'li' );
break;
}
if (
$node->isA( BalanceSets::$specialSet ) &&
!$node->isA( BalanceSets::$addressDivPSet )
) {
break;
}
}
if ( $this->stack->inButtonScope( 'p' ) ) {
$this->inBodyMode( 'endtag', 'p' );
}
$this->stack->insertHTMLElement( $value, $attribs );
return true;
case 'dd':
case 'dt':
// OMITTED: frameset_ok
// Same walk as for <li>, but closing an open <dd> or <dt>.
foreach ( $this->stack as $node ) {
if ( $node->isHtmlNamed( 'dd' ) ) {
$this->inBodyMode( 'endtag', 'dd' );
break;
}
if ( $node->isHtmlNamed( 'dt' ) ) {
$this->inBodyMode( 'endtag', 'dt' );
break;
}
if (
$node->isA( BalanceSets::$specialSet ) &&
!$node->isA( BalanceSets::$addressDivPSet )
) {
break;
}
}
if ( $this->stack->inButtonScope( 'p' ) ) {
$this->inBodyMode( 'endtag', 'p' );
}
$this->stack->insertHTMLElement( $value, $attribs );
return true;
// OMITTED: <plaintext>
case 'button':
// A <button> inside an open <button>: close the open one, then
// process this start tag again.
if ( $this->stack->inScope( 'button' ) ) {
$this->inBodyMode( 'endtag', 'button' );
return $this->insertToken( $token, $value, $attribs, $selfClose );
}
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
return true;
case 'a':
// An <a> already in the active formatting elements list is closed
// first, and removed from the list and the stack if still present.
$activeElement = $this->afe->findElementByTag( 'a' );
if ( $activeElement ) {
$this->inBodyMode( 'endtag', 'a' );
if ( $this->afe->isInList( $activeElement ) ) {
$this->afe->remove( $activeElement );
// Don't flatten here, since when we fall
// through below we might foster parent
// the new <a> tag inside this one.
$this->stack->removeElement( $activeElement, false );
}
}
// Falls through
case 'b':
case 'big':
case 'code':
case 'em':
case 'font':
case 'i':
case 's':
case 'small':
case 'strike':
case 'strong':
case 'tt':
case 'u':
// Formatting elements are inserted and also pushed onto the AFE list.
$this->afe->reconstruct( $this->stack );
$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
return true;
case 'nobr':
$this->afe->reconstruct( $this->stack );
if ( $this->stack->inScope( 'nobr' ) ) {
$this->inBodyMode( 'endtag', 'nobr' );
$this->afe->reconstruct( $this->stack );
}
$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
return true;
case 'applet':
case 'marquee':
case 'object':
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
$this->afe->insertMarker();
// OMITTED: frameset_ok
return true;
case 'table':
// The document is never in "quirks mode"; see simplifications
// above.
if ( $this->stack->inButtonScope( 'p' ) ) {
$this->inBodyMode( 'endtag', 'p' );
}
$this->stack->insertHTMLElement( $value, $attribs );
// OMITTED: frameset_ok
$this->switchMode( 'inTableMode' );
return true;
case 'area':
case 'br':
case 'embed':
case 'img':
case 'keygen':
case 'wbr':
// Inserted and immediately popped.
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
// OMITTED: frameset_ok
return true;
case 'input':
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
// OMITTED: frameset_ok
// (hence we don't need to examine the tag's "type" attribute)
return true;
case 'menuitem':
case 'param':
case 'source':
case 'track':
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
return true;
case 'hr':
if ( $this->stack->inButtonScope( 'p' ) ) {
$this->inBodyMode( 'endtag', 'p' );
}
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
return true;
case 'image':
// warts!
// <image> is handled exactly as if it were <img>.
return $this->inBodyMode( $token, 'img', $attribs, $selfClose );
// OMITTED: <isindex>
case 'textarea':
$this->stack->insertHTMLElement( $value, $attribs );
$this->ignoreLinefeed = true;
$this->inRCDATA = $value; // emulate rcdata tokenizer mode
// OMITTED: frameset_ok
return true;
// OMITTED: <xmp>
// OMITTED: <iframe>
// OMITTED: <noembed>
// OMITTED: <noscript>
case 'select':
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
// The mode entered depends on whether the current mode is one of the
// table-related modes.
switch ( $this->parseMode ) {
case 'inTableMode':
case 'inCaptionMode':
case 'inTableBodyMode':
case 'inRowMode':
case 'inCellMode':
$this->switchMode( 'inSelectInTableMode' );
return true;
default:
$this->switchMode( 'inSelectMode' );
return true;
}
case 'optgroup':
case 'option':
if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
$this->inBodyMode( 'endtag', 'option' );
}
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
return true;
case 'rb':
case 'rtc':
if ( $this->stack->inScope( 'ruby' ) ) {
$this->stack->generateImpliedEndTags();
}
$this->stack->insertHTMLElement( $value, $attribs );
return true;
case 'rp':
case 'rt':
if ( $this->stack->inScope( 'ruby' ) ) {
$this->stack->generateImpliedEndTags( 'rtc' );
}
$this->stack->insertHTMLElement( $value, $attribs );
return true;
case 'math':
$this->afe->reconstruct( $this->stack );
// We skip the spec's "adjust MathML attributes" and
// "adjust foreign attributes" steps, since the browser will
// do this later when it parses the output and it doesn't affect
// balancing.
$this->stack->insertForeignElement(
BalanceSets::MATHML_NAMESPACE, $value, $attribs
);
if ( $selfClose ) {
// emit explicit </math> tag.
$this->stack->pop();
}
return true;
case 'svg':
$this->afe->reconstruct( $this->stack );
// We skip the spec's "adjust SVG attributes" and
// "adjust foreign attributes" steps, since the browser will
// do this later when it parses the output and it doesn't affect
// balancing.
$this->stack->insertForeignElement(
BalanceSets::SVG_NAMESPACE, $value, $attribs
);
if ( $selfClose ) {
// emit explicit </svg> tag.
$this->stack->pop();
}
return true;
case 'caption':
case 'col':
case 'colgroup':
// OMITTED: <frame>
case 'head':
case 'tbody':
case 'td':
case 'tfoot':
case 'th':
case 'thead':
case 'tr':
// Ignore table tags if we're not inTableMode
return true;
}
// Handle any other start tag here
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
return true;
} elseif ( $token === 'endtag' ) {
switch ( $value ) {
// </body>,</html> are unsupported.
case 'template':
return $this->inHeadMode( $token, $value, $attribs, $selfClose );
case 'address':
case 'article':
case 'aside':
case 'blockquote':
case 'button':
case 'center':
case 'details':
case 'dialog':
case 'dir':
case 'div':
case 'dl':
case 'fieldset':
case 'figcaption':
case 'figure':
case 'footer':
case 'header':
case 'hgroup':
case 'listing':
case 'main':
case 'menu':
case 'nav':
case 'ol':
case 'pre':
case 'section':
case 'summary':
case 'ul':
// Ignore if there is not a matching open tag
if ( !$this->stack->inScope( $value ) ) {
return true;
}
$this->stack->generateImpliedEndTags();
$this->stack->popTag( $value );
return true;
case 'form':
if ( $this->stack->indexOf( 'template' ) < 0 ) {
// Outside a template: close the element the form pointer refers to.
$openform = $this->formElementPointer;
$this->formElementPointer = null;
if ( !$openform || !$this->stack->inScope( $openform ) ) {
return true;
}
$this->stack->generateImpliedEndTags();
// Don't flatten yet if we're removing a <form> element
// out-of-order. (eg. `<form><div></form>`)
$flatten = ( $this->stack->currentNode === $openform );
$this->stack->removeElement( $openform, $flatten );
} else {
// Inside a template the form pointer is not consulted.
if ( !$this->stack->inScope( 'form' ) ) {
return true;
}
$this->stack->generateImpliedEndTags();
$this->stack->popTag( 'form' );
}
return true;
case 'p':
// </p> with no <p> in button scope: insert an empty <p> first, then
// process the end tag again so that it closes it.
if ( !$this->stack->inButtonScope( 'p' ) ) {
$this->inBodyMode( 'tag', 'p', [] );
return $this->insertToken( $token, $value, $attribs, $selfClose );
}
$this->stack->generateImpliedEndTags( $value );
$this->stack->popTag( $value );
return true;
case 'li':
if ( !$this->stack->inListItemScope( $value ) ) {
return true; // ignore
}
$this->stack->generateImpliedEndTags( $value );
$this->stack->popTag( $value );
return true;
case 'dd':
case 'dt':
if ( !$this->stack->inScope( $value ) ) {
return true; // ignore
}
$this->stack->generateImpliedEndTags( $value );
$this->stack->popTag( $value );
return true;
case 'h1':
case 'h2':
case 'h3':
case 'h4':
case 'h5':
case 'h6':
// Scope test and pop both use the whole heading set, not only $value.
if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
return true; // ignore
}
$this->stack->generateImpliedEndTags();
$this->stack->popTag( BalanceSets::$headingSet );
return true;
case 'sarcasm':
// Take a deep breath, then:
break;
case 'a':
case 'b':
case 'big':
case 'code':
case 'em':
case 'font':
case 'i':
case 'nobr':
case 's':
case 'small':
case 'strike':
case 'strong':
case 'tt':
case 'u':
if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
return true; // If we did something, we're done.
}
break; // Go to the "any other end tag" case.
case 'applet':
case 'marquee':
case 'object':
if ( !$this->stack->inScope( $value ) ) {
return true; // ignore
}
$this->stack->generateImpliedEndTags();
$this->stack->popTag( $value );
$this->afe->clearToMarker();
return true;
case 'br':
// Turn </br> into <br>
return $this->inBodyMode( 'tag', $value, [] );
}
// Any other end tag goes here
// Search from the current node for an element with this name; give up
// at the first special element met before a match.
foreach ( $this->stack as $i => $node ) {
if ( $node->isHtmlNamed( $value ) ) {
$this->stack->generateImpliedEndTags( $value );
$this->stack->popTo( $i ); // including $i
break;
} elseif ( $node->isA( BalanceSets::$specialSet ) ) {
return true; // ignore this close token.
}
}
return true;
} elseif ( $token === 'comment' ) {
$this->stack->insertComment( $value );
return true;
} else {
Assert::invariant( false, "Bad token type: $token" );
}
}
/**
 * Token handler for the "in table" insertion mode.
 *
 * Tokens not covered by a table-specific rule are handed to inBodyMode()
 * with foster parenting switched on for the duration of that call.
 *
 * @param string $token Token type.
 * @param mixed $value Tag name, text, or comment content.
 * @param mixed $attribs Token attributes.
 * @param bool $selfClose Self-closing flag.
 * @return mixed True, or the result of the handler the token was
 *   passed on to.
 */
private function inTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'eof' ) {
		$this->stopParsing();
		return true;
	}
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	if ( $token === 'text' ) {
		if ( $this->textIntegrationMode ) {
			return $this->inBodyMode( $token, $value, $attribs, $selfClose );
		}
		if ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
			// Collect the text in inTableTextMode, remembering which
			// mode to come back to.
			$this->pendingTableText = '';
			$this->originalInsertionMode = $this->parseMode;
			return $this->switchModeAndReprocess(
				'inTableTextMode', $token, $value, $attribs, $selfClose
			);
		}
		// Otherwise: "anything else", below.
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'caption':
			$this->afe->insertMarker();
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCaptionMode' );
			return true;
		case 'colgroup':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inColumnGroupMode' );
			return true;
		case 'col':
			// Supply the missing <colgroup>, then process <col> again.
			$this->inTableMode( 'tag', 'colgroup', [] );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		case 'tbody':
		case 'tfoot':
		case 'thead':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inTableBodyMode' );
			return true;
		case 'td':
		case 'th':
		case 'tr':
			// Supply the missing <tbody>, then process the tag again.
			$this->inTableMode( 'tag', 'tbody', [] );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore this tag.
			}
			// Close the open table, then process <table> again.
			$this->inTableMode( 'endtag', $value );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		case 'style':
		// OMITTED: <script>
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		case 'input':
			$isHidden = isset( $attribs['type'] ) &&
				strcasecmp( $attribs['type'], 'hidden' ) === 0;
			if ( !$isHidden ) {
				break; // Handle this as "everything else"
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		case 'form':
			if (
				$this->formElementPointer ||
				$this->stack->indexOf( 'template' ) >= 0
			) {
				return true; // ignore this token
			}
			$formElement = $this->stack->insertHTMLElement( $value, $attribs );
			$this->formElementPointer = $formElement;
			$this->stack->popTag( $this->formElementPointer );
			return true;
		}
		// Otherwise: "anything else", below.
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore.
			}
			$this->stack->popTag( $value );
			$this->resetInsertionMode();
			return true;
		// OMITTED: <body>
		// OMITTED: <html>
		case 'caption':
		case 'col':
		case 'colgroup':
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			return true; // Ignore the token.
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		// Otherwise: "anything else", below.
	}
	// The "anything else" case: in-body rules with foster parenting.
	$this->stack->fosterParentMode = true;
	$this->inBodyMode( $token, $value, $attribs, $selfClose );
	$this->stack->fosterParentMode = false;
	return true;
}
/**
 * Token handler for the "in table text" insertion mode.
 *
 * Text tokens are accumulated. The first non-text token flushes what
 * was collected, restores the original insertion mode, and is then
 * reprocessed under that mode.
 *
 * @param string $token Token type.
 * @param mixed $value Token value.
 * @param mixed $attribs Token attributes.
 * @param bool $selfClose Self-closing flag.
 * @return mixed True for text, else the result of reprocessing.
 */
private function inTableTextMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->pendingTableText .= $value;
		return true;
	}
	// Any other token ends the run of text.
	$pending = $this->pendingTableText;
	$this->pendingTableText = '';
	$hasNonWhitespace = preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $pending );
	if ( !$hasNonWhitespace ) {
		// Whitespace only: insert it where it is.
		$this->stack->insertText( $pending );
	} else {
		// This should match the "anything else" case inTableMode
		$this->stack->fosterParentMode = true;
		$this->inBodyMode( 'text', $pending );
		$this->stack->fosterParentMode = false;
	}
	return $this->switchModeAndReprocess(
		$this->originalInsertionMode, $token, $value, $attribs, $selfClose
	);
}
/**
 * Helper for inCaptionMode: close the open <caption>, if there is one
 * in table scope, and go back to inTableMode.
 *
 * @return bool False if no <caption> is in table scope (nothing is
 *   changed in that case), true once the caption has been closed.
 */
private function endCaption() {
	if ( $this->stack->inTableScope( 'caption' ) ) {
		$this->stack->generateImpliedEndTags();
		$this->stack->popTag( 'caption' );
		$this->afe->clearToMarker();
		$this->switchMode( 'inTableMode' );
		return true;
	}
	return false;
}
/**
 * Token handler for the "in caption" insertion mode.
 *
 * Table-structure start tags and </table> close the caption and are
 * then reprocessed; </caption> just closes it; stray table-structure
 * end tags are ignored; everything else follows the in-body rules.
 *
 * @param string $token Token type.
 * @param mixed $value Tag name, text, or comment content.
 * @param mixed $attribs Token attributes.
 * @param bool $selfClose Self-closing flag.
 * @return mixed True, or the result of inBodyMode().
 */
private function inCaptionMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'caption':
			$this->endCaption();
			return true;
		case 'table':
			// Reprocess only if a caption was actually closed.
			if ( $this->endCaption() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		// OMITTED: <html>
		case 'body':
		case 'col':
		case 'colgroup':
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			// Ignore the token
			return true;
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'caption':
		case 'col':
		case 'colgroup':
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			// Reprocess only if a caption was actually closed.
			if ( $this->endCaption() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	}
	// The "anything else" case.
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
/**
 * Token handler for the "in column group" insertion mode.
 *
 * @param string $token Token type.
 * @param mixed $value Tag name, text, or comment content.
 * @param mixed $attribs Token attributes.
 * @param bool $selfClose Self-closing flag.
 * @return mixed True, or the result of the handler the token was
 *   passed on to.
 */
private function inColumnGroupMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	if ( $token === 'text' ) {
		// Leading whitespace is inserted here; any remainder is treated
		// as "anything else".
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
			$this->stack->insertText( $leading[0] );
			$value = substr( $value, strlen( $leading[0] ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // Nothing but whitespace.
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		// OMITTED: <html>
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		case 'col':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		case 'col':
			return true; // Ignore the token.
		case 'colgroup':
			if ( !$this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
				return true; // Ignore the token.
			}
			$this->stack->pop();
			$this->switchMode( 'inTableMode' );
			return true;
		}
	}
	// Anything else: close the <colgroup> if it is the current node and
	// process the token again; otherwise drop the token.
	if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
		$this->inColumnGroupMode( 'endtag', 'colgroup' );
		return $this->insertToken( $token, $value, $attribs, $selfClose );
	}
	return true;
}
/**
 * Helper for inTableBodyMode: close the open table section (<tbody>,
 * <thead> or <tfoot>) and go back to inTableMode.
 *
 * @return bool False if none of the three is in table scope (nothing
 *   is changed in that case), true once the section has been closed.
 */
private function endSection() {
	$sectionOpen = $this->stack->inTableScope( 'tbody' ) ||
		$this->stack->inTableScope( 'thead' ) ||
		$this->stack->inTableScope( 'tfoot' );
	if ( !$sectionOpen ) {
		return false;
	}
	$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
	$this->stack->pop();
	$this->switchMode( 'inTableMode' );
	return true;
}
/**
 * Token handler for the "in table body" insertion mode.
 *
 * @param string $token Token type.
 * @param mixed $value Tag name, text, or comment content.
 * @param mixed $attribs Token attributes.
 * @param bool $selfClose Self-closing flag.
 * @return mixed True, or the result of inTableMode() for tokens with no
 *   rule of their own here.
 */
private function inTableBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'tbody':
		case 'tfoot':
		case 'thead':
			// Only act if the named section itself is open.
			if ( $this->stack->inTableScope( $value ) ) {
				$this->endSection();
			}
			return true;
		case 'table':
			// Close the section, then let </table> be processed again.
			if ( $this->endSection() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		// OMITTED: <body>
		// OMITTED: <html>
		case 'caption':
		case 'col':
		case 'colgroup':
		case 'td':
		case 'th':
		case 'tr':
			return true; // Ignore the token.
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'tr':
			$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inRowMode' );
			return true;
		case 'td':
		case 'th':
			// Supply the missing <tr>, then process the cell again.
			$this->inTableBodyMode( 'tag', 'tr', [] );
			$this->insertToken( $token, $value, $attribs, $selfClose );
			return true;
		case 'caption':
		case 'col':
		case 'colgroup':
		case 'tbody':
		case 'tfoot':
		case 'thead':
			// Close the section, then process the tag again.
			if ( $this->endSection() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	}
	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
/**
 * Helper for inRowMode: close the open <tr> and go back to
 * inTableBodyMode.
 *
 * @return bool False if no <tr> is in table scope (nothing is changed
 *   in that case), true once the row has been closed.
 */
private function endRow() {
	if ( $this->stack->inTableScope( 'tr' ) ) {
		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
		$this->stack->pop();
		$this->switchMode( 'inTableBodyMode' );
		return true;
	}
	return false;
}
/**
 * Token handler for the "in row" insertion mode.
 *
 * @param string $token Token type.
 * @param mixed $value Tag name, text, or comment content.
 * @param mixed $attribs Token attributes.
 * @param bool $selfClose Self-closing flag.
 * @return mixed True, or the result of inTableMode() for tokens with no
 *   rule of their own here.
 */
private function inRowMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'tr':
			$this->endRow();
			return true;
		case 'table':
			// Close the row, then let </table> be processed again.
			if ( $this->endRow() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		case 'tbody':
		case 'tfoot':
		case 'thead':
			// Only when the named section is open: close the row and
			// process the end tag again.
			if (
				$this->stack->inTableScope( $value ) &&
				$this->endRow()
			) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		// OMITTED: <body>
		// OMITTED: <html>
		case 'caption':
		case 'col':
		case 'colgroup':
		case 'td':
		case 'th':
			return true; // Ignore the token.
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'td':
		case 'th':
			$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCellMode' );
			$this->afe->insertMarker();
			return true;
		case 'caption':
		case 'col':
		case 'colgroup':
		case 'tbody':
		case 'tfoot':
		case 'thead':
		case 'tr':
			// Close the row, then process the tag again.
			if ( $this->endRow() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	}
	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
/**
 * Helper for inCellMode: close the open table cell by feeding the
 * matching end tag to inCellMode(). <td> is tried before <th>.
 *
 * @return bool True if a <td> or <th> was in table scope, false if
 *   neither was (in which case nothing is done).
 */
private function endCell() {
	foreach ( [ 'td', 'th' ] as $cellName ) {
		if ( $this->stack->inTableScope( $cellName ) ) {
			$this->inCellMode( 'endtag', $cellName );
			return true;
		}
	}
	return false;
}
/**
 * Token handler for the "in cell" insertion mode.
 *
 * @param string $token Token type.
 * @param mixed $value Tag name, text, or comment content.
 * @param mixed $attribs Token attributes.
 * @param bool $selfClose Self-closing flag.
 * @return mixed True, or the result of inBodyMode() for tokens with no
 *   rule of their own here.
 */
private function inCellMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'td':
		case 'th':
			// Close the named cell; ignored if it is not open.
			if ( $this->stack->inTableScope( $value ) ) {
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( $value );
				$this->afe->clearToMarker();
				$this->switchMode( 'inRowMode' );
			}
			return true;
		case 'table':
		case 'tbody':
		case 'tfoot':
		case 'thead':
		case 'tr':
			// If the named element is open: close whichever cell is
			// open, then process the end tag again. Otherwise ignore it.
			if ( $this->stack->inTableScope( $value ) ) {
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( BalanceSets::$tableCellSet );
				$this->afe->clearToMarker();
				$this->switchMode( 'inRowMode' );
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		// OMITTED: <body>
		// OMITTED: <html>
		case 'caption':
		case 'col':
		case 'colgroup':
			return true;
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'caption':
		case 'col':
		case 'colgroup':
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			// Close the cell, then process the tag again.
			if ( $this->endCell() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	}
	// Anything else:
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
/**
 * Token handler for the "in select" insertion mode.
 *
 * Tokens that have no rule here are silently dropped.
 *
 * @param string $token Token type.
 * @param mixed $value Tag name, text, or comment content.
 * @param mixed $attribs Token attributes.
 * @param bool $selfClose Self-closing flag.
 * @return mixed True, or the result of the handler the token was
 *   passed on to.
 */
private function inSelectMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}
	if ( $token === 'tag' ) {
		switch ( $value ) {
		// OMITTED: <html>
		case 'option':
			// A new <option> closes the one currently open.
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;
		case 'optgroup':
			// A new <optgroup> closes an open <option> and then an open
			// <optgroup>, in that order.
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->stack->pop();
			}
			if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;
		case 'select':
			// A nested <select> start tag is treated like </select>.
			$this->inSelectMode( 'endtag', $value );
			return true;
		case 'input':
		case 'keygen':
		case 'textarea':
			if ( !$this->stack->inSelectScope( 'select' ) ) {
				return true; // ignore token (fragment case)
			}
			// Close the <select>, then process the tag again.
			$this->inSelectMode( 'endtag', 'select' );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		case 'script':
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'optgroup':
			// An <option> sitting directly on top of an <optgroup> is
			// closed first.
			if (
				$this->stack->currentNode->isHtmlNamed( 'option' ) &&
				$this->stack->length() >= 2 &&
				$this->stack->node( $this->stack->length() - 2 )->isHtmlNamed( 'optgroup' )
			) {
				$this->stack->pop();
			}
			if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
				$this->stack->pop();
			}
			return true;
		case 'option':
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->stack->pop();
			}
			return true;
		case 'select':
			if ( !$this->stack->inSelectScope( $value ) ) {
				return true; // fragment case
			}
			$this->stack->popTag( $value );
			$this->resetInsertionMode();
			return true;
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
	}
	// anything else: just ignore the token
	return true;
}
/**
 * Token handler for the "in select in table" insertion mode.
 *
 * Table-structure start tags close the <select> and are reprocessed;
 * table-structure end tags do the same, but only when the named element
 * is in table scope (otherwise they are ignored). All other tokens
 * follow the inSelectMode() rules.
 *
 * @param string $token Token type.
 * @param mixed $value Tag name, text, or comment content.
 * @param mixed $attribs Token attributes.
 * @param bool $selfClose Self-closing flag.
 * @return mixed True, or the result of the handler the token was
 *   passed on to.
 */
private function inSelectInTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	switch ( $value ) {
	case 'caption':
	case 'table':
	case 'tbody':
	case 'tfoot':
	case 'thead':
	case 'tr':
	case 'td':
	case 'th':
		if ( $token === 'endtag' ) {
			if ( !$this->stack->inTableScope( $value ) ) {
				return true;
			}
			$this->inSelectInTableMode( 'endtag', 'select' );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		}
		if ( $token === 'tag' ) {
			$this->inSelectInTableMode( 'endtag', 'select' );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		}
		// Neither a start nor an end tag: handled as "anything else".
		break;
	}
	// anything else
	return $this->inSelectMode( $token, $value, $attribs, $selfClose );
}
/**
 * Token handler for the "in template" insertion mode.
 *
 * Text and comments follow the in-body rules. A start tag selects the
 * insertion mode appropriate for the template's content, records it as
 * the current template insertion mode, and is reprocessed under it.
 * Only </template> is honoured among end tags. EOF closes an open
 * <template> and is then reprocessed, or stops parsing if none is open.
 *
 * @param string $token Token type: 'text', 'comment', 'eof', 'tag' or
 *   'endtag'; anything else fails an invariant assertion.
 * @param mixed $value Tag name, text, or comment content.
 * @param mixed $attribs Token attributes.
 * @param bool $selfClose Self-closing flag.
 * @return mixed True, or the result of the handler the token was
 *   passed on to.
 */
private function inTemplateMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' || $token === 'comment' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	} elseif ( $token === 'eof' ) {
		if ( $this->stack->indexOf( 'template' ) < 0 ) {
			$this->stopParsing();
		} else {
			// Close the unterminated <template>, then let EOF be
			// processed again in whatever mode results.
			$this->stack->popTag( 'template' );
			$this->afe->clearToMarker();
			array_pop( $this->templateInsertionModes );
			$this->resetInsertionMode();
			$this->insertToken( $token, $value, $attribs, $selfClose );
		}
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'base':
		case 'basefont':
		case 'bgsound':
		case 'link':
		case 'meta':
		case 'noframes':
		// OMITTED: <script>
		case 'style':
		case 'template':
		// OMITTED: <title>
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		case 'caption':
		case 'colgroup':
		case 'tbody':
		case 'tfoot':
		case 'thead':
			$mode = 'inTableMode';
			break;
		case 'col':
			$mode = 'inColumnGroupMode';
			break;
		case 'tr':
			$mode = 'inTableBodyMode';
			break;
		case 'td':
		case 'th':
			$mode = 'inRowMode';
			break;
		default:
			$mode = 'inBodyMode';
		}
		// The spec requires the current template insertion mode to be
		// replaced (pop, then push) by the mode being switched to, so
		// that a later resetInsertionMode() which reaches this
		// <template> restores that mode rather than inTemplateMode.
		if ( !empty( $this->templateInsertionModes ) ) {
			array_pop( $this->templateInsertionModes );
			$this->templateInsertionModes[] = $mode;
		}
		return $this->switchModeAndReprocess(
			$mode, $token, $value, $attribs, $selfClose
		);
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		// Every other end tag is ignored.
		return true;
	} else {
		Assert::invariant( false, "Bad token type: $token" );
	}
}
}