%PDF- %PDF-
Direktori : /www/varak.net/wiki.varak.net/includes/tidy/ |
Current File : /www/varak.net/wiki.varak.net/includes/tidy/Balancer.php |
<?php /** * An implementation of the tree building portion of the HTML5 parsing * spec. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * http://www.gnu.org/copyleft/gpl.html * * @file * @ingroup Parser * @since 1.27 * @author C. Scott Ananian, 2016 */ namespace MediaWiki\Tidy; use Wikimedia\Assert\Assert; use Wikimedia\Assert\ParameterAssertionException; use \ExplodeIterator; use \IteratorAggregate; use \ReverseArrayIterator; use \Sanitizer; // A note for future librarization[1] -- this file is a good candidate // for splitting into an independent library, except that it is currently // highly optimized for MediaWiki use. It only implements the portions // of the HTML5 tree builder used by tags supported by MediaWiki, and // does not contain a true tokenizer pass, instead relying on // comment stripping, attribute normalization, and escaping done by // the MediaWiki Sanitizer. It also deliberately avoids building // a true DOM in memory, instead serializing elements to an output string // as soon as possible (usually as soon as the tag is closed) to reduce // its memory footprint. // We've been gradually lifting some of these restrictions to handle // non-sanitized output generated by extensions, but we shortcut the tokenizer // for speed (primarily by splitting on `<`) and so rely on syntactic // well-formedness. 
// On the other hand, I've been pretty careful to note with comments in the
// code the places where this implementation omits features of the spec or
// depends on the MediaWiki Sanitizer.  Perhaps in the future we'll want to
// implement the missing pieces and make this a standalone PHP HTML5 parser.
// In order to do so, some sort of MediaWiki-specific API will need
// to be added to (a) allow the Balancer to bypass the tokenizer,
// and (b) support on-the-fly flattening instead of DOM node creation.

// [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki

/**
 * Utility constants and sets for the HTML5 tree building algorithm.
 * Sets are associative arrays indexed first by namespace and then by
 * lower-cased tag name.  Membership is tested with isset(), so every
 * value is simply `true` (see BalanceElement::isA()).
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceSets {
	// Namespace URIs; these are the first-level keys of every set below.
	const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
	const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
	const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';

	public static $unsupportedSet = [
		self::HTML_NAMESPACE => [
			'html' => true, 'head' => true, 'body' => true, 'frameset' => true,
			'frame' => true,
			'plaintext' => true, 'isindex' => true,
			'xmp' => true, 'iframe' => true, 'noembed' => true,
			'noscript' => true, 'script' => true,
			'title' => true
		]
	];

	// Elements serialized as "<tag />" with no children
	// (see BalanceElement::__toString()).
	public static $emptyElementSet = [
		self::HTML_NAMESPACE => [
			'area' => true, 'base' => true, 'basefont' => true,
			'bgsound' => true, 'br' => true, 'col' => true, 'command' => true,
			'embed' => true, 'frame' => true, 'hr' => true, 'img' => true,
			'input' => true, 'keygen' => true, 'link' => true, 'meta' => true,
			'param' => true, 'source' => true, 'track' => true, 'wbr' => true
		]
	];

	// Elements whose leading linefeed is doubled on serialization
	// (see BalanceElement::__toString()).
	public static $extraLinefeedSet = [
		self::HTML_NAMESPACE => [
			'pre' => true, 'textarea' => true, 'listing' => true,
		]
	];

	public static $headingSet = [
		self::HTML_NAMESPACE => [
			'h1' => true, 'h2' => true, 'h3' => true,
			'h4' => true, 'h5' => true, 'h6' => true
		]
	];

	public static $specialSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'applet' => true, 'area' => true,
			'article' => true, 'aside' => true, 'base' => true,
			'basefont' => true, 'bgsound' => true, 'blockquote' => true,
			'body' => true, 'br' => true, 'button' => true, 'caption' => true,
			'center' => true, 'col' => true, 'colgroup' => true, 'dd' => true,
			'details' => true, 'dir' => true, 'div' => true, 'dl' => true,
			'dt' => true, 'embed' => true, 'fieldset' => true,
			'figcaption' => true, 'figure' => true, 'footer' => true,
			'form' => true, 'frame' => true, 'frameset' => true, 'h1' => true,
			'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true,
			'h6' => true, 'head' => true, 'header' => true, 'hgroup' => true,
			'hr' => true, 'html' => true, 'iframe' => true, 'img' => true,
			'input' => true, 'isindex' => true, 'li' => true, 'link' => true,
			'listing' => true, 'main' => true, 'marquee' => true,
			'menu' => true, 'menuitem' => true, 'meta' => true, 'nav' => true,
			'noembed' => true, 'noframes' => true, 'noscript' => true,
			'object' => true, 'ol' => true, 'p' => true, 'param' => true,
			'plaintext' => true, 'pre' => true, 'script' => true,
			'section' => true, 'select' => true, 'source' => true,
			'style' => true, 'summary' => true, 'table' => true,
			'tbody' => true, 'td' => true, 'template' => true,
			'textarea' => true, 'tfoot' => true, 'th' => true, 'thead' => true,
			'title' => true, 'tr' => true, 'track' => true, 'ul' => true,
			'wbr' => true, 'xmp' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
			'mtext' => true, 'annotation-xml' => true
		]
	];

	public static $addressDivPSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'div' => true, 'p' => true
		]
	];

	public static $tableSectionRowSet = [
		self::HTML_NAMESPACE => [
			'table' => true, 'thead' => true, 'tbody' => true,
			'tfoot' => true, 'tr' => true
		]
	];

	// Used by BalanceStack::generateImpliedEndTags() in its default mode.
	public static $impliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'dd' => true, 'dt' => true, 'li' => true, 'optgroup' => true,
			'option' => true, 'p' => true, 'rb' => true, 'rp' => true,
			'rt' => true, 'rtc' => true
		]
	];

	// Used by BalanceStack::generateImpliedEndTags() when $thorough is true.
	public static $thoroughImpliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'caption' => true, 'colgroup' => true, 'dd' => true, 'dt' => true,
			'li' => true, 'optgroup' => true, 'option' => true, 'p' => true,
			'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true,
			'tbody' => true, 'td' => true, 'tfoot' => true, 'th' => true,
			'thead' => true, 'tr' => true
		]
	];

	public static $tableCellSet = [
		self::HTML_NAMESPACE => [
			'td' => true, 'th' => true
		]
	];
	public static $tableContextSet = [
		self::HTML_NAMESPACE => [
			'table' => true, 'template' => true, 'html' => true
		]
	];

	public static $tableBodyContextSet = [
		self::HTML_NAMESPACE => [
			'tbody' => true, 'tfoot' => true, 'thead' => true,
			'template' => true, 'html' => true
		]
	];

	public static $tableRowContextSet = [
		self::HTML_NAMESPACE => [
			'tr' => true, 'template' => true, 'html' => true
		]
	];

	// See https://html.spec.whatwg.org/multipage/forms.html#form-associated-element
	public static $formAssociatedSet = [
		self::HTML_NAMESPACE => [
			'button' => true, 'fieldset' => true, 'input' => true,
			'keygen' => true, 'object' => true, 'output' => true,
			'select' => true, 'textarea' => true, 'img' => true
		]
	];

	// Boundary elements for BalanceStack::inScope(): the scope search
	// stops (returning false) when it reaches a member of this set.
	public static $inScopeSet = [
		self::HTML_NAMESPACE => [
			'applet' => true, 'caption' => true, 'html' => true,
			'marquee' => true, 'object' => true,
			'table' => true, 'td' => true, 'template' => true,
			'th' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
			'mtext' => true, 'annotation-xml' => true
		]
	];

	// Cache for inListItemScopeSet(); null until first requested.
	private static $inListItemScopeSet = null;

	/**
	 * Return the boundary set for "list item scope": a copy of $inScopeSet
	 * with the HTML elements 'ol' and 'ul' added.  The set is built on the
	 * first call and cached in a private static for later calls.
	 *
	 * @return array
	 */
	public static function inListItemScopeSet() {
		if ( self::$inListItemScopeSet === null ) {
			// PHP arrays are copied by value, so $inScopeSet is not modified.
			self::$inListItemScopeSet = self::$inScopeSet;
			self::$inListItemScopeSet[self::HTML_NAMESPACE]['ol'] = true;
			self::$inListItemScopeSet[self::HTML_NAMESPACE]['ul'] = true;
		}
		return self::$inListItemScopeSet;
	}

	// Cache for inButtonScopeSet(); null until first requested.
	private static $inButtonScopeSet = null;

	/**
	 * Return the boundary set for "button scope": a copy of $inScopeSet
	 * with the HTML element 'button' added.  The set is built on the
	 * first call and cached in a private static for later calls.
	 *
	 * @return array
	 */
	public static function inButtonScopeSet() {
		if ( self::$inButtonScopeSet === null ) {
			// PHP arrays are copied by value, so $inScopeSet is not modified.
			self::$inButtonScopeSet = self::$inScopeSet;
			self::$inButtonScopeSet[self::HTML_NAMESPACE]['button'] = true;
		}
		return self::$inButtonScopeSet;
	}

	public static $inTableScopeSet = [
		self::HTML_NAMESPACE => [
			'html' => true, 'table' => true, 'template' => true
		]
	];

	// Unlike the other scope sets this one is used inverted: the select
	// scope search in BalanceStack::inSelectScope() stops at the first
	// element which is NOT a member of this set.
	public static $inInvertedSelectScopeSet = [
		self::HTML_NAMESPACE => [
			'option' => true, 'optgroup' => true
		]
	];

	public static $mathmlTextIntegrationPointSet = [
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
			'mtext' => true
		]
	];

	public static $htmlIntegrationPointSet = [
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		]
	];

	// For tidy compatibility.
	public static $tidyPWrapSet = [
		self::HTML_NAMESPACE => [
			'body' => true, 'blockquote' => true,
			// We parse with <body> as the fragment context, but the top-level
			// element on the stack is actually <html>.  We could use the
			// "adjusted current node" everywhere to work around this, but it's
			// easier just to add <html> to the p-wrap set.
			'html' => true,
		],
	];
	public static $tidyInlineSet = [
		self::HTML_NAMESPACE => [
			'a' => true, 'abbr' => true, 'acronym' => true, 'applet' => true,
			'b' => true, 'basefont' => true, 'bdo' => true, 'big' => true,
			'br' => true, 'button' => true, 'cite' => true, 'code' => true,
			'dfn' => true, 'em' => true, 'font' => true, 'i' => true,
			'iframe' => true, 'img' => true, 'input' => true, 'kbd' => true,
			'label' => true, 'legend' => true, 'map' => true, 'object' => true,
			'param' => true, 'q' => true, 'rb' => true, 'rbc' => true,
			'rp' => true, 'rt' => true, 'rtc' => true, 'ruby' => true,
			's' => true, 'samp' => true, 'select' => true, 'small' => true,
			'span' => true, 'strike' => true, 'strong' => true, 'sub' => true,
			'sup' => true, 'textarea' => true, 'tt' => true, 'u' => true,
			'var' => true,
		],
	];
}

/**
 * A BalanceElement is a simplified version of a DOM Node.  The main
 * difference is that we only keep BalanceElements around for nodes
 * currently on the BalanceStack of open elements.  As soon as an
 * element is closed, with some minor exceptions relating to the
 * tree builder "adoption agency algorithm", the element and all its
 * children are serialized to a string using the flatten() method.
 * This keeps our memory usage low.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceElement {
	/**
	 * The namespace of the element.
	 * @var string $namespaceURI
	 */
	public $namespaceURI;
	/**
	 * The lower-cased name of the element.
	 * @var string $localName
	 */
	public $localName;
	/**
	 * Attributes for the element, in array form
	 * @var array $attribs
	 */
	public $attribs;

	/**
	 * Parent of this element: a BalanceElement while attached, null while
	 * detached, or the string "flat" if this element has already been
	 * flattened into its parent.
	 * @var BalanceElement|string|null $parent
	 */
	public $parent;

	/**
	 * An array of children of this element.  Typically only the last
	 * child will be an actual BalanceElement object; the rest will
	 * be strings, representing either text nodes or flattened
	 * BalanceElement objects.
* @var array $children
	 */
	public $children;

	/**
	 * Set to the empty string by the constructor and not otherwise used by
	 * this class.  Declared explicitly so that the constructor's assignment
	 * does not create a dynamic property (deprecated as of PHP 8.2).
	 * @var string $contents
	 */
	public $contents;

	/**
	 * A unique string identifier for Noah's Ark purposes, lazy initialized
	 */
	private $noahKey;

	/**
	 * The next active formatting element in the list, or null if this is the
	 * end of the AFE list or if the element is not in the AFE list.
	 */
	public $nextAFE;

	/**
	 * The previous active formatting element in the list, or null if this is
	 * the start of the list or if the element is not in the AFE list.
	 */
	public $prevAFE;

	/**
	 * The next element in the Noah's Ark species bucket.
	 */
	public $nextNoah;

	/**
	 * Make a new BalanceElement corresponding to the HTML DOM Element
	 * with the given localname, namespace, and attributes.
	 *
	 * @param string $namespaceURI The namespace of the element.
	 * @param string $localName The lowercased name of the tag.
	 * @param array $attribs Attributes of the element
	 */
	public function __construct( $namespaceURI, $localName, array $attribs ) {
		$this->localName = $localName;
		$this->namespaceURI = $namespaceURI;
		$this->attribs = $attribs;
		$this->contents = '';
		$this->parent = null;
		$this->children = [];
	}

	/**
	 * Remove the given child from this element.
	 * @param BalanceElement $elt
	 */
	private function removeChild( BalanceElement $elt ) {
		if ( $this->parent === 'flat' ) {
			// Build the message only on failure: interpolating $this calls
			// __toString(), which serializes this element and all of its
			// children, and that is wasted work whenever the check passes.
			Assert::precondition(
				false, "Can't removeChild after flattening $this"
			);
		}
		Assert::parameter(
			$elt->parent === $this, 'elt', 'must have $this as a parent'
		);
		$idx = array_search( $elt, $this->children, true );
		Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
		$elt->parent = null;
		array_splice( $this->children, $idx, 1 );
	}

	/**
	 * Find $a in the list of children and insert $b before it.
	 * If $b is an element which currently has a parent, it is removed
	 * from that parent first.
	 * @param BalanceElement $a
	 * @param BalanceElement|string $b
	 */
	public function insertBefore( BalanceElement $a, $b ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't insertBefore after flattening."
		);
		$idx = array_search( $a, $this->children, true );
		Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
		if ( is_string( $b ) ) {
			array_splice( $this->children, $idx, 0, [ $b ] );
		} else {
			Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
			if ( $b->parent !== null ) {
				if ( $b->parent === $this ) {
					// $b is already one of our children.  If it sits before
					// $a, removing it shifts $a one slot to the left, so the
					// insertion index has to follow; otherwise $b would be
					// re-inserted *after* $a.
					$oldIdx = array_search( $b, $this->children, true );
					if ( $oldIdx !== false && $oldIdx < $idx ) {
						$idx--;
					}
				}
				$b->parent->removeChild( $b );
			}
			array_splice( $this->children, $idx, 0, [ $b ] );
			$b->parent = $this;
		}
	}

	/**
	 * Append $elt to the end of the list of children.
	 * If $elt is an element which currently has a parent, it is removed
	 * from that parent first.
	 * @param BalanceElement|string $elt
	 */
	public function appendChild( $elt ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't appendChild after flattening."
		);
		if ( is_string( $elt ) ) {
			array_push( $this->children, $elt );
			return;
		}
		// Remove $elt from parent, if it had one.
		if ( $elt->parent !== null ) {
			$elt->parent->removeChild( $elt );
		}
		array_push( $this->children, $elt );
		$elt->parent = $this;
	}

	/**
	 * Transfer all of the children of $elt to $this.
	 * @param BalanceElement $elt
	 */
	public function adoptChildren( BalanceElement $elt ) {
		Assert::precondition(
			$elt->parent !== 'flat', "Can't adoptChildren after flattening."
		);
		foreach ( $elt->children as $child ) {
			if ( !is_string( $child ) ) {
				// This is an optimization which avoids an O(n^2) set of
				// array_splice operations.
				$child->parent = null;
			}
			$this->appendChild( $child );
		}
		$elt->children = [];
	}

	/**
	 * Flatten this node and all of its children into a string, as specified
	 * by the HTML serialization specification, and replace this node
	 * in its parent by that string.
	 *
	 * In tidy-compatibility mode a 'mw:p-wrap' element is renamed to 'p',
	 * and any other element whose content is blank is still emitted; blank
	 * attribute-less 'tr' and 'li' elements additionally get the
	 * 'mw-empty-elt' class.
	 *
	 * @param array $config Balancer configuration; see Balancer::__construct().
	 * @return string The flattened serialization of this element.
	 *
	 * @see __toString()
	 */
	public function flatten( array $config ) {
		Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
		Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
		$idx = array_search( $this, $this->parent->children, true );
		Assert::parameter(
			$idx !== false, '$this', 'must be a child of its parent'
		);
		$tidyCompat = $config['tidyCompat'];
		if ( $tidyCompat ) {
			$blank = true;
			foreach ( $this->children as $elt ) {
				if ( !is_string( $elt ) ) {
					$elt = $elt->flatten( $config );
				}
				if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
					$blank = false;
				}
			}
			if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
				$this->localName = 'p';
			} elseif ( $blank ) {
				// Add 'mw-empty-elt' class so elements can be hidden via CSS
				// for compatibility with legacy tidy.
				if ( !count( $this->attribs ) &&
					( $this->localName === 'tr' || $this->localName === 'li' )
				) {
					$this->attribs = [ 'class' => "mw-empty-elt" ];
				}
				$blank = false;
			}
			// Only a blank p-wrapper reaches here with $blank still true;
			// it is dropped from the output entirely.
			$flat = $blank ? '' : "{$this}";
		} else {
			$flat = "{$this}";
		}
		$this->parent->children[$idx] = $flat;
		$this->parent = 'flat'; // for assertion checking
		return $flat;
	}

	/**
	 * Serialize this node and all of its children to a string, as specified
	 * by the HTML serialization specification.
	 *
	 * @return string The serialization of the BalanceElement
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
	 */
	public function __toString() {
		$encAttribs = '';
		foreach ( $this->attribs as $name => $value ) {
			$encValue = Sanitizer::encodeAttribute( $value );
			$encAttribs .= " $name=\"$encValue\"";
		}
		if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
			$out = "<{$this->localName}{$encAttribs}>";
			// Offset of the first byte following the start tag.
			$len = strlen( $out );
			// flatten children
			foreach ( $this->children as $elt ) {
				$out .= "{$elt}";
			}
			$out .= "</{$this->localName}>";
			if (
				$this->isA( BalanceSets::$extraLinefeedSet ) &&
				$out[$len] === "\n"
			) {
				// Double the linefeed after pre/listing/textarea
				// according to the HTML5 fragment serialization algorithm.
				$out = substr( $out, 0, $len + 1 ) .
					substr( $out, $len );
			}
		} else {
			$out = "<{$this->localName}{$encAttribs} />";
			Assert::invariant(
				count( $this->children ) === 0,
				"Empty elements shouldn't have children."
			);
		}
		return $out;
	}

	// Utility functions on BalanceElements.

	/**
	 * Determine if $this represents a specific HTML tag, is a member of
	 * a tag set, or is equal to another BalanceElement.
	 *
	 * @param BalanceElement|array|string $set The target BalanceElement,
	 *   set (from the BalanceSets class), or string (HTML tag name).
	 * @return bool
	 */
	public function isA( $set ) {
		if ( $set instanceof BalanceElement ) {
			return $this === $set;
		} elseif ( is_array( $set ) ) {
			return isset( $set[$this->namespaceURI] ) &&
				isset( $set[$this->namespaceURI][$this->localName] );
		} else {
			// assume this is an HTML element name.
			return $this->isHtml() && $this->localName === $set;
		}
	}

	/**
	 * Determine if this element is an HTML element with the specified name
	 * @param string $tagName
	 * @return bool
	 */
	public function isHtmlNamed( $tagName ) {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE
			&& $this->localName === $tagName;
	}

	/**
	 * Determine if $this represents an element in the HTML namespace.
	 *
	 * @return bool
	 */
	public function isHtml() {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE;
	}

	/**
	 * Determine if $this represents a MathML text integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
	 */
	public function isMathmlTextIntegrationPoint() {
		return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet );
	}

	/**
	 * Determine if $this represents an HTML integration point,
	 * as defined in the HTML5 specification.
*
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
	 */
	public function isHtmlIntegrationPoint() {
		if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
			return true;
		}
		// Beyond the fixed set, only a MathML <annotation-xml> element
		// carrying an "encoding" attribute can qualify.
		$isAnnotationXml =
			$this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$this->localName === 'annotation-xml';
		if ( !$isAnnotationXml || !isset( $this->attribs['encoding'] ) ) {
			return false;
		}
		// The encoding is compared case-insensitively.
		$encoding = $this->attribs['encoding'];
		return strcasecmp( $encoding, 'text/html' ) == 0 ||
			strcasecmp( $encoding, 'application/xhtml+xml' ) == 0;
	}

	/**
	 * Return the string key identifying this element for the Noah's Ark
	 * algorithm.  The key is the serialization of the namespace, the local
	 * name and the attributes sorted by attribute name; it is computed on
	 * first use and cached.
	 *
	 * @return string
	 */
	public function getNoahKey() {
		if ( $this->noahKey !== null ) {
			return $this->noahKey;
		}
		// Sort a copy so that the element's own attribute order is kept.
		$sortedAttribs = $this->attribs;
		ksort( $sortedAttribs );
		$this->noahKey = serialize(
			[ $this->namespaceURI, $this->localName, $sortedAttribs ]
		);
		return $this->noahKey;
	}
}

/**
 * The "stack of open elements" as defined in the HTML5 tree builder
 * spec.  It provides the methods which insert content (start tags, text)
 * at the correct place in the output, and it flattens BalanceElements
 * as they are closed so that a complete DOM tree for the document never
 * has to be held in memory.
 *
 * Iterating over the stack with foreach visits it in "reverse order":
 * the most recently added element comes first.
 *
 * @ingroup Parser
 * @since 1.27
 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
 */
class BalanceStack implements IteratorAggregate {
	/**
	 * The open elements; index 0 holds the root element.
	 * @var array $elements
	 */
	private $elements = [];
	/**
	 * Whether foster parenting is enabled, which changes where nodes
	 * are inserted.
	 * @var bool $fosterParentMode
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
	 */
	public $fosterParentMode = false;
	/**
	 * Balancer configuration, passed along whenever an element is flattened.
	 * @var array $config
	 * @see Balancer::__construct()
	 */
	private $config;
	/**
	 * The element currently at the top of the stack.
	 */
	public $currentNode;

	/**
	 * Set up a stack holding only the root <html> BalanceElement.
	 * @param array $config Balancer configuration; see Balancer::__construct().
	 */
	public function __construct( array $config ) {
		// The root <html> element always sits at the bottom of the stack.
		$root = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] );
		$this->elements[] = $root;
		$this->currentNode = $root;
		$this->config = $config;
	}

	/**
	 * Produce the tree builder's output: the serialization of every child
	 * of the root <html> node, without the enclosing <html> tags.  Children
	 * which are still elements are flattened in the process.
	 * @return string
	 */
	public function getOutput() {
		$pieces = [];
		foreach ( $this->elements[0]->children as $child ) {
			$pieces[] = is_string( $child ) ?
				$child :
				$child->flatten( $this->config );
		}
		return implode( '', $pieces );
	}

	/**
	 * Insert a comment at the appropriate place for inserting a node.
	 * @param string $value Content of the comment.
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment
	 */
	public function insertComment( $value ) {
		// A comment is handled like text, but is flagged so that it is
		// exempt from tidy p-wrapping.
		$comment = '<!--' . $value . '-->';
		return $this->insertText( $comment, true );
	}

	/**
	 * Insert text at the appropriate place for inserting a node.
* @param string $value The text to insert.
	 * @param bool $isComment True if $value is a serialized comment;
	 *   comments never trigger tidy p-wrapping.
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
	 */
	public function insertText( $value, $isComment = false ) {
		if (
			$this->fosterParentMode &&
			$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
		) {
			$this->fosterParent( $value );
		} elseif (
			$this->config['tidyCompat'] && !$isComment &&
			$this->currentNode->isA( BalanceSets::$tidyPWrapSet )
		) {
			// Open a p-wrapper, then retry: the wrapper is now the current
			// node, so the text is appended to it.
			$this->insertHTMLElement( 'mw:p-wrap', [] );
			return $this->insertText( $value );
		} else {
			$this->currentNode->appendChild( $value );
		}
	}

	/**
	 * Insert a BalanceElement at the appropriate place, pushing it
	 * on to the open elements stack.
	 * @param string $namespaceURI The element namespace
	 * @param string $tag The tag name
	 * @param array $attribs Normalized attributes, in array form (they are
	 *   passed straight to the BalanceElement constructor).
	 * @return BalanceElement
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
	 */
	public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
		return $this->insertElement(
			new BalanceElement( $namespaceURI, $tag, $attribs )
		);
	}

	/**
	 * Insert an HTML element at the appropriate place, pushing it on to
	 * the open elements stack.
	 * @param string $tag The tag name
	 * @param array $attribs Normalized attributes, in array form.
	 * @return BalanceElement
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
	 */
	public function insertHTMLElement( $tag, $attribs ) {
		return $this->insertForeignElement(
			BalanceSets::HTML_NAMESPACE, $tag, $attribs
		);
	}

	/**
	 * Insert an element at the appropriate place and push it on to the
	 * open elements stack.
	 * @param BalanceElement $elt
	 * @return BalanceElement The element now on top of the stack; when
	 *   foster parenting re-uses an existing p-wrapper this is that
	 *   wrapper rather than $elt.
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
	 */
	public function insertElement( BalanceElement $elt ) {
		if (
			$this->currentNode->isHtmlNamed( 'mw:p-wrap' ) &&
			!$elt->isA( BalanceSets::$tidyInlineSet )
		) {
			// Tidy compatibility.
			$this->pop();
		}
		if (
			$this->fosterParentMode &&
			$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
		) {
			$elt = $this->fosterParent( $elt );
		} else {
			$this->currentNode->appendChild( $elt );
		}
		// The failure messages interpolate $elt, which calls __toString()
		// and serializes the element.  Build them only when a check has
		// actually failed instead of on every insertion.
		if ( $elt->parent === null ) {
			Assert::invariant( false, "$elt must be in tree" );
		}
		if ( $elt->parent === 'flat' ) {
			Assert::invariant(
				false, "$elt must not have been previous flattened"
			);
		}
		array_push( $this->elements, $elt );
		$this->currentNode = $elt;
		return $elt;
	}

	/**
	 * Determine if the stack has $tag in scope.
	 * @param BalanceElement|array|string $tag
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
	 */
	public function inScope( $tag ) {
		return $this->inSpecificScope( $tag, BalanceSets::$inScopeSet );
	}

	/**
	 * Determine if the stack has $tag in button scope.
	 * @param BalanceElement|array|string $tag
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
	 */
	public function inButtonScope( $tag ) {
		return $this->inSpecificScope( $tag, BalanceSets::inButtonScopeSet() );
	}

	/**
	 * Determine if the stack has $tag in list item scope.
	 * @param BalanceElement|array|string $tag
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
	 */
	public function inListItemScope( $tag ) {
		return $this->inSpecificScope( $tag, BalanceSets::inListItemScopeSet() );
	}

	/**
	 * Determine if the stack has $tag in table scope.
	 * @param BalanceElement|array|string $tag
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
	 */
	public function inTableScope( $tag ) {
		return $this->inSpecificScope( $tag, BalanceSets::$inTableScopeSet );
	}

	/**
	 * Determine if the stack has $tag in select scope.
	 * @param BalanceElement|array|string $tag
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope
	 */
	public function inSelectScope( $tag ) {
		// Can't use inSpecificScope to implement this, since it involves
		// *inverting* a set of tags.  Implement manually.
		foreach ( $this as $elt ) {
			if ( $elt->isA( $tag ) ) {
				return true;
			}
			if ( !$elt->isA( BalanceSets::$inInvertedSelectScopeSet ) ) {
				return false;
			}
		}
		return false;
	}

	/**
	 * Determine if the stack has $tag in a specific scope, $set.
	 * The stack is walked from the current node towards the root; the walk
	 * succeeds on the first element matching $tag and fails on the first
	 * element matching $set.
	 * @param BalanceElement|array|string $tag
	 * @param BalanceElement|array|string $set
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
	 */
	public function inSpecificScope( $tag, $set ) {
		foreach ( $this as $elt ) {
			if ( $elt->isA( $tag ) ) {
				return true;
			}
			if ( $elt->isA( $set ) ) {
				return false;
			}
		}
		return false;
	}

	/**
	 * Generate implied end tags.
	 * @param string $butnot An HTML tag name at which to stop popping,
	 *   or null for no such exception.
	 * @param bool $thorough True if we should generate end tags thoroughly.
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
	 */
	public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
		$endTagSet = $thorough ?
			BalanceSets::$thoroughImpliedEndTagsSet :
			BalanceSets::$impliedEndTagsSet;
		while ( $this->currentNode ) {
			if ( $butnot !== null && $this->currentNode->isHtmlNamed( $butnot ) ) {
				break;
			}
			if ( !$this->currentNode->isA( $endTagSet ) ) {
				break;
			}
			$this->pop();
		}
	}

	/**
	 * Return the adjusted current node: $fragmentContext if one is given
	 * and only the root element is on the stack, otherwise the current node.
	 * @param BalanceElement|null $fragmentContext
	 * @return BalanceElement
	 */
	public function adjustedCurrentNode( $fragmentContext ) {
		return ( $fragmentContext && count( $this->elements ) === 1 ) ?
			$fragmentContext : $this->currentNode;
	}

	/**
	 * Return an iterator over this stack which visits the current node
	 * first, and the root node last.
	 * @return Iterator
	 */
	public function getIterator() {
		return new ReverseArrayIterator( $this->elements );
	}

	/**
	 * Return the BalanceElement at the given position $idx, where
	 * position 0 represents the root element.
	 * @param int $idx
	 * @return BalanceElement
	 */
	public function node( $idx ) {
		return $this->elements[ $idx ];
	}

	/**
	 * Replace the element at position $idx in the BalanceStack with $elt.
	 * @param int $idx
	 * @param BalanceElement $elt
	 */
	public function replaceAt( $idx, BalanceElement $elt ) {
		Assert::precondition(
			$this->elements[$idx]->parent !== 'flat',
			'Replaced element should not have already been flattened.'
		);
		Assert::precondition(
			$elt->parent !== 'flat',
			'New element should not have already been flattened.'
		);
		$this->elements[$idx] = $elt;
		// Keep the current node in step when the top of the stack changes.
		if ( $idx === count( $this->elements ) - 1 ) {
			$this->currentNode = $elt;
		}
	}

	/**
	 * Return the position of the given BalanceElement, set, or
	 * HTML tag name string in the BalanceStack.  The search starts at the
	 * current node, so the match nearest the top of the stack wins.
	 * @param BalanceElement|array|string $tag
	 * @return int The position, or -1 if there is no match.
	 */
	public function indexOf( $tag ) {
		for ( $i = count( $this->elements ) - 1; $i >= 0; $i-- ) {
			if ( $this->elements[$i]->isA( $tag ) ) {
				return $i;
			}
		}
		return -1;
	}

	/**
	 * Return the number of elements currently in the BalanceStack.
	 * @return int
	 */
	public function length() {
		return count( $this->elements );
	}

	/**
	 * Remove the current node from the BalanceStack, flattening it
	 * in the process.  A 'mw:p-wrap' element is the exception: it is
	 * removed from the stack but left unflattened in its parent.
	 */
	public function pop() {
		$elt = array_pop( $this->elements );
		if ( count( $this->elements ) ) {
			$this->currentNode = $this->elements[ count( $this->elements ) - 1 ];
		} else {
			$this->currentNode = null;
		}
		if ( !$elt->isHtmlNamed( 'mw:p-wrap' ) ) {
			$elt->flatten( $this->config );
		}
	}

	/**
	 * Remove all nodes up to and including position $idx from the
	 * BalanceStack, flattening them in the process.
* @param int $idx */ public function popTo( $idx ) { for ( $length = count( $this->elements ); $length > $idx; $length-- ) { $this->pop(); } } /** * Pop elements off the stack up to and including the first * element with the specified HTML tagname (or matching the given * set). * @param BalanceElement|array|string $tag */ public function popTag( $tag ) { while ( $this->currentNode ) { if ( $this->currentNode->isA( $tag ) ) { $this->pop(); break; } $this->pop(); } } /** * Pop elements off the stack *not including* the first element * in the specified set. * @param BalanceElement|array|string $set */ public function clearToContext( $set ) { // Note that we don't loop to 0. Never pop the <html> elt off. for ( $length = count( $this->elements ); $length > 1; $length-- ) { if ( $this->currentNode->isA( $set ) ) { break; } $this->pop(); } } /** * Remove the given $elt from the BalanceStack, optionally * flattening it in the process. * @param BalanceElement $elt The element to remove. * @param bool $flatten Whether to flatten the removed element. */ public function removeElement( BalanceElement $elt, $flatten = true ) { Assert::parameter( $elt->parent !== 'flat', '$elt', '$elt should not already have been flattened.' ); Assert::parameter( $elt->parent->parent !== 'flat', '$elt', 'The parent of $elt should not already have been flattened.' ); $idx = array_search( $elt, $this->elements, true ); Assert::parameter( $idx !== false, '$elt', 'must be in stack' ); array_splice( $this->elements, $idx, 1 ); if ( $idx === count( $this->elements ) ) { $this->currentNode = $this->elements[$idx - 1]; } if ( $flatten ) { // serialize $elt into its parent // otherwise, it will eventually serialize when the parent // is serialized, we just hold onto the memory for its // tree of objects a little longer. 
$elt->flatten( $this->config ); } Assert::postcondition( array_search( $elt, $this->elements, true ) === false, '$elt should no longer be in open elements stack' ); } /** * Find $a in the BalanceStack and insert $b after it. * @param BalanceElement $a * @param BalanceElement $b */ public function insertAfter( BalanceElement $a, BalanceElement $b ) { $idx = $this->indexOf( $a ); Assert::parameter( $idx !== false, '$a', 'must be in stack' ); if ( $idx === count( $this->elements ) - 1 ) { array_push( $this->elements, $b ); $this->currentNode = $b; } else { array_splice( $this->elements, $idx + 1, 0, [ $b ] ); } } // Fostering and adoption. /** * Foster parent the given $elt in the stack of open elements. * @param BalanceElement|string $elt * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent */ private function fosterParent( $elt ) { $lastTable = $this->indexOf( 'table' ); $lastTemplate = $this->indexOf( 'template' ); $parent = null; $before = null; if ( $lastTemplate >= 0 && ( $lastTable < 0 || $lastTemplate > $lastTable ) ) { $parent = $this->elements[$lastTemplate]; } elseif ( $lastTable >= 0 ) { $parent = $this->elements[$lastTable]->parent; // Assume all tables have parents, since we're not running scripts! Assert::invariant( $parent !== null, "All tables should have parents" ); $before = $this->elements[$lastTable]; } else { $parent = $this->elements[0]; // the `html` element. } if ( $this->config['tidyCompat'] ) { if ( is_string( $elt ) ) { // We're fostering text: do we need a p-wrapper? if ( $parent->isA( BalanceSets::$tidyPWrapSet ) ) { $this->insertHTMLElement( 'mw:p-wrap', [] ); $this->insertText( $elt ); return $elt; } } else { // We're fostering an element; do we need to merge p-wrappers? if ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) { $idx = $before ? array_search( $before, $parent->children, true ) : count( $parent->children ); $after = $idx > 0 ? 
$parent->children[$idx - 1] : '';
					if (
						$after instanceof BalanceElement &&
						$after->isHtmlNamed( 'mw:p-wrap' )
					) {
						return $after; // Re-use existing p-wrapper.
					}
				}
			}
		}

		if ( $before ) {
			$parent->insertBefore( $before, $elt );
		} else {
			$parent->appendChild( $elt );
		}
		return $elt;
	}

	/**
	 * Run the "adoption agency algorithm" (AAA) for the given subject
	 * tag name.
	 * @param string $tag The subject tag name.
	 * @param BalanceActiveFormattingElements $afe The current
	 *   active formatting elements list.
	 * @return bool True if the adoption agency algorithm "did something", false
	 *   if more processing is required by the caller.
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
	 */
	public function adoptionAgency( $tag, $afe ) {
		// If the current node is an HTML element whose tag name is subject,
		// and the current node is not in the list of active formatting
		// elements, then pop the current node off the stack of open
		// elements and abort these steps.
		if (
			$this->currentNode->isHtmlNamed( $tag ) &&
			!$afe->isInList( $this->currentNode )
		) {
			$this->pop();
			return true; // no more handling required
		}

		// Outer loop: If outer loop counter is greater than or
		// equal to eight, then abort these steps.
		for ( $outer = 0; $outer < 8; $outer++ ) {
			// Let the formatting element be the last element in the list
			// of active formatting elements that: is between the end of
			// the list and the last scope marker in the list, if any, or
			// the start of the list otherwise, and has the same tag name
			// as the token.
			$fmtElt = $afe->findElementByTag( $tag );

			// If there is no such node, then abort these steps and instead
			// act as described in the "any other end tag" entry below.
			if ( !$fmtElt ) {
				return false; // false means handle by the default case
			}

			// Otherwise, if there is such a node, but that node is not in
			// the stack of open elements, then this is a parse error;
			// remove the element from the list, and abort these steps.
			$index = $this->indexOf( $fmtElt );
			if ( $index < 0 ) {
				$afe->remove( $fmtElt );
				return true;   // true means no more handling required
			}

			// Otherwise, if there is such a node, and that node is also in
			// the stack of open elements, but the element is not in scope,
			// then this is a parse error; ignore the token, and abort
			// these steps.
			if ( !$this->inScope( $fmtElt ) ) {
				return true;
			}

			// Let the furthest block be the topmost node in the stack of
			// open elements that is lower in the stack than the formatting
			// element, and is an element in the special category. There
			// might not be one.
			$furthestBlock = null;
			$furthestBlockIndex = -1;
			$stackLength = $this->length();
			for ( $i = $index+1; $i < $stackLength; $i++ ) {
				if ( $this->node( $i )->isA( BalanceSets::$specialSet ) ) {
					$furthestBlock = $this->node( $i );
					$furthestBlockIndex = $i;
					break;
				}
			}

			// If there is no furthest block, then the UA must skip the
			// subsequent steps and instead just pop all the nodes from the
			// bottom of the stack of open elements, from the current node
			// up to and including the formatting element, and remove the
			// formatting element from the list of active formatting
			// elements.
			if ( !$furthestBlock ) {
				$this->popTag( $fmtElt );
				$afe->remove( $fmtElt );
				return true;
			}

			// Let the common ancestor be the element immediately above
			// the formatting element in the stack of open elements.
			$ancestor = $this->node( $index-1 );

			// Let a bookmark note the position of the formatting
			// element in the list of active formatting elements
			// relative to the elements on either side of it in the
			// list.
			$BOOKMARK = new BalanceElement( '[bookmark]', '[bookmark]', [] );
			$afe->insertAfter( $fmtElt, $BOOKMARK );

			// Let node and last node be the furthest block.
			$node = $furthestBlock;
			$lastNode = $furthestBlock;
			$nodeIndex = $furthestBlockIndex;
			$isAFE = false;

			// Inner loop
			for ( $inner = 1; true; $inner++ ) {
				// Let node be the element immediately above node in
				// the stack of open elements, or if node is no longer
				// in the stack of open elements (e.g. because it got
				// removed by this algorithm), the element that was
				// immediately above node in the stack of open elements
				// before node was removed.
				$node = $this->node( --$nodeIndex );

				// If node is the formatting element, then go
				// to the next step in the overall algorithm.
				if ( $node === $fmtElt ) break;

				// If the inner loop counter is greater than three and node
				// is in the list of active formatting elements, then remove
				// node from the list of active formatting elements.
				$isAFE = $afe->isInList( $node );
				if ( $inner > 3 && $isAFE ) {
					$afe->remove( $node );
					$isAFE = false;
				}

				// If node is not in the list of active formatting
				// elements, then remove node from the stack of open
				// elements and then go back to the step labeled inner
				// loop.
				if ( !$isAFE ) {
					// Don't flatten here, since we're about to relocate
					// parts of this $node.
					$this->removeElement( $node, false );
					continue;
				}

				// Create an element for the token for which the
				// element node was created with common ancestor as
				// the intended parent, replace the entry for node
				// in the list of active formatting elements with an
				// entry for the new element, replace the entry for
				// node in the stack of open elements with an entry for
				// the new element, and let node be the new element.
				$newElt = new BalanceElement(
					$node->namespaceURI, $node->localName, $node->attribs );
				$afe->replace( $node, $newElt );
				$this->replaceAt( $nodeIndex, $newElt );
				$node = $newElt;

				// If last node is the furthest block, then move the
				// aforementioned bookmark to be immediately after the
				// new node in the list of active formatting elements.
				if ( $lastNode === $furthestBlock ) {
					$afe->remove( $BOOKMARK );
					$afe->insertAfter( $newElt, $BOOKMARK );
				}

				// Insert last node into node, first removing it from
				// its previous parent node if any.
				$node->appendChild( $lastNode );

				// Let last node be node.
				$lastNode = $node;
			}

			// If the common ancestor node is a table, tbody, tfoot,
			// thead, or tr element, then, foster parent whatever last
			// node ended up being in the previous step, first removing
			// it from its previous parent node if any.
			if (
				$this->fosterParentMode &&
				$ancestor->isA( BalanceSets::$tableSectionRowSet )
			) {
				$this->fosterParent( $lastNode );
			} else {
				// Otherwise, append whatever last node ended up being in
				// the previous step to the common ancestor node, first
				// removing it from its previous parent node if any.
				$ancestor->appendChild( $lastNode );
			}

			// Create an element for the token for which the
			// formatting element was created, with furthest block
			// as the intended parent.
			$newElt2 = new BalanceElement(
				$fmtElt->namespaceURI, $fmtElt->localName, $fmtElt->attribs );

			// Take all of the child nodes of the furthest block and
			// append them to the element created in the last step.
			$newElt2->adoptChildren( $furthestBlock );

			// Append that new element to the furthest block.
			$furthestBlock->appendChild( $newElt2 );

			// Remove the formatting element from the list of active
			// formatting elements, and insert the new element into the
			// list of active formatting elements at the position of
			// the aforementioned bookmark.
			$afe->remove( $fmtElt );
			$afe->replace( $BOOKMARK, $newElt2 );

			// Remove the formatting element from the stack of open
			// elements, and insert the new element into the stack of
			// open elements immediately below the position of the
			// furthest block in that stack.
			$this->removeElement( $fmtElt );
			$this->insertAfter( $furthestBlock, $newElt2 );
		}

		return true;
	}

	/**
	 * Return the contents of the open elements stack as a string for
	 * debugging.
* @return string Space-separated local names of the open elements,
	 *   in the order they are stored in the stack.
	 */
	public function __toString() {
		$names = [];
		foreach ( $this->elements as $elt ) {
			$names[] = $elt->localName;
		}
		// Separator first: the legacy implode( $array, $glue ) order was
		// deprecated in PHP 7.4 and is a TypeError from PHP 8.0 on.
		return implode( ' ', $names );
	}
}

/**
 * A pseudo-element used as a marker in the list of active formatting elements
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceMarker {
	public $nextAFE;
	public $prevAFE;
}

/**
 * The list of active formatting elements, which is used to handle
 * mis-nested formatting element tags in the HTML5 tree builder
 * specification.
 *
 * @ingroup Parser
 * @since 1.27
 * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
 */
class BalanceActiveFormattingElements {
	/** The last (most recent) element in the list */
	private $tail;

	/** The first (least recent) element in the list */
	private $head;

	/**
	 * An array of arrays representing the population of elements in each bucket
	 * according to the Noah's Ark clause. The outer array is stack-like, with each
	 * integer-indexed element representing a segment of the list, bounded by
	 * markers. The first element represents the segment of the list before the
	 * first marker.
	 *
	 * The inner arrays are indexed by "Noah key", which is a string which uniquely
	 * identifies each bucket according to the rules in the spec. The value in
	 * the inner array is the first (least recently inserted) element in the bucket,
	 * and subsequent members of the bucket can be found by iterating through the
	 * singly-linked list via $node->nextNoah.
	 *
	 * This is optimised for the most common case of inserting into a bucket
	 * with zero members, and deleting a bucket containing one member. In the
	 * worst case, iteration through the list is still O(1) in the document
	 * size, since each bucket can have at most 3 members.
*/
	private $noahTableStack = [ [] ];

	/**
	 * Unlink every node in the list (AFE and Noah pointers) and drop the
	 * head, tail and Noah table references held by this object.
	 */
	public function __destruct() {
		for ( $node = $this->head; $node; $node = $next ) {
			// Remember the successor before the links are cleared.
			$next = $node->nextAFE;
			$node->prevAFE = $node->nextAFE = $node->nextNoah = null;
		}
		$this->head = $this->tail = $this->noahTableStack = null;
	}

	/**
	 * Append a new BalanceMarker to the end of the list and start a
	 * fresh (empty) Noah table for the segment that follows it.
	 */
	public function insertMarker() {
		$elt = new BalanceMarker;
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
		$this->noahTableStack[] = [];
	}

	/**
	 * Follow the steps required when the spec requires us to "push onto the
	 * list of active formatting elements".
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException If $elt is already in the list.
	 */
	public function push( BalanceElement $elt ) {
		// Must not be in the list already
		if ( $elt->prevAFE !== null || $this->head === $elt ) {
			throw new ParameterAssertionException( '$elt',
				'Cannot insert a node into the AFE list twice' );
		}

		// "Noah's Ark clause" -- if there are already three copies of
		// this element before we encounter a marker, then drop the last
		// one.
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			// Walk the bucket to find its last member and its size.
			$count = 1;
			$head = $tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
				$count++;
			}
			if ( $count >= 3 ) {
				// Bucket is full: evict its least recently inserted member.
				$this->remove( $head );
			}
			$tail->nextNoah = $elt;
		}
		// Add to the main AFE list
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
	}

	/**
	 * Follow the steps required when the spec asks us to "clear the list of
	 * active formatting elements up to the last marker".
*/
	public function clearToMarker() {
		// Iterate back through the list starting from the tail
		$tail = $this->tail;
		while ( $tail && !( $tail instanceof BalanceMarker ) ) {
			// Unlink the element
			$prev = $tail->prevAFE;
			$tail->prevAFE = null;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail->nextNoah = null;
			$tail = $prev;
		}
		// If we finished on a marker, unlink it and pop it off the Noah table stack
		if ( $tail ) {
			$prev = $tail->prevAFE;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail = $prev;
			array_pop( $this->noahTableStack );
		} else {
			// No marker: wipe the top-level Noah table (which is the only one)
			$this->noahTableStack[0] = [];
		}
		// If we removed all the elements, clear the head pointer
		if ( !$tail ) {
			$this->head = null;
		}
		$this->tail = $tail;
	}

	/**
	 * Find and return the last element with the specified tag between the
	 * end of the list and the last marker on the list.
	 * Used when parsing <a> "in body mode".
	 * @param string $tag Local name to look for.
	 * @return BalanceElement|null The matching element, or null if none is
	 *   found before a marker or the start of the list.
	 */
	public function findElementByTag( $tag ) {
		$elt = $this->tail;
		while ( $elt && !( $elt instanceof BalanceMarker ) ) {
			if ( $elt->localName === $tag ) {
				return $elt;
			}
			$elt = $elt->prevAFE;
		}
		return null;
	}

	/**
	 * Determine whether an element is in the list of formatting elements.
	 * @param BalanceElement $elt
	 * @return bool
	 */
	public function isInList( BalanceElement $elt ) {
		// Only the head has no prevAFE link while being in the list.
		return $this->head === $elt || $elt->prevAFE;
	}

	/**
	 * Find the element $elt in the list and remove it.
	 * Used when parsing <a> in body mode.
*/
	public function remove( BalanceElement $elt ) {
		if ( $this->head !== $elt && !$elt->prevAFE ) {
			throw new ParameterAssertionException( '$elt',
				"Attempted to remove an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $elt ) {
			$this->head = $elt->nextAFE;
		}
		if ( $this->tail === $elt ) {
			$this->tail = $elt->prevAFE;
		}
		// Update previous element
		if ( $elt->prevAFE ) {
			$elt->prevAFE->nextAFE = $elt->nextAFE;
		}
		// Update next element
		if ( $elt->nextAFE ) {
			$elt->nextAFE->prevAFE = $elt->prevAFE;
		}
		// Clear pointers so that isInList() etc. will work
		$elt->prevAFE = $elt->nextAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $elt );
	}

	/**
	 * Append $elt to the bucket for its Noah key in the topmost Noah
	 * table, creating the bucket if it does not exist yet.
	 * @param BalanceElement $elt
	 */
	private function addToNoahList( BalanceElement $elt ) {
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
			}
			$tail->nextNoah = $elt;
		}
	}

	/**
	 * Unlink $elt from the bucket for its Noah key in the topmost Noah
	 * table.
	 * NOTE(review): only the topmost table is consulted, so this presumably
	 * relies on $elt belonging to the segment after the last marker --
	 * confirm against callers.
	 * @param BalanceElement $elt
	 */
	private function removeFromNoahList( BalanceElement $elt ) {
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		$key = $elt->getNoahKey();
		$noahElt = $table[$key];
		if ( $noahElt === $elt ) {
			// $elt is the first member of its bucket.
			if ( $noahElt->nextNoah ) {
				$table[$key] = $noahElt->nextNoah;
				$noahElt->nextNoah = null;
			} else {
				unset( $table[$key] );
			}
		} else {
			do {
				$prevNoahElt = $noahElt;
				$noahElt = $prevNoahElt->nextNoah;
				if ( $noahElt === $elt ) {
					// Found it, unlink
					$prevNoahElt->nextNoah = $elt->nextNoah;
					$elt->nextNoah = null;
					break;
				}
			} while ( $noahElt );
		}
	}

	/**
	 * Find element $a in the list and replace it with element $b
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException If $a is not in the list.
	 */
	public function replace( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to replace an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $a ) {
			$this->head = $b;
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		// Update previous element
		if ( $a->prevAFE ) {
			$a->prevAFE->nextAFE = $b;
		}
		// Update next element
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->prevAFE = $a->prevAFE;
		$b->nextAFE = $a->nextAFE;
		$a->nextAFE = $a->prevAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $a );
		$this->addToNoahList( $b );
	}

	/**
	 * Find $a in the list and insert $b after it.
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException If $a is not in the list.
	 */
	public function insertAfter( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to insert after an element which is not in the AFE list" );
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->nextAFE = $a->nextAFE;
		$b->prevAFE = $a;
		$a->nextAFE = $b;
		$this->addToNoahList( $b );
	}

	// @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
	/**
	 * Reconstruct the active formatting elements.
	 * @param BalanceStack $stack The open elements stack
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
	 */
	// @codingStandardsIgnoreEnd
	public function reconstruct( $stack ) {
		$entry = $this->tail;
		// If there are no entries in the list of active formatting elements,
		// then there is nothing to reconstruct
		if ( !$entry ) {
			return;
		}
		// If the last is a marker, do nothing.
		if ( $entry instanceof BalanceMarker ) {
			return;
		}
		// Or if it is an open element, do nothing.
		if ( $stack->indexOf( $entry ) >= 0 ) {
			return;
		}

		// Loop backward through the list until we find a marker or an
		// open element
		$foundIt = false;
		while ( $entry->prevAFE ) {
			$entry = $entry->prevAFE;
			if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
				$foundIt = true;
				break;
			}
		}

		// Now loop forward, starting from the element after the current one (or
		// the first element if we didn't find a marker or open element),
		// recreating formatting elements and pushing them back onto the list
		// of open elements.
		if ( $foundIt ) {
			$entry = $entry->nextAFE;
		}
		do {
			$newElement = $stack->insertHTMLElement(
				$entry->localName,
				$entry->attribs );
			$this->replace( $entry, $newElement );
			$entry = $newElement->nextAFE;
		} while ( $entry );
	}

	/**
	 * Get a string representation of the AFE list, for debugging
	 * @return string One line per entry; inconsistent prev/tail links
	 *   are flagged inline.
	 */
	public function __toString() {
		$prev = null;
		$s = '';
		for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) {
			if ( $node instanceof BalanceMarker ) {
				$s .= "MARKER\n";
				continue;
			}
			$s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
			if ( $node->nextNoah ) {
				$s .= " (noah sibling: {$node->nextNoah->localName}#" .
					substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
					')';
			}
			if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
				$s .= " (reverse link is wrong!)";
			}
			$s .= "\n";
		}
		if ( $prev !== $this->tail ) {
			$s .= "(tail pointer is wrong!)\n";
		}
		return $s;
	}
}

/**
 * An implementation of the tree building portion of the HTML5 parsing
 * spec.
 *
 * This is used to balance and tidy output so that the result can
 * always be cleanly serialized/deserialized by an HTML5 parser.  It
 * does *not* guarantee "conforming" output -- the HTML5 spec contains
 * a number of constraints which are not enforced by the HTML5 parsing
 * process.  But the result will be free of gross errors: misnested or
 * unclosed tags, for example, and will be unchanged by spec-compliant
 * parsing followed by serialization.
 *
 * The tree building stage is structured as a state machine.
 * When comparing the implementation to
 * https://www.w3.org/TR/html5/syntax.html#tree-construction
 * note that each state is implemented as a function with a
 * name ending in `Mode` (because the HTML spec refers to them
 * as insertion modes).  The current insertion mode is held by
 * the $parseMode property.
 *
 * The following simplifications have been made:
 * - We handle body content only (ie, we start `in body`.)
 * - The document is never in "quirks mode".
* - All occurrences of < and > have been entity escaped, so we
 *   can parse tags by simply splitting on those two characters.
 *   (This also simplifies the handling of < inside <textarea>.)
 *   The character < must not appear inside comments.
 *   Similarly, all attributes have been "cleaned" and are double-quoted
 *   and escaped.
 * - All null characters are assumed to have been removed.
 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
 *   <frame>, <plaintext>, <isindex>, <xmp>, <iframe>,
 *   <noembed>, <noscript>, <script>, <title>.  As a result,
 *   further simplifications can be made:
 *   - `frameset-ok` is not tracked.
 *   - `head element pointer` is not tracked (but presumed non-null)
 *   - Tokenizer has only a single mode. (<textarea> wants RCDATA and
 *     <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
 *
 *   We generally mark places where we omit cases from the spec due to
 *   disallowed elements with a comment: `// OMITTED: <element-name>`.
 *
 *   The HTML spec keeps a flag during the parsing process to track
 *   whether or not a "parse error" has been encountered.  We don't
 *   bother to track that flag, we just implement the error-handling
 *   process as specified.
 *
 * @ingroup Parser
 * @since 1.27
 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
 */
class Balancer {
	private $parseMode;
	private $bitsIterator;
	private $allowedHtmlElements;
	private $afe;
	private $stack;
	private $strict;
	private $allowComments;
	private $config;

	private $textIntegrationMode;
	private $pendingTableText;
	private $originalInsertionMode;
	private $fragmentContext;
	private $formElementPointer;
	private $ignoreLinefeed;
	private $inRCDATA;
	private $inRAWTEXT;

	// The three properties below are assigned by methods of this class;
	// declaring them avoids creating them dynamically at runtime
	// (dynamic properties are deprecated as of PHP 8.2).

	/** @var callable|null Attribute-processing callback passed to balance() */
	private $processingCallback;
	/** @var array|bool Arguments for $processingCallback */
	private $processingArgs;
	/** @var string[] Stack of insertion mode names pushed for open <template> elements */
	private $templateInsertionModes = [];

	/**
	 * Valid HTML5 comments.
	 * Regex borrowed from Tim Starling's "remex-html" project.
	 *
	 * The pattern uses the `x` modifier, so every `#` comment must end at
	 * a line break; keep one alternative per line.
	 */
	const VALID_COMMENT_REGEX = "~ !--
		(                             # 1. Comment match detector
			> | -> | # Invalid short close
			(                         # 2. Comment contents
				(?:
					(?! --> )
					(?! --!> )
					(?! --! \z )
					(?! -- \z )
					(?! - \z )
					.
				)*+
			)
			(                         # 3. Comment close
				--> |   # Normal close
				--!> |  # Comment end bang
				(                     # 4. Indicate matches requiring EOF
					--! |   # EOF in comment end bang state
					-- |    # EOF in comment end state
					-  |    # EOF in comment end dash state
					        # EOF in comment state
				)
			)
		)
		([^<]*) \z                    # 5. Non-tag text after the comment
		~xs";

	/**
	 * Create a new Balancer.
	 * @param array $config Balancer configuration.  Includes:
	 *     'strict' : boolean, defaults to false.
	 *         When true, enforces syntactic constraints on input:
	 *         all non-tag '<' must be escaped, all attributes must be
	 *         separated by a single space and double-quoted.  This is
	 *         consistent with the output of the Sanitizer.
	 *     'allowedHtmlElements' : array, defaults to null.
	 *         When present, the keys of this associative array give
	 *         the acceptable HTML tag names.  When not present, no
	 *         tag sanitization is done.
	 *     'tidyCompat' : boolean, defaults to false.
	 *         When true, the serialization algorithm is tweaked to
	 *         provide historical compatibility with the old "tidy"
	 *         program: <p>-wrapping is done to the children of
	 *         <body> and <blockquote> elements, and empty elements
	 *         are removed.
	 *     'allowComments': boolean, defaults to true.
	 *         When true, allows HTML comments in the input.
	 *         The Sanitizer generally strips all comments, so if you
	 *         are running on sanitized output you can set this to
	 *         false to get a bit more performance.
	 * @throws ParameterAssertionException If 'allowedHtmlElements' contains
	 *     an element listed in BalanceSets::$unsupportedSet.
	 */
	public function __construct( array $config = [] ) {
		$this->config = $config = $config + [
			'strict' => false,
			'allowedHtmlElements' => null,
			'tidyCompat' => false,
			'allowComments' => true,
		];
		$this->allowedHtmlElements = $config['allowedHtmlElements'];
		$this->strict = $config['strict'];
		$this->allowComments = $config['allowComments'];
		if ( $this->allowedHtmlElements !== null ) {
			// Sanity check!
			$bad = array_uintersect_assoc(
				$this->allowedHtmlElements,
				BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE],
				function( $a, $b ) {
					// Ignore the values (just intersect the keys) by saying
					// all values are equal to each other.
return 0;
				}
			);
			if ( count( $bad ) > 0 ) {
				// Separator first: the legacy implode( $array, $glue ) order
				// is a TypeError from PHP 8.0 on.
				$badstr = implode( ',', array_keys( $bad ) );
				throw new ParameterAssertionException(
					'$config',
					'Balance attempted with sanitization including ' .
					"unsupported elements: {$badstr}"
				);
			}
		}
	}

	/**
	 * Return a balanced HTML string for the HTML fragment given by $text,
	 * subject to the caveats listed in the class description.  The result
	 * will typically be idempotent -- that is, rebalancing the output
	 * would result in no change.
	 *
	 * @param string $text The markup to be balanced
	 * @param callable $processingCallback Callback to do any variable or
	 *                 parameter replacements in HTML attributes values
	 * @param array|bool $processingArgs Arguments for the processing callback
	 * @return string The balanced markup
	 */
	public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
		$this->parseMode = 'inBodyMode';
		$this->bitsIterator = new ExplodeIterator( '<', $text );
		$this->afe = new BalanceActiveFormattingElements();
		$this->stack = new BalanceStack( $this->config );
		$this->processingCallback = $processingCallback;
		$this->processingArgs = $processingArgs;

		$this->textIntegrationMode =
			$this->ignoreLinefeed =
			$this->inRCDATA =
			$this->inRAWTEXT = false;

		// The stack is constructed with an <html> element already on it.
		// Set this up as a fragment parsed with <body> as the context.
		$this->fragmentContext =
			new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
		$this->resetInsertionMode();
		$this->formElementPointer = null;
		for ( $e = $this->fragmentContext; $e != null; $e = $e->parent ) {
			if ( $e->isHtmlNamed( 'form' ) ) {
				$this->formElementPointer = $e;
				break;
			}
		}

		// First element is text not tag
		$x = $this->bitsIterator->current();
		$this->bitsIterator->next();
		// The input was split on '<', so any '>' left in a text run is a
		// stray character and must be entity-escaped. (Replacing '>' with
		// '>' was a no-op.)
		$this->insertToken( 'text', str_replace( '>', '&gt;', $x ) );
		// Now process each tag.
		while ( $this->bitsIterator->valid() ) {
			$this->advance();
		}
		$this->insertToken( 'eof', null );
		$result = $this->stack->getOutput();
		// Free memory before returning.
		$this->bitsIterator = null;
		$this->afe = null;
		$this->stack = null;
		$this->fragmentContext = null;
		$this->formElementPointer = null;
		return $result;
	}

	/**
	 * Pass a token to the tree builder.  The $token will be one of the
	 * strings "tag", "endtag", or "text".
	 */
	private function insertToken( $token, $value, $attribs = null, $selfClose = false ) {
		// validate tags against $unsupportedSet
		if ( $token === 'tag' || $token === 'endtag' ) {
			if ( isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] ) ) {
				// As described in "simplifications" above, these tags are
				// not supported in the balancer.
				Assert::invariant(
					!$this->strict,
					"Unsupported $token <$value> found."
				);
				return false;
			}
		} elseif ( $token === 'text' && $value === '' ) {
			// Don't actually inject the empty string as a text token.
			return true;
		}
		// Support pre/listing/textarea by suppressing initial linefeed
		if ( $this->ignoreLinefeed ) {
			$this->ignoreLinefeed = false;
			if ( $token === 'text' ) {
				if ( $value[0] === "\n" ) {
					if ( $value === "\n" ) {
						// Nothing would be left, don't inject the empty string.
return true;
					}
					$value = substr( $value, 1 );
				}
			}
		}
		// Some hoops we have to jump through
		$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );

		$isForeign = true;
		if (
			$this->stack->length() === 0 ||
			$adjusted->isHtml() ||
			$token === 'eof'
		) {
			$isForeign = false;
		} elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
			if ( $token === 'text' ) {
				$isForeign = false;
			} elseif (
				$token === 'tag' &&
				$value !== 'mglyph' && $value !== 'malignmark'
			) {
				$isForeign = false;
			}
		} elseif (
			$adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$adjusted->localName === 'annotation-xml' &&
			$token === 'tag' && $value === 'svg'
		) {
			$isForeign = false;
		} elseif (
			$adjusted->isHtmlIntegrationPoint() &&
			( $token === 'tag' || $token === 'text' )
		) {
			$isForeign = false;
		}
		if ( $isForeign ) {
			return $this->insertForeignToken( $token, $value, $attribs, $selfClose );
		} else {
			// Dispatch to the method named by the current insertion mode.
			$func = $this->parseMode;
			return $this->$func( $token, $value, $attribs, $selfClose );
		}
	}

	/**
	 * Handle a token while the adjusted current node is in foreign
	 * (non-HTML) content.
	 * @param string $token
	 * @param string|null $value Text content or tag name
	 * @param array|null $attribs
	 * @param bool $selfClose
	 */
	private function insertForeignToken( $token, $value, $attribs = null, $selfClose = false ) {
		if ( $token === 'text' ) {
			$this->stack->insertText( $value );
			return true;
		} elseif ( $token === 'tag' ) {
			switch ( $value ) {
			case 'font':
				if ( isset( $attribs['color'] )
					|| isset( $attribs['face'] )
					|| isset( $attribs['size'] )
				) {
					break;
				}
				// otherwise, fall through
			case 'b':
			case 'big':
			case 'blockquote':
			case 'body':
			case 'br':
			case 'center':
			case 'code':
			case 'dd':
			case 'div':
			case 'dl':
			case 'dt':
			case 'em':
			case 'embed':
			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
			case 'head':
			case 'hr':
			case 'i':
			case 'img':
			case 'li':
			case 'listing':
			case 'menu':
			case 'meta':
			case 'nobr':
			case 'ol':
			case 'p':
			case 'pre':
			case 'ruby':
			case 's':
			case 'small':
			case 'span':
			case 'strong':
			case 'strike':
			case 'sub':
			case 'sup':
			case 'table':
			case 'tt':
			case 'u':
			case 'ul':
			case 'var':
				if ( $this->fragmentContext ) {
					break;
				}
				// Pop until we are back at an integration point or an
				// HTML element, then reprocess the token from there.
				while ( true ) {
					$this->stack->pop();
					$node = $this->stack->currentNode;
					if (
						$node->isMathmlTextIntegrationPoint() ||
						$node->isHtmlIntegrationPoint() ||
						$node->isHtml()
					) {
						break;
					}
				}
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			}
			// "Any other start tag"
			$adjusted = ( $this->fragmentContext && $this->stack->length()===1 ) ?
				$this->fragmentContext : $this->stack->currentNode;
			$this->stack->insertForeignElement(
				$adjusted->namespaceURI, $value, $attribs
			);
			if ( $selfClose ) {
				$this->stack->pop();
			}
			return true;
		} elseif ( $token === 'endtag' ) {
			$first = true;
			foreach ( $this->stack as $i => $node ) {
				if ( $node->isHtml() && !$first ) {
					// process the end tag as HTML
					$func = $this->parseMode;
					return $this->$func( $token, $value, $attribs, $selfClose );
				} elseif ( $i === 0 ) {
					return true;
				} elseif ( $node->localName === $value ) {
					$this->stack->popTag( $node );
					return true;
				}
				$first = false;
			}
		}
	}

	/**
	 * Grab the next "token" from $bitsIterator.  This is either a open/close
	 * tag or text or a comment, depending on whether the Sanitizer approves.
	 */
	private function advance() {
		$x = $this->bitsIterator->current();
		$this->bitsIterator->next();
		$regs = [];
		// Handle comments.  These won't be generated by mediawiki (they
		// are stripped in the Sanitizer) but may be generated by extensions.
		if (
			$this->allowComments &&
			!( $this->inRCDATA || $this->inRAWTEXT ) &&
			preg_match( self::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
			// verify EOF condition where necessary
			( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
		) {
			$contents = $regs[2][0];
			$rest = $regs[5][0];
			$this->insertToken( 'comment', $contents );
			// Stray '>' in text must be entity-escaped (the previous
			// '>' -> '>' replacement was a no-op).
			$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
			return;
		}
		// $slash: Does the current element start with a '/'?
		// $t: Current element name
		// $attribStr: String between element name and >
		// $brace: Ending '>' or '/>'
		// $rest: Everything until the next element from the $bitsIterator
		if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
			list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
			$t = strtolower( $t );
			if ( $this->strict ) {
				// Verify that attributes are all properly double-quoted
				Assert::invariant(
					preg_match(
						'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
					),
					"Bad attribute string found"
				);
			}
		} else {
			Assert::invariant(
				!$this->strict, "< found which does not start a valid tag"
			);
			$slash = $t = $attribStr = $brace = $rest = null;
		}
		$goodTag = $t;
		if ( $this->inRCDATA ) {
			if ( $slash && $t === $this->inRCDATA ) {
				$this->inRCDATA = false;
			} else {
				// No tags allowed; this emulates the "rcdata" tokenizer mode.
				$goodTag = false;
			}
		}
		if ( $this->inRAWTEXT ) {
			if ( $slash && $t === $this->inRAWTEXT ) {
				$this->inRAWTEXT = false;
			} else {
				// No tags allowed, no entity-escaping done.
				$goodTag = false;
			}
		}
		$sanitize = $this->allowedHtmlElements !== null;
		if ( $sanitize ) {
			$goodTag = $t && isset( $this->allowedHtmlElements[$t] );
		}
		if ( $goodTag ) {
			if ( is_callable( $this->processingCallback ) ) {
				call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
			}
			if ( $sanitize ) {
				$goodTag = Sanitizer::validateTag( $attribStr, $t );
			}
		}
		if ( $goodTag ) {
			if ( $sanitize ) {
				$attribs = Sanitizer::decodeTagAttributes( $attribStr );
				$attribs = Sanitizer::validateTagAttributes( $attribs, $t );
			} else {
				$attribs = Sanitizer::decodeTagAttributes( $attribStr );
			}
			$goodTag = $this->insertToken(
				$slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
			);
		}
		if ( $goodTag ) {
			// Escape stray '>' in the text following the tag.
			$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
		} elseif ( $this->inRAWTEXT ) {
			$this->insertToken( 'text', "<$x" );
		} else {
			// bad tag; serialize entire thing as text.
			$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
		}
	}

	/**
	 * Set the current insertion mode.
	 * @param string $mode Name of a method ending in "Mode"
	 * @return string The previous insertion mode
	 */
	private function switchMode( $mode ) {
		Assert::parameter(
			substr( $mode, -4 )==='Mode', '$mode', 'should end in Mode'
		);
		$oldMode = $this->parseMode;
		$this->parseMode = $mode;
		return $oldMode;
	}

	/**
	 * Switch to insertion mode $mode, then process the given token again
	 * through insertToken().
	 */
	private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfClose ) {
		$this->switchMode( $mode );
		return $this->insertToken( $token, $value, $attribs, $selfClose );
	}

	/**
	 * Pick the insertion mode based on the nodes currently on the stack
	 * (substituting the fragment context for the node at index 0).
	 */
	private function resetInsertionMode() {
		$last = false;
		foreach ( $this->stack as $i => $node ) {
			if ( $i === 0 ) {
				$last = true;
				if ( $this->fragmentContext ) {
					$node = $this->fragmentContext;
				}
			}
			if ( $node->isHtml() ) {
				switch ( $node->localName ) {
				case 'select':
					$stackLength = $this->stack->length();
					for ( $j = $i + 1; $j < $stackLength-1; $j++ ) {
						$ancestor = $this->stack->node( $stackLength-$j-1 );
						if ( $ancestor->isHtmlNamed( 'template' ) ) {
							break;
						}
						if ( $ancestor->isHtmlNamed( 'table' ) ) {
							$this->switchMode( 'inSelectInTableMode' );
							return;
						}
					}
					$this->switchMode( 'inSelectMode' );
					return;
				case 'tr':
					$this->switchMode( 'inRowMode' );
					return;
				case 'tbody':
				case 'tfoot':
				case 'thead':
					$this->switchMode( 'inTableBodyMode' );
					return;
				case 'caption':
					$this->switchMode( 'inCaptionMode' );
					return;
				case 'colgroup':
					$this->switchMode( 'inColumnGroupMode' );
					return;
				case 'table':
					$this->switchMode( 'inTableMode' );
					return;
				case 'template':
					$this->switchMode(
						array_slice( $this->templateInsertionModes, -1 )[0]
					);
					return;
				case 'body':
					$this->switchMode( 'inBodyMode' );
					return;
				// OMITTED: <frameset>
				// OMITTED: <html>
				// OMITTED: <head>
				default:
					if ( !$last ) {
						// OMITTED: <head>
						if ( $node->isA( BalanceSets::$tableCellSet ) ) {
							$this->switchMode( 'inCellMode' );
							return;
						}
					}
				}
			}
			if ( $last ) {
				$this->switchMode( 'inBodyMode' );
				return;
			}
		}
	}

	private function stopParsing() {
		// Most of the spec methods are inapplicable, other than step 2:
		// "pop all the nodes off the stack of open elements".
// We're going to keep the top-most <html> element on the stack, though. // Clear the AFE list first, otherwise the element objects will stay live // during serialization, potentially using O(N^2) memory. Note that // popping the stack will never result in reconstructing the active // formatting elements. $this->afe = null; $this->stack->popTo( 1 ); } private function parseRawText( $value, $attribs = null ) { $this->stack->insertHTMLElement( $value, $attribs ); $this->inRAWTEXT = $value; $this->originalInsertionMode = $this->switchMode( 'inTextMode' ); return true; } private function inTextMode( $token, $value, $attribs = null, $selfClose = false ) { if ( $token === 'text' ) { $this->stack->insertText( $value ); return true; } elseif ( $token === 'eof' ) { $this->stack->pop(); return $this->switchModeAndReprocess( $this->originalInsertionMode, $token, $value, $attribs, $selfClose ); } elseif ( $token === 'endtag' ) { $this->stack->pop(); $this->switchMode( $this->originalInsertionMode ); return true; } return true; } private function inHeadMode( $token, $value, $attribs = null, $selfClose = false ) { if ( $token === 'text' ) { if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) { $this->stack->insertText( $matches[0] ); $value = substr( $value, strlen( $matches[0] ) ); } if ( strlen( $value ) === 0 ) { return true; // All text handled. } // Fall through to handle non-whitespace below. } elseif ( $token === 'tag' ) { switch ( $value ) { case 'meta': // OMITTED: in a full HTML parser, this might change the encoding. 
// falls through // OMITTED: <html> case 'base': case 'basefont': case 'bgsound': case 'link': $this->stack->insertHTMLElement( $value, $attribs ); $this->stack->pop(); return true; // OMITTED: <title> // OMITTED: <noscript> case 'noframes': case 'style': return $this->parseRawText( $value, $attribs ); // OMITTED: <script> case 'template': $this->stack->insertHTMLElement( $value, $attribs ); $this->afe->insertMarker(); // OMITTED: frameset_ok $this->switchMode( 'inTemplateMode' ); $this->templateInsertionModes[] = $this->parseMode; return true; // OMITTED: <head> } } elseif ( $token === 'endtag' ) { switch ( $value ) { // OMITTED: <head> // OMITTED: <body> // OMITTED: <html> case 'br': break; // handle at the bottom of the function case 'template': if ( $this->stack->indexOf( $value ) < 0 ) { return true; // Ignore the token. } $this->stack->generateImpliedEndTags( null, true /* thorough */ ); $this->stack->popTag( $value ); $this->afe->clearToMarker(); array_pop( $this->templateInsertionModes ); $this->resetInsertionMode(); return true; default: // ignore any other end tag return true; } } elseif ( $token === 'comment' ) { $this->stack->insertComment( $value ); return true; } // If not handled above $this->inHeadMode( 'endtag', 'head' ); // synthetic </head> // Then redo this one return $this->insertToken( $token, $value, $attribs, $selfClose ); } private function inBodyMode( $token, $value, $attribs = null, $selfClose = false ) { if ( $token === 'text' ) { $this->afe->reconstruct( $this->stack ); $this->stack->insertText( $value ); return true; } elseif ( $token === 'eof' ) { if ( !empty( $this->templateInsertionModes ) ) { return $this->inTemplateMode( $token, $value, $attribs, $selfClose ); } $this->stopParsing(); return true; } elseif ( $token === 'tag' ) { switch ( $value ) { // OMITTED: <html> case 'base': case 'basefont': case 'bgsound': case 'link': case 'meta': case 'noframes': // OMITTED: <script> case 'style': case 'template': // OMITTED: <title> return 
$this->inHeadMode( $token, $value, $attribs, $selfClose ); // OMITTED: <body> // OMITTED: <frameset> case 'address': case 'article': case 'aside': case 'blockquote': case 'center': case 'details': case 'dialog': case 'dir': case 'div': case 'dl': case 'fieldset': case 'figcaption': case 'figure': case 'footer': case 'header': case 'hgroup': case 'main': case 'menu': case 'nav': case 'ol': case 'p': case 'section': case 'summary': case 'ul': if ( $this->stack->inButtonScope( 'p' ) ) { $this->inBodyMode( 'endtag', 'p' ); } $this->stack->insertHTMLElement( $value, $attribs ); return true; case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': if ( $this->stack->inButtonScope( 'p' ) ) { $this->inBodyMode( 'endtag', 'p' ); } if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) { $this->stack->pop(); } $this->stack->insertHTMLElement( $value, $attribs ); return true; case 'pre': case 'listing': if ( $this->stack->inButtonScope( 'p' ) ) { $this->inBodyMode( 'endtag', 'p' ); } $this->stack->insertHTMLElement( $value, $attribs ); $this->ignoreLinefeed = true; // OMITTED: frameset_ok return true; case 'form': if ( $this->formElementPointer && $this->stack->indexOf( 'template' ) < 0 ) { return true; // in a form, not in a template. 
} if ( $this->stack->inButtonScope( "p" ) ) { $this->inBodyMode( 'endtag', 'p' ); } $elt = $this->stack->insertHTMLElement( $value, $attribs ); if ( $this->stack->indexOf( 'template' ) < 0 ) { $this->formElementPointer = $elt; } return true; case 'li': // OMITTED: frameset_ok foreach ( $this->stack as $node ) { if ( $node->isHtmlNamed( 'li' ) ) { $this->inBodyMode( 'endtag', 'li' ); break; } if ( $node->isA( BalanceSets::$specialSet ) && !$node->isA( BalanceSets::$addressDivPSet ) ) { break; } } if ( $this->stack->inButtonScope( 'p' ) ) { $this->inBodyMode( 'endtag', 'p' ); } $this->stack->insertHTMLElement( $value, $attribs ); return true; case 'dd': case 'dt': // OMITTED: frameset_ok foreach ( $this->stack as $node ) { if ( $node->isHtmlNamed( 'dd' ) ) { $this->inBodyMode( 'endtag', 'dd' ); break; } if ( $node->isHtmlNamed( 'dt' ) ) { $this->inBodyMode( 'endtag', 'dt' ); break; } if ( $node->isA( BalanceSets::$specialSet ) && !$node->isA( BalanceSets::$addressDivPSet ) ) { break; } } if ( $this->stack->inButtonScope( 'p' ) ) { $this->inBodyMode( 'endtag', 'p' ); } $this->stack->insertHTMLElement( $value, $attribs ); return true; // OMITTED: <plaintext> case 'button': if ( $this->stack->inScope( 'button' ) ) { $this->inBodyMode( 'endtag', 'button' ); return $this->insertToken( $token, $value, $attribs, $selfClose ); } $this->afe->reconstruct( $this->stack ); $this->stack->insertHTMLElement( $value, $attribs ); return true; case 'a': $activeElement = $this->afe->findElementByTag( 'a' ); if ( $activeElement ) { $this->inBodyMode( 'endtag', 'a' ); if ( $this->afe->isInList( $activeElement ) ) { $this->afe->remove( $activeElement ); // Don't flatten here, since when we fall // through below we might foster parent // the new <a> tag inside this one. 
$this->stack->removeElement( $activeElement, false ); } } // Falls through case 'b': case 'big': case 'code': case 'em': case 'font': case 'i': case 's': case 'small': case 'strike': case 'strong': case 'tt': case 'u': $this->afe->reconstruct( $this->stack ); $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs ); return true; case 'nobr': $this->afe->reconstruct( $this->stack ); if ( $this->stack->inScope( 'nobr' ) ) { $this->inBodyMode( 'endtag', 'nobr' ); $this->afe->reconstruct( $this->stack ); } $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs ); return true; case 'applet': case 'marquee': case 'object': $this->afe->reconstruct( $this->stack ); $this->stack->insertHTMLElement( $value, $attribs ); $this->afe->insertMarker(); // OMITTED: frameset_ok return true; case 'table': // The document is never in "quirks mode"; see simplifications // above. if ( $this->stack->inButtonScope( 'p' ) ) { $this->inBodyMode( 'endtag', 'p' ); } $this->stack->insertHTMLElement( $value, $attribs ); // OMITTED: frameset_ok $this->switchMode( 'inTableMode' ); return true; case 'area': case 'br': case 'embed': case 'img': case 'keygen': case 'wbr': $this->afe->reconstruct( $this->stack ); $this->stack->insertHTMLElement( $value, $attribs ); $this->stack->pop(); // OMITTED: frameset_ok return true; case 'input': $this->afe->reconstruct( $this->stack ); $this->stack->insertHTMLElement( $value, $attribs ); $this->stack->pop(); // OMITTED: frameset_ok // (hence we don't need to examine the tag's "type" attribute) return true; case 'menuitem': case 'param': case 'source': case 'track': $this->stack->insertHTMLElement( $value, $attribs ); $this->stack->pop(); return true; case 'hr': if ( $this->stack->inButtonScope( 'p' ) ) { $this->inBodyMode( 'endtag', 'p' ); } $this->stack->insertHTMLElement( $value, $attribs ); $this->stack->pop(); return true; case 'image': // warts! 
return $this->inBodyMode( $token, 'img', $attribs, $selfClose ); // OMITTED: <isindex> case 'textarea': $this->stack->insertHTMLElement( $value, $attribs ); $this->ignoreLinefeed = true; $this->inRCDATA = $value; // emulate rcdata tokenizer mode // OMITTED: frameset_ok return true; // OMITTED: <xmp> // OMITTED: <iframe> // OMITTED: <noembed> // OMITTED: <noscript> case 'select': $this->afe->reconstruct( $this->stack ); $this->stack->insertHTMLElement( $value, $attribs ); switch ( $this->parseMode ) { case 'inTableMode': case 'inCaptionMode': case 'inTableBodyMode': case 'inRowMode': case 'inCellMode': $this->switchMode( 'inSelectInTableMode' ); return true; default: $this->switchMode( 'inSelectMode' ); return true; } case 'optgroup': case 'option': if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) { $this->inBodyMode( 'endtag', 'option' ); } $this->afe->reconstruct( $this->stack ); $this->stack->insertHTMLElement( $value, $attribs ); return true; case 'rb': case 'rtc': if ( $this->stack->inScope( 'ruby' ) ) { $this->stack->generateImpliedEndTags(); } $this->stack->insertHTMLElement( $value, $attribs ); return true; case 'rp': case 'rt': if ( $this->stack->inScope( 'ruby' ) ) { $this->stack->generateImpliedEndTags( 'rtc' ); } $this->stack->insertHTMLElement( $value, $attribs ); return true; case 'math': $this->afe->reconstruct( $this->stack ); // We skip the spec's "adjust MathML attributes" and // "adjust foreign attributes" steps, since the browser will // do this later when it parses the output and it doesn't affect // balancing. $this->stack->insertForeignElement( BalanceSets::MATHML_NAMESPACE, $value, $attribs ); if ( $selfClose ) { // emit explicit </math> tag. $this->stack->pop(); } return true; case 'svg': $this->afe->reconstruct( $this->stack ); // We skip the spec's "adjust SVG attributes" and // "adjust foreign attributes" steps, since the browser will // do this later when it parses the output and it doesn't affect // balancing. 
$this->stack->insertForeignElement( BalanceSets::SVG_NAMESPACE, $value, $attribs ); if ( $selfClose ) { // emit explicit </svg> tag. $this->stack->pop(); } return true; case 'caption': case 'col': case 'colgroup': // OMITTED: <frame> case 'head': case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': case 'tr': // Ignore table tags if we're not inTableMode return true; } // Handle any other start tag here $this->afe->reconstruct( $this->stack ); $this->stack->insertHTMLElement( $value, $attribs ); return true; } elseif ( $token === 'endtag' ) { switch ( $value ) { // </body>,</html> are unsupported. case 'template': return $this->inHeadMode( $token, $value, $attribs, $selfClose ); case 'address': case 'article': case 'aside': case 'blockquote': case 'button': case 'center': case 'details': case 'dialog': case 'dir': case 'div': case 'dl': case 'fieldset': case 'figcaption': case 'figure': case 'footer': case 'header': case 'hgroup': case 'listing': case 'main': case 'menu': case 'nav': case 'ol': case 'pre': case 'section': case 'summary': case 'ul': // Ignore if there is not a matching open tag if ( !$this->stack->inScope( $value ) ) { return true; } $this->stack->generateImpliedEndTags(); $this->stack->popTag( $value ); return true; case 'form': if ( $this->stack->indexOf( 'template' ) < 0 ) { $openform = $this->formElementPointer; $this->formElementPointer = null; if ( !$openform || !$this->stack->inScope( $openform ) ) { return true; } $this->stack->generateImpliedEndTags(); // Don't flatten yet if we're removing a <form> element // out-of-order. (eg. 
`<form><div></form>`) $flatten = ( $this->stack->currentNode === $openform ); $this->stack->removeElement( $openform, $flatten ); } else { if ( !$this->stack->inScope( 'form' ) ) { return true; } $this->stack->generateImpliedEndTags(); $this->stack->popTag( 'form' ); } return true; case 'p': if ( !$this->stack->inButtonScope( 'p' ) ) { $this->inBodyMode( 'tag', 'p', [] ); return $this->insertToken( $token, $value, $attribs, $selfClose ); } $this->stack->generateImpliedEndTags( $value ); $this->stack->popTag( $value ); return true; case 'li': if ( !$this->stack->inListItemScope( $value ) ) { return true; // ignore } $this->stack->generateImpliedEndTags( $value ); $this->stack->popTag( $value ); return true; case 'dd': case 'dt': if ( !$this->stack->inScope( $value ) ) { return true; // ignore } $this->stack->generateImpliedEndTags( $value ); $this->stack->popTag( $value ); return true; case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) { return true; // ignore } $this->stack->generateImpliedEndTags(); $this->stack->popTag( BalanceSets::$headingSet ); return true; case 'sarcasm': // Take a deep breath, then: break; case 'a': case 'b': case 'big': case 'code': case 'em': case 'font': case 'i': case 'nobr': case 's': case 'small': case 'strike': case 'strong': case 'tt': case 'u': if ( $this->stack->adoptionAgency( $value, $this->afe ) ) { return true; // If we did something, we're done. } break; // Go to the "any other end tag" case. 
case 'applet': case 'marquee': case 'object': if ( !$this->stack->inScope( $value ) ) { return true; // ignore } $this->stack->generateImpliedEndTags(); $this->stack->popTag( $value ); $this->afe->clearToMarker(); return true; case 'br': // Turn </br> into <br> return $this->inBodyMode( 'tag', $value, [] ); } // Any other end tag goes here foreach ( $this->stack as $i => $node ) { if ( $node->isHtmlNamed( $value ) ) { $this->stack->generateImpliedEndTags( $value ); $this->stack->popTo( $i ); // including $i break; } elseif ( $node->isA( BalanceSets::$specialSet ) ) { return true; // ignore this close token. } } return true; } elseif ( $token === 'comment' ) { $this->stack->insertComment( $value ); return true; } else { Assert::invariant( false, "Bad token type: $token" ); } } private function inTableMode( $token, $value, $attribs = null, $selfClose = false ) { if ( $token === 'text' ) { if ( $this->textIntegrationMode ) { return $this->inBodyMode( $token, $value, $attribs, $selfClose ); } elseif ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) { $this->pendingTableText = ''; $this->originalInsertionMode = $this->parseMode; return $this->switchModeAndReprocess( 'inTableTextMode', $token, $value, $attribs, $selfClose ); } // fall through to default case. 
} elseif ( $token === 'eof' ) { $this->stopParsing(); return true; } elseif ( $token === 'tag' ) { switch ( $value ) { case 'caption': $this->afe->insertMarker(); $this->stack->insertHTMLElement( $value, $attribs ); $this->switchMode( 'inCaptionMode' ); return true; case 'colgroup': $this->stack->clearToContext( BalanceSets::$tableContextSet ); $this->stack->insertHTMLElement( $value, $attribs ); $this->switchMode( 'inColumnGroupMode' ); return true; case 'col': $this->inTableMode( 'tag', 'colgroup', [] ); return $this->insertToken( $token, $value, $attribs, $selfClose ); case 'tbody': case 'tfoot': case 'thead': $this->stack->clearToContext( BalanceSets::$tableContextSet ); $this->stack->insertHTMLElement( $value, $attribs ); $this->switchMode( 'inTableBodyMode' ); return true; case 'td': case 'th': case 'tr': $this->inTableMode( 'tag', 'tbody', [] ); return $this->insertToken( $token, $value, $attribs, $selfClose ); case 'table': if ( !$this->stack->inTableScope( $value ) ) { return true; // Ignore this tag. } $this->inTableMode( 'endtag', $value ); return $this->insertToken( $token, $value, $attribs, $selfClose ); case 'style': // OMITTED: <script> case 'template': return $this->inHeadMode( $token, $value, $attribs, $selfClose ); case 'input': if ( !isset( $attribs['type'] ) || strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) { break; // Handle this as "everything else" } $this->stack->insertHTMLElement( $value, $attribs ); $this->stack->pop(); return true; case 'form': if ( $this->formElementPointer || $this->stack->indexOf( 'template' ) >= 0 ) { return true; // ignore this token } $this->formElementPointer = $this->stack->insertHTMLElement( $value, $attribs ); $this->stack->popTag( $this->formElementPointer ); return true; } // Fall through for "anything else" clause. } elseif ( $token === 'endtag' ) { switch ( $value ) { case 'table': if ( !$this->stack->inTableScope( $value ) ) { return true; // Ignore. 
} $this->stack->popTag( $value ); $this->resetInsertionMode(); return true; // OMITTED: <body> case 'caption': case 'col': case 'colgroup': // OMITTED: <html> case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': case 'tr': return true; // Ignore the token. case 'template': return $this->inHeadMode( $token, $value, $attribs, $selfClose ); } // Fall through for "anything else" clause. } elseif ( $token === 'comment' ) { $this->stack->insertComment( $value ); return true; } // This is the "anything else" case: $this->stack->fosterParentMode = true; $this->inBodyMode( $token, $value, $attribs, $selfClose ); $this->stack->fosterParentMode = false; return true; } private function inTableTextMode( $token, $value, $attribs = null, $selfClose = false ) { if ( $token === 'text' ) { $this->pendingTableText .= $value; return true; } // Non-text token: $text = $this->pendingTableText; $this->pendingTableText = ''; if ( preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $text ) ) { // This should match the "anything else" case inTableMode $this->stack->fosterParentMode = true; $this->inBodyMode( 'text', $text ); $this->stack->fosterParentMode = false; } else { // Pending text is just whitespace. 
$this->stack->insertText( $text ); } return $this->switchModeAndReprocess( $this->originalInsertionMode, $token, $value, $attribs, $selfClose ); } // helper for inCaptionMode private function endCaption() { if ( !$this->stack->inTableScope( 'caption' ) ) { return false; } $this->stack->generateImpliedEndTags(); $this->stack->popTag( 'caption' ); $this->afe->clearToMarker(); $this->switchMode( 'inTableMode' ); return true; } private function inCaptionMode( $token, $value, $attribs = null, $selfClose = false ) { if ( $token === 'tag' ) { switch ( $value ) { case 'caption': case 'col': case 'colgroup': case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': case 'tr': if ( $this->endCaption() ) { $this->insertToken( $token, $value, $attribs, $selfClose ); } return true; } // Fall through to "anything else" case. } elseif ( $token === 'endtag' ) { switch ( $value ) { case 'caption': $this->endCaption(); return true; case 'table': if ( $this->endCaption() ) { $this->insertToken( $token, $value, $attribs, $selfClose ); } return true; case 'body': case 'col': case 'colgroup': // OMITTED: <html> case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': case 'tr': // Ignore the token return true; } // Fall through to "anything else" case. } // The Anything Else case return $this->inBodyMode( $token, $value, $attribs, $selfClose ); } private function inColumnGroupMode( $token, $value, $attribs = null, $selfClose = false ) { if ( $token === 'text' ) { if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) { $this->stack->insertText( $matches[0] ); $value = substr( $value, strlen( $matches[0] ) ); } if ( strlen( $value ) === 0 ) { return true; // All text handled. } // Fall through to handle non-whitespace below. 
} elseif ( $token === 'tag' ) { switch ( $value ) { // OMITTED: <html> case 'col': $this->stack->insertHTMLElement( $value, $attribs ); $this->stack->pop(); return true; case 'template': return $this->inHeadMode( $token, $value, $attribs, $selfClose ); } // Fall through for "anything else". } elseif ( $token === 'endtag' ) { switch ( $value ) { case 'colgroup': if ( !$this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) { return true; // Ignore the token. } $this->stack->pop(); $this->switchMode( 'inTableMode' ); return true; case 'col': return true; // Ignore the token. case 'template': return $this->inHeadMode( $token, $value, $attribs, $selfClose ); } // Fall through for "anything else". } elseif ( $token === 'eof' ) { return $this->inBodyMode( $token, $value, $attribs, $selfClose ); } elseif ( $token === 'comment' ) { $this->stack->insertComment( $value ); return true; } // Anything else if ( !$this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) { return true; // Ignore the token. 
} $this->inColumnGroupMode( 'endtag', 'colgroup' ); return $this->insertToken( $token, $value, $attribs, $selfClose ); } // Helper function for inTableBodyMode private function endSection() { if ( !( $this->stack->inTableScope( 'tbody' ) || $this->stack->inTableScope( 'thead' ) || $this->stack->inTableScope( 'tfoot' ) ) ) { return false; } $this->stack->clearToContext( BalanceSets::$tableBodyContextSet ); $this->stack->pop(); $this->switchMode( 'inTableMode' ); return true; } private function inTableBodyMode( $token, $value, $attribs = null, $selfClose = false ) { if ( $token === 'tag' ) { switch ( $value ) { case 'tr': $this->stack->clearToContext( BalanceSets::$tableBodyContextSet ); $this->stack->insertHTMLElement( $value, $attribs ); $this->switchMode( 'inRowMode' ); return true; case 'th': case 'td': $this->inTableBodyMode( 'tag', 'tr', [] ); $this->insertToken( $token, $value, $attribs, $selfClose ); return true; case 'caption': case 'col': case 'colgroup': case 'tbody': case 'tfoot': case 'thead': if ( $this->endSection() ) { $this->insertToken( $token, $value, $attribs, $selfClose ); } return true; } } elseif ( $token === 'endtag' ) { switch ( $value ) { case 'table': if ( $this->endSection() ) { $this->insertToken( $token, $value, $attribs, $selfClose ); } return true; case 'tbody': case 'tfoot': case 'thead': if ( $this->stack->inTableScope( $value ) ) { $this->endSection(); } return true; // OMITTED: <body> case 'caption': case 'col': case 'colgroup': // OMITTED: <html> case 'td': case 'th': case 'tr': return true; // Ignore the token. 
} } // Anything else: return $this->inTableMode( $token, $value, $attribs, $selfClose ); } // Helper function for inRowMode private function endRow() { if ( !$this->stack->inTableScope( 'tr' ) ) { return false; } $this->stack->clearToContext( BalanceSets::$tableRowContextSet ); $this->stack->pop(); $this->switchMode( 'inTableBodyMode' ); return true; } private function inRowMode( $token, $value, $attribs = null, $selfClose = false ) { if ( $token === 'tag' ) { switch ( $value ) { case 'th': case 'td': $this->stack->clearToContext( BalanceSets::$tableRowContextSet ); $this->stack->insertHTMLElement( $value, $attribs ); $this->switchMode( 'inCellMode' ); $this->afe->insertMarker(); return true; case 'caption': case 'col': case 'colgroup': case 'tbody': case 'tfoot': case 'thead': case 'tr': if ( $this->endRow() ) { $this->insertToken( $token, $value, $attribs, $selfClose ); } return true; } } elseif ( $token === 'endtag' ) { switch ( $value ) { case 'tr': $this->endRow(); return true; case 'table': if ( $this->endRow() ) { $this->insertToken( $token, $value, $attribs, $selfClose ); } return true; case 'tbody': case 'tfoot': case 'thead': if ( $this->stack->inTableScope( $value ) && $this->endRow() ) { $this->insertToken( $token, $value, $attribs, $selfClose ); } return true; // OMITTED: <body> case 'caption': case 'col': case 'colgroup': // OMITTED: <html> case 'td': case 'th': return true; // Ignore the token. 
} } // Anything else: return $this->inTableMode( $token, $value, $attribs, $selfClose ); } // Helper for inCellMode private function endCell() { if ( $this->stack->inTableScope( 'td' ) ) { $this->inCellMode( 'endtag', 'td' ); return true; } elseif ( $this->stack->inTableScope( 'th' ) ) { $this->inCellMode( 'endtag', 'th' ); return true; } else { return false; } } private function inCellMode( $token, $value, $attribs = null, $selfClose = false ) { if ( $token === 'tag' ) { switch ( $value ) { case 'caption': case 'col': case 'colgroup': case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': case 'tr': if ( $this->endCell() ) { $this->insertToken( $token, $value, $attribs, $selfClose ); } return true; } } elseif ( $token === 'endtag' ) { switch ( $value ) { case 'td': case 'th': if ( $this->stack->inTableScope( $value ) ) { $this->stack->generateImpliedEndTags(); $this->stack->popTag( $value ); $this->afe->clearToMarker(); $this->switchMode( 'inRowMode' ); } return true; // OMITTED: <body> case 'caption': case 'col': case 'colgroup': // OMITTED: <html> return true; case 'table': case 'tbody': case 'tfoot': case 'thead': case 'tr': if ( $this->stack->inTableScope( $value ) ) { $this->stack->generateImpliedEndTags(); $this->stack->popTag( BalanceSets::$tableCellSet ); $this->afe->clearToMarker(); $this->switchMode( 'inRowMode' ); $this->insertToken( $token, $value, $attribs, $selfClose ); } return true; } } // Anything else: return $this->inBodyMode( $token, $value, $attribs, $selfClose ); } private function inSelectMode( $token, $value, $attribs = null, $selfClose = false ) { if ( $token === 'text' ) { $this->stack->insertText( $value ); return true; } elseif ( $token === 'eof' ) { return $this->inBodyMode( $token, $value, $attribs, $selfClose ); } elseif ( $token === 'tag' ) { switch ( $value ) { // OMITTED: <html> case 'option': if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) { $this->stack->pop(); } $this->stack->insertHTMLElement( $value, 
$attribs ); return true; case 'optgroup': if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) { $this->stack->pop(); } if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) { $this->stack->pop(); } $this->stack->insertHTMLElement( $value, $attribs ); return true; case 'select': $this->inSelectMode( 'endtag', $value ); // treat it like endtag return true; case 'input': case 'keygen': case 'textarea': if ( !$this->stack->inSelectScope( 'select' ) ) { return true; // ignore token (fragment case) } $this->inSelectMode( 'endtag', 'select' ); return $this->insertToken( $token, $value, $attribs, $selfClose ); case 'script': case 'template': return $this->inHeadMode( $token, $value, $attribs, $selfClose ); } } elseif ( $token === 'endtag' ) { switch ( $value ) { case 'optgroup': if ( $this->stack->currentNode->isHtmlNamed( 'option' ) && $this->stack->length() >= 2 && $this->stack->node( $this->stack->length() - 2 )->isHtmlNamed( 'optgroup' ) ) { $this->stack->pop(); } if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) { $this->stack->pop(); } return true; case 'option': if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) { $this->stack->pop(); } return true; case 'select': if ( !$this->stack->inSelectScope( $value ) ) { return true; // fragment case } $this->stack->popTag( $value ); $this->resetInsertionMode(); return true; case 'template': return $this->inHeadMode( $token, $value, $attribs, $selfClose ); } } elseif ( $token === 'comment' ) { $this->stack->insertComment( $value ); return true; } // anything else: just ignore the token return true; } private function inSelectInTableMode( $token, $value, $attribs = null, $selfClose = false ) { switch ( $value ) { case 'caption': case 'table': case 'tbody': case 'tfoot': case 'thead': case 'tr': case 'td': case 'th': if ( $token === 'tag' ) { $this->inSelectInTableMode( 'endtag', 'select' ); return $this->insertToken( $token, $value, $attribs, $selfClose ); } elseif ( $token === 'endtag' ) { if ( 
$this->stack->inTableScope( $value ) ) { $this->inSelectInTableMode( 'endtag', 'select' ); return $this->insertToken( $token, $value, $attribs, $selfClose ); } return true; } } // anything else return $this->inSelectMode( $token, $value, $attribs, $selfClose ); } private function inTemplateMode( $token, $value, $attribs = null, $selfClose = false ) { if ( $token === 'text' || $token === 'comment' ) { return $this->inBodyMode( $token, $value, $attribs, $selfClose ); } elseif ( $token === 'eof' ) { if ( $this->stack->indexOf( 'template' ) < 0 ) { $this->stopParsing(); } else { $this->stack->popTag( 'template' ); $this->afe->clearToMarker(); array_pop( $this->templateInsertionModes ); $this->resetInsertionMode(); $this->insertToken( $token, $value, $attribs, $selfClose ); } return true; } elseif ( $token === 'tag' ) { switch ( $value ) { case 'base': case 'basefont': case 'bgsound': case 'link': case 'meta': case 'noframes': // OMITTED: <script> case 'style': case 'template': // OMITTED: <title> return $this->inHeadMode( $token, $value, $attribs, $selfClose ); case 'caption': case 'colgroup': case 'tbody': case 'tfoot': case 'thead': return $this->switchModeAndReprocess( 'inTableMode', $token, $value, $attribs, $selfClose ); case 'col': return $this->switchModeAndReprocess( 'inColumnGroupMode', $token, $value, $attribs, $selfClose ); case 'tr': return $this->switchModeAndReprocess( 'inTableBodyMode', $token, $value, $attribs, $selfClose ); case 'td': case 'th': return $this->switchModeAndReprocess( 'inRowMode', $token, $value, $attribs, $selfClose ); } return $this->switchModeAndReprocess( 'inBodyMode', $token, $value, $attribs, $selfClose ); } elseif ( $token === 'endtag' ) { switch ( $value ) { case 'template': return $this->inHeadMode( $token, $value, $attribs, $selfClose ); } return true; } else { Assert::invariant( false, "Bad token type: $token" ); } } }