plugin updates

This commit is contained in:
Tony Volpe
2024-11-15 13:53:04 -05:00
parent 1293d604ca
commit 0238f0c4ca
2009 changed files with 163492 additions and 89543 deletions

View File

@@ -51,7 +51,7 @@ class WP_HTML_Active_Formatting_Elements {
* @param WP_HTML_Token $token Look for this node in the stack.
* @return bool Whether the referenced node is in the stack of active formatting elements.
*/
public function contains_node( $token ) {
public function contains_node( WP_HTML_Token $token ) {
foreach ( $this->walk_up() as $item ) {
if ( $token->bookmark_name === $item->bookmark_name ) {
return true;
@@ -86,6 +86,22 @@ class WP_HTML_Active_Formatting_Elements {
return $current_node ? $current_node : null;
}
/**
 * Appends a "marker" entry to the list of active formatting elements.
 *
 * > The markers are inserted when entering applet, object, marquee,
 * > template, td, th, and caption elements, and are used to prevent
 * > formatting from "leaking" into applet, object, marquee, template,
 * > td, th, and caption elements.
 *
 * The marker is a token with no bookmark name whose node name is the
 * literal string 'marker'; it is added through {@see self::push}.
 *
 * @see https://html.spec.whatwg.org/#concept-parser-marker
 *
 * @since 6.7.0
 */
public function insert_marker(): void {
	$marker = new WP_HTML_Token( null, 'marker', false );

	$this->push( $marker );
}
/**
* Pushes a node onto the stack of active formatting elements.
*
@@ -95,7 +111,7 @@ class WP_HTML_Active_Formatting_Elements {
*
* @param WP_HTML_Token $token Push this node onto the stack.
*/
public function push( $token ) {
public function push( WP_HTML_Token $token ) {
/*
* > If there are already three elements in the list of active formatting elements after the last marker,
* > if any, or anywhere in the list if there are no markers, that have the same tag name, namespace, and
@@ -119,7 +135,7 @@ class WP_HTML_Active_Formatting_Elements {
* @param WP_HTML_Token $token Remove this node from the stack, if it's there already.
* @return bool Whether the node was found and removed from the stack of active formatting elements.
*/
public function remove_node( $token ) {
public function remove_node( WP_HTML_Token $token ) {
foreach ( $this->walk_up() as $position_from_end => $item ) {
if ( $token->bookmark_name !== $item->bookmark_name ) {
continue;
@@ -184,4 +200,30 @@ class WP_HTML_Active_Formatting_Elements {
yield $this->stack[ $i ];
}
}
/**
 * Removes entries from the end of the list of active formatting elements
 * until a marker has been removed or the list is empty.
 *
 * > When the steps below require the UA to clear the list of active formatting elements up to
 * > the last marker, the UA must perform the following steps:
 * >
 * >   1. Let entry be the last (most recently added) entry in the list of active
 * >      formatting elements.
 * >   2. Remove entry from the list of active formatting elements.
 * >   3. If entry was a marker, then stop the algorithm at this point.
 * >      The list has been cleared up to the last marker.
 * >   4. Go to step 1.
 *
 * @see https://html.spec.whatwg.org/multipage/parsing.html#clear-the-list-of-active-formatting-elements-up-to-the-last-marker
 *
 * @since 6.7.0
 */
public function clear_up_to_last_marker(): void {
	foreach ( $this->walk_up() as $entry ) {
		// Each visited entry is the current last one; drop it before inspecting it.
		array_pop( $this->stack );

		$removed_a_marker = 'marker' === $entry->node_name;
		if ( $removed_a_marker ) {
			return;
		}
	}
}
}

View File

@@ -31,7 +31,7 @@ class WP_HTML_Decoder {
* Default 'case-sensitive'.
* @return bool Whether the attribute value starts with the given string.
*/
public static function attribute_starts_with( $haystack, $search_text, $case_sensitivity = 'case-sensitive' ) {
public static function attribute_starts_with( $haystack, $search_text, $case_sensitivity = 'case-sensitive' ): bool {
$search_length = strlen( $search_text );
$loose_case = 'ascii-case-insensitive' === $case_sensitivity;
$haystack_end = strlen( $haystack );
@@ -90,7 +90,7 @@ class WP_HTML_Decoder {
* @param string $text Text containing raw and non-decoded text node to decode.
* @return string Decoded UTF-8 value of given text node.
*/
public static function decode_text_node( $text ) {
public static function decode_text_node( $text ): string {
return static::decode( 'data', $text );
}
@@ -110,7 +110,7 @@ class WP_HTML_Decoder {
* @param string $text Text containing raw and non-decoded attribute value to decode.
* @return string Decoded UTF-8 value of given attribute value.
*/
public static function decode_attribute( $text ) {
public static function decode_attribute( $text ): string {
return static::decode( 'attribute', $text );
}
@@ -133,7 +133,7 @@ class WP_HTML_Decoder {
* @param string $text Text document containing span of text to decode.
* @return string Decoded UTF-8 string.
*/
public static function decode( $context, $text ) {
public static function decode( $context, $text ): string {
$decoded = '';
$end = strlen( $text );
$at = 0;
@@ -141,7 +141,7 @@ class WP_HTML_Decoder {
while ( $at < $end ) {
$next_character_reference_at = strpos( $text, '&', $at );
if ( false === $next_character_reference_at || $next_character_reference_at >= $end ) {
if ( false === $next_character_reference_at ) {
break;
}
@@ -196,6 +196,8 @@ class WP_HTML_Decoder {
*
* @since 6.6.0
*
* @global WP_Token_Map $html5_named_character_references Mappings for HTML5 named character references.
*
* @param string $context `attribute` for decoding attribute values, `data` otherwise.
* @param string $text Text document containing span of text to decode.
* @param int $at Optional. Byte offset into text where span begins, defaults to the beginning (0).
@@ -421,7 +423,7 @@ class WP_HTML_Decoder {
* @param int $code_point Which code point to convert.
* @return string Converted code point, or `<60>` if invalid.
*/
public static function code_point_to_utf8_bytes( $code_point ) {
public static function code_point_to_utf8_bytes( $code_point ): string {
// Pre-check to ensure a valid code point.
if (
$code_point <= 0 ||
@@ -436,26 +438,26 @@ class WP_HTML_Decoder {
}
if ( $code_point <= 0x7FF ) {
$byte1 = ( $code_point >> 6 ) | 0xC0;
$byte2 = $code_point & 0x3F | 0x80;
$byte1 = chr( ( $code_point >> 6 ) | 0xC0 );
$byte2 = chr( $code_point & 0x3F | 0x80 );
return pack( 'CC', $byte1, $byte2 );
return "{$byte1}{$byte2}";
}
if ( $code_point <= 0xFFFF ) {
$byte1 = ( $code_point >> 12 ) | 0xE0;
$byte2 = ( $code_point >> 6 ) & 0x3F | 0x80;
$byte3 = $code_point & 0x3F | 0x80;
$byte1 = chr( ( $code_point >> 12 ) | 0xE0 );
$byte2 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
$byte3 = chr( $code_point & 0x3F | 0x80 );
return pack( 'CCC', $byte1, $byte2, $byte3 );
return "{$byte1}{$byte2}{$byte3}";
}
// Any values above U+10FFFF are eliminated above in the pre-check.
$byte1 = ( $code_point >> 18 ) | 0xF0;
$byte2 = ( $code_point >> 12 ) & 0x3F | 0x80;
$byte3 = ( $code_point >> 6 ) & 0x3F | 0x80;
$byte4 = $code_point & 0x3F | 0x80;
$byte1 = chr( ( $code_point >> 18 ) | 0xF0 );
$byte2 = chr( ( $code_point >> 12 ) & 0x3F | 0x80 );
$byte3 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
$byte4 = chr( $code_point & 0x3F | 0x80 );
return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 );
return "{$byte1}{$byte2}{$byte3}{$byte4}";
}
}

View File

@@ -0,0 +1,616 @@
<?php
/**
* HTML API: WP_HTML_Doctype_Info class
*
* @package WordPress
* @subpackage HTML-API
* @since 6.7.0
*/
/**
 * Core class used by the HTML API to represent a DOCTYPE declaration.
 *
 * This class parses DOCTYPE tokens for the full parser in the HTML Processor.
 * Most code interacting with HTML won't need to parse DOCTYPE declarations;
 * the HTML Processor is one exception. Consult the HTML Processor for proper
 * parsing of an HTML document.
 *
 * A DOCTYPE declaration may indicate its document compatibility mode, which impacts
 * the structure of the following HTML as well as the behavior of CSS class selectors.
 * There are three possible modes:
 *
 *  - "no-quirks" and "limited-quirks" modes (also called "standards mode").
 *  - "quirks" mode.
 *
 * These modes mostly determine whether CSS class name selectors match values in the
 * HTML `class` attribute in an ASCII-case-insensitive way (quirks mode), or whether
 * they match only when byte-for-byte identical (no-quirks mode).
 *
 * All HTML documents should start with the standard HTML5 DOCTYPE: `<!DOCTYPE html>`.
 *
 * > DOCTYPEs are required for legacy reasons. When omitted, browsers tend to use a different
 * > rendering mode that is incompatible with some specifications. Including the DOCTYPE in a
 * > document ensures that the browser makes a best-effort attempt at following the
 * > relevant specifications.
 *
 * @see https://html.spec.whatwg.org/#the-doctype
 *
 * DOCTYPE declarations comprise four properties: a name, public identifier, system identifier,
 * and an indication of which document compatibility mode they would imply if an HTML parser
 * hadn't already determined it from other information.
 *
 * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
 *
 * Historically, the DOCTYPE declaration was used in SGML documents to instruct a parser how
 * to interpret the various tags and entities within a document. Its role in HTML diverged
 * from how it was used in SGML and no meaning should be back-read into HTML based on how it
 * is used in SGML, XML, or XHTML documents.
 *
 * @see https://www.iso.org/standard/16387.html
 *
 * @since 6.7.0
 *
 * @see WP_HTML_Processor
 */
class WP_HTML_Doctype_Info {
	/**
	 * Lower-cased public identifiers which indicate quirks mode when matched in full.
	 *
	 * @since 6.7.0
	 *
	 * @var string[]
	 */
	private const QUIRKS_PUBLIC_IDENTIFIERS = array(
		'-//w3o//dtd w3 html strict 3.0//en//',
		'-/w3c/dtd html 4.0 transitional/en',
		'html',
	);

	/**
	 * Lower-cased system identifier which indicates quirks mode when matched in full.
	 *
	 * @since 6.7.0
	 *
	 * @var string
	 */
	private const QUIRKS_SYSTEM_IDENTIFIER = 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd';

	/**
	 * Lower-cased prefixes of public identifiers which always indicate quirks mode.
	 *
	 * The list is scanned linearly. That is acceptable because only a single DOCTYPE
	 * declaration token should ever be examined, and the normative `<!DOCTYPE html>`
	 * is resolved before this list is consulted.
	 *
	 * @since 6.7.0
	 *
	 * @var string[]
	 */
	private const QUIRKS_PUBLIC_IDENTIFIER_PREFIXES = array(
		'+//silmaril//dtd html pro v0r11 19970101//',
		'-//as//dtd html 3.0 aswedit + extensions//',
		'-//advasoft ltd//dtd html 3.0 aswedit + extensions//',
		'-//ietf//dtd html 2.0 level 1//',
		'-//ietf//dtd html 2.0 level 2//',
		'-//ietf//dtd html 2.0 strict level 1//',
		'-//ietf//dtd html 2.0 strict level 2//',
		'-//ietf//dtd html 2.0 strict//',
		'-//ietf//dtd html 2.0//',
		'-//ietf//dtd html 2.1e//',
		'-//ietf//dtd html 3.0//',
		'-//ietf//dtd html 3.2 final//',
		'-//ietf//dtd html 3.2//',
		'-//ietf//dtd html 3//',
		'-//ietf//dtd html level 0//',
		'-//ietf//dtd html level 1//',
		'-//ietf//dtd html level 2//',
		'-//ietf//dtd html level 3//',
		'-//ietf//dtd html strict level 0//',
		'-//ietf//dtd html strict level 1//',
		'-//ietf//dtd html strict level 2//',
		'-//ietf//dtd html strict level 3//',
		'-//ietf//dtd html strict//',
		'-//ietf//dtd html//',
		'-//metrius//dtd metrius presentational//',
		'-//microsoft//dtd internet explorer 2.0 html strict//',
		'-//microsoft//dtd internet explorer 2.0 html//',
		'-//microsoft//dtd internet explorer 2.0 tables//',
		'-//microsoft//dtd internet explorer 3.0 html strict//',
		'-//microsoft//dtd internet explorer 3.0 html//',
		'-//microsoft//dtd internet explorer 3.0 tables//',
		'-//netscape comm. corp.//dtd html//',
		'-//netscape comm. corp.//dtd strict html//',
		"-//o'reilly and associates//dtd html 2.0//",
		"-//o'reilly and associates//dtd html extended 1.0//",
		"-//o'reilly and associates//dtd html extended relaxed 1.0//",
		'-//sq//dtd html 2.0 hotmetal + extensions//',
		'-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//',
		'-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//',
		'-//spyglass//dtd html 2.0 extended//',
		'-//sun microsystems corp.//dtd hotjava html//',
		'-//sun microsystems corp.//dtd hotjava strict html//',
		'-//w3c//dtd html 3 1995-03-24//',
		'-//w3c//dtd html 3.2 draft//',
		'-//w3c//dtd html 3.2 final//',
		'-//w3c//dtd html 3.2//',
		'-//w3c//dtd html 3.2s draft//',
		'-//w3c//dtd html 4.0 frameset//',
		'-//w3c//dtd html 4.0 transitional//',
		'-//w3c//dtd html experimental 19960712//',
		'-//w3c//dtd html experimental 970421//',
		'-//w3c//dtd w3 html//',
		'-//w3o//dtd w3 html 3.0//',
		'-//webtechs//dtd mozilla html 2.0//',
		'-//webtechs//dtd mozilla html//',
	);

	/**
	 * Lower-cased public identifier prefixes whose indicated mode depends on whether
	 * a system identifier is present: "quirks" when it is missing, "limited-quirks"
	 * when it is not.
	 *
	 * @since 6.7.0
	 *
	 * @var string[]
	 */
	private const HTML_401_LOOSE_PUBLIC_IDENTIFIER_PREFIXES = array(
		'-//w3c//dtd html 4.01 frameset//',
		'-//w3c//dtd html 4.01 transitional//',
	);

	/**
	 * Lower-cased public identifier prefixes which always indicate limited-quirks mode.
	 *
	 * @since 6.7.0
	 *
	 * @var string[]
	 */
	private const LIMITED_QUIRKS_PUBLIC_IDENTIFIER_PREFIXES = array(
		'-//w3c//dtd xhtml 1.0 frameset//',
		'-//w3c//dtd xhtml 1.0 transitional//',
	);

	/**
	 * Name of the DOCTYPE: should be "html" for HTML documents.
	 *
	 * This value should be considered "read only" and not modified.
	 *
	 * Historically the DOCTYPE name indicates name of the document's root element.
	 *
	 *     <!DOCTYPE html>
	 *               ╰──┴── name is "html".
	 *
	 * @see https://html.spec.whatwg.org/#tokenization
	 *
	 * @since 6.7.0
	 *
	 * @var string|null
	 */
	public $name = null;

	/**
	 * Public identifier of the DOCTYPE.
	 *
	 * This value should be considered "read only" and not modified.
	 *
	 * The public identifier is optional and should not appear in HTML documents.
	 * A `null` value indicates that no public identifier was present in the DOCTYPE.
	 *
	 * Historically the presence of the public identifier indicated that a document
	 * was meant to be shared between computer systems and the value indicated to a
	 * knowledgeable parser how to find the relevant document type definition (DTD).
	 *
	 *     <!DOCTYPE html PUBLIC "public id goes here in quotes">
	 *               │  │         ╰─── public identifier ─────╯
	 *               ╰──┴── name is "html".
	 *
	 * @see https://html.spec.whatwg.org/#tokenization
	 *
	 * @since 6.7.0
	 *
	 * @var string|null
	 */
	public $public_identifier = null;

	/**
	 * System identifier of the DOCTYPE.
	 *
	 * This value should be considered "read only" and not modified.
	 *
	 * The system identifier is optional and should not appear in HTML documents.
	 * A `null` value indicates that no system identifier was present in the DOCTYPE.
	 *
	 * Historically the system identifier specified where a relevant document type
	 * declaration for the given document is stored and may be retrieved.
	 *
	 *     <!DOCTYPE html SYSTEM "system id goes here in quotes">
	 *               │  │         ╰──── system identifier ────╯
	 *               ╰──┴── name is "html".
	 *
	 * If a public identifier were provided it would indicate to a knowledgeable
	 * parser how to interpret the system identifier.
	 *
	 *     <!DOCTYPE html PUBLIC "public id goes here in quotes" "system id goes here in quotes">
	 *               │  │         ╰─── public identifier ─────╯   ╰──── system identifier ────╯
	 *               ╰──┴── name is "html".
	 *
	 * @see https://html.spec.whatwg.org/#tokenization
	 *
	 * @since 6.7.0
	 *
	 * @var string|null
	 */
	public $system_identifier = null;

	/**
	 * Which document compatibility mode this DOCTYPE declaration indicates.
	 *
	 * This value should be considered "read only" and not modified.
	 *
	 * When an HTML parser has not already set the document compatibility mode,
	 * (e.g. "quirks" or "no-quirks" mode), it will infer it from the properties
	 * of the appropriate DOCTYPE declaration, if one exists. The DOCTYPE can
	 * indicate one of three possible document compatibility modes:
	 *
	 *  - "no-quirks" and "limited-quirks" modes (also called "standards" mode;
	 *    reported by `document.compatMode` as `CSS1Compat`).
	 *  - "quirks" mode (reported by `document.compatMode` as `BackCompat`).
	 *
	 * An appropriate DOCTYPE is one encountered in the "initial" insertion mode,
	 * before the HTML element has been opened and before finding any other
	 * DOCTYPE declaration tokens.
	 *
	 * Note: the spelling of this property's name is part of the public API.
	 *
	 * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
	 *
	 * @since 6.7.0
	 *
	 * @var string One of "no-quirks", "limited-quirks", or "quirks".
	 */
	public $indicated_compatability_mode;

	/**
	 * Constructor.
	 *
	 * This class should not be instantiated directly.
	 * Use the static {@see self::from_doctype_token} method instead.
	 *
	 * The arguments to this constructor correspond to the "DOCTYPE token"
	 * as defined in the HTML specification.
	 *
	 * > DOCTYPE tokens have a name, a public identifier, a system identifier,
	 * > and a force-quirks flag. When a DOCTYPE token is created, its name, public identifier,
	 * > and system identifier must be marked as missing (which is a distinct state from the
	 * > empty string), and the force-quirks flag must be set to off (its other state is on).
	 *
	 * The three identifiers are stored as given; the indicated compatibility mode
	 * is derived from all four arguments.
	 *
	 * @see https://html.spec.whatwg.org/multipage/parsing.html#tokenization
	 *
	 * @since 6.7.0
	 *
	 * @param string|null $name              Name of the DOCTYPE.
	 * @param string|null $public_identifier Public identifier of the DOCTYPE.
	 * @param string|null $system_identifier System identifier of the DOCTYPE.
	 * @param bool        $force_quirks_flag Whether the force-quirks flag is set for the token.
	 */
	private function __construct(
		?string $name,
		?string $public_identifier,
		?string $system_identifier,
		bool $force_quirks_flag
	) {
		$this->name              = $name;
		$this->public_identifier = $public_identifier;
		$this->system_identifier = $system_identifier;

		$this->indicated_compatability_mode = self::infer_compatability_mode(
			$name,
			$public_identifier,
			$system_identifier,
			$force_quirks_flag
		);
	}

	/**
	 * Determines which compatibility mode a DOCTYPE token indicates.
	 *
	 * > If the DOCTYPE token matches one of the conditions in the following list,
	 * > then set the Document to quirks mode:
	 * > ...
	 * > Otherwise, if the DOCTYPE token matches one of the conditions in
	 * > the following list, then set the Document to limited-quirks mode.
	 *
	 * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
	 *
	 * @since 6.7.0
	 *
	 * @param string|null $name              Name of the DOCTYPE, already lower-cased by the tokenizer.
	 * @param string|null $public_identifier Public identifier, or `null` if missing.
	 * @param string|null $system_identifier System identifier, or `null` if missing.
	 * @param bool        $force_quirks_flag Whether the force-quirks flag is set for the token.
	 * @return string One of "no-quirks", "limited-quirks", or "quirks".
	 */
	private static function infer_compatability_mode(
		?string $name,
		?string $public_identifier,
		?string $system_identifier,
		bool $force_quirks_flag
	): string {
		/*
		 * > The force-quirks flag is set to on.
		 * > The name is not "html".
		 *
		 * The tokenizer must report the name in lower case even if provided in
		 * the document in upper case; thus no conversion is required here.
		 */
		if ( $force_quirks_flag || 'html' !== $name ) {
			return 'quirks';
		}

		$has_system_identifier = null !== $system_identifier;

		/*
		 * Normative documents will contain the literal `<!DOCTYPE html>` with no
		 * public or system identifiers; short-circuit to avoid extra parsing.
		 */
		if ( null === $public_identifier && ! $has_system_identifier ) {
			return 'no-quirks';
		}

		/*
		 * > set...the public identifier...to...the empty string if the public identifier was missing.
		 * > set...the system identifier...to...the empty string if the system identifier was missing.
		 * >
		 * > The system identifier and public identifier strings must be compared...
		 * > in an ASCII case-insensitive manner.
		 * >
		 * > A system identifier whose value is the empty string is not considered missing
		 * > for the purposes of the conditions above.
		 */
		$public_id = strtolower( $public_identifier ?? '' );
		$system_id = strtolower( $system_identifier ?? '' );

		// > The public identifier is set to…
		if ( in_array( $public_id, self::QUIRKS_PUBLIC_IDENTIFIERS, true ) ) {
			return 'quirks';
		}

		// > The system identifier is set to…
		if ( self::QUIRKS_SYSTEM_IDENTIFIER === $system_id ) {
			return 'quirks';
		}

		/*
		 * Every remaining condition depends on matching the public identifier;
		 * an empty one cannot match any of them.
		 */
		if ( '' === $public_id ) {
			return 'no-quirks';
		}

		// > The public identifier starts with…
		if ( self::starts_with_any( $public_id, self::QUIRKS_PUBLIC_IDENTIFIER_PREFIXES ) ) {
			return 'quirks';
		}

		/*
		 * > The system identifier is missing and the public identifier starts with…
		 * > The system identifier is not missing and the public identifier starts with…
		 *
		 * These two conditions share their prefixes and differ only in outcome.
		 */
		if ( self::starts_with_any( $public_id, self::HTML_401_LOOSE_PUBLIC_IDENTIFIER_PREFIXES ) ) {
			return $has_system_identifier ? 'limited-quirks' : 'quirks';
		}

		// > The public identifier starts with…
		if ( self::starts_with_any( $public_id, self::LIMITED_QUIRKS_PUBLIC_IDENTIFIER_PREFIXES ) ) {
			return 'limited-quirks';
		}

		return 'no-quirks';
	}

	/**
	 * Reports whether a string begins with at least one of the given prefixes.
	 *
	 * @since 6.7.0
	 *
	 * @param string   $subject  String to examine.
	 * @param string[] $prefixes Candidate prefixes.
	 * @return bool Whether any candidate is a prefix of the subject.
	 */
	private static function starts_with_any( string $subject, array $prefixes ): bool {
		foreach ( $prefixes as $prefix ) {
			if ( str_starts_with( $subject, $prefix ) ) {
				return true;
			}
		}

		return false;
	}

	/**
	 * Creates a WP_HTML_Doctype_Info instance by parsing a raw DOCTYPE declaration token.
	 *
	 * Use this method to parse a DOCTYPE declaration token and get access to its properties
	 * via the returned WP_HTML_Doctype_Info class instance. The provided input must parse
	 * properly as a DOCTYPE declaration, though it need not represent a valid DOCTYPE.
	 *
	 * Example:
	 *
	 *     // Normative HTML DOCTYPE declaration.
	 *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE html>' );
	 *     'no-quirks' === $doctype->indicated_compatability_mode;
	 *
	 *     // A nonsensical DOCTYPE is still valid, and will indicate "quirks" mode.
	 *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!doctypeJSON SILLY "nonsense\'>' );
	 *     'quirks' === $doctype->indicated_compatability_mode;
	 *
	 *     // Textual quirks present in raw HTML are handled appropriately.
	 *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( "<!DOCTYPE\nhtml\n>" );
	 *     'no-quirks' === $doctype->indicated_compatability_mode;
	 *
	 *     // Anything other than a proper DOCTYPE declaration token fails to parse.
	 *     null === WP_HTML_Doctype_Info::from_doctype_token( ' <!DOCTYPE>' );
	 *     null === WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE ><p>' );
	 *     null === WP_HTML_Doctype_Info::from_doctype_token( '<!TYPEDOC>' );
	 *     null === WP_HTML_Doctype_Info::from_doctype_token( 'html' );
	 *     null === WP_HTML_Doctype_Info::from_doctype_token( '<?xml version="1.0" encoding="UTF-8" ?>' );
	 *
	 * @since 6.7.0
	 *
	 * @param string $doctype_html The complete raw DOCTYPE HTML string, e.g. `<!DOCTYPE html>`.
	 *
	 * @return WP_HTML_Doctype_Info|null A WP_HTML_Doctype_Info instance when the provided HTML
	 *                                   parses as a DOCTYPE declaration token. Otherwise, null.
	 */
	public static function from_doctype_token( string $doctype_html ): ?self {
		$whitespace = " \t\n\f\r";
		$last       = strlen( $doctype_html ) - 1;

		/*
		 * This parser combines the rules for parsing DOCTYPE tokens found in the HTML
		 * specification for the DOCTYPE related tokenizer states.
		 *
		 * @see https://html.spec.whatwg.org/#doctype-state
		 *
		 *  - A DOCTYPE token must be at least `<!DOCTYPE>`, assuming a complete token
		 *    not ending in end-of-file.
		 *  - It must start with an ASCII case-insensitive match for `<!DOCTYPE`.
		 *  - The only occurrence of `>` must be the final byte in the HTML string.
		 */
		if ( $last < 9 || 0 !== substr_compare( $doctype_html, '<!DOCTYPE', 0, 9, true ) ) {
			return null;
		}

		// Is there one and only one `>`, and is it the final byte?
		$first_closer_at = 9 + strcspn( $doctype_html, '>', 9 );
		if ( '>' !== $doctype_html[ $last ] || $first_closer_at < $last ) {
			return null;
		}

		/*
		 * Perform newline normalization and recompute the final offset afterwards.
		 *
		 * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream
		 * @see https://infra.spec.whatwg.org/#normalize-newlines
		 */
		$html = str_replace( array( "\r\n", "\r" ), "\n", $doctype_html );
		$last = strlen( $html ) - 1;

		/*
		 * From here the declaration's content, optionally including the name, public
		 * identifier, and system identifier, sits between the cursor and the closing `>`.
		 *
		 *     "<!DOCTYPE...declaration...>"
		 *               ╰─ $cursor       ╰─ $last
		 *
		 * The declaration part may also be empty, as in "<!DOCTYPE>". The terminating
		 * ">" needs no further consideration: it is known to be the only one and to
		 * be in the final position.
		 *
		 * Parsing effectively begins in the "Before DOCTYPE name state": skip whitespace.
		 *
		 * @see https://html.spec.whatwg.org/#before-doctype-name-state
		 */
		$cursor = 9 + strspn( $html, $whitespace, 9 );
		if ( $cursor >= $last ) {
			return new self( null, null, null, true );
		}

		// The name runs up to the next whitespace; it is lower-cased with NULL bytes replaced.
		$name_length = strcspn( $html, $whitespace, $cursor, $last - $cursor );
		$name        = str_replace( "\0", "\u{FFFD}", strtolower( substr( $html, $cursor, $name_length ) ) );

		$cursor += $name_length;
		$cursor += strspn( $html, $whitespace, $cursor, $last - $cursor );
		if ( $cursor >= $last ) {
			return new self( $name, null, null, false );
		}

		/*
		 * "After DOCTYPE name state"
		 *
		 * Find a case-insensitive match for "PUBLIC" or "SYSTEM" at this point.
		 * Otherwise, set force-quirks and enter bogus DOCTYPE state (skip the rest of the doctype).
		 *
		 * @see https://html.spec.whatwg.org/#after-doctype-name-state
		 */
		if ( $cursor + 6 >= $last ) {
			return new self( $name, null, null, true );
		}

		$has_public_keyword = 0 === substr_compare( $html, 'PUBLIC', $cursor, 6, true );
		$has_system_keyword = ! $has_public_keyword && 0 === substr_compare( $html, 'SYSTEM', $cursor, 6, true );

		/*
		 * > Otherwise, this is an invalid-character-sequence-after-doctype-name parse error.
		 * > Set the current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus
		 * > DOCTYPE state.
		 */
		if ( ! $has_public_keyword && ! $has_system_keyword ) {
			return new self( $name, null, null, true );
		}

		// Consume the keyword and any whitespace following it.
		$cursor += 6;
		$cursor += strspn( $html, $whitespace, $cursor, $last - $cursor );
		if ( $cursor >= $last ) {
			return new self( $name, null, null, true );
		}

		$public_id = null;

		if ( $has_public_keyword ) {
			/*
			 * The parser should enter "DOCTYPE public identifier (double-quoted) state" or
			 * "DOCTYPE public identifier (single-quoted) state" by finding one of the valid quotes.
			 * Anything else forces quirks mode and ignores the rest of the contents.
			 *
			 * @see https://html.spec.whatwg.org/#doctype-public-identifier-(double-quoted)-state
			 * @see https://html.spec.whatwg.org/#doctype-public-identifier-(single-quoted)-state
			 */
			$quote = $html[ $cursor ];

			/*
			 * > This is a missing-quote-before-doctype-public-identifier parse error. Set the
			 * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
			 */
			if ( '"' !== $quote && "'" !== $quote ) {
				return new self( $name, null, null, true );
			}

			++$cursor;
			$public_id_length = strcspn( $html, $quote, $cursor, $last - $cursor );
			$public_id        = str_replace( "\0", "\u{FFFD}", substr( $html, $cursor, $public_id_length ) );

			// An identifier lacking its closing quote forces quirks mode.
			$cursor += $public_id_length;
			if ( $cursor >= $last || $quote !== $html[ $cursor ] ) {
				return new self( $name, $public_id, null, true );
			}
			++$cursor;

			/*
			 * "Between DOCTYPE public and system identifiers state"
			 *
			 * Advance through whitespace between public and system identifiers.
			 *
			 * @see https://html.spec.whatwg.org/#between-doctype-public-and-system-identifiers-state
			 */
			$cursor += strspn( $html, $whitespace, $cursor, $last - $cursor );
			if ( $cursor >= $last ) {
				return new self( $name, $public_id, null, false );
			}
		}

		/*
		 * The parser should enter "DOCTYPE system identifier (double-quoted) state" or
		 * "DOCTYPE system identifier (single-quoted) state" by finding one of the valid quotes.
		 * Anything else forces quirks mode and ignores the rest of the contents.
		 *
		 * @see https://html.spec.whatwg.org/#doctype-system-identifier-(double-quoted)-state
		 * @see https://html.spec.whatwg.org/#doctype-system-identifier-(single-quoted)-state
		 */
		$quote = $html[ $cursor ];

		/*
		 * > This is a missing-quote-before-doctype-system-identifier parse error. Set the
		 * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
		 */
		if ( '"' !== $quote && "'" !== $quote ) {
			return new self( $name, $public_id, null, true );
		}

		++$cursor;
		$system_id_length = strcspn( $html, $quote, $cursor, $last - $cursor );
		$system_id        = str_replace( "\0", "\u{FFFD}", substr( $html, $cursor, $system_id_length ) );

		// Quirks mode is forced only when the closing quote is absent.
		$cursor      += $system_id_length;
		$is_unclosed  = $cursor >= $last || $quote !== $html[ $cursor ];

		return new self( $name, $public_id, $system_id, $is_unclosed );
	}
}

View File

@@ -58,7 +58,7 @@ class WP_HTML_Open_Elements {
*
* @since 6.6.0
*
* @var Closure
* @var Closure|null
*/
private $pop_handler = null;
@@ -69,7 +69,7 @@ class WP_HTML_Open_Elements {
*
* @since 6.6.0
*
* @var Closure
* @var Closure|null
*/
private $push_handler = null;
@@ -83,7 +83,7 @@ class WP_HTML_Open_Elements {
*
* @param Closure $handler The handler function.
*/
public function set_pop_handler( Closure $handler ) {
public function set_pop_handler( Closure $handler ): void {
$this->pop_handler = $handler;
}
@@ -97,10 +97,53 @@ class WP_HTML_Open_Elements {
*
* @param Closure $handler The handler function.
*/
public function set_push_handler( Closure $handler ) {
public function set_push_handler( Closure $handler ): void {
$this->push_handler = $handler;
}
/**
 * Returns the node at the nth position on the stack of open
 * elements, or `null` if no such position exists.
 *
 * Positions are 1-based and counted in the order produced by
 * `walk_down()`: the first node visited is the 1st, the next
 * is the 2nd, and so on.
 *
 * @since 6.7.0
 *
 * @param int $nth Retrieve the nth item on the stack, with 1 being
 *                 the first node visited, 2 being the second, etc...
 * @return WP_HTML_Token|null The node on the stack at the given location,
 *                            or `null` if the location isn't on the stack.
 */
public function at( int $nth ): ?WP_HTML_Token {
	$remaining = $nth;

	foreach ( $this->walk_down() as $node ) {
		--$remaining;

		if ( 0 === $remaining ) {
			return $node;
		}
	}

	// Either the stack holds fewer than $nth nodes or $nth was not positive.
	return null;
}
/**
 * Reports whether any node in the stack of open elements has the given name.
 *
 * Names are compared with strict equality against each node's `node_name`.
 *
 * @since 6.7.0
 *
 * @param string $node_name Name of node for which to check.
 * @return bool Whether a node of the given name is in the stack of open elements.
 */
public function contains( string $node_name ): bool {
	foreach ( $this->walk_up() as $node ) {
		if ( $node->node_name !== $node_name ) {
			continue;
		}

		return true;
	}

	return false;
}
/**
* Reports if a specific node is in the stack of open elements.
*
@@ -109,9 +152,9 @@ class WP_HTML_Open_Elements {
* @param WP_HTML_Token $token Look for this node in the stack.
* @return bool Whether the referenced node is in the stack of open elements.
*/
public function contains_node( $token ) {
public function contains_node( WP_HTML_Token $token ): bool {
foreach ( $this->walk_up() as $item ) {
if ( $token->bookmark_name === $item->bookmark_name ) {
if ( $token === $item ) {
return true;
}
}
@@ -126,7 +169,7 @@ class WP_HTML_Open_Elements {
*
* @return int How many node are in the stack of open elements.
*/
public function count() {
public function count(): int {
return count( $this->stack );
}
@@ -138,20 +181,57 @@ class WP_HTML_Open_Elements {
*
* @return WP_HTML_Token|null Last node in the stack of open elements, if one exists, otherwise null.
*/
public function current_node() {
public function current_node(): ?WP_HTML_Token {
$current_node = end( $this->stack );
return $current_node ? $current_node : null;
}
/**
 * Indicates if the current node is of a given type or name.
 *
 * It's possible to pass either a node type or a node name to this function.
 * In the case there is no current element it will always return `false`.
 *
 * Example:
 *
 *     // Is the current node a text node?
 *     $stack->current_node_is( '#text' );
 *
 *     // Is the current node a DIV element?
 *     $stack->current_node_is( 'DIV' );
 *
 *     // Is the current node any element/tag?
 *     $stack->current_node_is( '#tag' );
 *
 * @see WP_HTML_Tag_Processor::get_token_type
 * @see WP_HTML_Tag_Processor::get_token_name
 *
 * @since 6.7.0
 *
 * @access private
 *
 * @param string $identity Check if the current node has this name or type (depending on what is provided).
 * @return bool Whether there is a current element that matches the given identity, whether a token name or type.
 */
public function current_node_is( string $identity ): bool {
	$current_node = end( $this->stack );
	if ( false === $current_node ) {
		return false;
	}

	$current_node_name = $current_node->node_name;

	// An exact match covers element names as well as names like "#text".
	if ( $current_node_name === $identity ) {
		return true;
	}

	// The DOCTYPE token type is matched against the lowercase "html" node name.
	if ( '#doctype' === $identity ) {
		return 'html' === $current_node_name;
	}

	/*
	 * Element names are upper-cased while special names ("#text", "html",
	 * "marker") contain lowercase letters. `ctype_upper()` cannot be used to
	 * tell them apart because it rejects every name containing a digit or a
	 * hyphen, such as "H1" or "ANNOTATION-XML". Instead, require a leading
	 * uppercase ASCII letter and the absence of any lowercase ASCII letter.
	 *
	 * NOTE(review): assumes element node names always start with A-Z; confirm
	 * against the code which creates tokens.
	 */
	if ( '#tag' === $identity ) {
		return 1 === preg_match( '/^[A-Z][^a-z]*\z/', $current_node_name );
	}

	return false;
}
/**
* Returns whether an element is in a specific scope.
*
* ## HTML Support
*
* This function skips checking for the termination list because there
* are no supported elements which appear in the termination list.
*
* @since 6.4.0
*
* @see https://html.spec.whatwg.org/#has-an-element-in-the-specific-scope
@@ -160,25 +240,24 @@ class WP_HTML_Open_Elements {
* @param string[] $termination_list List of elements that terminate the search.
* @return bool Whether the element was found in a specific scope.
*/
public function has_element_in_specific_scope( $tag_name, $termination_list ) {
public function has_element_in_specific_scope( string $tag_name, $termination_list ): bool {
foreach ( $this->walk_up() as $node ) {
if ( $node->node_name === $tag_name ) {
$namespaced_name = 'html' === $node->namespace
? $node->node_name
: "{$node->namespace} {$node->node_name}";
if ( $namespaced_name === $tag_name ) {
return true;
}
if (
'(internal: H1 through H6 - do not use)' === $tag_name &&
in_array( $node->node_name, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true )
in_array( $namespaced_name, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true )
) {
return true;
}
switch ( $node->node_name ) {
case 'HTML':
return false;
}
if ( in_array( $node->node_name, $termination_list, true ) ) {
if ( in_array( $namespaced_name, $termination_list, true ) ) {
return false;
}
}
@@ -189,25 +268,61 @@ class WP_HTML_Open_Elements {
/**
* Returns whether a particular element is in scope.
*
* > The stack of open elements is said to have a particular element in
* > scope when it has that element in the specific scope consisting of
* > the following element types:
* >
* > - applet
* > - caption
* > - html
* > - table
* > - td
* > - th
* > - marquee
* > - object
* > - template
* > - MathML mi
* > - MathML mo
* > - MathML mn
* > - MathML ms
* > - MathML mtext
* > - MathML annotation-xml
* > - SVG foreignObject
* > - SVG desc
* > - SVG title
*
* @since 6.4.0
* @since 6.7.0 Full support.
*
* @see https://html.spec.whatwg.org/#has-an-element-in-scope
*
* @param string $tag_name Name of tag to check.
* @return bool Whether given element is in scope.
*/
public function has_element_in_scope( $tag_name ) {
public function has_element_in_scope( string $tag_name ): bool {
return $this->has_element_in_specific_scope(
$tag_name,
array(
'APPLET',
'CAPTION',
'HTML',
'TABLE',
'TD',
'TH',
'MARQUEE',
'OBJECT',
'TEMPLATE',
/*
* Because it's not currently possible to encounter
* one of the termination elements, they don't need
* to be listed here. If they were, they would be
* unreachable and only waste CPU cycles while
* scanning through HTML.
*/
'math MI',
'math MO',
'math MN',
'math MS',
'math MTEXT',
'math ANNOTATION-XML',
'svg FOREIGNOBJECT',
'svg DESC',
'svg TITLE',
)
);
}
@@ -215,21 +330,50 @@ class WP_HTML_Open_Elements {
/**
* Returns whether a particular element is in list item scope.
*
* > The stack of open elements is said to have a particular element
* > in list item scope when it has that element in the specific scope
* > consisting of the following element types:
* >
* > - All the element types listed above for the has an element in scope algorithm.
* > - ol in the HTML namespace
* > - ul in the HTML namespace
*
* @since 6.4.0
* @since 6.5.0 Implemented: no longer throws on every invocation.
* @since 6.7.0 Supports all required HTML elements.
*
* @see https://html.spec.whatwg.org/#has-an-element-in-list-item-scope
*
* @param string $tag_name Name of tag to check.
* @return bool Whether given element is in scope.
*/
public function has_element_in_list_item_scope( $tag_name ) {
public function has_element_in_list_item_scope( string $tag_name ): bool {
return $this->has_element_in_specific_scope(
$tag_name,
array(
// There are more elements that belong here which aren't currently supported.
'APPLET',
'BUTTON',
'CAPTION',
'HTML',
'TABLE',
'TD',
'TH',
'MARQUEE',
'OBJECT',
'OL',
'TEMPLATE',
'UL',
'math MI',
'math MO',
'math MN',
'math MS',
'math MTEXT',
'math ANNOTATION-XML',
'svg FOREIGNOBJECT',
'svg DESC',
'svg TITLE',
)
);
}
@@ -237,51 +381,115 @@ class WP_HTML_Open_Elements {
/**
* Returns whether a particular element is in button scope.
*
* > The stack of open elements is said to have a particular element
* > in button scope when it has that element in the specific scope
* > consisting of the following element types:
* >
* > - All the element types listed above for the has an element in scope algorithm.
* > - button in the HTML namespace
*
* @since 6.4.0
* @since 6.7.0 Supports all required HTML elements.
*
* @see https://html.spec.whatwg.org/#has-an-element-in-button-scope
*
* @param string $tag_name Name of tag to check.
* @return bool Whether given element is in scope.
*/
public function has_element_in_button_scope( $tag_name ) {
return $this->has_element_in_specific_scope( $tag_name, array( 'BUTTON' ) );
public function has_element_in_button_scope( string $tag_name ): bool {
return $this->has_element_in_specific_scope(
$tag_name,
array(
'APPLET',
'BUTTON',
'CAPTION',
'HTML',
'TABLE',
'TD',
'TH',
'MARQUEE',
'OBJECT',
'TEMPLATE',
'math MI',
'math MO',
'math MN',
'math MS',
'math MTEXT',
'math ANNOTATION-XML',
'svg FOREIGNOBJECT',
'svg DESC',
'svg TITLE',
)
);
}
/**
* Returns whether a particular element is in table scope.
*
* > The stack of open elements is said to have a particular element
* > in table scope when it has that element in the specific scope
* > consisting of the following element types:
* >
* > - html in the HTML namespace
* > - table in the HTML namespace
* > - template in the HTML namespace
*
* @since 6.4.0
* @since 6.7.0 Full implementation.
*
* @see https://html.spec.whatwg.org/#has-an-element-in-table-scope
*
* @throws WP_HTML_Unsupported_Exception Always until this function is implemented.
*
* @param string $tag_name Name of tag to check.
* @return bool Whether given element is in scope.
*/
public function has_element_in_table_scope( $tag_name ) {
throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on table scope.' );
return false; // The linter requires this unreachable code until the function is implemented and can return.
public function has_element_in_table_scope( string $tag_name ): bool {
return $this->has_element_in_specific_scope(
$tag_name,
array(
'HTML',
'TABLE',
'TEMPLATE',
)
);
}
/**
* Returns whether a particular element is in select scope.
*
* @since 6.4.0
* This test differs from the others like it, in that its rules are inverted.
* Instead of arriving at a match when one of any tag in a termination group
* is reached, this one terminates if any other tag is reached.
*
* > The stack of open elements is said to have a particular element in select scope when it has
* > that element in the specific scope consisting of all element types except the following:
* > - optgroup in the HTML namespace
* > - option in the HTML namespace
*
* @since 6.4.0 Stub implementation (throws).
* @since 6.7.0 Full implementation.
*
* @see https://html.spec.whatwg.org/#has-an-element-in-select-scope
*
* @throws WP_HTML_Unsupported_Exception Always until this function is implemented.
*
* @param string $tag_name Name of tag to check.
* @return bool Whether given element is in scope.
* @return bool Whether the given element is in SELECT scope.
*/
public function has_element_in_select_scope( $tag_name ) {
throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on select scope.' );
public function has_element_in_select_scope( string $tag_name ): bool {
foreach ( $this->walk_up() as $node ) {
if ( $node->node_name === $tag_name ) {
return true;
}
return false; // The linter requires this unreachable code until the function is implemented and can return.
if (
'OPTION' !== $node->node_name &&
'OPTGROUP' !== $node->node_name
) {
return false;
}
}
return false;
}
/**
@@ -293,7 +501,7 @@ class WP_HTML_Open_Elements {
*
* @return bool Whether a P is in BUTTON scope.
*/
public function has_p_in_button_scope() {
public function has_p_in_button_scope(): bool {
return $this->has_p_in_button_scope;
}
@@ -306,7 +514,7 @@ class WP_HTML_Open_Elements {
*
* @return bool Whether a node was popped off of the stack.
*/
public function pop() {
public function pop(): bool {
$item = array_pop( $this->stack );
if ( null === $item ) {
return false;
@@ -322,31 +530,31 @@ class WP_HTML_Open_Elements {
}
/**
* Pops nodes off of the stack of open elements until one with the given tag name has been popped.
* Pops nodes off of the stack of open elements until an HTML tag with the given name has been popped.
*
* @since 6.4.0
*
* @see WP_HTML_Open_Elements::pop
*
* @param string $tag_name Name of tag that needs to be popped off of the stack of open elements.
* @param string $html_tag_name Name of tag that needs to be popped off of the stack of open elements.
* @return bool Whether a tag of the given name was found and popped off of the stack of open elements.
*/
public function pop_until( $tag_name ) {
public function pop_until( string $html_tag_name ): bool {
foreach ( $this->walk_up() as $item ) {
if ( 'context-node' === $item->bookmark_name ) {
return true;
}
$this->pop();
if ( 'html' !== $item->namespace ) {
continue;
}
if (
'(internal: H1 through H6 - do not use)' === $tag_name &&
'(internal: H1 through H6 - do not use)' === $html_tag_name &&
in_array( $item->node_name, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true )
) {
return true;
}
if ( $tag_name === $item->node_name ) {
if ( $html_tag_name === $item->node_name ) {
return true;
}
}
@@ -363,7 +571,7 @@ class WP_HTML_Open_Elements {
*
* @param WP_HTML_Token $stack_item Item to add onto stack.
*/
public function push( $stack_item ) {
public function push( WP_HTML_Token $stack_item ): void {
$this->stack[] = $stack_item;
$this->after_element_push( $stack_item );
}
@@ -376,7 +584,7 @@ class WP_HTML_Open_Elements {
* @param WP_HTML_Token $token The node to remove from the stack of open elements.
* @return bool Whether the node was found and removed from the stack of open elements.
*/
public function remove_node( $token ) {
public function remove_node( WP_HTML_Token $token ): bool {
if ( 'context-node' === $token->bookmark_name ) {
return false;
}
@@ -443,9 +651,10 @@ class WP_HTML_Open_Elements {
* @since 6.4.0
* @since 6.5.0 Accepts $above_this_node to start traversal above a given node, if it exists.
*
* @param ?WP_HTML_Token $above_this_node Start traversing above this node, if provided and if the node exists.
* @param WP_HTML_Token|null $above_this_node Optional. Start traversing above this node,
* if provided and if the node exists.
*/
public function walk_up( $above_this_node = null ) {
public function walk_up( ?WP_HTML_Token $above_this_node = null ) {
$has_found_node = null === $above_this_node;
for ( $i = count( $this->stack ) - 1; $i >= 0; $i-- ) {
@@ -477,13 +686,35 @@ class WP_HTML_Open_Elements {
*
* @param WP_HTML_Token $item Element that was added to the stack of open elements.
*/
public function after_element_push( $item ) {
public function after_element_push( WP_HTML_Token $item ): void {
$namespaced_name = 'html' === $item->namespace
? $item->node_name
: "{$item->namespace} {$item->node_name}";
/*
* When adding support for new elements, expand this switch to trap
* cases where the precalculated value needs to change.
*/
switch ( $item->node_name ) {
switch ( $namespaced_name ) {
case 'APPLET':
case 'BUTTON':
case 'CAPTION':
case 'HTML':
case 'TABLE':
case 'TD':
case 'TH':
case 'MARQUEE':
case 'OBJECT':
case 'TEMPLATE':
case 'math MI':
case 'math MO':
case 'math MN':
case 'math MS':
case 'math MTEXT':
case 'math ANNOTATION-XML':
case 'svg FOREIGNOBJECT':
case 'svg DESC':
case 'svg TITLE':
$this->has_p_in_button_scope = false;
break;
@@ -510,17 +741,32 @@ class WP_HTML_Open_Elements {
*
* @param WP_HTML_Token $item Element that was removed from the stack of open elements.
*/
public function after_element_pop( $item ) {
public function after_element_pop( WP_HTML_Token $item ): void {
/*
* When adding support for new elements, expand this switch to trap
* cases where the precalculated value needs to change.
*/
switch ( $item->node_name ) {
case 'APPLET':
case 'BUTTON':
$this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' );
break;
case 'CAPTION':
case 'HTML':
case 'P':
case 'TABLE':
case 'TD':
case 'TH':
case 'MARQUEE':
case 'OBJECT':
case 'TEMPLATE':
case 'math MI':
case 'math MO':
case 'math MN':
case 'math MS':
case 'math MTEXT':
case 'math ANNOTATION-XML':
case 'svg FOREIGNOBJECT':
case 'svg DESC':
case 'svg TITLE':
$this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' );
break;
}
@@ -530,6 +776,80 @@ class WP_HTML_Open_Elements {
}
}
/**
 * Clears the stack of open elements back to a table context.
 *
 * > When the steps above require the UA to clear the stack back to a table context, it means
 * > that the UA must, while the current node is not a table, template, or html element, pop
 * > elements from the stack of open elements.
 *
 * Nodes are popped one at a time until a `TABLE`, `TEMPLATE`, or `HTML`
 * node is reached; that node itself is left on the stack.
 *
 * @see https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-context
 *
 * @since 6.7.0
 */
public function clear_to_table_context(): void {
	$boundary_names = array( 'TABLE', 'TEMPLATE', 'HTML' );

	foreach ( $this->walk_up() as $node ) {
		if ( in_array( $node->node_name, $boundary_names, true ) ) {
			return;
		}

		$this->pop();
	}
}
/**
 * Clears the stack of open elements back to a table body context.
 *
 * > When the steps above require the UA to clear the stack back to a table body context, it
 * > means that the UA must, while the current node is not a tbody, tfoot, thead, template, or
 * > html element, pop elements from the stack of open elements.
 *
 * Nodes are popped one at a time until a `TBODY`, `TFOOT`, `THEAD`,
 * `TEMPLATE`, or `HTML` node is reached; that node itself is left on the stack.
 *
 * @see https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-body-context
 *
 * @since 6.7.0
 */
public function clear_to_table_body_context(): void {
	$boundary_names = array( 'TBODY', 'TFOOT', 'THEAD', 'TEMPLATE', 'HTML' );

	foreach ( $this->walk_up() as $node ) {
		if ( in_array( $node->node_name, $boundary_names, true ) ) {
			return;
		}

		$this->pop();
	}
}
/**
 * Clears the stack of open elements back to a table row context.
 *
 * > When the steps above require the UA to clear the stack back to a table row context, it
 * > means that the UA must, while the current node is not a tr, template, or html element, pop
 * > elements from the stack of open elements.
 *
 * Nodes are popped one at a time until a `TR`, `TEMPLATE`, or `HTML`
 * node is reached; that node itself is left on the stack.
 *
 * @see https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-row-context
 *
 * @since 6.7.0
 */
public function clear_to_table_row_context(): void {
	$boundary_names = array( 'TR', 'TEMPLATE', 'HTML' );

	foreach ( $this->walk_up() as $node ) {
		if ( in_array( $node->node_name, $boundary_names, true ) ) {
			return;
		}

		$this->pop();
	}
}
/**
* Wakeup magic method.
*

View File

@@ -47,6 +47,66 @@ class WP_HTML_Processor_State {
*/
const INSERTION_MODE_INITIAL = 'insertion-mode-initial';
/**
* Before HTML insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#the-before-html-insertion-mode
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_BEFORE_HTML = 'insertion-mode-before-html';
/**
* Before head insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-beforehead
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_BEFORE_HEAD = 'insertion-mode-before-head';
/**
* In head insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-inhead
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_IN_HEAD = 'insertion-mode-in-head';
/**
* In head noscript insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-inheadnoscript
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_IN_HEAD_NOSCRIPT = 'insertion-mode-in-head-noscript';
/**
* After head insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-afterhead
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_AFTER_HEAD = 'insertion-mode-after-head';
/**
* In body insertion mode for full HTML parser.
*
@@ -59,6 +119,197 @@ class WP_HTML_Processor_State {
*/
const INSERTION_MODE_IN_BODY = 'insertion-mode-in-body';
/**
* In table insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-intable
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_IN_TABLE = 'insertion-mode-in-table';
/**
* In table text insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-intabletext
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_IN_TABLE_TEXT = 'insertion-mode-in-table-text';
/**
* In caption insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-incaption
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_IN_CAPTION = 'insertion-mode-in-caption';
/**
* In column group insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-incolumngroup
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_IN_COLUMN_GROUP = 'insertion-mode-in-column-group';
/**
* In table body insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-intablebody
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_IN_TABLE_BODY = 'insertion-mode-in-table-body';
/**
* In row insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-inrow
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_IN_ROW = 'insertion-mode-in-row';
/**
* In cell insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-incell
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_IN_CELL = 'insertion-mode-in-cell';
/**
* In select insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-inselect
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_IN_SELECT = 'insertion-mode-in-select';
/**
* In select in table insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-inselectintable
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_IN_SELECT_IN_TABLE = 'insertion-mode-in-select-in-table';
/**
* In template insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-intemplate
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_IN_TEMPLATE = 'insertion-mode-in-template';
/**
* After body insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-afterbody
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_AFTER_BODY = 'insertion-mode-after-body';
/**
* In frameset insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-inframeset
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_IN_FRAMESET = 'insertion-mode-in-frameset';
/**
* After frameset insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#parsing-main-afterframeset
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_AFTER_FRAMESET = 'insertion-mode-after-frameset';
/**
* After after body insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#the-after-after-body-insertion-mode
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_AFTER_AFTER_BODY = 'insertion-mode-after-after-body';
/**
* After after frameset insertion mode for full HTML parser.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#the-after-after-frameset-insertion-mode
* @see WP_HTML_Processor_State::$insertion_mode
*
* @var string
*/
const INSERTION_MODE_AFTER_AFTER_FRAMESET = 'insertion-mode-after-after-frameset';
/**
* The stack of template insertion modes.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#the-insertion-mode:stack-of-template-insertion-modes
*
* @var array<string>
*/
public $stack_of_template_insertion_modes = array();
/**
* Tracks open elements while scanning HTML.
*
@@ -70,7 +321,7 @@ class WP_HTML_Processor_State {
*
* @var WP_HTML_Open_Elements
*/
public $stack_of_open_elements = null;
public $stack_of_open_elements;
/**
* Tracks open formatting elements, used to handle mis-nested formatting element tags.
@@ -83,7 +334,7 @@ class WP_HTML_Processor_State {
*
* @var WP_HTML_Active_Formatting_Elements
*/
public $active_formatting_elements = null;
public $active_formatting_elements;
/**
* Refers to the currently-matched tag, if any.
@@ -116,6 +367,67 @@ class WP_HTML_Processor_State {
*/
public $context_node = null;
/**
* The recognized encoding of the input byte stream.
*
* > The stream of code points that comprises the input to the tokenization
* > stage will be initially seen by the user agent as a stream of bytes
* > (typically coming over the network or from the local file system).
* > The bytes encode the actual characters according to a particular character
* > encoding, which the user agent uses to decode the bytes into characters.
*
* @since 6.7.0
*
* @var string|null
*/
public $encoding = null;
/**
* The parser's confidence in the input encoding.
*
* > When the HTML parser is decoding an input byte stream, it uses a character
* > encoding and a confidence. The confidence is either tentative, certain, or
* > irrelevant. The encoding used, and whether the confidence in that encoding
* > is tentative or certain, is used during the parsing to determine whether to
* > change the encoding. If no encoding is necessary, e.g. because the parser is
* > operating on a Unicode stream and doesn't have to use a character encoding
* > at all, then the confidence is irrelevant.
*
* @since 6.7.0
*
* @var string
*/
public $encoding_confidence = 'tentative';
/**
* HEAD element pointer.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/multipage/parsing.html#head-element-pointer
*
* @var WP_HTML_Token|null
*/
public $head_element = null;
/**
* FORM element pointer.
*
* > points to the last form element that was opened and whose end tag has
* > not yet been seen. It is used to make form controls associate with
* > forms in the face of dramatically bad markup, for historical reasons.
* > It is ignored inside template elements.
*
* @todo This may be invalidated by a seek operation.
*
* @see https://html.spec.whatwg.org/#form-element-pointer
*
* @since 6.7.0
*
* @var WP_HTML_Token|null
*/
public $form_element = null;
/**
* The frameset-ok flag indicates if a `FRAMESET` element is allowed in the current state.
*

File diff suppressed because it is too large Load Diff

View File

@@ -49,7 +49,7 @@ class WP_HTML_Span {
* @param int $start Byte offset into document where replacement span begins.
* @param int $length Byte length of span.
*/
public function __construct( $start, $length ) {
public function __construct( int $start, int $length ) {
$this->start = $start;
$this->length = $length;
}

View File

@@ -74,7 +74,7 @@ class WP_HTML_Stack_Event {
* @param string $operation One of self::PUSH or self::POP.
* @param string $provenance "virtual" or "real".
*/
public function __construct( $token, $operation, $provenance ) {
public function __construct( WP_HTML_Token $token, string $operation, string $provenance ) {
$this->token = $token;
$this->operation = $operation;
$this->provenance = $provenance;

File diff suppressed because it is too large Load Diff

View File

@@ -56,7 +56,7 @@ class WP_HTML_Text_Replacement {
* @param int $length Byte length of span in document being replaced.
* @param string $text Span of text to insert in document to replace existing content from start to end.
*/
public function __construct( $start, $length, $text ) {
public function __construct( int $start, int $length, string $text ) {
$this->start = $start;
$this->length = $length;
$this->text = $text;

View File

@@ -60,6 +60,24 @@ class WP_HTML_Token {
*/
public $has_self_closing_flag = false;
/**
* Indicates if the element is an HTML element or if it's inside foreign content.
*
* @since 6.7.0
*
* @var string 'html', 'svg', or 'math'.
*/
public $namespace = 'html';
/**
* Indicates which kind of integration point the element is, if any.
*
* @since 6.7.0
*
* @var string|null 'math', 'html', or null if not an integration point.
*/
public $integration_node_type = null;
/**
* Called when token is garbage-collected or otherwise destroyed.
*
@@ -72,13 +90,15 @@ class WP_HTML_Token {
*
* @since 6.4.0
*
* @param string $bookmark_name Name of bookmark corresponding to location in HTML where token is found.
* @param string $node_name Name of node token represents; if uppercase, an HTML element; if lowercase, a special value like "marker".
* @param bool $has_self_closing_flag Whether the source token contains the self-closing flag, regardless of whether it's valid.
* @param callable $on_destroy Function to call when destroying token, useful for releasing the bookmark.
* @param string|null $bookmark_name Name of bookmark corresponding to location in HTML where token is found,
* or `null` for markers and nodes without a bookmark.
* @param string $node_name Name of node token represents; if uppercase, an HTML element; if lowercase, a special value like "marker".
* @param bool $has_self_closing_flag Whether the source token contains the self-closing flag, regardless of whether it's valid.
* @param callable|null $on_destroy Optional. Function to call when destroying token, useful for releasing the bookmark.
*/
public function __construct( $bookmark_name, $node_name, $has_self_closing_flag, $on_destroy = null ) {
public function __construct( ?string $bookmark_name, string $node_name, bool $has_self_closing_flag, ?callable $on_destroy = null ) {
$this->bookmark_name = $bookmark_name;
$this->namespace = 'html';
$this->node_name = $node_name;
$this->has_self_closing_flag = $has_self_closing_flag;
$this->on_destroy = $on_destroy;

View File

@@ -21,11 +21,95 @@
* operation and signify that the given HTML cannot be processed.
*
* @since 6.4.0
* @since 6.7.0 Gained contextual information for use in debugging parse failures.
*
* @access private
*
* @see WP_HTML_Processor
*/
class WP_HTML_Unsupported_Exception extends Exception {
	/**
	 * Normalized name of the token which was matched at the moment
	 * the exception was raised, if matched on a token.
	 *
	 * The named token is not necessarily unsupported itself: it may only
	 * have triggered a part of HTML parsing which isn't supported, such
	 * as the adoption agency algorithm.
	 *
	 * @since 6.7.0
	 *
	 * @var string
	 */
	public $token_name;

	/**
	 * Byte offset into the input HTML document at which the parser
	 * was working when the exception was raised.
	 *
	 * Helpful for reconstructing the context around the failure.
	 *
	 * @since 6.7.0
	 *
	 * @var int
	 */
	public $token_at;

	/**
	 * Full raw text of the token which was matched at the moment
	 * the exception was raised, if matched on a token.
	 *
	 * Unlike the normalized `$token_name`, this preserves the token exactly
	 * as written: original casing, duplicated attributes, and any other
	 * syntactic variation which the HTML API normally abstracts away.
	 *
	 * @since 6.7.0
	 *
	 * @var string
	 */
	public $token;

	/**
	 * Snapshot of the stack of open elements taken when the exception was raised.
	 *
	 * Helpful for tracing the parsing circumstances which led to the exception.
	 *
	 * @since 6.7.0
	 *
	 * @var string[]
	 */
	public $stack_of_open_elements = array();

	/**
	 * Snapshot of the list of active formatting elements taken when
	 * the exception was raised.
	 *
	 * Helpful for tracing the parsing circumstances which led to the exception.
	 *
	 * @since 6.7.0
	 *
	 * @var string[]
	 */
	public $active_formatting_elements = array();

	/**
	 * Builds the exception and records the parsing context alongside its message.
	 *
	 * @since 6.7.0
	 *
	 * @param string   $message                    Brief explanation of what is unsupported, the reason for raising this exception.
	 * @param string   $token_name                 Normalized name of the matched token when this exception was raised.
	 * @param int      $token_at                   Byte offset into the source HTML document where the matched token starts.
	 * @param string   $token                      Full raw text of the matched token when this exception was raised.
	 * @param string[] $stack_of_open_elements     Stack of open elements when this exception was raised.
	 * @param string[] $active_formatting_elements List of active formatting elements when this exception was raised.
	 */
	public function __construct( string $message, string $token_name, int $token_at, string $token, array $stack_of_open_elements, array $active_formatting_elements ) {
		parent::__construct( $message );

		// Parser state at the time of failure.
		$this->stack_of_open_elements     = $stack_of_open_elements;
		$this->active_formatting_elements = $active_formatting_elements;

		// Details about the token being processed.
		$this->token      = $token;
		$this->token_at   = $token_at;
		$this->token_name = $token_name;
	}
}