. $document = self::convert_amp_bind_attributes( $document ); // Force all self-closing tags to have closing tags since DOMDocument isn't fully aware. $document = preg_replace( '#<(' . implode( '|', self::$self_closing_tags ) . ')[^>]*>(?!)#', '$0', $document ); // Deal with bugs in older versions of libxml. $added_back_compat_meta_content_type = false; if ( version_compare( LIBXML_DOTTED_VERSION, '2.8', '<' ) ) { /* * Replace noscript elements with placeholders since libxml<2.8 can parse them incorrectly. * When appearing in the head element, a noscript can cause the head to close prematurely * and the noscript gets moved to the body and anything after it which was in the head. * See . * This is limited to only running in the head element because this is where the problem lies, * and it is important for the AMP_Script_Sanitizer to be able to access the noscript elements * in the body otherwise. */ $document = preg_replace_callback( '#^.+?(?=]*>.*?#si', function( $noscript_matches ) { $placeholder = sprintf( '', (string) wp_rand() ); AMP_DOM_Utils::$noscript_placeholder_comments[ $placeholder ] = $noscript_matches[0]; return $placeholder; }, $head_matches[0] ); }, $document ); /* * Add a pre-HTML5-style declaration of the encoding since libxml<2.8 doesn't recognize * HTML5's meta charset. See . */ $document = preg_replace( '#(?=', $document, 1, $count ); if ( 1 === $count ) { $added_back_compat_meta_content_type = true; } } /* * Wrap in dummy tags, since XML needs one parent node. * It also makes it easier to loop through nodes. * We can later use this to extract our nodes. * Add charset so loadHTML does not have problems parsing it. */ $result = $dom->loadHTML( $document ); libxml_clear_errors(); libxml_use_internal_errors( $libxml_previous_state ); if ( ! $result ) { return false; } // Remove pre-HTML5-style encoding declaration if added above. 
if ( $added_back_compat_meta_content_type ) {
			$meta_http_equiv_element = $dom->getElementById( 'meta-http-equiv-content-type' );
			if ( $meta_http_equiv_element ) {
				$meta_http_equiv_element->parentNode->removeChild( $meta_http_equiv_element );
			}
		}

		return $dom;
	}

	/**
	 * Get attribute prefix for converted amp-bind attributes.
	 *
	 * This contains a random string to prevent HTML content containing this data- attribute
	 * originally from being mutated to contain an amp-bind attribute when attributes are restored.
	 *
	 * The prefix is computed once and then cached in a static local, so every call made
	 * during the same request returns the same string.
	 *
	 * @since 0.7
	 * @see \AMP_DOM_Utils::convert_amp_bind_attributes()
	 * @see \AMP_DOM_Utils::restore_amp_bind_attributes()
	 * @link https://www.ampproject.org/docs/reference/components/amp-bind
	 *
	 * @return string HTML5 data-* attribute name prefix for AMP binding attributes.
	 */
	public static function get_amp_bind_placeholder_prefix() {
		static $attribute_prefix;

		if ( ! isset( $attribute_prefix ) ) {
			// Explicit string cast mirrors the `(string) wp_rand()` usage elsewhere in this class.
			$attribute_prefix = sprintf( 'amp-binding-%s-', md5( (string) wp_rand() ) );
		}

		return $attribute_prefix;
	}

	/**
	 * Get amp-mustache tag/placeholder mappings.
	 *
	 * Each mustache token is mapped to a placeholder of the form `_amp_mustache_<md5>`, where the
	 * hash is salted with a random value generated on the first call. The mapping is cached in a
	 * static local, so it stays stable for subsequent calls.
	 *
	 * @since 0.7
	 * @see \wpdb::placeholder_escape()
	 *
	 * @return array Mapping of mustache tag token to its placeholder.
	 */
	private static function get_mustache_tag_placeholders() {
		static $placeholders;

		if ( ! isset( $placeholders ) ) {
			$salt = wp_rand();

			// Note: The order of these tokens is important, as it determines the order of the replacements.
			// Longer tokens are listed before the shorter tokens they contain.
			$tokens = array(
				'{{{',
				'}}}',
				'{{#',
				'{{^',
				'{{/',
				'{{',
				'}}',
			);

			$placeholders = array();
			foreach ( $tokens as $token ) {
				$placeholders[ $token ] = '_amp_mustache_' . md5( $salt . $token );
			}
		}

		return $placeholders;
	}

	/**
	 * Replace AMP binding attributes with something that libxml can parse (as HTML5 data-* attributes).
*
	 * This is necessary because attributes in square brackets are not understood in PHP and
	 * get dropped with an error raised:
	 * > Warning: DOMDocument::loadHTML(): error parsing attribute name
	 * This is a reciprocal function of AMP_DOM_Utils::restore_amp_bind_attributes().
	 *
	 * If the HTML cannot be processed (regex engine error, or a tag whose attributes cannot be
	 * fully tokenized), the affected input is returned unchanged rather than partially rewritten.
	 *
	 * @since 0.7
	 * @see \AMP_DOM_Utils::restore_amp_bind_attributes()
	 * @link https://www.ampproject.org/docs/reference/components/amp-bind
	 *
	 * @param string $html HTML containing amp-bind attributes.
	 * @return string HTML with AMP binding attributes replaced with HTML5 data-* attributes.
	 */
	public static function convert_amp_bind_attributes( $html ) {
		$amp_bind_attr_prefix = self::get_amp_bind_placeholder_prefix();

		// Pattern for HTML attribute accounting for binding attr name, boolean attribute, single/double-quoted attribute value, and unquoted attribute values.
		// The named groups "name" and "value" are read by the replace callback below.
		$attr_regex = '#^\s+(?P<name>\[?[a-zA-Z0-9_\-]+\]?)(?P<value>=(?:"[^"]*+"|\'[^\']*+\'|[^\'"\s]+))?#';

		/**
		 * Replace callback.
		 *
		 * Walks the attribute string of one matched start tag, one attribute at a time, and
		 * rewrites every `[name]` attribute to `<prefix>name` while copying all others verbatim.
		 *
		 * @param array $tag_matches Tag matches.
		 * @return string Replacement.
		 */
		$replace_callback = function( $tag_matches ) use ( $amp_bind_attr_prefix, $attr_regex ) {
			$old_attrs = rtrim( $tag_matches['attrs'] );
			$new_attrs = '';
			$offset    = 0;
			while ( preg_match( $attr_regex, substr( $old_attrs, $offset ), $attr_matches ) ) {
				$offset += strlen( $attr_matches[0] );

				if ( '[' === $attr_matches['name'][0] ) {
					$new_attrs .= ' ' . $amp_bind_attr_prefix . trim( $attr_matches['name'], '[]' );
					// The value group is optional (boolean attributes have none).
					if ( isset( $attr_matches['value'] ) ) {
						$new_attrs .= $attr_matches['value'];
					}
				} else {
					$new_attrs .= $attr_matches[0];
				}
			}

			// Bail on parse error which occurs when the regex isn't able to consume the entire $old_attrs string.
			if ( strlen( $old_attrs ) !== $offset ) {
				return $tag_matches[0];
			}

			return '<' . $tag_matches['name'] . $new_attrs . '>';
		};

		// Match all start tags that contain a binding attribute.
		$pattern = join(
			'',
			array(
				'#<',
				'(?P<name>[a-zA-Z0-9_\-]+)',               // Tag name.
				'(?P<attrs>\s',                            // Attributes.
				'(?:[^>"\'\[\]]+|"[^"]*+"|\'[^\']*+\')*+', // Non-binding attributes tokens.
				'\[[a-zA-Z0-9_\-]+\]',                     // One binding attribute key.
				'(?:[^>"\']+|"[^"]*+"|\'[^\']*+\')*+',     // Any attribute tokens, including binding ones.
				')>#s',
			)
		);

		$converted = preg_replace_callback( $pattern, $replace_callback, $html );

		/**
		 * If the regex engine incurred an error during processing, for example exceeding the backtrack
		 * limit, $converted will be null. In this case we return the originally passed document to allow
		 * DOMDocument to attempt to load it. If the AMP HTML doesn't make use of amp-bind or similar
		 * attributes, then everything should still work.
		 *
		 * See https://github.com/ampproject/amp-wp/issues/993 for additional context on this issue.
		 * See http://php.net/manual/en/pcre.constants.php for additional info on PCRE errors.
		 */
		return ( ! is_null( $converted ) ) ? $converted : $html;
	}

	/**
	 * Convert AMP bind-attributes back to their original syntax.
	 *
	 * This is a reciprocal function of AMP_DOM_Utils::convert_amp_bind_attributes().
	 *
	 * Every occurrence of whitespace followed by the placeholder prefix and an attribute name is
	 * replaced with a single space and the name wrapped in square brackets.
	 *
	 * @since 0.7
	 * @see \AMP_DOM_Utils::convert_amp_bind_attributes()
	 * @link https://www.ampproject.org/docs/reference/components/amp-bind
	 *
	 * @param string $html HTML with amp-bind attributes converted.
	 * @return string HTML with amp-bind attributes restored.
	 */
	public static function restore_amp_bind_attributes( $html ) {
		$html = preg_replace(
			'#\s' . self::get_amp_bind_placeholder_prefix() . '([a-zA-Z0-9_\-]+)#',
			' [$1]',
			$html
		);
		return $html;
	}

	/**
	 * Return a valid DOMDocument representing arbitrary HTML content passed as a parameter.
	 *
	 * @see Reciprocal function get_content_from_dom()
	 *
	 * @since 0.2
	 *
	 * @param string $content Valid HTML content to be represented by a DOMDocument.
	 *
	 * @return DOMDocument|false Returns DOMDocument, or false if conversion failed.
	 */
	public static function get_dom_from_content( $content ) {
		/*
		 * Wrap in dummy tags, since XML needs one parent node.
		 * It also makes it easier to loop through nodes.
* We can later use this to extract our nodes.
		 * Add utf-8 charset so loadHTML does not have problems parsing it.
		 * See: http://php.net/manual/en/domdocument.loadhtml.php#78243
		 */
		// The format needs two placeholders: the first receives the site charset, the second the content.
		$document = sprintf(
			'<html><head><meta http-equiv="content-type" content="text/html; charset=%s"></head><body>%s</body></html>',
			get_bloginfo( 'charset' ),
			$content
		);

		return self::get_dom( $document );
	}

	/**
	 * Return valid HTML *body* content extracted from the DOMDocument passed as a parameter.
	 *
	 * @since 0.2
	 * @see AMP_DOM_Utils::get_content_from_dom_node() Reciprocal function.
	 *
	 * @param DOMDocument $dom Represents an HTML document from which to extract HTML content.
	 * @return string Returns the HTML content of the body element represented in the DOMDocument.
	 *                Empty string when the document has no body element.
	 */
	public static function get_content_from_dom( $dom ) {
		$body = $dom->getElementsByTagName( 'body' )->item( 0 );

		// The DOMDocument may contain no body. In which case return nothing.
		if ( is_null( $body ) ) {
			return '';
		}

		// Strip the wrapping body start tag and end tag so that only the inner HTML is returned.
		return preg_replace(
			'#^.*?<body.*?>(.*)</body>.*?$#si',
			'$1',
			self::get_content_from_dom_node( $dom, $body )
		);
	}

	/**
	 * Return valid HTML content extracted from the DOMNode passed as a parameter.
	 *
	 * @since 0.6
	 * @see AMP_DOM_Utils::get_dom() Where the operations in this method are mirrored.
	 * @see AMP_DOM_Utils::get_content_from_dom() Reciprocal function.
	 * @todo In the future consider an AMP_DOMDocument subclass that does this automatically at saveHTML(). See .
	 *
	 * @param DOMDocument $dom  Represents an HTML document.
	 * @param DOMElement  $node Represents an HTML element of the $dom from which to extract HTML content.
	 * @return string Returns the HTML content represented in the DOMNode
	 */
	public static function get_content_from_dom_node( $dom, $node ) {
		/**
		 * Self closing tags regex.
		 *
		 * @var string Regular expression to match self-closing tags
		 *      that saveXML() has generated a closing tag for.
		 */
		static $self_closing_tags_regex;

		/*
		 * Cache this regex so we don't have to recreate it every call.
		 */
		if ( !
isset( $self_closing_tags_regex ) ) { $self_closing_tags = implode( '|', self::$self_closing_tags ); $self_closing_tags_regex = "##i"; } /* * Prevent amp-mustache syntax from getting URL-encoded in attributes when saveHTML is done. * While this is applying to the entire document, it only really matters inside of