PDF rausgenommen
This commit is contained in:
@ -0,0 +1,624 @@
|
||||
<?php
|
||||
/**
|
||||
* Class AMP_DOM_Utils.
|
||||
*
|
||||
* @package AMP
|
||||
*/
|
||||
|
||||
/**
|
||||
* Class AMP_DOM_Utils
|
||||
*
|
||||
* Functionality to simplify working with DOMDocuments and DOMElements.
|
||||
*/
|
||||
class AMP_DOM_Utils {
|
||||
|
||||
/**
|
||||
* HTML elements that are self-closing.
|
||||
*
|
||||
* Not all are valid AMP, but we include them for completeness.
|
||||
*
|
||||
* @since 0.7
|
||||
* @link https://www.w3.org/TR/html5/syntax.html#serializing-html-fragments
|
||||
* @var array
|
||||
*/
|
||||
private static $self_closing_tags = array(
|
||||
'area',
|
||||
'base',
|
||||
'basefont',
|
||||
'bgsound',
|
||||
'br',
|
||||
'col',
|
||||
'embed',
|
||||
'frame',
|
||||
'hr',
|
||||
'img',
|
||||
'input',
|
||||
'keygen',
|
||||
'link',
|
||||
'meta',
|
||||
'param',
|
||||
'source',
|
||||
'track',
|
||||
'wbr',
|
||||
);
|
||||
|
||||
/**
 * Stored noscript/comment replacements for libxml<2.8.
 *
 * Maps each placeholder comment (of the form `<!--noscript:N-->`) that get_dom() substitutes
 * into the head of a document to the original `<noscript>...</noscript>` markup it replaced.
 * get_content_from_dom_node() reads this map to put the original markup back when serializing.
 * Entries are only ever added; nothing in this class removes them.
 *
 * NOTE(review): presumably public because the closure in get_dom() writes to it via
 * AMP_DOM_Utils:: rather than self:: - confirm before narrowing the visibility.
 *
 * @since 0.7
 * @var array
 */
|
||||
public static $noscript_placeholder_comments = array();
|
||||
|
||||
/**
 * Return a valid DOMDocument representing HTML document passed as a parameter.
 *
 * The markup is pre-processed before it is handed to libxml:
 * - amp-bind attributes (`[attr]`) are converted to data-style placeholders,
 * - void elements are given explicit closing tags,
 * - for libxml<2.8, noscript elements in the head are swapped for placeholder comments and
 *   a pre-HTML5 Content-Type meta element is injected (and removed again after parsing).
 *
 * If any of the regex-based pre-processing steps fails (PCRE returns null), that step is
 * skipped and parsing continues with the markup as it was before the step.
 *
 * @since 0.7
 * @see AMP_DOM_Utils::get_content_from_dom_node()
 *
 * @param string $document Valid HTML document to be represented by a DOMDocument.
 * @return DOMDocument|false Returns DOMDocument, or false if conversion failed.
 */
public static function get_dom( $document ) {
	$libxml_previous_state = libxml_use_internal_errors( true );

	$dom = new DOMDocument();

	// @todo In the future consider an AMP_DOMDocument subclass that does this automatically. See <https://github.com/ampproject/amp-wp/pull/895/files#r163825513>.
	$document = self::convert_amp_bind_attributes( $document );

	/*
	 * Force all self-closing tags to have closing tags since DOMDocument isn't fully aware.
	 * The lookahead after the tag name requires the name to end there, so that elements which
	 * merely start with a void element's name (colgroup, frameset, ...) are left alone.
	 */
	$with_closing_tags = preg_replace(
		'#<(' . implode( '|', self::$self_closing_tags ) . ')(?=[\s/>])[^>]*>(?!</\1>)#',
		'$0</$1>',
		$document
	);
	if ( null !== $with_closing_tags ) {
		$document = $with_closing_tags;
	}

	// Deal with bugs in older versions of libxml.
	$added_back_compat_meta_content_type = false;
	if ( version_compare( LIBXML_DOTTED_VERSION, '2.8', '<' ) ) {
		/*
		 * Replace noscript elements with placeholders since libxml<2.8 can parse them incorrectly.
		 * When appearing in the head element, a noscript can cause the head to close prematurely
		 * and the noscript gets moved to the body and anything after it which was in the head.
		 * See <https://stackoverflow.com/questions/39013102/why-does-noscript-move-into-body-tag-instead-of-head-tag>.
		 * This is limited to only running in the head element because this is where the problem lies,
		 * and it is important for the AMP_Script_Sanitizer to be able to access the noscript elements
		 * in the body otherwise.
		 */
		$without_head_noscript = preg_replace_callback(
			'#^.+?(?=<body)#is',
			function( $head_matches ) {
				return preg_replace_callback(
					'#<noscript[^>]*>.*?</noscript>#si',
					function( $noscript_matches ) {
						$placeholder = sprintf( '<!--noscript:%s-->', (string) wp_rand() );
						AMP_DOM_Utils::$noscript_placeholder_comments[ $placeholder ] = $noscript_matches[0];
						return $placeholder;
					},
					$head_matches[0]
				);
			},
			$document
		);
		if ( null !== $without_head_noscript ) {
			$document = $without_head_noscript;
		}

		/*
		 * Add a pre-HTML5-style declaration of the encoding since libxml<2.8 doesn't recognize
		 * HTML5's meta charset. See <https://bugzilla.gnome.org/show_bug.cgi?id=655218>.
		 */
		$count             = 0;
		$with_content_type = preg_replace(
			'#(?=<meta\s+charset=["\']?([a-z0-9_-]+))#i',
			'<meta http-equiv="Content-Type" content="text/html; charset=$1" id="meta-http-equiv-content-type">',
			$document,
			1,
			$count
		);
		if ( null !== $with_content_type && 1 === $count ) {
			$document                            = $with_content_type;
			$added_back_compat_meta_content_type = true;
		}
	}

	/*
	 * Parse the pre-processed markup. libxml errors are collected internally (see the
	 * libxml_use_internal_errors() call above) and discarded below. An empty document is
	 * treated as a failed conversion without calling loadHTML(), which rejects empty input.
	 */
	$result = ( '' === (string) $document ) ? false : $dom->loadHTML( $document );

	libxml_clear_errors();
	libxml_use_internal_errors( $libxml_previous_state );

	if ( ! $result ) {
		return false;
	}

	// Remove pre-HTML5-style encoding declaration if added above.
	if ( $added_back_compat_meta_content_type ) {
		$meta_http_equiv_element = $dom->getElementById( 'meta-http-equiv-content-type' );
		if ( $meta_http_equiv_element ) {
			$meta_http_equiv_element->parentNode->removeChild( $meta_http_equiv_element );
		}
	}

	return $dom;
}
|
||||
|
||||
/**
 * Get attribute prefix for converted amp-bind attributes.
 *
 * The prefix embeds a random hash so that content which already happens to contain a
 * similarly named attribute is not turned into an amp-bind attribute when the
 * placeholders are restored. The value is generated on first use and then reused for
 * every later call in the same request.
 *
 * @since 0.7
 * @see \AMP_DOM_Utils::convert_amp_bind_attributes()
 * @see \AMP_DOM_Utils::restore_amp_bind_attributes()
 * @link https://www.ampproject.org/docs/reference/components/amp-bind
 *
 * @return string HTML5 data-* attribute name prefix for AMP binding attributes.
 */
public static function get_amp_bind_placeholder_prefix() {
	static $cached_prefix = null;

	if ( null === $cached_prefix ) {
		$random_hash   = md5( wp_rand() );
		$cached_prefix = sprintf( 'amp-binding-%s-', $random_hash );
	}

	return $cached_prefix;
}
|
||||
|
||||
/**
 * Get amp-mustache tag/placeholder mappings.
 *
 * Each Mustache delimiter token is mapped to a placeholder string built from a salt that
 * is generated once per request, so the mapping is stable across calls.
 *
 * @since 0.7
 * @see \wpdb::placeholder_escape()
 *
 * @return array Mapping of mustache tag token to its placeholder.
 */
private static function get_mustache_tag_placeholders() {
	static $placeholders;
	if ( ! isset( $placeholders ) ) {
		$salt = wp_rand();

		// Note: The order of these tokens is important, as it determines the order of the replacements
		// (longer tokens must be replaced before the shorter tokens they contain).
		$tokens = array(
			'{{{',
			'}}}',
			'{{#',
			'{{^',
			'{{/',
			'{{',
			'}}',
		);

		$placeholders = array();
		foreach ( $tokens as $token ) {
			$placeholders[ $token ] = '_amp_mustache_' . md5( $salt . $token );
		}
	}
	return $placeholders;
}
|
||||
|
||||
/**
 * Replace AMP binding attributes with something that libxml can parse (as HTML5 data-* attributes).
 *
 * This is necessary because attributes in square brackets are not understood in PHP and
 * get dropped with an error raised:
 * > Warning: DOMDocument::loadHTML(): error parsing attribute name
 * This is a reciprocal function of AMP_DOM_Utils::restore_amp_bind_attributes().
 *
 * Only start tags that contain at least one `[name]` attribute are rewritten. A tag whose
 * attribute list cannot be fully parsed is returned unchanged. A trailing self-closing
 * slash (as in `<input [value]="x" />`) is set aside before the attributes are parsed and
 * re-appended afterwards, so such tags are converted as well.
 *
 * @since 0.7
 * @see \AMP_DOM_Utils::restore_amp_bind_attributes()
 * @link https://www.ampproject.org/docs/reference/components/amp-bind
 *
 * @param string $html HTML containing amp-bind attributes.
 * @return string HTML with AMP binding attributes replaced with HTML5 data-* attributes.
 */
public static function convert_amp_bind_attributes( $html ) {
	$amp_bind_attr_prefix = self::get_amp_bind_placeholder_prefix();

	// Pattern for HTML attribute accounting for binding attr name, boolean attribute, single/double-quoted attribute value, and unquoted attribute values.
	$attr_regex = '#^\s+(?P<name>\[?[a-zA-Z0-9_\-]+\]?)(?P<value>=(?:"[^"]*+"|\'[^\']*+\'|[^\'"\s]+))?#';

	/**
	 * Replace callback.
	 *
	 * @param array $tag_matches Tag matches.
	 * @return string Replacement.
	 */
	$replace_callback = function( $tag_matches ) use ( $amp_bind_attr_prefix, $attr_regex ) {
		$old_attrs    = rtrim( $tag_matches['attrs'] );
		$self_closing = '';

		/*
		 * Set aside a trailing self-closing slash so it does not cause a parse error below.
		 * The slash is only treated as such when it follows whitespace or a closing quote;
		 * after anything else it may be part of an unquoted attribute value (e.g. href=/).
		 */
		if ( preg_match( '#(?<=[\s"\'])/\z#', $old_attrs ) ) {
			$attrs_without_slash = rtrim( substr( $old_attrs, 0, -1 ) );
			$self_closing        = substr( $old_attrs, strlen( $attrs_without_slash ) );
			$old_attrs           = $attrs_without_slash;
		}

		$new_attrs = '';
		$offset    = 0;
		while ( preg_match( $attr_regex, substr( $old_attrs, $offset ), $attr_matches ) ) {
			$offset += strlen( $attr_matches[0] );

			if ( '[' === $attr_matches['name'][0] ) {
				$new_attrs .= ' ' . $amp_bind_attr_prefix . trim( $attr_matches['name'], '[]' );
				if ( isset( $attr_matches['value'] ) ) {
					$new_attrs .= $attr_matches['value'];
				}
			} else {
				$new_attrs .= $attr_matches[0];
			}
		}

		// Bail on parse error which occurs when the regex isn't able to consume the entire $old_attrs string.
		if ( strlen( $old_attrs ) !== $offset ) {
			return $tag_matches[0];
		}

		return '<' . $tag_matches['name'] . $new_attrs . $self_closing . '>';
	};

	// Match all start tags that contain a binding attribute.
	$pattern = join(
		'',
		array(
			'#<',
			'(?P<name>[a-zA-Z0-9_\-]+)', // Tag name.
			'(?P<attrs>\s', // Attributes.
			'(?:[^>"\'\[\]]+|"[^"]*+"|\'[^\']*+\')*+', // Non-binding attributes tokens.
			'\[[a-zA-Z0-9_\-]+\]', // One binding attribute key.
			'(?:[^>"\']+|"[^"]*+"|\'[^\']*+\')*+', // Any attribute tokens, including binding ones.
			')>#s',
		)
	);

	$converted = preg_replace_callback(
		$pattern,
		$replace_callback,
		$html
	);

	/**
	 * If the regex engine incurred an error during processing, for example exceeding the backtrack
	 * limit, $converted will be null. In this case we return the originally passed document to allow
	 * DOMDocument to attempt to load it. If the AMP HTML doesn't make use of amp-bind or similar
	 * attributes, then everything should still work.
	 *
	 * See https://github.com/ampproject/amp-wp/issues/993 for additional context on this issue.
	 * See http://php.net/manual/en/pcre.constants.php for additional info on PCRE errors.
	 */
	return ( ! is_null( $converted ) ) ? $converted : $html;
}
|
||||
|
||||
/**
 * Convert AMP bind-attributes back to their original syntax.
 *
 * Every whitespace-preceded attribute name that starts with the placeholder prefix is
 * rewritten to the bracketed `[name]` form. This is a reciprocal function of
 * AMP_DOM_Utils::convert_amp_bind_attributes().
 *
 * @since 0.7
 * @see \AMP_DOM_Utils::convert_amp_bind_attributes()
 * @link https://www.ampproject.org/docs/reference/components/amp-bind
 *
 * @param string $html HTML with amp-bind attributes converted.
 * @return string HTML with amp-bind attributes restored.
 */
public static function restore_amp_bind_attributes( $html ) {
	$placeholder_pattern = '#\s' . self::get_amp_bind_placeholder_prefix() . '([a-zA-Z0-9_\-]+)#';

	return preg_replace( $placeholder_pattern, ' [$1]', $html );
}
|
||||
|
||||
/**
 * Return a valid DOMDocument representing arbitrary HTML content passed as a parameter.
 *
 * The content is embedded in the body of a minimal HTML document whose head declares the
 * site charset, and that document is then parsed with AMP_DOM_Utils::get_dom().
 *
 * @see Reciprocal function get_content_from_dom()
 *
 * @since 0.2
 *
 * @param string $content Valid HTML content to be represented by a DOMDocument.
 *
 * @return DOMDocument|false Returns DOMDocument, or false if conversion failed.
 */
public static function get_dom_from_content( $content ) {
	/*
	 * The wrapper gives the parser a single root element and a body to extract the nodes
	 * from later on. The charset declaration keeps loadHTML from misreading the encoding.
	 * See: http://php.net/manual/en/domdocument.loadhtml.php#78243
	 */
	$wrapper_format = '<html><head><meta http-equiv="content-type" content="text/html; charset=%s"></head><body>%s</body></html>';
	$site_charset   = get_bloginfo( 'charset' );

	return self::get_dom( sprintf( $wrapper_format, $site_charset, $content ) );
}
|
||||
|
||||
/**
 * Return valid HTML *body* content extracted from the DOMDocument passed as a parameter.
 *
 * The body element is serialized via AMP_DOM_Utils::get_content_from_dom_node() and the
 * surrounding body tags are then stripped so only the inner markup remains.
 *
 * @since 0.2
 * @see AMP_DOM_Utils::get_content_from_dom_node() Used to serialize the body element.
 *
 * @param DOMDocument $dom Represents an HTML document from which to extract HTML content.
 * @return string Returns the HTML content of the body element represented in the DOMDocument.
 */
public static function get_content_from_dom( $dom ) {
	$body_elements = $dom->getElementsByTagName( 'body' );
	$body          = $body_elements->item( 0 );

	// A document without a body has no content to return.
	if ( null === $body ) {
		return '';
	}

	$serialized_body = self::get_content_from_dom_node( $dom, $body );

	return preg_replace( '#^.*?<body.*?>(.*)</body>.*?$#si', '$1', $serialized_body );
}
|
||||
|
||||
|
||||
/**
 * Return valid HTML content extracted from the DOMNode passed as a parameter.
 *
 * Serializes the node and then undoes the substitutions that were made so the markup
 * could pass through libxml: Mustache placeholders, noscript placeholder comments
 * (libxml<2.8 only), amp-bind attribute placeholders, and closing tags emitted for void
 * elements.
 *
 * @since 0.6
 * @see AMP_DOM_Utils::get_dom() Where the operations in this method are mirrored.
 * @see AMP_DOM_Utils::get_content_from_dom() Reciprocal function.
 * @todo In the future consider an AMP_DOMDocument subclass that does this automatically at saveHTML(). See <https://github.com/ampproject/amp-wp/pull/895/files#r163825513>.
 *
 * @param DOMDocument $dom  Represents an HTML document.
 * @param DOMElement  $node Represents an HTML element of the $dom from which to extract HTML content.
 * @return string Returns the HTML content represented in the DOMNode
 */
public static function get_content_from_dom_node( $dom, $node ) {
	/**
	 * Closing tags regex.
	 *
	 * @var string Regular expression matching the closing tags of self-closing elements;
	 *             built once and reused on later calls.
	 */
	static $closing_tags_pattern = null;

	if ( null === $closing_tags_pattern ) {
		$closing_tags_pattern = '#</(' . implode( '|', self::$self_closing_tags ) . ')>#i';
	}

	/*
	 * Prevent amp-mustache syntax from getting URL-encoded in attributes when saveHTML is done.
	 * Inside each <template> element, the Mustache tokens found in src, href and action
	 * attributes are swapped for placeholders here and swapped back after serialization.
	 */
	$placeholder_map       = self::get_mustache_tag_placeholders();
	$mustache_tokens       = array_keys( $placeholder_map );
	$mustache_placeholders = array_values( $placeholder_map );
	$has_placeholders      = false;
	$xpath                 = new DOMXPath( $dom );

	foreach ( $dom->getElementsByTagName( 'template' ) as $template ) {

		// These attributes are the only ones that saveHTML() will URL-encode.
		$url_attributes = $xpath->query( './/*/@src|.//*/@href|.//*/@action', $template );
		foreach ( $url_attributes as $attribute ) {
			$replaced_count       = 0;
			$attribute->nodeValue = str_replace(
				$mustache_tokens,
				$mustache_placeholders,
				$attribute->nodeValue,
				$replaced_count
			);
			if ( $replaced_count > 0 ) {
				$has_placeholders = true;
			}
		}
	}

	if ( version_compare( PHP_VERSION, '7.3', '<' ) ) {
		/*
		 * Before PHP 7.3, passing an element to DOMDocument::saveHTML() adds formatting
		 * whitespace unexpectedly; whitespace is only preserved when the entire document is
		 * serialized. So the node is temporarily fenced with boundary comments, the whole
		 * document is serialized, and the part between the boundaries is cut out.
		 */

		/*
		 * First make sure meta[charset] gets http-equiv and content attributes to work around issue
		 * with $dom->saveHTML() erroneously encoding UTF-8 as HTML entities.
		 */
		$meta_charset = $xpath->query( '/html/head/meta[ @charset ]' )->item( 0 );
		if ( $meta_charset ) {
			$meta_charset->setAttribute( 'http-equiv', 'Content-Type' );
			$meta_charset->setAttribute( 'content', sprintf( 'text/html; charset=%s', $meta_charset->getAttribute( 'charset' ) ) );
		}

		$boundary       = 'fragment_boundary:' . (string) wp_rand();
		$start_boundary = $boundary . ':start';
		$end_boundary   = $boundary . ':end';
		$start_comment  = $dom->createComment( $start_boundary );
		$end_comment    = $dom->createComment( $end_boundary );

		$node->parentNode->insertBefore( $start_comment, $node );
		$node->parentNode->insertBefore( $end_comment, $node->nextSibling );

		$quoted_start     = preg_quote( "<!--$start_boundary-->", '/' );
		$quoted_end       = preg_quote( "<!--$end_boundary-->", '/' );
		$fragment_pattern = '/^.*?' . $quoted_start . '(.*)' . $quoted_end . '.*?\s*$/s';
		$html             = preg_replace( $fragment_pattern, '$1', $dom->saveHTML() );

		// Remove meta[http-equiv] and meta[content] attributes which were added to meta[charset] for HTML serialization.
		if ( $meta_charset ) {
			if ( $dom->documentElement === $node ) {
				$html = preg_replace( '#(<meta\scharset=\S+)[^<]*?>#i', '$1>', $html );
			}

			$meta_charset->removeAttribute( 'http-equiv' );
			$meta_charset->removeAttribute( 'content' );
		}

		$node->parentNode->removeChild( $start_comment );
		$node->parentNode->removeChild( $end_comment );
	} else {
		$html = $dom->saveHTML( $node );
	}

	// Output that consists of nothing but whitespace is normalized to an empty string.
	if ( '' === trim( $html ) ) {
		return '';
	}

	// Restore amp-mustache placeholders which were replaced to prevent URL-encoded corruption by saveHTML.
	if ( $has_placeholders ) {
		$html = str_replace( $mustache_placeholders, $mustache_tokens, $html );
	}

	// Restore noscript elements which were temporarily removed to prevent libxml<2.8 parsing problems.
	if ( version_compare( LIBXML_DOTTED_VERSION, '2.8', '<' ) ) {
		$noscript_placeholders = array_keys( self::$noscript_placeholder_comments );
		$noscript_originals    = array_values( self::$noscript_placeholder_comments );
		$html                  = str_replace( $noscript_placeholders, $noscript_originals, $html );
	}

	$html = self::restore_amp_bind_attributes( $html );

	/*
	 * Some environments have been seen to serialize void elements with an explicit closing
	 * tag (e.g. <br></br>, <hr></hr>, <source ...></source>) even though LIBXML_NOEMPTYTAG
	 * is not passed. Strip those closing tags so the output is the same everywhere.
	 */
	return preg_replace( $closing_tags_pattern, '', $html );
}
|
||||
|
||||
/**
 * Create a new node w/attributes (a DOMElement) and add to the passed DOMDocument.
 *
 * The element is created by the document but is not inserted anywhere in its tree;
 * that is left to the caller.
 *
 * @since 0.2
 *
 * @param DOMDocument $dom        A representation of an HTML document to add the new node to.
 * @param string      $tag        A valid HTML element tag for the element to be added.
 * @param string[]    $attributes One of more valid attributes for the new node.
 *
 * @return DOMElement|false The DOMElement for the given $tag, or false on failure
 */
public static function create_node( $dom, $tag, $attributes ) {
	$node = $dom->createElement( $tag );

	// createElement() may report failure by returning false; honor the documented contract
	// instead of trying to set attributes on a non-element.
	if ( ! $node instanceof DOMElement ) {
		return false;
	}

	self::add_attributes_to_node( $node, $attributes );

	return $node;
}
|
||||
|
||||
/**
 * Extract a DOMElement node's HTML element attributes and return as an array.
 *
 * @since 0.2
 *
 * @param DOMElement $node Represents an HTML element for which to extract attributes.
 *
 * @return string[] Attribute values keyed by attribute name, or an empty array if the
 *                  node has no attributes.
 */
public static function get_node_attributes_as_assoc_array( $node ) {
	$attribute_map = array();

	if ( $node->hasAttributes() ) {
		foreach ( $node->attributes as $attribute_node ) {
			$attribute_map[ $attribute_node->nodeName ] = $attribute_node->nodeValue;
		}
	}

	return $attribute_map;
}
|
||||
|
||||
/**
 * Add one or more HTML element attributes to a node's DOMElement.
 *
 * Each key of $attributes is used as the attribute name and each value as the
 * attribute value.
 *
 * @since 0.2
 *
 * @param DOMElement $node       Represents an HTML element.
 * @param string[]   $attributes One or more attributes for the node's HTML element.
 */
public static function add_attributes_to_node( $node, $attributes ) {
	foreach ( $attributes as $attribute_name => $attribute_value ) {
		$node->setAttribute( $attribute_name, $attribute_value );
	}
}
|
||||
|
||||
/**
 * Determines if a DOMElement's node is empty or not.
 *
 * @since 0.2
 *
 * @param DOMElement $node Represents an HTML element.
 * @return bool Returns true if the DOMElement has no child nodes and
 *              the textContent property of the DOMElement is empty;
 *              Otherwise it returns false.
 */
public static function is_node_empty( $node ) {
	// Any child node at all means the element is not empty.
	if ( false !== $node->hasChildNodes() ) {
		return false;
	}

	return empty( $node->textContent );
}
|
||||
|
||||
/**
 * Forces HTML element closing tags given a DOMDocument and optional DOMElement
 *
 * Walks the tree below $node: self-closing elements get their text content cleared,
 * empty elements get an empty text node appended, and all other elements are descended
 * into, last child first.
 *
 * @since 0.2
 * @deprecated
 *
 * @param DOMDocument $dom  Represents HTML document on which to force closing tags.
 * @param DOMElement  $node Represents HTML element to start closing tags on.
 *                          If not passed, defaults to first child of body.
 */
public static function recursive_force_closing_tags( $dom, $node = null ) {
	_deprecated_function( __METHOD__, '0.7' );

	if ( null === $node ) {
		$node = $dom->getElementsByTagName( 'body' )->item( 0 );
	}

	// Only element nodes are processed.
	if ( XML_ELEMENT_NODE !== $node->nodeType ) {
		return;
	}

	if ( self::is_self_closing_tag( $node->nodeName ) ) {
		// Ensure there is no text content to accidentally force a child.
		$node->textContent = null;
		return;
	}

	if ( self::is_node_empty( $node ) ) {
		$node->appendChild( $dom->createTextNode( '' ) );
		return;
	}

	// Recurse into the children from the last to the first.
	for ( $remaining = $node->childNodes->length; $remaining > 0; $remaining-- ) {
		self::recursive_force_closing_tags( $dom, $node->childNodes->item( $remaining - 1 ) );
	}
}
|
||||
|
||||
/**
 * Determines if an HTML element tag is validly a self-closing tag per W3C HTML5 specs.
 *
 * The tag name is lower-cased and looked up in the class's list of self-closing tags.
 *
 * @since 0.2
 *
 * @param string $tag Tag.
 * @return bool Returns true if a valid self-closing tag, false if not.
 */
private static function is_self_closing_tag( $tag ) {
	$lowercase_tag = strtolower( $tag );

	return in_array( $lowercase_tag, self::$self_closing_tags, true );
}
|
||||
}
|
@ -0,0 +1,62 @@
|
||||
<?php
|
||||
/**
|
||||
* Class AMP_HTML_Utils
|
||||
*
|
||||
* @package AMP
|
||||
*/
|
||||
|
||||
/**
 * Class with static HTML utility methods.
 */
class AMP_HTML_Utils {

	/**
	 * Generates HTML markup for a given tag, attributes and content.
	 *
	 * The tag name is passed through sanitize_key(); the content is inserted as given,
	 * without any escaping, so callers must pass markup that is already safe to output.
	 * The attribute string is always preceded by a single space, even when it is empty.
	 *
	 * @param string $tag_name   Tag name.
	 * @param array  $attributes Associative array of $attribute => $value pairs.
	 * @param string $content    Inner content for the generated node.
	 * @return string HTML markup.
	 */
	public static function build_tag( $tag_name, $attributes = array(), $content = '' ) {
		$attr_string = self::build_attributes_string( $attributes );
		return sprintf( '<%1$s %2$s>%3$s</%1$s>', sanitize_key( $tag_name ), $attr_string, $content );
	}

	/**
	 * Generates a HTML attributes string from given attributes.
	 *
	 * Attribute names are passed through sanitize_key() and values through esc_attr().
	 * An attribute whose value is exactly the empty string is rendered as a bare
	 * (boolean) attribute without a value.
	 *
	 * @param array $attributes Associative array of $attribute => $value pairs.
	 * @return string HTML attributes string.
	 */
	public static function build_attributes_string( $attributes ) {
		$string = array();
		foreach ( $attributes as $name => $value ) {
			if ( '' === $value ) {
				$string[] = sprintf( '%s', sanitize_key( $name ) );
			} else {
				$string[] = sprintf( '%s="%s"', sanitize_key( $name ), esc_attr( $value ) );
			}
		}
		return implode( ' ', $string );
	}

	/**
	 * Checks whether the given string is valid JSON.
	 *
	 * Only the empty string (and values that cannot be represented as a string) are
	 * rejected up front; in particular the string '0' is a valid JSON document and is
	 * passed on to the decoder rather than being discarded as "empty".
	 *
	 * @param string $data String hopefully containing JSON.
	 * @return bool True if the string is valid JSON, false otherwise.
	 */
	public static function is_valid_json( $data ) {
		$is_stringable_object = is_object( $data ) && method_exists( $data, '__toString' );
		if ( ! is_scalar( $data ) && ! $is_stringable_object ) {
			return false;
		}

		$json = (string) $data;
		if ( '' === $json ) {
			return false;
		}

		$decoded = json_decode( $json );
		if ( function_exists( 'json_last_error' ) ) {
			return ( json_last_error() === JSON_ERROR_NONE );
		}

		// For PHP 5.2 back-compatibility.
		return null !== $decoded;
	}
}
|
@ -0,0 +1,285 @@
|
||||
<?php
|
||||
/**
|
||||
* Class AMP_Image_Dimension_Extractor
|
||||
*
|
||||
* @package AMP
|
||||
*/
|
||||
|
||||
/**
|
||||
* Class with static methods to extract image dimensions.
|
||||
*/
|
||||
class AMP_Image_Dimension_Extractor {
|
||||
|
||||
const STATUS_FAILED_LAST_ATTEMPT = 'failed';
|
||||
const STATUS_IMAGE_EXTRACTION_FAILED = 'failed';
|
||||
|
||||
/**
|
||||
* Internal flag whether callbacks have been registered.
|
||||
*
|
||||
* @var bool
|
||||
*/
|
||||
private static $callbacks_registered = false;
|
||||
|
||||
/**
 * Extracts dimensions from image URLs.
 *
 * The URLs are normalized, handed as a batch to the 'amp_extract_image_dimensions_batch'
 * filter, and the results are mapped back to the URLs as they were originally passed.
 * URLs that cannot be normalized, and URLs for which the filter result contains no entry,
 * are reported as false.
 *
 * @since 0.2
 *
 * @param array|string $urls Array of URLs to extract dimensions from, or a single URL string.
 * @return array|string Extracted dimensions keyed by original URL, or else the single set of dimensions if one URL string is passed.
 */
public static function extract( $urls ) {
	if ( ! self::$callbacks_registered ) {
		self::register_callbacks();
	}

	$return_dimensions = array();

	// Back-compat for users calling this method directly.
	$is_single = is_string( $urls );
	if ( $is_single ) {
		$urls = array( $urls );
	}

	// Normalize URLs and also track a map of normalized-to-original as we'll need it to reformat things when returning the data.
	$url_map         = array();
	$normalized_urls = array();
	foreach ( $urls as $original_url ) {
		$normalized_url = self::normalize_url( $original_url );
		if ( false !== $normalized_url ) {
			$url_map[ $original_url ] = $normalized_url;
			$normalized_urls[]        = $normalized_url;
		} else {
			// This is not a URL we can extract dimensions from, so default to false.
			$return_dimensions[ $original_url ] = false;
		}
	}

	$extracted_dimensions = array_fill_keys( $normalized_urls, false );
	$extracted_dimensions = apply_filters( 'amp_extract_image_dimensions_batch', $extracted_dimensions );

	// A filter callback is outside of our control: tolerate a non-array result.
	if ( ! is_array( $extracted_dimensions ) ) {
		$extracted_dimensions = array();
	}

	// We need to return a map with the original (un-normalized) URL, as we need that to match nodes that need dimensions.
	foreach ( $url_map as $original_url => $normalized_url ) {
		// Likewise tolerate a callback having dropped the entry for this URL.
		$return_dimensions[ $original_url ] = isset( $extracted_dimensions[ $normalized_url ] )
			? $extracted_dimensions[ $normalized_url ]
			: false;
	}

	// Back-compat: just return the dimensions, not the full mapped array.
	if ( $is_single ) {
		return current( $return_dimensions );
	}

	return $return_dimensions;
}
|
||||
|
||||
/**
 * Normalizes the given URL.
 *
 * This method ensures the URL has a scheme and, if relative, is prepended the WordPress site URL
 * (scheme and host only, without the home path). A relative path that does not start with a
 * slash is treated as relative to the site root so that it is not glued onto the host name.
 * Empty URLs, data: URLs and URLs that cannot be parsed are rejected.
 *
 * @param string $url URL to normalize.
 * @return string|false Normalized URL, or false if dimensions cannot be extracted from this URL.
 */
public static function normalize_url( $url ) {
	if ( empty( $url ) ) {
		return false;
	}

	if ( 0 === strpos( $url, 'data:' ) ) {
		return false;
	}

	$normalized_url = $url;

	if ( 0 === strpos( $url, '//' ) ) {
		$normalized_url = set_url_scheme( $url, 'http' );
	} else {
		$parsed = wp_parse_url( $url );

		// Seriously malformed URLs cannot be parsed at all.
		if ( false === $parsed ) {
			return false;
		}

		if ( ! isset( $parsed['host'] ) ) {
			$path = '';
			if ( isset( $parsed['path'] ) ) {
				$path .= $parsed['path'];
			}

			// Keep a separator between the host and a path such as "images/a.png".
			if ( '' !== $path && '/' !== $path[0] ) {
				$path = '/' . $path;
			}

			if ( isset( $parsed['query'] ) ) {
				$path .= '?' . $parsed['query'];
			}

			// Strip the home path (if any) so that only scheme and host remain.
			$home      = home_url();
			$home_path = wp_parse_url( $home, PHP_URL_PATH );
			if ( ! empty( $home_path ) ) {
				$home = substr( $home, 0, - strlen( $home_path ) );
			}
			$normalized_url = $home . $path;
		}
	}

	/**
	 * Apply filters on the normalized image URL for dimension extraction.
	 *
	 * @since 1.1
	 *
	 * @param string $normalized_url Normalized image URL.
	 * @param string $url            Original image URL.
	 */
	$normalized_url = apply_filters( 'amp_normalized_dimension_extractor_image_url', $normalized_url, $url );

	return $normalized_url;
}
|
||||
|
||||
/**
 * Registers the necessary callbacks.
 *
 * Hooks extract_by_downloading_images() into the 'amp_extract_image_dimensions_batch'
 * filter at priority 999 and then fires the
 * 'amp_extract_image_dimensions_batch_callbacks_registered' action. The internal flag is
 * raised before the action fires.
 */
private static function register_callbacks() {
	$download_callback = array( __CLASS__, 'extract_by_downloading_images' );

	self::$callbacks_registered = true;
	add_filter( 'amp_extract_image_dimensions_batch', $download_callback, 999, 1 );

	do_action( 'amp_extract_image_dimensions_batch_callbacks_registered' );
}
|
||||
|
||||
/**
 * Extract dimensions from downloaded images (or transient/cached dimensions from downloaded images)
 *
 * Images whose dimensions are not already known are determined first, then fetched and
 * processed. An exception thrown while fetching or processing is reported as a
 * user-level warning and the dimensions gathered so far are returned.
 *
 * @param array $dimensions Image urls mapped to dimensions.
 * @param false $mode       Deprecated.
 * @return array Dimensions mapped to image urls, or false if they could not be retrieved
 */
public static function extract_by_downloading_images( $dimensions, $mode = false ) {
	if ( $mode ) {
		_deprecated_argument( __METHOD__, 'AMP 1.1' );
	}

	$pending_urls         = array();
	$fetched_images       = array();
	$transient_expiration = 30 * DAY_IN_SECONDS;

	self::determine_which_images_to_fetch( $dimensions, $pending_urls );

	try {
		self::fetch_images( $pending_urls, $fetched_images );
		self::process_fetched_images( $pending_urls, $fetched_images, $dimensions, $transient_expiration );
	} catch ( \Exception $fetch_exception ) {
		$warning_message = esc_html( $fetch_exception->getMessage() );
		trigger_error( $warning_message, E_USER_WARNING ); // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_trigger_error
	}

	return $dimensions;
}
|
||||
|
||||
/**
|
||||
* Determine which images to fetch by checking for dimensions in transient/cache.
|
||||
* Creates a short lived transient that acts as a semaphore so that another visitor
|
||||
* doesn't trigger a remote fetch for the same image at the same time.
|
||||
*
|
||||
* @param array $dimensions Image urls mapped to dimensions.
|
||||
* @param array $urls_to_fetch Urls of images to fetch because dimensions are not in transient/cache.
|
||||
*/
|
||||
private static function determine_which_images_to_fetch( &$dimensions, &$urls_to_fetch ) {
	foreach ( $dimensions as $url => $known ) {

		// An array here means dimensions were already supplied for this image; leave it alone.
		if ( is_array( $known ) ) {
			continue;
		}

		$hash      = md5( $url );
		$cache_key = sprintf( 'amp_img_%s', $hash );
		$lock_key  = sprintf( 'amp_lock_%s', $hash );
		$cached    = get_transient( $cache_key );

		// Cached dimensions are stored as a positional array: width at index 0, height at index 1.
		if ( is_array( $cached ) ) {
			$dimensions[ $url ] = array(
				'width'  => $cached[0],
				'height' => $cached[1],
			);
			continue;
		}

		// Give up on this image when the last attempt is recorded as failed, or when a lock
		// transient exists for it. The lock is only looked up if the first test does not match.
		if ( self::STATUS_FAILED_LAST_ATTEMPT === $cached || false !== get_transient( $lock_key ) ) {
			$dimensions[ $url ] = false;
			continue;
		}

		// Queue the image for fetching, carrying the transient names needed afterwards.
		$urls_to_fetch[ $url ] = array(
			'url'                 => $url,
			'transient_name'      => $cache_key,
			'transient_lock_name' => $lock_key,
		);

		// Set the lock transient for one minute.
		set_transient( $lock_key, 1, MINUTE_IN_SECONDS );
	}
}
|
||||
|
||||
/**
|
||||
* Fetch dimensions of remote images
|
||||
*
|
||||
* @throws Exception When cURL handle cannot be added.
|
||||
*
|
||||
* @param array $urls_to_fetch Image src urls to fetch.
|
||||
* @param array $images Array to populate with results of image/dimension inspection.
|
||||
*/
|
||||
private static function fetch_images( $urls_to_fetch, &$images ) {
	// The queue is keyed by image url, so its keys are the list of urls to request.
	$image_urls = array_keys( $urls_to_fetch );
	$fetcher    = new \FasterImage\FasterImage();

	/**
	 * Filters the user agent for obtaining the image dimensions.
	 *
	 * @param string $user_agent User agent.
	 */
	$user_agent = apply_filters( 'amp_extract_image_dimensions_get_user_agent', self::get_default_user_agent() );

	$fetcher->setUserAgent( $user_agent );
	$fetcher->setBufferSize( 1024 );
	$fetcher->setSslVerifyHost( true );
	$fetcher->setSslVerifyPeer( true );

	// Hand the batch result back to the caller through the by-reference parameter.
	$images = $fetcher->batch( $image_urls );
}
|
||||
|
||||
/**
|
||||
* Determine success or failure of remote fetch, integrate fetched dimensions into url to dimension mapping,
|
||||
* cache fetched dimensions via transient and release/delete semaphore transient
|
||||
*
|
||||
* @param array $urls_to_fetch List of image urls that were fetched and transient names corresponding to each (for unlocking semaphore, setting "real" transient).
|
||||
* @param array $images Results of remote fetch mapping fetched image url to dimensions.
|
||||
* @param array $dimensions Map of image url to dimensions to be updated with results of remote fetch.
|
||||
* @param int $transient_expiration Duration image dimensions should exist in transient/cache.
|
||||
*/
|
||||
private static function process_fetched_images( $urls_to_fetch, $images, &$dimensions, $transient_expiration ) {
	foreach ( $urls_to_fetch as $url_data ) {
		$url = $url_data['url'];

		// A url may have no entry (or no 'size') in the fetch results; read it defensively
		// instead of indexing blindly.
		$size = isset( $images[ $url ]['size'] ) ? $images[ $url ]['size'] : null;

		// Anything other than a width/height pair counts as a failed extraction. Without this
		// guard a missing result would be cached as array( null, null ) for the whole
		// expiration period and later served as if it were real dimensions.
		$extraction_failed = (
			self::STATUS_IMAGE_EXTRACTION_FAILED === $size
			|| ! is_array( $size )
			|| ! isset( $size[0], $size[1] )
		);

		if ( $extraction_failed ) {
			$dimensions[ $url ] = false;

			// Remember the failure under the image's transient.
			set_transient( $url_data['transient_name'], self::STATUS_FAILED_LAST_ATTEMPT, $transient_expiration );
		} else {
			$dimensions[ $url ] = array(
				'width'  => $size[0],
				'height' => $size[1],
			);

			// Cache as a positional array: width at index 0, height at index 1.
			set_transient(
				$url_data['transient_name'],
				array(
					$size[0],
					$size[1],
				),
				$transient_expiration
			);
		}

		// Release the lock transient for this image whether or not extraction succeeded.
		delete_transient( $url_data['transient_lock_name'] );
	}
}
|
||||
|
||||
|
||||
/**
|
||||
* Get default user agent
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
public static function get_default_user_agent() {
	// Built from the plugin version constant and the value returned by home_url().
	$plugin_version = AMP__VERSION;
	$site_home      = home_url();

	return 'amp-wp, v' . $plugin_version . ', ' . $site_home;
}
|
||||
}
|
@ -0,0 +1,25 @@
|
||||
<?php
|
||||
/**
|
||||
* Class AMP_String_Utils
|
||||
*
|
||||
* @package AMP
|
||||
*/
|
||||
|
||||
/**
|
||||
* Class with static string utility methods.
|
||||
*/
|
||||
class AMP_String_Utils {

	/**
	 * Tells whether one string finishes with another.
	 *
	 * An empty haystack or an empty needle never counts as a match.
	 *
	 * @param string $haystack String whose ending is inspected.
	 * @param string $needle   Suffix expected at the end of $haystack.
	 * @return bool True when $haystack ends in $needle, false otherwise.
	 */
	public static function endswith( $haystack, $needle ) {
		// Empty input on either side is defined as "no match".
		if ( '' === $haystack || '' === $needle ) {
			return false;
		}

		// Take as many trailing bytes as the needle is long and compare them strictly.
		$suffix = substr( $haystack, -strlen( $needle ) );

		return $suffix === $needle;
	}
}
|
@ -0,0 +1,102 @@
|
||||
<?php
|
||||
/**
|
||||
* Class AMP_WP_Utils
|
||||
*
|
||||
* @package AMP
|
||||
*/
|
||||
|
||||
/**
|
||||
* Class with static WordPress utility methods.
|
||||
*
|
||||
* @since 0.5
|
||||
*
|
||||
* @deprecated 0.7 As WordPress 4.7 is our minimum supported version.
|
||||
*/
|
||||
class AMP_WP_Utils {
	/**
	 * Wrapper around wp_parse_url() that also applies the component argument itself.
	 *
	 * Kept because wp_parse_url() in WordPress before 4.7 ignored the component argument.
	 *
	 * Don't use.
	 *
	 * @deprecated 0.7 wp_parse_url() is now used instead.
	 *
	 * @param string $url       The raw URL.
	 * @param int    $component The specific component to retrieve. Use one of the PHP
	 *                          predefined constants to specify which one.
	 *                          Defaults to -1 (= return all parts as an array).
	 * @return mixed False on parse failure; Array of URL components on success;
	 *               When a specific component has been requested: null if the component
	 *               doesn't exist in the given URL; a string or - in the case of
	 *               PHP_URL_PORT - integer when it does. See parse_url()'s return values.
	 */
	public static function parse_url( $url, $component = -1 ) {
		_deprecated_function( __METHOD__, '0.7', 'wp_parse_url' );
		$result = wp_parse_url( $url, $component );

		// Nothing to narrow down when all parts were requested or the result is not an array.
		if ( -1 === $component || ! is_array( $result ) ) {
			return $result;
		}

		// Because < 4.7 always returned a full array regardless of component.
		return self::_get_component_from_parsed_url_array( $result, $component );
	}

	/**
	 * Picks a single component out of an already parsed URL. Included for 4.6 back-compat.
	 *
	 * Copied from https://developer.wordpress.org/reference/functions/_get_component_from_parsed_url_array/
	 *
	 * @deprecated 0.7
	 *
	 * @param array|false $url_parts The parsed URL. Can be false if the URL failed to parse.
	 * @param int         $component The specific component to retrieve. Use one of the PHP
	 *                               predefined constants to specify which one.
	 *                               Defaults to -1 (= return all parts as an array).
	 * @return mixed False on parse failure; Array of URL components on success;
	 *               When a specific component has been requested: null if the component
	 *               doesn't exist in the given URL; a string or - in the case of
	 *               PHP_URL_PORT - integer when it does. See parse_url()'s return values.
	 */
	protected static function _get_component_from_parsed_url_array( $url_parts, $component = -1 ) { // phpcs:ignore PSR2.Methods.MethodDeclaration.Underscore
		// -1 means "everything": hand the input back untouched.
		if ( -1 === $component ) {
			return $url_parts;
		}

		$part_name = self::_wp_translate_php_url_constant_to_key( $component );

		// Unknown constant, unparsed URL, or component absent: report null.
		if ( false === $part_name || ! is_array( $url_parts ) || ! isset( $url_parts[ $part_name ] ) ) {
			return null;
		}

		return $url_parts[ $part_name ];
	}

	/**
	 * Maps a PHP_URL_* constant to the matching parse_url() array key. Included for 4.6 back-compat.
	 *
	 * Copied from https://developer.wordpress.org/reference/functions/_wp_translate_php_url_constant_to_key/
	 *
	 * @param int $constant The specific component to retrieve. Use one of the PHP
	 *                      predefined constants to specify which one.
	 * @return mixed False if the constant is not one of the mapped PHP_URL_* values; the key name (string) otherwise.
	 *
	 * @deprecated 0.7
	 */
	protected static function _wp_translate_php_url_constant_to_key( $constant ) { // phpcs:ignore PSR2.Methods.MethodDeclaration.Underscore
		$keys_by_constant = array(
			PHP_URL_SCHEME   => 'scheme',
			PHP_URL_HOST     => 'host',
			PHP_URL_PORT     => 'port',
			PHP_URL_USER     => 'user',
			PHP_URL_PASS     => 'pass',
			PHP_URL_PATH     => 'path',
			PHP_URL_QUERY    => 'query',
			PHP_URL_FRAGMENT => 'fragment',
		);

		return isset( $keys_by_constant[ $constant ] ) ? $keys_by_constant[ $constant ] : false;
	}
}
|
Reference in New Issue
Block a user