Merged in feature/from-pantheon (pull request #16)

code from pantheon

* code from pantheon
This commit is contained in:
Tony Volpe
2024-01-10 17:03:02 +00:00
parent 054b4fffc9
commit 4eb982d7a8
16492 changed files with 3475854 additions and 0 deletions

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2008-2018 Pelago
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,87 @@
<?php
declare(strict_types=1);
namespace Pelago\Emogrifier\Caching;
/**
* This cache caches string values with string keys. It is not PSR-6-compliant.
*
* Usage:
*
* ```php
* $cache = new SimpleStringCache();
* $cache->set($key, $value);
* …
* if ($cache->has($key) {
* $cachedValue = $cache->get($value);
* }
* ```
*
* @internal
*/
class SimpleStringCache
{
/**
* @var array<string, string>
*/
private $values = [];
/**
* Checks whether there is an entry stored for the given key.
*
* @param string $key the key to check; must not be empty
*
* @throws \InvalidArgumentException
*/
public function has(string $key): bool
{
$this->assertNotEmptyKey($key);
return isset($this->values[$key]);
}
/**
* Returns the entry stored for the given key, and throws an exception if the value does not exist
* (which helps keep the return type simple).
*
* @param string $key the key to of the item to retrieve; must not be empty
*
* @return string the retrieved value; may be empty
*
* @throws \BadMethodCallException
*/
public function get(string $key): string
{
if (!$this->has($key)) {
throw new \BadMethodCallException('You can only call `get` with a key for an existing value.', 1625996246);
}
return $this->values[$key];
}
/**
* Sets or overwrites an entry.
*
* @param string $key the key to of the item to set; must not be empty
* @param string $value the value to set; can be empty
*
* @throws \BadMethodCallException
*/
public function set(string $key, string $value): void
{
$this->assertNotEmptyKey($key);
$this->values[$key] = $value;
}
/**
* @throws \InvalidArgumentException
*/
private function assertNotEmptyKey(string $key): void
{
if ($key === '') {
throw new \InvalidArgumentException('Please provide a non-empty key.', 1625995840);
}
}
}

View File

@@ -0,0 +1,187 @@
<?php
declare(strict_types=1);
namespace Pelago\Emogrifier\Css;
use Sabberworm\CSS\CSSList\AtRuleBlockList as CssAtRuleBlockList;
use Sabberworm\CSS\CSSList\Document as SabberwormCssDocument;
use Sabberworm\CSS\Parser as CssParser;
use Sabberworm\CSS\Property\AtRule as CssAtRule;
use Sabberworm\CSS\Property\Charset as CssCharset;
use Sabberworm\CSS\Property\Import as CssImport;
use Sabberworm\CSS\Renderable as CssRenderable;
use Sabberworm\CSS\RuleSet\DeclarationBlock as CssDeclarationBlock;
use Sabberworm\CSS\RuleSet\RuleSet as CssRuleSet;
/**
* Parses and stores a CSS document from a string of CSS, and provides methods to obtain the CSS in parts or as data
* structures.
*
* @internal
*/
class CssDocument
{
/**
* @var SabberwormCssDocument
*/
private $sabberwormCssDocument;
/**
* `@import` rules must precede all other types of rules, except `@charset` rules. This property is used while
* rendering at-rules to enforce that.
*
* @var bool
*/
private $isImportRuleAllowed = true;
/**
* @param string $css
*/
public function __construct(string $css)
{
$cssParser = new CssParser($css);
/** @var SabberwormCssDocument $sabberwormCssDocument */
$sabberwormCssDocument = $cssParser->parse();
$this->sabberwormCssDocument = $sabberwormCssDocument;
}
/**
* Collates the media query, selectors and declarations for individual rules from the parsed CSS, in order.
*
* @param array<array-key, string> $allowedMediaTypes
*
* @return array<int, StyleRule>
*/
public function getStyleRulesData(array $allowedMediaTypes): array
{
$ruleMatches = [];
/** @var CssRenderable $rule */
foreach ($this->sabberwormCssDocument->getContents() as $rule) {
if ($rule instanceof CssAtRuleBlockList) {
$containingAtRule = $this->getFilteredAtIdentifierAndRule($rule, $allowedMediaTypes);
if (\is_string($containingAtRule)) {
/** @var CssRenderable $nestedRule */
foreach ($rule->getContents() as $nestedRule) {
if ($nestedRule instanceof CssDeclarationBlock) {
$ruleMatches[] = new StyleRule($nestedRule, $containingAtRule);
}
}
}
} elseif ($rule instanceof CssDeclarationBlock) {
$ruleMatches[] = new StyleRule($rule);
}
}
return $ruleMatches;
}
/**
* Renders at-rules from the parsed CSS that are valid and not conditional group rules (i.e. not rules such as
* `@media` which contain style rules whose data is returned by {@see getStyleRulesData}). Also does not render
* `@charset` rules; these are discarded (only UTF-8 is supported).
*
* @return string
*/
public function renderNonConditionalAtRules(): string
{
$this->isImportRuleAllowed = true;
/** @var array<int, CssRenderable> $cssContents */
$cssContents = $this->sabberwormCssDocument->getContents();
$atRules = \array_filter($cssContents, [$this, 'isValidAtRuleToRender']);
if ($atRules === []) {
return '';
}
$atRulesDocument = new SabberwormCssDocument();
$atRulesDocument->setContents($atRules);
/** @var string $renderedRules */
$renderedRules = $atRulesDocument->render();
return $renderedRules;
}
/**
* @param CssAtRuleBlockList $rule
* @param array<array-key, string> $allowedMediaTypes
*
* @return ?string
* If the nested at-rule is supported, it's opening declaration (e.g. "@media (max-width: 768px)") is
* returned; otherwise the return value is null.
*/
private function getFilteredAtIdentifierAndRule(CssAtRuleBlockList $rule, array $allowedMediaTypes): ?string
{
$result = null;
if ($rule->atRuleName() === 'media') {
/** @var string $mediaQueryList */
$mediaQueryList = $rule->atRuleArgs();
[$mediaType] = \explode('(', $mediaQueryList, 2);
if (\trim($mediaType) !== '') {
$escapedAllowedMediaTypes = \array_map(
static function (string $allowedMediaType): string {
return \preg_quote($allowedMediaType, '/');
},
$allowedMediaTypes
);
$mediaTypesMatcher = \implode('|', $escapedAllowedMediaTypes);
$isAllowed = \preg_match('/^\\s*+(?:only\\s++)?+(?:' . $mediaTypesMatcher . ')/i', $mediaType) > 0;
} else {
$isAllowed = true;
}
if ($isAllowed) {
$result = '@media ' . $mediaQueryList;
}
}
return $result;
}
/**
* Tests if a CSS rule is an at-rule that should be passed though and copied to a `<style>` element unmodified:
* - `@charset` rules are discarded - only UTF-8 is supported - `false` is returned;
* - `@import` rules are passed through only if they satisfy the specification ("user agents must ignore any
* '@import' rule that occurs inside a block or after any non-ignored statement other than an '@charset' or an
* '@import' rule");
* - `@media` rules are processed separately to see if their nested rules apply - `false` is returned;
* - `@font-face` rules are checked for validity - they must contain both a `src` and `font-family` property;
* - other at-rules are assumed to be valid and treated as a black box - `true` is returned.
*
* @param CssRenderable $rule
*
* @return bool
*/
private function isValidAtRuleToRender(CssRenderable $rule): bool
{
if ($rule instanceof CssCharset) {
return false;
}
if ($rule instanceof CssImport) {
return $this->isImportRuleAllowed;
}
$this->isImportRuleAllowed = false;
if (!$rule instanceof CssAtRule) {
return false;
}
switch ($rule->atRuleName()) {
case 'media':
$result = false;
break;
case 'font-face':
$result = $rule instanceof CssRuleSet
&& $rule->getRules('font-family') !== []
&& $rule->getRules('src') !== [];
break;
default:
$result = true;
}
return $result;
}
}

View File

@@ -0,0 +1,83 @@
<?php
declare(strict_types=1);
namespace Pelago\Emogrifier\Css;
use Sabberworm\CSS\Property\Selector;
use Sabberworm\CSS\RuleSet\DeclarationBlock;
/**
* This class represents a CSS style rule, including selectors, a declaration block, and an optional containing at-rule.
*
* @internal
*/
class StyleRule
{
/**
* @var DeclarationBlock
*/
private $declarationBlock;
/**
* @var string
*/
private $containingAtRule;
/**
* @param DeclarationBlock $declarationBlock
* @param string $containingAtRule e.g. `@media screen and (max-width: 480px)`
*/
public function __construct(DeclarationBlock $declarationBlock, string $containingAtRule = '')
{
$this->declarationBlock = $declarationBlock;
$this->containingAtRule = \trim($containingAtRule);
}
/**
* @return array<int, string> the selectors, e.g. `["h1", "p"]`
*/
public function getSelectors(): array
{
/** @var array<int, Selector> $selectors */
$selectors = $this->declarationBlock->getSelectors();
return \array_map(
static function (Selector $selector): string {
return (string)$selector;
},
$selectors
);
}
/**
* @return string the CSS declarations, separated and followed by a semicolon, e.g., `color: red; height: 4px;`
*/
public function getDeclarationAsText(): string
{
return \implode(' ', $this->declarationBlock->getRules());
}
/**
* Checks whether the declaration block has at least one declaration.
*/
public function hasAtLeastOneDeclaration(): bool
{
return $this->declarationBlock->getRules() !== [];
}
/**
* @returns string e.g. `@media screen and (max-width: 480px)`, or an empty string
*/
public function getContainingAtRule(): string
{
return $this->containingAtRule;
}
/**
* Checks whether the containing at-rule is non-empty and has any non-whitespace characters.
*/
public function hasContainingAtRule(): bool
{
return $this->getContainingAtRule() !== '';
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,472 @@
<?php
declare(strict_types=1);
namespace Pelago\Emogrifier\HtmlProcessor;
/**
* Base class for HTML processor that e.g., can remove, add or modify nodes or attributes.
*
* The "vanilla" subclass is the HtmlNormalizer.
*
* @psalm-consistent-constructor
*/
abstract class AbstractHtmlProcessor
{
/**
* @var string
*/
protected const DEFAULT_DOCUMENT_TYPE = '<!DOCTYPE html>';
/**
* @var string
*/
protected const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';
/**
* @var string Regular expression part to match tag names that PHP's DOMDocument implementation is not aware are
* self-closing. These are mostly HTML5 elements, but for completeness <command> (obsolete) and <keygen>
* (deprecated) are also included.
*
* @see https://bugs.php.net/bug.php?id=73175
*/
protected const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command|embed|keygen|source|track|wbr)';
/**
* Regular expression part to match tag names that may appear before the start of the `<body>` element. A start tag
* for any other element would implicitly start the `<body>` element due to tag omission rules.
*
* @var string
*/
protected const TAGNAME_ALLOWED_BEFORE_BODY_MATCHER
= '(?:html|head|base|command|link|meta|noscript|script|style|template|title)';
/**
* regular expression pattern to match an HTML comment, including delimiters and modifiers
*
* @var string
*/
protected const HTML_COMMENT_PATTERN = '/<!--[^-]*+(?:-(?!->)[^-]*+)*+(?:-->|$)/';
/**
* regular expression pattern to match an HTML `<template>` element, including delimiters and modifiers
*
* @var string
*/
protected const HTML_TEMPLATE_ELEMENT_PATTERN
= '%<template[\\s>][^<]*+(?:<(?!/template>)[^<]*+)*+(?:</template>|$)%i';
/**
* @var ?\DOMDocument
*/
protected $domDocument = null;
/**
* @var ?\DOMXPath
*/
private $xPath = null;
/**
* The constructor.
*
* Please use `::fromHtml` or `::fromDomDocument` instead.
*/
private function __construct()
{
}
/**
* Builds a new instance from the given HTML.
*
* @param string $unprocessedHtml raw HTML, must be UTF-encoded, must not be empty
*
* @return static
*
* @throws \InvalidArgumentException if $unprocessedHtml is anything other than a non-empty string
*/
public static function fromHtml(string $unprocessedHtml): self
{
if ($unprocessedHtml === '') {
throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647);
}
$instance = new static();
$instance->setHtml($unprocessedHtml);
return $instance;
}
/**
* Builds a new instance from the given DOM document.
*
* @param \DOMDocument $document a DOM document returned by getDomDocument() of another instance
*
* @return static
*/
public static function fromDomDocument(\DOMDocument $document): self
{
$instance = new static();
$instance->setDomDocument($document);
return $instance;
}
/**
* Sets the HTML to process.
*
* @param string $html the HTML to process, must be UTF-8-encoded
*/
private function setHtml(string $html): void
{
$this->createUnifiedDomDocument($html);
}
/**
* Provides access to the internal DOMDocument representation of the HTML in its current state.
*
* @return \DOMDocument
*
* @throws \UnexpectedValueException
*/
public function getDomDocument(): \DOMDocument
{
if (!$this->domDocument instanceof \DOMDocument) {
$message = self::class . '::setDomDocument() has not yet been called on ' . static::class;
throw new \UnexpectedValueException($message, 1570472239);
}
return $this->domDocument;
}
/**
* @param \DOMDocument $domDocument
*/
private function setDomDocument(\DOMDocument $domDocument): void
{
$this->domDocument = $domDocument;
$this->xPath = new \DOMXPath($this->domDocument);
}
/**
* @return \DOMXPath
*
* @throws \UnexpectedValueException
*/
protected function getXPath(): \DOMXPath
{
if (!$this->xPath instanceof \DOMXPath) {
$message = self::class . '::setDomDocument() has not yet been called on ' . static::class;
throw new \UnexpectedValueException($message, 1617819086);
}
return $this->xPath;
}
/**
* Renders the normalized and processed HTML.
*
* @return string
*/
public function render(): string
{
$htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML();
return $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);
}
/**
* Renders the content of the BODY element of the normalized and processed HTML.
*
* @return string
*/
public function renderBodyContent(): string
{
$htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML($this->getBodyElement());
$bodyNodeHtml = $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);
return \preg_replace('%</?+body(?:\\s[^>]*+)?+>%', '', $bodyNodeHtml);
}
/**
* Eliminates any invalid closing tags for void elements from the given HTML.
*
* @param string $html
*
* @return string
*/
private function removeSelfClosingTagsClosingTags(string $html): string
{
return \preg_replace('%</' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%', '', $html);
}
/**
* Returns the BODY element.
*
* This method assumes that there always is a BODY element.
*
* @return \DOMElement
*
* @throws \RuntimeException
*/
private function getBodyElement(): \DOMElement
{
$node = $this->getDomDocument()->getElementsByTagName('body')->item(0);
if (!$node instanceof \DOMElement) {
throw new \RuntimeException('There is no body element.', 1617922607);
}
return $node;
}
/**
* Creates a DOM document from the given HTML and stores it in $this->domDocument.
*
* The DOM document will always have a BODY element and a document type.
*
* @param string $html
*/
private function createUnifiedDomDocument(string $html): void
{
$this->createRawDomDocument($html);
$this->ensureExistenceOfBodyElement();
}
/**
* Creates a DOMDocument instance from the given HTML and stores it in $this->domDocument.
*
* @param string $html
*/
private function createRawDomDocument(string $html): void
{
$domDocument = new \DOMDocument();
$domDocument->strictErrorChecking = false;
$domDocument->formatOutput = true;
$libXmlState = \libxml_use_internal_errors(true);
$domDocument->loadHTML($this->prepareHtmlForDomConversion($html));
\libxml_clear_errors();
\libxml_use_internal_errors($libXmlState);
$this->setDomDocument($domDocument);
}
/**
* Returns the HTML with added document type, Content-Type meta tag, and self-closing slashes, if needed,
* ensuring that the HTML will be good for creating a DOM document from it.
*
* @param string $html
*
* @return string the unified HTML
*/
private function prepareHtmlForDomConversion(string $html): string
{
$htmlWithSelfClosingSlashes = $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html);
$htmlWithDocumentType = $this->ensureDocumentType($htmlWithSelfClosingSlashes);
return $this->addContentTypeMetaTag($htmlWithDocumentType);
}
/**
* Makes sure that the passed HTML has a document type, with lowercase "html".
*
* @param string $html
*
* @return string HTML with document type
*/
private function ensureDocumentType(string $html): string
{
$hasDocumentType = \stripos($html, '<!DOCTYPE') !== false;
if ($hasDocumentType) {
return $this->normalizeDocumentType($html);
}
return self::DEFAULT_DOCUMENT_TYPE . $html;
}
/**
* Makes sure the document type in the passed HTML has lowercase "html".
*
* @param string $html
*
* @return string HTML with normalized document type
*/
private function normalizeDocumentType(string $html): string
{
// Limit to replacing the first occurrence: as an optimization; and in case an example exists as unescaped text.
return \preg_replace(
'/<!DOCTYPE\\s++html(?=[\\s>])/i',
'<!DOCTYPE html',
$html,
1
);
}
/**
* Adds a Content-Type meta tag for the charset.
*
* This method also ensures that there is a HEAD element.
*
* @param string $html
*
* @return string the HTML with the meta tag added
*/
private function addContentTypeMetaTag(string $html): string
{
if ($this->hasContentTypeMetaTagInHead($html)) {
return $html;
}
// We are trying to insert the meta tag to the right spot in the DOM.
// If we just prepended it to the HTML, we would lose attributes set to the HTML tag.
$hasHeadTag = \preg_match('/<head[\\s>]/i', $html);
$hasHtmlTag = \stripos($html, '<html') !== false;
if ($hasHeadTag) {
$reworkedHtml = \preg_replace(
'/<head(?=[\\s>])([^>]*+)>/i',
'<head$1>' . self::CONTENT_TYPE_META_TAG,
$html
);
} elseif ($hasHtmlTag) {
$reworkedHtml = \preg_replace(
'/<html(.*?)>/is',
'<html$1><head>' . self::CONTENT_TYPE_META_TAG . '</head>',
$html
);
} else {
$reworkedHtml = self::CONTENT_TYPE_META_TAG . $html;
}
return $reworkedHtml;
}
/**
* Tests whether the given HTML has a valid `Content-Type` metadata element within the `<head>` element. Due to tag
* omission rules, HTML parsers are expected to end the `<head>` element and start the `<body>` element upon
* encountering a start tag for any element which is permitted only within the `<body>`.
*
* @param string $html
*
* @return bool
*/
private function hasContentTypeMetaTagInHead(string $html): bool
{
\preg_match('%^.*?(?=<meta(?=\\s)[^>]*\\shttp-equiv=(["\']?+)Content-Type\\g{-1}[\\s/>])%is', $html, $matches);
if (isset($matches[0])) {
$htmlBefore = $matches[0];
try {
$hasContentTypeMetaTagInHead = !$this->hasEndOfHeadElement($htmlBefore);
} catch (\RuntimeException $exception) {
// If something unexpected occurs, assume the `Content-Type` that was found is valid.
\trigger_error($exception->getMessage());
$hasContentTypeMetaTagInHead = true;
}
} else {
$hasContentTypeMetaTagInHead = false;
}
return $hasContentTypeMetaTagInHead;
}
/**
* Tests whether the `<head>` element ends within the given HTML. Due to tag omission rules, HTML parsers are
* expected to end the `<head>` element and start the `<body>` element upon encountering a start tag for any element
* which is permitted only within the `<body>`.
*
* @param string $html
*
* @return bool
*
* @throws \RuntimeException
*/
private function hasEndOfHeadElement(string $html): bool
{
$headEndTagMatchCount
= \preg_match('%<(?!' . self::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER . '[\\s/>])\\w|</head>%i', $html);
if (\is_int($headEndTagMatchCount) && $headEndTagMatchCount > 0) {
// An exception to the implicit end of the `<head>` is any content within a `<template>` element, as well in
// comments. As an optimization, this is only checked for if a potential `<head>` end tag is found.
$htmlWithoutCommentsOrTemplates = $this->removeHtmlTemplateElements($this->removeHtmlComments($html));
$hasEndOfHeadElement = $htmlWithoutCommentsOrTemplates === $html
|| $this->hasEndOfHeadElement($htmlWithoutCommentsOrTemplates);
} else {
$hasEndOfHeadElement = false;
}
return $hasEndOfHeadElement;
}
/**
* Removes comments from the given HTML, including any which are unterminated, for which the remainder of the string
* is removed.
*
* @param string $html
*
* @return string
*
* @throws \RuntimeException
*/
private function removeHtmlComments(string $html): string
{
$result = \preg_replace(self::HTML_COMMENT_PATTERN, '', $html);
if (!\is_string($result)) {
throw new \RuntimeException('Internal PCRE error', 1616521475);
}
return $result;
}
/**
* Removes `<template>` elements from the given HTML, including any without an end tag, for which the remainder of
* the string is removed.
*
* @param string $html
*
* @return string
*
* @throws \RuntimeException
*/
private function removeHtmlTemplateElements(string $html): string
{
$result = \preg_replace(self::HTML_TEMPLATE_ELEMENT_PATTERN, '', $html);
if (!\is_string($result)) {
throw new \RuntimeException('Internal PCRE error', 1616519652);
}
return $result;
}
/**
* Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
* self-closing slash.
*
* @param string $html
*
* @return string HTML with problematic tags converted.
*/
private function ensurePhpUnrecognizedSelfClosingTagsAreXml(string $html): string
{
return \preg_replace(
'%<' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '\\b[^>]*+(?<!/)(?=>)%',
'$0/',
$html
);
}
/**
* Checks that $this->domDocument has a BODY element and adds it if it is missing.
*
* @throws \UnexpectedValueException
*/
private function ensureExistenceOfBodyElement(): void
{
if ($this->getDomDocument()->getElementsByTagName('body')->item(0) instanceof \DOMElement) {
return;
}
$htmlElement = $this->getDomDocument()->getElementsByTagName('html')->item(0);
if (!$htmlElement instanceof \DOMElement) {
throw new \UnexpectedValueException('There is no HTML element although there should be one.', 1569930853);
}
$htmlElement->appendChild($this->getDomDocument()->createElement('body'));
}
}

View File

@@ -0,0 +1,303 @@
<?php
declare(strict_types=1);
namespace Pelago\Emogrifier\HtmlProcessor;
/**
* This HtmlProcessor can convert style HTML attributes to the corresponding other visual HTML attributes,
* e.g. it converts style="width: 100px" to width="100".
*
* It will only add attributes, but leaves the style attribute untouched.
*
* To trigger the conversion, call the convertCssToVisualAttributes method.
*/
class CssToAttributeConverter extends AbstractHtmlProcessor
{
/**
* This multi-level array contains simple mappings of CSS properties to
* HTML attributes. If a mapping only applies to certain HTML nodes or
* only for certain values, the mapping is an object with a whitelist
* of nodes and values.
*
* @var array<string, array{attribute: string, nodes?: array<int, string>, values?: array<int, string>}>
*/
private $cssToHtmlMap = [
'background-color' => [
'attribute' => 'bgcolor',
],
'text-align' => [
'attribute' => 'align',
'nodes' => ['p', 'div', 'td', 'th'],
'values' => ['left', 'right', 'center', 'justify'],
],
'float' => [
'attribute' => 'align',
'nodes' => ['table', 'img'],
'values' => ['left', 'right'],
],
'border-spacing' => [
'attribute' => 'cellspacing',
'nodes' => ['table'],
],
];
/**
* @var array<string, array<string, string>>
*/
private static $parsedCssCache = [];
/**
* Maps the CSS from the style nodes to visual HTML attributes.
*
* @return self fluent interface
*/
public function convertCssToVisualAttributes(): self
{
/** @var \DOMElement $node */
foreach ($this->getAllNodesWithStyleAttribute() as $node) {
$inlineStyleDeclarations = $this->parseCssDeclarationsBlock($node->getAttribute('style'));
$this->mapCssToHtmlAttributes($inlineStyleDeclarations, $node);
}
return $this;
}
/**
* Returns a list with all DOM nodes that have a style attribute.
*
* @return \DOMNodeList
*/
private function getAllNodesWithStyleAttribute(): \DOMNodeList
{
return $this->getXPath()->query('//*[@style]');
}
/**
* Parses a CSS declaration block into property name/value pairs.
*
* Example:
*
* The declaration block
*
* "color: #000; font-weight: bold;"
*
* will be parsed into the following array:
*
* "color" => "#000"
* "font-weight" => "bold"
*
* @param string $cssDeclarationsBlock the CSS declarations block without the curly braces, may be empty
*
* @return array<string, string>
* the CSS declarations with the property names as array keys and the property values as array values
*/
private function parseCssDeclarationsBlock(string $cssDeclarationsBlock): array
{
if (isset(self::$parsedCssCache[$cssDeclarationsBlock])) {
return self::$parsedCssCache[$cssDeclarationsBlock];
}
$properties = [];
foreach (\preg_split('/;(?!base64|charset)/', $cssDeclarationsBlock) as $declaration) {
/** @var array<int, string> $matches */
$matches = [];
if (!\preg_match('/^([A-Za-z\\-]+)\\s*:\\s*(.+)$/s', \trim($declaration), $matches)) {
continue;
}
$propertyName = \strtolower($matches[1]);
$propertyValue = $matches[2];
$properties[$propertyName] = $propertyValue;
}
self::$parsedCssCache[$cssDeclarationsBlock] = $properties;
return $properties;
}
/**
* Applies $styles to $node.
*
* This method maps CSS styles to HTML attributes and adds those to the
* node.
*
* @param array<string, string> $styles the new CSS styles taken from the global styles to be applied to this node
* @param \DOMElement $node node to apply styles to
*/
private function mapCssToHtmlAttributes(array $styles, \DOMElement $node): void
{
foreach ($styles as $property => $value) {
// Strip !important indicator
$value = \trim(\str_replace('!important', '', $value));
$this->mapCssToHtmlAttribute($property, $value, $node);
}
}
/**
* Tries to apply the CSS style to $node as an attribute.
*
* This method maps a CSS rule to HTML attributes and adds those to the node.
*
* @param string $property the name of the CSS property to map
* @param string $value the value of the style rule to map
* @param \DOMElement $node node to apply styles to
*/
private function mapCssToHtmlAttribute(string $property, string $value, \DOMElement $node): void
{
if (!$this->mapSimpleCssProperty($property, $value, $node)) {
$this->mapComplexCssProperty($property, $value, $node);
}
}
/**
* Looks up the CSS property in the mapping table and maps it if it matches the conditions.
*
* @param string $property the name of the CSS property to map
* @param string $value the value of the style rule to map
* @param \DOMElement $node node to apply styles to
*
* @return bool true if the property can be mapped using the simple mapping table
*/
private function mapSimpleCssProperty(string $property, string $value, \DOMElement $node): bool
{
if (!isset($this->cssToHtmlMap[$property])) {
return false;
}
$mapping = $this->cssToHtmlMap[$property];
$nodesMatch = !isset($mapping['nodes']) || \in_array($node->nodeName, $mapping['nodes'], true);
$valuesMatch = !isset($mapping['values']) || \in_array($value, $mapping['values'], true);
$canBeMapped = $nodesMatch && $valuesMatch;
if ($canBeMapped) {
$node->setAttribute($mapping['attribute'], $value);
}
return $canBeMapped;
}
/**
* Maps CSS properties that need special transformation to an HTML attribute.
*
* @param string $property the name of the CSS property to map
* @param string $value the value of the style rule to map
* @param \DOMElement $node node to apply styles to
*/
private function mapComplexCssProperty(string $property, string $value, \DOMElement $node): void
{
switch ($property) {
case 'background':
$this->mapBackgroundProperty($node, $value);
break;
case 'width':
// intentional fall-through
case 'height':
$this->mapWidthOrHeightProperty($node, $value, $property);
break;
case 'margin':
$this->mapMarginProperty($node, $value);
break;
case 'border':
$this->mapBorderProperty($node, $value);
break;
default:
}
}
/**
* @param \DOMElement $node node to apply styles to
* @param string $value the value of the style rule to map
*/
private function mapBackgroundProperty(\DOMElement $node, string $value): void
{
// parse out the color, if any
/** @var array<int, string> $styles */
$styles = \explode(' ', $value, 2);
$first = $styles[0];
if (\is_numeric($first[0]) || \strncmp($first, 'url', 3) === 0) {
return;
}
// as this is not a position or image, assume it's a color
$node->setAttribute('bgcolor', $first);
}
/**
* @param \DOMElement $node node to apply styles to
* @param string $value the value of the style rule to map
* @param string $property the name of the CSS property to map
*/
private function mapWidthOrHeightProperty(\DOMElement $node, string $value, string $property): void
{
// only parse values in px and %, but not values like "auto"
if (!\preg_match('/^(\\d+)(\\.(\\d+))?(px|%)$/', $value)) {
return;
}
$number = \preg_replace('/[^0-9.%]/', '', $value);
$node->setAttribute($property, $number);
}
/**
* @param \DOMElement $node node to apply styles to
* @param string $value the value of the style rule to map
*/
private function mapMarginProperty(\DOMElement $node, string $value): void
{
if (!$this->isTableOrImageNode($node)) {
return;
}
$margins = $this->parseCssShorthandValue($value);
if ($margins['left'] === 'auto' && $margins['right'] === 'auto') {
$node->setAttribute('align', 'center');
}
}
/**
* @param \DOMElement $node node to apply styles to
* @param string $value the value of the style rule to map
*/
private function mapBorderProperty(\DOMElement $node, string $value): void
{
if (!$this->isTableOrImageNode($node)) {
return;
}
if ($value === 'none' || $value === '0') {
$node->setAttribute('border', '0');
}
}
/**
* @param \DOMElement $node
*
* @return bool
*/
private function isTableOrImageNode(\DOMElement $node): bool
{
return $node->nodeName === 'table' || $node->nodeName === 'img';
}
/**
* Parses a shorthand CSS value and splits it into individual values. For example: `padding: 0 auto;` - `0 auto` is
* split into top: 0, left: auto, bottom: 0, right: auto.
*
* @param string $value a CSS property value with 1, 2, 3 or 4 sizes
*
* @return array<string, string>
* an array of values for top, right, bottom and left (using these as associative array keys)
*/
private function parseCssShorthandValue(string $value): array
{
/** @var array<int, string> $values */
$values = \preg_split('/\\s+/', $value);
$css = [];
$css['top'] = $values[0];
$css['right'] = (\count($values) > 1) ? $values[1] : $css['top'];
$css['bottom'] = (\count($values) > 2) ? $values[2] : $css['top'];
$css['left'] = (\count($values) > 3) ? $values[3] : $css['right'];
return $css;
}
}

View File

@@ -0,0 +1,16 @@
<?php
declare(strict_types=1);
namespace Pelago\Emogrifier\HtmlProcessor;
/**
* Normalizes HTML:
* - add a document type (HTML5) if missing
* - disentangle incorrectly nested tags
* - add HEAD and BODY elements (if they are missing)
* - reformat the HTML
*/
class HtmlNormalizer extends AbstractHtmlProcessor
{
}

View File

@@ -0,0 +1,137 @@
<?php
declare(strict_types=1);
namespace Pelago\Emogrifier\HtmlProcessor;
use Pelago\Emogrifier\CssInliner;
use Pelago\Emogrifier\Utilities\ArrayIntersector;
/**
* This class can remove things from HTML.
*/
class HtmlPruner extends AbstractHtmlProcessor
{
/**
* We need to look for display:none, but we need to do a case-insensitive search. Since DOMDocument only
* supports XPath 1.0, lower-case() isn't available to us. We've thus far only set attributes to lowercase,
* not attribute values. Consequently, we need to translate() the letters that would be in 'NONE' ("NOE")
* to lowercase.
*
* @var string
*/
private const DISPLAY_NONE_MATCHER
= '//*[@style and contains(translate(translate(@style," ",""),"NOE","noe"),"display:none")'
. ' and not(@class and contains(concat(" ", normalize-space(@class), " "), " -emogrifier-keep "))]';
/**
* Removes elements that have a "display: none;" style.
*
* @return self fluent interface
*/
public function removeElementsWithDisplayNone(): self
{
$elementsWithStyleDisplayNone = $this->getXPath()->query(self::DISPLAY_NONE_MATCHER);
if ($elementsWithStyleDisplayNone->length === 0) {
return $this;
}
foreach ($elementsWithStyleDisplayNone as $element) {
$parentNode = $element->parentNode;
if ($parentNode !== null) {
$parentNode->removeChild($element);
}
}
return $this;
}
/**
* Removes classes that are no longer required (e.g. because there are no longer any CSS rules that reference them)
* from `class` attributes.
*
* Note that this does not inspect the CSS, but expects to be provided with a list of classes that are still in use.
*
* This method also has the (presumably beneficial) side-effect of minifying (removing superfluous whitespace from)
* `class` attributes.
*
* @param array<array-key, string> $classesToKeep names of classes that should not be removed
*
* @return self fluent interface
*/
public function removeRedundantClasses(array $classesToKeep = []): self
{
$elementsWithClassAttribute = $this->getXPath()->query('//*[@class]');
if ($classesToKeep !== []) {
$this->removeClassesFromElements($elementsWithClassAttribute, $classesToKeep);
} else {
// Avoid unnecessary processing if there are no classes to keep.
$this->removeClassAttributeFromElements($elementsWithClassAttribute);
}
return $this;
}
/**
* Removes classes from the `class` attribute of each element in `$elements`, except any in `$classesToKeep`,
* removing the `class` attribute itself if the resultant list is empty.
*
* @param \DOMNodeList $elements
* @param array<array-key, string> $classesToKeep
*/
private function removeClassesFromElements(\DOMNodeList $elements, array $classesToKeep): void
{
$classesToKeepIntersector = new ArrayIntersector($classesToKeep);
/** @var \DOMElement $element */
foreach ($elements as $element) {
$elementClasses = \preg_split('/\\s++/', \trim($element->getAttribute('class')));
$elementClassesToKeep = $classesToKeepIntersector->intersectWith($elementClasses);
if ($elementClassesToKeep !== []) {
$element->setAttribute('class', \implode(' ', $elementClassesToKeep));
} else {
$element->removeAttribute('class');
}
}
}
/**
* Removes the `class` attribute from each element in `$elements`.
*
* @param \DOMNodeList $elements
*/
private function removeClassAttributeFromElements(\DOMNodeList $elements): void
{
/** @var \DOMElement $element */
foreach ($elements as $element) {
$element->removeAttribute('class');
}
}
/**
* After CSS has been inlined, there will likely be some classes in `class` attributes that are no longer referenced
* by any remaining (uninlinable) CSS. This method removes such classes.
*
* Note that it does not inspect the remaining CSS, but uses information readily available from the `CssInliner`
* instance about the CSS rules that could not be inlined.
*
* @param CssInliner $cssInliner object instance that performed the CSS inlining
*
* @return self fluent interface
*
* @throws \BadMethodCallException if `inlineCss` has not first been called on `$cssInliner`
*/
public function removeRedundantClassesAfterCssInlined(CssInliner $cssInliner): self
{
$classesToKeepAsKeys = [];
foreach ($cssInliner->getMatchingUninlinableSelectors() as $selector) {
\preg_match_all('/\\.(-?+[_a-zA-Z][\\w\\-]*+)/', $selector, $matches);
$classesToKeepAsKeys += \array_fill_keys($matches[1], true);
}
$this->removeRedundantClasses(\array_keys($classesToKeepAsKeys));
return $this;
}
}

View File

@@ -0,0 +1,57 @@
<?php
declare(strict_types=1);
namespace Pelago\Emogrifier\Utilities;
/**
* When computing many array intersections using the same array, it is more efficient to use `array_flip()` first and
* then `array_intersect_key()`, than `array_intersect()`. See the discussion at
* {@link https://stackoverflow.com/questions/6329211/php-array-intersect-efficiency Stack Overflow} for more
* information.
*
* Of course, this is only possible if the arrays contain integer or string values, and either don't contain duplicates,
* or that fact that duplicates will be removed does not matter.
*
* This class takes care of the detail.
*
* @internal
*/
class ArrayIntersector
{
/**
* the array with which the object was constructed, with all its keys exchanged with their associated values
*
* @var array<array-key, array-key>
*/
private $invertedArray;
/**
* Constructs the object with the array that will be reused for many intersection computations.
*
* @param array<array-key, array-key> $array
*/
public function __construct(array $array)
{
$this->invertedArray = \array_flip($array);
}
/**
* Computes the intersection of `$array` and the array with which this object was constructed.
*
* @param array<array-key, array-key> $array
*
* @return array<array-key, array-key>
* Returns an array containing all of the values in `$array` whose values exist in the array
* with which this object was constructed. Note that keys are preserved, order is maintained, but
* duplicates are removed.
*/
public function intersectWith(array $array): array
{
$invertedArray = \array_flip($array);
$invertedIntersection = \array_intersect_key($invertedArray, $this->invertedArray);
return \array_flip($invertedIntersection);
}
}

View File

@@ -0,0 +1,181 @@
<?php
declare(strict_types=1);
namespace Pelago\Emogrifier\Utilities;
/**
* Facilitates building a CSS string by appending rule blocks one at a time, checking whether the media query,
* selectors, or declarations block are the same as those from the preceding block and combining blocks in such cases.
*
* Example:
* $concatenator = new CssConcatenator();
* $concatenator->append(['body'], 'color: blue;');
* $concatenator->append(['body'], 'font-size: 16px;');
* $concatenator->append(['p'], 'margin: 1em 0;');
* $concatenator->append(['ul', 'ol'], 'margin: 1em 0;');
* $concatenator->append(['body'], 'font-size: 14px;', '@media screen and (max-width: 400px)');
* $concatenator->append(['ul', 'ol'], 'margin: 0.75em 0;', '@media screen and (max-width: 400px)');
* $css = $concatenator->getCss();
*
* `$css` (if unminified) would contain the following CSS:
* ` body {
* ` color: blue;
* ` font-size: 16px;
* ` }
* ` p, ul, ol {
* ` margin: 1em 0;
* ` }
* ` @media screen and (max-width: 400px) {
* ` body {
* ` font-size: 14px;
* ` }
* ` ul, ol {
* ` margin: 0.75em 0;
* ` }
* ` }
*
* @internal
*/
class CssConcatenator
{
/**
* Array of media rules in order. Each element is an object with the following properties:
* - string `media` - The media query string, e.g. "@media screen and (max-width:639px)", or an empty string for
* rules not within a media query block;
* - object[] `ruleBlocks` - Array of rule blocks in order, where each element is an object with the following
* properties:
* - mixed[] `selectorsAsKeys` - Array whose keys are selectors for the rule block (values are of no
* significance);
* - string `declarationsBlock` - The property declarations, e.g. "margin-top: 0.5em; padding: 0".
*
* @var array<int, object{
* media: string,
* ruleBlocks: array<int, object{
* selectorsAsKeys: array<string, array-key>,
* declarationsBlock: string
* }>
* }>
*/
private $mediaRules = [];
/**
* Appends a declaration block to the CSS.
*
* @param array<array-key, string> $selectors
* array of selectors for the rule, e.g. ["ul", "ol", "p:first-child"]
* @param string $declarationsBlock
* the property declarations, e.g. "margin-top: 0.5em; padding: 0"
* @param string $media
* the media query for the rule, e.g. "@media screen and (max-width:639px)", or an empty string if none
*/
public function append(array $selectors, string $declarationsBlock, string $media = ''): void
{
$selectorsAsKeys = \array_flip($selectors);
$mediaRule = $this->getOrCreateMediaRuleToAppendTo($media);
$ruleBlocks = $mediaRule->ruleBlocks;
$lastRuleBlock = \end($ruleBlocks);
$hasSameDeclarationsAsLastRule = \is_object($lastRuleBlock)
&& $declarationsBlock === $lastRuleBlock->declarationsBlock;
if ($hasSameDeclarationsAsLastRule) {
$lastRuleBlock->selectorsAsKeys += $selectorsAsKeys;
} else {
$lastRuleBlockSelectors = \is_object($lastRuleBlock) ? $lastRuleBlock->selectorsAsKeys : [];
$hasSameSelectorsAsLastRule = \is_object($lastRuleBlock)
&& self::hasEquivalentSelectors($selectorsAsKeys, $lastRuleBlockSelectors);
if ($hasSameSelectorsAsLastRule) {
$lastDeclarationsBlockWithoutSemicolon = \rtrim(\rtrim($lastRuleBlock->declarationsBlock), ';');
$lastRuleBlock->declarationsBlock = $lastDeclarationsBlockWithoutSemicolon . ';' . $declarationsBlock;
} else {
$mediaRule->ruleBlocks[] = (object)\compact('selectorsAsKeys', 'declarationsBlock');
}
}
}
/**
* @return string
*/
public function getCss(): string
{
return \implode('', \array_map([self::class, 'getMediaRuleCss'], $this->mediaRules));
}
/**
* @param string $media The media query for rules to be appended, e.g. "@media screen and (max-width:639px)",
* or an empty string if none.
*
* @return object{
* media: string,
* ruleBlocks: array<int, object{
* selectorsAsKeys: array<string, array-key>,
* declarationsBlock: string
* }>
* }
*/
private function getOrCreateMediaRuleToAppendTo(string $media): object
{
$lastMediaRule = \end($this->mediaRules);
if (\is_object($lastMediaRule) && $media === $lastMediaRule->media) {
return $lastMediaRule;
}
$newMediaRule = (object)[
'media' => $media,
'ruleBlocks' => [],
];
$this->mediaRules[] = $newMediaRule;
return $newMediaRule;
}
/**
* Tests if two sets of selectors are equivalent (i.e. the same selectors, possibly in a different order).
*
* @param array<string, array-key> $selectorsAsKeys1
* array in which the selectors are the keys, and the values are of no significance
* @param array<string, array-key> $selectorsAsKeys2 another such array
*
* @return bool
*/
private static function hasEquivalentSelectors(array $selectorsAsKeys1, array $selectorsAsKeys2): bool
{
return \count($selectorsAsKeys1) === \count($selectorsAsKeys2)
&& \count($selectorsAsKeys1) === \count($selectorsAsKeys1 + $selectorsAsKeys2);
}
/**
* @param object{
* media: string,
* ruleBlocks: array<int, object{
* selectorsAsKeys: array<string, array-key>,
* declarationsBlock: string
* }>
* } $mediaRule
*
* @return string CSS for the media rule.
*/
private static function getMediaRuleCss(object $mediaRule): string
{
$ruleBlocks = $mediaRule->ruleBlocks;
$css = \implode('', \array_map([self::class, 'getRuleBlockCss'], $ruleBlocks));
$media = $mediaRule->media;
if ($media !== '') {
$css = $media . '{' . $css . '}';
}
return $css;
}
/**
* @param object{selectorsAsKeys: array<string, array-key>, declarationsBlock: string} $ruleBlock
*
* @return string CSS for the rule block.
*/
private static function getRuleBlockCss(object $ruleBlock): string
{
$selectorsAsKeys = $ruleBlock->selectorsAsKeys;
$selectors = \array_keys($selectorsAsKeys);
$declarationsBlock = $ruleBlock->declarationsBlock;
return \implode(',', $selectors) . '{' . $declarationsBlock . '}';
}
}