mirror of
https://github.com/gtbu/Typesetter-5.3-p8.git
synced 2025-07-15 19:13:15 +02:00
Added html-sanitizer
Intended as a fallback when Tidy is unavailable; it still needs to be integrated into the CMS.
This commit is contained in:
parent
b72b8ffbf8
commit
5cc9d11a11
6 changed files with 307 additions and 0 deletions
52
include/thirdparty/html-sanitizer/sanitizer-examples.php
vendored
Normal file
52
include/thirdparty/html-sanitizer/sanitizer-examples.php
vendored
Normal file
|
@ -0,0 +1,52 @@
|
|||
<?php

// --- Example using custom validators and srcset ---
//
// Demonstrates per-attribute validator callbacks and srcset filtering.
// The sanitizer class lives next to this file, so load it explicitly.

require_once __DIR__ . '/sanitizer.php';

$customConfig = [
    'attributes' => [
        'img' => ['src', 'srcset', 'width', 'data-id'],
    ],
    'validators' => [
        'img' => [
            // Custom validator for the 'width' attribute on <img> tags:
            // keep digit-only widths up to 500, otherwise fall back to 500.
            'width' => static function (string $value): string {
                if (ctype_digit($value) && (int) $value <= 500) {
                    return $value;
                }

                return '500'; // Return a default/safe value
            },
            // Custom validator for a specific data attribute:
            // keep only ASCII letters, digits and hyphens.
            'data-id' => static function (string $value): ?string {
                return preg_replace('/[^a-zA-Z0-9-]/', '', $value);
            },
        ],
    ],
    'allow_protocol_relative' => true, // Explicitly allow for this instance
];

$dirtyHtml = <<<HTML
<img src="/logo.png"
width="900"
data-id="user-profile-123!@#"
srcset="
//example.com/small.jpg 300w,
javascript:alert(1) 600w,
/large.jpg 1000w
">
HTML;

$sanitizer = new HtmlSanitizer($customConfig);
$cleanHtml = $sanitizer->sanitize($dirtyHtml);

echo $cleanHtml;

/*
Expected Clean Output:

<img src="/logo.png" width="500" data-id="user-profile-123" srcset="//example.com/small.jpg 300w, /large.jpg 1000w">

Breakdown of what happened:
- `width="900"` was changed to `width="500"` by the custom validator.
- `data-id` had special characters stripped by its validator.
- `srcset` had the javascript: URL removed, but the protocol-relative and safe relative URLs were kept.
*/
|
255
include/thirdparty/html-sanitizer/sanitizer.php
vendored
Normal file
255
include/thirdparty/html-sanitizer/sanitizer.php
vendored
Normal file
|
@ -0,0 +1,255 @@
|
|||
<?php
|
||||
|
||||
/**
 * Allowlist-based HTML fragment sanitizer built on DOMDocument.
 *
 * The fragment is parsed into a DOM tree, every node is checked against the
 * configured allowlists, and the surviving tree is serialised back to HTML.
 *
 * Recognised configuration keys (all optional):
 *  - max_depth (int): nodes nested deeper than this are removed.
 *  - max_attributes (int): an element carrying more attributes than this
 *    loses all of them.
 *  - allow_protocol_relative (bool): whether "//host/path" URLs are kept.
 *  - elements (string[]): allowed tag names. A user-supplied list REPLACES
 *    the default list.
 *  - attributes (array<string, string[]>): allowed attribute names per tag;
 *    the key "*" applies to every tag and "data-*" inside "*" allows all
 *    data attributes. A user-supplied list for a tag replaces the default
 *    list of that tag only; other tags keep their defaults.
 *  - schemes (string[]): allowed URL schemes (replaces the default list).
 *  - css_properties (string[]): allowed inline-style properties (replaces
 *    the default list).
 *  - validators (array<string, array<string, callable>>): per tag and
 *    attribute, a callback receiving the attribute value and returning the
 *    value to store, or null to remove the attribute. A validator replaces
 *    the built-in handling of that attribute.
 */
class HtmlSanitizer
{
    /**
     * Disallowed elements whose content is code or raw text rather than
     * document content. They are removed together with everything inside
     * them instead of being unwrapped.
     */
    protected const DROPPED_ELEMENTS = [
        'script', 'style', 'noscript', 'noembed', 'noframes',
        'template', 'xmp', 'plaintext', 'iframe', 'title',
    ];

    protected array $config;

    protected DOMDocument $dom;

    /**
     * @param array $config Overrides for the default configuration
     *                      (see the class description for the keys).
     */
    public function __construct(array $config = [])
    {
        $this->dom = new DOMDocument();
        $this->processConfig($config);
    }

    /**
     * Sanitizes an HTML fragment.
     *
     * The fragment is parsed inside a wrapper <div> so that fragments with
     * several top-level nodes (or only text) survive parsing; only the
     * wrapper's children are returned. Markup that the parser places outside
     * the wrapper (for example after a stray closing </div>) is discarded.
     *
     * @param string $html Untrusted HTML fragment.
     *
     * @return string Sanitized HTML, or '' for blank or unparsable input.
     */
    public function sanitize(string $html): string
    {
        if (trim($html) === '') {
            return '';
        }

        // NUL bytes have no meaning in HTML; drop them before parsing.
        $html = str_replace("\0", '', $html);

        // Silence parser warnings, but restore the caller's libxml error
        // mode afterwards instead of forcing it to "off".
        $previousErrorMode = libxml_use_internal_errors(true);
        try {
            // The <?xml encoding> prefix makes libxml read the input as UTF-8.
            $loaded = $this->dom->loadHTML(
                '<?xml encoding="UTF-8"><div>' . $html . '</div>',
                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
            );
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }

        $container = $loaded ? $this->dom->documentElement : null;
        if ($container === null) {
            return '';
        }

        // Snapshot the child list: walking removes and unwraps nodes.
        foreach (iterator_to_array($container->childNodes, false) as $child) {
            $this->walkNode($child);
        }

        $cleanHtml = '';
        foreach ($container->childNodes as $child) {
            $cleanHtml .= (string) $this->dom->saveHTML($child);
        }

        return $cleanHtml;
    }

    /**
     * Recursively sanitizes a node in place.
     *
     * Non-text, non-element nodes (comments, processing instructions, ...)
     * are removed. Disallowed elements are either dropped with their content
     * (see DROPPED_ELEMENTS) or replaced by their already-sanitized children.
     *
     * @param DOMNode $node  Node to process; must be attached to a parent.
     * @param int     $depth Nesting depth of $node relative to the walk root.
     */
    protected function walkNode(DOMNode $node, int $depth = 0): void
    {
        $parent = $node->parentNode;
        if ($parent === null) {
            return;
        }

        // DoS protection: nesting depth.
        if ($depth > $this->config['max_depth']) {
            $parent->removeChild($node);
            return;
        }

        if (!$node instanceof DOMElement) {
            if (!$node instanceof DOMText) {
                $parent->removeChild($node);
            }
            return;
        }

        $tagName = strtolower($node->tagName);
        $isAllowed = isset($this->config['elements'][$tagName]);

        if (!$isAllowed && in_array($tagName, self::DROPPED_ELEMENTS, true)) {
            $parent->removeChild($node);
            return;
        }

        // Children first, so that anything moved up by unwrapping is clean.
        foreach (iterator_to_array($node->childNodes, false) as $child) {
            $this->walkNode($child, $depth + 1);
        }

        if (!$isAllowed) {
            $this->unwrapNode($node);
            return;
        }

        if (!$node->hasAttributes()) {
            return;
        }

        $attributes = iterator_to_array($node->attributes, false);
        // DoS protection: an excessive attribute count strips them all.
        $stripAll = count($attributes) > $this->config['max_attributes'];

        foreach ($attributes as $attribute) {
            if ($stripAll) {
                $node->removeAttributeNode($attribute);
            } else {
                $this->sanitizeAttribute($node, $attribute);
            }
        }
    }

    /**
     * Removes or rewrites a single attribute according to the configuration.
     *
     * Order of checks: event handlers ("on*") are always removed, then the
     * allowlists are consulted, then a custom validator (if configured) takes
     * over, otherwise the built-in URL / srcset / style handling applies.
     *
     * @param DOMElement $node      Element owning the attribute.
     * @param DOMAttr    $attribute Attribute to check.
     */
    protected function sanitizeAttribute(DOMElement $node, DOMAttr $attribute): void
    {
        $attrName = strtolower($attribute->name);
        $attrValue = $attribute->value;
        $tagName = strtolower($node->tagName);

        if (strncmp($attrName, 'on', 2) === 0) {
            $node->removeAttributeNode($attribute);
            return;
        }

        $allowedAttributes = $this->config['attributes'][$tagName] ?? [];
        $globalAttributes = $this->config['attributes']['*'] ?? [];
        $isDataAttribute = strncmp($attrName, 'data-', 5) === 0;

        $isAllowed = in_array($attrName, $allowedAttributes, true)
            || in_array($attrName, $globalAttributes, true)
            || ($isDataAttribute && in_array('data-*', $globalAttributes, true));

        if (!$isAllowed) {
            $node->removeAttributeNode($attribute);
            return;
        }

        // Apply custom validator if one exists; null means "remove".
        $validator = $this->config['validators'][$tagName][$attrName] ?? null;
        if ($validator && is_callable($validator)) {
            $newValue = $validator($attrValue);
            if ($newValue !== null) {
                $newValue = (string) $newValue;
            }
        } elseif (in_array($attrName, ['href', 'src', 'action'], true)) {
            $newValue = $this->validateUrl($attrValue);
        } elseif ($attrName === 'srcset') {
            $newValue = $this->sanitizeSrcset($attrValue);
            if ($newValue === '') {
                $newValue = null; // no candidate survived
            }
        } elseif ($attrName === 'style') {
            $newValue = $this->sanitizeStyle($attrValue);
            if ($newValue === '') {
                $newValue = null; // no declaration survived
            }
        } else {
            $newValue = $this->sanitizeGenericAttribute($attrValue);
        }

        if ($newValue === null) {
            $node->removeAttributeNode($attribute);
        } else {
            // setAttribute() stores the value literally; escaping happens
            // when the document is serialised.
            $node->setAttribute($attribute->name, $newValue);
        }
    }

    /**
     * Checks a URL against the scheme allowlist.
     *
     * ASCII control characters are removed first because browsers ignore
     * tab/CR/LF inside URLs. A URL has a scheme only when a ':' appears
     * before any '/', '?' or '#'; everything else is a relative reference
     * and is accepted. URLs starting with two slashes or backslashes are
     * protocol-relative and only accepted when 'allow_protocol_relative'
     * is enabled.
     *
     * @param string $url Raw attribute value.
     *
     * @return string|null The cleaned URL, or null when it must be rejected.
     */
    protected function validateUrl(string $url): ?string
    {
        $url = trim((string) preg_replace('/[\x00-\x1F\x7F]+/', '', $url));

        if (preg_match('/^[\/\\\\]{2}/', $url) === 1) {
            return $this->config['allow_protocol_relative'] ? $url : null;
        }

        if (preg_match('/^([^:\/?#]*):/', $url, $match) !== 1) {
            return $url; // relative reference, no scheme present
        }

        $scheme = strtolower($match[1]);

        return in_array($scheme, $this->config['schemes'], true) ? $url : null;
    }

    /**
     * Filters a srcset value, keeping only candidates with an acceptable URL
     * and an empty or simple width/density descriptor.
     *
     * @param string $srcset Raw srcset attribute value.
     *
     * @return string Comma-separated list of surviving candidates (may be '').
     */
    protected function sanitizeSrcset(string $srcset): string
    {
        $validParts = [];

        foreach (explode(',', $srcset) as $candidate) {
            $candidate = trim($candidate);
            if ($candidate === '') {
                continue;
            }

            // "<url> <descriptor>": split on the first run of whitespace.
            $pieces = preg_split('/\s+/', $candidate, 2);
            $url = $this->validateUrl($pieces[0]);
            if ($url === null || $url === '') {
                continue;
            }

            $descriptor = trim($pieces[1] ?? '');
            if ($descriptor === '') {
                $validParts[] = $url;
            } elseif (preg_match('/^[\d.]+[wx]$/D', $descriptor) === 1) {
                $validParts[] = $url . ' ' . $descriptor;
            }
        }

        return implode(', ', $validParts);
    }

    /**
     * Filters an inline style value declaration by declaration.
     *
     * A declaration is kept only when its property is allowlisted and its
     * value contains no CSS escapes or comments (which could hide keywords),
     * none of the blocked keywords, and only url() references that pass
     * validateUrl().
     *
     * @param string $css Raw style attribute value.
     *
     * @return string Surviving declarations joined with '; ' (may be '').
     */
    protected function sanitizeStyle(string $css): string
    {
        $cleanDeclarations = [];

        foreach (explode(';', $css) as $declaration) {
            $pair = explode(':', $declaration, 2);
            if (count($pair) !== 2) {
                continue;
            }

            $property = strtolower(trim($pair[0]));
            $value = trim($pair[1]);

            if ($value === '' || !isset($this->config['css_properties'][$property])) {
                continue;
            }

            // Backslash escapes and comments can disguise "url(" or the
            // keywords checked below, so such values are rejected outright.
            if (strpos($value, '\\') !== false || strpos($value, '/*') !== false) {
                continue;
            }

            if (preg_match('/(expression|javascript|behavior)/i', $value)) {
                continue;
            }

            // Every url( must parse completely and carry an accepted URL.
            $opened = preg_match_all('/url\s*\(/i', $value);
            if ($opened > 0) {
                $parsed = preg_match_all(
                    '/url\s*\(\s*(["\']?)(.*?)\1\s*\)/is',
                    $value,
                    $matches
                );
                if ($parsed !== $opened) {
                    continue;
                }
                foreach ($matches[2] as $cssUrl) {
                    if ($this->validateUrl($cssUrl) === null) {
                        continue 2;
                    }
                }
            }

            $cleanDeclarations[] = "{$property}: {$value}";
        }

        return implode('; ', $cleanDeclarations);
    }

    /**
     * Returns the value to store for an allowlisted attribute that has no
     * dedicated handling.
     *
     * The value is returned unchanged on purpose: DOMElement::setAttribute()
     * stores it literally and the serialiser escapes it on output, so
     * escaping here as well would double-encode it ("&" -> "&amp;amp;").
     *
     * @param string $value Raw (already entity-decoded) attribute value.
     *
     * @return string Value to store on the element.
     */
    protected function sanitizeGenericAttribute(string $value): string
    {
        return $value;
    }

    /**
     * Replaces a node by its children.
     *
     * CDATA sections are converted to plain text nodes while being moved:
     * the HTML serialiser writes CDATA content without escaping, so leaving
     * them in place outside their original element could emit raw markup.
     *
     * @param DOMNode $node Node to unwrap; must be attached to a parent.
     */
    protected function unwrapNode(DOMNode $node): void
    {
        $parent = $node->parentNode;
        if ($parent === null) {
            return;
        }

        while ($node->firstChild !== null) {
            $child = $node->removeChild($node->firstChild);
            if ($child instanceof DOMCdataSection && $node->ownerDocument !== null) {
                $child = $node->ownerDocument->createTextNode($child->data);
            }
            $parent->insertBefore($child, $node);
        }

        $parent->removeChild($node);
    }

    /**
     * Merges the user configuration over the defaults and stores the result.
     *
     * Plain lists ('elements', 'schemes', 'css_properties') and scalar
     * options are replaced by the user's value. The per-tag maps
     * ('attributes', 'validators') are merged by tag name, the user's entry
     * for a tag replacing the default entry for that tag. Lists are never
     * merged by position.
     *
     * @param array $userConfig Configuration passed to the constructor.
     */
    protected function processConfig(array $userConfig): void
    {
        $config = [
            'max_depth' => 100,
            'max_attributes' => 50,
            'allow_protocol_relative' => false,
            'elements' => [
                'a', 'b', 'strong', 'i', 'em', 'u', 'p', 'br', 'div', 'span', 'ul', 'ol', 'li',
                'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'pre', 'code', 'img',
                'table', 'thead', 'tbody', 'tfoot', 'tr', 'td', 'th',
            ],
            'attributes' => [
                'a' => ['href', 'title'],
                'img' => ['src', 'srcset', 'alt', 'title', 'width', 'height'],
                '*' => ['class', 'id', 'title', 'data-*'],
            ],
            'schemes' => ['http', 'https', 'mailto'],
            'css_properties' => [
                'color', 'background-color', 'font-size', 'font-weight',
                'text-align', 'text-decoration', 'width', 'height',
                'margin', 'padding', 'border', 'border-collapse',
            ],
            'validators' => [],
        ];

        foreach ($userConfig as $option => $value) {
            $isTagMap = $option === 'attributes' || $option === 'validators';
            if ($isTagMap && is_array($value)) {
                $config[$option] = array_replace($config[$option], $value);
            } else {
                $config[$option] = $value;
            }
        }

        // Lookups are done on lower-cased names; store sets as key maps.
        $config['elements'] = array_fill_keys(
            array_map('strtolower', (array) $config['elements']),
            1
        );
        $config['css_properties'] = array_fill_keys(
            array_map('strtolower', (array) $config['css_properties']),
            1
        );
        $config['schemes'] = array_map('strtolower', array_values((array) $config['schemes']));

        $this->config = $config;
    }
}
|
Loading…
Reference in a new issue