From 5cc9d11a111dca3591d0516c5dfc7a0d995b82ba Mon Sep 17 00:00:00 2001
From: gtbu
Date: Sun, 13 Jul 2025 15:06:37 +0200
Subject: [PATCH] Added html-sanitizer

If Tidy is unavailable - still needs integration into the cms
---
Note: the blob ids on the "index" lines below are those of the files as
originally submitted; the file contents in this revision differ.

 .../{HTMLawed => html-sanitizer}/LICENSE-GPL2 |   0
 .../LICENSE-LGPL3                             |   0
 .../{HTMLawed => html-sanitizer}/README.md    |   0
 .../{HTMLawed => html-sanitizer}/htmLawed.php |   0
 .../html-sanitizer/sanitizer-examples.php     |  51 +++
 .../thirdparty/html-sanitizer/sanitizer.php   | 390 ++++++++++++++++++
 6 files changed, 441 insertions(+)
 rename include/thirdparty/{HTMLawed => html-sanitizer}/LICENSE-GPL2 (100%)
 rename include/thirdparty/{HTMLawed => html-sanitizer}/LICENSE-LGPL3 (100%)
 rename include/thirdparty/{HTMLawed => html-sanitizer}/README.md (100%)
 rename include/thirdparty/{HTMLawed => html-sanitizer}/htmLawed.php (100%)
 create mode 100644 include/thirdparty/html-sanitizer/sanitizer-examples.php
 create mode 100644 include/thirdparty/html-sanitizer/sanitizer.php

diff --git a/include/thirdparty/HTMLawed/LICENSE-GPL2 b/include/thirdparty/html-sanitizer/LICENSE-GPL2
similarity index 100%
rename from include/thirdparty/HTMLawed/LICENSE-GPL2
rename to include/thirdparty/html-sanitizer/LICENSE-GPL2
diff --git a/include/thirdparty/HTMLawed/LICENSE-LGPL3 b/include/thirdparty/html-sanitizer/LICENSE-LGPL3
similarity index 100%
rename from include/thirdparty/HTMLawed/LICENSE-LGPL3
rename to include/thirdparty/html-sanitizer/LICENSE-LGPL3
diff --git a/include/thirdparty/HTMLawed/README.md b/include/thirdparty/html-sanitizer/README.md
similarity index 100%
rename from include/thirdparty/HTMLawed/README.md
rename to include/thirdparty/html-sanitizer/README.md
diff --git a/include/thirdparty/HTMLawed/htmLawed.php b/include/thirdparty/html-sanitizer/htmLawed.php
similarity index 100%
rename from include/thirdparty/HTMLawed/htmLawed.php
rename to include/thirdparty/html-sanitizer/htmLawed.php
diff --git a/include/thirdparty/html-sanitizer/sanitizer-examples.php b/include/thirdparty/html-sanitizer/sanitizer-examples.php
new file mode 100644
index 0000000..b80b159
--- /dev/null
+++ b/include/thirdparty/html-sanitizer/sanitizer-examples.php
@@ -0,0 +1,51 @@
+<?php
+
+// --- Example using custom validators and srcset ---
+
+require_once __DIR__ . '/sanitizer.php';
+
+$customConfig = [
+    'attributes' => [
+        'img' => ['src', 'srcset', 'width', 'data-id'],
+    ],
+    'validators' => [
+        'img' => [
+            // Custom validator for the 'width' attribute on <img> tags
+            'width' => function ($value) {
+                // Ensure width is a digit and not larger than 500
+                if (ctype_digit($value) && (int) $value <= 500) {
+                    return $value;
+                }
+                return '500'; // Return a default/safe value
+            },
+            // Custom validator for a specific data attribute
+            'data-id' => function ($value) {
+                // Allow only alphanumeric IDs
+                return preg_replace('/[^a-zA-Z0-9-]/', '', $value);
+            },
+        ],
+    ],
+    'allow_protocol_relative' => true, // Explicitly allow for this instance
+];
+
+// NOTE(review): the sample markup was lost from the original patch text and
+// has been reconstructed from the breakdown below - confirm against upstream.
+$dirtyHtml = <<<'HTML'
+<img src="photo.jpg" width="900" data-id="item#42!" srcset="javascript:alert(1) 1x, //cdn.example.com/photo-2x.jpg 2x, /img/photo-3x.jpg 3x">
+HTML;
+
+$sanitizer = new HtmlSanitizer($customConfig);
+$cleanHtml = $sanitizer->sanitize($dirtyHtml);
+
+echo $cleanHtml;
+
+/*
+Expected Clean Output:
+
+<img src="photo.jpg" width="500" data-id="item42" srcset="//cdn.example.com/photo-2x.jpg 2x, /img/photo-3x.jpg 3x">
+
+Breakdown of what happened:
+- `width="900"` was changed to `width="500"` by the custom validator.
+- `data-id` had special characters stripped by its validator.
+- `srcset` had the javascript: URL removed, but the protocol-relative and safe relative URLs were kept.
+*/
diff --git a/include/thirdparty/html-sanitizer/sanitizer.php b/include/thirdparty/html-sanitizer/sanitizer.php
new file mode 100644
index 0000000..60b5cc5
--- /dev/null
+++ b/include/thirdparty/html-sanitizer/sanitizer.php
@@ -0,0 +1,390 @@
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Allowlist-based HTML sanitizer built on DOMDocument.
+ *
+ * Elements that are not allowed are unwrapped (their children are kept),
+ * except for elements whose content must never be re-exposed as markup,
+ * which are dropped together with their content. Attributes are filtered
+ * per element; URL, srcset and style values are validated.
+ *
+ * NOTE(review): the class header, properties and constructor signature were
+ * lost from the original patch text and are reconstructed from how the
+ * class is used - confirm against upstream.
+ */
+class HtmlSanitizer
+{
+    /**
+     * Elements removed together with their content when they are not allowed.
+     * libxml keeps script/style content as raw character data that would be
+     * serialized unescaped after unwrapping, re-exposing it as markup.
+     */
+    private const DROP_WITH_CONTENT = [
+        'script', 'style', 'template', 'iframe', 'noscript', 'noembed',
+        'noframes', 'xmp', 'plaintext', 'object', 'embed', 'applet',
+        'svg', 'math', 'title', 'textarea',
+    ];
+
+    /** Attributes whose value is a single URL. */
+    private const URL_ATTRIBUTES = [
+        'href', 'src', 'action', 'formaction', 'cite', 'poster', 'background',
+    ];
+
+    /** @var array Effective configuration, built by processConfig(). */
+    protected $config = [];
+
+    /** @var DOMDocument Document reused by every sanitize() call. */
+    protected $dom;
+
+    /**
+     * @param array $config Overrides for the defaults in processConfig().
+     */
+    public function __construct(array $config = [])
+    {
+        $this->dom = new DOMDocument();
+        $this->processConfig($config);
+    }
+
+    /**
+     * Sanitize an HTML fragment.
+     *
+     * @param string $html Untrusted HTML fragment, expected to be UTF-8.
+     * @return string Sanitized fragment; '' for blank or unparsable input.
+     */
+    public function sanitize(string $html): string
+    {
+        if (trim($html) === '') {
+            return '';
+        }
+
+        // Remember the caller's libxml error mode so it can be restored.
+        $previous = libxml_use_internal_errors(true);
+        try {
+            // The XML declaration makes libxml read the input as UTF-8. The
+            // wrapper <div> gives the fragment a single root element, which
+            // LIBXML_HTML_NOIMPLIED needs to keep sibling nodes intact.
+            $loaded = $this->dom->loadHTML(
+                '<?xml encoding="UTF-8"><div>' . $html . '</div>',
+                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD | LIBXML_NONET
+            );
+        } finally {
+            libxml_clear_errors();
+            libxml_use_internal_errors($previous);
+        }
+
+        $root = $loaded ? $this->dom->documentElement : null;
+        if ($root === null) {
+            return '';
+        }
+
+        // Walk a snapshot: walkNode() removes and unwraps nodes as it goes.
+        foreach (iterator_to_array($root->childNodes, false) as $child) {
+            $this->walkNode($child, 1);
+        }
+
+        $cleanHtml = '';
+        foreach ($root->childNodes as $node) {
+            $cleanHtml .= $this->dom->saveHTML($node);
+        }
+
+        return $cleanHtml;
+    }
+
+    /**
+     * Sanitize $node and its subtree in place, children first.
+     *
+     * @param DOMNode $node  Node to process; it may be removed or unwrapped.
+     * @param int     $depth Nesting depth of $node.
+     */
+    protected function walkNode(DOMNode $node, int $depth = 0): void
+    {
+        // DoS protection: drop subtrees nested deeper than max_depth.
+        if ($depth > $this->config['max_depth']) {
+            $this->removeNode($node);
+            return;
+        }
+
+        if (!$node instanceof DOMElement) {
+            // Keep text; drop comments, processing instructions and the like.
+            if (!$node instanceof DOMText) {
+                $this->removeNode($node);
+            }
+            return;
+        }
+
+        $tagName = strtolower($node->tagName);
+        $isAllowed = isset($this->config['elements'][$tagName]);
+
+        if (!$isAllowed && in_array($tagName, self::DROP_WITH_CONTENT, true)) {
+            $this->removeNode($node);
+            return;
+        }
+
+        foreach (iterator_to_array($node->childNodes, false) as $child) {
+            $this->walkNode($child, $depth + 1);
+        }
+
+        if (!$isAllowed) {
+            $this->unwrapNode($node);
+            return;
+        }
+
+        // Snapshot the attributes: the live map shrinks while entries go.
+        $attributes = iterator_to_array($node->attributes, false);
+        // DoS protection: an excessive attribute count strips them all.
+        $stripAll = count($attributes) > $this->config['max_attributes'];
+        foreach ($attributes as $attribute) {
+            if ($stripAll) {
+                $node->removeAttributeNode($attribute);
+            } else {
+                $this->sanitizeAttribute($node, $attribute);
+            }
+        }
+    }
+
+    /**
+     * Validate one attribute of an allowed element: keep, rewrite or remove.
+     *
+     * Values are stored unescaped; the DOM serializer escapes them on output.
+     *
+     * @param DOMElement $node      Element owning the attribute.
+     * @param DOMAttr    $attribute Attribute to check.
+     */
+    protected function sanitizeAttribute(DOMElement $node, DOMAttr $attribute): void
+    {
+        $attrName = strtolower($attribute->name);
+        $attrValue = $attribute->value;
+        $tagName = strtolower($node->tagName);
+
+        $allowedAttributes = $this->config['attributes'][$tagName] ?? [];
+        $globalAttributes = $this->config['attributes']['*'] ?? [];
+        $isAllowed = in_array($attrName, $allowedAttributes, true)
+            || in_array($attrName, $globalAttributes, true)
+            || (strpos($attrName, 'data-') === 0 && in_array('data-*', $globalAttributes, true));
+
+        // Event handler attributes (on*) are rejected even when configured.
+        if (!$isAllowed || strpos($attrName, 'on') === 0) {
+            $node->removeAttributeNode($attribute);
+            return;
+        }
+
+        $validator = $this->config['validators'][$tagName][$attrName] ?? null;
+        if (is_callable($validator)) {
+            // A custom validator replaces the default handling entirely.
+            $newValue = $validator($attrValue);
+        } elseif (in_array($attrName, self::URL_ATTRIBUTES, true)) {
+            $newValue = $this->validateUrl($attrValue);
+        } elseif ($attrName === 'srcset') {
+            $newValue = $this->sanitizeSrcset($attrValue);
+        } elseif ($attrName === 'style') {
+            $newValue = $this->sanitizeStyle($attrValue);
+        } else {
+            $newValue = $this->sanitizeGenericAttribute($attrValue);
+        }
+
+        // An emptied srcset/style carries no information: drop the attribute.
+        $isEmptied = $newValue === '' && in_array($attrName, ['srcset', 'style'], true);
+        if ($newValue === null || $isEmptied) {
+            $node->removeAttributeNode($attribute);
+        } else {
+            $node->setAttribute($attribute->name, (string) $newValue);
+        }
+    }
+
+    /**
+     * Check a URL against the allowed schemes.
+     *
+     * @param string $url Raw URL from an attribute or a CSS url().
+     * @return string|null Cleaned URL, or null when it must be rejected.
+     */
+    protected function validateUrl(string $url): ?string
+    {
+        // Browsers ignore surrounding control characters and spaces as well
+        // as tabs and newlines anywhere in a URL, so "java\nscript:" must
+        // be treated as "javascript:".
+        $url = trim($url, "\x00..\x20");
+        $url = str_replace(["\t", "\n", "\r"], '', $url);
+
+        // Browsers read a backslash as a slash, so "\\host" is "//host".
+        if (preg_match('~^[/\\\\]{2}~', $url) === 1) {
+            return $this->config['allow_protocol_relative'] ? $url : null;
+        }
+
+        // No colon before the path, query or fragment: a relative URL.
+        if (preg_match('~^([^/?#:]*):~', $url, $match) !== 1) {
+            return $url;
+        }
+
+        $scheme = strtolower($match[1]);
+        $isValid = preg_match('~^[a-z][a-z0-9+.-]*$~', $scheme) === 1
+            && in_array($scheme, $this->config['schemes'], true);
+
+        return $isValid ? $url : null;
+    }
+
+    /**
+     * Keep only the srcset candidates with an allowed URL and a valid descriptor.
+     *
+     * @param string $srcset Raw srcset value.
+     * @return string Cleaned candidate list, '' when nothing is left.
+     */
+    protected function sanitizeSrcset(string $srcset): string
+    {
+        $validParts = [];
+        foreach (explode(',', $srcset) as $part) {
+            $pieces = preg_split('/\s+/', trim($part), 2);
+            $url = $this->validateUrl($pieces[0]);
+            $descriptor = trim($pieces[1] ?? '');
+
+            if ($url === null || $url === '') {
+                continue;
+            }
+            // Width ("480w") or pixel-density ("1.5x") descriptor, if any.
+            if ($descriptor !== '' && preg_match('/^(\d+w|\d*\.?\d+x)$/', $descriptor) !== 1) {
+                continue;
+            }
+
+            $validParts[] = $descriptor === '' ? $url : "{$url} {$descriptor}";
+        }
+
+        return implode(', ', $validParts);
+    }
+
+    /**
+     * Keep only the style declarations with an allowed property and safe value.
+     *
+     * @param string $css Raw style attribute value.
+     * @return string Cleaned declarations joined by '; ', '' when none is left.
+     */
+    protected function sanitizeStyle(string $css): string
+    {
+        $cleanDeclarations = [];
+
+        foreach (explode(';', $css) as $declaration) {
+            $pair = explode(':', $declaration, 2);
+            if (count($pair) !== 2) {
+                continue;
+            }
+
+            $property = strtolower(trim($pair[0]));
+            $value = trim($pair[1]);
+            if ($value === '' || !isset($this->config['css_properties'][$property])) {
+                continue;
+            }
+
+            // Escapes and comments can hide keywords from the checks below.
+            if (preg_match('~\\\\|/\*|expression|javascript|behavior~i', $value) === 1) {
+                continue;
+            }
+
+            // Every url() must be well formed and point to an allowed URL.
+            $urlCount = preg_match_all('/url\s*\(/i', $value);
+            $parsed = preg_match_all('/url\s*\(\s*(["\']?)([^"\'()]*)\1\s*\)/i', $value, $matches);
+            if ($urlCount !== $parsed) {
+                continue;
+            }
+            foreach ($matches[2] as $url) {
+                if ($this->validateUrl($url) === null) {
+                    continue 2;
+                }
+            }
+
+            $cleanDeclarations[] = "{$property}: {$value}";
+        }
+
+        return implode('; ', $cleanDeclarations);
+    }
+
+    /**
+     * Clean a plain attribute value by removing control characters.
+     *
+     * The value is deliberately not HTML-escaped: the DOM serializer escapes
+     * attribute values itself, so escaping here would double-encode them.
+     *
+     * @param string $value Raw attribute value.
+     * @return string Cleaned value.
+     */
+    protected function sanitizeGenericAttribute(string $value): string
+    {
+        return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/', '', $value) ?? '';
+    }
+
+    /**
+     * Replace $node by its children, keeping their order.
+     *
+     * @param DOMNode $node Node to unwrap; ignored when it has no parent.
+     */
+    protected function unwrapNode(DOMNode $node): void
+    {
+        $parent = $node->parentNode;
+        if ($parent === null) {
+            return;
+        }
+
+        while ($node->firstChild !== null) {
+            $parent->insertBefore($node->firstChild, $node);
+        }
+        $parent->removeChild($node);
+    }
+
+    /**
+     * Detach $node and its subtree from the document, if it is attached.
+     */
+    private function removeNode(DOMNode $node): void
+    {
+        if ($node->parentNode !== null) {
+            $node->parentNode->removeChild($node);
+        }
+    }
+
+    /**
+     * Build the effective configuration from the defaults and $userConfig.
+     *
+     * A user value replaces the default for its key as a whole, so lists
+     * such as 'elements' or 'schemes' can be narrowed. 'attributes' and
+     * 'validators' are merged per tag name instead. Merging lists index by
+     * index (array_replace_recursive) would keep unwanted default entries.
+     *
+     * @param array $userConfig Overrides keyed like $defaultConfig below.
+     */
+    protected function processConfig(array $userConfig): void
+    {
+        $defaultConfig = [
+            'max_depth' => 100,
+            'max_attributes' => 50,
+            'allow_protocol_relative' => false,
+            'elements' => [
+                'a', 'b', 'strong', 'i', 'em', 'u', 'p', 'br', 'div', 'span', 'ul', 'ol', 'li',
+                'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'pre', 'code', 'img',
+                'table', 'thead', 'tbody', 'tfoot', 'tr', 'td', 'th',
+            ],
+            'attributes' => [
+                'a' => ['href', 'title'],
+                'img' => ['src', 'srcset', 'alt', 'title', 'width', 'height'],
+                '*' => ['class', 'id', 'title', 'data-*'],
+            ],
+            'schemes' => ['http', 'https', 'mailto'],
+            'css_properties' => [
+                'color', 'background-color', 'font-size', 'font-weight',
+                'text-align', 'text-decoration', 'width', 'height',
+                'margin', 'padding', 'border', 'border-collapse',
+            ],
+            'validators' => [],
+        ];
+
+        $config = array_replace($defaultConfig, $userConfig);
+        foreach (['attributes', 'validators'] as $key) {
+            $config[$key] = array_replace($defaultConfig[$key], (array) ($userConfig[$key] ?? []));
+        }
+
+        $config['schemes'] = array_map('strtolower', array_values((array) $config['schemes']));
+        // Turn the lists into lookup maps for isset() checks.
+        foreach (['elements', 'css_properties'] as $key) {
+            $config[$key] = array_fill_keys(array_map('strtolower', (array) $config[$key]), 1);
+        }
+
+        $this->config = $config;
+    }
+}