From af8b95522ba58ad71b83c8782c4304e3c6dac5fa Mon Sep 17 00:00:00 2001 From: gtbu Date: Wed, 18 Jun 2025 16:47:25 +0200 Subject: [PATCH] update Htmlparse.php Difficult to test - if you find bugs please make an issue (or replace with the old version form 5/25) --- include/tool/Editing/HTMLParse.php | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/include/tool/Editing/HTMLParse.php b/include/tool/Editing/HTMLParse.php index 5461885..1944e6d 100644 --- a/include/tool/Editing/HTMLParse.php +++ b/include/tool/Editing/HTMLParse.php @@ -9,6 +9,10 @@ defined('is_running') or die('Not an entry point...'); */ class HTMLParse { + const VOID_ELEMENTS = [ + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', + 'link', 'meta', 'param', 'source', 'track', 'wbr' + ]; public string $doc = ''; public array $dom_array = []; public array $errors = []; @@ -115,7 +119,7 @@ class HTMLParse } /** @return ?array{name: string, self_closing: bool} */ - private function parseTag(): ?array + private function parseTag(): ?array { $original_tag_start_pos = $this->position; $this->position++; // Skip '<' @@ -161,19 +165,24 @@ class HTMLParse $before_gt_segment = substr($this->doc, $this->position, $gt_pos - $this->position); $trimmed_before_gt = rtrim($before_gt_segment); - // CHANGE 3: Use substr() for a cleaner, more modern check. if (substr($trimmed_before_gt, -1) === '/') { $self_closing = true; } } - + + // **THE FIX IS HERE** + // Ensure HTML5 void elements are always treated as self-closing. + if (!$is_closing_tag_char && in_array($tag_name, self::VOID_ELEMENTS)) { + $self_closing = true; + } + $element['self_closing'] = $self_closing; $this->dom_array[] = $element; $this->position = $gt_pos + 1; return ['name' => $element['tag'], 'self_closing' => $self_closing]; } - + private function parseTagName(): ?string { $name_len = strspn( @@ -193,15 +202,16 @@ class HTMLParse } /** @return array */ - private function parseAttributes(): array + private function parseAttributes(): array { $attributes = []; - // This regex finds attributes one by one, from the current position. - $pattern = '/ + // This regex finds attributes one by one. + // NOTE: We use ~ as the delimiter to avoid escaping forward slashes /. + $pattern = '~ \G # Anchor to the current position in the string \s+ # Require at least one space before an attribute (?!/?>) # Negative lookahead: ensure we are not at the end of the tag (/> or >) - ([^\s=<>\/]+) # Capture group 1: The attribute name + ([^\s=<>/]+) # Capture group 1: The attribute name (no escape needed for /) (?: # Optional group for the value part \s*=\s* # The equals sign, with optional whitespace (?: @@ -212,7 +222,7 @@ class HTMLParse ([^\s"\'=<>`]+) # Capture group 4: Unquoted value ) )? # The entire value part is optional (for boolean attributes) - /ix'; // Case-insensitive and extended mode + ~ix'; // Case-insensitive, extended mode, and ~ as delimiter while (preg_match($pattern, $this->doc, $matches, PREG_OFFSET_CAPTURE, $this->position)) { $name = strtolower($matches[1][0]);