update Htmlparse.php

Difficult to test - if you find bugs, please open an issue (or replace with the old version from 5/25)
This commit is contained in:
gtbu 2025-06-18 16:47:25 +02:00
parent 2c712361a6
commit af8b95522b

View file

@ -9,6 +9,10 @@ defined('is_running') or die('Not an entry point...');
*/
class HTMLParse
{
/**
 * Names of the HTML void elements (elements that never take a closing tag).
 *
 * parseTag() marks any non-closing tag whose name appears in this list as
 * self-closing, whether or not it was written with a trailing slash.
 * NOTE(review): entries are lowercase and the lookup is an exact string match,
 * so this presumably relies on the tag name being lowercased beforehand - confirm.
 */
const VOID_ELEMENTS = [
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
'link', 'meta', 'param', 'source', 'track', 'wbr'
];
public string $doc = '';
public array $dom_array = [];
public array $errors = [];
@ -115,7 +119,7 @@ class HTMLParse
}
/** @return ?array{name: string, self_closing: bool} */
private function parseTag(): ?array
private function parseTag(): ?array
{
$original_tag_start_pos = $this->position;
$this->position++; // Skip '<'
@ -161,19 +165,24 @@ class HTMLParse
$before_gt_segment = substr($this->doc, $this->position, $gt_pos - $this->position);
$trimmed_before_gt = rtrim($before_gt_segment);
// CHANGE 3: Use substr() for a cleaner, more modern check.
if (substr($trimmed_before_gt, -1) === '/') {
$self_closing = true;
}
}
// **THE FIX IS HERE**
// Ensure HTML5 void elements are always treated as self-closing.
if (!$is_closing_tag_char && in_array($tag_name, self::VOID_ELEMENTS)) {
$self_closing = true;
}
$element['self_closing'] = $self_closing;
$this->dom_array[] = $element;
$this->position = $gt_pos + 1;
return ['name' => $element['tag'], 'self_closing' => $self_closing];
}
private function parseTagName(): ?string
{
$name_len = strspn(
@ -193,15 +202,16 @@ class HTMLParse
}
/** @return array<string, string|null> */
private function parseAttributes(): array
private function parseAttributes(): array
{
$attributes = [];
// This regex finds attributes one by one, from the current position.
$pattern = '/
// This regex finds attributes one by one.
// NOTE: We use ~ as the delimiter to avoid escaping forward slashes /.
$pattern = '~
\G # Anchor to the current position in the string
\s+ # Require at least one space before an attribute
(?!/?>) # Negative lookahead: ensure we are not at the end of the tag (/> or >)
([^\s=<>\/]+) # Capture group 1: The attribute name
([^\s=<>/]+) # Capture group 1: The attribute name (no escape needed for /)
(?: # Optional group for the value part
\s*=\s* # The equals sign, with optional whitespace
(?:
@ -212,7 +222,7 @@ class HTMLParse
([^\s"\'=<>`]+) # Capture group 4: Unquoted value
)
)? # The entire value part is optional (for boolean attributes)
/ix'; // Case-insensitive and extended mode
~ix'; // Case-insensitive, extended mode, and ~ as delimiter
while (preg_match($pattern, $this->doc, $matches, PREG_OFFSET_CAPTURE, $this->position)) {
$name = strtolower($matches[1][0]);