/**
 * A custom, non-validating HTML parser that converts an HTML string into a flat array.
 * It is designed to be fast and to tolerate real-world, often imperfect, HTML.
 *
 * After construction, $dom_array holds the document in source order. Each entry is one of:
 *  - string                                   text between tags, or the raw body of a script/style element
 *  - ['comment' => string]                    the text between the comment delimiters
 *  - ['tag' => 'name', 'attributes' => [...], 'self_closing' => bool]   an opening tag
 *  - ['tag' => '/name', 'self_closing' => false]                        a closing tag
 *
 * Recoverable problems (unclosed comment, tag or script/style element) are appended to $errors.
 */
class HTMLParse
{
    /** Characters accepted in a tag name. */
    private const TAG_NAME_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_:.-';

    /** Elements whose content is kept verbatim up to the matching end tag. */
    private const RAW_TEXT_TAGS = ['script', 'style'];

    /**
     * Matches one attribute starting at the offset handed to preg_match().
     *
     * The delimiter is "~" on purpose: the pattern has to contain a literal "/",
     * which must never coincide with the delimiter (PHP ends the pattern at the
     * first unescaped delimiter, even inside an extended-mode comment).
     */
    private const ATTRIBUTE_PATTERN = '~
        \G                        # anchor at the offset passed to preg_match()
        \s+                       # at least one whitespace character before the attribute
        (?!/?>)                   # stop when only the end of the tag is left
        ([^\s=<>/]+)              # 1: attribute name
        (?:
            \s*=\s*
            (?|                   # branch reset: the value is always group 2
                "([^"]*)"         #    double-quoted value
              | \'([^\']*)\'      #    single-quoted value
              | ([^\s"\'=<>`]+)   #    unquoted value
            )
        )?                        # no value part at all: boolean attribute
    ~x';

    /** The HTML source being parsed. */
    public string $doc = '';

    /** @var list<string|array<string, mixed>> Parsed nodes in source order (see class description). */
    public array $dom_array = [];

    /** @var list<string> Human-readable messages, each prefixed with the byte position it was raised at. */
    public array $errors = [];

    /** Byte length of $doc, captured once in the constructor. */
    private int $doc_length;

    /** Byte offset of the parser cursor within $doc. */
    private int $position = 0;

    /**
     * Stores the document and parses it immediately.
     *
     * @param string $text HTML source
     */
    public function __construct(string $text)
    {
        $this->doc = $text;
        $this->doc_length = strlen($text);
        $this->Init_Parse();
        $this->Parse();
    }

    /**
     * Pre-parse hook, called by the constructor before Parse().
     *
     * The parser no longer needs any per-document preparation; the method is
     * kept so existing callers and overriding subclasses continue to work.
     */
    public function Init_Parse(): void
    {
    }

    /**
     * Walks the document from the current cursor to its end and appends every
     * node found to $dom_array.
     */
    public function Parse(): void
    {
        while ($this->position < $this->doc_length) {
            if ($this->doc[$this->position] !== '<') {
                $this->parseTextContent();
                continue;
            }

            if ($this->handleCommentIfAny()) {
                continue;
            }

            $tag_info = $this->parseTag();
            if ($tag_info === null) {
                // Not a tag after all: keep the '<' as literal text and move on.
                $this->dom_array[] = '<';
                $this->position++;
                continue;
            }

            // Only a real opening tag can introduce raw script/style content.
            if ($tag_info['name'][0] !== '/' && !$tag_info['self_closing']) {
                $this->handleSpecialContent($tag_info['name']);
            }
        }
    }

    /**
     * Records a message together with the current cursor position.
     */
    private function addError(string $message): void
    {
        $this->errors[] = "Error at position {$this->position}: {$message}";
    }

    /**
     * Consumes everything up to (not including) the next '<' as one text node.
     * Empty text is not recorded.
     */
    private function parseTextContent(): void
    {
        $next_tag_pos = strpos($this->doc, '<', $this->position);
        $end = $next_tag_pos === false ? $this->doc_length : $next_tag_pos;

        $text = substr($this->doc, $this->position, $end - $this->position);
        $this->position = $end;

        if ($text !== '') {
            $this->dom_array[] = $text;
        }
    }

    /**
     * Consumes an HTML comment if one starts at the cursor.
     *
     * An unterminated comment swallows the rest of the document and is reported
     * in $errors.
     *
     * @return bool true when a comment was consumed, false when the cursor was left untouched
     */
    private function handleCommentIfAny(): bool
    {
        if (substr_compare($this->doc, '<!--', $this->position, 4) !== 0) {
            return false;
        }

        $content_start = $this->position + 4;
        $end_pos = strpos($this->doc, '-->', $content_start);

        if ($end_pos === false) {
            $content = substr($this->doc, $content_start);
            $this->position = $this->doc_length;
            $this->addError("Unclosed HTML comment.");
        } else {
            $content = substr($this->doc, $content_start, $end_pos - $content_start);
            $this->position = $end_pos + 3;
        }

        $this->dom_array[] = ['comment' => $content];
        return true;
    }

    /**
     * Parses the opening or closing tag that starts at the cursor and appends it
     * to $dom_array.
     *
     * When no valid tag name follows the '<', nothing is recorded, the cursor is
     * restored to the '<' and null is returned. A tag without a terminating '>'
     * is still recorded, reported in $errors, and consumes the rest of the document.
     *
     * @return ?array{name: string, self_closing: bool} name carries a leading '/' for closing tags
     */
    private function parseTag(): ?array
    {
        $tag_start = $this->position;
        $after_lt = $tag_start + 1;

        if ($after_lt >= $this->doc_length) {
            return null; // a lone '<' at the very end of the document
        }

        $is_closing_tag = $this->doc[$after_lt] === '/';
        $this->position = $is_closing_tag ? $after_lt + 1 : $after_lt;

        $tag_name = $this->parseTagName();
        if ($tag_name === null) {
            $this->position = $tag_start; // backtrack
            return null;
        }

        if ($is_closing_tag) {
            $element = ['tag' => '/' . $tag_name];
        } else {
            // Attributes are only parsed for opening tags.
            $element = ['tag' => $tag_name, 'attributes' => $this->parseAttributes()];
        }

        $gt_pos = strpos($this->doc, '>', $this->position);
        if ($gt_pos === false) {
            $this->addError("Unclosed tag '{$element['tag']}'.");
            $this->position = $this->doc_length; // consume the rest of the document
            $element['self_closing'] = false;
            $this->dom_array[] = $element;
            return ['name' => $element['tag'], 'self_closing' => false];
        }

        // XML-style self-closing: the last non-blank character before '>' is '/'.
        $self_closing = false;
        if (!$is_closing_tag) {
            $before_gt = rtrim(substr($this->doc, $this->position, $gt_pos - $this->position));
            $self_closing = substr($before_gt, -1) === '/';
        }

        $element['self_closing'] = $self_closing;
        $this->dom_array[] = $element;
        $this->position = $gt_pos + 1;

        return ['name' => $element['tag'], 'self_closing' => $self_closing];
    }

    /**
     * Reads a tag name at the cursor and advances past it.
     *
     * @return ?string the lower-cased name, or null (cursor unchanged) when no name character is present
     */
    private function parseTagName(): ?string
    {
        $name_len = strspn($this->doc, self::TAG_NAME_CHARS, $this->position);
        if ($name_len === 0) {
            return null;
        }

        $name = substr($this->doc, $this->position, $name_len);
        $this->position += $name_len;

        return strtolower($name);
    }

    /**
     * Reads consecutive attributes at the cursor and advances past them.
     *
     * Names are lower-cased and values are entity-decoded. An attribute without
     * a value part (e.g. "disabled") maps to null. When a name occurs more than
     * once, the first occurrence wins.
     *
     * @return array<string, ?string>
     */
    private function parseAttributes(): array
    {
        $attributes = [];

        // PREG_UNMATCHED_AS_NULL makes "no value" distinguishable from an empty value ("").
        while (
            preg_match(
                self::ATTRIBUTE_PATTERN,
                $this->doc,
                $match,
                PREG_UNMATCHED_AS_NULL,
                $this->position
            ) === 1
        ) {
            $name = strtolower($match[1]);

            // array_key_exists(): a boolean attribute stored as null must also block later duplicates.
            if (!array_key_exists($name, $attributes)) {
                $value = $match[2] ?? null;
                $attributes[$name] = $value === null
                    ? null
                    : htmlspecialchars_decode($value, ENT_QUOTES);
            }

            // \G pins the match to the cursor, so its length is exactly how far to advance.
            $this->position += strlen($match[0]);
        }

        return $attributes;
    }

    /**
     * After an opening script or style tag, consumes the raw element content
     * together with its end tag; does nothing for any other tag name.
     *
     * The content is appended to $dom_array as a plain string (omitted when
     * empty). The end tag itself is consumed but not recorded. The first
     * case-insensitive occurrence of the end tag terminates the content. If it
     * is missing, the rest of the document becomes the content and an error is
     * recorded.
     *
     * @param string $tag_name_from_parser lower-cased tag name as returned by parseTag()
     */
    private function handleSpecialContent(string $tag_name_from_parser): void
    {
        if (!in_array($tag_name_from_parser, self::RAW_TEXT_TAGS, true)) {
            return;
        }

        $content_start = $this->position;
        if ($content_start >= $this->doc_length) {
            $this->addError("Unclosed special tag '<{$tag_name_from_parser}>'.");
            return;
        }

        // Search the document itself so that the offset found is the offset used;
        // no intermediate copy with different byte lengths is involved.
        $end_tag = "</{$tag_name_from_parser}>";
        $end_tag_pos = stripos($this->doc, $end_tag, $content_start);

        if ($end_tag_pos === false) {
            $this->addError("Unclosed special tag '<{$tag_name_from_parser}>'.");
            $content = substr($this->doc, $content_start);
            $this->position = $this->doc_length;
        } else {
            $content = substr($this->doc, $content_start, $end_tag_pos - $content_start);
            $this->position = $end_tag_pos + strlen($end_tag);
        }

        if ($content !== '') {
            $this->dom_array[] = $content;
        }
    }
}