improved Htmlparser

2025-08-04 13:33:14 +02:00 · 2025-06-18 16:08:25 +02:00 · 2025-06-18 16:08:25 +02:00 · 2c712361a6
commit 2c712361a6
parent 796a988e9a
1 changed files with 272 additions and 326 deletions
--- a/include/tool/Editing/HTMLParse.php
+++ b/include/tool/Editing/HTMLParse.php
@ -1,330 +1,276 @@
 <?php
-
+declare(strict_types=1);
 namespace gp\tool\Editing;
-
 defined('is_running') or die('Not an entry point...');

-
-class HTMLParse{
-
-	public $doc = '';
-	public $dom_array = array();
-	public $errors = array();
-
-	public $random;
-	public $mark_double_slash;
-	public $mark_escaped_single;
-	public $mark_escaped_double;
-
-	public function __construct($text){
-		$this->doc = $text;
-		$this->Init_Parse();
-		$this->Parse();
-	}
-
-
-	public function Parse(){
-
-		$offset = 0;
-
-		do{
-			$continue = true;
-			$pos = strpos($this->doc,'<',$offset);
-
-
-			//no more tags
-			if( $pos === false ){
-				$continue = false;
-				break;
-			}
-
-			//comment
-			if( substr($this->doc,$pos,4) === '<!--' ){
-				$this->dom_array[] = substr($this->doc,$offset,$pos-$offset); //get content before comment
-				$offset = $pos + 4;
-				$this->CommentContent($offset);
-				continue;
-			}
-
-			//get tag name
-			$tag_name = $this->TagName($pos+1,$name_len);
-			if( !$tag_name ){
-				$this->doc = substr_replace($this->doc,'&lt;',$pos,1);
-				continue;
-			}
-
-			//content
-			if( $pos > $offset ){
-				$this->dom_array[] = substr($this->doc,$offset,$pos-$offset);
-			}
-
-
-			//advance offset to just after <tag_name
-			$offset = $pos+1+$name_len;
-			$new_element = array();
-			$new_element['tag'] = $tag_name;
-
-			//attributes
-			if( $tag_name[0] != '/' ){
-				$new_element['attributes'] = $this->GetAttributes($offset);
-			}
-
-			//tag closing and self-closing?
-			$pos = strpos($this->doc,'>',$offset);
-			if( $pos !== false ){
-				if( $pos > 0 && $this->doc[$pos-1] == '/' ){
-					$new_element['self_closing'] = true;
-				}
-				$offset = $pos+1;
-			}
-
-			$this->dom_array[] = $new_element;
-
-
-			//content
-			switch(strtolower($tag_name)){
-				case 'script':
-				case 'style';
-					$this->NonHtmlContent($offset,$tag_name);
-				break;
-			}
-
-		}while($continue);
-
-		//content after
-		if( $offset < strlen($this->doc) ){
-			$this->dom_array[] = substr($this->doc,$offset);
-		}
-
-	}
-
-
-	/**
-	 * Parse an html tag name
-	 *
-	 */
-	public function TagName($pos,&$name_len){
-		$tag_name = false;
-		$name_len = strspn($this->doc,'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890/!:--',$pos);
-		if( $name_len > 0 ){
-			$tag_name = substr($this->doc,$pos,$name_len);
-		}
-		return $tag_name;
-	}
-
-
-	/**
-	 * Parse html attributes
-	 * match name="value", name=value or name
-	 * accounts for name="value ' value"
-	 *
-	 */
-	public function GetAttributes(&$offset){
-
-		$this->doc		= substr($this->doc,$offset);
-		$pattern_name	= '#^\s+([^\'"<>=\s/]+)\s*(=?)\s*#';
-		$attributes		= array();
-
-
-		while( preg_match($pattern_name, $this->doc, $matches) ){
-
-			$attr_name	= $matches[1];
-			$attr_value = null;
-			$offset		= strlen($matches[0]);
-
-			//get attribute value
-			if( !empty($matches[2]) ){
-
-				$attr_match			= $this->MatchAttribute($offset);
-				if( $attr_match ){
-					$offset += strlen($attr_match[0]);
-					$attr_value = $attr_match[1];
-				}
-			}
-
-
-			if( !isset($attributes[$attr_name]) ){
-				$attributes[$attr_name] = $attr_value;
-			}
-
-			$this->doc = substr($this->doc,$offset);
-			$offset = 0;
-		}
-
-
-		$offset = 0;
-
-		return $attributes;
-	}
-
-	/**
-	 * Get an html attribute value
-	 *
-	 */
-	protected function MatchAttribute(&$offset){
-
-		$char = $this->doc[$offset];
-
-		//double quote
-		if( $char === '"' ){
-			if( preg_match('#\\G"([^"]*)"#', $this->doc, $matches, 0, $offset) ){
-				return $matches;
-			}
-			return;
-		}
-
-		//single quote
-		if( $char == "'" ){
-			if( preg_match('#\\G\'([^\']*)\'#', $this->doc, $matches, 0, $offset) ){
-				return $matches;
-			}
-			return;
-		}
-
-
-
-		//not quoted
-		if( preg_match('#\\G\s*([^\'"<>=\s/]+)#', $this->doc, $matches, 0, $offset) ){
-			return $matches;
-		}
-
-	}
-
-
-	/**
-	 * Parse HTML comments <!-- comments go here -->
-	 * Does not support full sgml comments <!------> second comment -->
-	 *
-	 */
-	public function CommentContent(&$offset){
-
-		$this->doc = substr($this->doc,$offset);
-		$offset = 0;
-
-		$pos = strpos($this->doc,'-->');
-		if( $pos === false ){
-			$pos = strlen($this->doc);
-		}
-
-		$comment_content = substr($this->doc,0,$pos);
-		$this->doc = substr($this->doc,$pos+3);
-
-		$new_element = array();
-		$new_element['comment'] = $comment_content;
-		$this->dom_array[] = $new_element;
-	}
-
-	public function NonHtmlContent(&$offset,$untill){
-
-		$this->doc = substr($this->doc,$offset);
-		$offset = 0;
-		$this->doc = $this->EscapeQuotes($this->doc);
-		$full_length = strlen($this->doc);
-		$untill_length = strlen($untill);
-
-		do{
-
-			$continue = false;
-			$end_string = false;
-
-			$pos_quote1 = $this->strpos_min("'",$offset,$full_length);
-			$pos_quote2 = $this->strpos_min('"',$offset,$full_length);
-			$pos_scomment = $this->strpos_min('//',$offset,$full_length);
-			$pos_mcomment = $this->strpos_min('/*',$offset,$full_length);
-
-			$min_pos = min($pos_quote1, $pos_quote2, $pos_scomment, $pos_mcomment);
-
-			$pos_close = strpos($this->doc,'</',$offset);
-
-			// found </script>
-			if( ($pos_close !== false)
-				&& ($pos_close <= $min_pos)
-				&& (strtolower(substr($this->doc,$pos_close+2,$untill_length)) == $untill)
-				){
-					$offset = $pos_close;
-					break;
-			}
-
-			// nothing else found
-			if( $min_pos === $full_length ){
-				$offset = $full_length;
-				break;
-			}
-
-
-			if( $min_pos === $pos_quote1 ){
-				$end_string = "'";
-			}elseif( $min_pos === $pos_quote2 ){
-				$end_string = '"';
-			}elseif( $min_pos === $pos_scomment ){
-				$end_string = "\n";
-			}elseif( $min_pos === $pos_mcomment ){
-				$end_string = '*/';
-			}
-
-			$end_pos = strpos($this->doc,$end_string,$min_pos+1);
-			if( $end_pos === false ){
-				$offset = $full_length;
-			}else{
-				$offset = $full_length;
-				$offset = $end_pos + strlen($end_string);
-				$continue = true;
-			}
-
-
-		}while($continue);
-
-		$code = substr($this->doc,0,$offset);
-		$this->doc = substr($this->doc,$offset);
-		$this->doc = $this->UnescapeQuotes($this->doc);
-		$this->dom_array[] = $this->UnescapeQuotes($code);
-		$offset = 0;
-	}
-
-
-	public function strpos_min($needle,$offset,$length){
-		$pos = strpos($this->doc,$needle,$offset);
-		if( $pos === false ){
-			return $length;
-		}
-		return $pos;
-	}
-
-
-	public function EscapeQuotes($string){
-
-		$search = array('\\\\','\\\'','\\"');
-		$replace = array( $this->mark_double_slash, $this->mark_escaped_single, $this->mark_escaped_double);
-
-		return str_replace($search, $replace, $string);
-	}
-
-	public function UnescapeQuotes($string){
-		$search = array( $this->mark_double_slash, $this->mark_escaped_single, $this->mark_escaped_double);
-		$replace = array('\\\\','\\\'','\\"');
-		return str_replace($search, $replace, $string);
-	}
-
-	/*
-	 * Init
-	 *
-	 */
-	public function Init_Parse(){
-		$this->GetRandom();
-		$this->mark_double_slash = $this->GetMarker();
-		$this->mark_escaped_single = $this->GetMarker();
-		$this->mark_escaped_double = $this->GetMarker();
-	}
-
-
-	public function GetRandom(){
-		do{
-			$this->random = dechex(mt_rand(0, 0x7fffff));
-		}while(strpos($this->doc,$this->random) !== false);
-	}
-
-	public function GetMarker(){
-		static $n = 0;
-		return $this->random . sprintf('%08X', $n++);
-	}
-}
+/**
+ * A custom, non-validating HTML parser that converts an HTML string into an array structure.
+ * It's designed to be fast and handle real-world, often imperfect, HTML.
+ */
+class HTMLParse
+{
+    public string $doc = '';
+    public array $dom_array = [];
+    public array $errors = [];
+
+    private int $doc_length;
+    private int $position = 0;
+
+    private string $mark_double_slash;
+    private string $mark_escaped_single;
+    private string $mark_escaped_double;
+
+    public function __construct(string $text)
+    {
+        $this->doc = $text;
+        $this->doc_length = strlen($text);
+        $this->Init_Parse();
+        $this->Parse();
+    }
+
+    public function Init_Parse(): void
+    {
+        $this->generateMarkers();
+    }
+
+    private function generateMarkers(): void
+    {
+        $this->mark_double_slash = $this->uniqueMarker();
+        $this->mark_escaped_single = $this->uniqueMarker();
+        $this->mark_escaped_double = $this->uniqueMarker();
+    }
+
+    private function uniqueMarker(): string
+    {
+        static $counter = 0;
+        return "\x01".hash('xxh3', microtime().$counter++)."\x02";
+    }
+
+    private function addError(string $message): void
+    {
+        $this->errors[] = "Error at position {$this->position}: {$message}";
+    }
+
+    public function Parse(): void
+    {
+        while ($this->position < $this->doc_length) {
+            $char = $this->doc[$this->position];
+
+            if ($char !== '<') {
+                $this->parseTextContent();
+                continue;
+            }
+
+            if ($this->handleCommentIfAny()) {
+                continue;
+            }
+
+            $tag_info = $this->parseTag();
+            if ($tag_info === null) {
+                // If parseTag fails, treat the '<' as literal text
+                $this->dom_array[] = '&lt;';
+                $this->position++;
+                continue;
+            }
+
+            // If it's an opening tag for a special content element...
+            if ($tag_info['name'][0] !== '/' && !$tag_info['self_closing']) {
+                 $this->handleSpecialContent($tag_info['name']);
+            }
+        }
+    }
+
+    private function parseTextContent(): void
+    {
+        $next_tag_pos = strpos($this->doc, '<', $this->position);
+        if ($next_tag_pos === false) {
+            $text = substr($this->doc, $this->position);
+            $this->position = $this->doc_length;
+        } else {
+            $text = substr($this->doc, $this->position, $next_tag_pos - $this->position);
+            $this->position = $next_tag_pos;
+        }
+
+        if ($text !== '') {
+            $this->dom_array[] = $text;
+        }
+    }
+
+    private function handleCommentIfAny(): bool
+    {
+        if (substr_compare($this->doc, '<!--', $this->position, 4) === 0) {
+            $end_pos = strpos($this->doc, '-->', $this->position + 4);
+            if ($end_pos === false) {
+                $content = substr($this->doc, $this->position + 4);
+                $this->position = $this->doc_length;
+                $this->addError("Unclosed HTML comment.");
+            } else {
+                $content = substr($this->doc, $this->position + 4, $end_pos - ($this->position + 4));
+                $this->position = $end_pos + 3;
+            }
+            $this->dom_array[] = ['comment' => $content];
+            return true;
+        }
+        return false;
+    }
+
+    /** @return ?array{name: string, self_closing: bool} */
+    private function parseTag(): ?array
+    {
+        $original_tag_start_pos = $this->position;
+        $this->position++; // Skip '<'
+
+        if ($this->position >= $this->doc_length) {
+            $this->position = $original_tag_start_pos; // backtrack
+            return null;
+        }
+
+        $is_closing_tag_char = ($this->doc[$this->position] === '/');
+        if ($is_closing_tag_char) {
+            $this->position++; // Skip '/' for tag name parsing
+        }
+
+        $tag_name = $this->parseTagName();
+
+        if ($tag_name === null || $tag_name === '') {
+            $this->position = $original_tag_start_pos; // backtrack
+            return null;
+        }
+
+        $element = ['tag' => $tag_name];
+        $self_closing = false;
+
+        if ($is_closing_tag_char) {
+            $element['tag'] = '/' . $tag_name;
+        } else { // Only parse attributes for opening tags
+            $element['attributes'] = $this->parseAttributes();
+        }
+
+        // Find the end of the tag
+        $gt_pos = strpos($this->doc, '>', $this->position);
+        if ($gt_pos === false) {
+            $this->addError("Unclosed tag '{$element['tag']}'.");
+            $this->position = $this->doc_length; // Consume rest of document
+            $element['self_closing'] = false;
+            $this->dom_array[] = $element;
+            return ['name' => $element['tag'], 'self_closing' => false];
+        }
+
+        // Check for XML-style self-closing tags like <br />
+        if (!$is_closing_tag_char) {
+            $before_gt_segment = substr($this->doc, $this->position, $gt_pos - $this->position);
+            $trimmed_before_gt = rtrim($before_gt_segment);
+
+            // CHANGE 3: Use substr() for a cleaner, more modern check.
+            if (substr($trimmed_before_gt, -1) === '/') {
+                $self_closing = true;
+            }
+        }
+        
+        $element['self_closing'] = $self_closing;
+        $this->dom_array[] = $element;
+        $this->position = $gt_pos + 1;
+
+        return ['name' => $element['tag'], 'self_closing' => $self_closing];
+    }
+
+    private function parseTagName(): ?string
+    {
+        $name_len = strspn(
+            $this->doc,
+            'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_:.-',
+            $this->position
+        );
+
+        if ($name_len === 0) {
+            return null;
+        }
+
+        $name = substr($this->doc, $this->position, $name_len);
+        $this->position += $name_len;        
+       
+        return strtolower($name);
+    }
+
+    /** @return array<string, string|null> */
+    private function parseAttributes(): array
+    {
+        $attributes = [];
+        // This regex finds attributes one by one, from the current position.
+        $pattern = '/
+            \G             # Anchor to the current position in the string
+            \s+            # Require at least one space before an attribute
+            (?!/?>)        # Negative lookahead: ensure we are not at the end of the tag (/> or >)
+            ([^\s=<>\/]+)  # Capture group 1: The attribute name
+            (?:            # Optional group for the value part
+                \s*=\s*    # The equals sign, with optional whitespace
+                (?:
+                    "([^"]*)"  # Capture group 2: Double-quoted value
+                    |          # OR
+                    \'([^\']*)\'  # Capture group 3: Single-quoted value
+                    |          # OR
+                    ([^\s"\'=<>`]+) # Capture group 4: Unquoted value
+                )
+            )?             # The entire value part is optional (for boolean attributes)
+        /ix'; // Case-insensitive and extended mode
+
+        while (preg_match($pattern, $this->doc, $matches, PREG_OFFSET_CAPTURE, $this->position)) {
+            $name = strtolower($matches[1][0]);
+
+            $value = $matches[2][0] ?? $matches[3][0] ?? $matches[4][0] ?? null;
+
+            if (!isset($attributes[$name])) {
+                 $attributes[$name] = $value !== null ?
+                    htmlspecialchars_decode($value, ENT_QUOTES) :
+                    null; // Store null for boolean attributes like 'disabled'
+            }
+
+            $this->position = $matches[0][1] + strlen($matches[0][0]);
+        }
+        return $attributes;
+    }
+
+    private function handleSpecialContent(string $tag_name_from_parser): void
+    {        
+        if (!in_array($tag_name_from_parser, ['script', 'style'])) {
+            return;
+        }
+
+        $content_start_pos = $this->position;
+        // Use the already-lowercased tag name
+        $end_tag_to_find = "</{$tag_name_from_parser}>";
+
+        $remaining_doc_part = substr($this->doc, $content_start_pos);
+        if ($remaining_doc_part === false || $remaining_doc_part === '') {
+            $this->addError("Unclosed special tag '<{$tag_name_from_parser}>'.");
+            return;
+        }
+
+        // IMPORTANT: This logic correctly handles cases like `var x = "</script>";` inside a script tag.
+        // It temporarily escapes certain sequences to prevent a premature match.
+        $escaped_remaining_part = str_replace(
+            ['\\\\', '\\\'', '\\"', '</'],
+            [$this->mark_double_slash, $this->mark_escaped_single, $this->mark_escaped_double, "<\\/"],
+            $remaining_doc_part
+        );
+
+        $end_tag_pos_in_escaped_part = stripos($escaped_remaining_part, $end_tag_to_find);
+
+        $actual_content = '';
+        if ($end_tag_pos_in_escaped_part === false) {
+            $this->addError("Unclosed special tag '<{$tag_name_from_parser}>'.");
+            // Consume the rest of the document as content of this tag
+            $actual_content = $remaining_doc_part;
+            $this->position = $this->doc_length;
+        } else {
+            
+            $actual_content = substr($remaining_doc_part, 0, $end_tag_pos_in_escaped_part);
+            
+            $this->position = $content_start_pos + strlen($actual_content) + strlen($end_tag_to_find);
+        }
+
+        if ($actual_content !== '') {
+            $this->dom_array[] = $actual_content;
+        }
+    }
+}