Use html.Parse rather than html.ParseFragment (#16223) (#16225)

* Use html.Parse rather than html.ParseFragment
  There have been a few issues with html.ParseFragment - just use html.Parse instead.

* Skip document node

Signed-off-by: Andrew Thornton <art27@cantab.net>

Co-authored-by: zeripath <art27@cantab.net>
This commit is contained in:
6543 2021-06-22 03:46:39 +02:00 committed by GitHub
parent e898590c81
commit 8ac48584ec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 14 additions and 17 deletions

View File

@@ -334,40 +334,37 @@ func (ctx *postProcessCtx) postProcess(rawHTML []byte) ([]byte, error) {
_, _ = res.WriteString("</body></html>") _, _ = res.WriteString("</body></html>")
// parse the HTML // parse the HTML
nodes, err := html.ParseFragment(res, nil) node, err := html.Parse(res)
if err != nil { if err != nil {
return nil, &postProcessError{"invalid HTML", err} return nil, &postProcessError{"invalid HTML", err}
} }
for _, node := range nodes { if node.Type == html.DocumentNode {
ctx.visitNode(node, true) node = node.FirstChild
} }
newNodes := make([]*html.Node, 0, len(nodes)) ctx.visitNode(node, true)
for _, node := range nodes { nodes := make([]*html.Node, 0, 5)
if node.Data == "html" {
node = node.FirstChild if node.Data == "html" {
for node != nil && node.Data != "body" { node = node.FirstChild
node = node.NextSibling for node != nil && node.Data != "body" {
} node = node.NextSibling
}
if node == nil {
continue
} }
}
if node != nil {
if node.Data == "body" { if node.Data == "body" {
child := node.FirstChild child := node.FirstChild
for child != nil { for child != nil {
newNodes = append(newNodes, child) nodes = append(nodes, child)
child = child.NextSibling child = child.NextSibling
} }
} else { } else {
newNodes = append(newNodes, node) nodes = append(nodes, node)
} }
} }
nodes = newNodes
// Create buffer in which the data will be placed again. We know that the // Create buffer in which the data will be placed again. We know that the
// length will be at least that of res; to spare a few alloc+copy, we // length will be at least that of res; to spare a few alloc+copy, we
// reuse res, resetting its length to 0. // reuse res, resetting its length to 0.