Code to repair and clean HTML using the HtmlAgilityPack

Created by Jeremy Burgess, Modified on Thu, 23 Feb, 2023 at 3:22 PM by Joel Marsh-Trim

Symptoms

You wish to process an HTML document (for example, to extract text or data), but the HTML may not be standards-compliant and the document contains unnecessary formatting.

Cause

n/a

Resolution

using System;
using System.Linq;
using HtmlAgilityPack;

/// <summary>
/// Parses possibly malformed HTML with HtmlAgilityPack, strips presentational
/// content, and returns the resulting markup.
/// </summary>
/// <remarks>
/// The following are removed: <c>style</c>, <c>script</c> and <c>img</c> elements;
/// <c>font</c> elements (each is replaced by a text node holding its inner text, so
/// any markup nested inside a <c>font</c> element is flattened to text); and the
/// <c>color</c>, <c>bgcolor</c> and <c>border</c> attributes on any element.
/// External stylesheets referenced through <c>link</c> elements and inline
/// <c>style</c> attributes are left untouched.
/// </remarks>
/// <param name="html">The HTML source to clean. Must not be null.</param>
/// <returns>The cleaned document serialized back to an HTML string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public string CleanHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException(nameof(html));
    }

    // The repair options are consulted while LoadHtml parses the input, so they
    // have to be configured before the document is loaded; setting them
    // afterwards leaves the already-built tree unrepaired.
    var doc = new HtmlDocument
    {
        OptionFixNestedTags = true,
        OptionAutoCloseOnEnd = true,
        OptionCheckSyntax = true,
        // These two only influence how the tree is written back out.
        OptionWriteEmptyNodes = true,
        OptionOutputOriginalCase = true
    };
    doc.LoadHtml(html);

    // Remove style, script and image elements in a single pass.
    // HtmlNode.Name is compared against lower-case tag names.
    var unwantedElements = new[] { "style", "script", "img" };
    var nodesToRemove = doc.DocumentNode.Descendants()
        .Where(n => unwantedElements.Contains(n.Name))
        .ToList(); // snapshot: the tree is modified inside the loop
    foreach (var unwanted in nodesToRemove)
    {
        unwanted.Remove();
    }

    // Replace every font element with its plain text content.
    // CreateTextNode stores the string verbatim; HtmlNode.CreateNode would
    // re-parse it as HTML and can fail when the text yields zero or several nodes.
    var fontNodes = doc.DocumentNode.Descendants()
        .Where(n => n.Name == "font")
        .ToList();
    foreach (var fontNode in fontNodes)
    {
        fontNode.ParentNode.ReplaceChild(doc.CreateTextNode(fontNode.InnerText), fontNode);
    }

    // Strip presentational attributes. Only attributes change here, not the
    // tree structure, so the descendants can be enumerated lazily.
    var presentationalAttributes = new[] { "color", "bgcolor", "border" };
    foreach (var node in doc.DocumentNode.Descendants().Where(n => n.HasAttributes))
    {
        foreach (var attributeName in presentationalAttributes)
        {
            node.Attributes.Remove(attributeName);
        }
    }

    // Return the cleaned HTML as a string
    return doc.DocumentNode.OuterHtml;
}


Was this article helpful?

That’s Great!

Thank you for your feedback

Sorry! We couldn't be helpful

Thank you for your feedback

Let us know how we can improve this article!

Select at least one of the reasons
CAPTCHA verification is required.

Feedback sent

We appreciate your effort and will try to fix the article