Symptoms
You want to process an HTML document (for example, to extract text or data), but the HTML may not be standards-compliant and the document contains unnecessary formatting.
Cause
n/a
Resolution
using System.Linq;
using HtmlAgilityPack;

/// <summary>
/// Parses the supplied HTML with HtmlAgilityPack, removes presentation-only
/// markup, and returns the resulting document as a string.
/// </summary>
/// <remarks>
/// The following are removed:
/// <list type="bullet">
/// <item><description><c>style</c>, <c>script</c> and <c>img</c> elements (including their content).</description></item>
/// <item><description><c>font</c> elements, each replaced by a text node holding its inner text.</description></item>
/// <item><description><c>color</c>, <c>bgcolor</c> and <c>border</c> attributes on any node.</description></item>
/// </list>
/// Linked stylesheets (<c>link</c> elements) and inline <c>style</c> attributes are left untouched.
/// </remarks>
/// <param name="html">The HTML markup to clean; it is passed directly to <c>HtmlDocument.LoadHtml</c>.</param>
/// <returns>The <c>OuterHtml</c> of the cleaned document node.</returns>
public string CleanHtml(string html)
{
    // Configure the document BEFORE loading: the repair options influence how
    // the markup is parsed, so setting them after LoadHtml would not affect the
    // tree that has already been built.
    var doc = new HtmlDocument
    {
        OptionFixNestedTags = true,
        OptionAutoCloseOnEnd = true,
        OptionCheckSyntax = true,
        OptionWriteEmptyNodes = true,
        OptionOutputOriginalCase = true,
    };
    doc.LoadHtml(html);

    // Remove style, script and image elements entirely.
    foreach (var elementName in new[] { "style", "script", "img" })
    {
        // ToList() snapshots the matches so the tree can be modified while iterating.
        var matches = doc.DocumentNode.DescendantsAndSelf()
            .Where(n => n.Name == elementName)
            .ToList();

        foreach (var match in matches)
        {
            match.Remove();
        }
    }

    // Replace each font element with its plain inner text.
    var fontNodes = doc.DocumentNode.DescendantsAndSelf()
        .Where(n => n.Name == "font")
        .ToList();

    foreach (var fontNode in fontNodes)
    {
        // CreateTextNode builds the text node directly instead of re-parsing the
        // text as an HTML fragment.
        var replacement = doc.CreateTextNode(fontNode.InnerText);
        fontNode.ParentNode.ReplaceChild(replacement, fontNode);
    }

    // Strip presentational attributes wherever they occur.
    foreach (var attributeName in new[] { "color", "bgcolor", "border" })
    {
        var carriers = doc.DocumentNode.DescendantsAndSelf()
            .Where(n => n.Attributes[attributeName] != null)
            .ToList();

        foreach (var carrier in carriers)
        {
            carrier.Attributes.Remove(attributeName);
        }
    }

    // Return the cleaned HTML as a string.
    return doc.DocumentNode.OuterHtml;
}
Was this article helpful?
That’s Great!
Thank you for your feedback
Sorry! We couldn't be helpful
Thank you for your feedback
Feedback sent
We appreciate your effort and will try to fix the article