Symptoms
You want to process an HTML document (for example, to extract text or data), but the HTML may not be standards-compliant and the document contains unnecessary formatting.
Cause
n/a
Resolution
using System.Linq;
using HtmlAgilityPack;
/// <summary>
/// Parses possibly malformed HTML with HtmlAgilityPack and returns the markup with
/// script, style and image elements removed, font elements flattened to their text,
/// and the color, bgcolor and border attributes stripped.
/// </summary>
/// <param name="html">The raw HTML markup to clean.</param>
/// <returns>The serialized markup of the cleaned document.</returns>
public string CleanHtml(string html)
{
    // Configure the document BEFORE loading: the repair options are consulted while
    // the markup is parsed, so assigning them after LoadHtml comes too late to
    // influence the parse. The two output options are set here as well for clarity.
    var doc = new HtmlDocument
    {
        OptionFixNestedTags = true,
        OptionAutoCloseOnEnd = true,
        OptionCheckSyntax = true,
        OptionWriteEmptyNodes = true,
        OptionOutputOriginalCase = true
    };
    doc.LoadHtml(html);

    // Drop elements that carry no extractable text: style blocks, scripts and images.
    // The matches are snapshotted with ToList() because removing nodes while
    // enumerating the live tree would invalidate the enumeration.
    string[] removedElements = { "style", "script", "img" };
    var nodesToRemove = doc.DocumentNode
        .Descendants()
        .Where(n => removedElements.Contains(n.Name))
        .ToList();
    foreach (var node in nodesToRemove)
    {
        node.Remove();
    }

    // Replace every font element with a plain text node holding its inner text.
    // CreateTextNode builds the node directly instead of re-parsing the text as HTML.
    var fontNodes = doc.DocumentNode
        .Descendants()
        .Where(n => n.Name == "font")
        .ToList();
    foreach (var fontNode in fontNodes)
    {
        fontNode.ParentNode.ReplaceChild(doc.CreateTextNode(fontNode.InnerText), fontNode);
    }

    // Strip the presentational attributes from every remaining node.
    string[] removedAttributes = { "color", "bgcolor", "border" };
    var nodesWithAttributes = doc.DocumentNode
        .Descendants()
        .Where(n => n.HasAttributes)
        .ToList();
    foreach (var node in nodesWithAttributes)
    {
        foreach (var attributeName in removedAttributes)
        {
            node.Attributes.Remove(attributeName);
        }
    }

    // Return the cleaned HTML as a string
    return doc.DocumentNode.OuterHtml;
}
Was this article helpful?
That’s Great!
Thank you for your feedback
Sorry! We couldn't be helpful
Thank you for your feedback
Feedback sent
We appreciate your effort and will try to fix the article