Strip HTML Tags from Text

By | March 24, 2010

Have you ever wondered how to display text on a web form with its HTML tags stripped, while still accepting HTML tags when the form is saved? Sounds confusing?

To make it clear, I will give a good example.  Let's say you have a form with a Rich Text Box (FTB or FCK) into which users can cut and paste content containing HTML tags, so that it can be displayed properly, like a Blog Article. However, there are some instances where you want that content stripped of its HTML tags, such as when displaying a summary on a Grid.  My solution is to strip the HTML tags just before displaying the text, whenever that is needed.

So when you copy this

Hello, World!

it won't show on your grid as this

<html>
 <head>
 <title>
 Hello World
 </title>
 </head>
 <body>
 <font size ="4" color="blue">
 Hello, World!
 </font>
 </body>
</html>

but as this

"Hello, World!"

Now, with a mix of Replace and Regular Expressions, I created a method to handle that, and here it is:

/// <summary>
/// Converts HTML markup into plain text suitable for compact display
/// (for example, a summary column in a grid).
/// </summary>
/// <remarks>
/// Source line breaks become spaces and source tabs are dropped; the rules from
/// <c>GetTableDefinition</c> are then applied in <c>iID</c> order. Structural tags are
/// turned into carriage returns (<c>\r</c>) or tabs (<c>\t</c>), all other tags are
/// removed, and common character entities are decoded. Finally, runs of line breaks are
/// capped at two and runs of tabs at four.
/// </remarks>
/// <param name="sInputString">The HTML text to clean. May be null or empty.</param>
/// <returns>
/// The plain-text rendering of <paramref name="sInputString"/>. The input is returned
/// unchanged when it is null or empty, or when cleaning fails for any reason.
/// </returns>
public string StripHTML(string sInputString)
{
    // Nothing to strip; this also avoids relying on an exception for null input.
    if (string.IsNullOrEmpty(sInputString))
    {
        return sInputString;
    }

    try
    {
        string sOutputString = sInputString;

        // Initial cleaning step: line breaks in the HTML source are not meaningful,
        // so turn them into spaces and drop tabs entirely. Meaningful breaks and tabs
        // are re-introduced below from <br>, <p>, <td>, etc.
        sOutputString = sOutputString.Replace("\r", " ");
        sOutputString = sOutputString.Replace("\n", " ");
        sOutputString = sOutputString.Replace("\t", string.Empty);

        // Tag removal: apply every rule in ascending iID order. Rule order matters
        // (e.g. <script> bodies must be removed before generic tag stripping), so
        // iterate the sorted view rather than the unsorted Rows collection.
        DataTable myDataTable = GetTableDefinition();
        myDataTable.DefaultView.Sort = "iID ASC";
        foreach (DataRowView drCleaningItem in myDataTable.DefaultView)
        {
            string sOriginalString = drCleaningItem["sOriginalString"].ToString();
            string sReplacementString = drCleaningItem["sReplacementString"].ToString();
            sOutputString = Regex.Replace(sOutputString, sOriginalString, sReplacementString, RegexOptions.IgnoreCase);
        }

        // Cap consecutive line breaks at two and consecutive tabs at four.
        sOutputString = Regex.Replace(sOutputString, "\r{3,}", "\r\r");
        sOutputString = Regex.Replace(sOutputString, "\t{5,}", "\t\t\t\t");

        return sOutputString;
    }
    catch
    {
        // Deliberate best effort: callers expect a string back rather than an exception.
        // NOTE(review): this returns the raw, unstripped HTML on failure; make sure the
        // caller still HTML-encodes the value before rendering it.
        return sInputString;
    }
}

/// <summary>
/// Builds the ordered list of regular-expression replacement rules used by
/// <c>StripHTML</c>.
/// </summary>
/// <returns>
/// A table with columns <c>iID</c> (application order), <c>sOriginalString</c>
/// (regular-expression pattern) and <c>sReplacementString</c> (replacement text).
/// </returns>
private DataTable GetTableDefinition()
{
    DataTable dtCleaningCollection = new DataTable();
    dtCleaningCollection.Columns.Add("iID", typeof(int));
    dtCleaningCollection.Columns.Add("sOriginalString", typeof(string));
    dtCleaningCollection.Columns.Add("sReplacementString", typeof(string));

    // Replace repeating spaces with a single space
    dtCleaningCollection.Rows.Add(1, @"( )+", " ");

    // Normalize the head tags, then remove the head element with its content.
    // The removal is lazy (.*?) so it stops at the first closing tag.
    dtCleaningCollection.Rows.Add(2, @"<( )*head([^>])*>", "<head>");
    dtCleaningCollection.Rows.Add(3, @"(<( )*(/)( )*head( )*>)", "</head>");
    dtCleaningCollection.Rows.Add(4, @"(<head>).*?(</head>)", string.Empty);

    // Normalize and remove script elements. Lazy matching keeps the visible text
    // that sits between two separate script blocks.
    dtCleaningCollection.Rows.Add(5, @"<( )*script([^>])*>", "<script>");
    dtCleaningCollection.Rows.Add(6, @"(<( )*(/)( )*script( )*>)", "</script>");
    dtCleaningCollection.Rows.Add(7, @"(<script>).*?(</script>)", string.Empty);

    // Normalize and remove style elements (same lazy matching as above)
    dtCleaningCollection.Rows.Add(8, @"<( )*style([^>])*>", "<style>");
    dtCleaningCollection.Rows.Add(9, @"(<( )*(/)( )*style( )*>)", "</style>");
    dtCleaningCollection.Rows.Add(10, @"(<style>).*?(</style>)", string.Empty);

    // Replace <td> with a tab
    dtCleaningCollection.Rows.Add(11, @"<( )*td([^>])*>", "\t");

    // Replace <br> and <li> with a line break
    dtCleaningCollection.Rows.Add(12, @"<( )*br( )*>", "\r");
    dtCleaningCollection.Rows.Add(13, @"<( )*li( )*>", "\r");

    // Replace <div>, <tr> and <p> with a double line break
    dtCleaningCollection.Rows.Add(14, @"<( )*div([^>])*>", "\r\r");
    dtCleaningCollection.Rows.Add(15, @"<( )*tr([^>])*>", "\r\r");
    dtCleaningCollection.Rows.Add(16, @"<( )*p([^>])*>", "\r\r");

    // Remove all remaining tags enclosed in < >
    dtCleaningCollection.Rows.Add(17, @"<[^>]*>", string.Empty);

    // Replace special characters
    dtCleaningCollection.Rows.Add(18, @"&nbsp;", " ");
    dtCleaningCollection.Rows.Add(19, @"&bull;", " * ");
    dtCleaningCollection.Rows.Add(20, @"&lsaquo;", "<");
    dtCleaningCollection.Rows.Add(21, @"&rsaquo;", ">");
    dtCleaningCollection.Rows.Add(22, @"&trade;", "(tm)");
    dtCleaningCollection.Rows.Add(23, @"&frasl;", "/");
    dtCleaningCollection.Rows.Add(24, @"&lt;", "<");
    dtCleaningCollection.Rows.Add(25, @"&gt;", ">");
    dtCleaningCollection.Rows.Add(26, @"&copy;", "(c)");
    dtCleaningCollection.Rows.Add(27, @"&reg;", "(r)");
    dtCleaningCollection.Rows.Add(28, @"&frac14;", "1/4");
    dtCleaningCollection.Rows.Add(29, @"&frac12;", "1/2");
    dtCleaningCollection.Rows.Add(30, @"&frac34;", "3/4");
    dtCleaningCollection.Rows.Add(31, @"&lsquo;", "'");
    dtCleaningCollection.Rows.Add(32, @"&rsquo;", "'");
    dtCleaningCollection.Rows.Add(33, @"&ldquo;", "\"");
    dtCleaningCollection.Rows.Add(34, @"&rdquo;", "\"");

    // Remove all other remaining entities that have no plain-text replacement.
    // Only entity-shaped tokens are matched, so ordinary text such as
    // "Tom & Jerry; ..." is left alone. &amp; is skipped here and decoded next.
    dtCleaningCollection.Rows.Add(35, @"&(?!amp;)[#a-z0-9]{2,6};", string.Empty);

    // Decode &amp; last, so a decoded ampersand cannot form a new entity that an
    // earlier rule would then decode or remove.
    dtCleaningCollection.Rows.Add(36, @"&amp;", "&");

    // Remove extra spaces and tabs around line breaks and tabs
    dtCleaningCollection.Rows.Add(37, "(\r)( )+(\r)", "\r\r");
    dtCleaningCollection.Rows.Add(38, "(\t)( )+(\t)", "\t\t");
    dtCleaningCollection.Rows.Add(39, "(\t)( )+(\r)", "\t\r");
    dtCleaningCollection.Rows.Add(40, "(\r)( )+(\t)", "\r\t");
    dtCleaningCollection.Rows.Add(41, "(\r)(\t)+(\r)", "\r\r");
    dtCleaningCollection.Rows.Add(42, "(\r)(\t)+", "\r\t");

    return dtCleaningCollection;
}
Recommended

Leave a Reply

This site uses Akismet to reduce spam. Learn how your comment data is processed.