Strip HTML Tags from Text

By | March 24, 2010

Have you ever wondered how to display text on a web form with its HTML tags stripped, while still accepting HTML tags when the form is saved? Sounds confusing?

To make it clear, here is a good example.  Let's say you have a form with a Rich Text Box (FTB or FCK) into which users can cut and paste content containing HTML tags, so that you can display it properly, for instance as a blog article. In some places, however, you want the HTML tags stripped out, such as when displaying a summary in a grid.  My solution is to strip the HTML tags just before displaying the text wherever that is needed.

So when you copy this

Hello, World!

it won't show on your grid as this

 Hello World
 <font size ="4" color="blue">
 Hello, World!

but as this

"Hello World!"

Now, with a mix of Replace and Regular Expressions, I created a class to handle that, and here it is:

/// <summary>
/// Converts an HTML fragment to plain text by flattening the source whitespace,
/// applying every cleaning rule returned by <c>GetTableDefinition</c> in ascending
/// <c>iID</c> order, and finally capping runs of generated line breaks and tabs.
/// </summary>
/// <param name="sInputString">The HTML text to clean. May be null or empty.</param>
/// <returns>
/// The text with tags removed. Line breaks are represented by carriage returns and
/// table cells by tabs. The input is returned unchanged when it is null or empty,
/// or when a cleaning rule is rejected by the regular-expression engine.
/// </returns>
public string StripHTML(string sInputString)
{
    // Nothing to clean: hand null/empty input straight back instead of failing on it.
    if (string.IsNullOrEmpty(sInputString))
    {
        return sInputString;
    }

    try
    {
        // Initial cleaning step: line breaks in HTML source carry no layout meaning,
        // so turn them into spaces, and drop tabs entirely. The "\r" and "\t"
        // characters that appear later are produced on purpose by the rules below.
        string sOutputString = sInputString
            .Replace("\r", " ")
            .Replace("\n", " ")
            .Replace("\t", string.Empty);

        // Tag removal. Rule order matters (e.g. tags must be normalised before they
        // are removed), so request the rows explicitly sorted by iID; setting
        // DefaultView.Sort does not reorder DataTable.Rows.
        DataTable myDataTable = GetTableDefinition();
        foreach (DataRow drCleaningItem in myDataTable.Select(string.Empty, "iID ASC"))
        {
            string sOriginalString = drCleaningItem["sOriginalString"].ToString();
            string sReplacementString = drCleaningItem["sReplacementString"].ToString();
            sOutputString = Regex.Replace(sOutputString, sOriginalString, sReplacementString, RegexOptions.IgnoreCase);
        }

        // Cap consecutive line breaks at two and consecutive tabs at four. A single
        // pass per character replaces the former loop that grew the search string on
        // every iteration, which was quadratic and left some runs uncollapsed.
        sOutputString = Regex.Replace(sOutputString, "\r{3,}", "\r\r");
        sOutputString = Regex.Replace(sOutputString, "\t{5,}", "\t\t\t\t");

        return sOutputString;
    }
    catch (System.ArgumentException)
    {
        // Best effort: if a rule cannot be parsed as a regular expression,
        // show the original text rather than failing the page.
        return sInputString;
    }
}

 /// <summary>
 /// Builds the ordered list of cleaning rules used to strip HTML from text.
 /// </summary>
 /// <returns>
 /// A table with three columns: <c>iID</c> (the order in which the rule should be
 /// applied), <c>sOriginalString</c> (a regular-expression pattern) and
 /// <c>sReplacementString</c> (the text that replaces each match).
 /// </returns>
 private DataTable GetTableDefinition()
 {
     DataTable dtCleaningCollection = new DataTable();
     dtCleaningCollection.Columns.Add("iID", typeof(int));
     dtCleaningCollection.Columns.Add("sOriginalString", typeof(string));
     dtCleaningCollection.Columns.Add("sReplacementString", typeof(string));

     // Replace repeating spaces with a single space
     dtCleaningCollection.Rows.Add(1, @"( )+", " ");

     // Prepare and clean Header Tag: normalise the opening and closing tags first,
     // then remove the element with its content. The match is lazy (.*?) so that
     // only one element is removed at a time and the text between two separate
     // elements survives.
     dtCleaningCollection.Rows.Add(2, @"<( )*head([^>])*>", "<head>");
     dtCleaningCollection.Rows.Add(3, @"(<( )*(/)( )*head( )*>)", "</head>");
     dtCleaningCollection.Rows.Add(4, @"(<head>).*?(</head>)", string.Empty);

     // Prepare and clean Script Tag
     dtCleaningCollection.Rows.Add(5, @"<( )*script([^>])*>", "<script>");
     dtCleaningCollection.Rows.Add(6, @"(<( )*(/)( )*script( )*>)", "</script>");
     dtCleaningCollection.Rows.Add(7, @"(<script>).*?(</script>)", string.Empty);

     // Prepare and clean Style Tag
     dtCleaningCollection.Rows.Add(8, @"<( )*style([^>])*>", "<style>");
     dtCleaningCollection.Rows.Add(9, @"(<( )*(/)( )*style( )*>)", "</style>");
     dtCleaningCollection.Rows.Add(10, @"(<style>).*?(</style>)", string.Empty);

     // Replace <td> with tabs
     dtCleaningCollection.Rows.Add(11, @"<( )*td([^>])*>", "\t");

     // Replace <BR> and <LI> with line breaks (also accepts the self-closing <br/> form)
     dtCleaningCollection.Rows.Add(12, @"<( )*br( )*(/)?( )*>", "\r");
     dtCleaningCollection.Rows.Add(13, @"<( )*li( )*>", "\r");

     // Replace <P>, <DIV> and <TR> with double line breaks
     dtCleaningCollection.Rows.Add(14, @"<( )*div([^>])*>", "\r\r");
     dtCleaningCollection.Rows.Add(15, @"<( )*tr([^>])*>", "\r\r");
     dtCleaningCollection.Rows.Add(16, @"<( )*p([^>])*>", "\r\r");

     // Remove remaining tags enclosed in < >
     dtCleaningCollection.Rows.Add(17, @"<[^>]*>", string.Empty);

     // Replace special characters:
     dtCleaningCollection.Rows.Add(18, @"&nbsp;", " ");
     dtCleaningCollection.Rows.Add(19, @"&bull;", " * ");
     dtCleaningCollection.Rows.Add(20, @"&lsaquo;", "<");
     dtCleaningCollection.Rows.Add(21, @"&rsaquo;", ">");
     dtCleaningCollection.Rows.Add(22, @"&trade;", "(tm)");
     dtCleaningCollection.Rows.Add(23, @"&frasl;", "/");
     dtCleaningCollection.Rows.Add(24, @"&lt;", "<");
     dtCleaningCollection.Rows.Add(25, @"&gt;", ">");
     dtCleaningCollection.Rows.Add(26, @"&copy;", "(c)");
     dtCleaningCollection.Rows.Add(27, @"&reg;", "(r)");
     dtCleaningCollection.Rows.Add(28, @"&frac14;", "1/4");
     dtCleaningCollection.Rows.Add(29, @"&frac12;", "1/2");
     dtCleaningCollection.Rows.Add(30, @"&frac34;", "3/4");
     dtCleaningCollection.Rows.Add(31, @"&lsquo;", "'");
     dtCleaningCollection.Rows.Add(32, @"&rsquo;", "'");
     dtCleaningCollection.Rows.Add(33, @"&ldquo;", "\"");
     dtCleaningCollection.Rows.Add(34, @"&rdquo;", "\"");

     // Remove all other remaining special characters that you
     // don't want to replace with another string. Only entity-shaped
     // tokens (named or numeric) are matched, so ordinary text such as
     // "a & b; c" is left alone.
     dtCleaningCollection.Rows.Add(35, @"&#?\w{2,8};", string.Empty);

     // Remove extra line breaks and tabs
     dtCleaningCollection.Rows.Add(36, "(\r)( )+(\r)", "\r\r");
     dtCleaningCollection.Rows.Add(37, "(\t)( )+(\t)", "\t\t");
     dtCleaningCollection.Rows.Add(38, "(\t)( )+(\r)", "\t\r");
     dtCleaningCollection.Rows.Add(39, "(\r)( )+(\t)", "\r\t");
     dtCleaningCollection.Rows.Add(40, "(\r)(\t)+(\r)", "\r\r");
     dtCleaningCollection.Rows.Add(41, "(\r)(\t)+", "\r\t");

     return dtCleaningCollection;
 }

Leave a Reply