Remove HTML tags that are placed in sequence, one after another
I have the following string:
Result = "<em>Administration</em> <em>Resources</em> Officer <em>paragraphs</em>";
Can anyone guide me on how to generate the following string from the string above?
Result = "<em>Administration Resources</em> Officer <em>paragraphs</em> "
Basically, I have multiple "emphasized text" (<em>) tags in the string. Where two tagged words are placed one right after the other, I want to remove the tags between them and generate a single "emphasized text" tag that wraps both words.
// Merge neighbouring emphasis runs: wherever a closing </em> is followed by an
// opening <em> with nothing but whitespace in between, drop both tags and keep
// the whitespace (captured as $1) so the merged words stay separated.
// Spaces in front of an <em> that does not follow a </em> are left untouched.
result = System.Text.RegularExpressions.Regex.Replace(result, @"</em>(\s*)<em>", "$1");
Here is a state machine approach using switch statement that will do what you want:
/// <summary>
/// Collapses consecutive <c>&lt;em&gt;</c> elements that are separated only by
/// spaces into a single element, and removes empty <c>&lt;em&gt;&lt;/em&gt;</c> pairs.
/// </summary>
public class TagCleaner
{
    private const string BeginTag = "<em>";
    private const string EndTag = "</em>";

    // Decides whether the text between a "</em>" and the next "<em>" allows a merge.
    // Built once instead of once per loop iteration. The pattern accepts zero or more
    // space characters; note that without RegexOptions.Multiline the "$" anchor also
    // matches just before a single trailing '\n'.
    private static readonly System.Text.RegularExpressions.Regex MergeableGap =
        new System.Text.RegularExpressions.Regex("^ *$");

    // Phases of the scanner used by CleanEM.
    private enum ScanState
    {
        SeekOpen,   // looking for the next "<em>"
        SeekClose,  // inside an element, looking for its "</em>"
        AfterClose, // output ends with "</em>"; the next "<em>" may be merged into it
        Done
    }

    /// <summary>
    /// Returns <paramref name="input"/> with adjacent emphasized runs merged.
    /// A <c>&lt;/em&gt;</c> followed by only spaces and then <c>&lt;em&gt;</c> is replaced by
    /// those spaces; an empty <c>&lt;em&gt;&lt;/em&gt;</c> pair is dropped; all other text,
    /// including text without any tags and an unclosed trailing <c>&lt;em&gt;</c>, is
    /// copied through unchanged.
    /// </summary>
    /// <param name="input">The markup to clean. Must not be null.</param>
    /// <returns>The cleaned markup.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="input"/> is null.</exception>
    public string CleanEM(string input)
    {
        if (input == null)
        {
            throw new System.ArgumentNullException("input");
        }

        var output = new System.Text.StringBuilder(input.Length);
        var pos = 0; // index of the first character of input not yet consumed

        // True when the "<em>" of the element being scanned was literally written to
        // output; false when it was swallowed by a merge into the previous element.
        var openTagEmitted = false;
        var state = ScanState.SeekOpen;

        while (state != ScanState.Done)
        {
            switch (state)
            {
                case ScanState.SeekOpen:
                {
                    var idx = input.IndexOf(BeginTag, pos, System.StringComparison.Ordinal);
                    if (idx < 0)
                    {
                        // No more elements: keep the remaining text as is and stop.
                        output.Append(input, pos, input.Length - pos);
                        state = ScanState.Done;
                        break;
                    }

                    // Copy everything up to and including the opening tag.
                    output.Append(input, pos, idx - pos + BeginTag.Length);
                    pos = idx + BeginTag.Length;
                    openTagEmitted = true;
                    state = ScanState.SeekClose;
                    break;
                }

                case ScanState.SeekClose:
                {
                    var idx = input.IndexOf(EndTag, pos, System.StringComparison.Ordinal);
                    if (idx < 0)
                    {
                        // Unclosed element: keep the remaining text as is and stop.
                        output.Append(input, pos, input.Length - pos);
                        state = ScanState.Done;
                        break;
                    }

                    if (idx == pos && openTagEmitted)
                    {
                        // Empty "<em></em>" pair: take back the opening tag that was just
                        // written (it is the tail of output) and skip the closing tag.
                        output.Length -= BeginTag.Length;
                        pos = idx + EndTag.Length;

                        // Nothing was closed in output, so there is nothing to merge with.
                        state = ScanState.SeekOpen;
                        break;
                    }

                    // Copy the element content together with its closing tag.
                    output.Append(input, pos, idx - pos + EndTag.Length);
                    pos = idx + EndTag.Length;
                    state = pos < input.Length ? ScanState.AfterClose : ScanState.Done;
                    break;
                }

                case ScanState.AfterClose:
                {
                    var idx = input.IndexOf(BeginTag, pos, System.StringComparison.Ordinal);
                    if (idx < 0)
                    {
                        output.Append(input, pos, input.Length - pos);
                        state = ScanState.Done;
                        break;
                    }

                    var gap = input.Substring(pos, idx - pos);
                    if (MergeableGap.IsMatch(gap))
                    {
                        // Re-open the previous element: output is guaranteed to end with
                        // the closing tag here, so trim it, keep the gap, and swallow the
                        // new opening tag.
                        output.Length -= EndTag.Length;
                        output.Append(gap);
                        openTagEmitted = false;
                    }
                    else
                    {
                        // Other text sits between the elements: start a new element.
                        output.Append(input, pos, idx - pos + BeginTag.Length);
                        openTagEmitted = true;
                    }

                    pos = idx + BeginTag.Length;
                    state = ScanState.SeekClose;
                    break;
                }

                default:
                    state = ScanState.Done;
                    break;
            }
        }

        return output.ToString();
    }
}
It will do what you want, and with a few changes you can adapt it to clean whatever tag you want.
Here is a sample MSTest test to prove that it works:
/// <summary>
/// MSTest fixture exercising <see cref="TagCleaner.CleanEM"/>.
/// </summary>
[TestClass]
public class TagCleanerTest
{
    /// <summary>
    /// Two emphasized words separated by a single space should end up inside one
    /// emphasis element, while an element preceded by plain text is left alone.
    /// </summary>
    [TestMethod]
    public void Should_Clean_EM_Tag_That_Are_In_Sequence_Even_With_Spaces_In_Between()
    {
        // Arrange
        const string markup = "<em>Administration</em> <em>Resource</em> Officer <em>paragraphs</em>";
        const string merged = "<em>Administration Resource</em> Officer <em>paragraphs</em>";
        TagCleaner cleaner = new TagCleaner();

        // Act
        string cleaned = cleaner.CleanEM(markup);

        // Assert
        Assert.AreEqual(merged, cleaned);
    }
}
精彩评论