In article <#mBSmzk7FHA.736@TK2MSFTNGP09.phx.gbl>,
.NET Developer <a@b.com> wrote:
: I'm trying to write a RegEx that will find all occurrences of a
: particular type of HTML anchor <a> element in a big block of HTML.
: Here are the pattern requirements - they consist of certain attributes
: being present, basically:
:
: must start with "<a" followed by 1 or more white-space characters
: then, before the closing tag ">" it must contain 3 attributes:
:
: 1. class
: 2. href
: 3. id
:
: the value of each of these 3 attributes must be a non-empty
: string. Both single and double quotes have to be able to work and it
: should not be case sensitive.
It's extremely important to note that regular expressions aren't
parsers. For general HTML, a correct solution requires a parser.
If you accept this limitation, you can do what you want with a regular
expression, but the code is ugly. A better tool would be XPath
expressions, but you seem to be using HTML and not XHTML.
The code below is a start, and I hope you find it helpful -- even if
only as cause for choosing a better approach. :-)
Enjoy,
Greg
using System;
using System.Collections;
using System.Text.RegularExpressions;
using NUnit.Framework;
namespace App
{
/// <summary>
/// Wraps a chunk of HTML and reports whether it contains a "special" anchor:
/// an <c>&lt;a&gt;</c> start tag whose attributes are exactly <c>href</c>,
/// <c>id</c> and <c>class</c> (in any order, each appearing once), each with
/// a non-empty value in single or double quotes. Matching is case-insensitive.
/// </summary>
/// <remarks>
/// This is a regular-expression approximation, not an HTML parser; unquoted
/// attribute values and additional attributes are not recognised.
/// </remarks>
public class Chunk
{
    private readonly string html;

    /// <summary>Creates a chunk over the given HTML text.</summary>
    /// <param name="html">The HTML text to inspect.</param>
    public Chunk(string html)
    {
        this.html = html;
    }

    // Any one of the three required attribute names.
    private const string AttributeName = @"(?:href|id|class)";

    // A non-empty attribute value in double or single quotes. The value is
    // matched with a negated character class rather than a lazy ".*?" so it
    // can never run past its own closing quote into a neighbouring attribute
    // or a following tag.
    private const string AttributeValue = @"(?:""[^""]+""|'[^']+')";

    // The negative lookaheads on the second and third attribute reject a name
    // that was already captured, so every one of the three names must appear
    // exactly once.
    private static readonly Regex special = new Regex(
        @"<a
        \s+
        (?<a1>" + AttributeName + @")\s*=\s*" + AttributeValue + @"
        \s+
        (?<a2>(?!\k<a1>)" + AttributeName + @")\s*=\s*" + AttributeValue + @"
        \s+
        ((?!\k<a1>|\k<a2>)" + AttributeName + @")\s*=\s*" + AttributeValue + @"
        \s*>
        ",
        RegexOptions.IgnorePatternWhitespace |
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Gets a list that contains the whole HTML chunk when it holds at least
    /// one special anchor, and is empty otherwise.
    /// </summary>
    public ArrayList SpecialAnchors
    {
        get
        {
            ArrayList hits = new ArrayList();
            if (special.IsMatch(html))
            {
                hits.Add(html);
            }
            return hits;
        }
    }
}
/// <summary>
/// NUnit fixture for <c>Chunk.SpecialAnchors</c>. The positive cases cover
/// the attribute orderings and upper-case markup; the negative cases cover
/// missing, empty and duplicated attributes.
/// </summary>
[TestFixture]
public class Test
{
    // ---- helpers -------------------------------------------------------

    /// <summary>Runs the given markup through a fresh Chunk.</summary>
    private static ArrayList HitsFor(string html)
    {
        return new Chunk(html).SpecialAnchors;
    }

    /// <summary>Expects exactly one hit, equal to the markup itself.</summary>
    private void AssertSingleHit(string html)
    {
        ArrayList hits = HitsFor(html);

        Assert.AreEqual(1, hits.Count, "bad .Count");
        Assert.AreEqual(html, (string) hits[0]);
    }

    /// <summary>Expects the markup to produce no hits at all.</summary>
    private void AssertNoHits(string html)
    {
        ArrayList hits = HitsFor(html);

        Assert.AreEqual(0, hits.Count);
    }

    // ---- markup that must be reported ----------------------------------

    [Test]
    public void HrefClassId()
    {
        string markup = "<a href=\"MyPage.aspx\" class=\"MyAnchor\" id=\"Anchor1\" >";
        AssertSingleHit(markup);
    }

    [Test]
    public void ClassIdHref()
    {
        string markup = "<a class=\"MyAnchor\" id=\"Anchor1\" href=\"MyPage.aspx\">";
        AssertSingleHit(markup);
    }

    [Test]
    public void IdHrefClass()
    {
        string markup = "<a id=\"Anchor1\" href=\"MyPage.aspx\" class=\"MyAnchor\">";
        AssertSingleHit(markup);
    }

    [Test]
    public void UpperCase()
    {
        string markup = "<A ID=\"Anchor1\" HREF=\"MyPage.aspx\" CLASS=\"MyAnchor\">";
        AssertSingleHit(markup);
    }

    // ---- markup that must be rejected ----------------------------------

    [Test]
    public void MustHaveAttributes()
    {
        string markup = "<a></a>";
        AssertNoHits(markup);
    }

    [Test]
    public void EmptyHref()
    {
        string markup = "<a href=\"\" class=\"MyAnchor\" id=\"Anchor1\">";
        AssertNoHits(markup);
    }

    [Test]
    public void MissingHref()
    {
        string markup = "<a class=\"MyAnchor\" id=\"Anchor1\" >";
        AssertNoHits(markup);
    }

    [Test]
    public void NoId()
    {
        string markup = "<a href=\"MyPage.aspx\" class=\"MyAnchor\">";
        AssertNoHits(markup);
    }

    [Test]
    public void EmptyId()
    {
        string markup = "<a href=\"MyPage.aspx\" class=\"MyAnchor\" id=\"\">";
        AssertNoHits(markup);
    }

    [Test]
    public void TwoHrefsAB()
    {
        string markup = "<a href=\"MyPage.aspx\" href=\"MyPage.aspx\" id=\"Anchor1\" >";
        AssertNoHits(markup);
    }

    [Test]
    public void TwoHrefsAC()
    {
        string markup = "<a href=\"MyPage.aspx\" id=\"Anchor1\" href=\"MyPage.aspx\" >";
        AssertNoHits(markup);
    }

    [Test]
    public void TwoHrefsBC()
    {
        string markup = "<a id=\"Anchor1\" href=\"MyPage.aspx\" href=\"MyPage.aspx\" >";
        AssertNoHits(markup);
    }

    [Test]
    public void TwoIdsAB()
    {
        string markup = "<a id=\"Anchor1\" id=\"Anchor1\" href=\"MyPage.aspx\" >";
        AssertNoHits(markup);
    }
}
}
--
If you make an optimization and don't measure to confirm the performance
increase, all you know for certain is that you've made your code harder
to read.