)。
我先介绍另一种方案:
首先要取出标签,如,<span style=“ color:blue”>、</span>和<script >,我们的替换范围仅局限于标签 < > 之间的内容。
然后获取所有的标签名称、属性的名称和值,如果有禁止出现的内容,就替换掉。可能的恶意代码形式如下所示:
标签的名称: <script </script
标签里的属性:<span onclick
属性的值:<img onerror=“javascript:''
最后,我们对所有的“恶意单词”进行替换:
using System; using System.Text.RegularExpressions;
/// <summary>
/// Sanitize removes tags that are not on the allow-list and neutralizes
/// script-bearing attributes inside the tags that are kept.
/// </summary>
/// <remarks>
/// NOTE(review): this is a regex-based filter, not an HTML parser. It does not
/// decode character references (e.g. "&amp;#106;avascript:") and it ignores an
/// unterminated tag that has no closing '>' — treat it as one layer of defense,
/// not a complete XSS protection.
/// </remarks>
public static class Sanitize
{
    // list of accepted/harmless tags (in lower case)
    private static readonly string[] allowedTags =
    {
        "p", "h1", "b", "i", "a", "ul", "li", "pre", "hr", "blockquote", "img"
    };

    // Matches one tag: '<', then as few characters as possible ("*?" is the
    // NON-greedy quantifier), then the first '>'. Singleline lets '.' match
    // line breaks, so a tag split over several lines is still inspected.
    private static readonly Regex tagPattern = new Regex(
        "<.*?>",
        RegexOptions.Singleline | RegexOptions.Compiled);

    // Extracts the tag name, such as "a" or "h1", from an opening or closing tag.
    private static readonly Regex tagNamePattern = new Regex(
        @"^</?(?<tagName>[^\s/>]*)",
        RegexOptions.Compiled);

    // Two alternatives, both replaced by the word SANITIZED:
    //  1. any event-handler attribute name (on...) that starts at an attribute
    //     boundary (after whitespace, '/' or a closing quote), plus optional
    //     whitespace, when it is followed by '='. "(?==)" is a positive
    //     lookahead: the '=' must be there but is not consumed. The boundary
    //     check keeps query strings such as "?online=1" intact.
    //  2. the "javascript:" pseudo-protocol wherever it appears in the tag.
    //     It must NOT sit behind the '=' lookahead: inside href="javascript:..."
    //     it is never followed by '='.
    private static readonly Regex badContentPattern = new Regex(
        @"(?<=[\s/""'])on\w+\s*(?==)|javascript:",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Sanitizes the HTML code in <paramref name="inputHtml"/>.
    /// </summary>
    /// <param name="inputHtml">The untrusted HTML fragment.</param>
    /// <returns>
    /// The input with disallowed tags removed and dangerous attribute names /
    /// "javascript:" replaced by "SANITIZED". Text outside tags is untouched.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="inputHtml"/> is null.</exception>
    public static string FixTags(string inputHtml)
    {
        if (inputHtml == null)
        {
            throw new ArgumentNullException("inputHtml");
        }

        // MatchEvaluator is a delegate; fixTag is called once per matched tag
        // and its return value replaces that tag in the result.
        return tagPattern.Replace(inputHtml, new MatchEvaluator(fixTag));
    }

    // Removes the tag if it is not in the list of allowed tags; otherwise
    // returns it with the dangerous parts replaced.
    private static string fixTag(Match tagMatch)
    {
        string tag = tagMatch.Value;

        // Invariant lower-casing: a culture-sensitive ToLower() would turn
        // "IMG" into "ımg" under a Turkish culture and miss the allow-list.
        string tagName = tagNamePattern.Match(tag).Groups["tagName"].Value.ToLowerInvariant();

        // if the tag isn't in the list of allowed tags, it should be removed
        if (Array.IndexOf(allowedTags, tagName) < 0)
        {
            return string.Empty;
        }

        // neutralize bad attributes / pseudo-protocols inside the allowed tag
        return badContentPattern.Replace(tag, "SANITIZED");
    }
}
注意代码中两处正则表达式的高级用法:非贪婪模式(“.*?” 中的 “?” 让 “.*” 尽可能少地匹配)和正向预查(“(?=…)”),详细可参考非贪婪模式和正向预查的相关资料。
这里我们就可以看到正则表达式所起到的强大作用——操作字符串的无上利器啊!
2. 除了注入攻击,另一种必须使用的技术是nofollow。因为Google的链接价值算法,我们都希望能有高价值的链接能指向我们的网站,以提高我们网站的等级。一种简单的方式就是到其他网站(如新浪)申请一个博客,然后在博客里添加一条链接,指向自己的网站即可。但如果我们自己是新浪,我们当然不愿意有其他人这样做(毕竟我们不知道其他人链接指向的网站究竟是好是坏,如果是一个垃圾网站,会牵连到我们自己的)。但是呢,我们也不愿意完全禁止掉链接的使用(比如简单的对链接进行编码,让链接失去作用),因为毕竟很多链接或许只是内部链接,而且一个能直接点击的链接能带来更好的用户体验。
为了解决这个问题,Google给出了一个方法,在链接中加上关键字nofollow,如下所示:
<a rel="nofollow" href="http://too.much.spam">cool link</a>
这样,链接能直接点击,但不会带来链接价值——即Google不会认为你认可或推荐了该链接指向的网站。看看博客园有没有这样做,……,呵呵,好像没有,很大度哟。不过据说Google也会逐步降低链接价值的作用,谣言了,随他去吧……
就直接上代码了:
using System; using System.Text.RegularExpressions;
/// <summary>
/// NoFollow contains the functionality to add rel=nofollow to untrusted links,
/// i.e. links to an external http/https host that is not on the whitelist.
/// </summary>
/// <remarks>
/// NOTE(review): attributes are read with regular expressions, not an HTML
/// parser. Character references inside href are not decoded, and a '>' inside
/// a quoted attribute value ends the tag match early.
/// </remarks>
public static class NoFollow
{
    // the white list of host names (in lower case); links to them are not altered
    private static readonly string[] whitelist =
    {
        "seoasp", "www.seoegghead.com", "www.cristiandarie.ro"
    };

    // Matches an opening <a ...> tag only: the whitespace required after "<a"
    // keeps <area>, <abbr>, <article> ... out. ".*?" is non-greedy and, with
    // Singleline, may span line breaks.
    private static readonly Regex anchorPattern = new Regex(
        @"<a\s.*?>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);

    // Matches one attribute: a name, optionally followed by '=' and a
    // double-quoted, single-quoted or unquoted value. Scanning a tag attribute
    // by attribute consumes quoted values whole, so text such as "rel=" or
    // "nofollow" inside a URL or a title is never mistaken for an attribute.
    private static readonly Regex attributePattern = new Regex(
        @"(?<name>[^\s""'<>/=]+)(?:\s*=\s*(?:""(?<value>[^""]*)""|'(?<value>[^']*)'|(?<value>[^\s""'>]+)))?",
        RegexOptions.Compiled);

    /// <summary>
    /// Finds all the links in the input string and adds rel="nofollow" to the
    /// untrusted ones.
    /// </summary>
    /// <param name="input">The HTML fragment to process.</param>
    /// <returns>The fragment with untrusted links marked as nofollow.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string FixLinks(string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        // fixLink is invoked for every matched <a ...> tag
        return anchorPattern.Replace(input, new MatchEvaluator(fixLink));
    }

    // Receives a Regex match that contains a link such as
    // <a href="http://too.much.spam/"> and adds rel=nofollow if needed.
    private static string fixLink(Match linkMatch)
    {
        string singleLink = linkMatch.Value;

        // no href at all: nothing to protect against
        Match href = findAttribute(singleLink, "href");
        if (href == null)
        {
            return singleLink;
        }

        // local links (relative URLs, mailto:, ...) are returned as they are
        string host;
        if (!tryGetExternalHost(href.Groups["value"].Value, out host))
        {
            return singleLink;
        }

        // if the host is in the whitelist, don't alter the link
        if (Array.IndexOf(whitelist, host) >= 0)
        {
            return singleLink;
        }

        // no rel attribute yet: add one right after "<a" (the match always
        // starts with those two characters)
        Match rel = findAttribute(singleLink, "rel");
        if (rel == null)
        {
            return singleLink.Insert(2, " rel=\"nofollow\"");
        }

        // if the link already has rel=nofollow, return it back as it is
        string relValue = rel.Groups["value"].Value.Trim();
        if (hasToken(relValue, "nofollow"))
        {
            return singleLink;
        }

        // Keep the existing rel tokens (e.g. "noopener") and append nofollow.
        string newValue = relValue.Length == 0 ? "nofollow" : relValue + " nofollow";

        // A single-quoted value may itself contain a double quote; pick the
        // quote character that cannot terminate the value early.
        string quote = newValue.IndexOf('"') >= 0 ? "'" : "\"";

        return singleLink.Substring(0, rel.Index)
            + rel.Groups["name"].Value + "=" + quote + newValue + quote
            + singleLink.Substring(rel.Index + rel.Length);
    }

    // Returns the first attribute of the tag with the given name
    // (case-insensitive), or null when the tag has no such attribute.
    private static Match findAttribute(string tag, string attributeName)
    {
        // start at index 2 to skip the "<a" tag name itself
        for (Match m = attributePattern.Match(tag, 2); m.Success; m = m.NextMatch())
        {
            if (string.Equals(m.Groups["name"].Value, attributeName, StringComparison.OrdinalIgnoreCase))
            {
                return m;
            }
        }

        return null;
    }

    // Returns true and the lower-case host name when url is an absolute http
    // or https URL (or a protocol-relative "//host/..." one); false otherwise.
    private static bool tryGetExternalHost(string url, out string host)
    {
        host = null;
        url = url.Trim();

        // "//host/path" inherits the page's scheme but still leaves the site
        if (url.StartsWith("//", StringComparison.Ordinal))
        {
            url = "http:" + url;
        }

        // TryCreate instead of "new Uri": a local link such as
        // "/go?u=http://x" must not throw UriFormatException.
        Uri uri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out uri))
        {
            return false;
        }

        // Uri normalizes the scheme to lower case, so "HTTP://" is covered too.
        if (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps)
        {
            return false;
        }

        host = uri.Host.ToLowerInvariant();
        return true;
    }

    // Tells whether the whitespace-separated list contains the token
    // (case-insensitive).
    private static bool hasToken(string list, string token)
    {
        string[] tokens = list.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
        foreach (string candidate in tokens)
        {
            if (string.Equals(candidate, token, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }

        return false;
    }
}
|