2012-05-17 45 views
0

用白名单正则表达式允许 YouTube 的 iframe：我需要对 HTML 输入（来自一个 RSS feed）进行验证，然后将其显示在 MVC 视图中

我使用下面这种白名单方式来净化 HTML：

// NOTE: none of the patterns below use RegexOptions.IgnoreCase; they only match
// lower-case tags, so the candidate tag must be lower-cased before it is tested.

// Matches anything that looks like an HTML tag: "<" followed by everything up to
// the next ">" (or up to the end of the input for an unterminated tag).
private static readonly Regex _tags = new Regex("<[^>]*(>|$)",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

// Attribute-less opening/closing formatting tags (b, blockquote, code, dd, dt, dl, del,
// em, h1-h3, i, kbd, u, li, ol, p, pre, s, sub, sup, strong, strike, ul) plus <br> and <hr>.
private static readonly Regex _whitelist = new Regex(@"
    ^</?(b(lockquote)?|code|d(d|t|l|el)|em|h(1|2|3)|i|kbd|u|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)>$|
    ^<(b|h)r\s?/?>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

// Anchor tags: href must be "#<digits>" or an http/https/ftp URL, optionally followed by
// a title attribute; the closing </a> tag is accepted as well.
private static readonly Regex _whitelist_a = new Regex(@"
    ^<a\s
    href=""(\#\d+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+)""
    (\stitle=""[^""<>]+"")?\s?>$|
    ^</a>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

// Image tags: an http/https src followed by optional width, height, alt and title
// attributes, in exactly that order.
private static readonly Regex _whitelist_img = new Regex(@"
    ^<img\s
    src=""https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+""
    (\swidth=""\d{1,3}"")?
    (\sheight=""\d{1,3}"")?
    (\salt=""[^""<>]*"")?
    (\stitle=""[^""<>]*"")?
    \s?/?>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);


/// <summary>
/// Sanitizes raw HTML with a whitelist approach: every tag that does not match one of
/// the whitelist patterns is removed, while whitelisted tags and all text between tags
/// are left untouched.
/// CODESNIPPET:4100A61A-1711-4366-B0B0-144D1179A937
/// </summary>
/// <param name="html">The raw HTML to sanitize; may be null or empty.</param>
/// <returns>
/// The input with all non-whitelisted tags removed. A null or empty input is returned as-is.
/// </returns>
public static string Sanitize(string html)
{
    if (String.IsNullOrEmpty(html)) return html;

    // One pass over the input: each tag-like match is either kept verbatim or replaced
    // by the empty string. This avoids re-copying the whole string once per removed tag.
    return _tags.Replace(html, candidate =>
    {
        // The whitelist patterns are lower-case only, so compare a lower-cased copy
        // but emit the tag exactly as it was written.
        string normalized = candidate.Value.ToLowerInvariant();

        bool allowed = _whitelist.IsMatch(normalized)
            || _whitelist_a.IsMatch(normalized)
            || _whitelist_img.IsMatch(normalized);

        return allowed ? candidate.Value : String.Empty;
    });
}

我还想允许通过 iframe 或 HTML5 video 标签显示来自 YouTube 或 Vimeo 的视频内容。

有没有人能给我指个正确的方向，提供一个更灵活一点的正则表达式？

下面是我为 iframe 写的正则表达式：

// Iframe tags: src must be an http/https URL on player.vimeo.com or www.youtube.com,
// optionally followed by width, height, frameborder and allowfullscreen, in exactly that
// order; the closing </iframe> tag is accepted as well.
// The dots in the host names are escaped on purpose: an unescaped "." matches any
// character, which would let look-alike hosts such as "wwwxyoutube.com" through.
// The pattern is lower-case only (no IgnoreCase), so test a lower-cased tag against it.
private static readonly Regex _whitelist_iframe = new Regex(@"
      ^<iframe\s
      src=""https?://(player\.vimeo\.com|www\.youtube\.com)/[-a-z0-9+&@#/%?=~_|!:,.;\(\)|\s]+""
      (\swidth=""\d{1,3}"")?
      (\sheight=""\d{1,3}"")?
      (\sframeborder=""\d{1,3}"")?
      (\sallowfullscreen)?
      \s?>$|^</iframe>$",
      RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
+1

这一点已经被多次提到（最经典的回答是 http://stackoverflow.com/a/1732454/1191903）——不要用正则表达式来解析 HTML 标签。 –

回答

1

上面的正则表达式方法过于严格，更何况凯文提出的观点也很有道理！

这里就是我所做的:

按照这个 stackoverflow answer 中提到的做法，使用 html-agility-pack 解析 HTML 并对其进行净化。

我还添加了一些代码，用正则表达式检查图片或 iframe 的 src 属性。（我敢肯定，这还可以做得更好）

/// <summary>
/// Whitelist-based HTML sanitizer built on HtmlAgilityPack.
/// Elements that are not whitelisted are stripped (their child content is kept),
/// attributes that are not whitelisted for an element are removed, and the URLs in
/// img/iframe src and a href are checked against regular expressions.
/// </summary>
/// <remarks>
/// Works in two passes: the first pass renames every element that must go to a
/// placeholder name, the second pass re-parses the result and strips the placeholders.
/// The sanitizer keeps no state between calls, so one instance can be reused.
/// </remarks>
public class HtmlSanitizer
{
    // Placeholder name given to every element that has to be stripped.
    private const string RemovableNodeName = "removeableNode";

    // XPath selecting all placeholder elements. Lower-cased because the original code
    // built its XPath from HtmlNode.Name read back after renaming.
    // NOTE(review): this relies on HtmlAgilityPack lower-casing element names - confirm
    // against the library version in use.
    private static readonly string RemovableNodesXPath = "//" + RemovableNodeName.ToLowerInvariant();

    // Allowed img src: an absolute http/https URL made only of the listed characters.
    private static readonly Regex _srcAttribute = new Regex(@"^https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+$", RegexOptions.Singleline | RegexOptions.IgnoreCase
        | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // Allowed iframe src: http/https on player.vimeo.com or www.youtube.com only.
    // Anchored with ^...$ and with the host dots escaped; otherwise a value that merely
    // contains an allowed URL somewhere (or uses a look-alike host) would be accepted.
    private static readonly Regex _iframeSrc = new Regex(@"^https?://(player\.vimeo\.com|www\.youtube\.com)/[-a-z0-9+&@#/%?=~_|!:,.;\(\)|\s]+$", RegexOptions.Singleline | RegexOptions.IgnoreCase
        | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // Allowed a href: must start with http://, https://, ftp:// or "#" (in-page anchor).
    // Everything else - in particular "javascript:" URLs - is rejected.
    private static readonly Regex _hrefAttribute = new Regex(@"^((https?|ftp)://|\#)", RegexOptions.Singleline | RegexOptions.IgnoreCase
        | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // Element name -> attribute names allowed on it (null means "no attributes allowed").
    private readonly IDictionary<string, string[]> _whitelist;

    /// <summary>
    /// Creates a sanitizer with the built-in element/attribute whitelist.
    /// </summary>
    public HtmlSanitizer()
    {
        _whitelist = new Dictionary<string, string[]>
            {
                {"a", new[] {"href", "target", "title"}},
                {"img", new[] {"src", "alt", "width", "height"}},
                {"iframe", new[] {"src", "width", "height", "frameborder", "allowfullscreen" }},
                {"strong", null},
                {"em", null},
                {"blockquote", null},
                {"b", null},
                {"p", null},
                {"ul", null},
                {"ol", null},
                {"li", null},
                {"div", new[] {"align"}},
                {"strike", null},
                {"u", null},
                {"sub", null},
                {"sup", null},
                {"table", null},
                {"tr", null},
                {"td", null},
                {"th", null},
                {"dd", null},
                {"dt", null},
                {"dl", null},
                {"h1", null},
                {"h2", null},
                {"h3", null},
            };
    }

    /// <summary>
    /// Sanitizes the given HTML fragment against the whitelist.
    /// </summary>
    /// <param name="input">Raw HTML; may be null, empty or whitespace.</param>
    /// <returns>
    /// The sanitized HTML, or an empty string when the input is null, empty or whitespace.
    /// </returns>
    public string Sanitize(string input)
    {
        if (input == null || input.Trim().Length < 1)
            return string.Empty;

        var htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(input);

        // Pass 1: strip attributes and rename every element that has to be removed.
        SanitizeNode(htmlDocument.DocumentNode);

        // Pass 2: re-parse the intermediate markup and drop the renamed elements.
        return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim());
    }

    // Sanitizes every child of the given node, last child first.
    private void SanitizeChildren(HtmlNode parentNode)
    {
        for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
        {
            SanitizeNode(parentNode.ChildNodes[i]);
        }
    }

    // Sanitizes one node (whitelist check plus attribute clean-up for elements)
    // and then recurses into its children.
    private void SanitizeNode(HtmlNode node)
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            string[] allowedAttributes;
            if (!_whitelist.TryGetValue(node.Name, out allowedAttributes))
            {
                MarkForRemoval(node);
            }
            else if (node.HasAttributes)
            {
                SanitizeAttributes(node, allowedAttributes);
            }
        }

        if (node.HasChildNodes)
        {
            SanitizeChildren(node);
        }
    }

    // Removes every attribute that is not whitelisted for the element and validates the
    // URL-carrying attributes: a bad img/iframe src marks the whole element for removal,
    // a bad a href only drops the href attribute.
    private static void SanitizeAttributes(HtmlNode node, string[] allowedAttributes)
    {
        // Captured up front: marking the node for removal changes node.Name.
        string elementName = node.Name;

        for (int i = node.Attributes.Count - 1; i >= 0; i--)
        {
            HtmlAttribute currentAttribute = node.Attributes[i];

            if (allowedAttributes == null || !allowedAttributes.Contains(currentAttribute.Name))
            {
                node.Attributes.Remove(currentAttribute);
                continue;
            }

            if (currentAttribute.Name == "src")
            {
                Regex srcPattern = null;
                if (elementName == "img") srcPattern = _srcAttribute;
                else if (elementName == "iframe") srcPattern = _iframeSrc;

                if (srcPattern != null && !srcPattern.IsMatch(currentAttribute.Value))
                {
                    // The element is going to be stripped entirely, so its remaining
                    // attributes no longer matter.
                    MarkForRemoval(node);
                    return;
                }
            }
            else if (elementName == "a" && currentAttribute.Name == "href")
            {
                if (!_hrefAttribute.IsMatch(currentAttribute.Value))
                {
                    node.Attributes.Remove(currentAttribute);
                }
            }
        }
    }

    // Renames the element to the placeholder name so that the second pass strips it.
    private static void MarkForRemoval(HtmlNode node)
    {
        // Raw-text elements (script, style, ...) keep their content as unparsed text.
        // Once the element is renamed, that text would be re-parsed as live markup in the
        // second pass and survive with its attributes intact, so it is dropped here.
        // NOTE(review): relies on HtmlNode.IsCDataElement reporting script/style - confirm
        // for the HtmlAgilityPack version in use.
        if (HtmlNode.IsCDataElement(node.Name))
        {
            node.RemoveAllChildren();
        }

        node.Name = RemovableNodeName;
    }

    // Re-parses the intermediate markup and removes every placeholder element while
    // keeping its children in place.
    private static string StripHtml(string html)
    {
        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection invalidNodes = htmlDoc.DocumentNode.SelectNodes(RemovableNodesXPath);
        if (invalidNodes != null)
        {
            foreach (HtmlNode node in invalidNodes)
            {
                node.ParentNode.RemoveChild(node, true);
            }
        }

        return htmlDoc.DocumentNode.WriteContentTo();
    }
}