清理HTML标记属性

我需要用 JavaScript 处理大量 HTML，把属性的引号写法统一调整为双引号。像 disabled 或 selected 这类只有键、没有值的属性不需要处理。

这是我目前的尝试：

// Sample tag mixing unquoted, single-quoted and key-only attributes.
var text = "<input class=daily_input type='hidden' size='1' value=3 disabled />"; 
// The pattern spans a whole tag from "<" to ">", so each tag is matched only once.
// The first ([^>]+) is greedy and swallows every attribute except the last
// name=value pair, which is why only that one attribute ends up rewritten.
// Note: inside [...] the "|" characters are literal, not alternation.
var regex = /<([\w]+)([^>]+)([\w]+)=['"]?([^'\s|"\s|\s]*)['"]?([^>]+)>/gi; 
// Re-emit the tag with the single captured value ($4) wrapped in double quotes.
text = text.replace(regex, "<$1$2$3=\"$4\"$5>"); 

console.log(text); // logs <input class=daily_input type='hidden' size='1' value="3" disabled />

看起来它仍然只调整了最后一个属性。我可以在 TextMate 的正则表达式查找/替换功能中轻松验证：下面这个正则表达式能够匹配文本中 HTML 标记的每一个属性：

// Matches one name=value attribute: $1 is the name, $2 the value; the quotes around the value are optional.
/([\w]+)=['"]?([^'\s|"\s|\s]*)['"]?/gi

我怎样才能改变这种捕获并调整每个属性，不仅是最后一个？已经摆弄了很长一段时间没有结果。任何帮助表示赞赏！

来源

2010-11-03 thechriskelley

// Pass 1 rewrites ='value' as ="value"; pass 2 wraps unquoted values (everything up to the next space or ">") in double quotes.
text.replace(/='([^']*)'/g, '="$1"').replace(/=([^"'][^ >]*)/g, '="$1"')

或者（只用一次 replace）：

// Single pass: $1 captures a single-quoted value, $2 an unquoted one. Only one of the two
// groups participates in any given match (the other expands to ''), so both must be emitted;
// using "$1" alone would turn value=3 into value="".
text.replace(/='([^']*)'|=([^"'][^ >]*)/g, '="$1$2"')

来源

2010-11-03 21:37:31 thejh

首先，谢谢！这样可行。我唯一的疑问是：能否在一次 replace() 调用中完成？HTML 文件可能非常大，效率是关键。我会再多试试。 – thechriskelley 2010-11-03 22:00:40

@thechriskelley：在一个“替换”中增加了一个解决方案 – thejh 2010-11-03 22:58:45

非常好，谢谢！ – thechriskelley 2010-11-04 21:53:56

我知道这是一个迟到的回答，但你也可以直接使用 sanitize-html。它是为 Node 编写的，不过你完全可以用 browserify 对这个库（或你自己的代码）进行打包，以便在浏览器中运行。

请注意，它依赖 lodash，所以如果您已经在使用 lodash，可能需要调整打包配置。

这个例子做的事情比你需要的更多……我用这个库清理输入的 HTML，再把它转换成 Markdown 存入数据库，显示时再通过 marked 重新渲染为 HTML。

// convert/html-to-filtered-markdown.js 

'use strict'; 

var sanitize = require('sanitize-html') //https://www.npmjs.org/package/sanitize-html 
    ,toMarkdown = require('to-markdown').toMarkdown 
    ; 

module.exports = function convertHtmlToFilteredMarkdown(input, options) { 
    if (!input) return ''; 

    options = options || {}; 

    //basic cleanup, normalize line endings, normalize/reduce whitespace and extra line endings 
    var response = (input || '').toString().trim() 
    .replace(/(\r\n|\r|\n)/g, '\n') //normalize line endings 
    .replace(/“/g, '"') //remove fancy quotes 
    .replace(/”/g, '"') //remove fancy quotes 
    .replace(/‘/g, '\'') //remove fancy quotes 
    .replace(/’/g, '\'') //remove fancy quotes 
    ; 

    //sanitize html input 
    response = sanitize(response, { 
    //don't allow table elements 
    allowedTags: [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'p', 'a', 'ul', 'ol', 'nl', 'li', 'b', 'i', 'strong', 'em', 'strike', 'code', 'hr', 'br', 'div', 'table', 'thead', 'caption', 'tbody', 'tr', 'th', 'td', 'pre' ], 

    //make orderd lists 
    transformTags: { 
     'ol': 'ul' 
    } 
    }).replace(/\r\n|\r|\n/g,'\n') //normalize line endings; 

    if (!options.tables) { 
    response = response.replace(/[\s\n]*\<(\/?)(table|thead|tbody|tr|th|td)\>[\s\n]*/g, '\n\n') //replace divs/tables blocks as paragraphs 
    } 

    //cleanup input further 
    response = response 
    .replace(/[\s\n]*\<(\/?)(div|p)\>[\s\n]*/g, '\n\n') //divs and p's to simple multi-line expressions 
    .replace(/\>#/g, '\n\n#') //cleanup #'s' after closing tag, ex: <a>...</a>\n\n# will be reduced via sanitizer 
    .replace(/\\s+\</,'<') //remove space before a tag open 
    .replace(/\>\s+\n?/,'>\n') //remove space after a tag close 
    .replace(/\&?nbsp\;?/g,' ') //revert nbsp to space 
    .replace(/\<\h[12]/g,'<h3').replace(/\<\/\h[12]/g,'</h3') //reduce h1/h2 to h3 
    ; 

    //convert response to markdown 
    response = toMarkdown(response); 

    //normalize line endings 
    response = response 
    .replace(/(?:^|\n)##?[\b\s]/g,'\n### ') //reduce h1 and h2 to h3 
    .replace(/(\r\n|\r|\n)/g, '\n') //normalize line endings 
    .trim() 

    return response + '\n'; 
}

来源

2014-12-04 23:02:30 Tracker1

清理HTML标记属性

回答

相关问题