-
Notifications
You must be signed in to change notification settings - Fork 1.4k
html解析
RubyLouvre edited this page Jan 24, 2016
·
1 revision
23号的版本
// Matches an element that has both an open tag and a close tag.
// Capture 1 is the tag name, capture 3 is everything up to the LAST matching
// close tag (greedy), so the match may span several sibling elements.
// Attribute names exclude ">" so that an attribute can never swallow the
// bracket that ends the open tag.
var rfullTag = /^<([^\s>\/=.$<]+)(\s+[^=\s>]+(?:=(?:"[^"]*"|'[^']*'|[^>\s]+))?)*\s*>([\s\S]*)<\/\1>/
// Matches an element that consists of an open tag only (void elements),
// with an optional "/" before the closing bracket.
//http://www.colorglare.com/2014/02/03/to-close-or-not-to-close.html
//http://blog.jobbole.com/61514/
var rvoidTag = /^<([^\s>\/=.$<]+)(\s+([^=\s>]+)(?:=("[^"]*"|'[^']*'|[^\s>]+))?)*\s*\/?>/
// Attribute-list-plus-">" suffix used to build a regex matching the open tag
// of one specific tag name: new RegExp("<" + tagName + openStr).
// This is a string handed to `new RegExp`, so every regex backslash must be
// doubled; a single "\s" would silently become the letter "s".
var openStr = "(?:\\s+[^=\\s>]+(?:=(?:\"[^\"]*\"|'[^']*'|[^>\\s]+))?)*\\s*>"
// Matches a text node: everything up to the next "<".
var rtext = /^[^<]+/
// Matches a comment node; capture 1 is the comment body.
var rcomment = /^<!--([\w\W]*?)-->/
// Extracts all attributes (plus the closing ">") of the first tag found in a
// larger chunk of markup.
var rallAttrs = /(\s+[^\s>\/\/=]+(?:=(?:("|')(?:\\\2|\\?(?!\2)[\w\W])*\2|[^\s'">=]+))?)*\s*\/?>/g
// Virtual-node constructors, loaded from the project's vdom module.
var vdom = require("../vdom/index")
var VText = vdom.VText
var VComment = vdom.VComment
var VElement = vdom.VElement
// Candidate pattern for matching a whole <script> element (kept for reference only):
// /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi
// Names of tags listed as "no content".
// NOTE(review): not referenced by the 23rd-version createVirtual shown on this page.
var rnocontent = /textarea|template|script|style/
var tagCache = {}// Caches the per-tag regexes that match open tags and close tags
//=== === === === Build the virtual DOM tree === === === === =
// Depends on avalon.config.
// This stage only produces VElement, VText and VComment nodes.
/**
 * Splits an HTML fragment into a flat list of top-level virtual nodes.
 *
 * Quoted strings are masked with same-length runs of "1" so that markup
 * characters inside attribute values cannot confuse the tag regexes. The mask
 * is only used for matching: because it preserves offsets, every value handed
 * to a node constructor is sliced from the original, unmasked input.
 *
 * @param {string} aaa - HTML source to parse.
 * @param {boolean} [force] - When falsy, the input is only parsed if
 *   avalon.config.rbind matches it; otherwise an empty array is returned.
 * @returns {Array} VText / VComment / VElement instances in document order.
 *   Parsing stops at the first position where nothing matches.
 */
function createVirtual(aaa, force) {
    var nodes = []
    if (!force && !avalon.config.rbind.test(aaa)) {
        return nodes
    }
    var text = aaa.replace(rstring, function (quoted) {
        return new Array(quoted.length + 1).join("1")
    })
    // Number of characters already consumed; indexes both `aaa` and the mask.
    var offset = 0
    do {
        var matchText = ""
        var node = false
        var tagName, attrs, source
        var match = text.match(rtext)
        if (match) {// try a text node
            matchText = match[0]
            node = new VText(aaa.substr(offset, matchText.length))
        }
        if (!node) {// try a comment node
            match = text.match(rcomment)
            if (match) {
                matchText = match[0]
                // strip the 4-char "<!--" prefix and the 3-char "-->" suffix
                node = new VComment(aaa.slice(offset + 4, offset + matchText.length - 3))
            }
        }
        if (!node) {// try an element that has a close tag
            match = text.match(rfullTag)
            if (match) {
                // The match is greedy, so it may extend over later siblings
                // that share the tag name; find where this element really ends.
                tagName = match[1]
                var ropen = tagCache[tagName + "open"] ||
                        (tagCache[tagName + "open"] = new RegExp("<" + tagName + openStr, "g"))
                var rclose = tagCache[tagName + "close"] ||
                        (tagCache[tagName + "close"] = new RegExp("<\/" + tagName + ">", "g"))
                // Record every open (+1) and close (-1) tag with its numeric
                // offset. Numbers keep the ordering correct for fragments of
                // any length.
                var marks = []
                /* jshint ignore:start */
                match[0].replace(rclose, function (tag, index) {
                    marks.push({index: index, step: -1})
                    return new Array(tag.length + 1).join("1")
                }).replace(ropen, function (tag, index) {
                    marks.push({index: index, step: 1})
                    return new Array(tag.length + 1).join("1")
                })
                /* jshint ignore:end */
                marks.sort(function (a, b) {
                    return a.index - b.index
                })
                //<div><div>01</div><div>02</div></div><div>222</div><div>333</div>
                // yields + + - + - - + - + -, and the element ends at the
                // close tag that first brings the depth back to zero.
                var depth = 0
                var end = -1
                for (var i = 0, n = marks.length; i < n; i++) {
                    depth += marks[i].step
                    if (depth < 0) {// a close tag before any open tag: unbalanced
                        break
                    }
                    if (depth === 0) {
                        end = marks[i].index + tagName.length + 3 // "</" + ">" are three characters
                        break
                    }
                }
                // When no balancing close tag exists the element is left to
                // the void-tag branch below instead of being sliced wrongly.
                if (end !== -1) {
                    matchText = match[0].slice(0, end) // the real outerHTML (masked)
                    source = aaa.substr(offset, end) // the real outerHTML (original)
                    rallAttrs.lastIndex = 0 // global regex: always search from the start
                    var attrMatch = rallAttrs.exec(matchText)
                    rallAttrs.lastIndex = 0
                    attrs = source.substr(attrMatch.index, attrMatch[0].length)
                    var innerHTML = source.slice(attrMatch.index + attrMatch[0].length,
                            (tagName.length + 3) * -1)
                    // drop the trailing ">" from the attribute text
                    node = new VElement(tagName, attrs.slice(0, -1), innerHTML)
                }
            }
        }
        if (!node) {// try a void / self-closing tag
            match = text.match(rvoidTag)
            if (match) {
                matchText = match[0]
                // properties that are not meant to be serialised should not go into props
                tagName = match[1].toLowerCase()
                source = aaa.substr(offset, matchText.length)
                attrs = source.slice(tagName.length + 1).replace(/\/?>$/, "")
                // a VElement may later be turned into a VComponent here
                node = new VElement(tagName, attrs, "")
                node.isVoidTag = true
            }
        }
        if (node) {
            nodes.push(node)
            text = text.slice(matchText.length)
            offset += matchText.length
        } else {
            break
        }
    } while (1);
    return nodes
}
// Expose the parser both as this module's export and as avalon.createVirtual.
module.exports = avalon.createVirtual = createVirtual
23号版本的问题:正则太复杂,解析太慢。
24号的版本:先处理掉所有字符串,再优化所有正则。
/**
 * Returns the text of the block comment written inside a function body.
 *
 * The function is converted to its source text; everything up to and including
 * the first comment opener (with an optional "!" and one optional whitespace
 * character) is removed, then the comment closer and whatever follows it.
 *
 * @param {Function} fn - Function whose body holds the block comment.
 * @returns {string} The comment's content.
 */
function heredoc(fn) {
    var source = fn.toString()
    var withoutHead = source.replace(/^[^\/]+\/\*!?\s?/, '')
    return withoutHead.replace(/\*\/[^\/]+$/, '')
}
// Matches a text node: everything up to the next "<".
var rtext = /^[^<]+/
// Matches a comment node; capture 1 is the comment body.
var rcomment = /^<!--([\w\W]*?)-->/
// NOTE(review): as written this matches a bare "&"; it may originally have
// targeted an HTML entity that was decoded when the page was rendered - confirm.
var ramp = /&/g
// Matches a single- or double-quoted string on one line, honouring backslash escapes.
var rstring = /(["'])(\\(?:\r\n|[\s\S])|(?!\1)[^\\\r\n])*\1/g
// Caches the per-tag regexes that match open tags and close tags.
var tagCache = {}
// Attribute-list-plus-">" suffix used to build the open-tag regex of one tag
// name: new RegExp("<" + type + openStr). Quoted values are expected to have
// been replaced by placeholders already, so values are plain non-space runs.
// Attribute names exclude ">" so that a value-less attribute (or whitespace
// before ">") cannot make the match run past the end of the open tag.
var openStr = "(?:\\s+[^=\\s>]+(?:\\=[^>\\s]+)?)*\\s*>"
// Matches an element that has both an open and a close tag. The body is
// matched greedily, so the match may span several sibling elements.
var rfullTag = /^<([^\s>\/=.$<]+)(?:\s+[^=\s>]+(?:=[^>\s]+)?)*\s*>(?:[\s\S]*)<\/\1>/
// Matches an element that consists of an open tag only (void elements or
// self-contained tags). Capture 1 is the tag name, capture 2 the attribute text.
//http://www.colorglare.com/2014/02/03/to-close-or-not-to-close.html
//http://blog.jobbole.com/61514/
var rvoidTag = /^<([^\s>\/=.$<]+)\s*([^>]*?)\/?>/
// Placeholder key -> original quoted string.
var maps = {}
// Counter used to build the next placeholder key.
var number = 1
/**
 * Stores a value under a freshly numbered "??N" placeholder and returns that
 * placeholder. Usable directly as a String.prototype.replace callback.
 *
 * @param {string} a - Text to stash (the matched string when used with replace).
 * @returns {string} The placeholder key under which `a` was stored in `maps`.
 */
function dig(a) {
    var placeholder = "??" + number
    number += 1
    maps[placeholder] = a
    return placeholder
}
// Matches the "??N" placeholders used as keys in `maps`.
var rfill = /\?\?\d+/g
/**
 * Swaps a placeholder back for the text stored under it and releases the entry.
 * Intended as a String.prototype.replace callback for `rfill`.
 *
 * Text that merely looks like a placeholder but has no entry in `maps` is
 * returned unchanged; returning undefined from a replace callback would
 * otherwise insert the literal text "undefined".
 *
 * @param {string} a - Placeholder key.
 * @returns {string} The stored text, or `a` itself when nothing is stored.
 */
function fill(a) {
    if (!Object.prototype.hasOwnProperty.call(maps, a)) {
        return a
    }
    var val = maps[a]
    delete maps[a]
    return val
}
/**
 * Appends every element of `other` to `target` in place.
 *
 * @param {Array} target - Array that receives the elements.
 * @param {Array} other - Elements to append.
 */
function pushArray(target, other) {
    var append = target.push
    append.apply(target, other)
}
// Candidate pattern for matching a whole <script> element (kept for reference only):
// /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi
// Tested against the lower-cased tag name in createVirtual; elements that match
// are flagged skipContent instead of having their content parsed into children.
// The pattern is unanchored, so a name that merely contains one of these words
// (for example "noscript") matches as well.
var rnocontent = /textarea|template|script|style/
/**
 * Parses an HTML fragment into a tree of plain-object virtual nodes.
 *
 * Quoted strings are first swapped for "??N" placeholders (see dig) so that
 * whitespace, "=" or ">" inside attribute values cannot confuse the tag
 * regexes; they are restored (see fill) when node values are produced.
 *
 * Node shapes:
 *   text     {type: "#text", nodeValue}
 *   comment  {type: "#comment", nodeValue}   (nodeValue is the whole comment markup)
 *   element  {type, props: [{name, value}], template, children}
 *            plus skipContent: true when the content is not parsed,
 *            or isVoidTag: true for tags without a close tag.
 *
 * @param {string} str - HTML source, with its quoted strings intact.
 * @param {boolean} [recursive] - Accepted for backward compatibility. Strings
 *   are protected on every call (top level included), and nested calls are
 *   given fully restored markup, so the flag no longer changes the result.
 * @returns {Array<Object>} Nodes in document order. Parsing stops at the first
 *   position where nothing matches.
 */
function createVirtual(str, recursive) {
    var text = str.replace(rstring, dig)
    var nodes = []
    do {
        var matchText = ""
        var node = false
        var attrs = []
        var type
        var match = text.match(rtext)
        if (match) {// try a text node
            matchText = match[0]
            node = {
                type: "#text",
                nodeValue: matchText.replace(rfill, fill)
            }
        }
        if (!node) {// try a comment node
            match = text.match(rcomment)
            if (match) {
                matchText = match[0]
                node = {
                    type: "#comment",
                    nodeValue: matchText.replace(rfill, fill)
                }
            }
        }
        if (!node) {// try an element that has a close tag
            match = text.match(rfullTag)
            if (match) {
                // The match is greedy, so it may extend over later siblings
                // that share the tag name; find where this element really ends.
                type = match[1].toLowerCase()//nodeName
                // `type` is lower-cased, so the tag regexes must ignore case
                // to still find tags written as <DIV>.
                var ropen = tagCache[type + "open"] ||
                        (tagCache[type + "open"] = new RegExp("<" + type + openStr, "gi"))
                var rclose = tagCache[type + "close"] ||
                        (tagCache[type + "close"] = new RegExp("<\/" + type + ">", "gi"))
                // Record every open (+1) and close (-1) tag with its numeric
                // offset. Numbers keep the ordering correct for fragments of
                // any length.
                var marks = []
                /* jshint ignore:start */
                match[0].replace(ropen, function (tag, index) {
                    marks.push({index: index, step: 1})
                    return new Array(tag.length + 1).join("1")
                }).replace(rclose, function (tag, index) {
                    marks.push({index: index, step: -1})
                    return tag
                })
                /* jshint ignore:end */
                marks.sort(function (a, b) {
                    return a.index - b.index
                })
                //<div><div>01</div><div>02</div></div><div>222</div><div>333</div>
                // yields + + - + - - + - + -, and the element ends at the
                // close tag that first brings the depth back to zero.
                var depth = 0
                var end = -1
                for (var i = 0, n = marks.length; i < n; i++) {
                    depth += marks[i].step
                    if (depth < 0) {// a close tag before any open tag: unbalanced
                        break
                    }
                    if (depth === 0) {
                        end = marks[i].index + type.length + 3 // "</" + ">" are three characters
                        break
                    }
                }
                // When no balancing close tag exists the element is left to
                // the void-tag branch below instead of being sliced wrongly.
                if (end !== -1) {
                    matchText = match[0].slice(0, end) // the real outerHTML
                    var open = matchText.match(rvoidTag) // open tag; capture 2 = attribute text
                    if (open[2]) {
                        attrs = parseAttrs(open[2])
                    }
                    var template = matchText.slice(open[0].length,
                            (type.length + 3) * -1) // innerHTML, still holding placeholders
                    var innerHTML = template.replace(rfill, fill)
                    node = {
                        type: type,
                        props: attrs,
                        template: innerHTML,
                        children: []
                    }
                    // props is an array of {name, value}, so look the attribute up by name
                    var hasSkip = attrs.some(function (attr) {
                        return attr.name === "ms-skip"
                    })
                    if (hasSkip) {
                        node.skipContent = true
                    } else if (type === "option" || type === "xmp") {
                        node.children.push({
                            type: "text",
                            template: innerHTML
                        })
                    } else if (rnocontent.test(type)) {// textarea, template, script, style
                        node.skipContent = true
                    } else {
                        // Recurse on the restored markup: fill() has already
                        // released this subtree's placeholders, so the nested
                        // call must create its own.
                        pushArray(node.children, createVirtual(innerHTML, true))
                    }
                }
            }
        }
        if (!node) {// try a void / self-closing tag
            match = text.match(rvoidTag)
            if (match) {
                matchText = match[0]
                type = match[1].toLowerCase()// same casing rule as the branch above
                if (match[2]) {
                    attrs = parseAttrs(match[2])
                }
                node = {
                    type: type,
                    props: attrs,
                    template: "",
                    children: [],
                    isVoidTag: true
                }
            }
        }
        if (node) {
            nodes.push(node)
            text = text.slice(matchText.length)
        } else {
            break
        }
    } while (1);
    return nodes
}
/**
 * Parses the attribute text of an open tag into a list of name/value pairs.
 *
 * Whitespace around "=" is removed, the text is split on whitespace, and each
 * token is split at its FIRST "=" so that values containing "=" are kept.
 * Placeholders in a value are restored through fill, and a value that is
 * wrapped in one pair of matching quotes has those quotes removed.
 *
 * @param {string} str - Attribute text, e.g. 'id=??1 class=??2 disabled'.
 * @returns {Array<{name: string, value: string}>} Attributes in source order;
 *   an attribute without "=" gets the value "".
 */
function parseAttrs(str) {
    var attrs = []
    str.replace(/\s*=\s*/g, "=").replace(/\S+/g, function (attr) {
        var eq = attr.indexOf("=")
        if (eq === -1) {
            attrs.push({
                name: attr,
                value: ""
            })
            return
        }
        var value = attr.slice(eq + 1).replace(rfill, fill)
        // Check the quotes directly instead of calling .test() on the shared
        // global `rstring`: a /g regex remembers lastIndex between calls, so
        // every other quoted value would otherwise keep its quotes.
        var quote = value.charAt(0)
        var isQuoted = (quote === '"' || quote === "'") &&
                value.length > 1 && value.charAt(value.length - 1) === quote
        if (isQuoted) {
            // NOTE(review): as written, both replaces below substitute a
            // character with itself; they may originally have decoded HTML
            // entities that were lost when the page was rendered - confirm.
            value = value.replace(ramp, "&").
                    replace(/"/g, '"').
                    slice(1, -1)
        }
        attrs.push({
            name: attr.slice(0, eq),
            value: value
        })
    })
    return attrs
}
// Demo: build a sample HTML fragment, parse it, and print both.
// The block comment inside the function below is runtime data: heredoc()
// reads it back from the function's source text, so it must stay exactly
// as written and no other comment may be placed inside that function.
var str = heredoc(function () {
/*
<div ms-data-number="number"
ms-data-number2="number2"
ms-data-bool="bool"
ms-data-bool2="bool2"
ms-data-void="vv"
ms-data-null="nn"
ms-data-array="array"
ms-data-date="date"
ms-data-object="object"
ms-data-fn="show"
>点我</div><div id=aaa><div>1111<b></b></div></div><div>222</div>
<br /><hr id=eee >
*/
}).trim()
// Print the extracted markup, then the node list parsed from it.
console.log(str)
var nodes = createVirtual(str)
console.log(nodes)