diff --git a/Sources/Attributes.swift b/Sources/Attributes.swift index e169bb8..5b70f3b 100644 --- a/Sources/Attributes.swift +++ b/Sources/Attributes.swift @@ -152,10 +152,30 @@ open class Attributes: NSCopying { return hasKeyIgnoreCase(key: key.utf8Array) } + @inline(__always) + private func asciiLowercase(_ byte: UInt8) -> UInt8 { + return (byte >= 65 && byte <= 90) ? (byte + 32) : byte + } + open func hasKeyIgnoreCase(key: T) -> Bool where T.Element == UInt8 { - return attributes.contains(where: { $0.getKeyUTF8().caseInsensitiveCompare(key) == .orderedSame}) + let keyCount = key.count + for attr in attributes { + let attrKey = attr.getKeyUTF8() + if attrKey.count != keyCount { continue } + var attrIter = attrKey.makeIterator() + var keyIter = key.makeIterator() + var equal = true + while let a = attrIter.next(), let b = keyIter.next() { + if asciiLowercase(a) != asciiLowercase(b) { + equal = false + break + } + } + if equal { return true } + } + return false } - + /** Get the number of attributes in this set. @return size diff --git a/Sources/CharacterReader.swift b/Sources/CharacterReader.swift index f25ed54..cca46f3 100644 --- a/Sources/CharacterReader.swift +++ b/Sources/CharacterReader.swift @@ -48,8 +48,8 @@ public final class CharacterReader { } } - public func currentUTF8() -> [UInt8] { - guard pos < end else { return TokeniserStateVars.eofUTF8 } + public func currentUTF8() -> ArraySlice { + guard pos < end else { return TokeniserStateVars.eofUTF8Slice } let firstByte = input[pos] @@ -81,7 +81,7 @@ public final class CharacterReader { } // Return the valid UTF-8 byte sequence - return Array(input[pos..<(pos + length)]) + return input[pos..<(pos + length)] } @discardableResult @@ -452,31 +452,22 @@ public final class CharacterReader { public func matchesLetter() -> Bool { guard pos < end else { return false } - var buffer = [UInt8](repeating: 0, count: 4) - var length = 0 - - buffer[0] = input[pos] - length = 1 + let firstByte = input[pos] + var length = 1 - if buffer[0] & 0b10000000 != 0 { // Multibyte sequence - if buffer[0] & 0b11100000 == 0b11000000, pos + 1 < end { - buffer[1] = input[pos + 1] + if firstByte & 0b10000000 != 0 { + if firstByte & 0b11100000 == 0b11000000, pos + 1 < end { length = 2 - } else if buffer[0] & 0b11110000 == 0b11100000, pos + 2 < end { - buffer[1] = input[pos + 1] - buffer[2] = input[pos + 2] + } else if firstByte & 0b11110000 == 0b11100000, pos + 2 < end { length = 3 - } else if buffer[0] & 0b11111000 == 0b11110000, pos + 3 < end { - buffer[1] = input[pos + 1] - buffer[2] = input[pos + 2] - buffer[3] = input[pos + 3] + } else if firstByte & 0b11111000 == 0b11110000, pos + 3 < end { length = 4 } else { return false } } - return Self.letters.contains(buffer[.. Bool { diff --git a/Sources/Element.swift b/Sources/Element.swift index 27d9350..8b4c67b 100644 --- a/Sources/Element.swift +++ b/Sources/Element.swift @@ -686,9 +686,14 @@ open class Element: Node { public func getElementsByTag(_ tagName: [UInt8]) throws -> Elements { try Validate.notEmpty(string: tagName) let normalizedTagName = tagName.lowercased().trim() - let weakElements = self.normalizedTagNameIndex[normalizedTagName] ?? [] - let elements = weakElements.compactMap { $0.value } - return Elements(elements) + + if isQueryIndexDirty || normalizedTagNameIndex == nil { + rebuildQueryIndexesForAllTags() + isQueryIndexDirty = false + } + + let weakElements = normalizedTagNameIndex?[normalizedTagName] ?? [] + return Elements(weakElements.compactMap { $0.value }) } /** @@ -1158,7 +1163,7 @@ open class Element: Node { var inClass: Bool = false var start: Int = 0 for i in 0..]] = [:] + internal var normalizedTagNameIndex: [[UInt8]: [Weak]]? = nil +// internal lazy var normalizedTagNameIndex: [[UInt8]: [Weak]] = [:] + + @usableFromInline + internal var isQueryIndexDirty: Bool = false /** * Get the list index of this node in its node sibling list. I.e. if this is the first node @@ -883,71 +884,43 @@ extension Node: CustomDebugStringConvertible { } internal extension Node { - func rebuildQueryIndexesForThisNodeOnly() { - var newNormalizedTagNameIndex: [[UInt8]: [Weak]] = [:] - var stack: [Node] = self.childNodes + @inlinable + func markQueryIndexDirty() { + isQueryIndexDirty = true + parentNode?.markQueryIndexDirty() + } + + @usableFromInline + func rebuildQueryIndexesForAllTags() { + var newIndex: [[UInt8]: [Weak]] = [:] + var queue: [Node] = [self] + queue.reserveCapacity(childNodeSize()) - while !stack.isEmpty { - let node = stack.removeLast() + var index = 0 + while index < queue.count { + let node = queue[index] + index += 1 // Move to the next element + if let element = node as? Element { let key = element.tagNameNormalUTF8() - newNormalizedTagNameIndex[key, default: []].append(Weak(element)) + newIndex[key, default: []].append(Weak(element)) } - stack.append(contentsOf: node.childNodes) // Add children to stack + + queue.append(contentsOf: node.childNodes) } - normalizedTagNameIndex = newNormalizedTagNameIndex + normalizedTagNameIndex = newIndex + isQueryIndexDirty = false } - @inlinable - func updateQueryIndex(for nodes: [Node], adding: Bool) { - for element in nodes.lazy.compactMap({ $0 as? Element }) { - let key = element.tagNameNormalUTF8() - if adding { - normalizedTagNameIndex[key, default: []].append(Weak(element)) - } else { - if var list = normalizedTagNameIndex[key] { - list.removeAll { $0.value == element } - if list.isEmpty { - normalizedTagNameIndex.removeValue(forKey: key) - } else { - normalizedTagNameIndex[key] = list - } - } - } - } - - propagateQueryIndexUpdateUpward(removed: adding ? nil : nodes, added: adding ? nodes : nil) + @usableFromInline + func rebuildQueryIndexesForThisNodeOnly() { + normalizedTagNameIndex = nil + markQueryIndexDirty() } @inlinable - func propagateQueryIndexUpdateUpward(removed: [Node]?, added: [Node]?) { - var currentNode: Node? = self - - while let node = currentNode, let parent = node.parentNode { - if let removed { - for element in removed.lazy.compactMap({ $0 as? Element }) { - let key = element.tagNameNormalUTF8() - if var elements = parent.normalizedTagNameIndex[key] { - elements.removeAll { $0.value == element } - if elements.isEmpty { - parent.normalizedTagNameIndex.removeValue(forKey: key) - } else { - parent.normalizedTagNameIndex[key] = elements - } - } - } - } - - if let added { - for element in added.lazy.compactMap({ $0 as? Element }) { - let key = element.tagNameNormalUTF8() - // TODO: reserve capacity - parent.normalizedTagNameIndex[key, default: []].append(Weak(element)) - } - } - - currentNode = parent - } + func updateQueryIndex(for nodes: [Node], adding: Bool) { + markQueryIndexDirty() } } diff --git a/Sources/String.swift b/Sources/String.swift index 34839f2..5533d40 100644 --- a/Sources/String.swift +++ b/Sources/String.swift @@ -32,9 +32,19 @@ extension UInt8 { extension Array: Comparable where Element == UInt8 { @inline(__always) public func lowercased() -> [UInt8] { - map { $0 >= 65 && $0 <= 90 ? $0 + 32 : $0 } + // Check if any element needs lowercasing + guard self.contains(where: { $0 >= 65 && $0 <= 90 }) else { return self } + // Only allocate a new array if necessary + var result = self + for i in result.indices { + let b = result[i] + if b >= 65 && b <= 90 { + result[i] = b + 32 + } + } + return result } - + func uppercased() -> [UInt8] { map { $0 >= 97 && $0 <= 122 ? $0 - 32 : $0 } } @@ -282,7 +292,7 @@ extension String { return self[i] as Character } - func utf8CharAt(_ i: Int) -> UTF8Char { + func utf8ByteAt(_ i: Int) -> UInt8 { return self.utf8Array[i] } diff --git a/Sources/Tokeniser.swift b/Sources/Tokeniser.swift index 33cfc81..e6fc7f6 100644 --- a/Sources/Tokeniser.swift +++ b/Sources/Tokeniser.swift @@ -19,7 +19,8 @@ final class Tokeniser { private var state: TokeniserState = TokeniserState.Data // current tokenisation state private var emitPending: Token? // the token we are about to emit on next read private var isEmitPending: Bool = false - private var charsString: [UInt8]? // characters pending an emit. Will fall to charsBuilder if more than one + private var charsSlice: ArraySlice? = nil // characters pending an emit. Will fall to charsBuilder if more than one + private var pendingSlices = [ArraySlice]() private let charsBuilder: StringBuilder = StringBuilder(1024) // buffers characters to output as one token, if more than one emit per read let dataBuffer: StringBuilder = StringBuilder(1024) // buffers data looking for @@ -42,27 +43,33 @@ final class Tokeniser { error("Self closing flag not acknowledged") selfClosingFlagAcknowledged = true } - + while (!isEmitPending) { try state.read(self, reader) } - - // if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read: + if !charsBuilder.isEmpty { let str = charsBuilder.buffer charsBuilder.clear() - charsString = nil + // Clear any pending slices, as the builder takes precedence. + pendingSlices.removeAll() return charPending.data(str) - } else if (charsString != nil) { - let token: Token = charPending.data(charsString!) - charsString = nil - return token + } else if !pendingSlices.isEmpty { + // Combine all the pending slices in one allocation. + let totalCount = pendingSlices.reduce(0) { $0 + $1.count } + var combined = [UInt8]() + combined.reserveCapacity(totalCount) + for slice in pendingSlices { + combined.append(contentsOf: slice) + } + pendingSlices.removeAll() + return charPending.data(combined) } else { isEmitPending = false return emitPending! } } - + func emit(_ token: Token) throws { try Validate.isFalse(val: isEmitPending, msg: "There is an unread token pending!") @@ -84,16 +91,7 @@ final class Tokeniser { } func emit(_ str: ArraySlice) { - // Buffer strings up until last string token found, to emit only one token for a run of character refs, etc. - // Does not set isEmitPending; read checks that - if charsString == nil { - charsString = Array(str) // Convert to Array when first string is stored - } else { - if charsBuilder.isEmpty { // Switching to string builder as more than one emit before read - charsBuilder.append(charsString!) - } - charsBuilder.append(str) // Append directly from the slice - } + pendingSlices.append(str) } func emit(_ str: [UInt8]) { diff --git a/Sources/TokeniserState.swift b/Sources/TokeniserState.swift index 2b89143..6631c8b 100644 --- a/Sources/TokeniserState.swift +++ b/Sources/TokeniserState.swift @@ -15,6 +15,7 @@ protocol TokeniserStateProtocol { public class TokeniserStateVars { public static let nullScalr: UnicodeScalar = "\u{0000}" public static let nullScalrUTF8 = "\u{0000}".utf8Array + public static let nullScalrUTF8Slice = ArraySlice(nullScalrUTF8) static let attributeSingleValueChars = ParsingStrings(["'", UnicodeScalar.Ampersand, nullScalr]) static let attributeDoubleValueChars = ParsingStrings(["\"", UnicodeScalar.Ampersand, nullScalr]) @@ -31,6 +32,7 @@ public class TokeniserStateVars { static let replacementStr: [UInt8] = Array(Tokeniser.replacementChar.utf8) static let eof: UnicodeScalar = CharacterReader.EOF static let eofUTF8 = String(CharacterReader.EOF).utf8Array + static let eofUTF8Slice = ArraySlice(String(CharacterReader.EOF).utf8Array) } enum TokeniserState: TokeniserStateProtocol { @@ -107,17 +109,17 @@ enum TokeniserState: TokeniserStateProtocol { switch self { case .Data: switch (r.currentUTF8()) { - case UTF8Arrays.ampersand: + case UTF8ArraySlices.ampersand: t.advanceTransition(.CharacterReferenceInData) break - case UTF8Arrays.tagStart: + case UTF8ArraySlices.tagStart: t.advanceTransition(.TagOpen) break - case TokeniserStateVars.nullScalrUTF8: + case TokeniserStateVars.nullScalrUTF8Slice: t.error(self) // NOT replacement character (oddly?) t.emit(r.consume()) break - case TokeniserStateVars.eofUTF8: + case TokeniserStateVars.eofUTF8Slice: try t.emit(Token.EOF()) break default: @@ -131,18 +133,18 @@ enum TokeniserState: TokeniserStateProtocol { break case .Rcdata: switch (r.currentUTF8()) { - case UTF8Arrays.ampersand: + case UTF8ArraySlices.ampersand: t.advanceTransition(.CharacterReferenceInRcdata) break - case UTF8Arrays.tagStart: + case UTF8ArraySlices.tagStart: t.advanceTransition(.RcdataLessthanSign) break - case TokeniserStateVars.nullScalrUTF8: + case TokeniserStateVars.nullScalrUTF8Slice: t.error(self) r.advance() t.emit(TokeniserStateVars.replacementChar) break - case TokeniserStateVars.eofUTF8: + case TokeniserStateVars.eofUTF8Slice: try t.emit(Token.EOF()) break default: @@ -162,12 +164,12 @@ enum TokeniserState: TokeniserStateProtocol { break case .PLAINTEXT: switch (r.currentUTF8()) { - case TokeniserStateVars.nullScalrUTF8: + case TokeniserStateVars.nullScalrUTF8Slice: t.error(self) r.advance() t.emit(TokeniserStateVars.replacementChar) break - case TokeniserStateVars.eofUTF8: + case TokeniserStateVars.eofUTF8Slice: try t.emit(Token.EOF()) break default: @@ -179,13 +181,13 @@ enum TokeniserState: TokeniserStateProtocol { case .TagOpen: // from < in data switch r.currentUTF8() { - case UTF8Arrays.bang: + case UTF8ArraySlices.bang: t.advanceTransition(.MarkupDeclarationOpen) break - case UTF8Arrays.forwardSlash: + case UTF8ArraySlices.forwardSlash: t.advanceTransition(.EndTagOpen) break - case UTF8Arrays.questionMark: + case UTF8ArraySlices.questionMark: t.advanceTransition(.BogusComment) break default: @@ -413,14 +415,14 @@ enum TokeniserState: TokeniserStateProtocol { } switch (r.currentUTF8()) { - case UTF8Arrays.hyphen: + case UTF8ArraySlices.hyphen: t.emit(UTF8Arrays.hyphen) t.advanceTransition(.ScriptDataEscapedDash) break - case UTF8Arrays.tagStart: + case UTF8ArraySlices.tagStart: t.advanceTransition(.ScriptDataEscapedLessthanSign) break - case TokeniserStateVars.nullScalrUTF8: + case TokeniserStateVars.nullScalrUTF8Slice: t.error(self) r.advance() t.emit(TokeniserStateVars.replacementChar) @@ -519,20 +521,20 @@ enum TokeniserState: TokeniserStateProtocol { case .ScriptDataDoubleEscaped: let c = r.currentUTF8() switch (c) { - case UTF8Arrays.hyphen: + case UTF8ArraySlices.hyphen: t.emit(c) t.advanceTransition(.ScriptDataDoubleEscapedDash) break - case UTF8Arrays.tagStart: + case UTF8ArraySlices.tagStart: t.emit(c) t.advanceTransition(.ScriptDataDoubleEscapedLessthanSign) break - case TokeniserStateVars.nullScalrUTF8: + case TokeniserStateVars.nullScalrUTF8Slice: t.error(self) r.advance() t.emit(TokeniserStateVars.replacementChar) break - case TokeniserStateVars.eofUTF8: + case TokeniserStateVars.eofUTF8Slice: t.eofError(self) t.transition(.Data) break @@ -1003,15 +1005,15 @@ enum TokeniserState: TokeniserStateProtocol { case .Comment: let c = r.currentUTF8() switch (c) { - case UTF8Arrays.hyphen: + case UTF8ArraySlices.hyphen: t.advanceTransition(.CommentEndDash) break - case TokeniserStateVars.nullScalrUTF8: + case TokeniserStateVars.nullScalrUTF8Slice: t.error(self) r.advance() t.commentPending.data.append(TokeniserStateVars.replacementChar) break - case TokeniserStateVars.eofUTF8: + case TokeniserStateVars.eofUTF8Slice: t.eofError(self) try t.emitCommentPending() t.transition(.Data) @@ -1590,15 +1592,15 @@ enum TokeniserState: TokeniserStateProtocol { private static func readData(_ t: Tokeniser, _ r: CharacterReader, _ current: TokeniserState, _ advance: TokeniserState)throws { switch (r.currentUTF8()) { - case UTF8Arrays.tagStart: + case UTF8ArraySlices.tagStart: t.advanceTransition(advance) break - case TokeniserStateVars.nullScalrUTF8: + case TokeniserStateVars.nullScalrUTF8Slice: t.error(current) r.advance() t.emit(TokeniserStateVars.replacementChar) break - case TokeniserStateVars.eofUTF8: + case TokeniserStateVars.eofUTF8Slice: try t.emit(Token.EOF()) break default: diff --git a/Sources/UTF8Arrays.swift b/Sources/UTF8Arrays.swift index 9b9f59f..525d8bc 100644 --- a/Sources/UTF8Arrays.swift +++ b/Sources/UTF8Arrays.swift @@ -80,4 +80,86 @@ public enum UTF8Arrays { public static let br = "br".utf8Array public static let frameset = "frameset".utf8Array public static let blobColon = "blob:".utf8Array - } +} + +public enum UTF8ArraySlices { + public static let whitespace = UTF8Arrays.whitespace[...] + public static let bang = UTF8Arrays.bang[...] + public static let equalSign = UTF8Arrays.equalSign[...] + public static let ampersand = UTF8Arrays.ampersand[...] + public static let hyphen = UTF8Arrays.hyphen[...] + public static let underscore = UTF8Arrays.underscore[...] + public static let semicolon = UTF8Arrays.semicolon[...] + public static let questionMark = UTF8Arrays.questionMark[...] + public static let forwardSlash = UTF8Arrays.forwardSlash[...] + public static let selfClosingTagEnd = UTF8Arrays.selfClosingTagEnd[...] + public static let endTagStart = UTF8Arrays.endTagStart[...] + public static let tagStart = UTF8Arrays.tagStart[...] + public static let tagEnd = UTF8Arrays.tagEnd[...] + public static let attributeEqualsQuoteMark = UTF8Arrays.attributeEqualsQuoteMark[...] + public static let quoteMark = UTF8Arrays.quoteMark[...] + public static let html = UTF8Arrays.html[...] + public static let head = UTF8Arrays.head[...] + public static let meta = UTF8Arrays.meta[...] + public static let body = UTF8Arrays.body[...] + public static let a = UTF8Arrays.a[...] + public static let p = UTF8Arrays.p[...] + public static let li = UTF8Arrays.li[...] + public static let span = UTF8Arrays.span[...] + public static let img = UTF8Arrays.img[...] + public static let action = UTF8Arrays.action[...] + public static let prompt = UTF8Arrays.prompt[...] + public static let comment = UTF8Arrays.comment[...] + public static let hash = UTF8Arrays.hash[...] + public static let hashRoot = UTF8Arrays.hashRoot[...] + public static let ruby = UTF8Arrays.ruby[...] + public static let rb = UTF8Arrays.rb[...] + public static let rp = UTF8Arrays.rp[...] + public static let rt = UTF8Arrays.rt[...] + public static let rtc = UTF8Arrays.rtc[...] + public static let page = UTF8Arrays.page[...] + public static let table = UTF8Arrays.table[...] + public static let tbody = UTF8Arrays.tbody[...] + public static let th = UTF8Arrays.th[...] + public static let tr = UTF8Arrays.tr[...] + public static let td = UTF8Arrays.td[...] + public static let thead = UTF8Arrays.thead[...] + public static let tfoot = UTF8Arrays.tfoot[...] + public static let optgroup = UTF8Arrays.optgroup[...] + public static let select = UTF8Arrays.select[...] + public static let form = UTF8Arrays.form[...] + public static let plaintext = UTF8Arrays.plaintext[...] + public static let button = UTF8Arrays.button[...] + public static let image = UTF8Arrays.image[...] + public static let value = UTF8Arrays.value[...] + public static let nobr = UTF8Arrays.nobr[...] + public static let input = UTF8Arrays.input[...] + public static let type = UTF8Arrays.type[...] + public static let hidden = UTF8Arrays.hidden[...] + public static let caption = UTF8Arrays.caption[...] + public static let hr = UTF8Arrays.hr[...] + public static let svg = UTF8Arrays.svg[...] + public static let isindex = UTF8Arrays.isindex[...] + public static let label = UTF8Arrays.label[...] + public static let xmp = UTF8Arrays.xmp[...] + public static let textarea = UTF8Arrays.textarea[...] + public static let iframe = UTF8Arrays.iframe[...] + public static let noembed = UTF8Arrays.noembed[...] + public static let option = UTF8Arrays.option[...] + public static let math = UTF8Arrays.math[...] + public static let sarcasm = UTF8Arrays.sarcasm[...] + public static let name = UTF8Arrays.name[...] + public static let col = UTF8Arrays.col[...] + public static let colgroup = UTF8Arrays.colgroup[...] + public static let frame = UTF8Arrays.frame[...] + public static let base = UTF8Arrays.base[...] + public static let href = UTF8Arrays.href[...] + public static let noscript = UTF8Arrays.noscript[...] + public static let noframes = UTF8Arrays.noframes[...] + public static let style = UTF8Arrays.style[...] + public static let title = UTF8Arrays.title[...] + public static let script = UTF8Arrays.script[...] + public static let br = UTF8Arrays.br[...] + public static let frameset = UTF8Arrays.frameset[...] + public static let blobColon = UTF8Arrays.blobColon[...] +}