diff --git a/Sources/Attribute.swift b/Sources/Attribute.swift index f95c3d9..df48a0f 100644 --- a/Sources/Attribute.swift +++ b/Sources/Attribute.swift @@ -19,7 +19,9 @@ open class Attribute { "selected", "sortable", "truespeed", "typemustmatch" ]) + @usableFromInline var key: [UInt8] + @usableFromInline var value: [UInt8] public init(key: [UInt8], value: [UInt8]) throws { @@ -86,15 +88,16 @@ open class Attribute { */ public func html() -> String { let accum = StringBuilder() - html(accum: accum, out: (Document("")).outputSettings()) + html(accum: accum, out: (Document([])).outputSettings()) return accum.toString() } - + + @inlinable public func html(accum: StringBuilder, out: OutputSettings ) { accum.append(key) if (!shouldCollapseAttribute(out: out)) { accum.append(UTF8Arrays.attributeEqualsQuoteMark) - Entities.escape(accum, Array(value), out, true, false, false) + Entities.escape(&accum.buffer, value, out, true, false, false) accum.append(UTF8Arrays.quoteMark) } } diff --git a/Sources/Attributes.swift b/Sources/Attributes.swift index 80d8f6a..4df3f04 100644 --- a/Sources/Attributes.swift +++ b/Sources/Attributes.swift @@ -22,15 +22,17 @@ import Foundation * */ open class Attributes: NSCopying { - public static var dataPrefix: [UInt8] = "data-".utf8Array // Stored by lowercased key, but key case is checked against the copy inside // the Attribute on retrieval. + @usableFromInline lazy var attributes: [Attribute] = [] internal var lowercasedKeysCache: [[UInt8]]? = nil - public init() {} + public init() { + attributes.reserveCapacity(16) + } @usableFromInline internal func updateLowercasedKeysCache() { @@ -81,11 +83,13 @@ open class Attributes: NSCopying { @param key attribute key @param value attribute value */ + @inlinable open func put(_ key: [UInt8], _ value: [UInt8]) throws { let attr = try Attribute(key: key, value: value) put(attribute: attr) } + @inlinable open func put(_ key: String, _ value: String) throws { return try put(key.utf8Array, value.utf8Array) } @@ -95,6 +99,7 @@ open class Attributes: NSCopying { @param key attribute key @param value attribute value */ + @inlinable open func put(_ key: [UInt8], _ value: Bool) throws { if (value) { try put(attribute: BooleanAttribute(key: key)) @@ -107,6 +112,7 @@ open class Attributes: NSCopying { Set a new attribute, or replace an existing one by (case-sensitive) key. @param attribute attribute */ + @inlinable open func put(attribute: Attribute) { let key = attribute.getKeyUTF8() if let ix = attributes.firstIndex(where: { $0.getKeyUTF8() == key }) { @@ -229,13 +235,14 @@ open class Attributes: NSCopying { */ open func html()throws -> String { let accum = StringBuilder() - try html(accum: accum, out: Document("").outputSettings()) // output settings a bit funky, but this html() seldom used + try html(accum: accum, out: Document([]).outputSettings()) // output settings a bit funky, but this html() seldom used return accum.toString() } + @inlinable public func html(accum: StringBuilder, out: OutputSettings ) throws { for attr in attributes { - accum.append(" ") + accum.append(UTF8Arrays.whitespace) attr.html(accum: accum, out: out) } } diff --git a/Sources/BooleanAttribute.swift b/Sources/BooleanAttribute.swift index 1a5b834..cf73564 100644 --- a/Sources/BooleanAttribute.swift +++ b/Sources/BooleanAttribute.swift @@ -16,6 +16,7 @@ open class BooleanAttribute: Attribute { * Create a new boolean attribute from unencoded (raw) key. * @param key attribute key */ + @usableFromInline init(key: [UInt8]) throws { try super.init(key: key, value: []) } diff --git a/Sources/Document.swift b/Sources/Document.swift index fb798ca..7021ba4 100644 --- a/Sources/Document.swift +++ b/Sources/Document.swift @@ -421,8 +421,10 @@ public class OutputSettings: NSCopying { */ public enum Syntax {case html, xml} - private var _escapeMode: Entities.EscapeMode = Entities.EscapeMode.base - private var _encoder: String.Encoding = String.Encoding.utf8 // Charset.forName("UTF-8") + @usableFromInline + internal var _escapeMode: Entities.EscapeMode = Entities.EscapeMode.base + @usableFromInline + internal var _encoder: String.Encoding = String.Encoding.utf8 // Charset.forName("UTF-8") private var _prettyPrint: Bool = true private var _outline: Bool = false private var _indentAmount: UInt = 1 @@ -438,6 +440,7 @@ public class OutputSettings: NSCopying { * The default escape mode is base. * @return the document's current escape mode */ + @inlinable public func escapeMode() -> Entities.EscapeMode { return _escapeMode } @@ -449,6 +452,7 @@ public class OutputSettings: NSCopying { * @return the document's output settings, for chaining */ @discardableResult + @inlinable public func escapeMode(_ escapeMode: Entities.EscapeMode) -> OutputSettings { self._escapeMode = escapeMode return self @@ -462,9 +466,11 @@ public class OutputSettings: NSCopying { * input charset. Otherwise, it defaults to UTF-8. * @return the document's current charset. */ + @inlinable public func encoder() -> String.Encoding { return _encoder } + @inlinable public func charset() -> String.Encoding { return _encoder } diff --git a/Sources/Elements.swift b/Sources/Elements.swift index 0f08521..00593fc 100644 --- a/Sources/Elements.swift +++ b/Sources/Elements.swift @@ -190,7 +190,7 @@ open class Elements: NSCopying { let sb: StringBuilder = StringBuilder() for element: Element in this { if !sb.isEmpty { - sb.append(" ") + sb.append(UTF8Arrays.whitespace) } sb.append(try element.text(trimAndNormaliseWhitespace: trimAndNormaliseWhitespace)) } diff --git a/Sources/Entities.swift b/Sources/Entities.swift index 110e636..028cc9c 100644 --- a/Sources/Entities.swift +++ b/Sources/Entities.swift @@ -204,162 +204,95 @@ public class Entities { } public static func escape(_ string: String, _ out: OutputSettings) -> String { - let accum = StringBuilder()//string.characters.count * 2 - escape(accum, string.utf8Array, out, false, false, false) - // try { - // - // } catch (IOException e) { - // throw new SerializationException(e) // doesn't happen - // } - return accum.toString() + var accum = [UInt8]() + accum.reserveCapacity(string.utf8.count * 2) + escape(&accum, string.utf8Array, out, false, false, false) + return String(decoding: accum, as: UTF8.self) + } + + @inline(__always) + internal static func utf8CharLength(for byte: UInt8) -> Int { + if byte < 0x80 { return 1 } + else if byte < 0xE0 { return 2 } + else if byte < 0xF0 { return 3 } + else { return 4 } } // this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations + @usableFromInline static func escape( - _ accum: StringBuilder, + _ accum: inout [UInt8], _ string: [UInt8], _ out: OutputSettings, _ inAttribute: Bool, _ normaliseWhite: Bool, _ stripLeadingWhite: Bool ) { - var lastWasWhite = false - var reachedNonWhite = false - let escapeMode: EscapeMode = out.escapeMode() - let encoder: String.Encoding = out.encoder() - - var i = 0 - while i < string.count { - let byte = string[i] - - if normaliseWhite && byte.isWhitespace { + let escapeMode = out.escapeMode() + let encoder = out.encoder() + var i = 0, n = string.count + var lastWasWhite = false, reachedNonWhite = false + while i < n { + let b = string[i] + if normaliseWhite && b.isWhitespace { var j = i - // Skip all consecutive whitespace - while j < string.count && string[j].isWhitespace { - j += 1 - } - // If leading or consecutive whitespace should be skipped + while j < n && string[j].isWhitespace { j += 1 } if (!reachedNonWhite && stripLeadingWhite) || lastWasWhite { - i = j - continue + i = j; continue } - accum.append(spaceString) // Append one space (normalize) - lastWasWhite = true - i = j - continue + accum.append(0x20) + lastWasWhite = true; i = j; continue } lastWasWhite = false reachedNonWhite = true - - if byte < 0x80 { - // Single-byte ASCII character - switch byte { - case 0x26: // '&' - accum.append(ampEntityUTF8) - case 0xA0: // Non-breaking space - if escapeMode != .xhtml { - accum.append(nbspEntityUTF8) - } else { - accum.append(xa0EntityUTF8) - } - case 0x3C: // '<' - if !inAttribute || escapeMode == .xhtml { - accum.append(ltEntityUTF8) - } else { - accum.append(byte) - } - case 0x3E: // '>' - if !inAttribute { - accum.append(gtEntityUTF8) - } else { - accum.append(byte) - } - case 0x22: // '"' - if inAttribute { - accum.append(quotEntityUTF8) - } else { - accum.append(byte) - } + if b < 0x80 { + switch b { + case 0x26: accum.append(contentsOf: ampEntityUTF8) + case 0xA0: accum.append(contentsOf: escapeMode == .xhtml ? xa0EntityUTF8 : nbspEntityUTF8) + case 0x3C: + if !inAttribute || escapeMode == .xhtml { accum.append(contentsOf: ltEntityUTF8) } else { accum.append(b) } + case 0x3E: + if !inAttribute { accum.append(contentsOf: gtEntityUTF8) } else { accum.append(b) } + case 0x22: + if inAttribute { accum.append(contentsOf: quotEntityUTF8) } else { accum.append(b) } default: - if encoder == .ascii || encoder == .utf8 || encoder == .utf16 || canEncode(byte: byte, encoder: encoder) { - accum.append(byte) + if encoder == .ascii || encoder == .utf8 || encoder == .utf16 || canEncode(byte: b, encoder: encoder) { + accum.append(b) } else { - appendEncoded(accum: accum, escapeMode: escapeMode, bytes: [byte]) + appendEncoded(accum: &accum, escapeMode: escapeMode, bytes: [b]) } } i += 1 } else { - // Multi-byte UTF-8 character - var charBytes: [UInt8] = [] - var remainingBytes = 0 - - if byte & 0xE0 == 0xC0 { - // Two-byte character - remainingBytes = 1 - } else if byte & 0xF0 == 0xE0 { - // Three-byte character - remainingBytes = 2 - } else if byte & 0xF8 == 0xF0 { - // Four-byte character - remainingBytes = 3 + let len = utf8CharLength(for: b) + let end = i + len <= n ? i + len : n + let charBytes = Array(string[i.. 0, i + 1 < string.count { - i += 1 - let nextByte = string[i] - if nextByte & 0xC0 == 0x80 { - charBytes.append(nextByte) - remainingBytes -= 1 - } else { - // Invalid UTF-8 sequence - appendEncoded(accum: accum, escapeMode: escapeMode, bytes: [byte]) - break - } - } - - if remainingBytes == 0 { - // Successfully collected a valid multi-byte character - if canEncode(bytes: charBytes, encoder: encoder) { - accum.append(charBytes) - } else { - appendEncoded(accum: accum, escapeMode: escapeMode, bytes: charBytes) - } + appendEncoded(accum: &accum, escapeMode: escapeMode, bytes: charBytes) } - i += 1 + i += len } } } - + @inlinable - internal static func appendEncoded(accum: StringBuilder, escapeMode: EscapeMode, bytes: [UInt8]) { + internal static func appendEncoded(accum: inout [UInt8], escapeMode: EscapeMode, bytes: [UInt8]) { if let name = escapeMode.nameForCodepoint(bytes) { - // Append named entity (e.g., "&") accum.append(0x26) // '&' - accum.append(name) + accum.append(contentsOf: name) accum.append(0x3B) // ';' } else { - // Convert bytes into a UnicodeScalar guard let scalar = String(bytes: bytes, encoding: .utf8)?.unicodeScalars.first else { - // Fallback for invalid encoding - accum.append([0x26, 0x23, 0x78]) // '&#x' - for byte in bytes { - accum.append(String.toHexString(n: Int(byte))) - } - accum.append(0x3B) // ';' + accum.append(contentsOf: [0x26, 0x23, 0x78]) // '&#x' + for b in bytes { accum.append(contentsOf: String.toHexString(n: Int(b)).utf8Array) } + accum.append(0x3B) return } - - // Append numeric entity for the scalar - accum.append([0x26, 0x23, 0x78]) // '&#x' - accum.append(String.toHexString(n: Int(scalar.value))) - accum.append(0x3B) // ';' + accum.append(contentsOf: [0x26, 0x23, 0x78]) + accum.append(contentsOf: String.toHexString(n: Int(scalar.value)).utf8Array) + accum.append(0x3B) } } diff --git a/Sources/Node.swift b/Sources/Node.swift index bc5a785..00e4015 100644 --- a/Sources/Node.swift +++ b/Sources/Node.swift @@ -38,7 +38,6 @@ open class Node: Equatable, Hashable { @usableFromInline internal var normalizedTagNameIndex: [[UInt8]: [Weak]]? = nil -// internal lazy var normalizedTagNameIndex: [[UInt8]: [Weak]] = [:] @usableFromInline internal var isQueryIndexDirty: Bool = false @@ -313,10 +312,12 @@ open class Node: Equatable, Hashable { * Get the number of child nodes that this node holds. * @return the number of child nodes that this node holds. */ + @inline(__always) public func childNodeSize() -> Int { return childNodes.count } + @inline(__always) public func hasChildNodes() -> Bool { return !childNodes.isEmpty } @@ -706,7 +707,7 @@ open class Node: Equatable, Hashable { // if this node has no document (or parent), retrieve the default output settings func getOutputSettings() -> OutputSettings { - return ownerDocument() != nil ? ownerDocument()!.outputSettings() : (Document("")).outputSettings() + return ownerDocument() != nil ? ownerDocument()!.outputSettings() : (Document([])).outputSettings() } /** @@ -909,7 +910,10 @@ internal extension Node { func rebuildQueryIndexesForAllTags() { var newIndex: [[UInt8]: [Weak]] = [:] var queue: [Node] = [self] - queue.reserveCapacity(childNodeSize()) + + let childNodeCount = childNodeSize() + newIndex.reserveCapacity(childNodeCount * 4) + queue.reserveCapacity(childNodeCount) var index = 0 while index < queue.count { diff --git a/Sources/String.swift b/Sources/String.swift index 5533d40..72418b7 100644 --- a/Sources/String.swift +++ b/Sources/String.swift @@ -12,6 +12,7 @@ extension UInt8 { /// Checks if the byte represents a whitespace character: /// Space (0x20), Tab (0x09), Newline (0x0A), Carriage Return (0x0D), /// Form Feed (0x0C), or Vertical Tab (0x0B). + @inline(__always) var isWhitespace: Bool { switch self { case 0x20, // Space @@ -177,22 +178,27 @@ extension ArraySlice: Comparable where Element == UInt8 { } extension String { + @inline(__always) public var utf8Array: [UInt8] { return Array(self.utf8) } + @inline(__always) var utf8ArraySlice: ArraySlice { return ArraySlice(self.utf8) } + @inline(__always) func equals(_ string: [UInt8]?) -> Bool { return self.utf8Array == string } + @inline(__always) subscript (i: Int) -> Character { return self[self.index(self.startIndex, offsetBy: i)] } + @inline(__always) subscript (i: Int) -> String { return String(self[i] as Character) } diff --git a/Sources/StringUtil.swift b/Sources/StringUtil.swift index 61bad64..daf4aa1 100644 --- a/Sources/StringUtil.swift +++ b/Sources/StringUtil.swift @@ -186,7 +186,7 @@ open class StringUtil { if ((stripLeading && !reachedNonWhite) || lastWasWhite) { continue } - accum.append(" ") + accum.append(UTF8Arrays.whitespace) lastWasWhite = true } else { accum.append(c) diff --git a/Sources/TextNode.swift b/Sources/TextNode.swift index 9e4decd..91ce38e 100644 --- a/Sources/TextNode.swift +++ b/Sources/TextNode.swift @@ -136,7 +136,7 @@ open class TextNode: Node { let par: Element? = parent() as? Element let normaliseWhite = out.prettyPrint() && par != nil && !Element.preserveWhitespace(par!) - Entities.escape(accum, getWholeTextUTF8(), out, false, normaliseWhite, false) + Entities.escape(&accum.buffer, getWholeTextUTF8(), out, false, normaliseWhite, false) } override func outerHtmlTail(_ accum: StringBuilder, _ depth: Int, _ out: OutputSettings) { diff --git a/Sources/Token.swift b/Sources/Token.swift index f1a138a..4078c25 100644 --- a/Sources/Token.swift +++ b/Sources/Token.swift @@ -136,7 +136,8 @@ open class Token { _pendingAttributeValueS = nil } - func finaliseTag() throws { + @inlinable + func finaliseTag() throws { // finalises for emit if (_pendingAttributeName != nil) { // todo: check if attribute name exists; if so, drop and error @@ -144,11 +145,13 @@ open class Token { } } + @inlinable func name() throws -> [UInt8] { // preserves case, for input into Tag.valueOf (which may drop case) try Validate.isFalse(val: _tagName == nil || _tagName!.isEmpty) return _tagName! } + @inline(__always) func normalName() -> [UInt8]? { // loses case, used in tree building for working out where in tree it should go return _normalName } @@ -160,10 +163,12 @@ open class Token { return self } + @inline(__always) func isSelfClosing() -> Bool { return _selfClosing } + @inline(__always) func getAttributes() -> Attributes { return _attributes } diff --git a/Sources/Tokeniser.swift b/Sources/Tokeniser.swift index 48cd087..9a2760c 100644 --- a/Sources/Tokeniser.swift +++ b/Sources/Tokeniser.swift @@ -209,9 +209,14 @@ final class Tokeniser { @discardableResult @inlinable func createTagPending(_ start: Bool) -> Token.Tag { - let token: Token.Tag = start ? Token.StartTag() : Token.EndTag() - tagPending = token - return token + if start { + startPending.reset() + tagPending = startPending + } else { + endPending.reset() + tagPending = endPending + } + return tagPending } @inlinable diff --git a/Sources/UTF8Arrays.swift b/Sources/UTF8Arrays.swift index 5b4e979..3c38ba3 100644 --- a/Sources/UTF8Arrays.swift +++ b/Sources/UTF8Arrays.swift @@ -36,6 +36,7 @@ public enum UTF8Arrays { public static let rt = "rt".utf8Array public static let rtc = "rtc".utf8Array public static let page = "page".utf8Array + public static let class_ = "class".utf8Array public static let table = "table".utf8Array public static let tbody = "tbody".utf8Array public static let th = "th".utf8Array @@ -119,6 +120,7 @@ public enum UTF8ArraySlices { public static let rt = UTF8Arrays.rt[...] public static let rtc = UTF8Arrays.rtc[...] public static let page = UTF8Arrays.page[...] + public static let class_ = UTF8Arrays.class_[...] public static let table = UTF8Arrays.table[...] public static let tbody = UTF8Arrays.tbody[...] public static let th = UTF8Arrays.th[...]