From b8585062d8868bbfaa2778387887e05ce9c52712 Mon Sep 17 00:00:00 2001 From: Alex Ehlke Date: Fri, 28 Feb 2025 22:45:06 -0500 Subject: [PATCH 1/8] optimize Entities.encode --- Sources/Attribute.swift | 2 +- Sources/Entities.swift | 174 ++++++++++++---------------------------- Sources/TextNode.swift | 2 +- 3 files changed, 55 insertions(+), 123 deletions(-) diff --git a/Sources/Attribute.swift b/Sources/Attribute.swift index f95c3d91..01af02d6 100644 --- a/Sources/Attribute.swift +++ b/Sources/Attribute.swift @@ -94,7 +94,7 @@ open class Attribute { accum.append(key) if (!shouldCollapseAttribute(out: out)) { accum.append(UTF8Arrays.attributeEqualsQuoteMark) - Entities.escape(accum, Array(value), out, true, false, false) + Entities.escape(&accum.buffer, Array(value), out, true, false, false) accum.append(UTF8Arrays.quoteMark) } } diff --git a/Sources/Entities.swift b/Sources/Entities.swift index 110e6366..84b2eeba 100644 --- a/Sources/Entities.swift +++ b/Sources/Entities.swift @@ -204,162 +204,94 @@ public class Entities { } public static func escape(_ string: String, _ out: OutputSettings) -> String { - let accum = StringBuilder()//string.characters.count * 2 - escape(accum, string.utf8Array, out, false, false, false) - // try { - // - // } catch (IOException e) { - // throw new SerializationException(e) // doesn't happen - // } - return accum.toString() + var accum = [UInt8]() + accum.reserveCapacity(string.utf8.count * 2) + escape(&accum, string.utf8Array, out, false, false, false) + return String(decoding: accum, as: UTF8.self) + } + + @inline(__always) + internal static func utf8CharLength(for byte: UInt8) -> Int { + if byte < 0x80 { return 1 } + else if byte < 0xE0 { return 2 } + else if byte < 0xF0 { return 3 } + else { return 4 } } // this method is ugly, and does a lot. 
but other breakups cause rescanning and stringbuilder generations static func escape( - _ accum: StringBuilder, + _ accum: inout [UInt8], _ string: [UInt8], _ out: OutputSettings, _ inAttribute: Bool, _ normaliseWhite: Bool, _ stripLeadingWhite: Bool ) { - var lastWasWhite = false - var reachedNonWhite = false - let escapeMode: EscapeMode = out.escapeMode() - let encoder: String.Encoding = out.encoder() - - var i = 0 - while i < string.count { - let byte = string[i] - - if normaliseWhite && byte.isWhitespace { + let escapeMode = out.escapeMode() + let encoder = out.encoder() + var i = 0, n = string.count + var lastWasWhite = false, reachedNonWhite = false + while i < n { + let b = string[i] + if normaliseWhite && b.isWhitespace { var j = i - // Skip all consecutive whitespace - while j < string.count && string[j].isWhitespace { - j += 1 - } - // If leading or consecutive whitespace should be skipped + while j < n && string[j].isWhitespace { j += 1 } if (!reachedNonWhite && stripLeadingWhite) || lastWasWhite { - i = j - continue + i = j; continue } - accum.append(spaceString) // Append one space (normalize) - lastWasWhite = true - i = j - continue + accum.append(0x20) + lastWasWhite = true; i = j; continue } lastWasWhite = false reachedNonWhite = true - - if byte < 0x80 { - // Single-byte ASCII character - switch byte { - case 0x26: // '&' - accum.append(ampEntityUTF8) - case 0xA0: // Non-breaking space - if escapeMode != .xhtml { - accum.append(nbspEntityUTF8) - } else { - accum.append(xa0EntityUTF8) - } - case 0x3C: // '<' - if !inAttribute || escapeMode == .xhtml { - accum.append(ltEntityUTF8) - } else { - accum.append(byte) - } - case 0x3E: // '>' - if !inAttribute { - accum.append(gtEntityUTF8) - } else { - accum.append(byte) - } - case 0x22: // '"' - if inAttribute { - accum.append(quotEntityUTF8) - } else { - accum.append(byte) - } + if b < 0x80 { + switch b { + case 0x26: accum.append(contentsOf: ampEntityUTF8) + case 0xA0: accum.append(contentsOf: 
escapeMode == .xhtml ? xa0EntityUTF8 : nbspEntityUTF8) + case 0x3C: + if !inAttribute || escapeMode == .xhtml { accum.append(contentsOf: ltEntityUTF8) } else { accum.append(b) } + case 0x3E: + if !inAttribute { accum.append(contentsOf: gtEntityUTF8) } else { accum.append(b) } + case 0x22: + if inAttribute { accum.append(contentsOf: quotEntityUTF8) } else { accum.append(b) } default: - if encoder == .ascii || encoder == .utf8 || encoder == .utf16 || canEncode(byte: byte, encoder: encoder) { - accum.append(byte) + if encoder == .ascii || encoder == .utf8 || encoder == .utf16 || canEncode(byte: b, encoder: encoder) { + accum.append(b) } else { - appendEncoded(accum: accum, escapeMode: escapeMode, bytes: [byte]) + appendEncoded(accum: &accum, escapeMode: escapeMode, bytes: [b]) } } i += 1 } else { - // Multi-byte UTF-8 character - var charBytes: [UInt8] = [] - var remainingBytes = 0 - - if byte & 0xE0 == 0xC0 { - // Two-byte character - remainingBytes = 1 - } else if byte & 0xF0 == 0xE0 { - // Three-byte character - remainingBytes = 2 - } else if byte & 0xF8 == 0xF0 { - // Four-byte character - remainingBytes = 3 + let len = utf8CharLength(for: b) + let end = i + len <= n ? i + len : n + let charBytes = Array(string[i.. 
0, i + 1 < string.count { - i += 1 - let nextByte = string[i] - if nextByte & 0xC0 == 0x80 { - charBytes.append(nextByte) - remainingBytes -= 1 - } else { - // Invalid UTF-8 sequence - appendEncoded(accum: accum, escapeMode: escapeMode, bytes: [byte]) - break - } - } - - if remainingBytes == 0 { - // Successfully collected a valid multi-byte character - if canEncode(bytes: charBytes, encoder: encoder) { - accum.append(charBytes) - } else { - appendEncoded(accum: accum, escapeMode: escapeMode, bytes: charBytes) - } + appendEncoded(accum: &accum, escapeMode: escapeMode, bytes: charBytes) } - i += 1 + i += len } } } - + @inlinable - internal static func appendEncoded(accum: StringBuilder, escapeMode: EscapeMode, bytes: [UInt8]) { + internal static func appendEncoded(accum: inout [UInt8], escapeMode: EscapeMode, bytes: [UInt8]) { if let name = escapeMode.nameForCodepoint(bytes) { - // Append named entity (e.g., "&") accum.append(0x26) // '&' - accum.append(name) + accum.append(contentsOf: name) accum.append(0x3B) // ';' } else { - // Convert bytes into a UnicodeScalar guard let scalar = String(bytes: bytes, encoding: .utf8)?.unicodeScalars.first else { - // Fallback for invalid encoding - accum.append([0x26, 0x23, 0x78]) // '&#x' - for byte in bytes { - accum.append(String.toHexString(n: Int(byte))) - } - accum.append(0x3B) // ';' + accum.append(contentsOf: [0x26, 0x23, 0x78]) // '&#x' + for b in bytes { accum.append(contentsOf: String.toHexString(n: Int(b)).utf8Array) } + accum.append(0x3B) return } - - // Append numeric entity for the scalar - accum.append([0x26, 0x23, 0x78]) // '&#x' - accum.append(String.toHexString(n: Int(scalar.value))) - accum.append(0x3B) // ';' + accum.append(contentsOf: [0x26, 0x23, 0x78]) + accum.append(contentsOf: String.toHexString(n: Int(scalar.value)).utf8Array) + accum.append(0x3B) } } diff --git a/Sources/TextNode.swift b/Sources/TextNode.swift index 9e4decdc..91ce38eb 100644 --- a/Sources/TextNode.swift +++ b/Sources/TextNode.swift @@ 
-136,7 +136,7 @@ open class TextNode: Node { let par: Element? = parent() as? Element let normaliseWhite = out.prettyPrint() && par != nil && !Element.preserveWhitespace(par!) - Entities.escape(accum, getWholeTextUTF8(), out, false, normaliseWhite, false) + Entities.escape(&accum.buffer, getWholeTextUTF8(), out, false, normaliseWhite, false) } override func outerHtmlTail(_ accum: StringBuilder, _ depth: Int, _ out: OutputSettings) { From 6e2cdfc330a5740366876e838fbcbb671d07e491 Mon Sep 17 00:00:00 2001 From: Alex Ehlke Date: Fri, 28 Feb 2025 23:05:56 -0500 Subject: [PATCH 2/8] optimizations --- Sources/Attribute.swift | 9 ++++++--- Sources/Attributes.swift | 6 ++++-- Sources/Document.swift | 10 ++++++++-- Sources/Elements.swift | 2 +- Sources/Entities.swift | 1 + Sources/Node.swift | 10 +++++++--- Sources/String.swift | 6 ++++++ Sources/StringUtil.swift | 2 +- Sources/UTF8Arrays.swift | 2 ++ 9 files changed, 36 insertions(+), 12 deletions(-) diff --git a/Sources/Attribute.swift b/Sources/Attribute.swift index 01af02d6..df48a0fe 100644 --- a/Sources/Attribute.swift +++ b/Sources/Attribute.swift @@ -19,7 +19,9 @@ open class Attribute { "selected", "sortable", "truespeed", "typemustmatch" ]) + @usableFromInline var key: [UInt8] + @usableFromInline var value: [UInt8] public init(key: [UInt8], value: [UInt8]) throws { @@ -86,15 +88,16 @@ open class Attribute { */ public func html() -> String { let accum = StringBuilder() - html(accum: accum, out: (Document("")).outputSettings()) + html(accum: accum, out: (Document([])).outputSettings()) return accum.toString() } - + + @inlinable public func html(accum: StringBuilder, out: OutputSettings ) { accum.append(key) if (!shouldCollapseAttribute(out: out)) { accum.append(UTF8Arrays.attributeEqualsQuoteMark) - Entities.escape(&accum.buffer, Array(value), out, true, false, false) + Entities.escape(&accum.buffer, value, out, true, false, false) accum.append(UTF8Arrays.quoteMark) } } diff --git a/Sources/Attributes.swift 
b/Sources/Attributes.swift index 80d8f6ae..e54cc378 100644 --- a/Sources/Attributes.swift +++ b/Sources/Attributes.swift @@ -27,6 +27,7 @@ open class Attributes: NSCopying { // Stored by lowercased key, but key case is checked against the copy inside // the Attribute on retrieval. + @usableFromInline lazy var attributes: [Attribute] = [] internal var lowercasedKeysCache: [[UInt8]]? = nil @@ -229,13 +230,14 @@ open class Attributes: NSCopying { */ open func html()throws -> String { let accum = StringBuilder() - try html(accum: accum, out: Document("").outputSettings()) // output settings a bit funky, but this html() seldom used + try html(accum: accum, out: Document([]).outputSettings()) // output settings a bit funky, but this html() seldom used return accum.toString() } + @inlinable public func html(accum: StringBuilder, out: OutputSettings ) throws { for attr in attributes { - accum.append(" ") + accum.append(UTF8Arrays.whitespace) attr.html(accum: accum, out: out) } } diff --git a/Sources/Document.swift b/Sources/Document.swift index fb798cab..7021ba48 100644 --- a/Sources/Document.swift +++ b/Sources/Document.swift @@ -421,8 +421,10 @@ public class OutputSettings: NSCopying { */ public enum Syntax {case html, xml} - private var _escapeMode: Entities.EscapeMode = Entities.EscapeMode.base - private var _encoder: String.Encoding = String.Encoding.utf8 // Charset.forName("UTF-8") + @usableFromInline + internal var _escapeMode: Entities.EscapeMode = Entities.EscapeMode.base + @usableFromInline + internal var _encoder: String.Encoding = String.Encoding.utf8 // Charset.forName("UTF-8") private var _prettyPrint: Bool = true private var _outline: Bool = false private var _indentAmount: UInt = 1 @@ -438,6 +440,7 @@ public class OutputSettings: NSCopying { * The default escape mode is base. 
* @return the document's current escape mode */ + @inlinable public func escapeMode() -> Entities.EscapeMode { return _escapeMode } @@ -449,6 +452,7 @@ public class OutputSettings: NSCopying { * @return the document's output settings, for chaining */ @discardableResult + @inlinable public func escapeMode(_ escapeMode: Entities.EscapeMode) -> OutputSettings { self._escapeMode = escapeMode return self @@ -462,9 +466,11 @@ public class OutputSettings: NSCopying { * input charset. Otherwise, it defaults to UTF-8. * @return the document's current charset. */ + @inlinable public func encoder() -> String.Encoding { return _encoder } + @inlinable public func charset() -> String.Encoding { return _encoder } diff --git a/Sources/Elements.swift b/Sources/Elements.swift index 0f08521b..00593fc4 100644 --- a/Sources/Elements.swift +++ b/Sources/Elements.swift @@ -190,7 +190,7 @@ open class Elements: NSCopying { let sb: StringBuilder = StringBuilder() for element: Element in this { if !sb.isEmpty { - sb.append(" ") + sb.append(UTF8Arrays.whitespace) } sb.append(try element.text(trimAndNormaliseWhitespace: trimAndNormaliseWhitespace)) } diff --git a/Sources/Entities.swift b/Sources/Entities.swift index 84b2eeba..028cc9cc 100644 --- a/Sources/Entities.swift +++ b/Sources/Entities.swift @@ -219,6 +219,7 @@ public class Entities { } // this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations + @usableFromInline static func escape( _ accum: inout [UInt8], _ string: [UInt8], diff --git a/Sources/Node.swift b/Sources/Node.swift index bc5a7854..00e40150 100644 --- a/Sources/Node.swift +++ b/Sources/Node.swift @@ -38,7 +38,6 @@ open class Node: Equatable, Hashable { @usableFromInline internal var normalizedTagNameIndex: [[UInt8]: [Weak]]? 
= nil -// internal lazy var normalizedTagNameIndex: [[UInt8]: [Weak]] = [:] @usableFromInline internal var isQueryIndexDirty: Bool = false @@ -313,10 +312,12 @@ open class Node: Equatable, Hashable { * Get the number of child nodes that this node holds. * @return the number of child nodes that this node holds. */ + @inline(__always) public func childNodeSize() -> Int { return childNodes.count } + @inline(__always) public func hasChildNodes() -> Bool { return !childNodes.isEmpty } @@ -706,7 +707,7 @@ open class Node: Equatable, Hashable { // if this node has no document (or parent), retrieve the default output settings func getOutputSettings() -> OutputSettings { - return ownerDocument() != nil ? ownerDocument()!.outputSettings() : (Document("")).outputSettings() + return ownerDocument() != nil ? ownerDocument()!.outputSettings() : (Document([])).outputSettings() } /** @@ -909,7 +910,10 @@ internal extension Node { func rebuildQueryIndexesForAllTags() { var newIndex: [[UInt8]: [Weak]] = [:] var queue: [Node] = [self] - queue.reserveCapacity(childNodeSize()) + + let childNodeCount = childNodeSize() + newIndex.reserveCapacity(childNodeCount * 4) + queue.reserveCapacity(childNodeCount) var index = 0 while index < queue.count { diff --git a/Sources/String.swift b/Sources/String.swift index 5533d40d..72418b76 100644 --- a/Sources/String.swift +++ b/Sources/String.swift @@ -12,6 +12,7 @@ extension UInt8 { /// Checks if the byte represents a whitespace character: /// Space (0x20), Tab (0x09), Newline (0x0A), Carriage Return (0x0D), /// Form Feed (0x0C), or Vertical Tab (0x0B). + @inline(__always) var isWhitespace: Bool { switch self { case 0x20, // Space @@ -177,22 +178,27 @@ extension ArraySlice: Comparable where Element == UInt8 { } extension String { + @inline(__always) public var utf8Array: [UInt8] { return Array(self.utf8) } + @inline(__always) var utf8ArraySlice: ArraySlice { return ArraySlice(self.utf8) } + @inline(__always) func equals(_ string: [UInt8]?) 
-> Bool { return self.utf8Array == string } + @inline(__always) subscript (i: Int) -> Character { return self[self.index(self.startIndex, offsetBy: i)] } + @inline(__always) subscript (i: Int) -> String { return String(self[i] as Character) } diff --git a/Sources/StringUtil.swift b/Sources/StringUtil.swift index 61bad647..daf4aa10 100644 --- a/Sources/StringUtil.swift +++ b/Sources/StringUtil.swift @@ -186,7 +186,7 @@ open class StringUtil { if ((stripLeading && !reachedNonWhite) || lastWasWhite) { continue } - accum.append(" ") + accum.append(UTF8Arrays.whitespace) lastWasWhite = true } else { accum.append(c) diff --git a/Sources/UTF8Arrays.swift b/Sources/UTF8Arrays.swift index 5b4e9796..3c38ba3a 100644 --- a/Sources/UTF8Arrays.swift +++ b/Sources/UTF8Arrays.swift @@ -36,6 +36,7 @@ public enum UTF8Arrays { public static let rt = "rt".utf8Array public static let rtc = "rtc".utf8Array public static let page = "page".utf8Array + public static let class_ = "class".utf8Array public static let table = "table".utf8Array public static let tbody = "tbody".utf8Array public static let th = "th".utf8Array @@ -119,6 +120,7 @@ public enum UTF8ArraySlices { public static let rt = UTF8Arrays.rt[...] public static let rtc = UTF8Arrays.rtc[...] public static let page = UTF8Arrays.page[...] + public static let class_ = UTF8Arrays.class_[...] public static let table = UTF8Arrays.table[...] public static let tbody = UTF8Arrays.tbody[...] public static let th = UTF8Arrays.th[...] 
From d71a819198def89ef68521e87b91fd0091689aa5 Mon Sep 17 00:00:00 2001 From: Alex Ehlke Date: Fri, 28 Feb 2025 23:19:58 -0500 Subject: [PATCH 3/8] optimizations --- Sources/Attributes.swift | 4 +++- Sources/Token.swift | 7 ++++++- Sources/Tokeniser.swift | 11 ++++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/Sources/Attributes.swift b/Sources/Attributes.swift index e54cc378..2f05f7d2 100644 --- a/Sources/Attributes.swift +++ b/Sources/Attributes.swift @@ -31,7 +31,9 @@ open class Attributes: NSCopying { lazy var attributes: [Attribute] = [] internal var lowercasedKeysCache: [[UInt8]]? = nil - public init() {} + public init() { + attributes.reserveCapacity(16) + } @usableFromInline internal func updateLowercasedKeysCache() { diff --git a/Sources/Token.swift b/Sources/Token.swift index f1a138a8..4078c253 100644 --- a/Sources/Token.swift +++ b/Sources/Token.swift @@ -136,7 +136,8 @@ open class Token { _pendingAttributeValueS = nil } - func finaliseTag() throws { + @inlinable + func finaliseTag() throws { // finalises for emit if (_pendingAttributeName != nil) { // todo: check if attribute name exists; if so, drop and error @@ -144,11 +145,13 @@ open class Token { } } + @inlinable func name() throws -> [UInt8] { // preserves case, for input into Tag.valueOf (which may drop case) try Validate.isFalse(val: _tagName == nil || _tagName!.isEmpty) return _tagName! } + @inline(__always) func normalName() -> [UInt8]? 
{ // loses case, used in tree building for working out where in tree it should go return _normalName } @@ -160,10 +163,12 @@ open class Token { return self } + @inline(__always) func isSelfClosing() -> Bool { return _selfClosing } + @inline(__always) func getAttributes() -> Attributes { return _attributes } diff --git a/Sources/Tokeniser.swift b/Sources/Tokeniser.swift index 48cd087a..9a2760cb 100644 --- a/Sources/Tokeniser.swift +++ b/Sources/Tokeniser.swift @@ -209,9 +209,14 @@ final class Tokeniser { @discardableResult @inlinable func createTagPending(_ start: Bool) -> Token.Tag { - let token: Token.Tag = start ? Token.StartTag() : Token.EndTag() - tagPending = token - return token + if start { + startPending.reset() + tagPending = startPending + } else { + endPending.reset() + tagPending = endPending + } + return tagPending } @inlinable From c22b846b9fde2a4a31a6e32a610ec2ee2088afe9 Mon Sep 17 00:00:00 2001 From: Alex Ehlke Date: Fri, 28 Feb 2025 23:21:30 -0500 Subject: [PATCH 4/8] optimizations --- Sources/Attributes.swift | 5 ++++- Sources/BooleanAttribute.swift | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Sources/Attributes.swift b/Sources/Attributes.swift index 2f05f7d2..4df3f049 100644 --- a/Sources/Attributes.swift +++ b/Sources/Attributes.swift @@ -22,7 +22,6 @@ import Foundation * */ open class Attributes: NSCopying { - public static var dataPrefix: [UInt8] = "data-".utf8Array // Stored by lowercased key, but key case is checked against the copy inside @@ -84,11 +83,13 @@ open class Attributes: NSCopying { @param key attribute key @param value attribute value */ + @inlinable open func put(_ key: [UInt8], _ value: [UInt8]) throws { let attr = try Attribute(key: key, value: value) put(attribute: attr) } + @inlinable open func put(_ key: String, _ value: String) throws { return try put(key.utf8Array, value.utf8Array) } @@ -98,6 +99,7 @@ open class Attributes: NSCopying { @param key attribute key @param value attribute value */ + 
@inlinable open func put(_ key: [UInt8], _ value: Bool) throws { if (value) { try put(attribute: BooleanAttribute(key: key)) @@ -110,6 +112,7 @@ open class Attributes: NSCopying { Set a new attribute, or replace an existing one by (case-sensitive) key. @param attribute attribute */ + @inlinable open func put(attribute: Attribute) { let key = attribute.getKeyUTF8() if let ix = attributes.firstIndex(where: { $0.getKeyUTF8() == key }) { diff --git a/Sources/BooleanAttribute.swift b/Sources/BooleanAttribute.swift index 1a5b8347..cf73564f 100644 --- a/Sources/BooleanAttribute.swift +++ b/Sources/BooleanAttribute.swift @@ -16,6 +16,7 @@ open class BooleanAttribute: Attribute { * Create a new boolean attribute from unencoded (raw) key. * @param key attribute key */ + @usableFromInline init(key: [UInt8]) throws { try super.init(key: key, value: []) } From 7f79612dd918ec90a9c91d8ced63d88135ced367 Mon Sep 17 00:00:00 2001 From: Petr Pavlik Date: Mon, 10 Mar 2025 23:27:01 +0100 Subject: [PATCH 5/8] Update Element.swift --- Sources/Element.swift | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/Sources/Element.swift b/Sources/Element.swift index 63e3cd15..53cce0f2 100644 --- a/Sources/Element.swift +++ b/Sources/Element.swift @@ -142,7 +142,14 @@ open class Element: Node { try super.attr(attributeKey, attributeValue) return self } - + + /** + * Set an attribute value on this element. If this element already has an attribute with the + * key, its value is updated; otherwise, a new attribute is added. + * + * @return this element + */ + @discardableResult open override func attr(_ attributeKey: String, _ attributeValue: String) throws -> Element { try super.attr(attributeKey.utf8Array, attributeValue.utf8Array) return self @@ -163,7 +170,18 @@ open class Element: Node { try attributes?.put(attributeKey, attributeValue) return self } - + + /** + * Set a boolean attribute value on this element. 
Setting to true sets the attribute value to "" and + * marks the attribute as boolean so no value is written out. Setting to false removes the attribute + * with the same key if it exists. + * + * @param attributeKey the attribute key + * @param attributeValue the attribute value + * + * @return this element + */ + @discardableResult open func attr(_ attributeKey: String, _ attributeValue: Bool) throws -> Element { try attributes?.put(attributeKey.utf8Array, attributeValue) return self From f9c21e11bbf3a7693190d80afa58c09dd2721bcf Mon Sep 17 00:00:00 2001 From: Alex Ehlke Date: Wed, 12 Mar 2025 00:45:29 -0400 Subject: [PATCH 6/8] wip --- Sources/Element.swift | 2 + Sources/Node.swift | 336 ++++++++++++++++++++-------------------- Sources/Tokeniser.swift | 76 ++++----- 3 files changed, 212 insertions(+), 202 deletions(-) diff --git a/Sources/Element.swift b/Sources/Element.swift index 63e3cd15..681693c7 100644 --- a/Sources/Element.swift +++ b/Sources/Element.swift @@ -32,6 +32,7 @@ open class Element: Node { public init(_ tag: Tag, _ baseUri: [UInt8], _ attributes: Attributes) { self._tag = tag super.init(baseUri, attributes) + childNodes.reserveCapacity(8) } /** * Create a new Element from a tag and a base URI. @@ -48,6 +49,7 @@ open class Element: Node { public init(_ tag: Tag, _ baseUri: [UInt8]) { self._tag = tag super.init(baseUri, Attributes()) + childNodes.reserveCapacity(8) } public override func nodeNameUTF8() -> [UInt8] { diff --git a/Sources/Node.swift b/Sources/Node.swift index 00e40150..dd7a9bf0 100644 --- a/Sources/Node.swift +++ b/Sources/Node.swift @@ -38,18 +38,18 @@ open class Node: Equatable, Hashable { @usableFromInline internal var normalizedTagNameIndex: [[UInt8]: [Weak]]? = nil - + @usableFromInline internal var isQueryIndexDirty: Bool = false - /** - * Get the list index of this node in its node sibling list. I.e. if this is the first node - * sibling, returns 0. 
- * @return position in node sibling list - * @see Element#elementSiblingIndex() - */ + /** + * Get the list index of this node in its node sibling list. I.e. if this is the first node + * sibling, returns 0. + * @return position in node sibling list + * @see Element#elementSiblingIndex() + */ public private(set) var siblingIndex: Int = 0 - + private static let abs = "abs:".utf8Array private static let absCount = abs.count fileprivate static let empty = "".utf8Array @@ -67,7 +67,7 @@ open class Node: Equatable, Hashable { rebuildQueryIndexesForThisNodeOnly() } - + public init(_ baseUri: [UInt8]) { childNodes = Node.EMPTY_NODES self.baseUri = baseUri.trim() @@ -75,7 +75,7 @@ open class Node: Equatable, Hashable { rebuildQueryIndexesForThisNodeOnly() } - + /** * Default constructor. Doesn't setup base uri, children, or attributes; use with caution. */ @@ -98,7 +98,7 @@ open class Node: Equatable, Hashable { public func nodeNameUTF8() -> [UInt8] { preconditionFailure("This method must be overridden") } - + /** * Get an attribute's value by its key. Case insensitive *

@@ -126,7 +126,7 @@ open class Node: Equatable, Hashable { open func attr(_ attributeKey: String) throws -> String { return try String(decoding: attr(attributeKey.utf8Array), as: UTF8.self) } - + /** * Get all of the element's attributes. * @return attributes (which implements iterable, in same order as presented in original HTML). @@ -134,7 +134,7 @@ open class Node: Equatable, Hashable { open func getAttributes() -> Attributes? { return attributes } - + /** * Set an attribute (key=value). If the attribute already exists, it is replaced. * @param attributeKey The attribute key. @@ -152,7 +152,7 @@ open class Node: Equatable, Hashable { try attributes?.put(attributeKey, attributeValue) return self } - + /** * Test if this element has an attribute. Case insensitive * @param attributeKey The attribute key to check. @@ -185,7 +185,7 @@ open class Node: Equatable, Hashable { } return attributes.hasKeyIgnoreCase(key: attributeKey) } - + /** * Remove an attribute from this element. * @param attributeKey The attribute to remove. @@ -201,7 +201,7 @@ open class Node: Equatable, Hashable { open func removeAttr(_ attributeKey: String) throws -> Node { return try removeAttr(attributeKey.utf8Array) } - + /** Get the base URI of this node. @return base URI @@ -213,7 +213,7 @@ open class Node: Equatable, Hashable { open func getBaseUriUTF8() -> [UInt8] { return baseUri ?? [] } - + /** Update the base URI of this node and all of its descendants. @param baseUri base URI to set @@ -228,17 +228,17 @@ open class Node: Equatable, Hashable { init(_ baseUri: [UInt8]) { self.baseUri = baseUri } - + func head(_ node: Node, _ depth: Int) throws { node.baseUri = baseUri } - + func tail(_ node: Node, _ depth: Int) throws { } } try traverse(nodeVisitor(baseUri)) } - + /** * Get an absolute URL from a URL attribute that may be relative (i.e. an <a href> or * <img src>). 
@@ -268,7 +268,7 @@ open class Node: Equatable, Hashable { open func absUrl(_ attributeKey: T) throws -> [UInt8] where T.Element == UInt8 { try Validate.notEmpty(string: attributeKey) - + let keyStr = String(decoding: attributeKey, as: UTF8.self) if (!hasAttr(keyStr)) { return Node.empty // nothing to make absolute with @@ -276,38 +276,40 @@ open class Node: Equatable, Hashable { return StringUtil.resolve(String(decoding: baseUri!, as: UTF8.self), relUrl: try attr(keyStr)).utf8Array } } - + /** Get a child node by its 0-based index. @param index index of child node @return the child node at this index. Throws a {@code IndexOutOfBoundsException} if the index is out of bounds. */ + @inline(__always) open func childNode(_ index: Int) -> Node { return childNodes[index] } - + /** Get this node's children. Presented as an unmodifiable list: new children can not be added, but the child nodes themselves can be manipulated. @return list of children. If no children, returns an empty list. */ + @inline(__always) open func getChildNodes() -> Array { return childNodes } - + /** * Returns a deep copy of this node's children. Changes made to these nodes will not be reflected in the original * nodes * @return a deep copy of this node's children */ open func childNodesCopy() -> Array { - var children: Array = Array() - for node: Node in childNodes { - children.append(node.copy() as! Node) - } - return children + var children: Array = Array() + for node: Node in childNodes { + children.append(node.copy() as! Node) + } + return children } - + /** * Get the number of child nodes that this node holds. * @return the number of child nodes that this node holds. @@ -321,27 +323,30 @@ open class Node: Equatable, Hashable { public func hasChildNodes() -> Bool { return !childNodes.isEmpty } - + + @inline(__always) final func childNodesAsArray() -> [Node] { return childNodes as Array } - + /** Gets this node's parent node. @return parent node or null if no parent. 
*/ + @inline(__always) open func parent() -> Node? { return parentNode } - + /** Gets this node's parent node. Node overridable by extending classes, so useful if you really just need the Node type. @return parent node or null if no parent. */ + @inline(__always) final func getParentNode() -> Node? { return parentNode } - + /** * Gets the Document associated with this Node. * @return the Document associated with this Node, or null if there is no such Document. @@ -355,14 +360,14 @@ open class Node: Equatable, Hashable { return parentNode!.ownerDocument() } } - + /** * Remove (delete) this node from the DOM tree. If this node has children, they are also removed. */ open func remove() throws { try parentNode?.removeChild(self) } - + /** * Insert the specified HTML into the DOM before this node (i.e. as a preceding sibling). * @param html HTML to add before this node @@ -374,7 +379,7 @@ open class Node: Equatable, Hashable { try addSiblingHtml(siblingIndex, html) return self } - + /** * Insert the specified node into the DOM before this node (i.e. as a preceding sibling). * @param node to add before this node @@ -385,11 +390,11 @@ open class Node: Equatable, Hashable { open func before(_ node: Node) throws -> Node { try Validate.notNull(obj: node) try Validate.notNull(obj: parentNode) - + try parentNode?.addChildren(siblingIndex, node) return self } - + /** * Insert the specified HTML into the DOM after this node (i.e. as a following sibling). * @param html HTML to add after this node @@ -401,7 +406,7 @@ open class Node: Equatable, Hashable { try addSiblingHtml(siblingIndex + 1, html) return self } - + /** * Insert the specified node into the DOM after this node (i.e. as a following sibling). 
* @param node to add after this node @@ -412,20 +417,20 @@ open class Node: Equatable, Hashable { open func after(_ node: Node) throws -> Node { try Validate.notNull(obj: node) try Validate.notNull(obj: parentNode) - + try parentNode?.addChildren(siblingIndex+1, node) return self } - + private func addSiblingHtml(_ index: Int, _ html: String) throws { try Validate.notNull(obj: parentNode) - + let context: Element? = parent() as? Element - + let nodes: Array = try Parser.parseFragment(html, context, getBaseUriUTF8()) try parentNode?.addChildren(index, nodes) } - + /** * Insert the specified HTML into the DOM after this node (i.e. as a following sibling). * @param html HTML to add after this node @@ -437,7 +442,7 @@ open class Node: Equatable, Hashable { try addSiblingHtml(siblingIndex + 1, html) return self } - + /** * Insert the specified node into the DOM after this node (i.e. as a following sibling). * @param node to add after this node @@ -448,20 +453,20 @@ open class Node: Equatable, Hashable { open func after(node: Node) throws -> Node { try Validate.notNull(obj: node) try Validate.notNull(obj: parentNode) - + try parentNode?.addChildren(siblingIndex + 1, node) return self } - + open func addSiblingHtml(index: Int, _ html: String)throws { try Validate.notNull(obj: html) try Validate.notNull(obj: parentNode) - + let context: Element? = parent() as? Element let nodes: Array = try Parser.parseFragment(html, context, getBaseUriUTF8()) try parentNode?.addChildren(index, nodes) } - + /** Wrap the supplied HTML around this node. @param html HTML to wrap around this element, e.g. {@code

}. Can be arbitrarily deep. @@ -470,20 +475,20 @@ open class Node: Equatable, Hashable { @discardableResult open func wrap(_ html: String) throws -> Node? { try Validate.notEmpty(string: html.utf8Array) - + let context: Element? = parent() as? Element var wrapChildren: Array = try Parser.parseFragment(html, context, getBaseUriUTF8()) let wrapNode: Node? = !wrapChildren.isEmpty ? wrapChildren[0] : nil if (wrapNode == nil || !(((wrapNode as? Element) != nil))) { // nothing to wrap with; noop return nil } - + let wrap: Element = wrapNode as! Element let deepest: Element = getDeepChild(el: wrap) try parentNode?.replaceChild(self, wrap) - wrapChildren = wrapChildren.filter { $0 != wrap} + wrapChildren = wrapChildren.filter { $0 != wrap} try deepest.addChildren(self) - + // remainder (unbalanced wrap, like

-- The

is remainder if !wrapChildren.isEmpty { for i in 0..Node? { try Validate.notNull(obj: parentNode) - + let firstChild: Node? = !childNodes.isEmpty ? childNodes[0] : nil try parentNode?.addChildren(siblingIndex, self.childNodesAsArray()) try self.remove() - + return firstChild } - + private func getDeepChild(el: Element) -> Element { let children = el.children() if (children.size() > 0) { @@ -529,7 +534,7 @@ open class Node: Equatable, Hashable { return el } } - + /** * Replace this node in the DOM with the supplied node. * @param in the node that will will replace the existing node. @@ -540,15 +545,15 @@ open class Node: Equatable, Hashable { try Validate.notNull(obj: parentNode) try parentNode?.replaceChild(self, input) } - + @inlinable public func setParentNode(_ parentNode: Node) throws { if (self.parentNode != nil) { - try self.parentNode?.removeChild(self) + try self.parentNode?.removeChild(self) } self.parentNode = parentNode } - + @inlinable public func replaceChild(_ out: Node, _ input: Node) throws { try Validate.isTrue(val: out.parentNode === self) @@ -556,14 +561,14 @@ open class Node: Equatable, Hashable { if (input.parentNode != nil) { try input.parentNode?.removeChild(input) } - + let index: Int = out.siblingIndex childNodes[index] = input input.parentNode = self input.setSiblingIndex(index) out.parentNode = nil } - + @inlinable public func removeChild(_ out: Node) throws { try Validate.isTrue(val: out.parentNode === self) @@ -572,13 +577,13 @@ open class Node: Equatable, Hashable { reindexChildren(index) out.parentNode = nil } - + @inlinable public func addChildren(_ children: Node...) throws { //most used. short circuit addChildren(int), which hits reindex children and array copy try addChildren(children) } - + @inlinable public func addChildren(_ children: [Node]) throws { //most used. 
short circuit addChildren(int), which hits reindex children and array copy @@ -588,12 +593,12 @@ open class Node: Equatable, Hashable { child.setSiblingIndex(childNodes.count - 1) } } - + @inlinable public func addChildren(_ index: Int, _ children: Node...) throws { try addChildren(index, children) } - + @inlinable public func addChildren(_ index: Int, _ children: [Node]) throws { for i in (0..() } - + let nodes: Array = parentNode!.childNodes var siblings: Array = Array() for node in nodes { @@ -658,7 +663,7 @@ open class Node: Equatable, Hashable { } return parent.childNodeSize() > siblingIndex + 1 } - + /** Get this node's previous sibling. @return the previous sibling, or null if this is the first sibling @@ -678,7 +683,7 @@ open class Node: Equatable, Hashable { public func setSiblingIndex(_ siblingIndex: Int) { self.siblingIndex = siblingIndex } - + /** * Perform a depth-first traversal through this node and its descendants. * @param nodeVisitor the visitor callbacks to perform on each node @@ -690,7 +695,7 @@ open class Node: Equatable, Hashable { try traversor.traverse(self) return self } - + /** Get the outer HTML of this node. @return HTML @@ -700,16 +705,16 @@ open class Node: Equatable, Hashable { try outerHtml(accum) return accum.toString() } - + public func outerHtml(_ accum: StringBuilder)throws { try NodeTraversor(OuterHtmlVisitor(accum, getOutputSettings())).traverse(self) } - + // if this node has no document (or parent), retrieve the default output settings func getOutputSettings() -> OutputSettings { return ownerDocument() != nil ? ownerDocument()!.outputSettings() : (Document([])).outputSettings() } - + /** Get the outer HTML of this node. 
@param accum accumulator to place HTML into @@ -718,11 +723,11 @@ open class Node: Equatable, Hashable { func outerHtmlHead(_ accum: StringBuilder, _ depth: Int, _ out: OutputSettings) throws { preconditionFailure("This method must be overridden") } - + func outerHtmlTail(_ accum: StringBuilder, _ depth: Int, _ out: OutputSettings) throws { preconditionFailure("This method must be overridden") } - + /** * Write this node and its children to the given {@link Appendable}. * @@ -733,40 +738,40 @@ open class Node: Equatable, Hashable { try outerHtml(appendable) return appendable } - + public func indent(_ accum: StringBuilder, _ depth: Int, _ out: OutputSettings) { accum.append(UnicodeScalar.BackslashN).append(StringUtil.padding(depth * Int(out.indentAmount()))) } - + /** * Check if this node is the same instance of another (object identity test). * @param o other object to compare to * @return true if the content of this node is the same as the other * @see Node#hasSameValue(Object) to compare nodes by their value */ - + open func equals(_ o: Node) -> Bool { - // implemented just so that javadoc is clear this is an identity test + // implemented just so that javadoc is clear this is an identity test return self === o } - + /** * Check if this node is has the same content as another node. A node is considered the same if its name, attributes and content match the * other node; particularly its position in the tree does not influence its similarity. * @param o other object to compare to * @return true if the content of this node is the same as the other */ - + open func hasSameValue(_ o: Node)throws->Bool { if (self === o) {return true} -// if (type(of:self) != type(of: o)) -// { -// return false -// } - + // if (type(of:self) != type(of: o)) + // { + // return false + // } + return try self.outerHtml() == o.outerHtml() } - + /** * Create a stand-alone, deep copy of this node, and all of its children. The cloned node will have no siblings or * parent node. 
As a stand-alone object, any changes made to the clone or any of its children will not impact the @@ -776,84 +781,84 @@ open class Node: Equatable, Hashable { * @return stand-alone cloned node */ public func copy(with zone: NSZone? = nil) -> Any { - return copy(clone: Node()) + return copy(clone: Node()) } - - public func copy(parent: Node?) -> Node { - let clone = Node() - return copy(clone: clone, parent: parent) - } - - public func copy(clone: Node) -> Node { - let thisClone: Node = copy(clone: clone, parent: nil) // splits for orphan - - // Queue up nodes that need their children cloned (BFS). - var nodesToProcess: Array = Array() - nodesToProcess.append(thisClone) - - while (!nodesToProcess.isEmpty) { - let currParent: Node = nodesToProcess.removeFirst() - - for i in 0.. Node { + let clone = Node() + return copy(clone: clone, parent: parent) + } + + public func copy(clone: Node) -> Node { + let thisClone: Node = copy(clone: clone, parent: nil) // splits for orphan + + // Queue up nodes that need their children cloned (BFS). + var nodesToProcess: Array = Array() + nodesToProcess.append(thisClone) + + while (!nodesToProcess.isEmpty) { + let currParent: Node = nodesToProcess.removeFirst() + + for i in 0.. Node { - clone.parentNode = parent // can be null, to create an orphan split - clone.siblingIndex = parent == nil ? 0 : siblingIndex - clone.attributes = attributes != nil ? attributes?.clone() : nil - clone.baseUri = baseUri - clone.childNodes = Array() - - for child in childNodes { - clone.childNodes.append(child) - } - + return thisClone + } + + /* + * Return a clone of the node using the given parent (which can be null). + * Not a deep copy of children. + */ + public func copy(clone: Node, parent: Node?) -> Node { + clone.parentNode = parent // can be null, to create an orphan split + clone.siblingIndex = parent == nil ? 0 : siblingIndex + clone.attributes = attributes != nil ? 
attributes?.clone() : nil + clone.baseUri = baseUri + clone.childNodes = Array() + + for child in childNodes { + clone.childNodes.append(child) + } + clone.rebuildQueryIndexesForThisNodeOnly() - return clone - } - + return clone + } + private class OuterHtmlVisitor: NodeVisitor { private var accum: StringBuilder private var out: OutputSettings static private let text = "#text".utf8Array - + init(_ accum: StringBuilder, _ out: OutputSettings) { self.accum = accum self.out = out } - + open func head(_ node: Node, _ depth: Int)throws { try node.outerHtmlHead(accum, depth, out) } - + open func tail(_ node: Node, _ depth: Int)throws { // When compiling a release optimized swift linux 4.2 version the "saves a void hit." // causes a SIL error. Removing optimization on linux until a fix is found. - #if os(Linux) +#if os(Linux) try node.outerHtmlTail(accum, depth, out) - #else +#else if (!(node.nodeNameUTF8() == OuterHtmlVisitor.text)) { // saves a void hit. try node.outerHtmlTail(accum, depth, out) } - #endif +#endif } } - + /// Returns a Boolean value indicating whether two values are equal. /// /// Equality is the inverse of inequality. For any values `a` and `b`, @@ -865,11 +870,11 @@ open class Node: Equatable, Hashable { public static func ==(lhs: Node, rhs: Node) -> Bool { return lhs === rhs } - - /// The hash value. - /// - /// Hash values are not guaranteed to be equal across different executions of - /// your program. Do not save hash values to use during a future execution. + + /// The hash value. + /// + /// Hash values are not guaranteed to be equal across different executions of + /// your program. Do not save hash values to use during a future execution. 
public func hash(into hasher: inout Hasher) { hasher.combine(description) hasher.combine(baseUri) @@ -877,33 +882,36 @@ open class Node: Equatable, Hashable { } extension Node: CustomStringConvertible { - public var description: String { - do { - return try outerHtml() - } catch { - - } - return "" - } + public var description: String { + do { + return try outerHtml() + } catch { + + } + return "" + } } extension Node: CustomDebugStringConvertible { private static let space = " " - public var debugDescription: String { - do { + public var debugDescription: String { + do { return try String(describing: type(of: self)) + Node.space + outerHtml() - } catch { - - } - return String(describing: type(of: self)) - } + } catch { + + } + return String(describing: type(of: self)) + } } internal extension Node { @inlinable func markQueryIndexDirty() { - isQueryIndexDirty = true - parentNode?.markQueryIndexDirty() + var current: Node? = self + while let node = current { + node.isQueryIndexDirty = true + current = node.parentNode + } } @usableFromInline diff --git a/Sources/Tokeniser.swift b/Sources/Tokeniser.swift index 9a2760cb..050d1a43 100644 --- a/Sources/Tokeniser.swift +++ b/Sources/Tokeniser.swift @@ -12,10 +12,10 @@ final class Tokeniser { static let replacementChar: UnicodeScalar = "\u{FFFD}" // replaces null character private static let notCharRefChars = ParsingStrings([UnicodeScalar.BackslashT, "\n", "\r", UnicodeScalar.BackslashF, " ", "<", UnicodeScalar.Ampersand]) private static let notNamedCharRefChars = ParsingStrings([UTF8Arrays.equalSign, UTF8Arrays.hyphen, UTF8Arrays.underscore]) - + private let reader: CharacterReader // html input private let errors: ParseErrorList? // errors found while tokenising - + private var state: TokeniserState = TokeniserState.Data // current tokenisation state private var emitPending: Token? 
// the token we are about to emit on next read private var isEmitPending: Bool = false @@ -32,12 +32,12 @@ final class Tokeniser { let commentPending: Token.Comment = Token.Comment() // comment building up private var lastStartTag: [UInt8]? // the last start tag emitted, to test appropriate end tag private var selfClosingFlagAcknowledged: Bool = true - + init(_ reader: CharacterReader, _ errors: ParseErrorList?) { self.reader = reader self.errors = errors } - + func read() throws -> Token { if (!selfClosingFlagAcknowledged) { error("Self closing flag not acknowledged") @@ -72,10 +72,10 @@ final class Tokeniser { func emit(_ token: Token) throws { try Validate.isFalse(val: isEmitPending, msg: "There is an unread token pending!") - + emitPending = token isEmitPending = true - + if (token.type == Token.TokenType.StartTag) { let startTag: Token.StartTag = token as! Token.StartTag lastStartTag = startTag._tagName! @@ -89,7 +89,7 @@ final class Tokeniser { } } } - + func emit(_ str: ArraySlice) { pendingSlices.append(str) } @@ -101,15 +101,15 @@ final class Tokeniser { func emit(_ str: String) { emit(str.utf8Array) } - -// func emit(_ chars: [UInt8]) { -// emit(String(chars.map {Character($0)})) -// } - + + // func emit(_ chars: [UInt8]) { + // emit(String(chars.map {Character($0)})) + // } + // func emit(_ codepoints: [Int]) { // emit(String(codepoints, 0, codepoints.length)); // } - + func emit(_ c: UnicodeScalar) { emit(Array(c.utf8)) } @@ -117,24 +117,24 @@ final class Tokeniser { func emit(_ c: [UnicodeScalar]) { emit(c.flatMap { Array($0.utf8) }) } - + func getState() -> TokeniserState { return state } - + func transition(_ state: TokeniserState) { self.state = state } - + func advanceTransition(_ state: TokeniserState) { reader.advance() self.state = state } - + func acknowledgeSelfClosingFlag() { selfClosingFlagAcknowledged = true } - + func consumeCharacterReference(_ additionalAllowedCharacter: UnicodeScalar?, _ inAttribute: Bool) throws -> [UnicodeScalar]? 
{ if (reader.isEmpty()) { return nil @@ -145,7 +145,7 @@ final class Tokeniser { if (reader.matchesAny(Tokeniser.notCharRefChars)) { return nil } - + reader.markPos() if (reader.matchConsume(UTF8Arrays.hash)) { // numbered let isHexMode: Bool = reader.matchConsumeIgnoreCase("X".utf8Array) @@ -159,12 +159,12 @@ final class Tokeniser { characterReferenceError("missing semicolon") // missing semi } var charval: Int = -1 - + let base: Int = isHexMode ? 16 : 10 if let num = numRef.toInt(radix: base) { charval = num } - + if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { characterReferenceError("character outside of valid range") return [Tokeniser.replacementChar] @@ -174,12 +174,12 @@ final class Tokeniser { return [UnicodeScalar(charval)!] } } else { // named - // get as many letters as possible, and look for matching entities. + // get as many letters as possible, and look for matching entities. let nameRef: ArraySlice = reader.consumeLetterThenDigitSequence() let looksLegit: Bool = reader.matches(";") // found if a base named entity without a ;, or an extended entity with the ;. 
let found: Bool = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)) - + if (!found) { reader.rewindToMark() if (looksLegit) { // named with semicolon @@ -205,7 +205,7 @@ final class Tokeniser { return [] } } - + @discardableResult @inlinable func createTagPending(_ start: Bool) -> Token.Tag { @@ -218,33 +218,33 @@ final class Tokeniser { } return tagPending } - + @inlinable func emitTagPending() throws { try tagPending.finaliseTag() try emit(tagPending) } - + func createCommentPending() { commentPending.reset() } - + func emitCommentPending() throws { try emit(commentPending) } - + func createDoctypePending() { doctypePending.reset() } - + func emitDoctypePending() throws { try emit(doctypePending) } - + func createTempBuffer() { Token.reset(dataBuffer) } - + func isAppropriateEndTagToken()throws->Bool { if(lastStartTag != nil) { let s = try tagPending.name() @@ -252,45 +252,45 @@ final class Tokeniser { } return false } - + func appropriateEndTagName() -> [UInt8]? 
{ if (lastStartTag == nil) { return nil } return lastStartTag } - + func error(_ state: TokeniserState) { if (errors != nil && errors!.canAddError()) { errors?.add(ParseError(reader.getPos(), "Unexpected character '\(String(reader.current()))' in input state [\(state.description)]")) } } - + func eofError(_ state: TokeniserState) { if (errors != nil && errors!.canAddError()) { errors?.add(ParseError(reader.getPos(), "Unexpectedly reached end of file (EOF) in input state [\(state.description)]")) } } - + private func characterReferenceError(_ message: String) { if (errors != nil && errors!.canAddError()) { errors?.add(ParseError(reader.getPos(), "Invalid character reference: \(message)")) } } - + private func error(_ errorMsg: String) { if (errors != nil && errors!.canAddError()) { errors?.add(ParseError(reader.getPos(), errorMsg)) } } - + func currentNodeInHtmlNS() -> Bool { // todo: implement namespaces correctly return true // Element currentNode = currentNode() // return currentNode != null && currentNode.namespace().equals("HTML") } - + /** * Utility method to consume reader and unescape entities found within. * @param inAttribute From 65fec4f4b3e95bad50012e2c88b8293dec598ff4 Mon Sep 17 00:00:00 2001 From: Alex Ehlke Date: Wed, 12 Mar 2025 01:18:45 -0400 Subject: [PATCH 7/8] wip --- Sources/Attributes.swift | 13 ++++++++++++- Sources/Element.swift | 23 ++++++++++++++++++----- Sources/Node.swift | 25 +++++++++++++++++++++++-- Sources/String.swift | 19 ++++++++++++++++++- Sources/UTF8Arrays.swift | 24 ++++++++++++++++++++++++ 5 files changed, 95 insertions(+), 9 deletions(-) diff --git a/Sources/Attributes.swift b/Sources/Attributes.swift index 4df3f049..864d85b2 100644 --- a/Sources/Attributes.swift +++ b/Sources/Attributes.swift @@ -233,11 +233,22 @@ open class Attributes: NSCopying { @return HTML @throws SerializationException if the HTML representation of the attributes cannot be constructed. 
*/ - open func html()throws -> String { + open func html() throws -> String { let accum = StringBuilder() try html(accum: accum, out: Document([]).outputSettings()) // output settings a bit funky, but this html() seldom used return accum.toString() } + + /** + Get the HTML representation of these attributes. + @return HTML + @throws SerializationException if the HTML representation of the attributes cannot be constructed. + */ + open func htmlUTF8() throws -> [UInt8] { + let accum = StringBuilder() + try html(accum: accum, out: Document([]).outputSettings()) // output settings a bit funky, but this html() seldom used + return accum.buffer + } @inlinable public func html(accum: StringBuilder, out: OutputSettings ) throws { diff --git a/Sources/Element.swift b/Sources/Element.swift index 681693c7..565196e0 100644 --- a/Sources/Element.swift +++ b/Sources/Element.swift @@ -1228,7 +1228,7 @@ open class Element: Node { @return this element */ @discardableResult - public func addClass(_ className: String)throws->Element { + public func addClass(_ className: String) throws -> Element { let classes: OrderedSet = try classNames() classes.append(className) try classNames(classes) @@ -1241,7 +1241,7 @@ open class Element: Node { @return this element */ @discardableResult - public func removeClass(_ className: String)throws->Element { + public func removeClass(_ className: String) throws -> Element { let classes: OrderedSet = try classNames() classes.remove(className) try classNames(classes) @@ -1254,7 +1254,7 @@ open class Element: Node { @return this element */ @discardableResult - public func toggleClass(_ className: String)throws->Element { + public func toggleClass(_ className: String) throws -> Element { let classes: OrderedSet = try classNames() if (classes.contains(className)) {classes.remove(className) } else { @@ -1338,6 +1338,19 @@ open class Element: Node { try html2(accum) return getOutputSettings().prettyPrint() ? 
accum.toString().trim() : accum.toString() } + + /** + * Retrieves the element's inner HTML. E.g. on a {@code

} with one empty {@code

}, would return + * {@code

}. (Whereas {@link #outerHtml()} would return {@code

}.) + * + * @return String of HTML. + * @see #outerHtml() + */ + public func htmlUTF8() throws -> [UInt8] { + let accum: StringBuilder = StringBuilder() + try html2(accum) + return getOutputSettings().prettyPrint() ? accum.buffer.trim() : accum.buffer + } private func html2(_ accum: StringBuilder) throws { for node in childNodes { @@ -1348,7 +1361,7 @@ open class Element: Node { /** * {@inheritDoc} */ - open override func html(_ appendable: StringBuilder)throws->StringBuilder { + open override func html(_ appendable: StringBuilder) throws -> StringBuilder { for node in childNodes { try node.outerHtml(appendable) } @@ -1362,7 +1375,7 @@ open class Element: Node { * @see #append(String) */ @discardableResult - public func html(_ html: String)throws->Element { + public func html(_ html: String) throws -> Element { empty() try append(html) return self diff --git a/Sources/Node.swift b/Sources/Node.swift index dd7a9bf0..bff97db3 100644 --- a/Sources/Node.swift +++ b/Sources/Node.swift @@ -380,6 +380,18 @@ open class Node: Equatable, Hashable { return self } + /** + * Insert the specified HTML into the DOM before this node (i.e. as a preceding sibling). + * @param html HTML to add before this node + * @return this node, for chaining + * @see #after(String) + */ + @discardableResult + open func before(_ html: [UInt8]) throws -> Node { + try addSiblingHtml(siblingIndex, html) + return self + } + /** * Insert the specified node into the DOM before this node (i.e. as a preceding sibling). * @param node to add before this node @@ -431,6 +443,15 @@ open class Node: Equatable, Hashable { try parentNode?.addChildren(index, nodes) } + private func addSiblingHtml(_ index: Int, _ html: [UInt8]) throws { + try Validate.notNull(obj: parentNode) + + let context: Element? = parent() as? 
Element + + let nodes: Array = try Parser.parseFragment(html, context, getBaseUriUTF8()) + try parentNode?.addChildren(index, nodes) + } + /** * Insert the specified HTML into the DOM after this node (i.e. as a following sibling). * @param html HTML to add after this node @@ -700,13 +721,13 @@ open class Node: Equatable, Hashable { Get the outer HTML of this node. @return HTML */ - open func outerHtml()throws->String { + open func outerHtml() throws -> String { let accum: StringBuilder = StringBuilder(128) try outerHtml(accum) return accum.toString() } - public func outerHtml(_ accum: StringBuilder)throws { + public func outerHtml(_ accum: StringBuilder) throws { try NodeTraversor(OuterHtmlVisitor(accum, getOutputSettings())).traverse(self) } diff --git a/Sources/String.swift b/Sources/String.swift index 72418b76..27da8658 100644 --- a/Sources/String.swift +++ b/Sources/String.swift @@ -28,6 +28,23 @@ extension UInt8 { } } +extension ArraySlice where Element == UInt8 { + @inline(__always) + public func lowercased() -> ArraySlice { + // Check if any element needs lowercasing + guard self.contains(where: { $0 >= 65 && $0 <= 90 }) else { return self } + // Only allocate a new array if necessary + var result = self + for i in result.indices { + let b = result[i] + if b >= 65 && b <= 90 { + result[i] = b + 32 + } + } + return result + } +} + // TODO: Use @retroactive once supported on Ubuntu (?) 
//extension Array: @retroactive Comparable where Element == UInt8 { extension Array: Comparable where Element == UInt8 { @@ -184,7 +201,7 @@ extension String { } @inline(__always) - var utf8ArraySlice: ArraySlice { + public var utf8ArraySlice: ArraySlice { return ArraySlice(self.utf8) } diff --git a/Sources/UTF8Arrays.swift b/Sources/UTF8Arrays.swift index 3c38ba3a..f16205c9 100644 --- a/Sources/UTF8Arrays.swift +++ b/Sources/UTF8Arrays.swift @@ -20,6 +20,7 @@ public enum UTF8Arrays { public static let head = "head".utf8Array public static let meta = "meta".utf8Array public static let body = "body".utf8Array + public static let cite = "cite".utf8Array public static let a = "a".utf8Array public static let p = "p".utf8Array public static let li = "li".utf8Array @@ -57,6 +58,7 @@ public enum UTF8Arrays { public static let hidden = "hidden".utf8Array public static let caption = "caption".utf8Array public static let hr = "hr".utf8Array + public static let abbr = "abbr".utf8Array public static let svg = "svg".utf8Array public static let isindex = "isindex".utf8Array public static let label = "label".utf8Array @@ -66,14 +68,24 @@ public enum UTF8Arrays { public static let noembed = "noembed".utf8Array public static let option = "option".utf8Array public static let math = "math".utf8Array + public static let data = "data".utf8Array + public static let strong = "strong".utf8Array public static let sarcasm = "sarcasm".utf8Array // Huh public static let name = "name".utf8Array + public static let i = "i".utf8Array public static let col = "col".utf8Array public static let colgroup = "colgroup".utf8Array + public static let em = "em".utf8Array + public static let small = "small".utf8Array public static let frame = "frame".utf8Array + public static let sub = "sub".utf8Array + public static let sup = "sup".utf8Array public static let base = "base".utf8Array + public static let time = "time".utf8Array public static let href = "href".utf8Array + public static let meter = 
"meter".utf8Array public static let noscript = "noscript".utf8Array + public static let b = "b".utf8Array public static let noframes = "noframes".utf8Array public static let style = "style".utf8Array public static let title = "title".utf8Array @@ -104,9 +116,21 @@ public enum UTF8ArraySlices { public static let head = UTF8Arrays.head[...] public static let meta = UTF8Arrays.meta[...] public static let body = UTF8Arrays.body[...] + public static let cite = UTF8Arrays.cite[...] + public static let abbr = UTF8Arrays.abbr[...] + public static let data = UTF8Arrays.data[...] + public static let strong = UTF8Arrays.strong[...] + public static let sub = UTF8Arrays.sub[...] + public static let sup = UTF8Arrays.sup[...] + public static let b = UTF8Arrays.b[...] + public static let i = UTF8Arrays.i[...] + public static let meter = UTF8Arrays.meter[...] public static let a = UTF8Arrays.a[...] public static let p = UTF8Arrays.p[...] public static let li = UTF8Arrays.li[...] + public static let em = UTF8Arrays.em[...] + public static let time = UTF8Arrays.time[...] + public static let small = UTF8Arrays.small[...] public static let span = UTF8Arrays.span[...] public static let img = UTF8Arrays.img[...] public static let action = UTF8Arrays.action[...] 
From aa3c73a99d8847983018b6dcb1d3ea7b948a19be Mon Sep 17 00:00:00 2001 From: Alex Ehlke Date: Wed, 12 Mar 2025 01:34:29 -0400 Subject: [PATCH 8/8] wip --- Sources/Element.swift | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Sources/Element.swift b/Sources/Element.swift index 565196e0..91341059 100644 --- a/Sources/Element.swift +++ b/Sources/Element.swift @@ -999,6 +999,16 @@ open class Element: Node { } return text } + + public func textUTF8(trimAndNormaliseWhitespace: Bool = true) throws -> [UInt8] { + let accum: StringBuilder = StringBuilder() + try NodeTraversor(TextNodeVisitor(accum, trimAndNormaliseWhitespace: trimAndNormaliseWhitespace)).traverse(self) + let text = accum.buffer + if trimAndNormaliseWhitespace { + return text.trim() + } + return text + } /** * Gets the text owned by this element only; does not get the combined text of all children. @@ -1016,6 +1026,23 @@ open class Element: Node { ownText(sb) return sb.toString().trim() } + + /** + * Gets the text owned by this element only; does not get the combined text of all children. + *

+ * For example, given HTML {@code <p>Hello <b>there</b> now!</p>
}, {@code p.ownText()} returns {@code "Hello now!"}, + * whereas {@code p.text()} returns {@code "Hello there now!"}. + * Note that the text within the {@code b} element is not returned, as it is not a direct child of the {@code p} element. + * + * @return unencoded text, or empty string if none. + * @see #text() + * @see #textNodes() + */ + public func ownTextUTF8() -> [UInt8] { + let sb: StringBuilder = StringBuilder() + ownText(sb) + return sb.buffer.trim() + } private func ownText(_ accum: StringBuilder) { for child: Node in childNodes {