From d3d96c2752f2d4fa9be409db1a4731c827bda05b Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sat, 7 Dec 2024 14:19:01 -0500 Subject: [PATCH 01/10] Extract DocumentFragment initializer parameter handling Using `...` in Ruby is so much easier (and more efficient) than the equivalent code written in the C extension, and allows us to introduce very flexible initializer parameters (like kwargs). --- ext/nokogiri/xml_document_fragment.c | 35 ++++++++------------------- lib/nokogiri/xml/document_fragment.rb | 20 ++++++++++++--- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/ext/nokogiri/xml_document_fragment.c b/ext/nokogiri/xml_document_fragment.c index d89b7483607..3f28d28d40a 100644 --- a/ext/nokogiri/xml_document_fragment.c +++ b/ext/nokogiri/xml_document_fragment.c @@ -2,31 +2,18 @@ VALUE cNokogiriXmlDocumentFragment; -/* - * call-seq: - * new(document) - * - * Create a new DocumentFragment element on the +document+ - */ +/* :nodoc: */ static VALUE -new (int argc, VALUE *argv, VALUE klass) +noko_xml_document_fragment_s_native_new(VALUE klass, VALUE rb_doc) { - xmlDocPtr xml_doc; - xmlNodePtr node; - VALUE document; - VALUE rest; + xmlDocPtr c_doc; + xmlNodePtr c_node; VALUE rb_node; - rb_scan_args(argc, argv, "1*", &document, &rest); - - xml_doc = noko_xml_document_unwrap(document); - - node = xmlNewDocFragment(xml_doc->doc); - - noko_xml_document_pin_node(node); - - rb_node = noko_xml_node_wrap(klass, node); - rb_obj_call_init(rb_node, argc, argv); + c_doc = noko_xml_document_unwrap(rb_doc); + c_node = xmlNewDocFragment(c_doc->doc); + noko_xml_document_pin_node(c_node); + rb_node = noko_xml_node_wrap(klass, c_node); return rb_node; } @@ -35,10 +22,8 @@ void noko_init_xml_document_fragment(void) { assert(cNokogiriXmlNode); - /* - * DocumentFragment represents a DocumentFragment node in an xml document. 
- */ + cNokogiriXmlDocumentFragment = rb_define_class_under(mNokogiriXml, "DocumentFragment", cNokogiriXmlNode); - rb_define_singleton_method(cNokogiriXmlDocumentFragment, "new", new, -1); + rb_define_singleton_method(cNokogiriXmlDocumentFragment, "native_new", noko_xml_document_fragment_s_native_new, 1); } diff --git a/lib/nokogiri/xml/document_fragment.rb b/lib/nokogiri/xml/document_fragment.rb index dbdc46b4243..4c6ce3ef179 100644 --- a/lib/nokogiri/xml/document_fragment.rb +++ b/lib/nokogiri/xml/document_fragment.rb @@ -3,16 +3,28 @@ module Nokogiri module XML + # DocumentFragment represents a fragment of an \XML document. It provides the same functionality + # exposed by XML::Node and can be used to contain one or more \XML subtrees. class DocumentFragment < Nokogiri::XML::Node # The options used to parse the document fragment. Returns the value of any options that were # passed into the constructor as a parameter or set in a config block, else the default # options for the specific subclass. attr_reader :parse_options - #### - # Create a Nokogiri::XML::DocumentFragment from +tags+ - def self.parse(tags, options = ParseOptions::DEFAULT_XML, &block) - new(XML::Document.new, tags, nil, options, &block) + class << self + # Create a Nokogiri::XML::DocumentFragment from +tags+ + def parse(tags, options = ParseOptions::DEFAULT_XML, &block) + new(XML::Document.new, tags, nil, options, &block) + end + + # Wrapper method to separate the concerns of: + # - the native object allocator's parameter (it only requires `document`) + # - the initializer's parameters + def new(document, ...) # :nodoc: + instance = native_new(document) + instance.send(:initialize, document, ...) + instance + end end ## From 7b505527ab7e73a949c63aec0fb855483b099906 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 8 Dec 2024 10:43:58 -0500 Subject: [PATCH 02/10] Extract JRuby DocumentFragment initializer parameter handling and delete some unused code. 
--- ext/java/nokogiri/XmlDocumentFragment.java | 96 +--------------------- 1 file changed, 4 insertions(+), 92 deletions(-) diff --git a/ext/java/nokogiri/XmlDocumentFragment.java b/ext/java/nokogiri/XmlDocumentFragment.java index f20b9472249..406b3d1d1c3 100644 --- a/ext/java/nokogiri/XmlDocumentFragment.java +++ b/ext/java/nokogiri/XmlDocumentFragment.java @@ -48,112 +48,24 @@ public class XmlDocumentFragment extends XmlNode super(ruby, klazz); } - @JRubyMethod(name = "new", meta = true, required = 1, optional = 3) + @JRubyMethod(name = "native_new", meta = true) public static IRubyObject - rbNew(ThreadContext context, IRubyObject cls, IRubyObject[] args, Block block) + rbNew(ThreadContext context, IRubyObject cls, IRubyObject value) { - if (args.length < 1) { - throw context.runtime.newArgumentError(args.length, 1); - } - - if (!(args[0] instanceof XmlDocument)) { + if (!(value instanceof XmlDocument)) { throw context.runtime.newArgumentError("first parameter must be a Nokogiri::XML::Document instance"); } - XmlDocument doc = (XmlDocument) args[0]; - - // make well-formed fragment, ignore invalid namespace, or add appropriate namespace to parse - if (args.length > 1 && args[1] instanceof RubyString) { - final RubyString arg1 = (RubyString) args[1]; - if (XmlDocumentFragment.isTag(arg1)) { - args[1] = RubyString.newString(context.runtime, addNamespaceDeclIfNeeded(doc, rubyStringToString(arg1))); - } - } + XmlDocument doc = (XmlDocument) value; XmlDocumentFragment fragment = (XmlDocumentFragment) NokogiriService.XML_DOCUMENT_FRAGMENT_ALLOCATOR.allocate( context.runtime, (RubyClass)cls); fragment.setDocument(context, doc); fragment.setNode(context.runtime, doc.getDocument().createDocumentFragment()); - Helpers.invoke(context, fragment, "initialize", args, block); return fragment; } - private static final ByteList TAG_BEG = ByteList.create("<"); - private static final ByteList TAG_END = ByteList.create(">"); - - private static boolean - isTag(final RubyString str) - 
{ - return str.getByteList().startsWith(TAG_BEG) && str.getByteList().endsWith(TAG_END); - } - - private static boolean - isNamespaceDefined(String qName, NamedNodeMap nodeMap) - { - if (isNamespace(qName.intern())) { return true; } - for (int i = 0; i < nodeMap.getLength(); i++) { - Attr attr = (Attr)nodeMap.item(i); - if (isNamespace(attr.getNodeName())) { - String localPart = getLocalNameForNamespace(attr.getNodeName(), null); - if (getPrefix(qName).equals(localPart)) { - return true; - } - } - } - return false; - } - - private static final Pattern QNAME_RE = Pattern.compile("[^\\s]+:[^=\\s]+"); - private static final Pattern START_TAG_RE = Pattern.compile("<[^]+>"); - - private static String - addNamespaceDeclIfNeeded(XmlDocument doc, String tags) - { - if (doc.getDocument() == null) { return tags; } - if (doc.getDocument().getDocumentElement() == null) { return tags; } - Matcher matcher = START_TAG_RE.matcher(tags); - Map rewriteTable = null; - while (matcher.find()) { - String start_tag = matcher.group(); - Matcher matcher2 = QNAME_RE.matcher(start_tag); - while (matcher2.find()) { - String qName = matcher2.group(); - NamedNodeMap nodeMap = doc.getDocument().getDocumentElement().getAttributes(); - if (isNamespaceDefined(qName, nodeMap)) { - CharSequence namespaceDecl = getNamespaceDecl(getPrefix(qName), nodeMap); - if (namespaceDecl != null) { - if (rewriteTable == null) { rewriteTable = new HashMap(8, 1); } - StringBuilder str = new StringBuilder(qName.length() + namespaceDecl.length() + 3); - String key = str.append('<').append(qName).append('>').toString(); - str.setCharAt(key.length() - 1, ' '); // (last) '>' -> ' ' - rewriteTable.put(key, str.append(namespaceDecl).append('>')); - } - } - } - } - if (rewriteTable != null) { - for (Map.Entry e : rewriteTable.entrySet()) { - tags = tags.replace(e.getKey(), e.getValue()); - } - } - - return tags; - } - - private static CharSequence - getNamespaceDecl(final String prefix, NamedNodeMap nodeMap) - { - for (int 
i = 0; i < nodeMap.getLength(); i++) { - Attr attr = (Attr) nodeMap.item(i); - if (prefix.equals(attr.getLocalName())) { - return new StringBuilder(). - append(attr.getName()).append('=').append('"').append(attr.getValue()).append('"'); - } - } - return null; - } - @Override public void relink_namespace(ThreadContext context) From ed957665fa8a6cd133653a92e4e058837859954b Mon Sep 17 00:00:00 2001 From: Brian Lesperance Date: Thu, 14 Nov 2024 12:37:01 -0600 Subject: [PATCH 03/10] XML::DocumentFragment.parse supports kwargs Related to sparklemotion/nokogiri#3323 --- lib/nokogiri/xml/document_fragment.rb | 2 +- test/xml/test_document_fragment.rb | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/nokogiri/xml/document_fragment.rb b/lib/nokogiri/xml/document_fragment.rb index 4c6ce3ef179..baf0540d602 100644 --- a/lib/nokogiri/xml/document_fragment.rb +++ b/lib/nokogiri/xml/document_fragment.rb @@ -13,7 +13,7 @@ class DocumentFragment < Nokogiri::XML::Node class << self # Create a Nokogiri::XML::DocumentFragment from +tags+ - def parse(tags, options = ParseOptions::DEFAULT_XML, &block) + def parse(tags, options_ = ParseOptions::DEFAULT_XML, options: options_, &block) new(XML::Document.new, tags, nil, options, &block) end diff --git a/test/xml/test_document_fragment.rb b/test/xml/test_document_fragment.rb index 02eb8fc5811..76351673c67 100644 --- a/test/xml/test_document_fragment.rb +++ b/test/xml/test_document_fragment.rb @@ -416,6 +416,16 @@ def test_for_libxml_in_context_memory_badness_when_encountering_encoding_errors end end + it "accepts kwargs" do + frag = Nokogiri::XML::DocumentFragment.parse(input, options: xml_default) + assert_equal("foo", frag.to_html) + refute_empty(frag.errors) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::XML::DocumentFragment.parse(input, options: xml_strict) + end + end + it "takes a config block" do default_config = nil Nokogiri::XML::DocumentFragment.parse(input) do |config| From 
961aa3724b66c25bef87aff20253dbbf4af3490c Mon Sep 17 00:00:00 2001 From: Brian Lesperance Date: Thu, 14 Nov 2024 15:30:18 -0600 Subject: [PATCH 04/10] Nokogiri::XML.fragment() supports argument forwarding Related to https://github.com/sparklemotion/nokogiri#3323 --- lib/nokogiri/xml.rb | 7 +++---- test/xml/test_document_fragment.rb | 10 ++++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/lib/nokogiri/xml.rb b/lib/nokogiri/xml.rb index 888c1f39009..08be5e4550c 100644 --- a/lib/nokogiri/xml.rb +++ b/lib/nokogiri/xml.rb @@ -29,10 +29,9 @@ def parse(thing, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML, Document.parse(thing, url, encoding, options, &block) end - #### - # Parse a fragment from +string+ in to a NodeSet. - def fragment(string, options = ParseOptions::DEFAULT_XML, &block) - XML::DocumentFragment.parse(string, options, &block) + # Convenience method for Nokogiri::XML::DocumentFragment.parse + def fragment(...) + XML::DocumentFragment.parse(...) end end end diff --git a/test/xml/test_document_fragment.rb b/test/xml/test_document_fragment.rb index 76351673c67..8be61d062f6 100644 --- a/test/xml/test_document_fragment.rb +++ b/test/xml/test_document_fragment.rb @@ -386,6 +386,16 @@ def test_for_libxml_in_context_memory_badness_when_encountering_encoding_errors end end + it "accepts kwargs" do + frag = Nokogiri::XML.fragment(input, options: xml_default) + assert_equal("foo", frag.to_html) + refute_empty(frag.errors) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::XML.fragment(input, options: xml_strict) + end + end + it "takes a config block" do default_config = nil Nokogiri::XML.fragment(input) do |config| From 41abed73211b0fbda722365982581c82c3224d31 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 8 Dec 2024 09:36:19 -0500 Subject: [PATCH 05/10] HTML4::DocumentFragment.parse and #initialize take kwargs Related to #3323 This commit was merged and expanded from #3336, thank you @MattJones! 
Co-authored-by: Matt Jones --- lib/nokogiri/html4/document_fragment.rb | 118 +++++++++++++++++------- test/html4/test_document_fragment.rb | 26 +++++- 2 files changed, 105 insertions(+), 39 deletions(-) diff --git a/lib/nokogiri/html4/document_fragment.rb b/lib/nokogiri/html4/document_fragment.rb index eae79bcb14a..2a70cb1a571 100644 --- a/lib/nokogiri/html4/document_fragment.rb +++ b/lib/nokogiri/html4/document_fragment.rb @@ -5,51 +5,60 @@ module HTML4 class DocumentFragment < Nokogiri::XML::DocumentFragment # # :call-seq: - # parse(tags) => DocumentFragment - # parse(tags, encoding) => DocumentFragment - # parse(tags, encoding, options) => DocumentFragment - # parse(tags, encoding) { |options| ... } => DocumentFragment + # parse(input) { |options| ... } → HTML4::DocumentFragment + # parse(input, encoding:, options:) { |options| ... } → HTML4::DocumentFragment # - # Parse an HTML4 fragment. + # Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment. This + # method creates a new, empty HTML4::Document to contain the fragment. # - # [Parameters] - # - +tags+ (optional String, or any object that responds to +#read+ such as an IO, or - # StringIO) - # - +encoding+ (optional String) the name of the encoding that should be used when processing - # the document. (default +nil+ for auto-detection) - # - +options+ (optional) configuration object that sets options during parsing, such as - # Nokogiri::XML::ParseOptions::RECOVER. See Nokogiri::XML::ParseOptions for more - # information. + # [Required Parameters] + # - +input+ (String | IO) The content to be parsed. # - # [Yields] If present, the block will be passed a Nokogiri::XML::ParseOptions object to modify - # before the fragment is parsed. See Nokogiri::XML::ParseOptions for more information. + # [Optional Keyword Arguments] + # - +encoding:+ (String) The name of the encoding that should be used when processing the + # document. 
When not provided, the encoding will be determined based on the document + # content. # - # [Returns] DocumentFragment + # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some + # behaviors during parsing. See ParseOptions for more information. The default value is + # +ParseOptions::DEFAULT_HTML+. + # + # [Yields] + # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which + # can be configured before parsing. See ParseOptions for more information. + # + # [Returns] HTML4::DocumentFragment # # *Example:* Parsing a string # - # fragment = DocumentFragment.parse("
<div>Hello World</div>
") + # fragment = HTML4::DocumentFragment.parse("
<div>Hello World</div>
") # # *Example:* Parsing an IO # # fragment = File.open("fragment.html") do |file| - # DocumentFragment.parse(file) + # HTML4::DocumentFragment.parse(file) # end # # *Example:* Specifying encoding # - # fragment = DocumentFragment.parse(input, "EUC-JP") + # fragment = HTML4::DocumentFragment.parse(input, encoding: "EUC-JP") # # *Example:* Setting parse options dynamically # - # DocumentFragment.parse("
<div>Hello World") do |options| + # HTML4::DocumentFragment.parse("
Hello World") do |options| # options.huge.pedantic # end # - def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) + def self.parse( + input, + encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML, + encoding: encoding_, options: options_, + &block + ) + # TODO: this method should take a context node. doc = HTML4::Document.new - if tags.respond_to?(:read) + if input.respond_to?(:read) # Handle IO-like objects (IO, File, StringIO, etc.) # The _read_ method of these objects doesn't accept an +encoding+ parameter. # Encoding is usually set when the IO object is created or opened, @@ -65,12 +74,12 @@ def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, # # For StringIO specifically, _set_encoding_ affects only the internal string, # not how the data is read out. - tags.set_encoding(encoding) if encoding && tags.respond_to?(:set_encoding) - tags = tags.read + input.set_encoding(encoding) if encoding && input.respond_to?(:set_encoding) + input = input.read end - encoding ||= if tags.respond_to?(:encoding) - encoding = tags.encoding + encoding ||= if input.respond_to?(:encoding) + encoding = input.encoding if encoding == ::Encoding::ASCII_8BIT "UTF-8" else @@ -82,32 +91,71 @@ def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, doc.encoding = encoding - new(doc, tags, nil, options, &block) + new(doc, input, options: options, &block) end - # It's recommended to use either DocumentFragment.parse or XML::Node#parse rather than call this - # method directly. - def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML) # rubocop:disable Lint/MissingSuper - return self unless tags + # + # :call-seq: + # new(document) { |options| ... } → HTML4::DocumentFragment + # new(document, input) { |options| ... } → HTML4::DocumentFragment + # new(document, input, context:, options:) { |options| ... 
} → HTML4::DocumentFragment + # + # Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment. + # + # 💡 It's recommended to use either HTML4::DocumentFragment.parse or XML::Node#parse rather + # than call this method directly. + # + # [Required Parameters] + # - +document+ (HTML4::Document) The parent document to associate the returned fragment with. + # + # [Optional Parameters] + # - +input+ (String) The content to be parsed. + # + # [Optional Keyword Arguments] + # - +context:+ (Nokogiri::XML::Node) The context node for the subtree created. See + # below for more information. + # + # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some + # behaviors during parsing. See ParseOptions for more information. The default value is + # +ParseOptions::DEFAULT_HTML+. + # + # [Yields] + # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which + # can be configured before parsing. See ParseOptions for more information. + # + # [Returns] HTML4::DocumentFragment + # + # === Context \Node + # + # If a context node is specified using +context:+, then the fragment will be created by + # calling XML::Node#parse on that node, so the parser will behave as if that Node is the + # parent of the fragment subtree. + # + def initialize( + document, input = nil, + context_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML, + context: context_, options: options_ + ) # rubocop:disable Lint/MissingSuper + return self unless input options = Nokogiri::XML::ParseOptions.new(options) if Integer === options @parse_options = options yield options if block_given? - if ctx + if context preexisting_errors = document.errors.dup - node_set = ctx.parse("
<div>#{tags}</div>
", options) + node_set = context.parse("
<div>#{input}</div>
", options) node_set.first.children.each { |child| child.parent = self } unless node_set.empty? self.errors = document.errors - preexisting_errors else # This is a horrible hack, but I don't care - path = if /^\s*?#{tags}", nil, document.encoding, options) + temp_doc = HTML4::Document.parse("#{input}", nil, document.encoding, options) temp_doc.xpath(path).each { |child| child.parent = self } self.errors = temp_doc.errors end diff --git a/test/html4/test_document_fragment.rb b/test/html4/test_document_fragment.rb index 6ef3faac074..57ad1193b3f 100644 --- a/test/html4/test_document_fragment.rb +++ b/test/html4/test_document_fragment.rb @@ -326,6 +326,14 @@ def test_parse_with_io assert_equal("hello world", fragment.content) end + it "returns a string matching an encoding passed with kwargs" do + input = "
<div>hello world</div>
" + + fragment = Nokogiri::HTML4::DocumentFragment.parse(input, encoding: "ISO-8859-1") + assert_equal("ISO-8859-1", fragment.document.encoding) + assert_equal("hello world", fragment.content) + end + it "respects encoding for empty strings" do fragment = Nokogiri::HTML::DocumentFragment.parse("", "UTF-8") assert_equal "UTF-8", fragment.to_html.encoding.to_s @@ -384,6 +392,13 @@ def test_parse_with_io assert_equal(html4_huge, frag.parse_options) end + it "accepts options as kwargs" do + frag = Nokogiri::HTML4::DocumentFragment.parse(input, options: html4_huge) + + assert_equal("
<div>foo</div>
", frag.to_html) + assert_equal(html4_huge, frag.parse_options) + end + it "takes a config block" do default_config = nil frag = Nokogiri::HTML4.fragment(input) do |config| @@ -495,9 +510,9 @@ def test_parse_with_io Class.new(Nokogiri::HTML4::DocumentFragment) do attr_accessor :initialized_with, :initialized_count - def initialize(*args) + def initialize(*args, **kwargs) super - @initialized_with = args + @initialized_with = [args, kwargs] @initialized_count ||= 0 @initialized_count += 1 end @@ -516,8 +531,11 @@ def initialize(*args) end it "passes args to #initialize" do - fragment = klass.new(html, "
<div>a</div>
") - assert_equal([html, "
<div>a</div>
"], fragment.initialized_with) + fragment = klass.new(html, "
<div>a</div>
", options: 1) + assert_equal( + [[html, "
<div>a</div>
"], { options: 1 }], + fragment.initialized_with, + ) end end From 98a878c26ba669b0e975094b74348e343fc9f9ac Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 8 Dec 2024 10:08:16 -0500 Subject: [PATCH 06/10] HTML5::DocumentFragment.parse and #initialize take kwargs Related to #3323 This commit was merged and expanded from #3335, thank you @infews! Co-authored-by: Davis W. Frank --- CHANGELOG.md | 1 - lib/nokogiri/html5/document_fragment.rb | 125 +++++++++++++++++++----- test/html5/test_api.rb | 31 +++--- 3 files changed, 117 insertions(+), 40 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c0e6e411b82..211e6806891 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -104,7 +104,6 @@ We've resolved many long-standing bugs in the various schema classes, validation * The undocumented and unused method `Nokogiri::CSS.parse` is now deprecated and will generate a warning. The AST returned by this method is private and subject to change and removal in future versions of Nokogiri. This method will be removed in a future version of Nokogiri. * Passing an options hash to `CSS.xpath_for` is now deprecated and will generate a warning. Use keyword arguments instead. This will become an error in a future version of Nokogiri. -* Passing an options hash to `HTML5::DocumentFragment.parse` is now deprecated and will generate a warning. Use keyword arguments instead. This will become an error in a future version of Nokogiri. * Passing libxml2 encoding IDs to `SAX::ParserContext` methods is now deprecated and will generate a warning. The use of `SAX::Parser::ENCODINGS` is also deprecated. Use `Encoding` objects or encoding names instead. 
diff --git a/lib/nokogiri/html5/document_fragment.rb b/lib/nokogiri/html5/document_fragment.rb index 935ac6491dc..b43da63c5f1 100644 --- a/lib/nokogiri/html5/document_fragment.rb +++ b/lib/nokogiri/html5/document_fragment.rb @@ -27,32 +27,51 @@ module HTML5 class DocumentFragment < Nokogiri::HTML4::DocumentFragment class << self # :call-seq: - # parse(tags, **options) - # parse(tags, encoding = nil, **options) + # parse(input, **options) → HTML5::DocumentFragment # - # Parse an HTML5 document fragment from +tags+, returning a Nodeset. + # Parse \HTML5 fragment input from a String, and return a new HTML5::DocumentFragment. This + # method creates a new, empty HTML5::Document to contain the fragment. # # [Parameters] - # - +tags+ [String, IO] The HTML5 document fragment to parse. - # - +encoding+ [String] The name of the encoding to use when parsing the document fragment. (default +nil+) + # - +input+ (String | IO) The HTML5 document fragment to parse. # - # Also see Nokogiri::HTML5 for a longer explanation of how encoding is handled by the parser. + # [Optional Keyword Arguments] + # - +encoding:+ (String | Encoding) The encoding, or name of the encoding, that should be + # used when processing the document. When not provided, the encoding will be determined + # based on the document content. Also see Nokogiri::HTML5 for a longer explanation of how + # encoding is handled by the parser. # - # [Options] - # - +:context+ [String, Nokogiri::XML::Node] The context in which to parse the document fragment. (default +"body"+) - # - +:max_errors+ [Integer] The maximum number of parse errors to record. (default +Nokogiri::Gumbo::DEFAULT_MAX_ERRORS+ which is currently 0) - # - +:max_tree_depth+ [Integer] The maximum depth of the parse tree. (default +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+) - # - +:max_attributes+ [Integer] The maximum number of attributes allowed on an element. 
(default +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+) - # - +:parse_noscript_content_as_text+ [Boolean] Whether to parse the content of +noscript+ elements as text. (default +false+) + # - +context:+ (String | Nokogiri::XML::Node) The node, or the name of an HTML5 element, "in + # context" of which to parse the document fragment. See below for more + # information. (default +"body"+) # - # Also see Nokogiri::HTML5 for a longer explanation of the options. + # - +max_errors:+ (Integer) The maximum number of parse errors to record. (default + # +Nokogiri::Gumbo::DEFAULT_MAX_ERRORS+ which is currently 0) # - # [Returns] - # - [Nokogiri::XML::NodeSet] A node set containing the root nodes of the parsed fragment. + # - +max_tree_depth:+ (Integer) The maximum depth of the parse tree. (default + # +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+) # - def parse(tags, encoding = nil, positional_options_hash = nil, **options) - unless positional_options_hash.nil? - warn("Nokogiri::HTML5::DocumentFragment.parse: Passing options as an explicit hash is deprecated. Use keyword arguments instead. This will become an error in a future release.", uplevel: 1, category: :deprecated) + # - +max_attributes:+ (Integer) The maximum number of attributes allowed on an + # element. (default +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+) + # + # - +parse_noscript_content_as_text:+ (Boolean) Whether to parse the content of +noscript+ + # elements as text. (default +false+) + # + # See rdoc-ref:HTML5@Parsing+options for a complete description of these parsing options. + # + # [Returns] Nokogiri::HTML5::DocumentFragment + # + # === Context \Node + # + # If a context node is specified using +context:+, then the parser will behave as if that + # Node, or a hypothetical tag named as specified, is the parent of the fragment subtree. + # + def parse( + input, + encoding_ = nil, positional_options_hash = nil, + encoding: encoding_, **options + ) + unless positional_options_hash.nil? 
|| positional_options_hash.empty? options.merge!(positional_options_hash) end @@ -60,9 +79,9 @@ def parse(tags, encoding = nil, positional_options_hash = nil, **options) document = HTML5::Document.new document.encoding = "UTF-8" - tags = HTML5.read_and_encode(tags, encoding) + input = HTML5.read_and_encode(input, encoding) - new(document, tags, context, options) + new(document, input, context, options) end end @@ -71,18 +90,72 @@ def parse(tags, encoding = nil, positional_options_hash = nil, **options) # Get the parser's quirks mode value. See HTML5::QuirksMode. # - # This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::DocumentFragment.new(doc)`). + # This method returns `nil` if the parser was not invoked (e.g., + # `Nokogiri::HTML5::DocumentFragment.new(doc)`). # # Since v1.14.0 attr_reader :quirks_mode - # Create a document fragment. - def initialize(doc, tags = nil, context = nil, options = {}) # rubocop:disable Lint/MissingSuper + # + # :call-seq: + # new(document, input, **options) → HTML5::DocumentFragment + # + # Parse \HTML5 fragment input from a String, and return a new HTML5::DocumentFragment. + # + # 💡 It's recommended to use either HTML5::DocumentFragment.parse or HTML5::Node#fragment + # rather than call this method directly. + # + # [Required Parameters] + # - +document+ (HTML5::Document) The parent document to associate the returned fragment with. + # + # [Optional Parameters] + # - +input+ (String) The content to be parsed. + # + # [Optional Keyword Arguments] + # - +encoding:+ (String | Encoding) The encoding, or name of the encoding, that should be + # used when processing the document. When not provided, the encoding will be determined + # based on the document content. Also see Nokogiri::HTML5 for a longer explanation of how + # encoding is handled by the parser. + # + # - +context:+ (String | Nokogiri::XML::Node) The node, or the name of an HTML5 element, in + # which to parse the document fragment. 
(default +"body"+) + # + # - +max_errors:+ (Integer) The maximum number of parse errors to record. (default + # +Nokogiri::Gumbo::DEFAULT_MAX_ERRORS+ which is currently 0) + # + # - +max_tree_depth:+ (Integer) The maximum depth of the parse tree. (default + # +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+) + # + # - +max_attributes:+ (Integer) The maximum number of attributes allowed on an + # element. (default +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+) + # + # - +parse_noscript_content_as_text:+ (Boolean) Whether to parse the content of +noscript+ + # elements as text. (default +false+) + # + # See rdoc-ref:HTML5@Parsing+options for a complete description of these parsing options. + # + # [Returns] HTML5::DocumentFragment + # + # === Context \Node + # + # If a context node is specified using +context:+, then the parser will behave as if that + # Node, or a hypothetical tag named as specified, is the parent of the fragment subtree. + # + def initialize( + doc, input = nil, + context_ = nil, positional_options_hash = nil, + context: context_, + **options + ) # rubocop:disable Lint/MissingSuper + unless positional_options_hash.nil? || positional_options_hash.empty? 
+ options.merge!(positional_options_hash) + end + @document = doc @errors = [] - return self unless tags + return self unless input - tags = Nokogiri::HTML5.read_and_encode(tags, nil) + input = Nokogiri::HTML5.read_and_encode(input, nil) context = options.delete(:context) if options.key?(:context) @@ -90,7 +163,7 @@ def initialize(doc, tags = nil, context = nil, options = {}) # rubocop:disable L options[:max_errors] ||= options.delete(:max_parse_errors) || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS options[:max_tree_depth] ||= Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH - Nokogiri::Gumbo.fragment(self, tags, context, **options) + Nokogiri::Gumbo.fragment(self, input, context, **options) end def serialize(options = {}, &block) # :nodoc: diff --git a/test/html5/test_api.rb b/test/html5/test_api.rb index 468be98bcf9..1294632effe 100644 --- a/test/html5/test_api.rb +++ b/test/html5/test_api.rb @@ -92,6 +92,10 @@ def test_fragment_encoding assert_match(/おはようございます/, Nokogiri::HTML5.fragment(raw, Encoding::SHIFT_JIS).to_s) assert_match(/おはようございます/, Nokogiri::HTML5::DocumentFragment.parse(raw, Encoding::SHIFT_JIS).to_s) + + # with kwargs + assert_match(/おはようございます/, Nokogiri::HTML5.fragment(raw, encoding: Encoding::SHIFT_JIS).to_s) + assert_match(/おはようございます/, Nokogiri::HTML5::DocumentFragment.parse(raw, encoding: Encoding::SHIFT_JIS).to_s) end def test_fragment_serialization_encoding @@ -432,15 +436,13 @@ def initialize(*args) describe "to DocumentFragment.parse" do it "as an options hash" do - assert_output(nil, /Passing options as an explicit hash is deprecated/) do - fragment = Nokogiri::HTML5::DocumentFragment.parse( - "
foo
", - nil, - { context: "html" }, - ) - assert_match(//, fragment.to_s) - assert_match(//, fragment.to_s) - end + fragment = Nokogiri::HTML5::DocumentFragment.parse( + "
foo
", + nil, + { context: "html" }, + ) + assert_match(//, fragment.to_s) + assert_match(//, fragment.to_s) end it "as keyword argument" do @@ -462,9 +464,9 @@ def initialize(*args) Class.new(Nokogiri::HTML5::DocumentFragment) do attr_accessor :initialized_with, :initialized_count - def initialize(*args) + def initialize(*args, **kwargs) super - @initialized_with = args + @initialized_with = [args, **kwargs] @initialized_count ||= 0 @initialized_count += 1 end @@ -484,8 +486,11 @@ def initialize(*args) end it "passes args to #initialize" do - fragment = klass.new(html, "
<div>a</div>
") - assert_equal([html, "
<div>a</div>
"], fragment.initialized_with) + fragment = klass.new(html, "
<div>a</div>
", max_errors: 1) + assert_equal( + [[html, "
a
"], { max_errors: 1 }], + fragment.initialized_with, + ) end end From 30bac72490b9b65fc40cec78a28af60f68bafaff Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 8 Dec 2024 10:18:16 -0500 Subject: [PATCH 07/10] XML::DocumentFragment#initialize takes kwargs and improve the documentation Part of #3323 --- lib/nokogiri/xml/document_fragment.rb | 75 +++++++++++++++++++++++---- test/xml/test_document_fragment.rb | 21 ++++++-- 2 files changed, 81 insertions(+), 15 deletions(-) diff --git a/lib/nokogiri/xml/document_fragment.rb b/lib/nokogiri/xml/document_fragment.rb index baf0540d602..a1a1f44c57d 100644 --- a/lib/nokogiri/xml/document_fragment.rb +++ b/lib/nokogiri/xml/document_fragment.rb @@ -12,9 +12,28 @@ class DocumentFragment < Nokogiri::XML::Node attr_reader :parse_options class << self - # Create a Nokogiri::XML::DocumentFragment from +tags+ + # :call-seq: + # parse(input) { |options| ... } → XML::DocumentFragment + # parse(input, options:) → XML::DocumentFragment + # + # Parse \XML fragment input from a String, and return a new XML::DocumentFragment. This + # method creates a new, empty XML::Document to contain the fragment. + # + # [Required Parameters] + # - +input+ (String) The content to be parsed. + # + # [Optional Keyword Arguments] + # - +options+ (Nokogiri::XML::ParseOptions) Configuration object that determines some + # behaviors during parsing. See ParseOptions for more information. The default value is + # +ParseOptions::DEFAULT_XML+. + # + # [Yields] + # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which + # can be configured before parsing. See Nokogiri::XML::ParseOptions for more information. 
+ # + # [Returns] Nokogiri::XML::DocumentFragment def parse(tags, options_ = ParseOptions::DEFAULT_XML, options: options_, &block) - new(XML::Document.new, tags, nil, options, &block) + new(XML::Document.new, tags, options: options, &block) end # Wrapper method to separate the concerns of: @@ -27,26 +46,60 @@ def new(document, ...) # :nodoc: end end - ## - # Create a new DocumentFragment from +tags+. + # :call-seq: + # new(document, input=nil) { |options| ... } → DocumentFragment + # new(document, input=nil, context:, options:) → DocumentFragment # - # If +ctx+ is present, it is used as a context node for the - # subtree created, e.g., namespaces will be resolved relative - # to +ctx+. - def initialize(document, tags = nil, ctx = nil, options = ParseOptions::DEFAULT_XML) # rubocop:disable Lint/MissingSuper + # Parse \XML fragment input from a String, and return a new DocumentFragment that is + # associated with the given +document+. + # + # 💡 It's recommended to use either XML::DocumentFragment.parse or Node#parse rather than call + # this method directly. + # + # [Required Parameters] + # - +document+ (XML::Document) The parent document to associate the returned fragment with. + # + # [Optional Parameters] + # - +input+ (String) The content to be parsed. + # + # [Optional Keyword Arguments] + # - +context:+ (Nokogiri::XML::Node) The context node for the subtree created. See + # below for more information. + # + # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some + # behaviors during parsing. See ParseOptions for more information. The default value is + # +ParseOptions::DEFAULT_XML+. + # + # [Yields] + # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which + # can be configured before parsing. See ParseOptions for more information. 
+ # + # [Returns] XML::DocumentFragment + # + # === Context \Node + # + # If a context node is specified using +context:+, then the fragment will be created by + # calling Node#parse on that node, so the parser will behave as if that Node is the parent of + # the fragment subtree, and will resolve namespaces relative to that node. + # + def initialize( + document, tags = nil, + context_ = nil, options_ = ParseOptions::DEFAULT_XML, + context: context_, options: options_ + ) # rubocop:disable Lint/MissingSuper return self unless tags options = Nokogiri::XML::ParseOptions.new(options) if Integer === options @parse_options = options yield options if block_given? - children = if ctx + children = if context # Fix for issue#490 if Nokogiri.jruby? # fix for issue #770 - ctx.parse("#{tags}", options).children + context.parse("#{tags}", options).children else - ctx.parse(tags, options) + context.parse(tags, options) end else wrapper_doc = XML::Document.parse("#{tags}", nil, nil, options) diff --git a/test/xml/test_document_fragment.rb b/test/xml/test_document_fragment.rb index 8be61d062f6..28407a4b2df 100644 --- a/test/xml/test_document_fragment.rb +++ b/test/xml/test_document_fragment.rb @@ -467,6 +467,16 @@ def test_for_libxml_in_context_memory_badness_when_encountering_encoding_errors end end + it "accepts options as kwargs" do + frag = Nokogiri::XML::DocumentFragment.new(Nokogiri::XML::Document.new, input, options: xml_default) + assert_equal("foo", frag.to_html) + refute_empty(frag.errors) + + assert_raises(Nokogiri::SyntaxError) do + Nokogiri::XML::DocumentFragment.new(Nokogiri::XML::Document.new, input, options: xml_strict) + end + end + it "takes a config block" do default_config = nil Nokogiri::XML::DocumentFragment.new(Nokogiri::XML::Document.new, input) do |config| @@ -520,9 +530,9 @@ def test_for_libxml_in_context_memory_badness_when_encountering_encoding_errors Class.new(Nokogiri::XML::DocumentFragment) do attr_accessor :initialized_with, :initialized_count - 
def initialize(*args) + def initialize(*args, **kwargs) super - @initialized_with = args + @initialized_with = [args, kwargs] @initialized_count ||= 0 @initialized_count += 1 end @@ -541,8 +551,11 @@ def initialize(*args) end it "passes args to #initialize" do - fragment = klass.new(xml, "
a
") - assert_equal([xml, "
a
"], fragment.initialized_with) + fragment = klass.new(xml, "
a
", options: ParseOptions::DEFAULT_XML) + assert_equal( + [[xml, "
a
"], { options: ParseOptions::DEFAULT_XML }], + fragment.initialized_with, + ) end end From 373baf8df300ec09af6703bbac2a85d00f4cd616 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 8 Dec 2024 10:27:50 -0500 Subject: [PATCH 08/10] HTML4::Document.parse accepts kwargs Part of #3323 --- lib/nokogiri/html4/document.rb | 67 ++++++++++++++++++++++------------ test/html4/test_document.rb | 33 +++++++++++++++-- 2 files changed, 74 insertions(+), 26 deletions(-) diff --git a/lib/nokogiri/html4/document.rb b/lib/nokogiri/html4/document.rb index 4239affad88..9b291bf9dcf 100644 --- a/lib/nokogiri/html4/document.rb +++ b/lib/nokogiri/html4/document.rb @@ -161,52 +161,73 @@ def xpath_doctype end class << self - ### - # Parse HTML. +string_or_io+ may be a String, or any object that - # responds to _read_ and _close_ such as an IO, or StringIO. - # +url+ is resource where this document is located. +encoding+ is the - # encoding that should be used when processing the document. +options+ - # is a number that sets options in the parser, such as - # Nokogiri::XML::ParseOptions::RECOVER. See the constants in - # Nokogiri::XML::ParseOptions. - def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML) + # :call-seq: + # parse(input) { |options| ... } => Nokogiri::HTML4::Document + # parse(input, url:, encoding:, options:) => Nokogiri::HTML4::Document + # + # Parse \HTML4 input from a String or IO object, and return a new HTML4::Document. + # + # [Required Parameters] + # - +input+ (String | IO) The content to be parsed. + # + # [Optional Keyword Arguments] + # - +url:+ (String) The base URI for this document. + # + # - +encoding:+ (String) The name of the encoding that should be used when processing the + # document. When not provided, the encoding will be determined based on the document + # content. + # + # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some + # behaviors during parsing. 
See ParseOptions for more information. The default value is + # +ParseOptions::DEFAULT_HTML+. + # + # [Yields] + # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which + # can be configured before parsing. See Nokogiri::XML::ParseOptions for more information. + # + # [Returns] Nokogiri::HTML4::Document + def parse( + input, + url_ = nil, encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML, + url: url_, encoding: encoding_, options: options_ + ) options = Nokogiri::XML::ParseOptions.new(options) if Integer === options yield options if block_given? - url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil + url ||= input.respond_to?(:path) ? input.path : nil - if string_or_io.respond_to?(:encoding) - unless string_or_io.encoding == Encoding::ASCII_8BIT - encoding ||= string_or_io.encoding.name + if input.respond_to?(:encoding) + unless input.encoding == Encoding::ASCII_8BIT + encoding ||= input.encoding.name end end - if string_or_io.respond_to?(:read) - if string_or_io.is_a?(Pathname) + if input.respond_to?(:read) + if input.is_a?(Pathname) # resolve the Pathname to the file and open it as an IO object, see #2110 - string_or_io = string_or_io.expand_path.open - url ||= string_or_io.path + input = input.expand_path.open + url ||= input.path end unless encoding - string_or_io = EncodingReader.new(string_or_io) + input = EncodingReader.new(input) begin - return read_io(string_or_io, url, encoding, options.to_i) + return read_io(input, url, encoding, options.to_i) rescue EncodingReader::EncodingFound => e encoding = e.found_encoding end end - return read_io(string_or_io, url, encoding, options.to_i) + return read_io(input, url, encoding, options.to_i) end # read_memory pukes on empty docs - if string_or_io.nil? || string_or_io.empty? + if input.nil? || input.empty? return encoding ? 
new.tap { |i| i.encoding = encoding } : new end - encoding ||= EncodingReader.detect_encoding(string_or_io) + encoding ||= EncodingReader.detect_encoding(input) - read_memory(string_or_io, url, encoding, options.to_i) + read_memory(input, url, encoding, options.to_i) end end end diff --git a/test/html4/test_document.rb b/test/html4/test_document.rb index e0ece6ada23..d218ae26af1 100644 --- a/test/html4/test_document.rb +++ b/test/html4/test_document.rb @@ -65,6 +65,12 @@ def test_document_parse_method_with_url assert_equal("http://foobar.example.com/", doc.url) end + def test_document_parse_method_with_url_kwarg + doc = Nokogiri::HTML4("", url: "http://foobar.example.com/", encoding: "UTF-8") + refute_empty(doc.to_s, "Document should not be empty") + assert_equal("http://foobar.example.com/", doc.url) + end + ### # Nokogiri::HTML4 returns an empty Document when given a blank string GH#11 def test_empty_string_returns_empty_doc @@ -231,7 +237,7 @@ def test_title= def test_meta_encoding_without_head encoding = "EUC-JP" - html = Nokogiri::HTML4("foo", nil, encoding) + html = Nokogiri::HTML4("foo", encoding: encoding) assert_nil(html.meta_encoding) @@ -246,7 +252,7 @@ def test_meta_encoding_without_head def test_html5_meta_encoding_without_head encoding = "EUC-JP" - html = Nokogiri::HTML4("foo", nil, encoding) + html = Nokogiri::HTML4("foo", encoding: encoding) assert_nil(html.meta_encoding) @@ -722,7 +728,7 @@ def test_silencing_nonparse_errors_during_attribute_insertion_1262 html_fragment = <<~HTML Inactive hide details for "User" ---19/05/2015 12:55:29---Provvediamo subito nell’integrare HTML - doc = Nokogiri::HTML4(html_fragment, nil, "ISO-8859-1") + doc = Nokogiri::HTML4(html_fragment, encoding: "ISO-8859-1") html = doc.to_html assert html.index("src=\"images/icon.gif\"") assert_equal "ISO-8859-1", html.encoding.name @@ -815,6 +821,14 @@ def test_silencing_nonparse_errors_during_attribute_insertion_1262 assert_match(/Parser without recover option encountered error 
or warning/, exception.to_s) assert_nil(exception.path) end + + it "raises exception on parse error using kwarg" do + exception = assert_raises(Nokogiri::SyntaxError) do + Nokogiri::HTML4.parse(input, options: parse_options) + end + assert_match(/Parser without recover option encountered error or warning/, exception.to_s) + assert_nil(exception.path) + end end describe "default options" do @@ -838,6 +852,14 @@ def test_silencing_nonparse_errors_during_attribute_insertion_1262 assert_match(/Parser without recover option encountered error or warning/, exception.to_s) assert_nil(exception.path) end + + it "raises exception on parse error using kwargs" do + exception = assert_raises(Nokogiri::SyntaxError) do + Nokogiri::HTML4.parse(input, encoding: "UTF-8", options: parse_options) + end + assert_match(/Parser without recover option encountered error or warning/, exception.to_s) + assert_nil(exception.path) + end end describe "default options" do @@ -845,6 +867,11 @@ def test_silencing_nonparse_errors_during_attribute_insertion_1262 doc = Nokogiri::HTML4.parse(input, nil, "UTF-8") assert_operator(doc.errors.length, :>, 0) end + + it "does not raise exception on parse error using kwarg" do + doc = Nokogiri::HTML4.parse(input, encoding: "UTF-8") + assert_operator(doc.errors.length, :>, 0) + end end end end From 7928194f2468e7aa7660026aa1b1cce41ddac596 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 8 Dec 2024 10:29:20 -0500 Subject: [PATCH 09/10] doc: hack to trick rdoc into processing html4_document.c --- ext/nokogiri/html4_document.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ext/nokogiri/html4_document.c b/ext/nokogiri/html4_document.c index 03b72fb24da..2a308a109f9 100644 --- a/ext/nokogiri/html4_document.c +++ b/ext/nokogiri/html4_document.c @@ -151,6 +151,12 @@ rb_html_document_type(VALUE self) void noko_init_html_document(void) { + /* this is here so that rdoc doesn't ignore this file. 
*/ + /* + mNokogiri = rb_define_module("Nokogiri"); + mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4"); + */ + assert(cNokogiriXmlDocument); cNokogiriHtml4Document = rb_define_class_under(mNokogiriHtml4, "Document", cNokogiriXmlDocument); From ac9fb8a7cf8b0d7d298458dbd13a088cafc81ab0 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 8 Dec 2024 10:32:30 -0500 Subject: [PATCH 10/10] doc: general tidying up of docstrings and adding some TODOs --- ext/nokogiri/html4_document.c | 4 ++-- ext/nokogiri/nokogiri.c | 4 ++-- ext/nokogiri/xml_document.c | 12 +++++----- lib/nokogiri/html5.rb | 10 ++++----- lib/nokogiri/html5/document.rb | 29 ++++++++++++++++++------ lib/nokogiri/xml/document.rb | 41 +++++++++++++++++----------------- 6 files changed, 59 insertions(+), 41 deletions(-) diff --git a/ext/nokogiri/html4_document.c b/ext/nokogiri/html4_document.c index 2a308a109f9..e3e0ee0847f 100644 --- a/ext/nokogiri/html4_document.c +++ b/ext/nokogiri/html4_document.c @@ -7,9 +7,9 @@ static ID id_to_s; /* * call-seq: - * new + * new(uri=nil, external_id=nil) → HTML4::Document * - * Create a new document + * Create a new empty document with base URI +uri+ and external ID +external_id+. 
*/ static VALUE rb_html_document_s_new(int argc, VALUE *argv, VALUE klass) diff --git a/ext/nokogiri/nokogiri.c b/ext/nokogiri/nokogiri.c index 66c40bda263..a43813b9ceb 100644 --- a/ext/nokogiri/nokogiri.c +++ b/ext/nokogiri/nokogiri.c @@ -185,8 +185,8 @@ Init_nokogiri(void) { mNokogiri = rb_define_module("Nokogiri"); mNokogiriGumbo = rb_define_module_under(mNokogiri, "Gumbo"); - mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4"); - mNokogiriHtml4Sax = rb_define_module_under(mNokogiriHtml4, "SAX"); + mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4"); + mNokogiriHtml4Sax = rb_define_module_under(mNokogiriHtml4, "SAX"); mNokogiriHtml5 = rb_define_module_under(mNokogiri, "HTML5"); mNokogiriXml = rb_define_module_under(mNokogiri, "XML"); mNokogiriXmlSax = rb_define_module_under(mNokogiriXml, "SAX"); diff --git a/ext/nokogiri/xml_document.c b/ext/nokogiri/xml_document.c index e1022f67f5f..740819300b5 100644 --- a/ext/nokogiri/xml_document.c +++ b/ext/nokogiri/xml_document.c @@ -370,6 +370,8 @@ noko_xml_document_s_read_io(VALUE rb_class, VALUE rb_encoding, VALUE rb_options) { + /* TODO: deprecate this method, parse should be the preferred entry point. then we can make this + private. */ libxmlStructuredErrorHandlerState handler_state; VALUE rb_errors = rb_ary_new(); @@ -417,6 +419,8 @@ noko_xml_document_s_read_memory(VALUE rb_class, VALUE rb_encoding, VALUE rb_options) { + /* TODO: deprecate this method, parse should be the preferred entry point. then we can make this + private. */ VALUE rb_errors = rb_ary_new(); xmlSetStructuredErrorFunc((void *)rb_errors, noko__error_array_pusher); @@ -444,9 +448,9 @@ noko_xml_document_s_read_memory(VALUE rb_class, /* * call-seq: - * new(version = default) + * new(version = "1.0") * - * Create a new document with +version+ (defaults to "1.0") + * Create a new empty document declaring XML version +version+. 
*/ static VALUE new (int argc, VALUE *argv, VALUE klass) @@ -756,9 +760,7 @@ void noko_init_xml_document(void) { assert(cNokogiriXmlNode); - /* - * Nokogiri::XML::Document wraps an xml document. - */ + cNokogiriXmlDocument = rb_define_class_under(mNokogiriXml, "Document", cNokogiriXmlNode); rb_define_alloc_func(cNokogiriXmlDocument, _xml_document_alloc); diff --git a/lib/nokogiri/html5.rb b/lib/nokogiri/html5.rb index 9ca26db494a..5566e058ce0 100644 --- a/lib/nokogiri/html5.rb +++ b/lib/nokogiri/html5.rb @@ -46,11 +46,11 @@ def self.HTML5(...) # The document and fragment parsing methods support options that are different from # Nokogiri::HTML4::Document or Nokogiri::XML::Document. # - # - Nokogiri.HTML5(html, url:, encoding:, **parse_options) - # - Nokogiri::HTML5.parse(html, url:, encoding:, **parse_options) - # - Nokogiri::HTML5::Document.parse(html, url:, encoding:, **parse_options) - # - Nokogiri::HTML5.fragment(html, encoding = nil, **parse_options) - # - Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **parse_options) + # - Nokogiri.HTML5(input, url:, encoding:, **parse_options) + # - Nokogiri::HTML5.parse(input, url:, encoding:, **parse_options) + # - Nokogiri::HTML5::Document.parse(input, url:, encoding:, **parse_options) + # - Nokogiri::HTML5.fragment(input, encoding:, **parse_options) + # - Nokogiri::HTML5::DocumentFragment.parse(input, encoding:, **parse_options) # # The four currently supported parse options are # diff --git a/lib/nokogiri/html5/document.rb b/lib/nokogiri/html5/document.rb index 51b77c23b36..8bb4f10d28c 100644 --- a/lib/nokogiri/html5/document.rb +++ b/lib/nokogiri/html5/document.rb @@ -50,8 +50,9 @@ class Document < Nokogiri::HTML4::Document class << self # :call-seq: - # parse(input) { |parse_options| ... } - # parse(input, url:, encoding:, **parse_options) + # parse(input) { |options| ... } → HTML5::Document + # parse(input, url:, encoding:) { |options| ... 
} → HTML5::Document + # parse(input, **options) → HTML5::Document # # Parse \HTML input with a parser compliant with the HTML5 spec. This method uses the # encoding of +input+ if it can be determined, or else falls back to the +encoding:+ @@ -62,11 +63,25 @@ class << self # # [Optional Parameters] # - +url:+ (String) the base URI of the document. - # - +encoding+ (Encoding) The encoding that should be used when processing the - # document. This option is only used as a fallback when the encoding of +input+ cannot be - # determined. - # - +parse_options+ (Hash) represents keywords arguments that control the behavior of the - # parser. See rdoc-ref:HTML5@Parsing+options for a list of available options. + # + # [Optional Keyword Arguments] + # - +encoding:+ (Encoding) The name of the encoding that should be used when processing the + # document. When not provided, the encoding will be determined based on the document + # content. + # + # - +max_errors:+ (Integer) The maximum number of parse errors to record. (default + # +Nokogiri::Gumbo::DEFAULT_MAX_ERRORS+ which is currently 0) + # + # - +max_tree_depth:+ (Integer) The maximum depth of the parse tree. (default + # +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+) + # + # - +max_attributes:+ (Integer) The maximum number of attributes allowed on an + # element. (default +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+) + # + # - +parse_noscript_content_as_text:+ (Boolean) Whether to parse the content of +noscript+ + # elements as text. (default +false+) + # + # See rdoc-ref:HTML5@Parsing+options for a complete description of these parsing options. 
# # [Yields] # If present, the block will be passed a Hash object to modify with parse options before the diff --git a/lib/nokogiri/xml/document.rb b/lib/nokogiri/xml/document.rb index 2ef98c9dd47..6c9d4949a12 100644 --- a/lib/nokogiri/xml/document.rb +++ b/lib/nokogiri/xml/document.rb @@ -5,12 +5,12 @@ module Nokogiri module XML - # Nokogiri::XML::Document is the main entry point for dealing with XML documents. The Document - # is created by parsing an XML document. See Nokogiri::XML::Document.parse for more information - # on parsing. + # Nokogiri::XML::Document is the main entry point for dealing with \XML documents. The Document + # is created by parsing \XML content from a String or an IO object. See + # Nokogiri::XML::Document.parse for more information on parsing. # - # For searching a Document, see Nokogiri::XML::Searchable#css and - # Nokogiri::XML::Searchable#xpath + # Document inherits a great deal of functionality from its superclass Nokogiri::XML::Node, so + # please read that class's documentation as well. class Document < Nokogiri::XML::Node # See http://www.w3.org/TR/REC-xml-names/#ns-decl for more details. Note that we're not # attempting to handle unicode characters partly because libxml2 doesn't handle unicode @@ -25,34 +25,34 @@ class Document < Nokogiri::XML::Node class << self # call-seq: - # parse(input, url: nil, encoding: nil, options: DEFAULT_XML) { |options| } => Nokogiri::XML::Document + # parse(input) { |options| ... } => Nokogiri::XML::Document + # parse(input, url:, encoding:, options:) => Nokogiri::XML::Document # - # Parse XML input from a String or IO object, and return a new Document object. + # Parse \XML input from a String or IO object, and return a new XML::Document. # - # By default, Nokogiri treats documents as untrusted, and so does not attempt to load DTDs + # 🛡 By default, Nokogiri treats documents as untrusted, and so does not attempt to load DTDs # or access the network. 
See Nokogiri::XML::ParseOptions for a complete list of options; and # that module's DEFAULT_XML constant for what's set (and not set) by default. # - # See also: Nokogiri.XML() which is a convenience method which will call this method. + # [Required Parameters] + # - +input+ (String | IO) The content to be parsed. # - # [Parameters] - # - +input+ (String, IO) The content to be parsed. - # - # [Keyword arguments] - # - +url:+ (String) The URI where this document is located. + # [Optional Keyword Arguments] + # - +url:+ (String) The base URI for this document. # # - +encoding:+ (String) The name of the encoding that should be used when processing the - # document. (default +nil+ means that the encoding will be determined based on the - # document content) + # document. When not provided, the encoding will be determined based on the document + # content. # - # - +options+ (Nokogiri::XML::ParseOptions) Configuration object that determines some - # behaviors during parsing, such as Nokogiri::XML::ParseOptions::RECOVER. See the - # Nokogiri::XML::ParseOptions for more information. + # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some + # behaviors during parsing. See ParseOptions for more information. The default value is + # +ParseOptions::DEFAULT_XML+. # # [Yields] # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which - # can be configured before parsing. See Nokogiri::XML::ParseOptions for more information. + # can be configured before parsing. See Nokogiri::XML::ParseOptions for more information. # + # [Returns] Nokogiri::XML::Document def parse( string_or_io, url_ = nil, encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_XML, @@ -72,6 +72,7 @@ def parse( end doc = if string_or_io.respond_to?(:read) + # TODO: should we instead check for respond_to?(:to_path) ? 
if string_or_io.is_a?(Pathname) # resolve the Pathname to the file and open it as an IO object, see #2110 string_or_io = string_or_io.expand_path.open