diff --git a/ext/nokogiri/extconf.rb b/ext/nokogiri/extconf.rb index 9f9ee8dd0a..3f2de4ddfc 100644 --- a/ext/nokogiri/extconf.rb +++ b/ext/nokogiri/extconf.rb @@ -1116,6 +1116,8 @@ def compile ensure_func("gumbo_parse_with_options", "nokogiri_gumbo.h") end +have_func("xmlCtxtSetOptions") # introduced in libxml2 2.13.0 +have_func("xmlCtxtGetOptions") # introduced in libxml2 2.14.0 have_func("rb_category_warning") # introduced in Ruby 3.0 other_library_versions_string = OTHER_LIBRARY_VERSIONS.map { |k, v| [k, v].join(":") }.join(",") diff --git a/ext/nokogiri/html4_sax_parser.c b/ext/nokogiri/html4_sax_parser.c new file mode 100644 index 0000000000..9dba3eb8d4 --- /dev/null +++ b/ext/nokogiri/html4_sax_parser.c @@ -0,0 +1,36 @@ +#include <nokogiri.h> + +VALUE cNokogiriHtml4SaxParser; + +static ID id_start_document; + +static void +noko_html4_sax_parser_start_document(void *ctx) +{ + VALUE self = NOKOGIRI_SAX_SELF(ctx); + VALUE doc = rb_iv_get(self, "@document"); + + rb_funcall(doc, id_start_document, 0); +} + +static VALUE +noko_html4_sax_parser_initialize(VALUE self) +{ + xmlSAXHandlerPtr handler = noko_xml_sax_parser_unwrap(self); + + rb_call_super(0, NULL); + + handler->startDocument = noko_html4_sax_parser_start_document; + + return self; +} + +void +noko_init_html4_sax_parser(void) +{ + cNokogiriHtml4SaxParser = rb_define_class_under(mNokogiriHtml4Sax, "Parser", cNokogiriXmlSaxParser); + + rb_define_private_method(cNokogiriHtml4SaxParser, "initialize_native", noko_html4_sax_parser_initialize, 0); + + id_start_document = rb_intern("start_document"); +} diff --git a/ext/nokogiri/html4_sax_parser_context.c b/ext/nokogiri/html4_sax_parser_context.c index 4a5293d0ec..38f102590a 100644 --- a/ext/nokogiri/html4_sax_parser_context.c +++ b/ext/nokogiri/html4_sax_parser_context.c @@ -82,7 +82,7 @@ parse_with(VALUE self, VALUE sax_handler) } ctxt = noko_xml_sax_parser_context_unwrap(self); - sax = noko_sax_handler_unwrap(sax_handler); + sax = noko_xml_sax_parser_unwrap(sax_handler);
ctxt->sax = sax; ctxt->userData = (void *)NOKOGIRI_SAX_TUPLE_NEW(ctxt, sax_handler); diff --git a/ext/nokogiri/html4_sax_push_parser.c b/ext/nokogiri/html4_sax_push_parser.c index 6955c0dcdb..cab3d31756 100644 --- a/ext/nokogiri/html4_sax_push_parser.c +++ b/ext/nokogiri/html4_sax_push_parser.c @@ -30,7 +30,7 @@ native_write(VALUE self, VALUE _chunk, VALUE _last_chunk) Nokogiri_structured_error_func_restore(&handler_state); - if ((status != 0) && !(ctx->options & XML_PARSE_RECOVER)) { + if ((status != 0) && !(xmlCtxtGetOptions(ctx) & XML_PARSE_RECOVER)) { // TODO: there appear to be no tests for this block xmlErrorConstPtr e = xmlCtxtGetLastError(ctx); Nokogiri_error_raise(NULL, e); @@ -54,7 +54,7 @@ initialize_native(VALUE self, VALUE _xml_sax, VALUE _filename, htmlParserCtxtPtr ctx; xmlCharEncoding enc = XML_CHAR_ENCODING_NONE; - sax = noko_sax_handler_unwrap(_xml_sax); + sax = noko_xml_sax_parser_unwrap(_xml_sax); if (_filename != Qnil) { filename = StringValueCStr(_filename); } @@ -79,7 +79,6 @@ initialize_native(VALUE self, VALUE _xml_sax, VALUE _filename, ctx->userData = NOKOGIRI_SAX_TUPLE_NEW(ctx, self); - ctx->sax2 = 1; DATA_PTR(self) = ctx; return self; } diff --git a/ext/nokogiri/libxml2_polyfill.c b/ext/nokogiri/libxml2_polyfill.c new file mode 100644 index 0000000000..70ed189e7c --- /dev/null +++ b/ext/nokogiri/libxml2_polyfill.c @@ -0,0 +1,97 @@ +#include <nokogiri.h> + +#ifndef HAVE_XMLCTXTSETOPTIONS +/* based on libxml2-2.14.0-dev (1d8bd126) parser.c xmlCtxtSetInternalOptions */ +int +xmlCtxtSetOptions(xmlParserCtxtPtr ctxt, int options) +{ + int keepMask = 0; + int allMask; + + if (ctxt == NULL) { + return (-1); + } + + /* + * XInclude options aren't handled by the parser.
+ * + * XML_PARSE_XINCLUDE + * XML_PARSE_NOXINCNODE + * XML_PARSE_NOBASEFIX + */ + allMask = XML_PARSE_RECOVER | + XML_PARSE_NOENT | + XML_PARSE_DTDLOAD | + XML_PARSE_DTDATTR | + XML_PARSE_DTDVALID | + XML_PARSE_NOERROR | + XML_PARSE_NOWARNING | + XML_PARSE_PEDANTIC | + XML_PARSE_NOBLANKS | +#ifdef LIBXML_SAX1_ENABLED + XML_PARSE_SAX1 | +#endif + XML_PARSE_NONET | + XML_PARSE_NODICT | + XML_PARSE_NSCLEAN | + XML_PARSE_NOCDATA | + XML_PARSE_COMPACT | + XML_PARSE_OLD10 | + XML_PARSE_HUGE | + XML_PARSE_OLDSAX | + XML_PARSE_IGNORE_ENC | + XML_PARSE_BIG_LINES; + + ctxt->options = (ctxt->options & keepMask) | (options & allMask); + + /* + * For some options, struct members are historically the source + * of truth. The values are initialized from global variables and + * old code could also modify them directly. Several older API + * functions that don't take an options argument rely on these + * deprecated mechanisms. + * + * Once public access to struct members and the globals is + * disabled, we can use the options bitmask as source of + * truth, making all these struct members obsolete. + * + * The XML_DETECT_IDS flag is misnamed. It simply enables + * loading of the external subset. + */ + ctxt->recovery = (options & XML_PARSE_RECOVER) ? 1 : 0; + ctxt->replaceEntities = (options & XML_PARSE_NOENT) ? 1 : 0; + ctxt->loadsubset = (options & XML_PARSE_DTDLOAD) ? XML_DETECT_IDS : 0; + ctxt->loadsubset |= (options & XML_PARSE_DTDATTR) ? XML_COMPLETE_ATTRS : 0; + ctxt->validate = (options & XML_PARSE_DTDVALID) ? 1 : 0; + ctxt->pedantic = (options & XML_PARSE_PEDANTIC) ? 1 : 0; + ctxt->keepBlanks = (options & XML_PARSE_NOBLANKS) ? 0 : 1; + ctxt->dictNames = (options & XML_PARSE_NODICT) ? 0 : 1; + + /* + * Changing SAX callbacks is a bad idea. This should be fixed.
+ */ + if (options & XML_PARSE_NOBLANKS) { + ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; + } + if (options & XML_PARSE_NOCDATA) { + ctxt->sax->cdataBlock = NULL; + } + if (options & XML_PARSE_HUGE) { + if (ctxt->dict != NULL) { + xmlDictSetLimit(ctxt->dict, 0); + } + } + + ctxt->linenumbers = 1; + + return (options & ~allMask); +} +#endif + +#ifndef HAVE_XMLCTXTGETOPTIONS +int +xmlCtxtGetOptions(xmlParserCtxtPtr ctxt) +{ + return (ctxt->options); +} +#endif diff --git a/ext/nokogiri/nokogiri.c b/ext/nokogiri/nokogiri.c index ea078f0964..d97413e5c2 100644 --- a/ext/nokogiri/nokogiri.c +++ b/ext/nokogiri/nokogiri.c @@ -46,6 +46,7 @@ void noko_init_html_element_description(void); void noko_init_html_entity_lookup(void); void noko_init_html_sax_parser_context(void); void noko_init_html_sax_push_parser(void); +void noko_init_html4_sax_parser(void); void noko_init_gumbo(void); void noko_init_test_global_handlers(void); @@ -244,7 +245,10 @@ Init_nokogiri(void) noko_init_xml_namespace(); noko_init_xml_node_set(); noko_init_xml_reader(); + noko_init_xml_sax_parser(); + noko_init_html4_sax_parser(); + noko_init_xml_xpath_context(); noko_init_xslt_stylesheet(); noko_init_html_element_description(); diff --git a/ext/nokogiri/nokogiri.h b/ext/nokogiri/nokogiri.h index 7a2e070af8..95e3ab366f 100644 --- a/ext/nokogiri/nokogiri.h +++ b/ext/nokogiri/nokogiri.h @@ -56,6 +56,14 @@ #include +/* libxml2_polyfill.c */ +#ifndef HAVE_XMLCTXTSETOPTIONS +int xmlCtxtSetOptions(xmlParserCtxtPtr ctxt, int options); +#endif +#ifndef HAVE_XMLCTXTGETOPTIONS +int xmlCtxtGetOptions(xmlParserCtxtPtr ctxt); +#endif + #define XMLNS_PREFIX "xmlns" #define XMLNS_PREFIX_LEN 6 /* including either colon or \0 */ @@ -141,6 +149,7 @@ NOKOPUBVAR VALUE cNokogiriXsltStylesheet ; NOKOPUBVAR VALUE cNokogiriHtml4Document ; NOKOPUBVAR VALUE cNokogiriHtml4SaxPushParser ; NOKOPUBVAR VALUE cNokogiriHtml4ElementDescription ; +NOKOPUBVAR VALUE cNokogiriHtml4SaxParser; NOKOPUBVAR VALUE 
cNokogiriHtml4SaxParserContext; NOKOPUBVAR VALUE cNokogiriHtml5Document ; @@ -196,7 +205,7 @@ xmlDocPtr noko_xml_document_unwrap(VALUE rb_document); NOKOPUBFUN VALUE Nokogiri_wrap_xml_document(VALUE klass, xmlDocPtr doc); /* deprecated. use noko_xml_document_wrap() instead. */ -xmlSAXHandlerPtr noko_sax_handler_unwrap(VALUE rb_sax_handler); +xmlSAXHandlerPtr noko_xml_sax_parser_unwrap(VALUE rb_sax_handler); xmlParserCtxtPtr noko_xml_sax_push_parser_unwrap(VALUE rb_parser); diff --git a/ext/nokogiri/xml_node.c b/ext/nokogiri/xml_node.c index 0c59ae49e4..5c5e4e9c19 100644 --- a/ext/nokogiri/xml_node.c +++ b/ext/nokogiri/xml_node.c @@ -2081,7 +2081,7 @@ dump_html(VALUE self) buf = xmlBufferCreate() ; htmlNodeDump(buf, node->doc, node); - html = NOKOGIRI_STR_NEW2(buf->content); + html = NOKOGIRI_STR_NEW2(xmlBufferContent(buf)); xmlBufferFree(buf); return html ; } diff --git a/ext/nokogiri/xml_sax_parser.c b/ext/nokogiri/xml_sax_parser.c index 989ad9eb30..5e84be5661 100644 --- a/ext/nokogiri/xml_sax_parser.c +++ b/ext/nokogiri/xml_sax_parser.c @@ -16,30 +16,28 @@ start_document(void *ctx) xmlParserCtxtPtr ctxt = NOKOGIRI_SAX_CTXT(ctx); - if (NULL != ctxt && ctxt->html != 1) { - if (ctxt->standalone != -1) { /* -1 means there was no declaration */ - VALUE encoding = Qnil ; - VALUE standalone = Qnil; - VALUE version; - if (ctxt->encoding) { - encoding = NOKOGIRI_STR_NEW2(ctxt->encoding) ; - } else if (ctxt->input && ctxt->input->encoding) { - encoding = NOKOGIRI_STR_NEW2(ctxt->input->encoding) ; - } - - version = ctxt->version ? 
NOKOGIRI_STR_NEW2(ctxt->version) : Qnil; - - switch (ctxt->standalone) { - case 0: - standalone = NOKOGIRI_STR_NEW2("no"); - break; - case 1: - standalone = NOKOGIRI_STR_NEW2("yes"); - break; - } - - rb_funcall(doc, id_xmldecl, 3, version, encoding, standalone); + if (ctxt->standalone != -1) { /* -1 means there was no declaration */ + VALUE encoding = Qnil ; + VALUE standalone = Qnil; + VALUE version; + if (ctxt->encoding) { + encoding = NOKOGIRI_STR_NEW2(ctxt->encoding) ; + } else if (ctxt->input && ctxt->input->encoding) { + encoding = NOKOGIRI_STR_NEW2(ctxt->input->encoding) ; } + + version = ctxt->version ? NOKOGIRI_STR_NEW2(ctxt->version) : Qnil; + + switch (ctxt->standalone) { + case 0: + standalone = NOKOGIRI_STR_NEW2("no"); + break; + case 1: + standalone = NOKOGIRI_STR_NEW2("yes"); + break; + } + + rb_funcall(doc, id_xmldecl, 3, version, encoding, standalone); } rb_funcall(doc, id_start_document, 0); @@ -282,10 +280,9 @@ static const rb_data_type_t xml_sax_handler_type = { }; static VALUE -allocate(VALUE klass) +noko_xml_sax_parser_initialize(VALUE self) { - xmlSAXHandlerPtr handler; - VALUE self = TypedData_Make_Struct(klass, xmlSAXHandler, &xml_sax_handler_type, handler); + xmlSAXHandlerPtr handler = noko_xml_sax_parser_unwrap(self); handler->startDocument = start_document; handler->endDocument = end_document; @@ -304,8 +301,15 @@ allocate(VALUE klass) return self; } +static VALUE +noko_xml_sax_parser_allocate(VALUE klass) +{ + xmlSAXHandlerPtr handler; + return TypedData_Make_Struct(klass, xmlSAXHandler, &xml_sax_handler_type, handler); +} + xmlSAXHandlerPtr -noko_sax_handler_unwrap(VALUE rb_sax_handler) +noko_xml_sax_parser_unwrap(VALUE rb_sax_handler) { xmlSAXHandlerPtr c_sax_handler; TypedData_Get_Struct(rb_sax_handler, xmlSAXHandler, &xml_sax_handler_type, c_sax_handler); @@ -317,7 +321,9 @@ noko_init_xml_sax_parser(void) { cNokogiriXmlSaxParser = rb_define_class_under(mNokogiriXmlSax, "Parser", rb_cObject); - 
rb_define_alloc_func(cNokogiriXmlSaxParser, allocate); + rb_define_alloc_func(cNokogiriXmlSaxParser, noko_xml_sax_parser_allocate); + + rb_define_private_method(cNokogiriXmlSaxParser, "initialize_native", noko_xml_sax_parser_initialize, 0); id_start_document = rb_intern("start_document"); id_end_document = rb_intern("end_document"); diff --git a/ext/nokogiri/xml_sax_parser_context.c b/ext/nokogiri/xml_sax_parser_context.c index 87fd9f50d9..61468c48cb 100644 --- a/ext/nokogiri/xml_sax_parser_context.c +++ b/ext/nokogiri/xml_sax_parser_context.c @@ -155,7 +155,7 @@ parse_with(VALUE self, VALUE sax_handler) } ctxt = noko_xml_sax_parser_context_unwrap(self); - sax = noko_sax_handler_unwrap(sax_handler); + sax = noko_xml_sax_parser_unwrap(sax_handler); ctxt->sax = sax; ctxt->userData = (void *)NOKOGIRI_SAX_TUPLE_NEW(ctxt, sax_handler); @@ -177,12 +177,17 @@ parse_with(VALUE self, VALUE sax_handler) static VALUE set_replace_entities(VALUE self, VALUE value) { + int error; xmlParserCtxtPtr ctxt = noko_xml_sax_parser_context_unwrap(self); - if (Qfalse == value) { - ctxt->replaceEntities = 0; + if (RB_TEST(value)) { + error = xmlCtxtSetOptions(ctxt, xmlCtxtGetOptions(ctxt) | XML_PARSE_NOENT); } else { - ctxt->replaceEntities = 1; + error = xmlCtxtSetOptions(ctxt, xmlCtxtGetOptions(ctxt) & ~XML_PARSE_NOENT); + } + + if (error) { + rb_raise(rb_eRuntimeError, "failed to set parser context options (%x)", error); } return value; @@ -200,10 +205,10 @@ get_replace_entities(VALUE self) { xmlParserCtxtPtr ctxt = noko_xml_sax_parser_context_unwrap(self); - if (0 == ctxt->replaceEntities) { - return Qfalse; - } else { + if (xmlCtxtGetOptions(ctxt) & XML_PARSE_NOENT) { return Qtrue; + } else { + return Qfalse; } } @@ -255,12 +260,17 @@ column(VALUE self) static VALUE set_recovery(VALUE self, VALUE value) { + int error; xmlParserCtxtPtr ctxt = noko_xml_sax_parser_context_unwrap(self); - if (value == Qfalse) { - ctxt->recovery = 0; + if (RB_TEST(value)) { + error = 
xmlCtxtSetOptions(ctxt, xmlCtxtGetOptions(ctxt) | XML_PARSE_RECOVER); } else { - ctxt->recovery = 1; + error = xmlCtxtSetOptions(ctxt, xmlCtxtGetOptions(ctxt) & ~XML_PARSE_RECOVER); + } + + if (error) { + rb_raise(rb_eRuntimeError, "failed to set parser context options (%x)", error); } return value; @@ -278,10 +288,10 @@ get_recovery(VALUE self) { xmlParserCtxtPtr ctxt = noko_xml_sax_parser_context_unwrap(self); - if (ctxt->recovery == 0) { - return Qfalse; - } else { + if (xmlCtxtGetOptions(ctxt) & XML_PARSE_RECOVER) { return Qtrue; + } else { + return Qfalse; } } diff --git a/ext/nokogiri/xml_sax_push_parser.c b/ext/nokogiri/xml_sax_push_parser.c index b23f278791..dd131094e5 100644 --- a/ext/nokogiri/xml_sax_push_parser.c +++ b/ext/nokogiri/xml_sax_push_parser.c @@ -58,7 +58,7 @@ native_write(VALUE self, VALUE _chunk, VALUE _last_chunk) xmlSetStructuredErrorFunc(NULL, NULL); if (xmlParseChunk(ctx, chunk, size, Qtrue == _last_chunk ? 1 : 0)) { - if (!(ctx->options & XML_PARSE_RECOVER)) { + if (!(xmlCtxtGetOptions(ctx) & XML_PARSE_RECOVER)) { xmlErrorConstPtr e = xmlCtxtGetLastError(ctx); Nokogiri_error_raise(NULL, e); } @@ -80,7 +80,7 @@ initialize_native(VALUE self, VALUE _xml_sax, VALUE _filename) const char *filename = NULL; xmlParserCtxtPtr ctx; - sax = noko_sax_handler_unwrap(_xml_sax); + sax = noko_xml_sax_parser_unwrap(_xml_sax); if (_filename != Qnil) { filename = StringValueCStr(_filename); } @@ -97,7 +97,6 @@ initialize_native(VALUE self, VALUE _xml_sax, VALUE _filename) ctx->userData = NOKOGIRI_SAX_TUPLE_NEW(ctx, self); - ctx->sax2 = 1; DATA_PTR(self) = ctx; return self; } @@ -109,18 +108,20 @@ get_options(VALUE self) ctx = noko_xml_sax_push_parser_unwrap(self); - return INT2NUM(ctx->options); + return INT2NUM(xmlCtxtGetOptions(ctx)); } static VALUE set_options(VALUE self, VALUE options) { + int error; xmlParserCtxtPtr ctx; ctx = noko_xml_sax_push_parser_unwrap(self); - if (xmlCtxtUseOptions(ctx, (int)NUM2INT(options)) != 0) { - 
rb_raise(rb_eRuntimeError, "Cannot set XML parser context options"); + error = xmlCtxtSetOptions(ctx, (int)NUM2INT(options)); + if (error) { + rb_raise(rb_eRuntimeError, "Cannot set XML parser context options (%x)", error); } return Qnil; @@ -136,14 +137,12 @@ set_options(VALUE self, VALUE options) static VALUE get_replace_entities(VALUE self) { - xmlParserCtxtPtr ctx; - - ctx = noko_xml_sax_push_parser_unwrap(self); + xmlParserCtxtPtr ctxt = noko_xml_sax_push_parser_unwrap(self); - if (0 == ctx->replaceEntities) { - return Qfalse; - } else { + if (xmlCtxtGetOptions(ctxt) & XML_PARSE_NOENT) { return Qtrue; + } else { + return Qfalse; } } @@ -157,14 +156,17 @@ get_replace_entities(VALUE self) static VALUE set_replace_entities(VALUE self, VALUE value) { - xmlParserCtxtPtr ctx; - - ctx = noko_xml_sax_push_parser_unwrap(self); + int error; + xmlParserCtxtPtr ctxt = noko_xml_sax_push_parser_unwrap(self); - if (Qfalse == value) { - ctx->replaceEntities = 0; + if (RB_TEST(value)) { + error = xmlCtxtSetOptions(ctxt, xmlCtxtGetOptions(ctxt) | XML_PARSE_NOENT); } else { - ctx->replaceEntities = 1; + error = xmlCtxtSetOptions(ctxt, xmlCtxtGetOptions(ctxt) & ~XML_PARSE_NOENT); + } + + if (error) { + rb_raise(rb_eRuntimeError, "failed to set parser context options (%x)", error); } return value; diff --git a/lib/nokogiri/xml/sax/parser.rb b/lib/nokogiri/xml/sax/parser.rb index 289aabe318..c9447f6496 100644 --- a/lib/nokogiri/xml/sax/parser.rb +++ b/lib/nokogiri/xml/sax/parser.rb @@ -73,6 +73,8 @@ def initialize(doc = Nokogiri::XML::SAX::Document.new, encoding = "UTF-8") @encoding = check_encoding(encoding) @document = doc @warned = false + + initialize_native unless Nokogiri.jruby? 
end ### diff --git a/nokogiri.gemspec b/nokogiri.gemspec index f4f3941b4e..faa875ecaa 100644 --- a/nokogiri.gemspec +++ b/nokogiri.gemspec @@ -150,8 +150,10 @@ Gem::Specification.new do |spec| "ext/nokogiri/html4_document.c", "ext/nokogiri/html4_element_description.c", "ext/nokogiri/html4_entity_lookup.c", + "ext/nokogiri/html4_sax_parser.c", "ext/nokogiri/html4_sax_parser_context.c", "ext/nokogiri/html4_sax_push_parser.c", + "ext/nokogiri/libxml2_polyfill.c", "ext/nokogiri/nokogiri.c", "ext/nokogiri/nokogiri.h", "ext/nokogiri/xml_attr.c", diff --git a/test/xml/sax/test_parser.rb b/test/xml/sax/test_parser.rb index 8551a3469c..832ab05e2e 100644 --- a/test/xml/sax/test_parser.rb +++ b/test/xml/sax/test_parser.rb @@ -49,15 +49,15 @@ class TestCase assert_nil(parser.document.xmldecls) end - it :test_xml_decl do - [ - ['', ["1.0"]], - ['', ["1.0", "UTF-8"]], - ['', ["1.0", "yes"]], - ['', ["1.0", "no"]], - ['', ["1.0", "UTF-8", "no"]], - ['', ["1.0", "ISO-8859-1", "yes"]], - ].each do |decl, value| + [ + ['', ["1.0"]], + ['', ["1.0", "UTF-8"]], + ['', ["1.0", "yes"]], + ['', ["1.0", "no"]], + ['', ["1.0", "UTF-8", "no"]], + ['', ["1.0", "ISO-8859-1", "yes"]], + ].each do |decl, value| + it "parses xml decl '#{decl}'" do parser = Nokogiri::XML::SAX::Parser.new(Doc.new) xml = "#{decl}\n"