Skip to content

Commit

Permalink
libxml 2.14-dev deprecations (#3241)
Browse files Browse the repository at this point in the history
**What problem is this PR intended to solve?**

Upstream has started to deprecate some struct members that we're
accessing directly. Let's start addressing some of those issues.

I'm also introducing some polyfills for libxml2 so we can always use the
latest and greatest API functions.


**Have you included adequate test coverage?**

Refactoring. Existing coverage is adequate.


**Does this change affect the behavior of either the C or the Java
implementations?**

Refactoring. No behavior changes.

**Related upstream PRs**

- [parser: implement xmlCtxtGetOptions (!262) · Merge requests · GNOME /
libxml2 ·
GitLab](https://gitlab.gnome.org/GNOME/libxml2/-/merge_requests/262)
  • Loading branch information
flavorjones authored Jun 21, 2024
2 parents 5ca284d + 3667d7a commit d981833
Show file tree
Hide file tree
Showing 14 changed files with 243 additions and 74 deletions.
2 changes: 2 additions & 0 deletions ext/nokogiri/extconf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1116,6 +1116,8 @@ def compile
ensure_func("gumbo_parse_with_options", "nokogiri_gumbo.h")
end

have_func("xmlCtxtSetOptions") # introduced in libxml2 2.13.0
have_func("xmlCtxtGetOptions") # introduced in libxml2 2.14.0
have_func("rb_category_warning") # introduced in Ruby 3.0

other_library_versions_string = OTHER_LIBRARY_VERSIONS.map { |k, v| [k, v].join(":") }.join(",")
Expand Down
36 changes: 36 additions & 0 deletions ext/nokogiri/html4_sax_parser.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include <nokogiri.h>

VALUE cNokogiriHtml4SaxParser;

static ID id_start_document;

/*
 * SAX startDocument callback installed for HTML4 parsers.
 *
 * Looks up the Ruby parser object associated with +ctx+, reads its
 * @document instance variable, and invokes #start_document on it with
 * no arguments.
 */
static void
noko_html4_sax_parser_start_document(void *ctx)
{
  VALUE rb_parser = NOKOGIRI_SAX_SELF(ctx);
  VALUE rb_document = rb_iv_get(rb_parser, "@document");

  rb_funcall(rb_document, id_start_document, 0);
}

/*
 * Native half of the HTML4 SAX parser initializer (bound to the private
 * Ruby method "initialize_native").
 *
 * Unwraps the xmlSAXHandler held by +self+, invokes the superclass
 * implementation with no arguments, and afterwards points the handler's
 * startDocument callback at the HTML4-specific implementation. The
 * override is applied after the super call so that it is the last write
 * to that slot made by this function.
 *
 * Returns +self+.
 */
static VALUE
noko_html4_sax_parser_initialize(VALUE self)
{
  xmlSAXHandlerPtr sax_handler;

  sax_handler = noko_xml_sax_parser_unwrap(self);

  rb_call_super(0, NULL);

  sax_handler->startDocument = noko_html4_sax_parser_start_document;

  return self;
}

/*
 * Registers the "Parser" class under mNokogiriHtml4Sax as a subclass of
 * cNokogiriXmlSaxParser, binds its private "initialize_native" method,
 * and interns the method ID used by the startDocument callback.
 */
void
noko_init_html4_sax_parser(void)
{
  id_start_document = rb_intern("start_document");

  cNokogiriHtml4SaxParser = rb_define_class_under(mNokogiriHtml4Sax, "Parser", cNokogiriXmlSaxParser);
  rb_define_private_method(cNokogiriHtml4SaxParser, "initialize_native", noko_html4_sax_parser_initialize, 0);
}
2 changes: 1 addition & 1 deletion ext/nokogiri/html4_sax_parser_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ parse_with(VALUE self, VALUE sax_handler)
}

ctxt = noko_xml_sax_parser_context_unwrap(self);
sax = noko_sax_handler_unwrap(sax_handler);
sax = noko_xml_sax_parser_unwrap(sax_handler);

ctxt->sax = sax;
ctxt->userData = (void *)NOKOGIRI_SAX_TUPLE_NEW(ctxt, sax_handler);
Expand Down
5 changes: 2 additions & 3 deletions ext/nokogiri/html4_sax_push_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ native_write(VALUE self, VALUE _chunk, VALUE _last_chunk)

Nokogiri_structured_error_func_restore(&handler_state);

if ((status != 0) && !(ctx->options & XML_PARSE_RECOVER)) {
if ((status != 0) && !(xmlCtxtGetOptions(ctx) & XML_PARSE_RECOVER)) {
// TODO: there appear to be no tests for this block
xmlErrorConstPtr e = xmlCtxtGetLastError(ctx);
Nokogiri_error_raise(NULL, e);
Expand All @@ -54,7 +54,7 @@ initialize_native(VALUE self, VALUE _xml_sax, VALUE _filename,
htmlParserCtxtPtr ctx;
xmlCharEncoding enc = XML_CHAR_ENCODING_NONE;

sax = noko_sax_handler_unwrap(_xml_sax);
sax = noko_xml_sax_parser_unwrap(_xml_sax);

if (_filename != Qnil) { filename = StringValueCStr(_filename); }

Expand All @@ -79,7 +79,6 @@ initialize_native(VALUE self, VALUE _xml_sax, VALUE _filename,

ctx->userData = NOKOGIRI_SAX_TUPLE_NEW(ctx, self);

ctx->sax2 = 1;
DATA_PTR(self) = ctx;
return self;
}
Expand Down
97 changes: 97 additions & 0 deletions ext/nokogiri/libxml2_polyfill.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#include <nokogiri.h>

#ifndef HAVE_XMLCTXTSETOPTIONS
/*
 * Polyfill for xmlCtxtSetOptions(), compiled only when the linked libxml2
 * does not provide it.
 *
 * Based on libxml2-2.14.0-dev (1d8bd126) parser.c xmlCtxtSetInternalOptions.
 *
 * Replaces the option bitmask stored in +ctxt+ with the parser-handled
 * subset of +options+ (bits previously set on the context are not kept)
 * and mirrors the relevant bits into the legacy struct members.
 *
 * Returns -1 if +ctxt+ is NULL; otherwise returns the bits of +options+
 * that are not part of the parser-handled set.
 */
int
xmlCtxtSetOptions(xmlParserCtxtPtr ctxt, int options)
{
  /*
   * Every option bit handled by the parser itself.
   *
   * XInclude options aren't handled by the parser and are therefore
   * deliberately absent:
   *
   *   XML_PARSE_XINCLUDE
   *   XML_PARSE_NOXINCNODE
   *   XML_PARSE_NOBASEFIX
   */
  const int parser_mask = XML_PARSE_RECOVER
                          | XML_PARSE_NOENT
                          | XML_PARSE_DTDLOAD
                          | XML_PARSE_DTDATTR
                          | XML_PARSE_DTDVALID
                          | XML_PARSE_NOERROR
                          | XML_PARSE_NOWARNING
                          | XML_PARSE_PEDANTIC
                          | XML_PARSE_NOBLANKS
#ifdef LIBXML_SAX1_ENABLED
                          | XML_PARSE_SAX1
#endif
                          | XML_PARSE_NONET
                          | XML_PARSE_NODICT
                          | XML_PARSE_NSCLEAN
                          | XML_PARSE_NOCDATA
                          | XML_PARSE_COMPACT
                          | XML_PARSE_OLD10
                          | XML_PARSE_HUGE
                          | XML_PARSE_OLDSAX
                          | XML_PARSE_IGNORE_ENC
                          | XML_PARSE_BIG_LINES;
  int subset_flags = 0;

  if (ctxt == NULL) {
    return -1;
  }

  ctxt->options = options & parser_mask;

  /*
   * For some options, struct members are historically the source of
   * truth. The values are initialized from global variables and old code
   * could also modify them directly. Several older API functions that
   * don't take an options argument rely on these deprecated mechanisms.
   *
   * Once public access to struct members and the globals is disabled,
   * the options bitmask can become the source of truth, making all of
   * these struct members obsolete.
   */
  ctxt->recovery = (options & XML_PARSE_RECOVER) != 0;
  ctxt->replaceEntities = (options & XML_PARSE_NOENT) != 0;

  /*
   * The XML_DETECT_IDS flag is misnamed. It simply enables loading of
   * the external subset.
   */
  if (options & XML_PARSE_DTDLOAD) {
    subset_flags |= XML_DETECT_IDS;
  }
  if (options & XML_PARSE_DTDATTR) {
    subset_flags |= XML_COMPLETE_ATTRS;
  }
  ctxt->loadsubset = subset_flags;

  ctxt->validate = (options & XML_PARSE_DTDVALID) != 0;
  ctxt->pedantic = (options & XML_PARSE_PEDANTIC) != 0;

  /* These two members are the inverse of their "NO..." option bits. */
  ctxt->keepBlanks = (options & XML_PARSE_NOBLANKS) == 0;
  ctxt->dictNames = (options & XML_PARSE_NODICT) == 0;

  /*
   * Changing SAX callbacks is a bad idea. This should be fixed.
   */
  if (options & XML_PARSE_NOBLANKS) {
    ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
  }
  if (options & XML_PARSE_NOCDATA) {
    ctxt->sax->cdataBlock = NULL;
  }

  /* XML_PARSE_HUGE sets the dictionary limit to 0 when a dictionary exists. */
  if ((options & XML_PARSE_HUGE) && (ctxt->dict != NULL)) {
    xmlDictSetLimit(ctxt->dict, 0);
  }

  ctxt->linenumbers = 1;

  return options & ~parser_mask;
}
#endif

#ifndef HAVE_XMLCTXTGETOPTIONS
/*
 * Polyfill for xmlCtxtGetOptions(), compiled only when the linked libxml2
 * does not provide it.
 *
 * Returns the option bitmask currently stored in +ctxt+, or -1 if +ctxt+
 * is NULL (the same NULL convention used by the xmlCtxtSetOptions
 * polyfill in this file), instead of dereferencing a NULL pointer.
 */
int
xmlCtxtGetOptions(xmlParserCtxtPtr ctxt)
{
  if (ctxt == NULL) {
    return (-1);
  }

  return (ctxt->options);
}
#endif
4 changes: 4 additions & 0 deletions ext/nokogiri/nokogiri.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ void noko_init_html_element_description(void);
void noko_init_html_entity_lookup(void);
void noko_init_html_sax_parser_context(void);
void noko_init_html_sax_push_parser(void);
void noko_init_html4_sax_parser(void);
void noko_init_gumbo(void);
void noko_init_test_global_handlers(void);

Expand Down Expand Up @@ -244,7 +245,10 @@ Init_nokogiri(void)
noko_init_xml_namespace();
noko_init_xml_node_set();
noko_init_xml_reader();

noko_init_xml_sax_parser();
noko_init_html4_sax_parser();

noko_init_xml_xpath_context();
noko_init_xslt_stylesheet();
noko_init_html_element_description();
Expand Down
11 changes: 10 additions & 1 deletion ext/nokogiri/nokogiri.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,14 @@

#include <libexslt/exslt.h>

/* libxml2_polyfill.c */
#ifndef HAVE_XMLCTXTSETOPTIONS
int xmlCtxtSetOptions(xmlParserCtxtPtr ctxt, int options);
#endif
#ifndef HAVE_XMLCTXTGETOPTIONS
int xmlCtxtGetOptions(xmlParserCtxtPtr ctxt);
#endif

#define XMLNS_PREFIX "xmlns"
#define XMLNS_PREFIX_LEN 6 /* including either colon or \0 */

Expand Down Expand Up @@ -141,6 +149,7 @@ NOKOPUBVAR VALUE cNokogiriXsltStylesheet ;
NOKOPUBVAR VALUE cNokogiriHtml4Document ;
NOKOPUBVAR VALUE cNokogiriHtml4SaxPushParser ;
NOKOPUBVAR VALUE cNokogiriHtml4ElementDescription ;
NOKOPUBVAR VALUE cNokogiriHtml4SaxParser;
NOKOPUBVAR VALUE cNokogiriHtml4SaxParserContext;
NOKOPUBVAR VALUE cNokogiriHtml5Document ;

Expand Down Expand Up @@ -196,7 +205,7 @@ xmlDocPtr noko_xml_document_unwrap(VALUE rb_document);
NOKOPUBFUN VALUE Nokogiri_wrap_xml_document(VALUE klass,
xmlDocPtr doc); /* deprecated. use noko_xml_document_wrap() instead. */

xmlSAXHandlerPtr noko_sax_handler_unwrap(VALUE rb_sax_handler);
xmlSAXHandlerPtr noko_xml_sax_parser_unwrap(VALUE rb_sax_handler);

xmlParserCtxtPtr noko_xml_sax_push_parser_unwrap(VALUE rb_parser);

Expand Down
2 changes: 1 addition & 1 deletion ext/nokogiri/xml_node.c
Original file line number Diff line number Diff line change
Expand Up @@ -2081,7 +2081,7 @@ dump_html(VALUE self)

buf = xmlBufferCreate() ;
htmlNodeDump(buf, node->doc, node);
html = NOKOGIRI_STR_NEW2(buf->content);
html = NOKOGIRI_STR_NEW2(xmlBufferContent(buf));
xmlBufferFree(buf);
return html ;
}
Expand Down
62 changes: 34 additions & 28 deletions ext/nokogiri/xml_sax_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,30 +16,28 @@ start_document(void *ctx)

xmlParserCtxtPtr ctxt = NOKOGIRI_SAX_CTXT(ctx);

if (NULL != ctxt && ctxt->html != 1) {
if (ctxt->standalone != -1) { /* -1 means there was no declaration */
VALUE encoding = Qnil ;
VALUE standalone = Qnil;
VALUE version;
if (ctxt->encoding) {
encoding = NOKOGIRI_STR_NEW2(ctxt->encoding) ;
} else if (ctxt->input && ctxt->input->encoding) {
encoding = NOKOGIRI_STR_NEW2(ctxt->input->encoding) ;
}

version = ctxt->version ? NOKOGIRI_STR_NEW2(ctxt->version) : Qnil;

switch (ctxt->standalone) {
case 0:
standalone = NOKOGIRI_STR_NEW2("no");
break;
case 1:
standalone = NOKOGIRI_STR_NEW2("yes");
break;
}

rb_funcall(doc, id_xmldecl, 3, version, encoding, standalone);
if (ctxt->standalone != -1) { /* -1 means there was no declaration */
VALUE encoding = Qnil ;
VALUE standalone = Qnil;
VALUE version;
if (ctxt->encoding) {
encoding = NOKOGIRI_STR_NEW2(ctxt->encoding) ;
} else if (ctxt->input && ctxt->input->encoding) {
encoding = NOKOGIRI_STR_NEW2(ctxt->input->encoding) ;
}

version = ctxt->version ? NOKOGIRI_STR_NEW2(ctxt->version) : Qnil;

switch (ctxt->standalone) {
case 0:
standalone = NOKOGIRI_STR_NEW2("no");
break;
case 1:
standalone = NOKOGIRI_STR_NEW2("yes");
break;
}

rb_funcall(doc, id_xmldecl, 3, version, encoding, standalone);
}

rb_funcall(doc, id_start_document, 0);
Expand Down Expand Up @@ -282,10 +280,9 @@ static const rb_data_type_t xml_sax_handler_type = {
};

static VALUE
allocate(VALUE klass)
noko_xml_sax_parser_initialize(VALUE self)
{
xmlSAXHandlerPtr handler;
VALUE self = TypedData_Make_Struct(klass, xmlSAXHandler, &xml_sax_handler_type, handler);
xmlSAXHandlerPtr handler = noko_xml_sax_parser_unwrap(self);

handler->startDocument = start_document;
handler->endDocument = end_document;
Expand All @@ -304,8 +301,15 @@ allocate(VALUE klass)
return self;
}

/*
 * Allocation function for the SAX parser class: wraps a newly allocated
 * xmlSAXHandler in a typed-data Ruby object of class +klass+ and returns
 * that object. No callbacks are assigned here.
 */
static VALUE
noko_xml_sax_parser_allocate(VALUE klass)
{
  xmlSAXHandlerPtr c_handler;
  VALUE rb_parser = TypedData_Make_Struct(klass, xmlSAXHandler, &xml_sax_handler_type, c_handler);

  return rb_parser;
}

xmlSAXHandlerPtr
noko_sax_handler_unwrap(VALUE rb_sax_handler)
noko_xml_sax_parser_unwrap(VALUE rb_sax_handler)
{
xmlSAXHandlerPtr c_sax_handler;
TypedData_Get_Struct(rb_sax_handler, xmlSAXHandler, &xml_sax_handler_type, c_sax_handler);
Expand All @@ -317,7 +321,9 @@ noko_init_xml_sax_parser(void)
{
cNokogiriXmlSaxParser = rb_define_class_under(mNokogiriXmlSax, "Parser", rb_cObject);

rb_define_alloc_func(cNokogiriXmlSaxParser, allocate);
rb_define_alloc_func(cNokogiriXmlSaxParser, noko_xml_sax_parser_allocate);

rb_define_private_method(cNokogiriXmlSaxParser, "initialize_native", noko_xml_sax_parser_initialize, 0);

id_start_document = rb_intern("start_document");
id_end_document = rb_intern("end_document");
Expand Down
Loading

0 comments on commit d981833

Please sign in to comment.