Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate Nokogiri::XML::Document to the TypedData API #2807

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ext/nokogiri/html4_document.c
Original file line number Diff line number Diff line change
@@ -145,7 +145,7 @@ static VALUE
rb_html_document_type(VALUE self)
{
htmlDocPtr doc;
Data_Get_Struct(self, xmlDoc, doc);
TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc);
return INT2NUM(doc->type);
}

2 changes: 2 additions & 0 deletions ext/nokogiri/nokogiri.h
Original file line number Diff line number Diff line change
@@ -166,6 +166,8 @@ typedef struct _nokogiriXsltStylesheetTuple {
VALUE func_instances;
} nokogiriXsltStylesheetTuple;

extern const rb_data_type_t noko_xml_document_data_type;

void noko_xml_document_pin_node(xmlNodePtr);
void noko_xml_document_pin_namespace(xmlNsPtr, xmlDocPtr);

2 changes: 1 addition & 1 deletion ext/nokogiri/xml_comment.c
Original file line number Diff line number Diff line change
@@ -30,7 +30,7 @@ new (int argc, VALUE *argv, VALUE klass)
rb_raise(rb_eArgError, "first argument must be a XML::Document or XML::Node");
}

Data_Get_Struct(document, xmlDoc, xml_doc);
TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, xml_doc);

node = xmlNewDocComment(
xml_doc,
67 changes: 54 additions & 13 deletions ext/nokogiri/xml_document.c
Original file line number Diff line number Diff line change
@@ -51,8 +51,9 @@ remove_private(xmlNodePtr node)
}

static void
mark(xmlDocPtr doc)
mark(void *data)
{
xmlDocPtr doc = (xmlDocPtr)data;
nokogiriTuplePtr tuple = (nokogiriTuplePtr)doc->_private;
if (tuple) {
rb_gc_mark(tuple->doc);
@@ -61,8 +62,9 @@ mark(xmlDocPtr doc)
}

static void
dealloc(xmlDocPtr doc)
dealloc(void *data)
{
xmlDocPtr doc = (xmlDocPtr)data;
st_table *node_hash;

node_hash = DOC_UNLINKED_NODE_HASH(doc);
@@ -84,6 +86,45 @@ dealloc(xmlDocPtr doc)
xmlFreeDoc(doc);
}

static size_t
memsize_node(const xmlNodePtr node)
{
/* note we don't count namespace definitions, just going for a good-enough number here */
xmlNodePtr child;
size_t memsize = 0;

memsize += xmlStrlen(node->name);
for (child = (xmlNodePtr)node->properties; child; child = child->next) {
memsize += sizeof(xmlAttr) + memsize_node(child);
}
if (node->type == XML_TEXT_NODE) {
memsize += xmlStrlen(node->content);
}
for (child = node->children; child; child = child->next) {
memsize += sizeof(xmlNode) + memsize_node(child);
}
return memsize;
}

static size_t
memsize(const void *data)
{
xmlDocPtr doc = (const xmlDocPtr)data;
size_t memsize = sizeof(xmlDoc);
/* This may not account for all memory use */
memsize += memsize_node((xmlNodePtr)doc);
return memsize;
}

const rb_data_type_t noko_xml_document_data_type = {
.wrap_struct_name = "Nokogiri::XML::Document",
.function = {
.dmark = mark,
.dfree = dealloc,
.dsize = memsize,
}
};

static void
recursively_remove_namespaces_from_node(xmlNodePtr node)
{
@@ -127,7 +168,7 @@ static VALUE
url(VALUE self)
{
xmlDocPtr doc;
Data_Get_Struct(self, xmlDoc, doc);
TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc);

if (doc->URL) { return NOKOGIRI_STR_NEW2(doc->URL); }

@@ -146,7 +187,7 @@ rb_xml_document_root_set(VALUE self, VALUE rb_new_root)
xmlDocPtr c_document;
xmlNodePtr c_new_root = NULL, c_current_root;

Data_Get_Struct(self, xmlDoc, c_document);
TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, c_document);

c_current_root = xmlDocGetRootElement(c_document);
if (c_current_root) {
@@ -190,7 +231,7 @@ rb_xml_document_root(VALUE self)
xmlDocPtr c_document;
xmlNodePtr c_root;

Data_Get_Struct(self, xmlDoc, c_document);
TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, c_document);

c_root = xmlDocGetRootElement(c_document);
if (!c_root) {
@@ -210,7 +251,7 @@ static VALUE
set_encoding(VALUE self, VALUE encoding)
{
xmlDocPtr doc;
Data_Get_Struct(self, xmlDoc, doc);
TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc);

if (doc->encoding) {
xmlFree(DISCARD_CONST_QUAL_XMLCHAR(doc->encoding));
@@ -231,7 +272,7 @@ static VALUE
encoding(VALUE self)
{
xmlDocPtr doc;
Data_Get_Struct(self, xmlDoc, doc);
TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc);

if (!doc->encoding) { return Qnil; }
return NOKOGIRI_STR_NEW2(doc->encoding);
@@ -247,7 +288,7 @@ static VALUE
version(VALUE self)
{
xmlDocPtr doc;
Data_Get_Struct(self, xmlDoc, doc);
TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc);

if (!doc->version) { return Qnil; }
return NOKOGIRI_STR_NEW2(doc->version);
@@ -369,7 +410,7 @@ duplicate_document(int argc, VALUE *argv, VALUE self)
level = INT2NUM((long)1);
}

Data_Get_Struct(self, xmlDoc, doc);
TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc);

dup = xmlCopyDoc(doc, (int)NUM2INT(level));

@@ -443,7 +484,7 @@ static VALUE
remove_namespaces_bang(VALUE self)
{
xmlDocPtr doc ;
Data_Get_Struct(self, xmlDoc, doc);
TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc);

recursively_remove_namespaces_from_node((xmlNodePtr)doc);
return self;
@@ -471,7 +512,7 @@ create_entity(int argc, VALUE *argv, VALUE self)
xmlEntityPtr ptr;
xmlDocPtr doc ;

Data_Get_Struct(self, xmlDoc, doc);
TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc);

rb_scan_args(argc, argv, "14", &name, &type, &external_id, &system_id,
&content);
@@ -559,7 +600,7 @@ rb_xml_document_canonicalize(int argc, VALUE *argv, VALUE self)
}
}

Data_Get_Struct(self, xmlDoc, c_doc);
TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, c_doc);

rb_cStringIO = rb_const_get_at(rb_cObject, rb_intern("StringIO"));
rb_io = rb_class_new_instance(0, 0, rb_cStringIO);
@@ -607,7 +648,7 @@ noko_xml_document_wrap_with_init_args(VALUE klass, xmlDocPtr c_document, int arg
klass = cNokogiriXmlDocument;
}

rb_document = Data_Wrap_Struct(klass, mark, dealloc, c_document);
rb_document = TypedData_Wrap_Struct(klass, &noko_xml_document_data_type, c_document);

tuple = (nokogiriTuplePtr)ruby_xmalloc(sizeof(nokogiriTuple));
tuple->doc = rb_document;
2 changes: 1 addition & 1 deletion ext/nokogiri/xml_document_fragment.c
Original file line number Diff line number Diff line change
@@ -19,7 +19,7 @@ new (int argc, VALUE *argv, VALUE klass)

rb_scan_args(argc, argv, "1*", &document, &rest);

Data_Get_Struct(document, xmlDoc, xml_doc);
TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, xml_doc);

node = xmlNewDocFragment(xml_doc->doc);

2 changes: 1 addition & 1 deletion ext/nokogiri/xml_entity_reference.c
Original file line number Diff line number Diff line change
@@ -20,7 +20,7 @@ new (int argc, VALUE *argv, VALUE klass)

rb_scan_args(argc, argv, "2*", &document, &name, &rest);

Data_Get_Struct(document, xmlDoc, xml_doc);
TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, xml_doc);

node = xmlNewReference(
xml_doc,
2 changes: 1 addition & 1 deletion ext/nokogiri/xml_node.c
Original file line number Diff line number Diff line change
@@ -984,7 +984,7 @@ duplicate_node(int argc, VALUE *argv, VALUE self)
if (n_args < 2) {
new_parent_doc = node->doc;
} else {
Data_Get_Struct(r_new_parent_doc, xmlDoc, new_parent_doc);
TypedData_Get_Struct(r_new_parent_doc, xmlDoc, &noko_xml_document_data_type, new_parent_doc);
}

dup = xmlDocCopyNode(node, new_parent_doc, level);
2 changes: 1 addition & 1 deletion ext/nokogiri/xml_processing_instruction.c
Original file line number Diff line number Diff line change
@@ -22,7 +22,7 @@ new (int argc, VALUE *argv, VALUE klass)

rb_scan_args(argc, argv, "3*", &document, &name, &content, &rest);

Data_Get_Struct(document, xmlDoc, xml_doc);
TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, xml_doc);

node = xmlNewDocPI(
xml_doc,
4 changes: 2 additions & 2 deletions ext/nokogiri/xml_relax_ng.c
Original file line number Diff line number Diff line change
@@ -23,7 +23,7 @@ validate_document(VALUE self, VALUE document)
xmlRelaxNGValidCtxtPtr valid_ctxt;

Data_Get_Struct(self, xmlRelaxNG, schema);
Data_Get_Struct(document, xmlDoc, doc);
TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, doc);

errors = rb_ary_new();

@@ -127,7 +127,7 @@ from_document(int argc, VALUE *argv, VALUE klass)

scanned_args = rb_scan_args(argc, argv, "11", &document, &parse_options);

Data_Get_Struct(document, xmlDoc, doc);
TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, doc);
doc = doc->doc; /* In case someone passes us a node. ugh. */

if (scanned_args == 1) {
11 changes: 8 additions & 3 deletions ext/nokogiri/xslt_stylesheet.c
Original file line number Diff line number Diff line change
@@ -79,7 +79,7 @@ parse_stylesheet_doc(VALUE klass, VALUE xmldocobj)
xmlDocPtr xml, xml_cpy;
VALUE errstr, exception;
xsltStylesheetPtr ss ;
Data_Get_Struct(xmldocobj, xmlDoc, xml);
TypedData_Get_Struct(xmldocobj, xmlDoc, &noko_xml_document_data_type, xml);

errstr = rb_str_new(0, 0);
xsltSetGenericErrorFunc((void *)errstr, xslt_generic_error_handler);
@@ -114,7 +114,12 @@ rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj)
int doc_len ;
VALUE rval ;

Data_Get_Struct(xmlobj, xmlDoc, xml);
TypedData_Get_Struct(
xmlobj,
xmlDoc,
&noko_xml_document_data_type,
xml
);
TypedData_Get_Struct(
self,
nokogiriXsltStylesheetTuple,
@@ -265,7 +270,7 @@ rb_xslt_stylesheet_transform(int argc, VALUE *argv, VALUE self)

Check_Type(paramobj, T_ARRAY);

Data_Get_Struct(xmldoc, xmlDoc, xml);
TypedData_Get_Struct(xmldoc, xmlDoc, &noko_xml_document_data_type, xml);
TypedData_Get_Struct(self, nokogiriXsltStylesheetTuple, &xslt_stylesheet_type, wrapper);

param_len = RARRAY_LEN(paramobj);
41 changes: 41 additions & 0 deletions test/test_memory_leak.rb
Original file line number Diff line number Diff line change
@@ -272,6 +272,47 @@ def test_leaking_dtd_nodes_after_internal_subset_removal
end
end # if NOKOGIRI_GC

def test_object_space_memsize_of
require "objspace"
skip("memsize_of not defined") unless ObjectSpace.respond_to?(:memsize_of)

base_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML))
<root>
<child>asdf</child>
</root>
XML

more_children_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML))
<root>
<child>asdf</child>
<child>asdf</child>
<child>asdf</child>
</root>
XML
assert(more_children_size > base_size, "adding children should increase memsize")

attributes_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML))
<root>
<child a="b" c="d">asdf</child>
</root>
XML
assert(attributes_size > base_size, "adding attributes should increase memsize")

string_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML))
<root>
<child>asdfasdfasdfasdfasdfasdfasdfasdfasdfasdfasdfasdfasdf</child>
</root>
XML
assert(string_size > base_size, "longer strings should increase memsize")

bigger_name_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML))
<root>
<superduperamazingchild>asdf</superduperamazingchild>
</root>
XML
assert(bigger_name_size > base_size, "longer tags should increase memsize")
end

module MemInfo
# from https://stackoverflow.com/questions/7220896/get-current-ruby-process-memory-usage
# this is only going to work on linux