Skip to content

Commit

Permalink
fix benchmarks, remove unnecessary resource cleanups and use a constant empty list for empty lists when building the tree
Browse files Browse the repository at this point in the history
  • Loading branch information
Overbryd committed Aug 31, 2017
1 parent 38bab84 commit 8415ce4
Show file tree
Hide file tree
Showing 7 changed files with 4,180 additions and 48 deletions.
15 changes: 6 additions & 9 deletions bench/basic_html_bench.exs
Original file line number Diff line number Diff line change
@@ -1,24 +1,21 @@
# Benchmarks for Myhtmlex against the W3C HTML5 fixture: once starting from
# the raw HTML binary and once starting from an already opened reference.
defmodule BasicHtmlBench do
  use Benchfella

  # Reads the fixture once at benchmark start-up (not at compile time) and
  # hands `{html, ref}` to every benchmark through `bench_context`.
  setup_all do
    html = File.read!("bench/w3c_html5.html")
    {:ok, {html, Myhtmlex.open(html)}}
  end

  # Measures a single `Myhtmlex.decode/1` call on the raw HTML binary.
  bench "decode" do
    {html, _ref} = bench_context
    Myhtmlex.decode(html)
  end

  # Measures `Myhtmlex.decode_tree/1` on the reference opened in `setup_all`.
  bench "decode with ref" do
    {_html, ref} = bench_context
    Myhtmlex.decode_tree(ref)
  end
end

29 changes: 29 additions & 0 deletions bench/file_sizes_bench.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Compares `Myhtmlex.decode_tree/1` across three HTML fixtures of different
# sizes. Every fixture is read and opened once, before any benchmark runs.
defmodule FileSizesBench do
  use Benchfella

  # Builds the shared context: a 3-tuple of opened references, in the order
  # github_trending_js, w3c_html5, wikipedia_hyperlink.
  setup_all do
    refs =
      [
        "bench/github_trending_js.html",
        "bench/w3c_html5.html",
        "bench/wikipedia_hyperlink.html"
      ]
      |> Enum.map(fn path -> path |> File.read!() |> Myhtmlex.open() end)
      |> List.to_tuple()

    {:ok, refs}
  end

  bench "github_trending_js.html 341k" do
    {github_ref, _, _} = bench_context
    Myhtmlex.decode_tree(github_ref)
  end

  bench "w3c_html5.html 131k" do
    {_, w3c_ref, _} = bench_context
    Myhtmlex.decode_tree(w3c_ref)
  end

  bench "wikipedia_hyperlink.html 97k" do
    {_, _, wikipedia_ref} = bench_context
    Myhtmlex.decode_tree(wikipedia_ref)
  end
end

3,524 changes: 3,524 additions & 0 deletions bench/github_trending_js.html

Large diffs are not rendered by default.

580 changes: 580 additions & 0 deletions bench/wikipedia_hyperlink.html

Large diffs are not rendered by default.

71 changes: 32 additions & 39 deletions src/myhtmlex.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,6 @@ nif_open(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) {
}
ref->root = myhtml_tree_get_document(ref->tree);

// garbage collect argument
enif_release_binary(&html_bin);

result = enif_make_resource(env, ref);
return result;
}
Expand Down Expand Up @@ -94,9 +91,8 @@ nif_decode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) {
return enif_make_badarg(env);
}

// clear myhtml tree resources before parsing
myhtml_tree_clean(state->tree);
// parse html into tree
// use parse_single for now, threaded mode is buggy with some files
mystatus_t status = myhtml_parse(state->tree, MyENCODING_UTF_8, (char*) html_bin.data, (size_t) html_bin.size);
if (status != MyHTML_STATUS_OK)
{
Expand All @@ -108,30 +104,31 @@ nif_decode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) {
myhtml_tree_node_t *root = myhtml_tree_get_document(state->tree);
result = build_tree(env, state->tree, myhtml_node_last_child(root));

// garbage collect argument
enif_release_binary(&html_bin);
// release myhtml resources
myhtml_node_free(root);

// return tree to erlang
return result;
}

ERL_NIF_TERM
build_node_children(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* parent)
{
ERL_NIF_TERM list;

list = enif_make_list(env, 0);
if (myhtml_node_is_close_self(parent))
{
return ATOM_NIL;
}

myhtml_tree_node_t* child = myhtml_node_last_child(parent);
if (child == NULL)
{
return EMPTY_LIST;
}

ERL_NIF_TERM list = enif_make_list(env, 0);

while (child)
{
ERL_NIF_TERM node_tuple = build_tree(env, tree, child);
list = enif_make_list_cell(env, node_tuple, list);

// free allocated resources
myhtml_node_free(child);
// get previous child, building the list from reverse
child = myhtml_node_prev(child);
}
Expand All @@ -142,11 +139,13 @@ build_node_children(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* par
ERL_NIF_TERM
build_node_attrs(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node)
{
ERL_NIF_TERM list;
myhtml_tree_attr_t* attr;
myhtml_tree_attr_t* attr = myhtml_node_attribute_last(node);
if (attr == NULL)
{
return EMPTY_LIST;
}

list = enif_make_list(env, 0);
attr = myhtml_node_attribute_last(node);
ERL_NIF_TERM list = enif_make_list(env, 0);

while (attr)
{
Expand Down Expand Up @@ -177,8 +176,6 @@ build_node_attrs(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node)
attr_tuple = enif_make_tuple2(env, name_bin, value_bin);
list = enif_make_list_cell(env, attr_tuple, list);

// free allocated resources
myhtml_attribute_free(tree, attr);
// get prev attribute, building the list from reverse
attr = myhtml_attribute_prev(attr);
}
Expand Down Expand Up @@ -212,8 +209,8 @@ build_tree(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node)
memcpy(comment.data, node_comment, comment_len);

return result = enif_make_tuple3(env,
make_atom(env, "comment"),
enif_make_list(env, 0),
ATOM_COMMENT,
EMPTY_LIST,
enif_make_binary(env, &comment)
);
}
Expand All @@ -226,31 +223,33 @@ build_tree(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node)
// get name of tag
size_t tag_name_len;
const char *tag_name = myhtml_tag_name_by_id(tree, tag_id, &tag_name_len);
size_t tag_string_len;
// get namespace of tag
size_t tag_ns_len;
const char *tag_ns_name_ptr = myhtml_namespace_name_by_id(tag_ns, &tag_ns_len);
char *tag_ns_buffer;
char buffer [tag_ns_len + 2];
char buffer [tag_ns_len + tag_name_len + 1];
char *tag_string = buffer;
size_t tag_string_len;

if (tag_ns != MyHTML_NAMESPACE_HTML)
{
// tag_ns_name_ptr is unmodifyable
// tag_ns_name_ptr is unmodifiable, copy it into our tag_ns_buffer to make it modifiable.
tag_ns_buffer = malloc(tag_ns_len);
strcpy(tag_ns_buffer, tag_ns_name_ptr);
// lowercase tag buffer (can be removed, just a nice to have)
tag_ns_buffer = lowercase(tag_ns_buffer);
tag_string_len = tag_ns_len + tag_name_len + 1; // +1 for colon
// prepend namespace to tag name, e.g. "svg:path"
stpcpy(stpcpy(stpcpy(tag_string, tag_ns_buffer), ":"), tag_name);
tag_string_len = tag_ns_len + tag_name_len + 1; // +1 for colon
}
else
{
stpcpy(tag_string, tag_name);
tag_string_len = tag_name_len;
}

// put non-html tags it in a binary
if (tag_id == MyHTML_TAG__UNDEF || tag_ns != MyHTML_NAMESPACE_HTML)
// put unknown and non-html tags in a binary
if (tag_id == MyHTML_TAG__UNDEF || tag_id == MyHTML_TAG_LAST_ENTRY || tag_ns != MyHTML_NAMESPACE_HTML)
{
ErlNifBinary tag_b;
enif_alloc_binary(tag_string_len, &tag_b);
Expand All @@ -266,14 +265,7 @@ build_tree(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node)
attrs = build_node_attrs(env, tree, node);

// add children or nil as a self-closing flag
if (myhtml_node_is_close_self(node))
{
children = ATOM_NIL;
}
else
{
children = build_node_children(env, tree, node);
}
children = build_node_children(env, tree, node);

// free allocated resources
if (tag_ns != MyHTML_NAMESPACE_HTML)
Expand All @@ -295,7 +287,6 @@ nif_cleanup_myhtmlex_ref(ErlNifEnv* env, void* obj)
myhtmlex_ref_t* ref = (myhtmlex_ref_t*) obj;
// release myhtml resources
myhtml_tree_destroy(ref->tree);
myhtml_node_free(ref->root);
}

// Erlang NIF
Expand All @@ -318,10 +309,12 @@ load(ErlNifEnv *env, void **priv, ERL_NIF_TERM info)
NULL
);
ATOM_NIL = make_atom(env, "nil");
ATOM_COMMENT = make_atom(env, "comment");
EMPTY_LIST = enif_make_list(env, 0);

// myhtml basic init
state->myhtml = myhtml_create();
myhtml_init(state->myhtml, MyHTML_OPTIONS_DEFAULT, 4, 0);
myhtml_init(state->myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
state->tree = myhtml_tree_create();
myhtml_tree_init(state->tree, state->myhtml);

Expand Down
2 changes: 2 additions & 0 deletions src/myhtmlex.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ nif_cleanup_myhtml_tree(ErlNifEnv* env, void* obj);

// consts
ERL_NIF_TERM ATOM_NIL;
ERL_NIF_TERM ATOM_COMMENT;
ERL_NIF_TERM EMPTY_LIST;

typedef struct {
myhtml_t* myhtml;
Expand Down
7 changes: 7 additions & 0 deletions test/myhtmlex_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ defmodule MyhtmlexTest do

test "open and decode_tree" do
ref = Myhtmlex.open(~s'text node')
assert is_reference(ref)
assert {:html, [], [
{:head, [], []},
{:body, [], [
Expand Down Expand Up @@ -81,5 +82,11 @@ defmodule MyhtmlexTest do
]} = Myhtmlex.decode(~s'<esi:include />')
end

# Regression guard: only checks that opening this large fixture returns a
# reference (the test name records that threaded parsing used to hang on it).
test "open this nasty github file (works fine in parse single, parse threaded hangs)" do
  ref =
    "bench/github_trending_js.html"
    |> File.read!()
    |> Myhtmlex.open()

  assert is_reference(ref)
end

end

0 comments on commit 8415ce4

Please sign in to comment.