Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip Document Type Definition #33

Merged
merged 2 commits into from
Oct 2, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 143 additions & 1 deletion lib/saxy/parser/prolog.ex
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ defmodule Saxy.Parser.Prolog do

case Emitter.emit(:start_document, prolog, state) do
{:ok, state} ->
Element.parse(rest, more?, original, pos, state)
dtd(rest, more?, original, pos, state)

{:stop, state} ->
{:ok, state}
Expand Down Expand Up @@ -385,4 +385,146 @@ defmodule Saxy.Parser.Prolog do
# Consumes one UTF-8 encoded code point of the processing-instruction content
# and grows `len` by the width `Utils.compute_char_len/1` reports for it.
defp prolog_pi_content(<<codepoint::utf8, tail::bits>>, more?, original, pos, state, prolog, len) do
  width = Utils.compute_char_len(codepoint)
  prolog_pi_content(tail, more?, original, pos, state, prolog, len + width)
end

# Looks for a document type declaration at the head of the buffer. When the
# buffer opens with `<!DOCTYPE`, the 9-byte keyword is skipped and scanning of
# the declaration body starts with a consumed length of 0 and a bracket count
# of 1 (the `<` of `<!DOCTYPE` itself).
defp dtd(<<"<!DOCTYPE", tail::bits>>, more?, original, pos, state) do
  body_start = pos + 9
  dtd_content(tail, more?, original, body_start, state, 0, 1)
end

# One halt declaration per proper prefix of `<!DOCTYPE`.
# NOTE(review): `defhalt` is defined outside this view; presumably it emits a
# clause that suspends parsing when the buffer holds exactly this prefix and
# more input may follow — confirm against the macro definition.
defhalt(:dtd, 5, "")
defhalt(:dtd, 5, "<")
defhalt(:dtd, 5, "<!")
defhalt(:dtd, 5, "<!D")
defhalt(:dtd, 5, "<!DO")
defhalt(:dtd, 5, "<!DOC")
defhalt(:dtd, 5, "<!DOCT")
defhalt(:dtd, 5, "<!DOCTY")
defhalt(:dtd, 5, "<!DOCTYP")

# Anything else is not a DOCTYPE declaration: hand the untouched buffer and
# position over to the element parser.
defp dtd(<<buffer::bits>>, more?, original, pos, state),
  do: Element.parse(buffer, more?, original, pos, state)

# Skips over the body of a DOCTYPE declaration without interpreting it.
#
# `len` is the number of bytes consumed since `pos`; `depth` is the number of
# `<` brackets still waiting for their `>`. Every `<` and `>` byte is counted,
# including any that appear inside quoted literals.

# The `>` that closes the last open bracket ends the declaration; continue
# with whatever follows it.
defp dtd_content(<<?>, tail::bits>>, more?, original, pos, state, len, 1),
  do: dtd_misc(tail, more?, original, pos + len + 1, state)

# A `>` closing a nested bracket.
defp dtd_content(<<?>, tail::bits>>, more?, original, pos, state, len, depth),
  do: dtd_content(tail, more?, original, pos, state, len + 1, depth - 1)

# A `<` opening a nested bracket.
defp dtd_content(<<?<, tail::bits>>, more?, original, pos, state, len, depth),
  do: dtd_content(tail, more?, original, pos, state, len + 1, depth + 1)

# Any other single-byte character.
defp dtd_content(<<byte, tail::bits>>, more?, original, pos, state, len, depth)
     when is_ascii(byte),
     do: dtd_content(tail, more?, original, pos, state, len + 1, depth)

# A multi-byte UTF-8 character: advance by its encoded width.
defp dtd_content(<<codepoint::utf8, tail::bits>>, more?, original, pos, state, len, depth) do
  width = Utils.compute_char_len(codepoint)
  dtd_content(tail, more?, original, pos, state, len + width, depth)
end

# NOTE(review): `defhalt` is defined outside this view; presumably it suspends
# parsing on an exhausted buffer when more input may follow — confirm.
defhalt(:dtd_content, 7, "")

# Nothing above matched: report a parse error. The error is anchored at `pos`
# (the start of the declaration body), not at `pos + len`.
defp dtd_content(<<_::bits>>, _more?, original, pos, state, _len, _depth),
  do: Utils.parse_error(original, pos, state, {:token, :dtd_content})

# Handles what may sit between the end of the DOCTYPE declaration and the root
# element: whitespace, comments and processing instructions.

# Whitespace is skipped one byte at a time.
defp dtd_misc(<<byte::integer, tail::bits>>, more?, original, pos, state)
     when is_whitespace(byte),
     do: dtd_misc(tail, more?, original, pos + 1, state)

# Comment opener: skip its 4 bytes and scan the comment body.
defp dtd_misc(<<"<!--", tail::bits>>, more?, original, pos, state) do
  comment_start = pos + 4
  dtd_misc_comment(tail, more?, original, comment_start, state, 0)
end

# Processing-instruction opener: skip its 2 bytes and read the PI target.
defp dtd_misc(<<"<?", tail::bits>>, more?, original, pos, state) do
  target_start = pos + 2
  dtd_processing_instruction(tail, more?, original, target_start, state)
end

# Halt declarations for an empty buffer and for the proper prefixes of `<!--`
# (the `<` prefix also covers a truncated `<?`).
# NOTE(review): `defhalt` is defined outside this view; presumably it suspends
# parsing when the buffer holds exactly this prefix and more input may
# follow — confirm.
defhalt(:dtd_misc, 5, "")
defhalt(:dtd_misc, 5, "<")
defhalt(:dtd_misc, 5, "<!")
defhalt(:dtd_misc, 5, "<!-")

# Anything else is handed to the element parser.
defp dtd_misc(<<buffer::bits>>, more?, original, pos, state),
  do: Element.parse(buffer, more?, original, pos, state)

# Skips the body of a comment that follows the DOCTYPE declaration. `len` is
# the number of bytes consumed since `pos` (the first byte after `<!--`).

# A comment terminated by `--->` is rejected with a parse error located at the
# offending dashes.
defp dtd_misc_comment(<<"--->", _tail::bits>>, _more?, original, pos, state, len),
  do: Utils.parse_error(original, pos + len, state, {:token, :comment})

# `-->` closes the comment; resume scanning after the 3-byte terminator.
defp dtd_misc_comment(<<"-->", tail::bits>>, more?, original, pos, state, len) do
  after_comment = pos + len + 3
  dtd_misc(tail, more?, original, after_comment, state)
end

# Halt declarations for an empty buffer and for a possibly truncated
# terminator.
# NOTE(review): `defhalt` is defined outside this view; presumably it suspends
# parsing when the buffer holds exactly this prefix and more input may
# follow — confirm.
defhalt(:dtd_misc_comment, 6, "")
defhalt(:dtd_misc_comment, 6, "-")
defhalt(:dtd_misc_comment, 6, "--")

# Any single-byte character inside the comment.
defp dtd_misc_comment(<<byte, tail::bits>>, more?, original, pos, state, len)
     when is_ascii(byte),
     do: dtd_misc_comment(tail, more?, original, pos, state, len + 1)

# A multi-byte UTF-8 character: advance by its encoded width.
defp dtd_misc_comment(<<codepoint::utf8, tail::bits>>, more?, original, pos, state, len) do
  width = Utils.compute_char_len(codepoint)
  dtd_misc_comment(tail, more?, original, pos, state, len + width)
end

# Reads the first character of a processing-instruction target that follows
# the DOCTYPE declaration. `pos` points just past the `<?` opener.

# Single-byte name-start character: the target so far is 1 byte long.
defp dtd_processing_instruction(<<byte, tail::bits>>, more?, original, pos, state)
     when is_name_start_char(byte),
     do: dtd_pi_name(tail, more?, original, pos, state, 1)

# Multi-byte UTF-8 name-start character: the target so far is as long as the
# character's encoded width.
defp dtd_processing_instruction(<<codepoint::utf8, tail::bits>>, more?, original, pos, state)
     when is_name_start_char(codepoint) do
  width = Utils.compute_char_len(codepoint)
  dtd_pi_name(tail, more?, original, pos, state, width)
end

# NOTE(review): `defhalt` is defined outside this view; presumably it suspends
# parsing on an exhausted buffer when more input may follow — confirm.
defhalt(:dtd_processing_instruction, 5, "")

# The target does not begin with a name-start character: parse error.
defp dtd_processing_instruction(<<_buffer::bits>>, _more?, original, pos, state),
  do: Utils.parse_error(original, pos, state, {:token, :processing_instruction})

# Reads the remainder of a processing-instruction target that follows the
# DOCTYPE declaration. `pos` is the offset of the target's first byte in
# `original` and `len` the number of target bytes consumed so far.

# Single-byte name character.
defp dtd_pi_name(<<charcode, rest::bits>>, more?, original, pos, state, len)
     when is_name_char(charcode) do
  dtd_pi_name(rest, more?, original, pos, state, len + 1)
end

# Multi-byte UTF-8 name character: advance by its encoded width.
defp dtd_pi_name(<<charcode::utf8, rest::bits>>, more?, original, pos, state, len)
     when is_name_char(charcode) do
  dtd_pi_name(rest, more?, original, pos, state, len + Utils.compute_char_len(charcode))
end

# The halt clause must be declared for `dtd_pi_name` itself. It was previously
# declared for `:prolog_pi_name`, which left this function without a clause for
# an exhausted buffer, so a target split across stream chunks fell through to
# the catch-all below and was treated as complete.
# NOTE(review): `defhalt` is defined outside this view; presumably it suspends
# parsing on an exhausted buffer when more input may follow — confirm.
defhalt(:dtd_pi_name, 6, "")

# The target has ended: extract it from the original binary, validate it, and
# either move on to the PI content or report it as invalid.
defp dtd_pi_name(<<rest::bits>>, more?, original, pos, state, len) do
  pi_name = binary_part(original, pos, len)

  if Utils.valid_pi_name?(pi_name) do
    dtd_pi_content(rest, more?, original, pos + len, state, 0)
  else
    Utils.parse_error(original, pos, state, {:invalid_pi, pi_name})
  end
end

# Skips the content of a processing instruction that follows the DOCTYPE
# declaration. `len` is the number of bytes consumed since `pos`.

# `?>` closes the instruction; resume scanning after the 2-byte terminator.
defp dtd_pi_content(<<"?>", tail::bits>>, more?, original, pos, state, len) do
  after_pi = pos + len + 2
  dtd_misc(tail, more?, original, after_pi, state)
end

# Halt declarations for an empty buffer and for a possibly truncated
# terminator.
# NOTE(review): `defhalt` is defined outside this view; presumably it suspends
# parsing when the buffer holds exactly this prefix and more input may
# follow — confirm.
defhalt(:dtd_pi_content, 6, "")
defhalt(:dtd_pi_content, 6, "?")

# Any single-byte character inside the instruction.
defp dtd_pi_content(<<byte, tail::bits>>, more?, original, pos, state, len)
     when is_ascii(byte),
     do: dtd_pi_content(tail, more?, original, pos, state, len + 1)

# A multi-byte UTF-8 character: advance by its encoded width.
defp dtd_pi_content(<<codepoint::utf8, tail::bits>>, more?, original, pos, state, len) do
  width = Utils.compute_char_len(codepoint)
  dtd_pi_content(tail, more?, original, pos, state, len + width)
end
end
51 changes: 51 additions & 0 deletions test/saxy/parser/prolog_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,57 @@ defmodule Saxy.Parser.PrologTest do
assert ParseError.message(error) == "unexpected byte \"!\", expected token: :processing_instruction"
end

describe "document type definition" do
  # A DOCTYPE carrying only an external identifier is skipped and parsing
  # continues with the root element.
  test "skips parsing DTD" do
    buffer = """
    <?xml version="1.0" ?>
    <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
    <foo/>
    """

    assert {:ok, _state} = parse(buffer, false, buffer, 0, make_state())
  end

  # A DOCTYPE with an internal subset contains nested `<...>` declarations,
  # all of which must be skipped.
  test "skips parsing DTD with declaration" do
    buffer = """
    <?xml version="1.0" ?>
    <!DOCTYPE note [
    <!ELEMENT note (to,from,heading,body)>
    <!ELEMENT to (#PCDATA)>
    <!ELEMENT from (#PCDATA)>
    <!ELEMENT heading (#PCDATA)>
    <!ELEMENT body (#PCDATA)>
    ]>
    <foo/>
    """

    assert {:ok, _state} = parse(buffer, false, buffer, 0, make_state())
  end

  # The DOCTYPE is never closed, so the parser returns an error tuple; the
  # reported byte is the space right after `<!DOCTYPE`.
  test "returns an error when DTD is incomplete" do
    buffer = """
    <?xml version="1.0" ?>
    <!DOCTYPE note
    <foo/>
    """

    assert {:error, error} = parse(buffer, false, buffer, 0, make_state())
    assert ParseError.message(error) == "unexpected byte \" \", expected token: :dtd_content"
  end

  # Comments and processing instructions are accepted between the DOCTYPE and
  # the root element.
  test "parses Misc after DTD" do
    buffer = """
    <?xml version="1.0" ?>
    <!DOCTYPE html>
    <!--This is comment-->
    <?foo foo?>
    <foo/>
    """

    assert {:ok, _state} = parse(buffer, false, buffer, 0, make_state())
  end
end

defp make_state(state \\ []) do
%Saxy.State{
prolog: nil,
Expand Down
17 changes: 15 additions & 2 deletions test/saxy_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ defmodule SaxyTest do
data = File.read!("./test/support/fixture/complex.xml")
assert {:ok, state} = Saxy.parse_string(data, StackHandler, [])
assert length(state) == 79

data = File.read!("./test/support/fixture/illustrator.svg")
assert {:ok, state} = Saxy.parse_string(data, StackHandler, [])
assert length(state) == 12
end

test "parse_string/4 parses XML binary with multiple \":expand_entity\" strategy" do
Expand Down Expand Up @@ -64,19 +68,28 @@ defmodule SaxyTest do

stream = File.stream!("./test/support/fixture/food.xml", [], 200)
assert {:ok, state} = Saxy.parse_stream(stream, StackHandler, [])

assert length(state) == 74

stream = File.stream!("./test/support/fixture/complex.xml", [], 200)
assert {:ok, state} = Saxy.parse_stream(stream, StackHandler, [])

assert length(state) == 79

stream = File.stream!("./test/support/fixture/illustrator.svg", [], 5)
assert {:ok, state} = Saxy.parse_stream(stream, StackHandler, [])
assert length(state) == 12
end

test "parse_stream/3 parses normal stream" do
stream =
"""
<?xml version='1.0' encoding="UTF-8" ?>
<!DOCTYPE note [
<!ELEMENT note (to,from,heading,body)>
<!ELEMENT to (#PCDATA)>
<!ELEMENT from (#PCDATA)>
<!ELEMENT heading (#PCDATA)>
<!ELEMENT body (#PCDATA)>
]>
<item name="[日本語] Tom &amp; Jerry" category='movie'>
<author name='William Hanna &#x26; Joseph Barbera' />
<!--Ignore me please I am just a comment-->
Expand Down
16 changes: 16 additions & 0 deletions test/support/fixture/illustrator.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.