Skip to content

Commit

Permalink
WIP gh-71 : doc update
Browse files Browse the repository at this point in the history
  • Loading branch information
Shakadak committed Mar 25, 2021
1 parent d26fffa commit 86e4b3d
Show file tree
Hide file tree
Showing 7 changed files with 121 additions and 36 deletions.
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Add dependency to your project's `mix.exs`:

```elixir
def deps do
[{:sweet_xml, "~> 0.6.6"}]
[{:sweet_xml, "~> 0.7.0"}]
end
```

Expand Down Expand Up @@ -464,6 +464,17 @@ result = file_stream
|> stream_tags([:li, :special_match_key], discard: [:li, :special_match_key])
```

## Security

Whenever you have to deal with XML that was not generated by your system (an untrusted document),
it is highly recommended that you separate the parsing step from the mapping step, so that you can
disable some default behaviors through options. See the documentation for `SweetXml.parse/2` for more details.
The current recommendations are:
```
doc |> parse(dtd: :none) |> xpath(spec, subspec)
enum |> stream_tags(tags, dtd: :none)
```

## Copyright and License

Copyright (c) 2014, Frank Liu
Expand Down
100 changes: 81 additions & 19 deletions lib/sweet_xml.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
defmodule SweetXpath do
@moduledoc false

defmodule Priv do
@moduledoc false
Expand Down Expand Up @@ -115,20 +116,34 @@ defmodule SweetXml do
"""

require Record
@doc false
Record.defrecord :xmlDecl, Record.extract(:xmlDecl, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlAttribute, Record.extract(:xmlAttribute, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlNamespace, Record.extract(:xmlNamespace, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlNsNode, Record.extract(:xmlNsNode, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlElement, Record.extract(:xmlElement, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlText, Record.extract(:xmlText, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlComment, Record.extract(:xmlComment, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlPI, Record.extract(:xmlPI, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlDocument, Record.extract(:xmlDocument, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlObj, Record.extract(:xmlObj, from_lib: "xmerl/include/xmerl.hrl")

@type doc :: (iodata | String.t | Enum.t)
@type spec :: %SweetXpath{}
@opaque xmlElement :: record(:xmlElement)


@doc ~s"""
`sigil_x/2` simply returns a `SweetXpath` struct, with modifiers converted to
`sigil_x/2` simply returns a `%SweetXpath{}` struct, with modifiers converted to
boolean fields:
iex> SweetXml.sigil_x("//some/path", 'e')
Expand Down Expand Up @@ -211,20 +226,31 @@ defmodule SweetXml do
| xpath.namespaces]}
end

@doc """
@doc """
Parse a document into a form ready to be used by `xpath/3` and `xmap/2`.
`doc` can be
- a byte list (iodata)
- a binary
- any enumerable of binaries (for instance `File.stream!/3` result)
`options` are `xmerl` options described here [http://www.erlang.org/doc/man/xmerl_scan.html](http://www.erlang.org/doc/man/xmerl_scan.html),
see [the erlang tutorial](http://www.erlang.org/doc/apps/xmerl/xmerl_examples.html) for usage.
`options` can be both:
* `xmerl`'s options as described on the [xmerl_scan](http://www.erlang.org/doc/man/xmerl_scan.html) documentation page,
see [the erlang tutorial](http://www.erlang.org/doc/apps/xmerl/xmerl_examples.html) for some advanced usage.
For example: `parse(doc, quiet: true)`
* `:dtd` to prevent DTD parsing or fetching, with the following possibilities:
* `:none`, will prevent both internal and external entities; it is the recommended option for untrusted XML;
* `:all`, the default, for backward compatibility, allows all DTDs;
* `:internal_only`, will block all attempts at external fetching;
* `[only: entities]` where `entities` is either an atom for a single entity, or a list of atoms.
If any other entity is defined in the XML, `parse` will raise on them.
When `doc` is an enumerable, the `:cont_fun` option cannot be given.
Returns an `xmlElement` record.
"""
@spec parse(doc, opts :: list) :: xmlElement
def parse(doc, opts \\ []) do
ets = :ets.new(nil, [])
dtd_arg = :proplists.get_value(:dtd, opts, :all)
Expand Down Expand Up @@ -264,6 +290,7 @@ defmodule SweetXml do
will be `{:tagname, xmlelem}`. e.g. :li, :header
- `options[:discard]` is the list of tags which will be discarded:
not added to its parent DOM.
- More options details are available with `parse/2`.
## Examples
Expand Down Expand Up @@ -338,9 +365,9 @@ defmodule SweetXml do
- `doc` is an enumerable, data will be pulled during the result stream
enumeration. e.g. `File.stream!("some_file.xml")`
- `options_callback` is an anonymous function `fn emit -> xmerl_opts` use it to
- `options_callback` is an anonymous function `fn emit -> (xmerl_opts | opts)` use it to
define your :xmerl callbacks and put data into the stream using
`emit.(elem)` in the callbacks.
`emit.(elem)` in the callbacks. More details are available with `parse/2`.
For example, here you define a stream of all `xmlElement` :
Expand Down Expand Up @@ -400,12 +427,12 @@ defmodule SweetXml do
end

@doc ~S"""
`xpath` allows you to query an XML document with xpath.
`xpath` allows you to query an XML document with XPath.
The second argument to xpath is a `SweetXpath` struct. The optional third
The second argument to xpath is a `%SweetXpath{}` struct. The optional third
argument is a keyword list, such that the value of each keyword is also
either a `SweetXpath` or a list with head being a `SweetXpath` and tail being
another keyword list exactly like before. Please see examples below for better
either a `%SweetXpath{}` or a list with head being a `%SweetXpath{}` and tail being
another keyword list exactly like before. Please see the examples below for better
understanding.
## Examples
Expand Down Expand Up @@ -438,32 +465,49 @@ defmodule SweetXml do
...> )
%{ul: %{a: 'Two'}}
## Security
Whenever you are working with XML that was not generated by your system,
it is highly recommended that you restrict some XML features
during parsing. In particular, SweetXml lets you prevent DTD parsing and fetching.
Unless you know exactly what kind of DTD you want to permit in your XML,
it is recommended that you use the following code example to prevent possible attacks:
```
doc
|> parse(dtd: :none)
|> xpath(spec, subspec)
```
For more details, see `parse/2`.
"""
def xpath(parent, spec) when not is_tuple(parent) do
@spec xpath(parent :: (doc | xmlElement), spec, subspec) :: any
when subspec: keyword(spec | subspec)
def xpath(parent, spec, subspec \\ [])

def xpath(parent, spec, []) when not is_tuple(parent) do
parent |> parse |> xpath(spec)
end

def xpath(parent, %SweetXpath{is_list: true, is_value: true, cast_to: cast, is_optional: is_opt?} = spec) do
def xpath(parent, %SweetXpath{is_list: true, is_value: true, cast_to: cast, is_optional: is_opt?} = spec, []) do
get_current_entities(parent, spec) |> Enum.map(&(_value(&1)) |> to_cast(cast,is_opt?)) |> spec.transform_fun.()
end

def xpath(parent, %SweetXpath{is_list: true, is_value: false} = spec) do
def xpath(parent, %SweetXpath{is_list: true, is_value: false} = spec, []) do
get_current_entities(parent, spec) |> spec.transform_fun.()
end

def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: string_type, is_optional: is_opt?} = spec) when string_type in [:string,:soft_string] do
def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: string_type, is_optional: is_opt?} = spec, []) when string_type in [:string,:soft_string] do
spec = %SweetXpath{spec | is_list: true}
get_current_entities(parent, spec)
|> Enum.map(&(_value(&1) |> to_cast(string_type, is_opt?)))
|> Enum.join
|> spec.transform_fun.()
end

def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: cast, is_optional: is_opt?} = spec) do
def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: cast, is_optional: is_opt?} = spec, []) do
get_current_entities(parent, spec) |> _value |> to_cast(cast, is_opt?) |> spec.transform_fun.()
end

def xpath(parent, %SweetXpath{is_list: false, is_value: false} = spec) do
def xpath(parent, %SweetXpath{is_list: false, is_value: false} = spec, []) do
get_current_entities(parent, spec) |> spec.transform_fun.()
end

Expand All @@ -478,11 +522,13 @@ defmodule SweetXml do
end

@doc ~S"""
`xmap` returns a mapping with each value being the result of `xpath`
`xmap` returns a mapping with each value being the result of `xpath`.
Just as `xpath`, you can nest the mapping structure. Please see `xpath` for
Just as `xpath`, you can nest the mapping structure. Please see `xpath/3` for
more detail.
You can give the option `true` to get the result as a keyword list instead of a map.
## Examples
Simple:
Expand Down Expand Up @@ -530,8 +576,24 @@ defmodule SweetXml do
...> ]
...> ], true)
[message: 'Message', ul: %{a: 'Two'}]
## Security
Whenever you are working with XML that was not generated by your system,
it is highly recommended that you restrict some XML features
during parsing. In particular, SweetXml lets you prevent DTD parsing and fetching.
Unless you know exactly what kind of DTD you want to permit in your XML,
it is recommended that you use the following code example to prevent possible attacks:
```
doc
|> parse(dtd: :none)
|> xmap(specs, options)
```
For more details, see `parse/2`.
"""
def xmap(parent, mapping), do: xmap(parent, mapping, %{is_keyword: false})
@spec xmap(parent :: (doc | xmlElement), mapping :: specs, options :: (boolean | map)) :: (map | keyword)
when specs: keyword(spec | specs)
def xmap(parent, mapping, options \\ false)

def xmap(nil, _, %{is_optional: true}), do: nil

Expand Down
2 changes: 2 additions & 0 deletions lib/sweet_xml/options.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
defmodule SweetXml.Options do
@moduledoc false

# Clause for the `:all` DTD setting.
# Returns a one-argument function that ignores its argument and always
# returns an empty list.
# NOTE(review): presumably the returned list holds extra xmerl options to
# apply for this DTD mode, so `[]` means "no restriction" — confirm against
# the caller in `SweetXml.parse/2`.
def handle_dtd(:all) do
fn _ -> [] end
end
Expand Down
4 changes: 2 additions & 2 deletions mix.exs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
defmodule SweetXml.Mixfile do
use Mix.Project

@source_url "https://github.com/awetzel/sweet_xml"
@source_url "https://github.com/kbrw/sweet_xml"

def project do
[
app: :sweet_xml,
version: "0.6.6",
version: "0.7.0-rc.1",
elixir: "~> 1.0",
description: "An sweet wrapper of :xmerl to help query XML docs",
deps: deps(),
Expand Down
5 changes: 3 additions & 2 deletions mix.lock
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
%{
"earmark": {:hex, :earmark, "1.3.1", "73812f447f7a42358d3ba79283cfa3075a7580a3a2ed457616d6517ac3738cb9", [:mix], [], "hexpm", "000aaeff08919e95e7aea13e4af7b2b9734577b3e6a7c50ee31ee88cab6ec4fb"},
"earmark_parser": {:hex, :earmark_parser, "1.4.12", "b245e875ec0a311a342320da0551da407d9d2b65d98f7a9597ae078615af3449", [:mix], [], "hexpm", "711e2cc4d64abb7d566d43f54b78f7dc129308a63bc103fbd88550d2174b3160"},
"ex_doc": {:hex, :ex_doc, "0.23.0", "a069bc9b0bf8efe323ecde8c0d62afc13d308b1fa3d228b65bca5cf8703a529d", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm", "f5e2c4702468b2fd11b10d39416ddadd2fcdd173ba2a0285ebd92c39827a5a16"},
"ex_doc": {:hex, :ex_doc, "0.24.1", "15673de99154f93ca7f05900e4e4155ced1ee0cd34e0caeee567900a616871a4", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "07972f17bdf7dc7b5bd76ec97b556b26178ed3f056e7ec9288eb7cea7f91cce2"},
"makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"},
"makeup_elixir": {:hex, :makeup_elixir, "0.15.0", "98312c9f0d3730fde4049985a1105da5155bfe5c11e47bdc7406d88e01e4219b", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "75ffa34ab1056b7e24844c90bfc62aaf6f3a37a15faa76b07bc5eba27e4a8b4a"},
"makeup_elixir": {:hex, :makeup_elixir, "0.15.1", "b5888c880d17d1cc3e598f05cdb5b5a91b7b17ac4eaf5f297cb697663a1094dd", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "db68c173234b07ab2a07f645a5acdc117b9f99d69ebf521821d89690ae6c6ec8"},
"makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"},
"markdown": {:git, "git://github.com/devinus/markdown.git", "cd0df79b6f1cc374499d47f6ba6aaab5096f874f", []},
"nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"},
}
4 changes: 4 additions & 0 deletions test/files/xxe.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE foo [ <!ELEMENT foo ANY >
<!ENTITY xxe SYSTEM "file:///etc/passwd" >]>
<response><result>&xxe;</result></response>
29 changes: 17 additions & 12 deletions test/issue_71_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,27 @@ defmodule Issue71Test do
use ExUnit.Case

test "raise on reading /etc/passwd with dtd: :none" do
sneaky_xml = """
<?xml version=\"1.0\" encoding=\"UTF-8\"?>
<!DOCTYPE foo [ <!ELEMENT foo ANY >
<!ENTITY xxe SYSTEM \"file:///etc/passwd\" >]>
<response><result>&xxe;</result></response>
"""
sneaky_xml = File.read!("./test/files/xxe.xml")

assert {:fatal, {{:error_fetching_DTD, {_, _}}, _file, _line, _col}} =
catch_exit(SweetXml.parse(sneaky_xml, dtd: :none, quiet: true))
end

# Parses the XXE fixture, which declares an external entity pointing at
# file:///etc/passwd, with `dtd: :internal_only`.
# The parse is expected to exit with xmerl's fatal `:error_fetching_DTD`
# tuple, so the exit reason is captured with `catch_exit/1` and pattern-matched.
test "raise on reading /etc/passwd with dtd: :internal_only" do
sneaky_xml = File.read!("./test/files/xxe.xml")

assert {:fatal, {{:error_fetching_DTD, {_, _}}, _file, _line, _col}} =
catch_exit(SweetXml.parse(sneaky_xml, dtd: :internal_only, quiet: true))
end

# Parses the XXE fixture with an allow-list that contains only `:banana`.
# The fixture defines an entity named `xxe`, which is not in the allow-list,
# so `SweetXml.parse/2` is expected to raise a `RuntimeError`.
test "raise on reading /etc/passwd with dtd: [only: :banana]" do
sneaky_xml = File.read!("./test/files/xxe.xml")

assert_raise RuntimeError, fn ->
SweetXml.parse(sneaky_xml, dtd: [only: :banana])
end
end

test "raise on billion_laugh.xml with dtd: :none" do
dangerous_xml = File.read!("./test/files/billion_laugh.xml")
assert_raise RuntimeError, fn ->
Expand All @@ -21,12 +31,7 @@ defmodule Issue71Test do
end

test "stream: raise on reading /etc/passwd with dtd: :none" do
sneaky_xml = """
<?xml version=\"1.0\" encoding=\"UTF-8\"?>
<!DOCTYPE foo [ <!ELEMENT foo ANY >
<!ENTITY xxe SYSTEM \"file:///etc/passwd\" >]>
<response><result>&xxe;</result></response>
"""
sneaky_xml = File.read!("./test/files/xxe.xml")

_ = Process.flag(:trap_exit, true)
pid = spawn_link(fn -> Stream.run(SweetXml.stream_tags(sneaky_xml, :banana, dtd: :none, quiet: true)) end)
Expand Down

0 comments on commit 86e4b3d

Please sign in to comment.