Skip to content

Commit

Permalink
Merge branch 'issue-71'
Browse files Browse the repository at this point in the history
  • Loading branch information
Shakadak committed Jul 2, 2021
2 parents c224316 + 86e4b3d commit 088b3be
Show file tree
Hide file tree
Showing 9 changed files with 260 additions and 39 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@

* Improvement on the doc

### ADDED

* Added option to raise on DTD definitions

## [0.6.6] (2019-02-24)

* small bugfix: Fix compilation warnings on newer versions of Elixir
Expand Down
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Add dependency to your project's `mix.exs`:

```elixir
def deps do
[{:sweet_xml, "~> 0.6.6"}]
[{:sweet_xml, "~> 0.7.0"}]
end
```

Expand Down Expand Up @@ -467,6 +467,17 @@ result = file_stream
|> stream_tags([:li, :special_match_key], discard: [:li, :special_match_key])
```

## Security

Whenever you have to deal with some XML that was not generated by your system (untrusted document),
it is highly recommended that you separate the parsing step from the mapping step, in order to be able
to prevent some default behavior through options. You can check the doc for `SweetXml.parse/2` for more details.
The current recommendations are:
```
doc |> parse(dtd: :none) |> xpath(spec, subspec)
enum |> stream_tags(tags, dtd: :none)
```

## Copyright and License

Copyright (c) 2014, Frank Liu
Expand Down
149 changes: 116 additions & 33 deletions lib/sweet_xml.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
defmodule SweetXpath do
@moduledoc false

defmodule Priv do
@moduledoc false
Expand Down Expand Up @@ -115,20 +116,34 @@ defmodule SweetXml do
"""

require Record
@doc false
Record.defrecord :xmlDecl, Record.extract(:xmlDecl, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlAttribute, Record.extract(:xmlAttribute, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlNamespace, Record.extract(:xmlNamespace, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlNsNode, Record.extract(:xmlNsNode, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlElement, Record.extract(:xmlElement, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlText, Record.extract(:xmlText, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlComment, Record.extract(:xmlComment, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlPI, Record.extract(:xmlPI, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlDocument, Record.extract(:xmlDocument, from_lib: "xmerl/include/xmerl.hrl")
@doc false
Record.defrecord :xmlObj, Record.extract(:xmlObj, from_lib: "xmerl/include/xmerl.hrl")

@type doc :: (iodata | String.t | Enum.t)
@type spec :: %SweetXpath{}
@opaque xmlElement :: record(:xmlElement)


@doc ~s"""
`sigil_x/2` simply returns a `SweetXpath` struct, with modifiers converted to
`sigil_x/2` simply returns a `%SweetXpath{}` struct, with modifiers converted to
boolean fields:
iex> SweetXml.sigil_x("//some/path", 'e')
Expand Down Expand Up @@ -211,29 +226,52 @@ defmodule SweetXml do
| xpath.namespaces]}
end

@doc """
@doc """
Parse a document into a form ready to be used by `xpath/3` and `xmap/2`.
`doc` can be
- a byte list (iodata)
- a binary
- any enumerable of binaries (for instance `File.stream!/3` result)
`options` are `xmerl` options described here [http://www.erlang.org/doc/man/xmerl_scan.html](http://www.erlang.org/doc/man/xmerl_scan.html),
see [the erlang tutorial](http://www.erlang.org/doc/apps/xmerl/xmerl_examples.html) for usage.
`options` can be both:
* `xmerl`'s options as described on the [xmerl_scan](http://www.erlang.org/doc/man/xmerl_scan.html) documentation page,
see [the erlang tutorial](http://www.erlang.org/doc/apps/xmerl/xmerl_examples.html) for some advanced usage.
For example: `parse(doc, quiet: true)`
* `:dtd` to prevent DTD parsing or fetching, with the following possibilities:
* `:none`, will prevent both internal and external entities, it is the recommended options on untrusted XML;
* `:all`, the default, for backward compatibility, allows all DTDs;
* `:internal_only`, will block all attempt at external fetching;
* `[only: entities]` where `entities` is either an atom for a single entity, or a list of atoms.
If any other entity is defined in the XML, `parse` will raise on them.
When `doc` is an enumerable, the `:cont_fun` option cannot be given.
Returns an `xmlElement` record.
"""
def parse(doc), do: parse(doc, [])
def parse(doc, options) when is_binary(doc) do
doc |> :erlang.binary_to_list |> parse(options)
@spec parse(doc, opts :: list) :: xmlElement
def parse(doc, opts \\ []) do
ets = :ets.new(nil, [])
dtd_arg = :proplists.get_value(:dtd, opts, :all)
opts = :proplists.delete(:dtd, opts)
opts = SweetXml.Options.handle_dtd(dtd_arg).(ets) ++ opts
try do
do_parse(doc, opts)
after
_ = :ets.delete(ets)
end
end
def parse([c | _] = doc, options) when is_integer(c) do

@doc false
def do_parse(doc, options) when is_binary(doc) do
doc |> :erlang.binary_to_list |> do_parse(options)
end
def do_parse([c | _] = doc, options) when is_integer(c) do
{parsed_doc, _} = :xmerl_scan.string(doc, options)
parsed_doc
end
def parse(doc_enum, options) do
def do_parse(doc_enum, options) do
{parsed_doc, _} = :xmerl_scan.string('', options ++ continuation_opts(doc_enum))
parsed_doc
end
Expand All @@ -252,6 +290,7 @@ defmodule SweetXml do
will be `{:tagname, xmlelem}`. e.g. :li, :header
- `options[:discard]` is the list of tag which will be discarded:
not added to its parent DOM.
- More options details are available with `parse/2`.
## Examples
Expand Down Expand Up @@ -287,10 +326,9 @@ defmodule SweetXml do
def stream_tags(doc, tags, options \\ []) do
tags = if is_atom(tags), do: [tags], else: tags

{discard_tags, xmerl_options} = if options[:discard] do
{options[:discard], Keyword.delete(options, :discard)}
else
{[], options}
{discard_tags, xmerl_options} = case :proplists.lookup(:discard, options) do
{:discard, tags} -> {tags, :proplists.delete(:discard, options)}
:none -> {[], options}
end

doc |> stream(fn emit ->
Expand Down Expand Up @@ -327,9 +365,9 @@ defmodule SweetXml do
- `doc` is an enumerable, data will be pulled during the result stream
enumeration. e.g. `File.stream!("some_file.xml")`
- `options_callback` is an anonymous function `fn emit -> xmerl_opts` use it to
- `options_callback` is an anonymous function `fn emit -> (xmerl_opts | opts)` use it to
define your :xmerl callbacks and put data into the stream using
`emit.(elem)` in the callbacks.
`emit.(elem)` in the callbacks. More details are available with `parse/2`.
For example, here you define a stream of all `xmlElement` :
Expand Down Expand Up @@ -358,33 +396,43 @@ defmodule SweetXml do
Stream.resource fn ->
{parent, ref} = waiter = {self(), make_ref()}
opts = options_callback.(fn e -> send(parent, {:event, ref, e}) end)

ets = :ets.new(nil, [:public])
dtd_arg = :proplists.get_value(:dtd, opts, :all)
opts = :proplists.delete(:dtd, opts)
opts = SweetXml.Options.handle_dtd(dtd_arg).(ets) ++ opts

pid = spawn_link fn -> :xmerl_scan.string('', opts ++ continuation_opts(doc, waiter)) end
{ref, pid, Process.monitor(pid)}
end, fn {ref, pid, monref} = acc ->
{ref, pid, Process.monitor(pid), ets}
end, fn {ref, pid, monref, ets} = acc ->
receive do
{:DOWN, ^monref, _, _, _} ->
{:halt, :parse_ended} ## !!! maybe do something when reason !== :normal
{:halt, {:parse_ended, ets}} ## !!! maybe do something when reason !== :normal
{:event, ^ref, event} ->
{[event], acc}
{:wait, ^ref} ->
send(pid, {:continue, ref})
{[], acc}
end
end, fn
:parse_ended -> :ok
{ref, pid, monref} ->
{:parse_ended, ets} ->
_ = :ets.delete(ets)
:ok

{ref, pid, monref, ets} ->
Process.demonitor(monref)
_ = :ets.delete(ets)
flush_halt(pid, ref)
end
end

@doc ~S"""
`xpath` allows you to query an XML document with xpath.
`xpath` allows you to query an XML document with XPath.
The second argument to xpath is a `SweetXpath` struct. The optional third
The second argument to xpath is a `%SweetXpath{}` struct. The optional third
argument is a keyword list, such that the value of each keyword is also
either a `SweetXpath` or a list with head being a `SweetXpath` and tail being
another keyword list exactly like before. Please see examples below for better
either a `%SweetXpath{}` or a list with head being a `%SweetXpath{}` and tail being
another keyword list exactly like before. Please see the examples below for better
understanding.
## Examples
Expand Down Expand Up @@ -417,32 +465,49 @@ defmodule SweetXml do
...> )
%{ul: %{a: 'Two'}}
## Security
Whenever you are working with some xml that was not generated by your system,
it is highly recommended that you restrain some functionalities of XML
during the parsing. SweetXml allows in particular to prevent DTD parsing and fetching.
Unless you know exactly what kind of DTD you want to permit in your xml,
it is recommended that you use the following code example to prevent possible attacks:
```
doc
|> parse(dtd: :none)
|> xpath(spec, subspec)
```
For more details, see `parse/2`.
"""
def xpath(parent, spec) when not is_tuple(parent) do
@spec xpath(parent :: (doc | xmlElement), spec, subspec) :: any
when subspec: keyword(spec | subspec)
def xpath(parent, spec, subspec \\ [])

def xpath(parent, spec, []) when not is_tuple(parent) do
parent |> parse |> xpath(spec)
end

def xpath(parent, %SweetXpath{is_list: true, is_value: true, cast_to: cast, is_optional: is_opt?} = spec) do
def xpath(parent, %SweetXpath{is_list: true, is_value: true, cast_to: cast, is_optional: is_opt?} = spec, []) do
get_current_entities(parent, spec) |> Enum.map(&(_value(&1)) |> to_cast(cast,is_opt?)) |> spec.transform_fun.()
end

def xpath(parent, %SweetXpath{is_list: true, is_value: false} = spec) do
def xpath(parent, %SweetXpath{is_list: true, is_value: false} = spec, []) do
get_current_entities(parent, spec) |> spec.transform_fun.()
end

def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: string_type, is_optional: is_opt?} = spec) when string_type in [:string,:soft_string] do
def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: string_type, is_optional: is_opt?} = spec, []) when string_type in [:string,:soft_string] do
spec = %SweetXpath{spec | is_list: true}
get_current_entities(parent, spec)
|> Enum.map(&(_value(&1) |> to_cast(string_type, is_opt?)))
|> Enum.join
|> spec.transform_fun.()
end

def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: cast, is_optional: is_opt?} = spec) do
def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: cast, is_optional: is_opt?} = spec, []) do
get_current_entities(parent, spec) |> _value |> to_cast(cast, is_opt?) |> spec.transform_fun.()
end

def xpath(parent, %SweetXpath{is_list: false, is_value: false} = spec) do
def xpath(parent, %SweetXpath{is_list: false, is_value: false} = spec, []) do
get_current_entities(parent, spec) |> spec.transform_fun.()
end

Expand All @@ -457,11 +522,13 @@ defmodule SweetXml do
end

@doc ~S"""
`xmap` returns a mapping with each value being the result of `xpath`
`xmap` returns a mapping with each value being the result of `xpath`.
Just as `xpath`, you can nest the mapping structure. Please see `xpath` for
Just as `xpath`, you can nest the mapping structure. Please see `xpath/3` for
more detail.
You can give the option `true` to get the result as a keyword list instead of a map.
## Examples
Simple:
Expand Down Expand Up @@ -509,8 +576,24 @@ defmodule SweetXml do
...> ]
...> ], true)
[message: 'Message', ul: %{a: 'Two'}]
## Security
Whenever you are working with some xml that was not generated by your system,
it is highly recommended that you restrain some functionalities of XML
during the parsing. SweetXml allows in particular to prevent DTD parsing and fetching.
Unless you know exactly what kind of DTD you want to permit in your xml,
it is recommended that you use the following code example to prevent possible attacks:
```
doc
|> parse(dtd: :none)
|> xmap(specs, options)
```
For more details, see `parse/2`.
"""
def xmap(parent, mapping), do: xmap(parent, mapping, %{is_keyword: false})
@spec xmap(parent :: (doc | xmlElement), mapping :: specs, options :: (boolean | map)) :: (map | keyword)
when specs: keyword(spec | specs)
def xmap(parent, mapping, options \\ false)

def xmap(nil, _, %{is_optional: true}), do: nil

Expand Down
56 changes: 56 additions & 0 deletions lib/sweet_xml/options.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
defmodule SweetXml.Options do
@moduledoc false

def handle_dtd(:all) do
fn _ -> [] end
end
def handle_dtd(:none) do
fn ets ->
handle_dtd(:internal_only).(ets) ++ handle_dtd(only: []).(ets)
end
end
def handle_dtd(:internal_only) do
fn _ ->
[fetch_fun: fn _, _ -> {:error, "no external entity allowed"} end]
end
end
def handle_dtd(only: entity) when is_atom(entity) do
handle_dtd(only: [entity])
end
def handle_dtd(only: entities) when is_list(entities) do
fn ets ->
read = fn
context, name, state ->
ets = :xmerl_scan.rules_state(state)
case :ets.lookup(ets, {context, name}) do
[] -> :undefined
[{_, value}] -> value
end
end

write = fn
:entity = context, name, value, state ->
_ = case name in entities do
true ->
ets = :xmerl_scan.rules_state(state)
_ = case :ets.lookup(ets, {context, name}) do
[] -> :ets.insert(ets, {{context, name}, value})
_ -> :ok
end
false -> raise("DTD not allowed: #{name}")
end
state

context, name, value, state ->
ets = :xmerl_scan.rules_state(state)
_ = case :ets.lookup(ets, {context, name}) do
[] -> :ets.insert(ets, {{context, name}, value})
_ -> :ok
end
state
end

[{:rules, read, write, ets}]
end
end
end
Loading

0 comments on commit 088b3be

Please sign in to comment.