-
Notifications
You must be signed in to change notification settings - Fork 43
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added IR.FeedItem with first selectors
- Loading branch information
Showing
7 changed files
with
225 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
defmodule Scrape.IR.FeedItem do
  @moduledoc """
  Information-retrieval entry points for a single feed item.

  Every function accepts either an XML string or an already parsed tree and
  delegates the actual extraction to a dedicated module below
  `Scrape.IR.FeedItem`.
  """

  @typedoc """
  XML tree structure from `Floki.parse/1`
  """
  @type html_tree :: tuple | list

  @doc """
  Extract the (best) title from the feed item.

  ## Example

      iex> Scrape.IR.FeedItem.title("<feed><title>abc</title></feed>")
      "abc"
  """
  @spec title(String.t() | html_tree) :: String.t()
  defdelegate title(dom), to: Scrape.IR.FeedItem.Title, as: :execute

  @doc """
  Extract the (best) description from the feed item.

  ## Example

      iex> Scrape.IR.FeedItem.description("<feed><description>abc</description></feed>")
      "abc"
  """
  @spec description(String.t() | html_tree) :: String.t()
  defdelegate description(dom), to: Scrape.IR.FeedItem.Description, as: :execute

  @doc """
  Extract the article_url from the feed item.

  ## Example

      iex> Scrape.IR.FeedItem.article_url("<feed><link href='http://example.com' /></feed>")
      "http://example.com"
  """
  @spec article_url(String.t() | html_tree) :: String.t()
  defdelegate article_url(dom), to: Scrape.IR.FeedItem.ArticleURL, as: :execute
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
defmodule Scrape.IR.FeedItem.ArticleURL do
  @moduledoc false

  alias Scrape.IR.Query
  alias Scrape.IR.URL

  @spec execute(String.t() | [any()], String.t() | nil) :: String.t()

  # Looks up a link for the feed item and hands it, together with `url`, to
  # `URL.merge/2`. A missing (`nil`) or empty link yields "" instead.
  def execute(dom, url \\ "") do
    # The `href` attribute lookup is tried first, the plain `link` query second.
    # NOTE(review): `||` only falls back on nil/false, so an empty-string result
    # from the attribute lookup would skip the second query — confirm that
    # `Query.attr/4` returns nil when nothing matches.
    resolve(link_href(dom) || link_element(dom), url)
  end

  # Maps the "no link" results to "" and merges any other link with `url`.
  defp resolve(nil, _url), do: ""
  defp resolve("", _url), do: ""
  defp resolve(link, url), do: URL.merge(link, url)

  # First `link` element, as returned by `Query.find/3` (used for RSS items).
  defp link_element(dom), do: Query.find(dom, "link", :first)

  # `href` attribute of the first `link` element (used for Atom entries).
  defp link_href(dom), do: Query.attr(dom, "link", "href", :first)
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
defmodule Scrape.IR.FeedItem.Description do
  @moduledoc false

  alias Scrape.IR.Query
  alias Scrape.IR.Text

  # Element names that may carry an item's description.
  @selector "description, summary, content"

  @spec execute(String.t() | [any()]) :: String.t()

  # Takes the first match for the description selector and passes it through
  # `Text.clean/1`.
  def execute(dom) do
    dom
    |> Query.find(@selector, :first)
    |> Text.clean()
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
defmodule Scrape.IR.FeedItem.Title do
  @moduledoc false

  alias Scrape.IR.Query
  alias Scrape.IR.Text

  # Element name that carries an item's title.
  @selector "title"

  @spec execute(String.t() | [any()]) :: String.t()

  # Takes the first `title` match and passes it through `Text.clean/1`.
  def execute(dom) do
    dom
    |> Query.find(@selector, :first)
    |> Text.clean()
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
defmodule Scrape.IR.FeedItemTest do
  use ExUnit.Case

  alias Scrape.IR.FeedItem

  doctest FeedItem

  describe "FeedItem#title/1" do
    test "can extract from xml string of type atom" do
      xml = "<feed><entry><title>abc</title></entry></feed>"
      assert FeedItem.title(xml) == "abc"
    end

    test "can extract from xml string of type rss" do
      xml = "<feed><item><title>abc</title></item></feed>"
      assert FeedItem.title(xml) == "abc"
    end

    test "can extract from german atom feed" do
      item = first_node("heise.xml", "entry")
      assert FeedItem.title(item) =~ "Fachkräftemangel"
    end

    test "can extract from german rss feed" do
      item = first_node("spiegel.xml", "item")
      assert FeedItem.title(item) =~ "Schwertransporter"
    end

    test "can extract from english atom feed" do
      item = first_node("elixir-lang.xml", "entry")
      assert FeedItem.title(item) == "Elixir v1.0 released"
    end

    test "can extract from english rss feed" do
      item = first_node("latimes.xml", "item")
      assert FeedItem.title(item) =~ "Essential tracks"
    end
  end

  describe "FeedItem#description/1" do
    test "can extract from xml string of type atom" do
      xml = "<feed><entry><summary>abc</summary></entry></feed>"
      assert FeedItem.description(xml) == "abc"
    end

    test "can extract from xml string of type rss" do
      xml = "<rss><item><description>abc</description></item></rss>"
      assert FeedItem.description(xml) == "abc"
    end

    test "can extract from german atom feed" do
      item = first_node("heise.xml", "entry")
      assert FeedItem.description(item) =~ "730.000 Mitarbeiter"
    end

    test "can extract from german rss feed" do
      item = first_node("spiegel.xml", "item")
      assert FeedItem.description(item) =~ "Schweres Unglück in der Oberpfalz"
    end

    test "can extract from english atom feed" do
      item = first_node("elixir-lang.xml", "entry")
      assert FeedItem.description(item) =~ "Elixir v1.0 is finally out"
    end

    test "can extract from english rss feed" do
      item = first_node("latimes.xml", "item")
      assert FeedItem.description(item) =~ "high-energy party music"
    end
  end

  # Was labelled "website_url/1", although every test here calls article_url/1.
  describe "FeedItem#article_url/1" do
    test "can extract from xml string of type atom" do
      xml = "<feed><entry><link href='http://example.com' /></entry></feed>"
      assert FeedItem.article_url(xml) == "http://example.com"
    end

    test "can extract from xml string of type rss" do
      xml = "<rss><item><link>http://example.com</link></item></rss>"
      assert FeedItem.article_url(xml) == "http://example.com"
    end

    test "can extract from german atom feed" do
      item = first_node("heise.xml", "entry")
      assert FeedItem.article_url(item) =~ "https://www.heise.de/newsticker"
    end

    test "can extract from german rss feed" do
      item = first_node("spiegel.xml", "item")
      assert FeedItem.article_url(item) =~ "http://www.spiegel.de/panorama"
    end

    test "can extract from english atom feed" do
      item = first_node("elixir-lang.xml", "entry")
      assert FeedItem.article_url(item) =~ "http://elixir-lang.org/blog"
    end

    test "can extract from english rss feed" do
      item = first_node("latimes.xml", "item")
      assert FeedItem.article_url(item) =~ "http://www.latimes.com/la-et-ms"
    end
  end

  # Reads the fixture `cache/feed/<fixture>` and returns the first node that
  # matches `selector` (`"entry"` for the Atom fixtures, `"item"` for RSS).
  defp first_node(fixture, selector) do
    "cache/feed/#{fixture}"
    |> File.read!()
    |> Floki.find(selector)
    |> List.first()
  end
end