Skip to content

Commit

Permalink
added IR.FeedItem with first selectors
Browse files Browse the repository at this point in the history
  • Loading branch information
Anonyfox committed May 9, 2019
1 parent 4025c72 commit b986265
Show file tree
Hide file tree
Showing 7 changed files with 225 additions and 0 deletions.
42 changes: 42 additions & 0 deletions lib/scrape/ir/feed_item.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
defmodule Scrape.IR.FeedItem do
  @typedoc """
  XML tree structure from `Floki.parse/1`
  """
  @type html_tree :: tuple | list

  @doc """
  Extract the (best) title from the feed item.
  ## Example
      iex> Scrape.IR.FeedItem.title("<feed><title>abc</title></feed>")
      "abc"
  """
  @spec title(String.t() | html_tree) :: String.t()
  # Thin facade: the selector logic lives in the dedicated Title module.
  def title(dom), do: Scrape.IR.FeedItem.Title.execute(dom)

  @doc """
  Extract the (best) description from the feed item.
  ## Example
      iex> Scrape.IR.FeedItem.description("<feed><description>abc</description></feed>")
      "abc"
  """
  @spec description(String.t() | html_tree) :: String.t()
  # Facade over the Description selector module.
  def description(dom), do: Scrape.IR.FeedItem.Description.execute(dom)

  @doc """
  Extract the article_url from the feed item.
  ## Example
      iex> Scrape.IR.FeedItem.article_url("<feed><link href='http://example.com' /></feed>")
      "http://example.com"
  """
  @spec article_url(String.t() | html_tree) :: String.t()
  # Facade over the ArticleURL selector module.
  def article_url(dom), do: Scrape.IR.FeedItem.ArticleURL.execute(dom)
end
26 changes: 26 additions & 0 deletions lib/scrape/ir/feed_item/article_url.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
defmodule Scrape.IR.FeedItem.ArticleURL do
  @moduledoc false

  alias Scrape.IR.Query
  alias Scrape.IR.URL

  @spec execute(String.t() | [any()], String.t() | nil) :: String.t()

  # Extract the article URL from a feed item.
  #
  # Atom feeds carry the URL in the `href` attribute of `<link>`, RSS feeds
  # in the text content of `<link>`. Try the Atom form first, then fall back
  # to the RSS form; merge a relative result against `url` when given.
  #
  # Fix over the original: the atom selector may yield an empty string, which
  # is truthy in Elixir, so `format_atom(dom) || format_rss(dom)` would never
  # reach the RSS fallback. Blank results are normalized to `nil` first.
  def execute(dom, url \\ "") do
    link = presence(format_atom(dom)) || presence(format_rss(dom))

    case link do
      nil -> ""
      _ -> URL.merge(link, url)
    end
  end

  # Map "no result" values (nil or "") to nil so `||` falls through.
  defp presence(nil), do: nil
  defp presence(""), do: nil
  defp presence(value), do: value

  # RSS style: `<link>http://…</link>` — take the element's text.
  defp format_rss(dom) do
    Query.find(dom, "link", :first)
  end

  # Atom style: `<link href="http://…"/>` — take the href attribute.
  defp format_atom(dom) do
    Query.attr(dom, "link", "href", :first)
  end
end
12 changes: 12 additions & 0 deletions lib/scrape/ir/feed_item/description.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
defmodule Scrape.IR.FeedItem.Description do
  @moduledoc false

  alias Scrape.IR.Query
  alias Scrape.IR.Text

  @spec execute(String.t() | [any()]) :: String.t()

  # Pick the first description-like node (RSS `description`, Atom `summary`
  # or `content`) and clean the raw text before returning it.
  def execute(dom) do
    raw = Query.find(dom, "description, summary, content", :first)
    Text.clean(raw)
  end
end
12 changes: 12 additions & 0 deletions lib/scrape/ir/feed_item/title.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
defmodule Scrape.IR.FeedItem.Title do
  @moduledoc false

  alias Scrape.IR.Query
  alias Scrape.IR.Text

  @spec execute(String.t() | [any()]) :: String.t()

  # Take the first `<title>` match from the item and clean the raw text.
  def execute(dom) do
    raw = Query.find(dom, "title", :first)
    Text.clean(raw)
  end
end
4 changes: 4 additions & 0 deletions lib/scrape/ir/filter.ex
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ defmodule Scrape.IR.Filter do

# Choose the longest string from a list; pass a plain string through.
# Blank or non-string input yields nil. The `""` clause must precede the
# `is_binary/1` clause so empty strings are treated as "no result".
def longest([]), do: nil
def longest(strings) when is_list(strings), do: Enum.max_by(strings, &String.length/1)
def longest(""), do: nil
def longest(string) when is_binary(string), do: string
def longest(_), do: nil

Expand All @@ -30,6 +31,7 @@ defmodule Scrape.IR.Filter do

# Choose the shortest string from a list; pass a plain string through.
# Blank or non-string input yields nil (the `""` clause shadows `is_binary/1`).
def shortest([]), do: nil
def shortest(strings) when is_list(strings), do: Enum.min_by(strings, &String.length/1)
def shortest(""), do: nil
def shortest(string) when is_binary(string), do: string
def shortest(_), do: nil

Expand All @@ -43,6 +45,7 @@ defmodule Scrape.IR.Filter do
@spec first([String.t()] | String.t()) :: String.t() | nil

# Take the head of a list, pass a plain string through, and map blank or
# non-string input to nil. `""` must match before the `is_binary/1` clause.
def first([head | _rest]), do: head
def first(""), do: nil
def first(string) when is_binary(string), do: string
def first(_), do: nil

Expand All @@ -56,6 +59,7 @@ defmodule Scrape.IR.Filter do
@spec all([String.t()] | String.t() | nil) :: [String.t()] | String.t() | nil

# Deduplicate a list of strings, pass a plain string through, and map blank
# or non-string input to nil (`""` matched before `is_binary/1`).
def all(strings) when is_list(strings), do: Enum.uniq(strings)
def all(""), do: nil
def all(string) when is_binary(string), do: string
def all(_), do: nil
end
14 changes: 14 additions & 0 deletions lib/scrape/ir/text.ex
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,20 @@ defmodule Scrape.IR.Text do
|> String.trim()
end

@doc """
Removes all junk from a given text, like javascript, html or mixed whitespace.
## Example
    iex> Scrape.IR.Text.clean("\t hello, \r<b>world</b>!")
    "hello, world!"
"""
# Strips scripts, then markup, then collapses the remaining whitespace.
def clean(text) do
  no_js = without_js(text)
  no_html = without_html(no_js)
  normalize_whitespace(no_html)
end

@doc """
Dissect a text into word tokens.
Expand Down
115 changes: 115 additions & 0 deletions test/ir/feed_item_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
defmodule Scrape.IR.FeedItemTest do
  use ExUnit.Case

  alias Scrape.IR.FeedItem

  doctest FeedItem

  describe "FeedItem#title/1" do
    test "can extract from xml string of type atom" do
      xml = "<feed><entry><title>abc</title></entry></feed>"
      assert FeedItem.title(xml) == "abc"
    end

    test "can extract from xml string of type rss" do
      xml = "<feed><item><title>abc</title></item></feed>"
      assert FeedItem.title(xml) == "abc"
    end

    test "can extract from german atom feed" do
      xml = File.read!("cache/feed/heise.xml")
      item = xml |> Floki.find("entry") |> List.first()
      assert FeedItem.title(item) =~ "Fachkräftemangel"
    end

    test "can extract from german rss feed" do
      xml = File.read!("cache/feed/spiegel.xml")
      item = xml |> Floki.find("item") |> List.first()
      assert FeedItem.title(item) =~ "Schwertransporter"
    end

    test "can extract from english atom feed" do
      xml = File.read!("cache/feed/elixir-lang.xml")
      item = xml |> Floki.find("entry") |> List.first()
      assert FeedItem.title(item) == "Elixir v1.0 released"
    end

    test "can extract from english rss feed" do
      xml = File.read!("cache/feed/latimes.xml")
      item = xml |> Floki.find("item") |> List.first()
      assert FeedItem.title(item) =~ "Essential tracks"
    end
  end

  describe "FeedItem#description/1" do
    test "can extract from xml string of type atom" do
      xml = "<feed><entry><summary>abc</summary></entry></feed>"
      assert FeedItem.description(xml) == "abc"
    end

    test "can extract from xml string of type rss" do
      xml = "<rss><item><description>abc</description></item></rss>"
      assert FeedItem.description(xml) == "abc"
    end

    test "can extract from german atom feed" do
      xml = File.read!("cache/feed/heise.xml")
      item = xml |> Floki.find("entry") |> List.first()
      assert FeedItem.description(item) =~ "730.000 Mitarbeiter"
    end

    test "can extract from german rss feed" do
      xml = File.read!("cache/feed/spiegel.xml")
      item = xml |> Floki.find("item") |> List.first()
      assert FeedItem.description(item) =~ "Schweres Unglück in der Oberpfalz"
    end

    test "can extract from english atom feed" do
      xml = File.read!("cache/feed/elixir-lang.xml")
      item = xml |> Floki.find("entry") |> List.first()
      assert FeedItem.description(item) =~ "Elixir v1.0 is finally out"
    end

    test "can extract from english rss feed" do
      xml = File.read!("cache/feed/latimes.xml")
      item = xml |> Floki.find("item") |> List.first()
      assert FeedItem.description(item) =~ "high-energy party music"
    end
  end

  # NOTE(review): this describe was previously labeled "FeedItem#website_url/1",
  # but every test here exercises article_url/1 — label fixed to match.
  describe "FeedItem#article_url/1" do
    test "can extract from xml string of type atom" do
      xml = "<feed><entry><link href='http://example.com' /></entry></feed>"
      assert FeedItem.article_url(xml) == "http://example.com"
    end

    test "can extract from xml string of type rss" do
      xml = "<rss><item><link>http://example.com</link></item></rss>"
      assert FeedItem.article_url(xml) == "http://example.com"
    end

    test "can extract from german atom feed" do
      xml = File.read!("cache/feed/heise.xml")
      item = xml |> Floki.find("entry") |> List.first()
      assert FeedItem.article_url(item) =~ "https://www.heise.de/newsticker"
    end

    test "can extract from german rss feed" do
      xml = File.read!("cache/feed/spiegel.xml")
      item = xml |> Floki.find("item") |> List.first()
      assert FeedItem.article_url(item) =~ "http://www.spiegel.de/panorama"
    end

    test "can extract from english atom feed" do
      xml = File.read!("cache/feed/elixir-lang.xml")
      item = xml |> Floki.find("entry") |> List.first()
      assert FeedItem.article_url(item) =~ "http://elixir-lang.org/blog"
    end

    test "can extract from english rss feed" do
      xml = File.read!("cache/feed/latimes.xml")
      item = xml |> Floki.find("item") |> List.first()
      assert FeedItem.article_url(item) =~ "http://www.latimes.com/la-et-ms"
    end
  end
end

0 comments on commit b986265

Please sign in to comment.