Skip to content

Commit

Permalink
added IR.FeedItem with first selectors
Browse files Browse the repository at this point in the history
  • Loading branch information
Anonyfox committed May 9, 2019
1 parent 4025c72 commit b986265
Show file tree
Hide file tree
Showing 7 changed files with 225 additions and 0 deletions.
42 changes: 42 additions & 0 deletions lib/scrape/ir/feed_item.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
defmodule Scrape.IR.FeedItem do
  @typedoc """
  XML tree structure from `Floki.parse/1`
  """
  @type html_tree :: tuple | list

  @doc """
  Extract the (best) title from the feed item.
  ## Example
      iex> Scrape.IR.FeedItem.title("<feed><title>abc</title></feed>")
      "abc"
  """
  @spec title(String.t() | html_tree) :: String.t()
  # Thin facade: the selector logic lives in the dedicated Title module.
  def title(dom), do: Scrape.IR.FeedItem.Title.execute(dom)

  @doc """
  Extract the (best) description from the feed item.
  ## Example
      iex> Scrape.IR.FeedItem.description("<feed><description>abc</description></feed>")
      "abc"
  """
  @spec description(String.t() | html_tree) :: String.t()
  # Facade over the Description selector module.
  def description(dom), do: Scrape.IR.FeedItem.Description.execute(dom)

  @doc """
  Extract the article_url from the feed item.
  ## Example
      iex> Scrape.IR.FeedItem.article_url("<feed><link href='http://example.com' /></feed>")
      "http://example.com"
  """
  @spec article_url(String.t() | html_tree) :: String.t()
  # Facade over the ArticleURL selector module.
  def article_url(dom), do: Scrape.IR.FeedItem.ArticleURL.execute(dom)
end
26 changes: 26 additions & 0 deletions lib/scrape/ir/feed_item/article_url.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
defmodule Scrape.IR.FeedItem.ArticleURL do
  @moduledoc false

  alias Scrape.IR.Query
  alias Scrape.IR.URL

  @spec execute(String.t() | [any()], String.t() | nil) :: String.t()

  # Extract the article URL from a feed item.
  #
  # Atom feeds carry the URL in the `href` attribute of `<link>`, RSS feeds
  # in the text content of `<link>`. Try the Atom form first, then fall back
  # to the RSS form; merge a relative result against `url` when given.
  #
  # Fix over the original: the atom selector may yield an empty string, which
  # is truthy in Elixir, so `format_atom(dom) || format_rss(dom)` would never
  # reach the RSS fallback. Blank results are normalized to `nil` first.
  def execute(dom, url \\ "") do
    link = presence(format_atom(dom)) || presence(format_rss(dom))

    case link do
      nil -> ""
      _ -> URL.merge(link, url)
    end
  end

  # Map "no result" values (nil or "") to nil so `||` falls through.
  defp presence(nil), do: nil
  defp presence(""), do: nil
  defp presence(value), do: value

  # RSS style: `<link>http://…</link>` — take the element's text.
  defp format_rss(dom) do
    Query.find(dom, "link", :first)
  end

  # Atom style: `<link href="http://…"/>` — take the href attribute.
  defp format_atom(dom) do
    Query.attr(dom, "link", "href", :first)
  end
end
12 changes: 12 additions & 0 deletions lib/scrape/ir/feed_item/description.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
defmodule Scrape.IR.FeedItem.Description do
  @moduledoc false

  alias Scrape.IR.Query
  alias Scrape.IR.Text

  @spec execute(String.t() | [any()]) :: String.t()

  # Pick the first description-like node (RSS `description`, Atom `summary`
  # or `content`) and clean the raw text before returning it.
  def execute(dom) do
    raw = Query.find(dom, "description, summary, content", :first)
    Text.clean(raw)
  end
end
12 changes: 12 additions & 0 deletions lib/scrape/ir/feed_item/title.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
defmodule Scrape.IR.FeedItem.Title do
  @moduledoc false

  alias Scrape.IR.Query
  alias Scrape.IR.Text

  @spec execute(String.t() | [any()]) :: String.t()

  # Take the first `<title>` match from the item and clean the raw text.
  def execute(dom) do
    raw = Query.find(dom, "title", :first)
    Text.clean(raw)
  end
end
4 changes: 4 additions & 0 deletions lib/scrape/ir/filter.ex
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ defmodule Scrape.IR.Filter do

# Choose the longest string from a list; pass a plain string through.
# Blank or non-string input yields nil. The `""` clause must precede the
# `is_binary/1` clause so empty strings are treated as "no result".
def longest([]), do: nil
def longest(strings) when is_list(strings), do: Enum.max_by(strings, &String.length/1)
def longest(""), do: nil
def longest(string) when is_binary(string), do: string
def longest(_), do: nil

Expand All @@ -30,6 +31,7 @@ defmodule Scrape.IR.Filter do

# Choose the shortest string from a list; pass a plain string through.
# Blank or non-string input yields nil (the `""` clause shadows `is_binary/1`).
def shortest([]), do: nil
def shortest(strings) when is_list(strings), do: Enum.min_by(strings, &String.length/1)
def shortest(""), do: nil
def shortest(string) when is_binary(string), do: string
def shortest(_), do: nil

Expand All @@ -43,6 +45,7 @@ defmodule Scrape.IR.Filter do
@spec first([String.t()] | String.t()) :: String.t() | nil

# Take the head of a list, pass a plain string through, and map blank or
# non-string input to nil. `""` must match before the `is_binary/1` clause.
def first([head | _rest]), do: head
def first(""), do: nil
def first(string) when is_binary(string), do: string
def first(_), do: nil

Expand All @@ -56,6 +59,7 @@ defmodule Scrape.IR.Filter do
@spec all([String.t()] | String.t() | nil) :: [String.t()] | String.t() | nil

# Deduplicate a list of strings, pass a plain string through, and map blank
# or non-string input to nil (`""` matched before `is_binary/1`).
def all(strings) when is_list(strings), do: Enum.uniq(strings)
def all(""), do: nil
def all(string) when is_binary(string), do: string
def all(_), do: nil
end
14 changes: 14 additions & 0 deletions lib/scrape/ir/text.ex
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,20 @@ defmodule Scrape.IR.Text do
|> String.trim()
end

@doc """
Removes all junk from a given text, like javascript, html or mixed whitespace.
## Example
    iex> Scrape.IR.Text.clean("\t hello, \r<b>world</b>!")
    "hello, world!"
"""
# Strips scripts, then markup, then collapses the remaining whitespace.
def clean(text) do
  no_js = without_js(text)
  no_html = without_html(no_js)
  normalize_whitespace(no_html)
end

@doc """
Dissect a text into word tokens.
Expand Down
115 changes: 115 additions & 0 deletions test/ir/feed_item_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
defmodule Scrape.IR.FeedItemTest do
  use ExUnit.Case

  alias Scrape.IR.FeedItem

  doctest FeedItem

  describe "FeedItem#title/1" do
    test "can extract from xml string of type atom" do
      xml = "<feed><entry><title>abc</title></entry></feed>"
      assert FeedItem.title(xml) == "abc"
    end

    test "can extract from xml string of type rss" do
      xml = "<feed><item><title>abc</title></item></feed>"
      assert FeedItem.title(xml) == "abc"
    end

    test "can extract from german atom feed" do
      xml = File.read!("cache/feed/heise.xml")
      item = xml |> Floki.find("entry") |> List.first()
      assert FeedItem.title(item) =~ "Fachkräftemangel"
    end

    test "can extract from german rss feed" do
      xml = File.read!("cache/feed/spiegel.xml")
      item = xml |> Floki.find("item") |> List.first()
      assert FeedItem.title(item) =~ "Schwertransporter"
    end

    test "can extract from english atom feed" do
      xml = File.read!("cache/feed/elixir-lang.xml")
      item = xml |> Floki.find("entry") |> List.first()
      assert FeedItem.title(item) == "Elixir v1.0 released"
    end

    test "can extract from english rss feed" do
      xml = File.read!("cache/feed/latimes.xml")
      item = xml |> Floki.find("item") |> List.first()
      assert FeedItem.title(item) =~ "Essential tracks"
    end
  end

  describe "FeedItem#description/1" do
    test "can extract from xml string of type atom" do
      xml = "<feed><entry><summary>abc</summary></entry></feed>"
      assert FeedItem.description(xml) == "abc"
    end

    test "can extract from xml string of type rss" do
      xml = "<rss><item><description>abc</description></item></rss>"
      assert FeedItem.description(xml) == "abc"
    end

    test "can extract from german atom feed" do
      xml = File.read!("cache/feed/heise.xml")
      item = xml |> Floki.find("entry") |> List.first()
      assert FeedItem.description(item) =~ "730.000 Mitarbeiter"
    end

    test "can extract from german rss feed" do
      xml = File.read!("cache/feed/spiegel.xml")
      item = xml |> Floki.find("item") |> List.first()
      assert FeedItem.description(item) =~ "Schweres Unglück in der Oberpfalz"
    end

    test "can extract from english atom feed" do
      xml = File.read!("cache/feed/elixir-lang.xml")
      item = xml |> Floki.find("entry") |> List.first()
      assert FeedItem.description(item) =~ "Elixir v1.0 is finally out"
    end

    test "can extract from english rss feed" do
      xml = File.read!("cache/feed/latimes.xml")
      item = xml |> Floki.find("item") |> List.first()
      assert FeedItem.description(item) =~ "high-energy party music"
    end
  end

  # NOTE(review): this describe was previously labeled "FeedItem#website_url/1",
  # but every test here exercises article_url/1 — label fixed to match.
  describe "FeedItem#article_url/1" do
    test "can extract from xml string of type atom" do
      xml = "<feed><entry><link href='http://example.com' /></entry></feed>"
      assert FeedItem.article_url(xml) == "http://example.com"
    end

    test "can extract from xml string of type rss" do
      xml = "<rss><item><link>http://example.com</link></item></rss>"
      assert FeedItem.article_url(xml) == "http://example.com"
    end

    test "can extract from german atom feed" do
      xml = File.read!("cache/feed/heise.xml")
      item = xml |> Floki.find("entry") |> List.first()
      assert FeedItem.article_url(item) =~ "https://www.heise.de/newsticker"
    end

    test "can extract from german rss feed" do
      xml = File.read!("cache/feed/spiegel.xml")
      item = xml |> Floki.find("item") |> List.first()
      assert FeedItem.article_url(item) =~ "http://www.spiegel.de/panorama"
    end

    test "can extract from english atom feed" do
      xml = File.read!("cache/feed/elixir-lang.xml")
      item = xml |> Floki.find("entry") |> List.first()
      assert FeedItem.article_url(item) =~ "http://elixir-lang.org/blog"
    end

    test "can extract from english rss feed" do
      xml = File.read!("cache/feed/latimes.xml")
      item = xml |> Floki.find("item") |> List.first()
      assert FeedItem.article_url(item) =~ "http://www.latimes.com/la-et-ms"
    end
  end
end

0 comments on commit b986265

Please sign in to comment.