Skip to content

Commit

Permalink
Clarified submodule documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
Anonyfox committed Jun 8, 2019
1 parent 28c549f commit 09a8a86
Show file tree
Hide file tree
Showing 12 changed files with 96 additions and 15 deletions.
10 changes: 10 additions & 0 deletions lib/scrape/flow.ex
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ defmodule Scrape.Flow do
@doc """
Initiate a new data processing flow with optional configuration.
NOTE: the options are currently not used but will be in upcoming versions.
## Example
iex> Flow.start()
%Flow{state: %{halted: false, error: nil}, assigns: %{}, options: []}
Expand Down Expand Up @@ -66,6 +68,14 @@ defmodule Scrape.Flow do
end
end

@doc """
Select keys from the flow assigns and return a map with the chosen fields.
Will result in an error object if the flow got halted previously.
"""

@spec finish(flow, [atom()]) :: {:ok, map()} | {:error, any()}

def finish(_flow, keys \\ [])

def finish(%__MODULE__{state: %{halted: true, error: error}}, _) do
Expand Down
7 changes: 7 additions & 0 deletions lib/scrape/ir/feed.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
defmodule Scrape.IR.Feed do
@moduledoc """
Information Retrieval implementations to extract data from feeds (RSS or Atom).
Makes intense use of `Scrape.Tools.Tree` and its functions to operate on
nested maps instead of raw XML documents.
"""

alias Scrape.Tools.Tree
alias Scrape.Tools.URL

Expand Down
5 changes: 5 additions & 0 deletions lib/scrape/ir/feed_item.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
defmodule Scrape.IR.FeedItem do
@moduledoc """
Similar (and used by) `Scrape.IR.Feed`, but has specialized selectors
to extract data from feed items/entries.
"""

alias Scrape.Tools.Tree
alias Scrape.Tools.URL

Expand Down
7 changes: 7 additions & 0 deletions lib/scrape/ir/html.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
defmodule Scrape.IR.HTML do
@moduledoc """
Information Retrieval functions for extracting data out of HTML documents.
Makes extensive use of `Scrape.Tools.DOM` under the hood, so a customized
jQuery-like approach can be taken.
"""

alias Scrape.Tools.DOM
alias Scrape.Tools.URL

Expand Down
40 changes: 26 additions & 14 deletions lib/scrape/ir/text.ex
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,17 @@ defmodule Scrape.IR.Text do
alias Scrape.IR.Text.TFIDF
alias Scrape.Tools.Word

@doc false
# TODO: my markov chain implementation belongs here.
# Until then this is an identity pass-through so callers can already
# depend on the function existing.
def generate_summary(text), do: text

@doc """
Dissect a text into sentences, weight their stemmed keywords against each other and
return the 3 semantically most important sentences.
"""

def extract_summary(text, start_words, language \\ :en) do
text
|> TFIDF.generate_database(language)
Expand Down Expand Up @@ -43,13 +50,13 @@ defmodule Scrape.IR.Text do
end

@doc """
Remove all occurrences of javascript from an HTML snippet.
Remove all occurrences of javascript from an HTML snippet.
Uses a regex (!)
Uses a regex (!)
## Example
iex> Scrape.IR.Text.without_js("a<script>b</script>c")
"ac"
## Example
iex> Scrape.IR.Text.without_js("a<script>b</script>c")
"ac"
"""

@spec without_js(String.t()) :: String.t()
Expand All @@ -60,11 +67,11 @@ defmodule Scrape.IR.Text do
end

@doc """
Strip all HTML tags from a text.
Strip all HTML tags from a text.
## Example
iex> Scrape.IR.Text.without_html("<p>stuff</p>")
"stuff"
## Example
iex> Scrape.IR.Text.without_html("<p>stuff</p>")
"stuff"
"""

@spec without_html(String.t()) :: String.t()
Expand All @@ -76,11 +83,11 @@ defmodule Scrape.IR.Text do
end

@doc """
A text paragraph shall not include any whitespace except single spaces
between words.
A text paragraph shall not include any whitespace except single spaces
between words.
## Example
iex> Scrape.IR.Text.normalize_whitespace("\r\thello world\r ")
## Example
iex> Scrape.IR.Text.normalize_whitespace("\\r\\thello world\\r ")
"hello world"
"""

Expand All @@ -97,7 +104,7 @@ defmodule Scrape.IR.Text do
Removes all junk from a given text, like javascript, html or mixed whitespace.
## Example
iex> Scrape.IR.Text.clean("\t hello, \r<b>world</b>!")
iex> Scrape.IR.Text.clean("\\t hello, \\r<b>world</b>!")
"hello, world!"
"""
def clean(text) do
Expand Down Expand Up @@ -145,6 +152,11 @@ defmodule Scrape.IR.Text do
|> Enum.filter(fn word -> Word.is_meaningful?(word, language) end)
end

@doc """
Similar to `semantic_tokenize/2`, but also determines the n (default: 20)
most relevant **stemmed** tokens from the list.
"""

def semantic_keywords(text, n \\ 20, language \\ :en) do
text
|> semantic_tokenize(language)
Expand Down
2 changes: 2 additions & 0 deletions lib/scrape/options.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
defmodule Scrape.Options do
@moduledoc false

@defaults num_stems: 30

def merge(opts \\ []) do
Expand Down
10 changes: 10 additions & 0 deletions lib/scrape/source/disk.ex
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
defmodule Scrape.Source.Disk do
@moduledoc """
Abstraction over the native `File` functions. Currently without additional logic.
"""

@doc """
Same as `File.read/1`.

Returns `{:ok, binary}` on success or `{:error, reason}` otherwise.
"""
def get(path), do: File.read(path)

@doc """
Same as `File.read!/1`.

Returns the file contents directly and raises `File.Error` on failure.
"""
def get!(path), do: File.read!(path)
Expand Down
22 changes: 22 additions & 0 deletions lib/scrape/source/http.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,32 @@ defmodule Scrape.Source.HTTP do
alias Scrape.Source.HTTP.Get
alias Scrape.Source.HTTP.Transcode

@doc """
Perform a HTTP GET request against the given url.

This function is optimized for *text*-based data, not binary like images.
It will try to normalize the response into valid utf-8 and transcode if
needed. Everything that is not a status code 200 with valid encoding will
result in some error object.

## Examples:
    iex> HTTP.get("http://example.com")
    {:ok, "some response"}
"""

@spec get(String.t()) :: {:ok, String.t()} | {:error, any()} | {:http_error, any()}

def get(url), do: url |> Get.execute() |> evaluate()

@doc """
Same as `get/1`, but will raise if the result is not `:ok`.
"""

@spec get!(String.t()) :: String.t()

def get!(url) do
{:ok, data} = get(url)
data
Expand Down
2 changes: 2 additions & 0 deletions lib/scrape/source/http/charset.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
defmodule Scrape.Source.HTTP.Charset do
@moduledoc false

def from_headers(headers) do
header =
headers
Expand Down
2 changes: 2 additions & 0 deletions lib/scrape/source/http/get.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
defmodule Scrape.Source.HTTP.Get do
@moduledoc false

@opts [
follow_redirect: true,
timeout: 33_000,
Expand Down
2 changes: 2 additions & 0 deletions lib/scrape/source/http/transcode.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
defmodule Scrape.Source.HTTP.Transcode do
@moduledoc false

def execute(charset, text) do
encoding = charset_to_encoding(charset)
{_status, result} = Codepagex.to_string(text, encoding)
Expand Down
2 changes: 1 addition & 1 deletion lib/scrape/tools/url.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ defmodule Scrape.Tools.URL do

@doc """
Rebase an URL to another root URL, useful for turning relative URLs into
aboslute ones.
absolute ones.
## Example
iex> URL.merge("/path", "http://example.com")
Expand Down

0 comments on commit 09a8a86

Please sign in to comment.