Skip to content

Commit

Permalink
Clarified submodule documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
Anonyfox committed Jun 8, 2019
1 parent 28c549f commit 09a8a86
Show file tree
Hide file tree
Showing 12 changed files with 96 additions and 15 deletions.
10 changes: 10 additions & 0 deletions lib/scrape/flow.ex
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ defmodule Scrape.Flow do
@doc """
Initiate a new data processing flow with optional configuration.
NOTE: the options are currently not used but will be in upcoming versions.
## Example
iex> Flow.start()
%Flow{state: %{halted: false, error: nil}, assigns: %{}, options: []}
Expand Down Expand Up @@ -66,6 +68,14 @@ defmodule Scrape.Flow do
end
end

@doc """
Select keys from the flow assigns and return a map with the chosen fields.
Will result in an error object if the flow got halted previously.
"""

@spec finish(flow, [atom()]) :: {:ok, map()} | {:error, any()}

def finish(_flow, keys \\ [])

def finish(%__MODULE__{state: %{halted: true, error: error}}, _) do
Expand Down
7 changes: 7 additions & 0 deletions lib/scrape/ir/feed.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
defmodule Scrape.IR.Feed do
@moduledoc """
Information Retrieval implementations to extract data from feeds (RSS or Atom).
Makes intense use of `Scrape.Tools.Tree` and its functions to operate on
nested maps instead of raw XML documents.
"""

alias Scrape.Tools.Tree
alias Scrape.Tools.URL

Expand Down
5 changes: 5 additions & 0 deletions lib/scrape/ir/feed_item.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
defmodule Scrape.IR.FeedItem do
@moduledoc """
Similar (and used by) `Scrape.IR.Feed`, but has specialized selectors
to extract data from feed items/entries.
"""

alias Scrape.Tools.Tree
alias Scrape.Tools.URL

Expand Down
7 changes: 7 additions & 0 deletions lib/scrape/ir/html.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
defmodule Scrape.IR.HTML do
@moduledoc """
Information Retrieval functions for extracting data out of HTML documents.
Makes extensive use of `Scrape.Tools.DOM` under the hood, so a customized
jQuery-like approach can be taken.
"""

alias Scrape.Tools.DOM
alias Scrape.Tools.URL

Expand Down
40 changes: 26 additions & 14 deletions lib/scrape/ir/text.ex
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,17 @@ defmodule Scrape.IR.Text do
alias Scrape.IR.Text.TFIDF
alias Scrape.Tools.Word

@doc false
# TODO: my markov chain implementation belongs here.
# Until then this is an identity pass-through so callers can already
# depend on the function existing.
def generate_summary(text), do: text

@doc """
Dissect a text into sentences, weight their stemmed keywords against each other and
return the 3 semantically most important sentences.
"""

def extract_summary(text, start_words, language \\ :en) do
text
|> TFIDF.generate_database(language)
Expand Down Expand Up @@ -43,13 +50,13 @@ defmodule Scrape.IR.Text do
end

@doc """
Remove all occurrences of javascript from an HTML snippet.
Remove all occurrences of javascript from an HTML snippet.
Uses a regex (!)
Uses a regex (!)
## Example
iex> Scrape.IR.Text.without_js("a<script>b</script>c")
"ac"
## Example
iex> Scrape.IR.Text.without_js("a<script>b</script>c")
"ac"
"""

@spec without_js(String.t()) :: String.t()
Expand All @@ -60,11 +67,11 @@ defmodule Scrape.IR.Text do
end

@doc """
Strip all HTML tags from a text.
Strip all HTML tags from a text.
## Example
iex> Scrape.IR.Text.without_html("<p>stuff</p>")
"stuff"
## Example
iex> Scrape.IR.Text.without_html("<p>stuff</p>")
"stuff"
"""

@spec without_html(String.t()) :: String.t()
Expand All @@ -76,11 +83,11 @@ defmodule Scrape.IR.Text do
end

@doc """
A text paragraph shall not include any whitespace except single spaces
between words.
A text paragraph shall not include any whitespace except single spaces
between words.
## Example
iex> Scrape.IR.Text.normalize_whitespace("\r\thello world\r ")
## Example
iex> Scrape.IR.Text.normalize_whitespace("\\r\\thello world\\r ")
"hello world"
"""

Expand All @@ -97,7 +104,7 @@ defmodule Scrape.IR.Text do
Removes all junk from a given text, like javascript, html or mixed whitespace.
## Example
iex> Scrape.IR.Text.clean("\t hello, \r<b>world</b>!")
iex> Scrape.IR.Text.clean("\\t hello, \\r<b>world</b>!")
"hello, world!"
"""
def clean(text) do
Expand Down Expand Up @@ -145,6 +152,11 @@ defmodule Scrape.IR.Text do
|> Enum.filter(fn word -> Word.is_meaningful?(word, language) end)
end

@doc """
Similar to `semantic_tokenize/2`, but also determines the n (default: 20)
most relevant **stemmed** tokens from the list.
"""

def semantic_keywords(text, n \\ 20, language \\ :en) do
text
|> semantic_tokenize(language)
Expand Down
2 changes: 2 additions & 0 deletions lib/scrape/options.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
defmodule Scrape.Options do
@moduledoc false

@defaults num_stems: 30

def merge(opts \\ []) do
Expand Down
10 changes: 10 additions & 0 deletions lib/scrape/source/disk.ex
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
defmodule Scrape.Source.Disk do
@moduledoc """
Abstraction over the native `File` functions. Currently without additional logic.
"""

@doc """
Same as `File.read/1`.

Returns `{:ok, binary}` on success or `{:error, reason}` otherwise.
"""
def get(path), do: File.read(path)

@doc """
Same as `File.read!/1`.

Returns the file contents directly and raises `File.Error` on failure.
"""
def get!(path), do: File.read!(path)
Expand Down
22 changes: 22 additions & 0 deletions lib/scrape/source/http.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,32 @@ defmodule Scrape.Source.HTTP do
alias Scrape.Source.HTTP.Get
alias Scrape.Source.HTTP.Transcode

@doc """
Perform a HTTP GET request against the given url.

This function is optimized for *text*-based data, not binary like images.
It will try to normalize the response into valid utf-8 and transcode if
needed. Everything that is not a status code 200 with valid encoding will
result in some error object.

## Examples:
    iex> HTTP.get("http://example.com")
    {:ok, "some response"}
"""

@spec get(String.t()) :: {:ok, String.t()} | {:error, any()} | {:http_error, any()}

def get(url), do: url |> Get.execute() |> evaluate()

@doc """
Same as `get/1`, but will raise if the result is not `:ok`.
"""

@spec get!(String.t()) :: String.t()

def get!(url) do
{:ok, data} = get(url)
data
Expand Down
2 changes: 2 additions & 0 deletions lib/scrape/source/http/charset.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
defmodule Scrape.Source.HTTP.Charset do
@moduledoc false

def from_headers(headers) do
header =
headers
Expand Down
2 changes: 2 additions & 0 deletions lib/scrape/source/http/get.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
defmodule Scrape.Source.HTTP.Get do
@moduledoc false

@opts [
follow_redirect: true,
timeout: 33_000,
Expand Down
2 changes: 2 additions & 0 deletions lib/scrape/source/http/transcode.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
defmodule Scrape.Source.HTTP.Transcode do
@moduledoc false

def execute(charset, text) do
encoding = charset_to_encoding(charset)
{_status, result} = Codepagex.to_string(text, encoding)
Expand Down
2 changes: 1 addition & 1 deletion lib/scrape/tools/url.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ defmodule Scrape.Tools.URL do

@doc """
Rebase an URL to another root URL, useful for turning relative URLs into
aboslute ones.
absolute ones.
## Example
iex> URL.merge("/path", "http://example.com")
Expand Down

0 comments on commit 09a8a86

Please sign in to comment.