-
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathdv_glossary.py
73 lines (57 loc) · 2.51 KB
/
dv_glossary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from typing import Any, Iterable, cast
from scrapy.http.response.html import HtmlResponse
from toolz.functoolz import pipe # type: ignore
from public_law import text
from ...metadata import Metadata, Subject
from ...models.glossary import GlossaryEntry, GlossaryParseResult
from ...text import URL, LoCSubject
from ...text import NonemptyString as String
from ...text import (Sentence, ensure_ends_with_period, make_soup,
normalize_nonempty)
def parse_glossary(html: HtmlResponse) -> GlossaryParseResult:
    """
    Build the parse result for the Australian Institute of Health and
    Welfare's family, domestic and sexual violence glossary.

    The metadata is fixed; only the entries are derived from `html`.
    """
    # Materialize the entries up front so the page is fully parsed
    # before any of the metadata is assembled.
    entries = tuple(__parse_entries(html))

    title = String("Family, domestic and sexual violence glossary")

    # Info about original source
    source = String(
        "https://www.aihw.gov.au/reports-data/behaviours-risk-factors/domestic-violence/glossary"
    )
    creator = String("Australian Institute of Health and Welfare")

    # One Library of Congress subject heading and one Wikidata item.
    family_violence = Subject(
        uri=LoCSubject("sh85047071"),
        rdfs_label=String("Family violence"),
    )
    domestic_violence = Subject(
        uri=URL("https://www.wikidata.org/wiki/Q156537"),
        rdfs_label=String("Domestic violence"),
    )

    metadata = Metadata(
        dcterms_title=title,
        dcterms_language="en",
        dcterms_coverage="AUS",
        dcterms_source=source,
        publiclaw_sourceModified="unknown",
        publiclaw_sourceCreator=creator,
        dcterms_subject=(family_violence, domestic_violence),
    )

    return GlossaryParseResult(metadata=metadata, entries=entries)
def __parse_entries(html: HtmlResponse) -> Iterable[GlossaryEntry]:
    """
    Yield one cleaned-up GlossaryEntry per raw (phrase, definition)
    pair found in `html`.

    TODO: Refactor into a parent class
    """
    for raw_phrase, raw_definition in __raw_entries(html):
        # Presumably removes trailing ':' and ' ' characters from the
        # phrase; `text.rstrip` is a project helper -- confirm there.
        clean_phrase = text.pipe(raw_phrase, text.rstrip(": "))  # type: ignore

        # The definition goes through the helpers left to right; the
        # cast is needed because toolz's `pipe` is untyped.
        clean_definition: Sentence = cast(
            Sentence,
            pipe(
                raw_definition,
                ensure_ends_with_period,
                normalize_nonempty,
                Sentence,
            ),
        )

        yield GlossaryEntry(clean_phrase, clean_definition)
def __raw_entries(response: HtmlResponse) -> Iterable[tuple[Any, Any]]:
    """
    The core of this parser: lazily produce the raw
    (phrase, definition) pairs, where the phrase is the `.string` of a
    paragraph's `.strong` element and the definition is the
    concatenated string form of everything following that element.

    TODO: Refactor all the glossary parsers to need only this function.
    """
    soup = make_soup(response)

    # Paragraphs without a `.strong` are skipped, and so is the one
    # whose text is exactly "Indigenous" -- presumably a sub-heading
    # rather than a glossary term; verify against the source page.
    terms = (
        tag
        for tag in (para.strong for para in soup.find_all("p"))
        if tag is not None and tag.string != "Indigenous"
    )

    return (
        (term.string, "".join(str(sibling) for sibling in term.next_siblings))
        for term in terms
    )