Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -3758,6 +3758,25 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>EyethuNews</code>
</td>
<td>
<div>Eyethu News</div>
</td>
<td>
<a href="https://www.eyethunews.co.za/">
<span>www.eyethunews.co.za</span>
</a>
</td>
<td>
<code>zu</code>
</td>
<td>&#160;</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>TimesLive</code>
Expand Down
15 changes: 15 additions & 0 deletions src/fundus/publishers/za/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.za.daily_maverick import DailyMaverickParser
from fundus.publishers.za.eyethu_news import EyethuNewsParser
from fundus.publishers.za.times_live import TimesLiveParser
from fundus.scraping.filter import inverse, regex_filter
from fundus.scraping.url import NewsMap, Sitemap
Expand All @@ -21,6 +22,20 @@ class ZA(metaclass=PublisherGroup):
],
)

# Eyethu News: registered with the "zu" language code and crawled through the site's XML sitemap.
# NOTE(review): `domain` uses the "www." host while the sitemap URL below does not —
# confirm both hosts resolve to the same site.
EyethuNews = Publisher(
name="Eyethu News",
domain="https://www.eyethunews.co.za/",
parser=EyethuNewsParser,
sources=[
Sitemap(
"https://eyethunews.co.za/sitemap.xml",
# presumably restricts crawling to the "/post-sitemap" sub-sitemaps;
# verify against the inverse()/regex_filter() semantics
sitemap_filter=inverse(regex_filter("/post-sitemap")),
languages={"zu"},
reverse=True,
),
],
)

TimesLive = Publisher(
name="Times Live",
domain="https://www.timeslive.co.za/",
Expand Down
63 changes: 63 additions & 0 deletions src/fundus/publishers/za/eyethu_news.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import datetime
import re
from typing import List, Optional

from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_nodes_to_text,
generic_topic_parsing,
image_extraction,
strip_nodes_to_text,
)


class EyethuNewsParser(ParserProxy):
    """Parser proxy for articles published by Eyethu News (eyethunews.co.za)."""

    class V1(BaseParser):
        """Parser version 1: reads the article DOM and the precomputed linked data."""

        # Body paragraphs: direct <p> children of the entry-content container that have
        # their own text and no direct <a> child, plus every <blockquote> in the document.
        _paragraph_selector = XPath("//div[contains(@class, 'entry-content')]/p[text() and not(a)] | //blockquote")
        # Summary: the sub-title <h2> of the entry.
        _summary_selector = XPath("//h2[@class='entry-sub-title']")
        # Sub-headlines: link-free <strong> elements inside entry-content <p> elements
        # that have neither direct text nor a direct <a> child.
        _subheadline_selector = XPath("//div[contains(@class, 'entry-content')]/p[not(text() or a)]/strong[not(a)]")

        # Byline: the meta-author <span> inside the page header.
        _author_selector = XPath("//header//span[@class='meta-author']")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            # Assemble the structured body from the summary, sub-headline and paragraph selectors.
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            # Turn the byline nodes into text and hand them to the generic author parser.
            byline_nodes = self._author_selector(self.precomputed.doc)
            byline_text = generic_nodes_to_text(byline_nodes)
            # presumably the filter removes "content ..." style non-person bylines;
            # verify against generic_author_parsing
            unwanted_byline = re.compile(r"(?i)content ")
            return generic_author_parsing(byline_text, result_filter=unwanted_byline)

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            # The date comes from the "datePublished" entry of the linked data.
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            # The title is the "headline" entry of the linked data.
            headline = self.precomputed.ld.bf_search("headline")
            return headline

        @attribute
        def topics(self) -> List[str]:
            # Topics are parsed from the "keywords" entry of the linked data.
            raw_keywords = self.precomputed.ld.bf_search("keywords")
            return generic_topic_parsing(raw_keywords)

        @attribute
        def images(self) -> List[Image]:
            # Candidate images: every <img> carrying an alt attribute inside <article>.
            article_images = XPath("//article//img[@alt]")
            # presumably marks where the article starts (the <h1>); verify against image_extraction
            article_start = XPath("//h1")
            # Photo credits are read from text following a case-insensitive "IZITHOMBE:" marker.
            # NOTE(review): captions spelled "Isithombe:" (singular) are not matched by this
            # pattern — confirm that is intended.
            credit_pattern = re.compile(r"(?i)IZITHOMBE:(?P<credits>.+)")
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                image_selector=article_images,
                upper_boundary_selector=article_start,
                author_selector=credit_pattern,
            )
61 changes: 61 additions & 0 deletions tests/resources/parser/test_data/za/EyethuNews.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"V1": {
"authors": [
"Ntombizethu Ngcobo"
],
"body": {
"summary": [
"Ngabona amanxeba amathathu, egxalabeni, emqaleni kanye nasemhlane."
],
"sections": [
{
"headline": [],
"paragraphs": [
"Uhlalele ovalweni umndeni wesaguga saseMbali Unit 14, okuthiwa sibhokodwe ngommese emzini waso eMbali Unit 14, eMgungundlovu yilungu lomndeni ngoLwesine olwedlule.Ngokuthola kwe- Echo Eyethu, uRejoice Ndlovu (64) utholwe edindilizile edamini legazi egumbini lakhe lokulala.",
"Umsolwa oneminyaka engu – 18 ube esekhalelwa ngamasongo kaSigonyela ngalo usuku lwesigameko.Indodakazi yakhe engathandanga ukudalulwa ithe bewumndeni kusenzima ukwamukela isihluku ekubulawe ngaso unina.",
"“Ngithole ucingo lapho engiqashe khona lungazisa ukuthi umama akaphilile. Ngamphakela ukudla ngase ngiphuthuma ekhaya. Uma ngifika, ngabona sekugcwele amaphoyisa kungasangeneki ngaphakathi.”“Ngahluleka nokuzibamba ngamemeza igama lomsolwa, ngambuza ukuthi usenzeni,” kusho yena. Ngafica umama elele edamini legazi lapho amugwazele khona.",
"Ngabona amanxeba amathathu, egxalabeni, emqaleni kanye nasemhlane. Ummese wawusasalele emhlane,” kusho yena.",
"Uthe emuva kwesigameko, umsolwa wahamba waya eMbali Unit 15 wabuya esehamba namaphoyisa beze endaweni yesigameko.",
"“Ngiyamsaba. Umama wayengeve emthanda umsolwa. Wamkhulisa. Asiphephile neze.Ngizobuya ngizohlala nodadewethu ekhaya, kodwa uma ededelwa ngizophinde ngihambe.Ukube angiyithathanga ingane kadadewethu ngabe hlampe nayo uyibulele. Asisamdingi lapha,” kusho yena.",
"Uphinde waveza ukuthi emasontweni ambalwa adlule umsolwa uke walwa nogogo kubangwa imali abembekela yona akade eyithola uma ebambe amatoho.",
"“Ngafika ekhaya kugcwele amabhodlela afile.Wayezama ukusakazi umama ngevazi, ngenhlanhla wamugeja kwasala umaka odongenni. Ziningi izinto azenzile okubalwa ngisho ukuntshontsha,” kusho yena.",
"“Sekuphele iminyaka umama ehlala lapha, kodwa akaze ahlaselwe ekhaya. Umsolwa usanda kuntshontsha umabonakude kanye ne – decoder. Sabika kodwa sangaluthola usizo esiludingayo.”Ungeze ngokuthi uzamile ukunqonda indlela leli lungu lomndeni eliphila ngayo.",
"“Ngibuze abangani bakhe ukuthi iziphi izidakamizwa azithandayo bangitshela ukuthi, insangu, xanax kanye nencika. U-aunt wamuhambisa kudokotela ukuze ahlolwe ukuthi akaphazamisekile yini emqondweni. Imiphumela yaveza ukuthi akaphazamisekile,” kusho yena.",
"Unina kamsolwa, naye ongathandanga ukudalulwa, uthe ufuna kwenzeke ubulungiswa ngokufa kuka mama wakhe.",
"“Bekumele abulale mina, ayi umama. Kubuhlungu. Besilokhu simvulela amacala uma kukhona akwenzile.Unefayela enkulu esiteshini samaphoyisa iPlessislaer. Ngiwunina, kodwa sengiyamsaba. Uhlukanise umndeni wethu phakathi kwaze kuba udadewethu omdala uyaphuma ekhaya uyoqasha. Ngizamile ukufuna usizo kodwa ngangaluthola,” kusho yena.",
"Ukhansela Busani Zuma uthe lesigameko sishiye amalungu omphakathi ethuthumela, ikakhulukazi njengoba senzeke emini. Unxuse umphakathi ukuba uweseke lomndeni kulesikhathi esinzima obhekene naso.",
"Okhulumela amaphoyisa KwaZulu Natal, uColonel Robert Netshiunda uqinisekisile ukuboshwa komsolwa. Uthe imbangela yokubulawa kwakhe ugogo ibingakaziwa."
]
}
]
},
"images": [
{
"versions": [
{
"url": "https://images.caxton.co.za/wp-content/uploads/sites/59/2025/11/Rejoice-Ndlovu-780x470-1.jpg",
"query_width": null,
"size": {
"width": 780,
"height": 470
},
"type": "image/jpeg"
}
],
"is_cover": true,
"description": null,
"caption": "Rejoice Ndlovu. Isithombe: Sithunyelwe",
"authors": [],
"position": 322
}
],
"publishing_date": "2025-11-06 10:15:38+00:00",
"title": "Isihluku kubulawa isaguga eMbali",
"topics": [
"eMbali",
"eMgungundlovu",
"kubulawe ugogo",
"umzukulu"
]
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/za/meta.info
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
"url": "https://www.dailymaverick.co.za/article/2025-05-03-icj-concludes-hearings-on-israel-aid-obligations-in-gaza/",
"crawl_date": "2025-05-22 13:24:27.994785"
},
"EyethuNews_2025_11_25.html.gz": {
"url": "https://eyethunews.co.za/119166/isihluku-kubulawa-isaguga-embali/",
"crawl_date": "2025-11-25 01:00:34.773578"
},
"TimesLive_2025_05_22.html.gz": {
"url": "https://www.timeslive.co.za/sunday-times-daily/business/2025-05-21-2025-budget-30-sars-gets-r4bn-to-hire-army-of-debt-collectors/",
"crawl_date": "2025-05-22 12:29:24.622820"
Expand Down