Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -3815,6 +3815,27 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>Ilanga</code>
</td>
<td>
<div>Ilanga</div>
</td>
<td>
<a href="https://www.ilanganews.co.za/">
<span>www.ilanganews.co.za</span>
</a>
</td>
<td>
<code>zu</code>
</td>
<td>
<code>topics</code>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>Isolezwe</code>
Expand Down
14 changes: 14 additions & 0 deletions src/fundus/publishers/za/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.za.daily_maverick import DailyMaverickParser
from fundus.publishers.za.dizindaba import DizindabaParser
from fundus.publishers.za.ilanga import IlangaParser
from fundus.publishers.za.independent_online import IndependentOnlineParser
from fundus.publishers.za.the_citizen import TheCitizenParser
from fundus.publishers.za.times_live import TimesLiveParser
Expand Down Expand Up @@ -59,6 +60,19 @@ class ZA(metaclass=PublisherGroup):
],
)

# Publisher entry for Ilanga; articles are parsed with IlangaParser and the
# single sitemap source is tagged with the language code "zu".
# NOTE(review): the sitemap host below omits the "www." prefix used in
# `domain` — confirm both hosts serve the same site.
# NOTE(review): presumably inverse(regex_filter("-posts-post-")) keeps only
# sitemaps whose URL contains "-posts-post-" — confirm against the filter helpers.
Ilanga = Publisher(
name="Ilanga",
domain="https://www.ilanganews.co.za/",
parser=IlangaParser,
sources=[
Sitemap(
"https://ilanganews.co.za/wp-sitemap.xml",
sitemap_filter=inverse(regex_filter("-posts-post-")),
languages={"zu"},
),
],
)

Isolezwe = Publisher(
name="Isolezwe",
domain="https://www.isolezwe.co.za/",
Expand Down
54 changes: 54 additions & 0 deletions src/fundus/publishers/za/ilanga.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import datetime
import re
from typing import List, Optional

from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
image_extraction,
strip_nodes_to_text,
)


class IlangaParser(ParserProxy):
    """Parser proxy for articles published by Ilanga (ilanganews.co.za)."""

    class V1(BaseParser):
        """First parser version; reads the HTML document and the precomputed LD+JSON."""

        # Paragraphs inside the post content that carry direct text and have no
        # <strong> child.
        _paragraph_selector = XPath("//div[contains(@class,'post_content')]//p[text() and not(strong)]")

        # <p> elements that are the first <p> of their parent inside the post
        # content and consist of a <strong> child without direct text.
        # NOTE(review): presumably this is the byline paragraph — confirm on live pages.
        _author_selector = XPath("(//div[contains(@class,'post_content')]//p[position()=1])[strong and not(text())]")

        # Site-name suffix removed from the headline; compiled once for all calls.
        _title_suffix_pattern = re.compile(r"(?i)\s*-\s*ilanga news")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Return the article body built from the nodes matched by the paragraph selector."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors, preferring the in-document match over LD+JSON.

            The text of the nodes matched by ``_author_selector`` is parsed first;
            only when that yields nothing is the LD+JSON ``author`` entry used.
            """
            if authors := generic_author_parsing(strip_nodes_to_text(self._author_selector(self.precomputed.doc))):
                return authors
            return generic_author_parsing(self.precomputed.ld.bf_search("author"))

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the publishing date parsed from the LD+JSON ``datePublished`` entry."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def title(self) -> Optional[str]:
            """Return the LD+JSON ``headline`` with the "- Ilanga News" suffix removed.

            Returns ``None`` when no string headline is available, rather than
            letting the regex substitution raise ``TypeError`` on a missing value.
            """
            headline = self.precomputed.ld.bf_search("headline")
            if not isinstance(headline, str):
                return None
            return self._title_suffix_pattern.sub("", headline)

        @attribute
        def images(self) -> List[Image]:
            """Return the images found within the post content.

            NOTE(review): the caption selector appears to be evaluated relative to
            each image node; it targets the paragraph directly following the
            enclosing ``<figure>`` when that paragraph has a ``<strong>`` child and
            no direct text — confirm against ``image_extraction``.
            """
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                upper_boundary_selector=XPath("//div[contains(@class,'post_content')]"),
                caption_selector=XPath(
                    "(./ancestor::figure/following-sibling::p[position()=1])[strong and not(text())]"
                ),
            )
87 changes: 87 additions & 0 deletions tests/resources/parser/test_data/za/Ilanga.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
{
"V1": {
"authors": [
"MTHOBISI SITHOLE"
],
"body": {
"summary": [],
"sections": [
{
"headline": [],
"paragraphs": [
"Kuthiwaligqinsile imbuzi efile, esiyona-kele emagcekeni aseMabaso High School kubhalwa izivivinyo zikamatikuletsheni ikhansela lendawo okuthiwa lisola lesi sikole ngokufa kwezimbuzi zalo ezingaphezu kuka-70. Kuthiwa ikhansela lendawo, uMnu Mhawukeleni Sokhela, lifike emagcekeneni esikole lithelwe ngezibonkolo, kuthi malimudle limfele uthishanhloko wesikole, uNkk Thabisile Dlamini.",
"Kuthiwa lisola isikole ngokubulala izimbuzi zalo ezidla intshela ye-rice eziyithola khona. Kubikwa ukuthi ngesonto eledlule uSokhela ungene esikoleni ngemoto kwashunqa izintuli, evutha engabaselwe. Kuvela ukuthi le nkinga yokufa kwezimbuzi zakhe, iqale nyakenye.",
"UNkk Dlamini uthi bahlaliseke kabi ngenxa yokuhlaselwa kwabo esikoleni ngumholi womphakathi.",
"“Ngiyazibuza ukuthi kungani ikhansela lingaqinisi isandla kumelusi walo. Ngiyafisa ukusizakala ngoba le nkinga isazoqhubeka nakomunye oyofika aphathe lesi sikole, ngisho sengathatha umhlalaphansi,” kusho uNkk Dlamini.",
"Utshele leli phephandaba ukuthi usebikile enkosini yesizwe emuva kokuba ewubhalele umnyango wezemfundo namaphoyisa ngalesi simo.",
"Uthe njengoba ikhansela lifuna ukukhokhelwa imfuyo yalo, isikole kasinayo imali eyabelwe ukwenza eminye imisebenzi engaphathelene nokufunda nokufundisa. “Isikole sibiyiwe, isango liyavalwa, kodwa ziyangena ngenxa yokuthi kunezimbobo ezivulwa yizingane nokuganga kwabantu endaweni. Uma kwenzeka sivala imbobo, izimbuzi zingaphakathi esikoleni kuyimpelasonto, sifica sekuvulwe enye imbobo, kwakhishwa izimbuzi,” esho.",
"Ethintwa uSokhela, uthe nyakenye ufelwe yizimbuzi eziwu-70 ngenxa ye-rice ezilidla esikoleni. Uthe ngesonto eledlule kufe eziwu-7. Likuqinisekisile kuleli phephandaba ukuthi liyifunile inkokhelo esikoleni ngezimbuzi zalo. “Izimbuzi zami zifela ngaphakathi esikoleni, bazithathe, bazihudelele ngaphandle kwesango, kube sengathi akukho nje okwenzekile.",
"“Ezinye zifela ekhaya. Iphelile imfuyo yami ngenxa yobudedengu besikole. NgoSeptember nyakenye, siluxoxile nesikole udaba, induna namaphoyisa kwacaca ukuthi icala lilahla isikole. Kulo mhlangano sivumelene ngokuthi i-rice malingalahlwa emagcekeni esikoleni, kodwa malinikezwe abafuyi bezinja naba-nezingulube.",
"“Bakwenzile-ke lokho nyakenye, njengoba sebeyekile, sekuqale phansi futhi ukufa kwemfuyo yami. Izimbuzi zami ziphelile, akekho umuntu ongavuma ukuphelelwa yimfuyo yakhe. Uma sezifile lezi zimbuzi, ngiyazihlinza ngifuna ukuqinisekisa ukuthi zibulawe yini, ngilifice i-rice,” kusho uSokhela.",
"Uthe kuwumsebenzi wesikole ukuqinisekisa ukuthi isikole kasinazo izimbobo nokugada imfuyo uma isingene ngaphakathi esikoleni.",
"INkosi yendawo uThokozani Mabaso, ithe isikole sibikile kuyona ngesonto eledlule. Ithe izobiza izinhlaka ezithintekayo nomkhandlu wesizwe senkosi, kubhungwe ngalolu daba ukuze kutholwe isixazululo.",
"Okhulumela uMnyango wezeMfundo KwaZulu-Natal, uMnu Muzi Mahlambi, ugcine engatholakalanga kwaze kwashaya isikhathi sokuloba."
]
}
]
},
"images": [
{
"versions": [
{
"url": "https://ilanganews.co.za/wp-content/uploads/2024/11/IKHANSELA-480x360.jpg",
"query_width": null,
"size": {
"width": 480,
"height": 360
},
"type": "image/jpeg"
},
{
"url": "https://ilanganews.co.za/wp-content/uploads/2024/11/IKHANSELA-980x735.jpg",
"query_width": null,
"size": {
"width": 980,
"height": 735
},
"type": "image/jpeg"
}
],
"is_cover": false,
"description": null,
"caption": "NGUMNU Mhawukeleni Sokhela.",
"authors": [],
"position": 306
},
{
"versions": [
{
"url": "https://ilanganews.co.za/wp-content/uploads/2024/11/IKHANSELA-1-480x360.jpg",
"query_width": null,
"size": {
"width": 480,
"height": 360
},
"type": "image/jpeg"
},
{
"url": "https://ilanganews.co.za/wp-content/uploads/2024/11/IKHANSELA-1-980x735.jpg",
"query_width": null,
"size": {
"width": 980,
"height": 735
},
"type": "image/jpeg"
}
],
"is_cover": false,
"description": null,
"caption": null,
"authors": [],
"position": 317
}
],
"publishing_date": "2024-11-05 13:11:53+02:00",
"title": "Ikhansela “lisola uthishanhloko” ngezimbuzi zalo ezifohla zidle i-rice lesikole zife"
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/za/meta.info
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
"url": "https://durbanlocal.co.za/entertainment/2025-06-23-we-love-romance-declared-at-rom-con/",
"crawl_date": "2025-11-09 00:45:15.046807"
},
"Ilanga_2025_12_03.html.gz": {
"url": "https://ilanganews.co.za/ikhansela-lisola-uthishanhloko-ngezimbuzi-zalo-ezifohla-zidle-i-rice-lesikole-zife/",
"crawl_date": "2025-12-03 00:57:47.763461"
},
"IsolezweLesiXhosa_2025_11_09.html.gz": {
"url": "https://isolezwelesixhosa.co.za/iindaba/2025-11-07-ugigaba-akabanjwanga-emva-kokutyelela-i-idac/",
"crawl_date": "2025-11-09 01:07:59.558667"
Expand Down