Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -3853,6 +3853,25 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>TheCitizen</code>
</td>
<td>
<div>The Citizen</div>
</td>
<td>
<a href="https://www.citizen.co.za/">
<span>www.citizen.co.za</span>
</a>
</td>
<td>
<code>en</code>
</td>
<td>&#160;</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>TimesLive</code>
Expand Down
14 changes: 14 additions & 0 deletions src/fundus/publishers/za/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from fundus.publishers.za.daily_maverick import DailyMaverickParser
from fundus.publishers.za.dizindaba import DizindabaParser
from fundus.publishers.za.independent_online import IndependentOnlineParser
from fundus.publishers.za.the_citizen import TheCitizenParser
from fundus.publishers.za.times_live import TimesLiveParser
from fundus.scraping.filter import inverse, lor, regex_filter
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap
Expand Down Expand Up @@ -81,3 +82,16 @@ class ZA(metaclass=PublisherGroup):
),
],
)

# Publisher entry for The Citizen (www.citizen.co.za); pages are parsed with TheCitizenParser.
TheCitizen = Publisher(
name="The Citizen",
domain="https://www.citizen.co.za/",
parser=TheCitizenParser,
sources=[
# The only configured source is the site's sitemap index.
Sitemap(
"https://www.citizen.co.za/sitemap_index.xml",
reverse=True,
# NOTE(review): this is the inverse of a regex match on "/post-sitemap";
# presumably it restricts crawling to the post sitemaps listed in the index.
# Confirm against the Sitemap filter semantics.
sitemap_filter=inverse(regex_filter("/post-sitemap")),
),
],
)
58 changes: 58 additions & 0 deletions src/fundus/publishers/za/the_citizen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import datetime
import re
from typing import List, Optional

from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
image_extraction,
)


class TheCitizenParser(ParserProxy):
    """Parser proxy for articles published by The Citizen (www.citizen.co.za)."""

    class V1(BaseParser):
        """Parser version 1: body and images come from the HTML, everything else from the ``ld`` data."""

        # <p> descendants of the "single-content" div; the string-length test
        # looks at the paragraph's first text node and requires more than two characters.
        _paragraph_selector = XPath("//div[@class='single-content']//p[string-length(text())>2]")
        # <h2> directly inside the "single-excerpt" div.
        _summary_selector = XPath("//div[@class='single-excerpt']/h2")
        # <h2> elements that are direct children of the "single-content" div.
        _subheadline_selector = XPath("//div[@class='single-content']/h2")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Extract the article body using the summary, subheadline and paragraph selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Parse the authors from the ``author`` entry found in the precomputed ``ld`` data."""
            raw_authors = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the publishing date from the ``datePublished`` entry of the ``ld`` data."""
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``headline`` entry of the ``ld`` data without further processing."""
            headline = self.precomputed.ld.bf_search("headline")
            return headline

        @attribute
        def topics(self) -> List[str]:
            """Parse the topics from the ``keywords`` entry of the ``ld`` data."""
            raw_keywords = self.precomputed.ld.bf_search("keywords")
            return generic_topic_parsing(raw_keywords)

        @attribute
        def images(self) -> List[Image]:
            """Extract images located in ``featured-image`` containers, with captions and credits."""
            # <img> directly inside a div whose class contains "featured-image".
            featured_image = XPath("//div[contains(@class, 'featured-image')]/img")
            # Caption paragraphs, resolved relative to the image via its enclosing
            # "featured-image" div.
            featured_caption = XPath(
                "./ancestor::div[contains(@class, 'featured-image')]//div[contains(@class, 'image-caption')]//p"
            )
            # Case-insensitive credit prefixes; the remainder is captured as "credits".
            credit_pattern = re.compile(r"(?i)(image courtesy( of)?\s*|image|picture):?(?P<credits>.+)")
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                image_selector=featured_image,
                caption_selector=featured_caption,
                author_selector=credit_pattern,
            )
118 changes: 118 additions & 0 deletions tests/resources/parser/test_data/za/TheCitizen.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
{
"V1": {
"authors": [
"Enkosi Selane"
],
"body": {
"summary": [
"MEC Mamabolo urged the public to limit travel during severe weather conditions."
],
"sections": [
{
"headline": [],
"paragraphs": [
"The Gauteng Provincial Government has issued urgent warnings to residents as severe summer weather continues to intensify, with persistent heavy rainfall, powerful thunderstorms and localised flooding creating hazardous conditions across the province.",
"Driven by a developing La Niña pattern, fast-moving storm systems are threatening communities throughout Gauteng.",
"The government has confirmed that while disaster structures remain fully activated, safety risks are escalating and residents must strictly adhere to official alerts to prevent injuries, fatalities and avoidable damage."
]
},
{
"headline": [
"Provincial disaster teams on high alert"
],
"paragraphs": [
"MEC for Cooperative Governance and Traditional Affairs, Jacob Mamabolo, said the Gauteng Provincial Disaster Management Centre (PDMC) briefed the Local Government Turnaround Strategy weekly meeting on Thursday evening, reviewing the province’s state of readiness in response to the abnormal rainfall.",
"“The session reviewed early impacts, assessed municipal response capacity and confirmed the activation of emergency protocols to safeguard communities as weather conditionsdeteriorate,” he stated.",
"Additionally, Mamabolo confirmed that the PDMC and all municipal disaster management centres remain on full alert, with continuous monitoring of storm activity and rapid-response teams on standby.",
"“The storms we are experiencing are unpredictable and fast-moving. We urge all residents to stay vigilant, follow official updates and avoid any situation that may place their lives at risk.”",
"The MEC emphasised that the combination of abnormal weather patterns and already saturated ground conditions requires communities to treat every warning with the utmost seriousness.",
"He urged residents to avoid flooded areas and immediately report hazardous situations.",
"Mamabolo emphasised the need to steer clear of low-lying bridges, flooded roads, and underpasses, and urged the public to limit travel during severe weather conditions.",
"Furthermore, he encouraged communities to report blocked drains, sinkholes, damaged roads, burst pipes, or any emerging hazards so that response teams can act quickly.",
"“Warnings issued by the South African Weather Service and the PDMC must be treated as protective measures as they are designed to save lives,” he stated.",
"According to Gauteng Cogta, the province strengthened its early warning capability to ensure real-time weather alerts reach residents through WhatsApp, SMS, email and social media channels.",
"“The province’s summer readiness plan, approved by the National Disaster Management Centre, has introduced a uniform disaster-response system across all municipalities to support quicker activation and more coordinated emergency action,” the department detailed.",
"Mamabolo said the ongoing work across disaster preparedness, infrastructure and community safety reflects a disciplined, province-wide effort to stabilise municipalities and protect communities during this critical period.",
"“Our priority is to safeguard lives during this period of abnormal weather while continuing to strengthen service delivery across all municipalities. Gauteng is acting with urgency, and we will continue issuing updates to keep the public informed and safe,” he said.",
"The provincial department pledged to issue regular advisories throughout the summer season and urged residents to stay vigilant, follow official guidance, and prioritise safety at all times."
]
}
]
},
"images": [
{
"versions": [
{
"url": "https://media.citizen.co.za/wp-content/uploads/2025/11/cold-rainy-day-300x200.jpg",
"query_width": null,
"size": {
"width": 300,
"height": 200
},
"type": "image/jpeg"
},
{
"url": "https://media.citizen.co.za/wp-content/uploads/2025/11/cold-rainy-day-456x304.jpg",
"query_width": null,
"size": {
"width": 456,
"height": 304
},
"type": "image/jpeg"
},
{
"url": "https://media.citizen.co.za/wp-content/uploads/2025/11/cold-rainy-day-768x512.jpg",
"query_width": null,
"size": {
"width": 768,
"height": 512
},
"type": "image/jpeg"
},
{
"url": "https://media.citizen.co.za/wp-content/uploads/2025/11/cold-rainy-day-825x550.jpg",
"query_width": null,
"size": {
"width": 825,
"height": 550
},
"type": "image/jpeg"
},
{
"url": "https://media.citizen.co.za/wp-content/uploads/2025/11/cold-rainy-day-1200x800.jpg",
"query_width": null,
"size": {
"width": 1200,
"height": 800
},
"type": "image/jpeg"
},
{
"url": "https://media.citizen.co.za/wp-content/uploads/2025/11/cold-rainy-day.jpg",
"query_width": null,
"size": {
"width": 1500,
"height": 1000
},
"type": "image/jpeg"
}
],
"is_cover": true,
"description": "Gauteng weekend weather",
"caption": null,
"authors": [
"iStock"
],
"position": 476
}
],
"publishing_date": "2025-11-28 13:26:06+00:00",
"title": "Severe storms escalate in Gauteng, residents urged to avoid flood risks",
"topics": [
"Cooperative Governance & Traditional Affairs (CoGTA)",
"Gauteng",
"Jacob Boy Mamabolo",
"weather"
]
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/za/meta.info
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
"url": "https://isolezwe.co.za/izindaba/2025-11-08-kubukeka-kukude-phambili-amaphara-edla-umzila-ethekwini/",
"crawl_date": "2025-11-09 00:51:18.108142"
},
"TheCitizen_2025_11_28.html.gz": {
"url": "https://www.citizen.co.za/news/south-africa/severe-storms-gauteng-residents-flood-risks/",
"crawl_date": "2025-11-28 14:43:08.646278"
},
"TimesLive_2025_05_22.html.gz": {
"url": "https://www.timeslive.co.za/sunday-times-daily/business/2025-05-21-2025-budget-30-sars-gets-r4bn-to-hire-army-of-debt-collectors/",
"crawl_date": "2025-05-22 12:29:24.622820"
Expand Down