Skip to content

Commit

Permalink
Merge pull request #249 from flairNLP/add_la_times
Browse files Browse the repository at this point in the history
Add LA times [Based on #251]
  • Loading branch information
MaxDall authored Jul 11, 2023
2 parents e7e2c7d + 27458f2 commit 87dfb38
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 0 deletions.
17 changes: 17 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,23 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>LATimes</code>
</td>
<td>
<div>Los Angeles Times</div>
</td>
<td>
<a href="https://www.latimes.com/">
<span>www.latimes.com</span>
</a>
</td>
<td>
<code>topics</code>
</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>OccupyDemocrats</code>
Expand Down
8 changes: 8 additions & 0 deletions src/fundus/publishers/us/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .cnbc import CNBCParser
from .fox_news import FoxNewsParser
from .free_beacon import FreeBeaconParser
from .la_times import LATimesParser
from .occupy_democrats import OccupyDemocratsParser
from .reuters import ReutersParser
from .the_gateway_pundit import TheGatewayPunditParser
Expand Down Expand Up @@ -132,3 +133,10 @@ class US(PublisherEnum):
sources=[Sitemap(url="https://occupydemocrats.com/sitemap.xml", sitemap_filter=regex_filter(r"-tax-|-misc"))],
parser=OccupyDemocratsParser,
)

# Los Angeles Times: sources are the site's general sitemap and its news
# sitemap; articles are handled by LATimesParser.
LATimes = PublisherSpec(
    name="Los Angeles Times",
    domain="https://www.latimes.com/",
    sources=[
        Sitemap("https://www.latimes.com/sitemap.xml"),
        NewsMap("https://www.latimes.com/news-sitemap.xml"),
    ],
    parser=LATimesParser,
)
35 changes: 35 additions & 0 deletions src/fundus/publishers/us/la_times.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import datetime
from typing import List, Optional

from lxml.cssselect import CSSSelector

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
)


class LATimesParser(ParserProxy):
    """Parser proxy for Los Angeles Times (latimes.com) articles."""

    class V1(BaseParser):
        """First parser version: exposes body, publishing date, authors and title."""

        # Matches <p> elements that are direct children of a <div> whose
        # `data-element` attribute contains the substring "story-body".
        _paragraph_selector = CSSSelector("div[data-element*=story-body] > p")

        @attribute
        def body(self) -> ArticleBody:
            """Build the article body from the paragraphs matched by the selector."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(document, paragraph_selector=self._paragraph_selector)

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the `datePublished` value found in the precomputed `ld` data."""
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def authors(self) -> List[str]:
            """Parse the `author` value found in the precomputed `ld` data."""
            raw_authors = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def title(self) -> Optional[str]:
            """Return the `og:title` entry of the precomputed meta data."""
            meta = self.precomputed.meta
            return meta.get("og:title")
9 changes: 9 additions & 0 deletions tests/resources/parser/test_data/us/LATimes.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"V1": {
"authors": [
"Houston Mitchell"
],
"publishing_date": "2023-06-26 12:00:29.055000+00:00",
"title": "One hundred years at the Coliseum: Much more than a sports venue"
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/us/meta.info
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,9 @@
"OccupyDemocrats_2023_06_20.html.gz": {
"url": "https://occupydemocrats.com/2023/06/19/gop-house-speaker-in-north-carolina-accused-of-blackmail-and-group-sex/",
"crawl_date": "2023-06-20 23:47:49.443476"
},
"LATimes_2023_06_26.html.gz": {
"url": "https://www.latimes.com/sports/story/2023-06-26/100-years-los-angeles-coliseum-historical-events",
"crawl_date": "2023-06-26 22:35:21.753444"
}
}

0 comments on commit 87dfb38

Please sign in to comment.