Skip to content

Commit

Permalink
Merge pull request #926 from marwoodandrew/SDAAP-103
Browse files: browse the repository at this point in the history
SDAAP-103 Headline length exceeded when publishing to BOB
  • Loading branch information
marwoodandrew authored Jan 24, 2024
2 parents f6073f1 + 1a3a320 commit 3c9ef4c
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def format(self, article, subscriber, codes=None):
formatted_article['abstract'] = self.get_text_content(
to_ascii(formatted_article.get('abstract', '') or '')).strip()
formatted_article['headline'] = self.get_text_content(
to_ascii(formatted_article.get('headline', ''))).strip()
to_ascii(formatted_article.get('headline', '')), space_on_elements=False).strip()
formatted_article['byline'] = self.get_text_content(
to_ascii(formatted_article.get('byline', '') or '')).strip()

Expand Down Expand Up @@ -116,7 +116,7 @@ def format(self, article, subscriber, codes=None):
def can_format(self, format_type, article):
    """Tell whether this formatter handles the requested format type.

    :param format_type: name of the output format being requested
    :param article: the item to be formatted; not consulted by this check
    :return: True only when ``format_type`` equals 'AAP BULLETIN BUILDER'
    """
    supported_format = 'AAP BULLETIN BUILDER'
    return format_type == supported_format

def get_text_content(self, content):
def get_text_content(self, content, space_on_elements=True):
content = content.replace('<br>', '<br/>').replace('</br>', '')
# remove control chars except \n
content = re.sub('[\x00-\x09\x0b-\x1f]', '', content)
Expand All @@ -125,7 +125,7 @@ def get_text_content(self, content):
if content == '':
return ''

parsed = parse_html(content, content='html', space_on_elements=True)
parsed = parse_html(content, content='html', space_on_elements=space_on_elements)

# breaks are replaced with spaces
for br in parsed.xpath('//br'):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -786,3 +786,59 @@ def test_embedded_item(self):
self.assertGreater(int(seq), 0)
test_article = json.loads(item.get('data'))
self.assertEqual(test_article['body_html'], '<p>pre amble</p><p>post amble</p>')

def test_clean_headline_html(self):
    """Format an article whose headline carries draft-js metadata.

    The headline is 55 plain characters; the formatted output is expected
    to contain exactly the same 55-character headline.
    """
    headline_text = '1234567890123456789012345123456789012345678901234567890'

    # Inline styles recorded for the headline: BOLD covers all 55
    # characters, and the last character is flagged as overflowing the limit.
    style_ranges = [
        {
            "offset": 0,
            "length": 55,
            "style": "BOLD"
        },
        {
            "offset": 54,
            "length": 1,
            "style": "LIMIT_CHARACTERS_OVERFLOW"
        }
    ]

    headline_block = {
        "key": "2fvvl",
        "text": headline_text,
        "type": "unstyled",
        "depth": 0,
        "inlineStyleRanges": style_ranges,
        "entityRanges": [],
        "data": {
            "MULTIPLE_HIGHLIGHTS": {}
        }
    }

    draftjs_state = [
        {
            "blocks": [headline_block],
            "entityMap": {}
        }
    ]

    article = {
        config.ID_FIELD: '123',
        config.VERSION: 2,
        'source': 'AAP',
        'headline': headline_text,
        'slugline': 'slugline',
        'abstract': '<p>abstract</p>',
        'type': 'text',
        'anpa_category': [{'qcode': 'a', 'name': 'Australian General News'}],
        'flags': {
            'marked_for_legal': True
        },
        # NOTE(review): the second tag is an opening <p>, not </p> -- looks
        # like a typo in the fixture; confirm before changing it.
        'body_html': '<p>The story<p>',
        "fields_meta": {
            "headline": {
                "draftjsState": draftjs_state
            }
        }
    }

    subscriber = self.app.data.find('subscribers', None, None)[0][0]
    formatted = self._formatter.format(article, subscriber)
    seq, payload = formatted[0]

    # The formatter output is JSON whose 'data' member is itself a JSON string.
    envelope = json.loads(payload)
    self.assertGreater(int(seq), 0)
    test_article = json.loads(envelope.get('data'))
    self.assertEqual(test_article['headline'], headline_text)

0 comments on commit 3c9ef4c

Please sign in to comment.