Skip to content

Commit

Permalink
Extract feed and item images from more places (#220)
Browse files Browse the repository at this point in the history
  • Loading branch information
infogulch authored Feb 23, 2024
1 parent b187685 commit 454d6a3
Show file tree
Hide file tree
Showing 13 changed files with 240 additions and 12 deletions.
33 changes: 33 additions & 0 deletions extensions/itunes_test.go → extensions/extensions_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,36 @@ func TestITunes_Extensions(t *testing.T) {
}
}
}


// TestMedia_Extensions parses every XML fixture found under
// ../testdata/extensions/media and compares the parsed feed with the JSON
// fixture that shares the same base name.
//
// Fixture problems (unreadable file, unparsable feed, malformed expected JSON)
// are reported as explicit test errors instead of being silently ignored, so
// a broken fixture is distinguishable from a genuine output mismatch.
func TestMedia_Extensions(t *testing.T) {
	files, err := filepath.Glob("../testdata/extensions/media/*.xml")
	if err != nil {
		t.Fatalf("Failed to list media extension fixtures: %v", err)
	}
	for _, f := range files {
		base := filepath.Base(f)
		name := strings.TrimSuffix(base, filepath.Ext(base))

		fmt.Printf("Testing %s... ", name)

		// Get actual source feed
		ff := fmt.Sprintf("../testdata/extensions/media/%s.xml", name)
		source, err := os.ReadFile(ff)
		if err != nil {
			fmt.Printf("Failed\n")
			t.Errorf("Unable to read feed file %s.xml: %v", name, err)
			continue
		}

		// Parse actual feed
		fp := gofeed.NewParser()
		actual, err := fp.Parse(bytes.NewReader(source))
		if err != nil {
			fmt.Printf("Failed\n")
			t.Errorf("Unable to parse feed file %s.xml: %v", name, err)
			continue
		}

		// Get json encoded expected feed result
		ef := fmt.Sprintf("../testdata/extensions/media/%s.json", name)
		e, err := os.ReadFile(ef)
		if err != nil {
			fmt.Printf("Failed\n")
			t.Errorf("Unable to read expected output %s.json: %v", name, err)
			continue
		}

		// Unmarshal expected feed; expected is already a pointer, so it is
		// passed directly rather than as a pointer to a pointer.
		expected := &gofeed.Feed{}
		if err := json.Unmarshal(e, expected); err != nil {
			fmt.Printf("Failed\n")
			t.Errorf("Unable to decode expected output %s.json: %v", name, err)
			continue
		}

		if assert.Equal(t, expected, actual, "Feed file %s.xml did not match expected output %s.json", name, name) {
			fmt.Printf("OK\n")
		} else {
			fmt.Printf("Failed\n")
		}
	}
}
22 changes: 22 additions & 0 deletions testdata/extensions/media/feed_image_-_rss_channel_media.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"feedType": "rss",
"feedVersion": "0.91",
"image": {
"url": "http://example.com/channel.png"
},
"extensions": {
"media": {
"content": [
{
"name": "content",
"attrs": {
"url": "http://example.com/channel.png",
"medium": "image"
},
"children": {}
}
]
}
},
"items": []
}
8 changes: 8 additions & 0 deletions testdata/extensions/media/feed_image_-_rss_channel_media.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<!--
Description: channel image from media content
-->
<rss version="0.91">
<channel>
<media:content url="http://example.com/channel.png" medium="image"></media:content>
</channel>
</rss>
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{
"items": [
{
"title": "rss item image from media content",
"image": {
"url": "https://example.com/blog-open.png",
"title": ""
},
"extensions": {
"media": {
"content": [
{
"name": "content",
"value": "",
"attrs": {
"medium": "image",
"url": "https://example.com/blog-open.png"
},
"children": {
"title": [
{
"name": "title",
"value": "blog-open",
"attrs": {
"type": "html"
},
"children": {}
}
]
}
}
]
}
}
}
],
"feedType": "rss",
"feedVersion": "2.0"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!--
Description: rss item image from media content
-->
<rss version="2.0">
<channel>
<item>
<title>rss item image from media content</title>
<media:content url="https://example.com/blog-open.png" medium="image">
<media:title type="html">blog-open</media:title>
</media:content>
</item>
</channel>
</rss>
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"feedType": "rss",
"feedVersion": "0.91",
"description": "<img src=\"http://example.com/description.png\">",
"image": {
"url": "http://example.com/description.png"
},
"items": []
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<!--
Description: channel image from description
-->
<rss version="0.91">
<channel>
<description><![CDATA[<img src="http://example.com/description.png">]]></description>
</channel>
</rss>
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
"type": "audio/jpeg",
"url": "http://example.org/podcast.jpg"
},
"image": {
"url": "http://example.org/podcast.jpg",
"title": ""
},
"enclosures": [
{
"length": "123456",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"feedType": "rss",
"feedVersion": "0.91",
"items": [
{
"image": {
"url": "http://example.com/content.png"
},
"content": "<img src=\"http://example.com/content.png\">"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<!--
Description: item image from content
-->
<rss version="0.91">
<channel>
<item>
<content><![CDATA[<img src="http://example.com/content.png">]]></content>
</item>
</channel>
</rss>
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"feedType": "rss",
"feedVersion": "0.91",
"items": [
{
"image": {
"url": "http://example.com/description.png"
},
"description": "<img src=\"http://example.com/description.png\">"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<!--
Description: item image from description
-->
<rss version="0.91">
<channel>
<item>
<description><![CDATA[<img src="http://example.com/description.png">]]></description>
</item>
</channel>
</rss>
72 changes: 60 additions & 12 deletions translator.go
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
package gofeed

import (
"bytes"
"fmt"
"strings"
"time"

"github.com/PuerkitoBio/goquery"
"github.com/mmcdole/gofeed/atom"
ext "github.com/mmcdole/gofeed/extensions"
"github.com/mmcdole/gofeed/internal/shared"
"github.com/mmcdole/gofeed/json"
"github.com/mmcdole/gofeed/rss"
"golang.org/x/net/html"
)

// Translator converts a particular feed (atom.Feed or rss.Feed or json.Feed)
Expand Down Expand Up @@ -218,16 +221,26 @@ func (t *DefaultRSSTranslator) translateFeedLanguage(rss *rss.Feed) (language st
return
}

func (t *DefaultRSSTranslator) translateFeedImage(rss *rss.Feed) (image *Image) {
func (t *DefaultRSSTranslator) translateFeedImage(rss *rss.Feed) *Image {
if rss.Image != nil {
image = &Image{}
image.Title = rss.Image.Title
image.URL = rss.Image.URL
} else if rss.ITunesExt != nil && rss.ITunesExt.Image != "" {
image = &Image{}
image.URL = rss.ITunesExt.Image
return &Image{
Title: rss.Image.Title,
URL: rss.Image.URL,
}
}
return
if rss.ITunesExt != nil && rss.ITunesExt.Image != "" {
return &Image{URL: rss.ITunesExt.Image}
}
if media, ok := rss.Extensions["media"]; ok {
if content, ok := media["content"]; ok {
for _, c := range content {
if strings.HasPrefix(c.Attrs["type"], "image/") || c.Attrs["medium"] == "image" {
return &Image{URL: c.Attrs["url"]}
}
}
}
}
return firstImageFromHtmlDocument(rss.Description)
}

func (t *DefaultRSSTranslator) translateFeedCopyright(rss *rss.Feed) (rights string) {
Expand Down Expand Up @@ -400,12 +413,47 @@ func (t *DefaultRSSTranslator) translateItemGUID(rssItem *rss.Item) (guid string
return
}

func (t *DefaultRSSTranslator) translateItemImage(rssItem *rss.Item) (image *Image) {
func (t *DefaultRSSTranslator) translateItemImage(rssItem *rss.Item) *Image {
if rssItem.ITunesExt != nil && rssItem.ITunesExt.Image != "" {
image = &Image{}
image.URL = rssItem.ITunesExt.Image
return &Image{URL: rssItem.ITunesExt.Image}
}
return
if media, ok := rssItem.Extensions["media"]; ok {
if content, ok := media["content"]; ok {
for _, c := range content {
if strings.Contains(c.Attrs["type"], "image") || strings.Contains(c.Attrs["medium"], "image") {
return &Image{URL: c.Attrs["url"]}
}
}
}
}
for _, enc := range rssItem.Enclosures {
if strings.HasPrefix(enc.Type, "image/") {
return &Image{URL: enc.URL}
}
}
if img := firstImageFromHtmlDocument(rssItem.Content); img != nil {
return img
}
if img := firstImageFromHtmlDocument(rssItem.Description); img != nil {
return img
}
return nil
}

// firstImageFromHtmlDocument parses document as HTML and returns an Image
// whose URL is the src attribute of the first <img> element that has a
// non-blank src. It returns nil when document is blank, cannot be parsed, or
// contains no such element.
func firstImageFromHtmlDocument(document string) *Image {
	// Nothing to find in blank input; skip building a DOM for it.
	if strings.TrimSpace(document) == "" {
		return nil
	}
	root, err := html.Parse(bytes.NewBufferString(document))
	if err != nil {
		return nil
	}
	// All matches are walked (rather than only the first) so that an
	// <img src=""> placeholder does not hide a later usable image.
	for _, node := range goquery.NewDocumentFromNode(root).Find("img[src]").Nodes {
		for _, attr := range node.Attr {
			if attr.Key == "src" && strings.TrimSpace(attr.Val) != "" {
				return &Image{URL: attr.Val}
			}
		}
	}
	return nil
}

func (t *DefaultRSSTranslator) translateItemCategories(rssItem *rss.Item) (categories []string) {
Expand Down

0 comments on commit 454d6a3

Please sign in to comment.