glance/internal/feed/rss.go
2024-06-29 17:17:27 +01:00

231 lines
5.0 KiB
Go

package feed
import (
"context"
"fmt"
"html"
"log/slog"
"net/url"
"regexp"
"sort"
"strings"
"time"
"github.com/mmcdole/gofeed"
gofeedext "github.com/mmcdole/gofeed/extensions"
)
type RSSFeedItem struct {
ChannelName string
ChannelURL string
Title string
Link string
ImageURL string
Categories []string
Description string
PublishedAt time.Time
}
// doesn't cover all cases but works the vast majority of the time
var htmlTagsWithAttributesPattern = regexp.MustCompile(`<\/?[a-zA-Z0-9-]+ *(?:[a-zA-Z-]+=(?:"|').*?(?:"|') ?)* *\/?>`)
var sequentialWhitespacePattern = regexp.MustCompile(`\s+`)
func sanitizeFeedDescription(description string) string {
if description == "" {
return ""
}
description = strings.ReplaceAll(description, "\n", " ")
description = htmlTagsWithAttributesPattern.ReplaceAllString(description, "")
description = sequentialWhitespacePattern.ReplaceAllString(description, " ")
description = strings.TrimSpace(description)
description = html.UnescapeString(description)
return description
}
type RSSFeedRequest struct {
Url string `yaml:"url"`
Title string `yaml:"title"`
HideCategories bool `yaml:"hide-categories"`
HideDescription bool `yaml:"hide-description"`
ItemLinkPrefix string `yaml:"item-link-prefix"`
}
type RSSFeedItems []RSSFeedItem
func (f RSSFeedItems) SortByNewest() RSSFeedItems {
sort.Slice(f, func(i, j int) bool {
return f[i].PublishedAt.After(f[j].PublishedAt)
})
return f
}
var feedParser = gofeed.NewParser()
func getItemsFromRSSFeedTask(request RSSFeedRequest) ([]RSSFeedItem, error) {
ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
defer cancel()
feed, err := feedParser.ParseURLWithContext(request.Url, ctx)
if err != nil {
return nil, err
}
items := make(RSSFeedItems, 0, len(feed.Items))
for i := range feed.Items {
item := feed.Items[i]
rssItem := RSSFeedItem{
ChannelURL: feed.Link,
Title: item.Title,
}
if request.ItemLinkPrefix != "" {
rssItem.Link = request.ItemLinkPrefix + item.Link
} else if strings.HasPrefix(item.Link, "http://") || strings.HasPrefix(item.Link, "https://") {
rssItem.Link = item.Link
} else {
parsedUrl, err := url.Parse(feed.Link)
if err != nil {
parsedUrl, err = url.Parse(request.Url)
}
if err == nil {
var link string
if item.Link[0] == '/' {
link = item.Link
} else {
link = "/" + item.Link
}
rssItem.Link = parsedUrl.Scheme + "://" + parsedUrl.Host + link
}
}
if !request.HideDescription && item.Description != "" {
description, _ := limitStringLength(item.Description, 1000)
description = sanitizeFeedDescription(description)
description, limited := limitStringLength(description, 200)
if limited {
description += "…"
}
rssItem.Description = description
}
if !request.HideCategories {
var categories = make([]string, 0, 6)
for _, category := range item.Categories {
if len(categories) == 6 {
break
}
if len(category) == 0 || len(category) > 30 {
continue
}
categories = append(categories, category)
}
rssItem.Categories = categories
}
if request.Title != "" {
rssItem.ChannelName = request.Title
} else {
rssItem.ChannelName = feed.Title
}
if item.Image != nil {
rssItem.ImageURL = item.Image.URL
} else if url := findThumbnailInItemExtensions(item); url != "" {
rssItem.ImageURL = url
} else if feed.Image != nil {
rssItem.ImageURL = feed.Image.URL
}
if item.PublishedParsed != nil {
rssItem.PublishedAt = *item.PublishedParsed
} else {
rssItem.PublishedAt = time.Now()
}
items = append(items, rssItem)
}
return items, nil
}
func recursiveFindThumbnailInExtensions(extensions map[string][]gofeedext.Extension) string {
for _, exts := range extensions {
for _, ext := range exts {
if ext.Name == "thumbnail" || ext.Name == "image" {
if url, ok := ext.Attrs["url"]; ok {
return url
}
}
if ext.Children != nil {
if url := recursiveFindThumbnailInExtensions(ext.Children); url != "" {
return url
}
}
}
}
return ""
}
func findThumbnailInItemExtensions(item *gofeed.Item) string {
media, ok := item.Extensions["media"]
if !ok {
return ""
}
return recursiveFindThumbnailInExtensions(media)
}
func GetItemsFromRSSFeeds(requests []RSSFeedRequest) (RSSFeedItems, error) {
job := newJob(getItemsFromRSSFeedTask, requests).withWorkers(10)
feeds, errs, err := workerPoolDo(job)
if err != nil {
return nil, fmt.Errorf("%w: %v", ErrNoContent, err)
}
failed := 0
entries := make(RSSFeedItems, 0, len(feeds)*10)
for i := range feeds {
if errs[i] != nil {
failed++
slog.Error("failed to get rss feed", "error", errs[i], "url", requests[i].Url)
continue
}
entries = append(entries, feeds[i]...)
}
if len(entries) == 0 {
return nil, ErrNoContent
}
entries.SortByNewest()
if failed > 0 {
return entries, fmt.Errorf("%w: missing %d RSS feeds", ErrPartialContent, failed)
}
return entries, nil
}