Refactor RSS widget

This commit is contained in:
Svilen Markov 2025-04-29 10:37:46 +01:00
parent f36527995e
commit 129441713b
2 changed files with 130 additions and 151 deletions

View File

@ -2,7 +2,6 @@ package glance
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"html" "html"
"html/template" "html/template"
@ -13,6 +12,7 @@ import (
"regexp" "regexp"
"sort" "sort"
"strings" "strings"
"sync"
"time" "time"
"github.com/mmcdole/gofeed" "github.com/mmcdole/gofeed"
@ -26,11 +26,7 @@ var (
rssWidgetHorizontalCards2Template = mustParseTemplate("rss-horizontal-cards-2.html", "widget-base.html") rssWidgetHorizontalCards2Template = mustParseTemplate("rss-horizontal-cards-2.html", "widget-base.html")
) )
type cachedFeed struct { var feedParser = gofeed.NewParser()
LastModified time.Time
Etag string
Items rssFeedItemList
}
type rssWidget struct { type rssWidget struct {
widgetBase `yaml:",inline"` widgetBase `yaml:",inline"`
@ -38,17 +34,20 @@ type rssWidget struct {
Style string `yaml:"style"` Style string `yaml:"style"`
ThumbnailHeight float64 `yaml:"thumbnail-height"` ThumbnailHeight float64 `yaml:"thumbnail-height"`
CardHeight float64 `yaml:"card-height"` CardHeight float64 `yaml:"card-height"`
Items rssFeedItemList `yaml:"-"`
Limit int `yaml:"limit"` Limit int `yaml:"limit"`
CollapseAfter int `yaml:"collapse-after"` CollapseAfter int `yaml:"collapse-after"`
SingleLineTitles bool `yaml:"single-line-titles"` SingleLineTitles bool `yaml:"single-line-titles"`
PreserveOrder bool `yaml:"preserve-order"` PreserveOrder bool `yaml:"preserve-order"`
Items rssFeedItemList `yaml:"-"`
NoItemsMessage string `yaml:"-"` NoItemsMessage string `yaml:"-"`
CachedFeeds map[string]cachedFeed `yaml:"-"`
feedCacheMutex sync.Mutex
cachedFeeds map[string]*cachedRSSFeed `yaml:"-"`
} }
func (widget *rssWidget) initialize() error { func (widget *rssWidget) initialize() error {
widget.withTitle("RSS Feed").withCacheDuration(1 * time.Hour) widget.withTitle("RSS Feed").withCacheDuration(2 * time.Hour)
if widget.Limit <= 0 { if widget.Limit <= 0 {
widget.Limit = 25 widget.Limit = 25
@ -73,46 +72,27 @@ func (widget *rssWidget) initialize() error {
} }
widget.NoItemsMessage = "No items were returned from the feeds." widget.NoItemsMessage = "No items were returned from the feeds."
widget.cachedFeeds = make(map[string]*cachedRSSFeed)
return nil return nil
} }
func (widget *rssWidget) update(ctx context.Context) { func (widget *rssWidget) update(ctx context.Context) {
// Populate If-Modified-Since header and Etag items, err := widget.fetchItemsFromFeeds()
for i, req := range widget.FeedRequests {
if cachedFeed, ok := widget.CachedFeeds[req.URL]; ok {
widget.FeedRequests[i].IfModifiedSince = cachedFeed.LastModified
widget.FeedRequests[i].Etag = cachedFeed.Etag
}
}
allItems, feeds, err := fetchItemsFromRSSFeeds(widget.FeedRequests, widget.CachedFeeds)
if !widget.canContinueUpdateAfterHandlingErr(err) { if !widget.canContinueUpdateAfterHandlingErr(err) {
return return
} }
if !widget.PreserveOrder { if !widget.PreserveOrder {
allItems.sortByNewest() items.sortByNewest()
} }
if len(allItems) > widget.Limit { if len(items) > widget.Limit {
allItems = allItems[:widget.Limit] items = items[:widget.Limit]
} }
widget.Items = allItems widget.Items = items
cachedFeeds := make(map[string]cachedFeed)
for _, feed := range feeds {
if !feed.LastModified.IsZero() || feed.Etag != "" {
cachedFeeds[feed.URL] = cachedFeed{
LastModified: feed.LastModified,
Etag: feed.Etag,
Items: feed.Items,
}
}
}
widget.CachedFeeds = cachedFeeds
} }
func (widget *rssWidget) Render() template.HTML { func (widget *rssWidget) Render() template.HTML {
@ -131,6 +111,12 @@ func (widget *rssWidget) Render() template.HTML {
return widget.renderTemplate(widget, rssWidgetTemplate) return widget.renderTemplate(widget, rssWidgetTemplate)
} }
// cachedRSSFeed holds the result of the last successful fetch of a single
// feed together with the HTTP validators the server returned for it. Entries
// are stored in rssWidget.cachedFeeds keyed by the feed request URL and are
// only created when the response carried an ETag or Last-Modified header.
type cachedRSSFeed struct {
// etag is the raw ETag response header value; when non-empty it is sent
// back as If-None-Match on the next request for the same URL.
etag string
// lastModified is the raw Last-Modified response header value, kept as the
// original string; when non-empty it is sent back as If-Modified-Since.
lastModified string
// items are the parsed feed items, returned as-is when the server answers
// 304 Not Modified.
items []rssFeedItem
}
type rssFeedItem struct { type rssFeedItem struct {
ChannelName string ChannelName string
ChannelURL string ChannelURL string
@ -142,35 +128,6 @@ type rssFeedItem struct {
PublishedAt time.Time PublishedAt time.Time
} }
// doesn't cover all cases but works the vast majority of the time
var htmlTagsWithAttributesPattern = regexp.MustCompile(`<\/?[a-zA-Z0-9-]+ *(?:[a-zA-Z-]+=(?:"|').*?(?:"|') ?)* *\/?>`)
func sanitizeFeedDescription(description string) string {
if description == "" {
return ""
}
description = strings.ReplaceAll(description, "\n", " ")
description = htmlTagsWithAttributesPattern.ReplaceAllString(description, "")
description = sequentialWhitespacePattern.ReplaceAllString(description, " ")
description = strings.TrimSpace(description)
description = html.UnescapeString(description)
return description
}
func shortenFeedDescriptionLen(description string, maxLen int) string {
description, _ = limitStringLength(description, 1000)
description = sanitizeFeedDescription(description)
description, limited := limitStringLength(description, maxLen)
if limited {
description += "…"
}
return description
}
type rssFeedRequest struct { type rssFeedRequest struct {
URL string `yaml:"url"` URL string `yaml:"url"`
Title string `yaml:"title"` Title string `yaml:"title"`
@ -180,19 +137,10 @@ type rssFeedRequest struct {
ItemLinkPrefix string `yaml:"item-link-prefix"` ItemLinkPrefix string `yaml:"item-link-prefix"`
Headers map[string]string `yaml:"headers"` Headers map[string]string `yaml:"headers"`
IsDetailed bool `yaml:"-"` IsDetailed bool `yaml:"-"`
IfModifiedSince time.Time `yaml:"-"`
Etag string `yaml:"-"`
} }
type rssFeedItemList []rssFeedItem type rssFeedItemList []rssFeedItem
type rssFeedResponse struct {
URL string
Items rssFeedItemList
LastModified time.Time
Etag string
}
func (f rssFeedItemList) sortByNewest() rssFeedItemList { func (f rssFeedItemList) sortByNewest() rssFeedItemList {
sort.Slice(f, func(i, j int) bool { sort.Slice(f, func(i, j int) bool {
return f[i].PublishedAt.After(f[j].PublishedAt) return f[i].PublishedAt.After(f[j].PublishedAt)
@ -201,69 +149,99 @@ func (f rssFeedItemList) sortByNewest() rssFeedItemList {
return f return f
} }
var feedParser = gofeed.NewParser() func (widget *rssWidget) fetchItemsFromFeeds() (rssFeedItemList, error) {
requests := widget.FeedRequests
func fetchItemsFromRSSFeedTask(request rssFeedRequest) (rssFeedResponse, error) { job := newJob(widget.fetchItemsFromFeedTask, requests).withWorkers(30)
feedResponse := rssFeedResponse{URL: request.URL} feeds, errs, err := workerPoolDo(job)
if err != nil {
return nil, fmt.Errorf("%w: %v", errNoContent, err)
}
failed := 0
entries := make(rssFeedItemList, 0, len(feeds)*10)
seen := make(map[string]struct{})
for i := range feeds {
if errs[i] != nil {
failed++
slog.Error("Failed to get RSS feed", "url", requests[i].URL, "error", errs[i])
continue
}
for _, item := range feeds[i] {
if _, exists := seen[item.Link]; exists {
continue
}
entries = append(entries, item)
seen[item.Link] = struct{}{}
}
}
if failed == len(requests) {
return nil, errNoContent
}
if failed > 0 {
return entries, fmt.Errorf("%w: missing %d RSS feeds", errPartialContent, failed)
}
return entries, nil
}
func (widget *rssWidget) fetchItemsFromFeedTask(request rssFeedRequest) ([]rssFeedItem, error) {
req, err := http.NewRequest("GET", request.URL, nil) req, err := http.NewRequest("GET", request.URL, nil)
if err != nil { if err != nil {
return feedResponse, err return nil, err
} }
req.Header.Add("User-Agent", fmt.Sprintf("Glance v%s", buildVersion)) req.Header.Add("User-Agent", glanceUserAgentString)
widget.feedCacheMutex.Lock()
cache, isCached := widget.cachedFeeds[request.URL]
if isCached {
if cache.etag != "" {
req.Header.Add("If-None-Match", cache.etag)
}
if cache.lastModified != "" {
req.Header.Add("If-Modified-Since", cache.lastModified)
}
}
widget.feedCacheMutex.Unlock()
for key, value := range request.Headers { for key, value := range request.Headers {
req.Header.Add(key, value) req.Header.Set(key, value)
}
if !request.IfModifiedSince.IsZero() {
req.Header.Add("If-Modified-Since", request.IfModifiedSince.Format(http.TimeFormat))
}
if request.Etag != "" {
req.Header.Add("If-None-Match", request.Etag)
} }
resp, err := defaultHTTPClient.Do(req) resp, err := defaultHTTPClient.Do(req)
if err != nil { if err != nil {
return feedResponse, err return nil, err
} }
defer resp.Body.Close() defer resp.Body.Close()
if resp.StatusCode == http.StatusNotModified { if resp.StatusCode == http.StatusNotModified && isCached {
return feedResponse, errNotModified return cache.items, nil
} }
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
return feedResponse, fmt.Errorf("unexpected status code %d from %s", resp.StatusCode, request.URL) return nil, fmt.Errorf("unexpected status code %d from %s", resp.StatusCode, request.URL)
} }
body, err := io.ReadAll(resp.Body) body, err := io.ReadAll(resp.Body)
if err != nil { if err != nil {
return feedResponse, err return nil, err
} }
feed, err := feedParser.ParseString(string(body)) feed, err := feedParser.ParseString(string(body))
if err != nil { if err != nil {
return feedResponse, err return nil, err
} }
if request.Limit > 0 && len(feed.Items) > request.Limit { if request.Limit > 0 && len(feed.Items) > request.Limit {
feed.Items = feed.Items[:request.Limit] feed.Items = feed.Items[:request.Limit]
} }
items := make([]rssFeedItem, 0, len(feed.Items)) items := make(rssFeedItemList, 0, len(feed.Items))
if lastModified := resp.Header.Get("Last-Modified"); lastModified != "" {
if t, err := time.Parse(http.TimeFormat, lastModified); err == nil {
feedResponse.LastModified = t
}
}
if etag := resp.Header.Get("Etag"); etag != "" {
feedResponse.Etag = etag
}
for i := range feed.Items { for i := range feed.Items {
item := feed.Items[i] item := feed.Items[i]
@ -352,8 +330,27 @@ func fetchItemsFromRSSFeedTask(request rssFeedRequest) (rssFeedResponse, error)
items = append(items, rssItem) items = append(items, rssItem)
} }
feedResponse.Items = items if resp.Header.Get("ETag") != "" || resp.Header.Get("Last-Modified") != "" {
return feedResponse, nil widget.feedCacheMutex.Lock()
widget.cachedFeeds[request.URL] = &cachedRSSFeed{
etag: resp.Header.Get("ETag"),
lastModified: resp.Header.Get("Last-Modified"),
items: items,
}
widget.feedCacheMutex.Unlock()
}
return items, nil
}
func findThumbnailInItemExtensions(item *gofeed.Item) string {
media, ok := item.Extensions["media"]
if !ok {
return ""
}
return recursiveFindThumbnailInExtensions(media)
} }
func recursiveFindThumbnailInExtensions(extensions map[string][]gofeedext.Extension) string { func recursiveFindThumbnailInExtensions(extensions map[string][]gofeedext.Extension) string {
@ -376,48 +373,30 @@ func recursiveFindThumbnailInExtensions(extensions map[string][]gofeedext.Extens
return "" return ""
} }
func findThumbnailInItemExtensions(item *gofeed.Item) string { var htmlTagsWithAttributesPattern = regexp.MustCompile(`<\/?[a-zA-Z0-9-]+ *(?:[a-zA-Z-]+=(?:"|').*?(?:"|') ?)* *\/?>`)
media, ok := item.Extensions["media"]
if !ok { func sanitizeFeedDescription(description string) string {
if description == "" {
return "" return ""
} }
return recursiveFindThumbnailInExtensions(media) description = strings.ReplaceAll(description, "\n", " ")
description = htmlTagsWithAttributesPattern.ReplaceAllString(description, "")
description = sequentialWhitespacePattern.ReplaceAllString(description, " ")
description = strings.TrimSpace(description)
description = html.UnescapeString(description)
return description
} }
func fetchItemsFromRSSFeeds(requests []rssFeedRequest, cachedFeeds map[string]cachedFeed) (rssFeedItemList, []rssFeedResponse, error) { func shortenFeedDescriptionLen(description string, maxLen int) string {
job := newJob(fetchItemsFromRSSFeedTask, requests).withWorkers(30) description, _ = limitStringLength(description, 1000)
feeds, errs, err := workerPoolDo(job) description = sanitizeFeedDescription(description)
if err != nil { description, limited := limitStringLength(description, maxLen)
return nil, nil, fmt.Errorf("%w: %v", errNoContent, err)
if limited {
description += "…"
} }
failed := 0 return description
notModified := 0
entries := make(rssFeedItemList, 0, len(feeds)*10)
for i := range feeds {
if errs[i] == nil {
entries = append(entries, feeds[i].Items...)
} else if errors.Is(errs[i], errNotModified) {
notModified++
entries = append(entries, cachedFeeds[feeds[i].URL].Items...)
slog.Debug("Feed not modified", "url", requests[i].URL, "debug", errs[i])
} else {
failed++
slog.Error("Failed to get RSS feed", "url", requests[i].URL, "error", errs[i])
}
}
if failed == len(requests) {
return nil, nil, errNoContent
}
if failed > 0 {
return entries, feeds, fmt.Errorf("%w: missing %d RSS feeds", errPartialContent, failed)
}
return entries, feeds, nil
} }

View File

@ -19,7 +19,6 @@ import (
var ( var (
errNoContent = errors.New("failed to retrieve any content") errNoContent = errors.New("failed to retrieve any content")
errPartialContent = errors.New("failed to retrieve some of the content") errPartialContent = errors.New("failed to retrieve some of the content")
errNotModified = errors.New("content not modified")
) )
const defaultClientTimeout = 5 * time.Second const defaultClientTimeout = 5 * time.Second
@ -39,6 +38,7 @@ type requestDoer interface {
Do(*http.Request) (*http.Response, error) Do(*http.Request) (*http.Response, error)
} }
var glanceUserAgentString = "Glance/" + buildVersion + " +https://github.com/glanceapp/glance"
var userAgentPersistentVersion atomic.Int32 var userAgentPersistentVersion atomic.Int32
func setBrowserUserAgentHeader(request *http.Request) { func setBrowserUserAgentHeader(request *http.Request) {