Merge pull request #4383 from JKubovy/improve-podcast-episode-search

Use fuse.js for podcast episode search
This commit is contained in:
advplyr 2025-06-13 17:29:13 -05:00 committed by GitHub
commit 5025c6a3ea
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 35 additions and 24 deletions

File diff suppressed because one or more lines are too long

View File

@ -370,7 +370,7 @@ class Scanner {
let numEpisodesUpdated = 0 let numEpisodesUpdated = 0
for (const episode of episodesToQuickMatch) { for (const episode of episodesToQuickMatch) {
const episodeMatches = findMatchingEpisodesInFeed(feed, episode.title) const episodeMatches = findMatchingEpisodesInFeed(feed, episode.title, 0.1)
if (episodeMatches?.length) { if (episodeMatches?.length) {
const wasUpdated = await this.updateEpisodeWithMatch(episode, episodeMatches[0].episode, options) const wasUpdated = await this.updateEpisodeWithMatch(episode, episodeMatches[0].episode, options)
if (wasUpdated) numEpisodesUpdated++ if (wasUpdated) numEpisodesUpdated++

View File

@ -1,8 +1,9 @@
const axios = require('axios') const axios = require('axios')
const ssrfFilter = require('ssrf-req-filter') const ssrfFilter = require('ssrf-req-filter')
const Logger = require('../Logger') const Logger = require('../Logger')
const { xmlToJSON, levenshteinDistance, timestampToSeconds } = require('./index') const { xmlToJSON, timestampToSeconds } = require('./index')
const htmlSanitizer = require('../utils/htmlSanitizer') const htmlSanitizer = require('../utils/htmlSanitizer')
const Fuse = require('../libs/fusejs')
/** /**
* @typedef RssPodcastChapter * @typedef RssPodcastChapter
@ -407,7 +408,7 @@ module.exports.getPodcastFeed = (feedUrl, excludeEpisodeMetadata = false) => {
}) })
} }
// Return array of episodes ordered by closest match (Levenshtein distance of 6 or less) // Return array of episodes ordered by closest match using fuse.js
module.exports.findMatchingEpisodes = async (feedUrl, searchTitle) => { module.exports.findMatchingEpisodes = async (feedUrl, searchTitle) => {
const feed = await this.getPodcastFeed(feedUrl).catch(() => { const feed = await this.getPodcastFeed(feedUrl).catch(() => {
return null return null
@ -420,32 +421,29 @@ module.exports.findMatchingEpisodes = async (feedUrl, searchTitle) => {
* *
* @param {RssPodcast} feed * @param {RssPodcast} feed
* @param {string} searchTitle * @param {string} searchTitle
* @returns {Array<{ episode: RssPodcastEpisode, levenshtein: number }>} * @param {number} [threshold=0.4] - 0.0 for perfect match, 1.0 for match anything
* @returns {Array<{ episode: RssPodcastEpisode }>}
*/ */
module.exports.findMatchingEpisodesInFeed = (feed, searchTitle) => { module.exports.findMatchingEpisodesInFeed = (feed, searchTitle, threshold = 0.4) => {
searchTitle = searchTitle.toLowerCase().trim()
if (!feed?.episodes) { if (!feed?.episodes) {
return null return null
} }
const fuseOptions = {
ignoreDiacritics: true,
threshold,
keys: [
{ name: 'title', weight: 0.7 }, // prefer match in title
{ name: 'subtitle', weight: 0.3 }
]
}
const fuse = new Fuse(feed.episodes, fuseOptions)
const matches = [] const matches = []
feed.episodes.forEach((ep) => { fuse.search(searchTitle).forEach((match) => {
if (!ep.title) return matches.push({
const epTitle = ep.title.toLowerCase().trim() episode: match.item
if (epTitle === searchTitle) { })
matches.push({
episode: ep,
levenshtein: 0
})
} else {
const levenshtein = levenshteinDistance(searchTitle, epTitle, true)
if (levenshtein <= 6 && epTitle.length > levenshtein) {
matches.push({
episode: ep,
levenshtein
})
}
}
}) })
return matches.sort((a, b) => a.levenshtein - b.levenshtein) return matches
} }