Use fuse.js for podcast episode search

Replace Levenshtein-distance matching with the fuse.js fuzzy-search library. Search in each episode's title and subtitle.
This commit is contained in:
Jan Kubovy 2025-06-06 10:43:52 +00:00
parent e669a8d378
commit eda7036f70
3 changed files with 28 additions and 21 deletions

9
package-lock.json generated
View File

@ -13,6 +13,7 @@
"cookie-parser": "^1.4.6", "cookie-parser": "^1.4.6",
"express": "^4.17.1", "express": "^4.17.1",
"express-session": "^1.17.3", "express-session": "^1.17.3",
"fuse.js": "^7.1.0",
"graceful-fs": "^4.2.10", "graceful-fs": "^4.2.10",
"htmlparser2": "^8.0.1", "htmlparser2": "^8.0.1",
"lru-cache": "^10.0.3", "lru-cache": "^10.0.3",
@ -2105,6 +2106,14 @@
"url": "https://github.com/sponsors/ljharb" "url": "https://github.com/sponsors/ljharb"
} }
}, },
"node_modules/fuse.js": {
"version": "7.1.0",
"resolved": "https://registry.npmjs.org/fuse.js/-/fuse.js-7.1.0.tgz",
"integrity": "sha512-trLf4SzuuUxfusZADLINj+dE8clK1frKdmqiJNb1Es75fmI5oY6X2mxLVUciLLjxqw/xr72Dhy+lER6dGd02FQ==",
"engines": {
"node": ">=10"
}
},
"node_modules/gensync": { "node_modules/gensync": {
"version": "1.0.0-beta.2", "version": "1.0.0-beta.2",
"resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz", "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz",

View File

@ -40,6 +40,7 @@
"cookie-parser": "^1.4.6", "cookie-parser": "^1.4.6",
"express": "^4.17.1", "express": "^4.17.1",
"express-session": "^1.17.3", "express-session": "^1.17.3",
"fuse.js": "^7.1.0",
"graceful-fs": "^4.2.10", "graceful-fs": "^4.2.10",
"htmlparser2": "^8.0.1", "htmlparser2": "^8.0.1",
"lru-cache": "^10.0.3", "lru-cache": "^10.0.3",

View File

@ -3,6 +3,7 @@ const ssrfFilter = require('ssrf-req-filter')
const Logger = require('../Logger') const Logger = require('../Logger')
const { xmlToJSON, levenshteinDistance, timestampToSeconds } = require('./index') const { xmlToJSON, levenshteinDistance, timestampToSeconds } = require('./index')
const htmlSanitizer = require('../utils/htmlSanitizer') const htmlSanitizer = require('../utils/htmlSanitizer')
const Fuse = require('fuse.js')
/** /**
* @typedef RssPodcastChapter * @typedef RssPodcastChapter
@ -407,7 +408,7 @@ module.exports.getPodcastFeed = (feedUrl, excludeEpisodeMetadata = false) => {
}) })
} }
// Return array of episodes ordered by closest match (Levenshtein distance of 6 or less) // Return array of episodes ordered by closest match using fuse.js
module.exports.findMatchingEpisodes = async (feedUrl, searchTitle) => { module.exports.findMatchingEpisodes = async (feedUrl, searchTitle) => {
const feed = await this.getPodcastFeed(feedUrl).catch(() => { const feed = await this.getPodcastFeed(feedUrl).catch(() => {
return null return null
@ -420,32 +421,28 @@ module.exports.findMatchingEpisodes = async (feedUrl, searchTitle) => {
* *
* @param {RssPodcast} feed * @param {RssPodcast} feed
* @param {string} searchTitle * @param {string} searchTitle
* @returns {Array<{ episode: RssPodcastEpisode, levenshtein: number }>} * @returns {Array<{ episode: RssPodcastEpisode }>}
*/ */
module.exports.findMatchingEpisodesInFeed = (feed, searchTitle) => { module.exports.findMatchingEpisodesInFeed = (feed, searchTitle) => {
searchTitle = searchTitle.toLowerCase().trim()
if (!feed?.episodes) { if (!feed?.episodes) {
return null return null
} }
const fuseOptions = {
ignoreDiacritics: true,
threshold: 0.4, // default 0.6 return too many matches
keys: [
{name: 'title', weight: 0.7}, // prefer match in title
{name: 'subtitle', weight: 0.3}
]
}
const fuse = new Fuse(feed.episodes, fuseOptions)
const matches = [] const matches = []
feed.episodes.forEach((ep) => { fuse.search(searchTitle).forEach((match) => {
if (!ep.title) return matches.push({
const epTitle = ep.title.toLowerCase().trim() episode: match.item
if (epTitle === searchTitle) { })
matches.push({
episode: ep,
levenshtein: 0
})
} else {
const levenshtein = levenshteinDistance(searchTitle, epTitle, true)
if (levenshtein <= 6 && epTitle.length > levenshtein) {
matches.push({
episode: ep,
levenshtein
})
}
}
}) })
return matches.sort((a, b) => a.levenshtein - b.levenshtein) return matches
} }