Improve language auto detection

If content starts and ends with { & } or [ & ], we first try to parse it as JSON. If that succeeds, we'll assume it's JSON.

Increase the highlightjs relevance threshold. 

Hopefully this will result in fewer of the false positives that have been observed (e.g. for CSS and JSON).
This commit is contained in:
Jonatan Heyman 2023-03-07 15:01:41 +01:00
parent ef5955feeb
commit b26381164a
2 changed files with 32 additions and 3 deletions

View File

@ -20,14 +20,43 @@ onmessage = (event) => {
//console.log("worker received message:", event.data)
//importScripts("../../lib/highlight.min.js")
const result = self.hljs.highlightAuto(event.data.content, HIGHLIGHTJS_LANGUAGES);
const content = event.data.content
// we first check some custom heuristic rules to determine if the language is JSON
const trimmedContent = content.trim()
if ((
trimmedContent.startsWith("{") &&
trimmedContent.endsWith("}")
) || (
trimmedContent.startsWith("[") &&
trimmedContent.endsWith("]")
)) {
try {
if (typeof JSON.parse(trimmedContent) === "object") {
postMessage({
highlightjs: {
language: "json",
relevance: 100,
illegal: false,
},
content: content,
idx: event.data.idx,
})
return
}
} catch (e) {
// JSON could not be parsed, do nothing
}
}
const result = self.hljs.highlightAuto(content, HIGHLIGHTJS_LANGUAGES);
postMessage({
highlightjs: {
language: result.language,
relevance: result.relevance,
illegal: result.illegal,
},
content: event.data.content,
content: content,
idx: event.data.idx,
})
}

View File

@ -26,7 +26,7 @@ export function languageDetection(getView) {
const newLang = HIGHLIGHTJS_TO_TOKEN[event.data.highlightjs.language]
if (block.language.auto === true && block.language.name !== newLang) {
console.log("New auto detected language:", newLang, "Relevance:", event.data.highlightjs.relevance)
if (event.data.highlightjs.relevance >= 5) {
if (event.data.highlightjs.relevance >= 7) {
let content = state.doc.sliceString(block.content.from, block.content.to)
const threshold = content.length * 0.1
if (levenshtein_distance(content, event.data.content) <= threshold) {