Use guesslang-js for language auto detection instead of Highlight.js

This should (hopefully) reduce false-positive language detections.
This commit is contained in:
Jonatan Heyman 2023-12-06 02:52:51 +01:00
parent 201ae99370
commit 6f53b61bb0
5 changed files with 99 additions and 1250 deletions

28
public/guesslang.min.js vendored Normal file

File diff suppressed because one or more lines are too long

1202
public/highlight.min.js vendored

File diff suppressed because one or more lines are too long

View File

@ -1,21 +1,22 @@
importScripts("highlight.min.js")
importScripts("guesslang.min.js")
// Language names that were passed to hljs.highlightAuto() as the candidate
// subset while detection was done with Highlight.js.
const HIGHLIGHTJS_LANGUAGES = [
    "json",
    "python",
    "javascript",
    "html",
    "sql",
    "java",
    "plaintext",
    "cpp",
    "php",
    "css",
    "markdown",
    "xml",
    "rust",
]

// GuessLang language ids the worker is allowed to report; a guess whose
// languageId is not in this list is ignored.
// Declared with `const` so the name is not created as an implicit global.
const GUESSLANG_LANGUAGES = [
    "json",
    "py",
    "js",
    "html",
    "sql",
    "java",
    "cpp",
    "php",
    "css",
    "xml",
    "rs",
    "md",
]
const guessLang = new self.GuessLang()
onmessage = (event) => {
//console.log("worker received message:", event.data)
//importScripts("../../lib/highlight.min.js")
@ -34,10 +35,9 @@ onmessage = (event) => {
try {
if (typeof JSON.parse(trimmedContent) === "object") {
postMessage({
highlightjs: {
guesslang: {
language: "json",
relevance: 100,
illegal: false,
confidence: 1.0,
},
content: content,
idx: event.data.idx,
@ -49,14 +49,39 @@ onmessage = (event) => {
}
}
const result = self.hljs.highlightAuto(content, HIGHLIGHTJS_LANGUAGES);
postMessage({
highlightjs: {
language: result.language,
relevance: result.relevance,
illegal: result.illegal,
},
content: content,
idx: event.data.idx,
// Ask GuessLang for its language guesses and report the first acceptable one.
// Acceptance rule:
//   - the first (most likely) guess only needs a confidence above 0.15,
//   - every other guess needs a confidence above 0.5,
//   - in both cases the language id must be listed in GUESSLANG_LANGUAGES.
// If no guess qualifies, nothing is posted back.
guessLang.runModel(content).then((result) => {
    const detected = result.find((lang, rank) => {
        const minConfidence = rank === 0 ? 0.15 : 0.5
        return GUESSLANG_LANGUAGES.includes(lang.languageId) && lang.confidence > minConfidence
    })
    if (detected === undefined) {
        return
    }
    postMessage({
        guesslang: {
            language: detected.languageId,
            confidence: detected.confidence,
        },
        content: content,
        idx: event.data.idx,
    })
}).catch((error) => {
    // Without a rejection handler a failing model run would only show up as an
    // unhandled promise rejection; log it so the failure is visible.
    console.error("Language detection failed:", error)
})
}

View File

@ -7,7 +7,7 @@ import { LANGUAGES } from "../languages";
import { changeLanguageTo } from "../block/commands";
import { LANGUAGE_CHANGE } from "../annotation";
const HIGHLIGHTJS_TO_TOKEN = Object.fromEntries(LANGUAGES.map(l => [l.highlightjs,l.token]))
const GUESSLANG_TO_TOKEN = Object.fromEntries(LANGUAGES.map(l => [l.guesslang,l.token]))
export function languageDetection(getView) {
@ -17,29 +17,27 @@ export function languageDetection(getView) {
const detectionWorker = new Worker('langdetect-worker.js?worker');
// Handles a detection result posted by the language detection worker.
// Reads event.data.guesslang ({language, confidence}) and event.data.content
// (the text the result was computed for) and, when it is safe to do so,
// switches the active block to the detected language.
detectionWorker.onmessage = (event) => {
    const { guesslang, content: analysedContent } = event.data
    if (!guesslang.language) {
        return
    }
    const newLang = GUESSLANG_TO_TOKEN[guesslang.language]
    if (newLang === undefined) {
        // The reported id has no entry in GUESSLANG_TO_TOKEN; without this guard
        // `undefined` would be passed on to changeLanguageTo().
        return
    }

    const view = getView()
    const state = view.state
    const block = getActiveNoteBlock(state)
    // Only blocks in auto mode whose language actually differs are touched.
    if (block.language.auto !== true || block.language.name === newLang) {
        return
    }
    console.log("New auto detected language:", newLang, "Confidence:", guesslang.confidence)

    // Compare the block's current text with the text the worker analysed; the
    // result is only applied if they differ by at most 10% of the current length.
    const currentContent = state.doc.sliceString(block.content.from, block.content.to)
    const threshold = currentContent.length * 0.1
    const isUnchanged = levenshtein_distance(currentContent, analysedContent) <= threshold
    if (!isUnchanged) {
        console.log("Content has changed significantly, not setting new language")
        return
    }

    // The language is only changed when there is no redo history.
    if (redoDepth(state) !== 0) {
        console.log("Not changing language because the user has undo:ed and has redo history")
        return
    }
    console.log("Changing language to", newLang)
    changeLanguageTo(state, view.dispatch, block, newLang, true)
}

View File

@ -14,24 +14,24 @@ import { rustLanguage } from "@codemirror/lang-rust"
/**
 * Plain record describing one entry of the LANGUAGES list.
 */
class Language {
    /**
     * @param {string} token - Identifier of the language (e.g. "javascript").
     * @param {string} name - Human-readable name (e.g. "JavaScript").
     * @param {?object} parser - Parser for the language, or null when there is none
     *     (plain text and math are constructed with null).
     * @param {?string} guesslang - Language id as reported by GuessLang (e.g. "js"),
     *     or null for languages constructed without one.
     * @param {boolean} [supportsFormat=false] - Stored as-is on the instance.
     */
    constructor(token, name, parser, guesslang, supportsFormat=false) {
        this.token = token
        this.name = name
        this.parser = parser
        this.guesslang = guesslang
        this.supportsFormat = supportsFormat
    }
}
export const LANGUAGES = [
new Language("text", "Plain Text", null, "plaintext"),
new Language("text", "Plain Text", null, null),
new Language("math", "Math", null, null),
new Language("javascript", "JavaScript", javascriptLanguage.parser, "javascript", true),
new Language("javascript", "JavaScript", javascriptLanguage.parser, "js", true),
new Language("json", "JSON", jsonLanguage.parser, "json", true),
new Language("python", "Python", pythonLanguage.parser, "python"),
new Language("python", "Python", pythonLanguage.parser, "py"),
new Language("html", "HTML", htmlLanguage.parser, "html", true),
new Language("sql", "SQL", StandardSQL.language.parser, "sql"),
new Language("markdown", "Markdown", markdownLanguage.parser, "markdown", true),
new Language("markdown", "Markdown", markdownLanguage.parser, "md", true),
new Language("java", "Java", javaLanguage.parser, "java"),
//new Language("lezer", "Lezer", lezerLanguage.parser, "lezer"),
new Language("php", "PHP", phpLanguage.parser, "php"),