Use guesslang-js for language auto-detection instead of Highlight.js

This should (hopefully) reduce false positives in language detection.
This commit is contained in:
Jonatan Heyman 2023-12-06 02:52:51 +01:00
parent 201ae99370
commit 6f53b61bb0
5 changed files with 99 additions and 1250 deletions

28
public/guesslang.min.js vendored Normal file

File diff suppressed because one or more lines are too long

1202
public/highlight.min.js vendored

File diff suppressed because one or more lines are too long

View File

@ -1,21 +1,22 @@
importScripts("highlight.min.js") importScripts("guesslang.min.js")
const HIGHLIGHTJS_LANGUAGES = [ GUESSLANG_LANGUAGES = [
"json", "json",
"python", "py",
"javascript", "js",
"html", "html",
"sql", "sql",
"java", "java",
"plaintext",
"cpp", "cpp",
"php", "php",
"css", "css",
"markdown",
"xml", "xml",
"rust", "rs",
"md",
] ]
const guessLang = new self.GuessLang()
onmessage = (event) => { onmessage = (event) => {
//console.log("worker received message:", event.data) //console.log("worker received message:", event.data)
//importScripts("../../lib/highlight.min.js") //importScripts("../../lib/highlight.min.js")
@ -34,10 +35,9 @@ onmessage = (event) => {
try { try {
if (typeof JSON.parse(trimmedContent) === "object") { if (typeof JSON.parse(trimmedContent) === "object") {
postMessage({ postMessage({
highlightjs: { guesslang: {
language: "json", language: "json",
relevance: 100, confidence: 1.0,
illegal: false,
}, },
content: content, content: content,
idx: event.data.idx, idx: event.data.idx,
@ -49,14 +49,39 @@ onmessage = (event) => {
} }
} }
const result = self.hljs.highlightAuto(content, HIGHLIGHTJS_LANGUAGES); //let startTime = performance.now()
guessLang.runModel(content).then((result) => {
//const duration = performance.now() - startTime
//console.log("Guessing language done:", result, result[0]?.languageId, result[0]?.confidence)
//console.log("Guessing language took", duration, "ms")
if (result.length > 0) {
// for the language that is most likely according to GuessLang we have a lower threshold (0.15)
const lang = result[0]
if (GUESSLANG_LANGUAGES.includes(lang.languageId) && lang.confidence > 0.15) {
postMessage({ postMessage({
highlightjs: { guesslang: {
language: result.language, language: lang.languageId,
relevance: result.relevance, confidence: lang.confidence,
illegal: result.illegal,
}, },
content: content, content: content,
idx: event.data.idx, idx: event.data.idx,
}) })
return
}
}
for (let lang of result) {
if (GUESSLANG_LANGUAGES.includes(lang.languageId) && lang.confidence > 0.5) {
postMessage({
guesslang: {
language: lang.languageId,
confidence: lang.confidence,
},
content: content,
idx: event.data.idx,
})
return
}
}
})
} }

View File

@ -7,7 +7,7 @@ import { LANGUAGES } from "../languages";
import { changeLanguageTo } from "../block/commands"; import { changeLanguageTo } from "../block/commands";
import { LANGUAGE_CHANGE } from "../annotation"; import { LANGUAGE_CHANGE } from "../annotation";
const HIGHLIGHTJS_TO_TOKEN = Object.fromEntries(LANGUAGES.map(l => [l.highlightjs,l.token])) const GUESSLANG_TO_TOKEN = Object.fromEntries(LANGUAGES.map(l => [l.guesslang,l.token]))
export function languageDetection(getView) { export function languageDetection(getView) {
@ -17,16 +17,15 @@ export function languageDetection(getView) {
const detectionWorker = new Worker('langdetect-worker.js?worker'); const detectionWorker = new Worker('langdetect-worker.js?worker');
detectionWorker.onmessage = (event) => { detectionWorker.onmessage = (event) => {
//console.log("event:", event.data) //console.log("event:", event.data)
if (!event.data.highlightjs.language) { if (!event.data.guesslang.language) {
return return
} }
const view = getView() const view = getView()
const state = view.state const state = view.state
const block = getActiveNoteBlock(state) const block = getActiveNoteBlock(state)
const newLang = HIGHLIGHTJS_TO_TOKEN[event.data.highlightjs.language] const newLang = GUESSLANG_TO_TOKEN[event.data.guesslang.language]
if (block.language.auto === true && block.language.name !== newLang) { if (block.language.auto === true && block.language.name !== newLang) {
console.log("New auto detected language:", newLang, "Relevance:", event.data.highlightjs.relevance) console.log("New auto detected language:", newLang, "Confidence:", event.data.guesslang.confidence)
if (event.data.highlightjs.relevance >= 7) {
let content = state.doc.sliceString(block.content.from, block.content.to) let content = state.doc.sliceString(block.content.from, block.content.to)
const threshold = content.length * 0.1 const threshold = content.length * 0.1
if (levenshtein_distance(content, event.data.content) <= threshold) { if (levenshtein_distance(content, event.data.content) <= threshold) {
@ -42,7 +41,6 @@ export function languageDetection(getView) {
} }
} }
} }
}
const plugin = EditorView.updateListener.of(update => { const plugin = EditorView.updateListener.of(update => {
if (update.docChanged) { if (update.docChanged) {

View File

@ -14,24 +14,24 @@ import { rustLanguage } from "@codemirror/lang-rust"
class Language { class Language {
constructor(token, name, parser, highlightjs, supportsFormat=false) { constructor(token, name, parser, guesslang, supportsFormat=false) {
this.token = token this.token = token
this.name = name this.name = name
this.parser = parser this.parser = parser
this.highlightjs = highlightjs this.guesslang = guesslang
this.supportsFormat = supportsFormat this.supportsFormat = supportsFormat
} }
} }
export const LANGUAGES = [ export const LANGUAGES = [
new Language("text", "Plain Text", null, "plaintext"), new Language("text", "Plain Text", null, null),
new Language("math", "Math", null, null), new Language("math", "Math", null, null),
new Language("javascript", "JavaScript", javascriptLanguage.parser, "javascript", true), new Language("javascript", "JavaScript", javascriptLanguage.parser, "js", true),
new Language("json", "JSON", jsonLanguage.parser, "json", true), new Language("json", "JSON", jsonLanguage.parser, "json", true),
new Language("python", "Python", pythonLanguage.parser, "python"), new Language("python", "Python", pythonLanguage.parser, "py"),
new Language("html", "HTML", htmlLanguage.parser, "html", true), new Language("html", "HTML", htmlLanguage.parser, "html", true),
new Language("sql", "SQL", StandardSQL.language.parser, "sql"), new Language("sql", "SQL", StandardSQL.language.parser, "sql"),
new Language("markdown", "Markdown", markdownLanguage.parser, "markdown", true), new Language("markdown", "Markdown", markdownLanguage.parser, "md", true),
new Language("java", "Java", javaLanguage.parser, "java"), new Language("java", "Java", javaLanguage.parser, "java"),
//new Language("lezer", "Lezer", lezerLanguage.parser, "lezer"), //new Language("lezer", "Lezer", lezerLanguage.parser, "lezer"),
new Language("php", "PHP", phpLanguage.parser, "php"), new Language("php", "PHP", phpLanguage.parser, "php"),