mirror of
https://github.com/heyman/heynote.git
synced 2024-11-21 15:33:14 +01:00
Use guesslang-js for language auto detection instead of Highlight.js
Should (hopefully) reduce false positives.
This commit is contained in:
parent
201ae99370
commit
6f53b61bb0
28
public/guesslang.min.js
vendored
Normal file
28
public/guesslang.min.js
vendored
Normal file
File diff suppressed because one or more lines are too long
1202
public/highlight.min.js
vendored
1202
public/highlight.min.js
vendored
File diff suppressed because one or more lines are too long
@@ -1,21 +1,22 @@
|
||||
importScripts("highlight.min.js")
|
||||
importScripts("guesslang.min.js")
|
||||
|
||||
const HIGHLIGHTJS_LANGUAGES = [
|
||||
"json",
|
||||
"python",
|
||||
"javascript",
|
||||
"html",
|
||||
"sql",
|
||||
"java",
|
||||
"plaintext",
|
||||
"cpp",
|
||||
"php",
|
||||
"css",
|
||||
"markdown",
|
||||
GUESSLANG_LANGUAGES = [
|
||||
"json",
|
||||
"py",
|
||||
"js",
|
||||
"html",
|
||||
"sql",
|
||||
"java",
|
||||
"cpp",
|
||||
"php",
|
||||
"css",
|
||||
"xml",
|
||||
"rust",
|
||||
"rs",
|
||||
"md",
|
||||
]
|
||||
|
||||
const guessLang = new self.GuessLang()
|
||||
|
||||
onmessage = (event) => {
|
||||
//console.log("worker received message:", event.data)
|
||||
//importScripts("../../lib/highlight.min.js")
|
||||
@@ -34,10 +35,9 @@ onmessage = (event) => {
|
||||
try {
|
||||
if (typeof JSON.parse(trimmedContent) === "object") {
|
||||
postMessage({
|
||||
highlightjs: {
|
||||
guesslang: {
|
||||
language: "json",
|
||||
relevance: 100,
|
||||
illegal: false,
|
||||
confidence: 1.0,
|
||||
},
|
||||
content: content,
|
||||
idx: event.data.idx,
|
||||
@@ -49,14 +49,39 @@ onmessage = (event) => {
|
||||
}
|
||||
}
|
||||
|
||||
const result = self.hljs.highlightAuto(content, HIGHLIGHTJS_LANGUAGES);
|
||||
postMessage({
|
||||
highlightjs: {
|
||||
language: result.language,
|
||||
relevance: result.relevance,
|
||||
illegal: result.illegal,
|
||||
},
|
||||
content: content,
|
||||
idx: event.data.idx,
|
||||
//let startTime = performance.now()
|
||||
guessLang.runModel(content).then((result) => {
|
||||
//const duration = performance.now() - startTime
|
||||
//console.log("Guessing language done:", result, result[0]?.languageId, result[0]?.confidence)
|
||||
//console.log("Guessing language took", duration, "ms")
|
||||
|
||||
if (result.length > 0) {
|
||||
// for the language that is most likely according to GuessLang we have a lower threshold (0.15)
|
||||
const lang = result[0]
|
||||
if (GUESSLANG_LANGUAGES.includes(lang.languageId) && lang.confidence > 0.15) {
|
||||
postMessage({
|
||||
guesslang: {
|
||||
language: lang.languageId,
|
||||
confidence: lang.confidence,
|
||||
},
|
||||
content: content,
|
||||
idx: event.data.idx,
|
||||
})
|
||||
return
|
||||
}
|
||||
}
|
||||
for (let lang of result) {
|
||||
if (GUESSLANG_LANGUAGES.includes(lang.languageId) && lang.confidence > 0.5) {
|
||||
postMessage({
|
||||
guesslang: {
|
||||
language: lang.languageId,
|
||||
confidence: lang.confidence,
|
||||
},
|
||||
content: content,
|
||||
idx: event.data.idx,
|
||||
})
|
||||
return
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
@@ -7,7 +7,7 @@ import { LANGUAGES } from "../languages";
|
||||
import { changeLanguageTo } from "../block/commands";
|
||||
import { LANGUAGE_CHANGE } from "../annotation";
|
||||
|
||||
const HIGHLIGHTJS_TO_TOKEN = Object.fromEntries(LANGUAGES.map(l => [l.highlightjs,l.token]))
|
||||
const GUESSLANG_TO_TOKEN = Object.fromEntries(LANGUAGES.map(l => [l.guesslang,l.token]))
|
||||
|
||||
|
||||
export function languageDetection(getView) {
|
||||
@@ -17,29 +17,27 @@ export function languageDetection(getView) {
|
||||
const detectionWorker = new Worker('langdetect-worker.js?worker');
|
||||
detectionWorker.onmessage = (event) => {
|
||||
//console.log("event:", event.data)
|
||||
if (!event.data.highlightjs.language) {
|
||||
if (!event.data.guesslang.language) {
|
||||
return
|
||||
}
|
||||
const view = getView()
|
||||
const state = view.state
|
||||
const block = getActiveNoteBlock(state)
|
||||
const newLang = HIGHLIGHTJS_TO_TOKEN[event.data.highlightjs.language]
|
||||
const newLang = GUESSLANG_TO_TOKEN[event.data.guesslang.language]
|
||||
if (block.language.auto === true && block.language.name !== newLang) {
|
||||
console.log("New auto detected language:", newLang, "Relevance:", event.data.highlightjs.relevance)
|
||||
if (event.data.highlightjs.relevance >= 7) {
|
||||
let content = state.doc.sliceString(block.content.from, block.content.to)
|
||||
const threshold = content.length * 0.1
|
||||
if (levenshtein_distance(content, event.data.content) <= threshold) {
|
||||
// the content has not changed significantly so it's safe to change the language
|
||||
if (redoDepth(state) === 0) {
|
||||
console.log("Changing language to", newLang)
|
||||
changeLanguageTo(state, view.dispatch, block, newLang, true)
|
||||
} else {
|
||||
console.log("Not changing language because the user has undo:ed and has redo history")
|
||||
}
|
||||
console.log("New auto detected language:", newLang, "Confidence:", event.data.guesslang.confidence)
|
||||
let content = state.doc.sliceString(block.content.from, block.content.to)
|
||||
const threshold = content.length * 0.1
|
||||
if (levenshtein_distance(content, event.data.content) <= threshold) {
|
||||
// the content has not changed significantly so it's safe to change the language
|
||||
if (redoDepth(state) === 0) {
|
||||
console.log("Changing language to", newLang)
|
||||
changeLanguageTo(state, view.dispatch, block, newLang, true)
|
||||
} else {
|
||||
console.log("Content has changed significantly, not setting new language")
|
||||
console.log("Not changing language because the user has undo:ed and has redo history")
|
||||
}
|
||||
} else {
|
||||
console.log("Content has changed significantly, not setting new language")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -14,24 +14,24 @@ import { rustLanguage } from "@codemirror/lang-rust"
|
||||
|
||||
|
||||
class Language {
|
||||
constructor(token, name, parser, highlightjs, supportsFormat=false) {
|
||||
constructor(token, name, parser, guesslang, supportsFormat=false) {
|
||||
this.token = token
|
||||
this.name = name
|
||||
this.parser = parser
|
||||
this.highlightjs = highlightjs
|
||||
this.guesslang = guesslang
|
||||
this.supportsFormat = supportsFormat
|
||||
}
|
||||
}
|
||||
|
||||
export const LANGUAGES = [
|
||||
new Language("text", "Plain Text", null, "plaintext"),
|
||||
new Language("text", "Plain Text", null, null),
|
||||
new Language("math", "Math", null, null),
|
||||
new Language("javascript", "JavaScript", javascriptLanguage.parser, "javascript", true),
|
||||
new Language("javascript", "JavaScript", javascriptLanguage.parser, "js", true),
|
||||
new Language("json", "JSON", jsonLanguage.parser, "json", true),
|
||||
new Language("python", "Python", pythonLanguage.parser, "python"),
|
||||
new Language("python", "Python", pythonLanguage.parser, "py"),
|
||||
new Language("html", "HTML", htmlLanguage.parser, "html", true),
|
||||
new Language("sql", "SQL", StandardSQL.language.parser, "sql"),
|
||||
new Language("markdown", "Markdown", markdownLanguage.parser, "markdown", true),
|
||||
new Language("markdown", "Markdown", markdownLanguage.parser, "md", true),
|
||||
new Language("java", "Java", javaLanguage.parser, "java"),
|
||||
//new Language("lezer", "Lezer", lezerLanguage.parser, "lezer"),
|
||||
new Language("php", "PHP", phpLanguage.parser, "php"),
|
||||
|
Loading…
Reference in New Issue
Block a user