mirror of
https://github.com/heyman/heynote.git
synced 2024-11-21 23:43:22 +01:00
Use guesslang-js for language auto detection instead of Highlight.js
Should (hopefully) reduce false positives.
This commit is contained in:
parent
201ae99370
commit
6f53b61bb0
28
public/guesslang.min.js
vendored
Normal file
28
public/guesslang.min.js
vendored
Normal file
File diff suppressed because one or more lines are too long
1202
public/highlight.min.js
vendored
1202
public/highlight.min.js
vendored
File diff suppressed because one or more lines are too long
@ -1,21 +1,22 @@
|
|||||||
importScripts("highlight.min.js")
|
importScripts("guesslang.min.js")
|
||||||
|
|
||||||
const HIGHLIGHTJS_LANGUAGES = [
|
GUESSLANG_LANGUAGES = [
|
||||||
"json",
|
"json",
|
||||||
"python",
|
"py",
|
||||||
"javascript",
|
"js",
|
||||||
"html",
|
"html",
|
||||||
"sql",
|
"sql",
|
||||||
"java",
|
"java",
|
||||||
"plaintext",
|
|
||||||
"cpp",
|
"cpp",
|
||||||
"php",
|
"php",
|
||||||
"css",
|
"css",
|
||||||
"markdown",
|
|
||||||
"xml",
|
"xml",
|
||||||
"rust",
|
"rs",
|
||||||
|
"md",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
const guessLang = new self.GuessLang()
|
||||||
|
|
||||||
onmessage = (event) => {
|
onmessage = (event) => {
|
||||||
//console.log("worker received message:", event.data)
|
//console.log("worker received message:", event.data)
|
||||||
//importScripts("../../lib/highlight.min.js")
|
//importScripts("../../lib/highlight.min.js")
|
||||||
@ -34,10 +35,9 @@ onmessage = (event) => {
|
|||||||
try {
|
try {
|
||||||
if (typeof JSON.parse(trimmedContent) === "object") {
|
if (typeof JSON.parse(trimmedContent) === "object") {
|
||||||
postMessage({
|
postMessage({
|
||||||
highlightjs: {
|
guesslang: {
|
||||||
language: "json",
|
language: "json",
|
||||||
relevance: 100,
|
confidence: 1.0,
|
||||||
illegal: false,
|
|
||||||
},
|
},
|
||||||
content: content,
|
content: content,
|
||||||
idx: event.data.idx,
|
idx: event.data.idx,
|
||||||
@ -49,14 +49,39 @@ onmessage = (event) => {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const result = self.hljs.highlightAuto(content, HIGHLIGHTJS_LANGUAGES);
|
//let startTime = performance.now()
|
||||||
|
guessLang.runModel(content).then((result) => {
|
||||||
|
//const duration = performance.now() - startTime
|
||||||
|
//console.log("Guessing language done:", result, result[0]?.languageId, result[0]?.confidence)
|
||||||
|
//console.log("Guessing language took", duration, "ms")
|
||||||
|
|
||||||
|
if (result.length > 0) {
|
||||||
|
// for the language that is most likely according to GuessLang we have a lower threshold (0.15)
|
||||||
|
const lang = result[0]
|
||||||
|
if (GUESSLANG_LANGUAGES.includes(lang.languageId) && lang.confidence > 0.15) {
|
||||||
postMessage({
|
postMessage({
|
||||||
highlightjs: {
|
guesslang: {
|
||||||
language: result.language,
|
language: lang.languageId,
|
||||||
relevance: result.relevance,
|
confidence: lang.confidence,
|
||||||
illegal: result.illegal,
|
|
||||||
},
|
},
|
||||||
content: content,
|
content: content,
|
||||||
idx: event.data.idx,
|
idx: event.data.idx,
|
||||||
})
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (let lang of result) {
|
||||||
|
if (GUESSLANG_LANGUAGES.includes(lang.languageId) && lang.confidence > 0.5) {
|
||||||
|
postMessage({
|
||||||
|
guesslang: {
|
||||||
|
language: lang.languageId,
|
||||||
|
confidence: lang.confidence,
|
||||||
|
},
|
||||||
|
content: content,
|
||||||
|
idx: event.data.idx,
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
@ -7,7 +7,7 @@ import { LANGUAGES } from "../languages";
|
|||||||
import { changeLanguageTo } from "../block/commands";
|
import { changeLanguageTo } from "../block/commands";
|
||||||
import { LANGUAGE_CHANGE } from "../annotation";
|
import { LANGUAGE_CHANGE } from "../annotation";
|
||||||
|
|
||||||
const HIGHLIGHTJS_TO_TOKEN = Object.fromEntries(LANGUAGES.map(l => [l.highlightjs,l.token]))
|
const GUESSLANG_TO_TOKEN = Object.fromEntries(LANGUAGES.map(l => [l.guesslang,l.token]))
|
||||||
|
|
||||||
|
|
||||||
export function languageDetection(getView) {
|
export function languageDetection(getView) {
|
||||||
@ -17,16 +17,15 @@ export function languageDetection(getView) {
|
|||||||
const detectionWorker = new Worker('langdetect-worker.js?worker');
|
const detectionWorker = new Worker('langdetect-worker.js?worker');
|
||||||
detectionWorker.onmessage = (event) => {
|
detectionWorker.onmessage = (event) => {
|
||||||
//console.log("event:", event.data)
|
//console.log("event:", event.data)
|
||||||
if (!event.data.highlightjs.language) {
|
if (!event.data.guesslang.language) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
const view = getView()
|
const view = getView()
|
||||||
const state = view.state
|
const state = view.state
|
||||||
const block = getActiveNoteBlock(state)
|
const block = getActiveNoteBlock(state)
|
||||||
const newLang = HIGHLIGHTJS_TO_TOKEN[event.data.highlightjs.language]
|
const newLang = GUESSLANG_TO_TOKEN[event.data.guesslang.language]
|
||||||
if (block.language.auto === true && block.language.name !== newLang) {
|
if (block.language.auto === true && block.language.name !== newLang) {
|
||||||
console.log("New auto detected language:", newLang, "Relevance:", event.data.highlightjs.relevance)
|
console.log("New auto detected language:", newLang, "Confidence:", event.data.guesslang.confidence)
|
||||||
if (event.data.highlightjs.relevance >= 7) {
|
|
||||||
let content = state.doc.sliceString(block.content.from, block.content.to)
|
let content = state.doc.sliceString(block.content.from, block.content.to)
|
||||||
const threshold = content.length * 0.1
|
const threshold = content.length * 0.1
|
||||||
if (levenshtein_distance(content, event.data.content) <= threshold) {
|
if (levenshtein_distance(content, event.data.content) <= threshold) {
|
||||||
@ -42,7 +41,6 @@ export function languageDetection(getView) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
const plugin = EditorView.updateListener.of(update => {
|
const plugin = EditorView.updateListener.of(update => {
|
||||||
if (update.docChanged) {
|
if (update.docChanged) {
|
||||||
|
@ -14,24 +14,24 @@ import { rustLanguage } from "@codemirror/lang-rust"
|
|||||||
|
|
||||||
|
|
||||||
class Language {
|
class Language {
|
||||||
constructor(token, name, parser, highlightjs, supportsFormat=false) {
|
constructor(token, name, parser, guesslang, supportsFormat=false) {
|
||||||
this.token = token
|
this.token = token
|
||||||
this.name = name
|
this.name = name
|
||||||
this.parser = parser
|
this.parser = parser
|
||||||
this.highlightjs = highlightjs
|
this.guesslang = guesslang
|
||||||
this.supportsFormat = supportsFormat
|
this.supportsFormat = supportsFormat
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export const LANGUAGES = [
|
export const LANGUAGES = [
|
||||||
new Language("text", "Plain Text", null, "plaintext"),
|
new Language("text", "Plain Text", null, null),
|
||||||
new Language("math", "Math", null, null),
|
new Language("math", "Math", null, null),
|
||||||
new Language("javascript", "JavaScript", javascriptLanguage.parser, "javascript", true),
|
new Language("javascript", "JavaScript", javascriptLanguage.parser, "js", true),
|
||||||
new Language("json", "JSON", jsonLanguage.parser, "json", true),
|
new Language("json", "JSON", jsonLanguage.parser, "json", true),
|
||||||
new Language("python", "Python", pythonLanguage.parser, "python"),
|
new Language("python", "Python", pythonLanguage.parser, "py"),
|
||||||
new Language("html", "HTML", htmlLanguage.parser, "html", true),
|
new Language("html", "HTML", htmlLanguage.parser, "html", true),
|
||||||
new Language("sql", "SQL", StandardSQL.language.parser, "sql"),
|
new Language("sql", "SQL", StandardSQL.language.parser, "sql"),
|
||||||
new Language("markdown", "Markdown", markdownLanguage.parser, "markdown", true),
|
new Language("markdown", "Markdown", markdownLanguage.parser, "md", true),
|
||||||
new Language("java", "Java", javaLanguage.parser, "java"),
|
new Language("java", "Java", javaLanguage.parser, "java"),
|
||||||
//new Language("lezer", "Lezer", lezerLanguage.parser, "lezer"),
|
//new Language("lezer", "Lezer", lezerLanguage.parser, "lezer"),
|
||||||
new Language("php", "PHP", phpLanguage.parser, "php"),
|
new Language("php", "PHP", phpLanguage.parser, "php"),
|
||||||
|
Loading…
Reference in New Issue
Block a user