Language auto detection

This commit is contained in:
Jonatan Heyman 2023-01-03 16:56:07 +01:00
parent 95328ba739
commit af86743f19
19 changed files with 2172 additions and 2704 deletions

1202
heynote-codemirror/lib/highlight.min.js vendored Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@ -12,7 +12,7 @@
},
"keywords": [],
"author": "",
"license": "ISC",
"license": "",
"dependencies": {
"@codemirror/commands": "^6.1.2",
"@codemirror/lang-html": "^6.4.0",
@ -29,8 +29,7 @@
"@lezer/generator": "^1.1.3",
"@rollup/plugin-node-resolve": "^15.0.1",
"codemirror": "^6.0.1",
"i": "^0.3.7",
"npm": "^9.2.0",
"highlight.js": "^11.7.0",
"rollup": "^3.8.1",
"rollup-plugin-typescript2": "^0.34.1",
"typescript": "^4.9.4"

View File

@ -1,5 +1,5 @@
import { Annotation } from "@codemirror/state"
export const heynoteEvent = Annotation.define()
export const INITIAL_DATA = "initial-data"
export const LANGUAGE_CHANGE = "heynote-change"

View File

@ -3,14 +3,17 @@ import {
selectAll as defaultSelectAll,
moveLineUp as defaultMoveLineUp,
} from "@codemirror/commands"
import { blockState } from "./note-block"
import { heynoteEvent, LANGUAGE_CHANGE } from "../annotation.js";
import { HIGHLIGHTJS_TO_TOKEN } from "../languages"
import { blockState, getActiveNoteBlock } from "./note-block"
import { levenshtein_distance } from "../language-detection/levenshtein"
export const insertNewNote = ({ state, dispatch }) => {
if (state.readOnly)
return false
const delimText = "\n∞∞∞text\n"
const delimText = "\n∞∞∞text-a\n"
dispatch(state.replaceSelection(delimText),
{
scrollIntoView: true,
@ -22,9 +25,8 @@ export const insertNewNote = ({ state, dispatch }) => {
}
export const selectAll = ({ state, dispatch }) => {
// find which block the cursor is in
const range = state.selection.asSingle().ranges[0]
const block = state.facet(blockState).find(block => block.content.from <= range.from && block.content.to >= range.from)
const block = getActiveNoteBlock(state)
// check if all the text of the note is already selected, in which case we want to select all the text of the whole document
if (range.from === block.content.from && range.to === block.content.to) {
@ -51,3 +53,39 @@ export function moveLineUp({ state, dispatch }) {
}
return defaultMoveLineUp({state, dispatch})
}
/**
 * Replace a note block's delimiter with one naming `language`.
 *
 * @param state    editor state the change is built against (needs `doc.sliceString` and `update`)
 * @param dispatch function that receives the resulting transaction
 * @param block    note block whose `delimiter.from` / `delimiter.to` range is rewritten
 * @param language language token to write after the ∞∞∞ mark
 * @param auto     when truthy, the "-a" suffix is appended to the language token
 * @throws {Error} if the text currently in the delimiter range does not look like a delimiter
 */
export function changeLanguageTo(state, dispatch, block, language, auto) {
    const { from, to } = block.delimiter
    const currentDelimiter = state.doc.sliceString(from, to)

    // Refuse to overwrite anything that is not a well-formed delimiter.
    if (currentDelimiter.match(/^\n∞∞∞[a-z]{0,16}(-a)?\n/g) === null) {
        throw new Error("Invalid delimiter: " + currentDelimiter)
    }

    const autoSuffix = auto ? '-a' : ''
    const transaction = state.update({
        changes: {
            from,
            to,
            insert: `\n∞∞∞${language}${autoSuffix}\n`,
        },
        // Tag the transaction so listeners can recognise it as a language change.
        annotations: [heynoteEvent.of(LANGUAGE_CHANGE)],
    })
    dispatch(transaction)
}
/**
 * Editor command: run highlight.js auto-detection on the active note block and,
 * if a supported language is found, rewrite the block's delimiter to that
 * language with the auto flag set.
 *
 * NOTE(review): `hljs` is not imported in the visible part of this file; it is
 * presumably provided as a global by the vendored highlight.min.js — confirm.
 *
 * @param {{state: object, dispatch: Function}} target  command target (editor state and dispatch)
 * @returns {boolean} `true` when the command was handled (even if nothing was
 *          detected), `false` when there is no active block, so other key
 *          bindings may still run.
 */
export function autoDetectLanguage({ state, dispatch }) {
    const block = getActiveNoteBlock(state)
    if (!block) {
        return false
    }

    const content = state.doc.sliceString(block.content.from, block.content.to)
    const result = hljs.highlightAuto(content, ["json", "python", "javascript", "html", "sql", "java", "plaintext"])

    // Only switch when highlight.js reported a language that maps to a known
    // token; otherwise "undefined" would be written into the delimiter.
    const language = result.language ? HIGHLIGHTJS_TO_TOKEN[result.language] : undefined
    if (language) {
        changeLanguageTo(state, dispatch, block, language, true)
    }
    return true
}

View File

@ -5,7 +5,7 @@ import { RangeSet } from "@codemirror/rangeset";
import { syntaxTree } from "@codemirror/language"
import { Note, Document, NoteDelimiter } from "../lang-heynote/parser.terms.js"
import { IterMode } from "@lezer/common";
import { INITIAL_DATA } from "../annotation.js";
import { heynoteEvent, LANGUAGE_CHANGE } from "../annotation.js";
// tracks the size of the first delimiter
@ -18,8 +18,15 @@ function getBlocks(state) {
if (type.type.id == Document || type.type.id == Note) {
return true
} else if (type.type.id === NoteDelimiter) {
const langNode = type.node.getChild("NoteLanguage")
const language = state.doc.sliceString(langNode.from, langNode.to)
const isAuto = !!type.node.getChild("Auto")
const contentNode = type.node.nextSibling
blocks.push({
language: {
name: language,
auto: isAuto,
},
content: {
from: contentNode.from,
to: contentNode.to,
@ -198,7 +205,8 @@ const blockLayer = layer({
const preventFirstBlockFromBeingDeleted = EditorState.changeFilter.of((tr) => {
if (!tr.annotations.some(a => a.value === INITIAL_DATA) && firstBlockDelimiterSize) {
//console.log("annotations:", tr.annotation(heynoteEvent), tr.annotations.some(a => tr.annotation(heynoteEvent)))
if (!tr.annotations.some(a => a.type === heynoteEvent) && firstBlockDelimiterSize) {
return [0, firstBlockDelimiterSize]
}
})

View File

@ -9,6 +9,7 @@ import { customSetup } from "./setup.js"
import { heynoteLang } from "./lang-heynote/heynote.js"
import { noteBlockExtension } from "./block/note-block.js"
import { heynoteKeymap } from "./keymap.js"
import { languageDetection } from "./language-detection/autodetect.js"
export class HeynoteEditor {
@ -30,6 +31,7 @@ export class HeynoteEditor {
}),
heynoteLang(),
noteBlockExtension(),
languageDetection(() => this.view),
// set cursor blink rate to 1 second
drawSelection({cursorBlinkRate:1000}),
@ -70,23 +72,3 @@ editor.update([
})
])*/
/*
// render syntax tree
setTimeout(() => {
function render(tree) {
let lists = ''
tree.iterate({
enter(type) {
lists += `<ul><li>${type.name} (${type.from},${type.to})`
},
leave() {
lists += '</ul>'
}
})
return lists
}
let html = render(syntaxTree(editor.state))
document.getElementById("syntaxTree").innerHTML = html;
}, 1000)
*/

View File

@ -5,7 +5,7 @@ hej∞∞∞python
f = lambda: 2 +1`;*/
export default `
python
text-a
# hmm
def my_func():
print("hejsan")
@ -16,7 +16,7 @@ import {EditorView, keymap} from "@codemirror/view"
import {javascript} from "@codemirror/lang-javascript"
import {indentWithTab, insertTab, indentLess, indentMore} from "@codemirror/commands"
import {nord} from "./nord.mjs"
javascript
javascript-a
let editor = new EditorView({
//extensions: [basicSetup, javascript()],
extensions: [

View File

@ -1,3 +1,5 @@
import { syntaxTree } from "@codemirror/language"
import { HeynoteEditor } from "./editor.js"
import initialData from "./fixture.js"
@ -5,3 +7,23 @@ let editor = new HeynoteEditor({
element: document.getElementById("editor"),
content: initialData,
})
/*// render syntax tree
setTimeout(() => {
function render(tree) {
let lists = ''
tree.iterate({
enter(type) {
lists += `<ul><li>${type.name} (${type.from},${type.to})`
},
leave() {
lists += '</ul>'
}
})
return lists
}
let html = render(syntaxTree(editor.state))
document.getElementById("syntaxTree").innerHTML = html;
}, 1000)
*/

View File

@ -1,6 +1,6 @@
import { keymap } from "@codemirror/view"
import { indentWithTab, insertTab, indentLess, indentMore } from "@codemirror/commands"
import { insertNewNote, moveLineUp, selectAll } from "./block/commands.js";
import { insertNewNote, moveLineUp, selectAll, autoDetectLanguage } from "./block/commands.js";
export const heynoteKeymap = keymap.of([
{
@ -28,5 +28,10 @@ export const heynoteKeymap = keymap.of([
key: "Alt-ArrowUp",
preventDefault: true,
run: moveLineUp,
}
},
{
key: "Mod-Shift-a",
preventDefault: true,
run: autoDetectLanguage,
},
])

View File

@ -13,10 +13,10 @@ export const noteContent = new ExternalTokenizer((input) => {
while (true) {
let potentialLang = "";
for (let i=0; i<16; i++) {
for (let i=0; i<18; i++) {
potentialLang += String.fromCharCode(input.peek(i));
}
if (potentialLang.match(/^\n∞∞∞(text|javascript|json|python|html|sql|markdown|java|lezer|php)\n/g)) {
if (potentialLang.match(/^\n∞∞∞(text|javascript|json|python|html|sql|markdown|java|lezer|php)(-a)?\n/g)) {
input.acceptToken(NoteContent);
return;
}

View File

@ -5,13 +5,14 @@ Note {
}
NoteDelimiter {
noteDelimiterEnter noteDelimiterMark NoteLanguage noteDelimiterEnter
noteDelimiterEnter noteDelimiterMark NoteLanguage Auto? noteDelimiterEnter
}
@tokens {
noteDelimiterMark { "∞∞∞" }
NoteLanguage { "text" | "javascript" | "json" | "python" | "html" | "sql" | "markdown" | "java" | "lezer" | "php" }
Auto { "-a" }
noteDelimiterEnter { "\n" }
//NoteContent { String }
//String { (![∞])+ }

View File

@ -3,14 +3,14 @@ import {LRParser} from "@lezer/lr"
import {noteContent} from "./external-tokens.js"
export const parser = LRParser.deserialize({
version: 14,
states: "!^QQOPOOOVOPO'#C`O[OQO'#C_OOOO'#Cb'#CbQQOPOOOaOPO,58zOOOO,58y,58yOOOO-E6`-E6`OfOPO1G.fOOOQ7+$Q7+$Q",
stateData: "k~OWPO~OXTO~OPUO~OTWO~OWXO~O",
goto: "fVPPPW[P`TROSTQOSQSORVS",
nodeNames: "⚠ NoteContent Document Note NoteDelimiter NoteLanguage",
maxTerm: 9,
states: "!jQQOPOOOVOPO'#C`O[OQO'#C_OOOO'#Cc'#CcQQOPOOOaOPO,58zOOOO,58y,58yOOOO-E6a-E6aOfOPO1G.fOOOQ7+$Q7+$QOnOPO7+$QOOOQ<<Gl<<Gl",
stateData: "s~OXPO~OYTO~OPUO~OTWO~OUYOXXO~OXZO~O",
goto: "gWPPPX]PPaTROSTQOSQSORVS",
nodeNames: "⚠ NoteContent Document Note NoteDelimiter NoteLanguage Auto",
maxTerm: 10,
skippedNodes: [0],
repeatNodeCount: 1,
tokenData: "&e~RXYZn#[#]s#^#_![#`#a#u#a#b$_#d#e%T#g#h%p#h#i%v%&x%&y&S~sOW~~vP#h#iy~|P#a#b!P~!SP#`#a!V~![OT~~!_Q#T#U!e#g#h#i~!hP#j#k!k~!nP#T#U!q~!vPT~#g#h!y~!|P#V#W#P~#SP#f#g#V~#YP#]#^#]~#`P#d#e#c~#fP#h#i!V~#lP#c#d#o~#rP#b#c!V~#xP#X#Y#{~$OP#n#o$R~$UP#X#Y$X~$[P#f#g!V~$bP#T#U$e~$hP#f#g$k~$nP#_#`$q~$tP#W#X$w~$zP#c#d$}~%QP#k#l#o~%WQ#[#]%^#m#n%d~%aP#d#e!V~%gP#h#i%j~%mP#[#]#i~%sP#e#f!P~%yP#X#Y%|~&PP#l#m#c~&VP%&x%&y&Y~&]P%&x%&y&`~&eOX~",
tokenData: "&s~RYYZq}!Ov#[#]!R#^#_!j#`#a$T#a#b$m#d#e%c#g#h&O#h#i&U%&x%&y&b~vOX~~yP#T#U|~!ROU~~!UP#h#i!X~![P#a#b!_~!bP#`#a!e~!jOT~~!mQ#T#U!s#g#h#w~!vP#j#k!y~!|P#T#U#P~#UPT~#g#h#X~#[P#V#W#_~#bP#f#g#e~#hP#]#^#k~#nP#d#e#q~#tP#h#i!e~#zP#c#d#}~$QP#b#c!e~$WP#X#Y$Z~$^P#n#o$a~$dP#X#Y$g~$jP#f#g!e~$pP#T#U$s~$vP#f#g$y~$|P#_#`%P~%SP#W#X%V~%YP#c#d%]~%`P#k#l#}~%fQ#[#]%l#m#n%r~%oP#d#e!e~%uP#h#i%x~%{P#[#]#w~&RP#e#f!_~&XP#X#Y&[~&_P#l#m#q~&eP%&x%&y&h~&kP%&x%&y&n~&sOY~",
tokenizers: [0, noteContent],
topRules: {"Document":[0,2]},
tokenPrec: 0

View File

@ -4,4 +4,5 @@ export const
Document = 2,
Note = 3,
NoteDelimiter = 4,
NoteLanguage = 5
NoteLanguage = 5,
Auto = 6

View File

@ -0,0 +1,92 @@
import { EditorState } from "@codemirror/state";
import { EditorView } from "codemirror";
import { getActiveNoteBlock, blockState } from "../block/note-block";
import { levenshtein_distance } from "./levenshtein";
import { HIGHLIGHTJS_TO_TOKEN } from "../languages";
import { changeLanguageTo } from "../block/commands";
import { LANGUAGE_CHANGE } from "../annotation";
/**
 * Build the editor extension that auto-detects the language of the note block
 * under the cursor.
 *
 * On document changes an idle callback is scheduled; when it runs, the content
 * of the block containing the cursor is sent to the detection worker if the
 * block is flagged as auto and its content changed enough since the last
 * detection. When the worker answers, the active block's language is updated,
 * provided the block is still auto, the language differs, and the content is
 * still close to what was analysed.
 *
 * @param {() => object} getView  returns the editor view (read lazily, on each use)
 * @returns the update listener extension created by `EditorView.updateListener.of`
 */
export function languageDetection(getView) {
    // Content last sent to the worker, keyed by block index.
    const previousBlockContent = []
    let idleCallbackId = null

    const detectionWorker = new Worker('language-detection/worker.js')
    detectionWorker.onmessage = (event) => {
        const detectedLanguage = event.data.highlightjs.language
        if (!detectedLanguage) {
            return
        }
        const newLang = HIGHLIGHTJS_TO_TOKEN[detectedLanguage]
        if (!newLang) {
            // Unknown highlight.js name: never write "undefined" into the delimiter.
            return
        }

        const view = getView()
        const state = view.state
        const block = getActiveNoteBlock(state)
        if (!block || block.language.auto !== true || block.language.name === newLang) {
            return
        }

        // Only apply the result if the block still looks like what was analysed.
        const content = state.doc.sliceString(block.content.from, block.content.to)
        const threshold = content.length * 0.1
        if (levenshtein_distance(content, event.data.content) <= threshold) {
            changeLanguageTo(state, (tr) => view.dispatch(tr), block, newLang, true)
        }
    }

    const plugin = EditorView.updateListener.of(update => {
        if (!update.docChanged) {
            return
        }
        // A newer change supersedes any detection that is still pending.
        if (idleCallbackId !== null) {
            cancelIdleCallback(idleCallbackId)
            idleCallbackId = null
        }
        // Don't run language detection if the change was itself a language change.
        if (update.transactions.every(tr => tr.annotations.some(a => a.value === LANGUAGE_CHANGE))) {
            return
        }

        idleCallbackId = requestIdleCallback(() => {
            idleCallbackId = null

            const range = update.state.selection.asSingle().ranges[0]
            const blocks = update.state.facet(blockState)
            const idx = blocks.findIndex(b => b.content.from <= range.from && b.content.to >= range.from)
            if (idx === -1) {
                return
            }
            const block = blocks[idx]
            if (block.language.auto === false) {
                return
            }

            const content = update.state.doc.sliceString(block.content.from, block.content.to)
            if (content === "") {
                // Content was cleared: fall back to plain text. The change is built
                // against the view's current state; skip it when the active block is
                // missing, not auto, or already "text" (the dispatch would be a no-op).
                const view = getView()
                const activeBlock = getActiveNoteBlock(view.state)
                if (activeBlock && activeBlock.language.auto && activeBlock.language.name !== "text") {
                    changeLanguageTo(view.state, (tr) => view.dispatch(tr), activeBlock, "text", true)
                }
                return
            }
            if (content.length <= 8) {
                // Too short to detect anything.
                return
            }

            const threshold = content.length * 0.1
            const previous = previousBlockContent[idx]
            if (!previous || levenshtein_distance(previous, content) >= threshold) {
                // The content has changed significantly, so schedule a language detection.
                detectionWorker.postMessage({
                    content: content,
                    idx: idx,
                })
                previousBlockContent[idx] = content
            }
        })
    })

    return plugin
}

View File

@ -0,0 +1,99 @@
/**
 * Pick the next cell value from three neighbouring candidates.
 *
 * If `d0` or `d2` is strictly smaller than `d1`, the smaller of those two plus
 * one is returned. Otherwise `d1` is used: unchanged when the two character
 * codes `bx` and `ay` are equal, plus one when they differ.
 */
function _min(d0, d1, d2, bx, ay) {
    if (d0 < d1 || d2 < d1) {
        return (d0 > d2 ? d2 : d0) + 1;
    }
    if (bx === ay) {
        return d1;
    }
    return d1 + 1;
}
/**
 * Compute the Levenshtein (edit) distance between two strings.
 *
 * Characters are compared with `charCodeAt`, i.e. per UTF-16 code unit.
 * NOTE(review): this looks like the js-levenshtein algorithm — confirm origin/licence.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} 0 when the strings are identical, otherwise the computed distance
 */
export function levenshtein_distance (a, b) {
// identical strings: nothing to compute
if (a === b) {
return 0;
}
// make sure `a` is the shorter (or equally long) string
if (a.length > b.length) {
var tmp = a;
a = b;
b = tmp;
}
var la = a.length;
var lb = b.length;
// strip the common suffix by shrinking both effective lengths
while (la > 0 && (a.charCodeAt(la - 1) === b.charCodeAt(lb - 1))) {
la--;
lb--;
}
// strip the common prefix; `offset` is where the differing part starts
var offset = 0;
while (offset < la && (a.charCodeAt(offset) === b.charCodeAt(offset))) {
offset++;
}
la -= offset;
lb -= offset;
// shortcut: nothing left of `a`, or fewer than 3 characters left of `b`
if (la === 0 || lb < 3) {
return lb;
}
var x = 0;
var y;
var d0;
var d1;
var d2;
var d3;
var dd;
var dy;
var ay;
var bx0;
var bx1;
var bx2;
var bx3;
// `vector` holds pairs: [running cost, char code of a] for each remaining char of `a`
var vector = [];
for (y = 0; y < la; y++) {
vector.push(y + 1);
vector.push(a.charCodeAt(offset + y));
}
// index of the last element; the loops below step through the pairs two at a time
var len = vector.length - 1;
// main loop: consume four characters of `b` per iteration
for (; x < lb - 3;) {
bx0 = b.charCodeAt(offset + (d0 = x));
bx1 = b.charCodeAt(offset + (d1 = x + 1));
bx2 = b.charCodeAt(offset + (d2 = x + 2));
bx3 = b.charCodeAt(offset + (d3 = x + 3));
dd = (x += 4);
for (y = 0; y < len; y += 2) {
dy = vector[y];
ay = vector[y + 1];
d0 = _min(dy, d0, d1, bx0, ay);
d1 = _min(d0, d1, d2, bx1, ay);
d2 = _min(d1, d2, d3, bx2, ay);
dd = _min(d2, d3, dd, bx3, ay);
// store the cost for the fourth character back into the pair's cost slot
vector[y] = dd;
d3 = d2;
d2 = d1;
d1 = d0;
d0 = dy;
}
}
// tail loop: handle the remaining characters of `b` one at a time
for (; x < lb;) {
bx0 = b.charCodeAt(offset + (d0 = x));
dd = ++x;
for (y = 0; y < len; y += 2) {
dy = vector[y];
vector[y] = dd = _min(dy, d0, dd, bx0, vector[y + 1]);
d0 = dy;
}
}
// the last value computed is the result
return dd;
};

View File

@ -0,0 +1,16 @@
// Languages highlight.js is asked to choose between.
const HIGHLIGHTJS_LANGUAGES = ["json", "python", "javascript", "html", "sql", "java", "plaintext"]

// Whether highlight.min.js has been loaded into this worker yet.
let highlightJsLoaded = false

/**
 * Worker message handler: run highlight.js auto-detection on `event.data.content`
 * and post back the detected language, relevance and illegal flag, echoing the
 * `content` and `idx` that were sent in.
 *
 * The handler is assigned through `globalThis` (the worker's global scope) so
 * the assignment is explicit and also valid in strict mode.
 */
globalThis.onmessage = (event) => {
    // Load the highlight.js bundle once, on the first message, instead of
    // re-running the whole script for every detection request.
    if (!highlightJsLoaded) {
        importScripts("../../lib/highlight.min.js")
        highlightJsLoaded = true
    }

    const { content, idx } = event.data
    const result = globalThis.hljs.highlightAuto(content, HIGHLIGHTJS_LANGUAGES)
    postMessage({
        highlightjs: {
            language: result.language,
            relevance: result.relevance,
            illegal: result.illegal,
        },
        content: content,
        idx: idx,
    })
}

View File

@ -0,0 +1,17 @@
// Language tokens that may follow the ∞∞∞ mark in a note delimiter.
export const LANGUAGE_TOKENS = [
    "text", "javascript", "json", "python", "html",
    "sql", "markdown", "java", "lezer", "php",
]

// Maps a highlight.js language name to the corresponding language token.
// Every token maps to itself; highlight.js's "plaintext" maps to "text".
export const HIGHLIGHTJS_TO_TOKEN = {}
for (const token of LANGUAGE_TOKENS) {
    HIGHLIGHTJS_TO_TOKEN[token] = token
}
HIGHLIGHTJS_TO_TOKEN.plaintext = "text"