Load independent and minimal syntax sets when using --language (#1787)

This significantly speeds up the startup time of bat, since only a single
linked SyntaxDefinition is loaded for each file. The size increase of the
binary is just ~400 kB.

In order for startup time to be improved, the --language arg must be used, and
it must match one of the following names:

"Plain Text", "ActionScript", "AppleScript", "Batch File", "NAnt Build File",
"C#", "C", "CSS", "D", "Diff", "Erlang", "Go", "Haskell", "JSON", "Java
Properties", "BibTeX", "LaTeX Log", "TeX", "Lisp", "Lua", "MATLAB", "Pascal",
"R", "Regular Expression", "Rust", "SQL", "Scala", "Tcl", "XML", "YAML", "Apache
Conf", "ARM Assembly", "Assembly (x86_64)", "CMakeCache", "Comma Separated
Values", "Cabal", "CoffeeScript", "CpuInfo", "Dart Analysis Output", "Dart",
"Dockerfile", "DotENV", "F#", "Friendly Interactive Shell (fish)", "Fortran
(Fixed Form)", "Fortran (Modern)", "Fortran Namelist", "fstab", "GLSL",
"GraphQL", "Groff/troff", "group", "hosts", "INI", "Jinja2", "jsonnet",
"Kotlin", "Less", "LLVM", "Lean", "MemInfo", "Nim", "Ninja", "Nix", "passwd",
"PowerShell", "Protocol Buffer (TEXT)", "Puppet", "Rego", "resolv", "Robot
Framework", "SML", "Strace", "Stylus", "Solidity", "Vyper", "Swift",
"SystemVerilog", "TOML", "Terraform", "TypeScript", "TypeScriptReact",
"Verilog", "VimL", "Zig", "gnuplot", "log", "requirements.txt", "Highlight
non-printables", "Private Key", "varlink"

Later commits will improve startup time for more code paths.

* fix some typos and misspellings

* CHANGELOG.md: Add Performance section (preliminary)

* Add a CHANGELOG.md entry for this PR
This commit is contained in:
Martin Nordholts 2021-09-09 20:52:33 +02:00 committed by GitHub
parent 156dec2737
commit 9124271eaf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 182 additions and 28 deletions

View File

@ -9,12 +9,15 @@
- Python syntax highlighting no longer suffers from abysmal performance in specific scenarios. See #1688 (@keith-hall)
## Performance
- Load cached assets as fast as integrated assets, see #1753 (@Enselic)
- Greatly reduce startup time in loop-through mode, e.g. when redirecting output. Instead of *50 ms* - *100 ms*, startup takes *5 ms* - *10 ms*. See #1747 (@Enselic)
- Reduce startup time by approximately 80% for 91 out of 168 syntaxes when using `--language`. See #1787 (@Enselic)
## Other
- Add PowerShell completion, see #1826 (@rashil2000)
- Load cached assets as fast as integrated assets, see #1753 (@Enselic)
- Greatly reduce startup time in loop-through mode, e.g. when redirecting output. Instead of *50 ms* - *100 ms*, startup takes *5 ms* - *10 ms*. See #1747 (@Enselic)
## Syntaxes

BIN
assets/minimal_syntaxes.bin vendored Normal file

Binary file not shown.

View File

@ -1,3 +1,4 @@
use std::collections::HashMap;
use std::ffi::OsStr;
use std::fs;
use std::path::{Path, PathBuf};
@ -18,6 +19,14 @@ use crate::syntax_mapping::{MappingTarget, SyntaxMapping};
/// Syntax and theme assets used for highlighting: the full syntax set, the
/// smaller "minimal" syntax sets, and the themes.
pub struct HighlightingAssets {
/// Cell for the deserialized full [SyntaxSet]; `new` creates it empty.
syntax_set_cell: LazyCell<SyntaxSet>,
/// Source of the serialized full [SyntaxSet]: either a cache file or data
/// integrated in the binary.
serialized_syntax_set: SerializedSyntaxSet,
/// Name lookup table plus the still-serialized minimal [SyntaxSet]s.
minimal_syntaxes: MinimalSyntaxes,
/// Lazily load serialized [SyntaxSet]s from [Self.minimal_syntaxes]. The
/// index in this vec matches the index in
/// [Self.minimal_syntaxes.serialized_syntax_sets]
deserialized_minimal_syntaxes: Vec<LazyCell<SyntaxSet>>,
/// Theme set supplied at construction.
theme_set: ThemeSet,
/// Optional fallback theme name; `new` initializes it to `None`.
fallback_theme: Option<&'static str>,
}
@ -28,12 +37,39 @@ pub struct SyntaxReferenceInSet<'a> {
pub syntax_set: &'a SyntaxSet,
}
/// Stores and allows lookup of minimal [SyntaxSet]s. The [SyntaxSet]s are
/// stored in serialized form, and are deserialized on demand. This gives good
/// startup performance since only the necessary [SyntaxReference]s need to be
/// deserialized.
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
pub(crate) struct MinimalSyntaxes {
/// Lookup the index into `serialized_syntax_sets` of a [SyntaxSet] by the
/// name of any [SyntaxReference] inside the [SyntaxSet]. Keys are stored in
/// ASCII lowercase, so a name must be lowercased before it is looked up.
/// (We will later add `by_extension`, `by_first_line`, etc.)
pub(crate) by_name: HashMap<String, usize>,
/// Serialized [SyntaxSet]s, one entry per minimal syntax set. Whether or
/// not this data is compressed is decided by
/// [COMPRESS_SERIALIZED_MINIMAL_SYNTAXES]
pub(crate) serialized_syntax_sets: Vec<Vec<u8>>,
}
// Compress for size of ~700 kB instead of ~4600 kB at the cost of ~30% longer deserialization time
pub(crate) const COMPRESS_SYNTAXES: bool = true;
// Compress for size of ~20 kB instead of ~200 kB at the cost of ~30% longer deserialization time
pub(crate) const COMPRESS_THEMES: bool = true;
// Applies to each individual entry of [MinimalSyntaxes.serialized_syntax_sets].
// Compress for size of ~400 kB instead of ~2100 kB at the cost of ~30% longer deserialization time
pub(crate) const COMPRESS_SERIALIZED_MINIMAL_SYNTAXES: bool = true;
// Whether or not to compress the serialized form of [MinimalSyntaxes] as a
// whole (the "outer" container). Shall always be `false`, because the data in
// [MinimalSyntaxes.serialized_syntax_sets] has already been compressed
// (assuming [COMPRESS_SERIALIZED_MINIMAL_SYNTAXES] is `true`). The "outer" data
// structures like `by_name` are tiny. If we compress, deserialization can't do
// an efficient byte-by-byte copy of `serialized_syntax_sets`.
pub(crate) const COMPRESS_MINIMAL_SYNTAXES: bool = false;
const IGNORED_SUFFIXES: [&str; 13] = [
// Editor etc backups
"~",
@ -55,10 +91,20 @@ const IGNORED_SUFFIXES: [&str; 13] = [
];
impl HighlightingAssets {
fn new(serialized_syntax_set: SerializedSyntaxSet, theme_set: ThemeSet) -> Self {
fn new(
serialized_syntax_set: SerializedSyntaxSet,
minimal_syntaxes: MinimalSyntaxes,
theme_set: ThemeSet,
) -> Self {
// Prepare so we can lazily load minimal syntaxes without a mut reference
let deserialized_minimal_syntaxes =
vec![LazyCell::new(); minimal_syntaxes.serialized_syntax_sets.len()];
HighlightingAssets {
syntax_set_cell: LazyCell::new(),
serialized_syntax_set,
deserialized_minimal_syntaxes,
minimal_syntaxes,
theme_set,
fallback_theme: None,
}
@ -71,6 +117,11 @@ impl HighlightingAssets {
pub fn from_cache(cache_path: &Path) -> Result<Self> {
Ok(HighlightingAssets::new(
SerializedSyntaxSet::FromFile(cache_path.join("syntaxes.bin")),
asset_from_cache(
&cache_path.join("minimal_syntaxes.bin"),
"minimal syntax sets",
COMPRESS_MINIMAL_SYNTAXES,
)?,
asset_from_cache(&cache_path.join("themes.bin"), "theme set", COMPRESS_THEMES)?,
))
}
@ -78,6 +129,7 @@ impl HighlightingAssets {
/// Creates [HighlightingAssets] from the assets integrated in the binary:
/// the serialized full syntax set, the minimal syntax sets and the theme set.
pub fn from_binary() -> Self {
    let serialized_syntax_set =
        SerializedSyntaxSet::FromBinary(get_serialized_integrated_syntaxset());
    let minimal_syntaxes = get_integrated_minimal_syntaxes();
    let theme_set = get_integrated_themeset();

    HighlightingAssets::new(serialized_syntax_set, minimal_syntaxes, theme_set)
}
@ -111,6 +163,41 @@ impl HighlightingAssets {
self.get_theme_set().themes.keys().map(|s| s.as_ref())
}
/// Returns a [SyntaxSet] that contains the [SyntaxReference] called `name`.
///
/// The name is lowercased (ASCII) and looked up among the minimal syntax
/// sets first. If there is no entry for it, or the minimal [SyntaxSet]
/// cannot be obtained, the [SyntaxSet] that contains all syntaxes is
/// returned instead.
fn get_syntax_set_by_name(&self, name: &str) -> Result<&SyntaxSet> {
    let lowercase_name = name.to_ascii_lowercase();

    if let Some(&index) = self.minimal_syntaxes.by_name.get(&lowercase_name) {
        if let Some(minimal_syntax_set) = self.get_minimal_syntax_set_with_index(index) {
            return Ok(minimal_syntax_set);
        }
    }

    // No usable minimal syntax set; fall back to the full one.
    self.get_syntax_set()
}
/// Deserializes the minimal [SyntaxSet] stored at `index` in
/// `self.minimal_syntaxes.serialized_syntax_sets`.
///
/// # Errors
///
/// Returns an error if `index` is out of range (instead of panicking on an
/// unchecked slice index), or if the stored bytes cannot be parsed.
fn load_minimal_syntax_set_with_index(&self, index: usize) -> Result<SyntaxSet> {
    // Checked lookup: an unknown index is reported as an error so that
    // callers can fall back to the full syntax set.
    let serialized_syntax_set = match self.minimal_syntaxes.serialized_syntax_sets.get(index) {
        Some(bytes) => bytes,
        None => return Err(format!("Could not find minimal syntax set {}", index).into()),
    };

    asset_from_contents(
        &serialized_syntax_set[..],
        &format!("minimal syntax set {}", index),
        COMPRESS_SERIALIZED_MINIMAL_SYNTAXES,
    )
    .map_err(|_| format!("Could not parse minimal syntax set {}", index).into())
}
/// Returns the minimal [SyntaxSet] with the given `index`, or `None` if there
/// is no cell for that index or loading the syntax set fails.
///
/// Loading goes through `LazyCell::try_borrow_with`, with
/// `load_minimal_syntax_set_with_index` as the loader; any load error is
/// discarded and reported as `None`.
fn get_minimal_syntax_set_with_index(&self, index: usize) -> Option<&SyntaxSet> {
    let cell = self.deserialized_minimal_syntaxes.get(index)?;
    let borrowed = cell.try_borrow_with(|| self.load_minimal_syntax_set_with_index(index));
    borrowed.ok()
}
/// Use [Self::get_syntax_for_file_name] instead
#[deprecated]
pub fn syntax_for_file_name(
@ -167,7 +254,7 @@ impl HighlightingAssets {
mapping: &SyntaxMapping,
) -> Result<SyntaxReferenceInSet> {
if let Some(language) = language {
let syntax_set = self.get_syntax_set()?;
let syntax_set = self.get_syntax_set_by_name(language)?;
syntax_set
.find_syntax_by_token(language)
.map(|syntax| SyntaxReferenceInSet { syntax, syntax_set })
@ -320,6 +407,13 @@ pub(crate) fn get_integrated_themeset() -> ThemeSet {
from_binary(include_bytes!("../assets/themes.bin"), COMPRESS_THEMES)
}
/// Deserializes the [MinimalSyntaxes] embedded in the binary from
/// `assets/minimal_syntaxes.bin`, using the compression setting given by
/// [COMPRESS_MINIMAL_SYNTAXES].
fn get_integrated_minimal_syntaxes() -> MinimalSyntaxes {
    let serialized: &[u8] = include_bytes!("../assets/minimal_syntaxes.bin");
    from_binary(serialized, COMPRESS_MINIMAL_SYNTAXES)
}
pub(crate) fn from_binary<T: serde::de::DeserializeOwned>(v: &[u8], compressed: bool) -> T {
asset_from_contents(v, "n/a", compressed)
.expect("data integrated in binary is never faulty, but make sure `compressed` is in sync!")

View File

@ -20,6 +20,7 @@ pub fn cache_dir() -> Cow<'static, str> {
pub fn clear_assets() {
clear_asset("themes.bin", "theme set cache");
clear_asset("syntaxes.bin", "syntax set cache");
clear_asset("minimal_syntaxes.bin", "minimal syntax sets cache");
clear_asset("metadata.yaml", "metadata file");
}

View File

@ -37,17 +37,19 @@ pub fn build_assets(
let syntax_set_builder = build_syntax_set_builder(source_dir, include_integrated_assets)?;
if std::env::var("BAT_PRINT_SYNTAX_DEPENDENCIES").is_ok() {
// To trigger this code, run:
// BAT_PRINT_SYNTAX_DEPENDENCIES=1 cargo run -- cache --build --source assets --blank --target /tmp
print_syntax_dependencies(&syntax_set_builder);
}
let minimal_syntaxes = build_minimal_syntaxes(&syntax_set_builder, include_integrated_assets)?;
let syntax_set = syntax_set_builder.build();
print_unlinked_contexts(&syntax_set);
write_assets(&theme_set, &syntax_set, target_dir, current_version)
write_assets(
&theme_set,
&syntax_set,
&minimal_syntaxes,
target_dir,
current_version,
)
}
fn build_theme_set(source_dir: &Path, include_integrated_assets: bool) -> ThemeSet {
@ -116,6 +118,7 @@ fn print_unlinked_contexts(syntax_set: &SyntaxSet) {
fn write_assets(
theme_set: &ThemeSet,
syntax_set: &SyntaxSet,
minimal_syntaxes: &MinimalSyntaxes,
target_dir: &Path,
current_version: &str,
) -> Result<()> {
@ -132,6 +135,12 @@ fn write_assets(
"syntax set",
COMPRESS_SYNTAXES,
)?;
asset_to_cache(
minimal_syntaxes,
&target_dir.join("minimal_syntaxes.bin"),
"minimal syntax sets",
COMPRESS_MINIMAL_SYNTAXES,
)?;
print!(
"Writing metadata to folder {} ... ",
@ -143,14 +152,7 @@ fn write_assets(
Ok(())
}
/// Generates independent [SyntaxSet]s after analyzing dependencies between syntaxes
/// in a [SyntaxSetBuilder], and then prints the results.
fn print_syntax_dependencies(syntax_set_builder: &SyntaxSetBuilder) {
println!("Constructing independent SyntaxSets...");
let independent_syntax_sets = build_independent_syntax_sets(syntax_set_builder);
println!("Independent SyntaxSets:");
for syntax_set in independent_syntax_sets {
fn print_syntax_set_names(syntax_set: &SyntaxSet) {
let names = syntax_set
.syntaxes()
.iter()
@ -158,11 +160,57 @@ fn print_syntax_dependencies(syntax_set_builder: &SyntaxSetBuilder) {
.collect::<Vec<_>>();
println!("{:?}", names);
}
/// Builds the [MinimalSyntaxes] lookup structure from `syntax_set_builder`.
///
/// Every minimal [SyntaxSet] that contains exactly one syntax is serialized
/// and stored, and the ASCII-lowercased name of its syntax is mapped to the
/// index of the serialized data.
///
/// When `include_integrated_assets` is `true`, an empty [MinimalSyntaxes] is
/// returned (see the comment in the body).
///
/// # Errors
///
/// Returns the error from `asset_to_contents` if a minimal syntax set cannot
/// be serialized.
fn build_minimal_syntaxes(
    syntax_set_builder: &'_ SyntaxSetBuilder,
    include_integrated_assets: bool,
) -> Result<MinimalSyntaxes> {
    let mut minimal_syntaxes = MinimalSyntaxes {
        by_name: HashMap::new(),
        serialized_syntax_sets: vec![],
    };

    if include_integrated_assets {
        // Dependency info is not present in integrated assets, so we can't
        // calculate minimal syntax sets. Return early without any data filled
        // in. This means that no minimal syntax sets will be available to use, and
        // the full, slow-to-deserialize, fallback syntax set will be used instead.
        return Ok(minimal_syntaxes);
    }

    let minimal_syntax_sets_to_serialize = build_minimal_syntax_sets(syntax_set_builder)
        // For now, only store syntax sets with one syntax, otherwise
        // the binary grows by several megabytes
        .filter(|syntax_set| syntax_set.syntaxes().len() == 1);

    for minimal_syntax_set in minimal_syntax_sets_to_serialize {
        // Remember what index it is found at
        let current_index = minimal_syntaxes.serialized_syntax_sets.len();

        for syntax in minimal_syntax_set.syntaxes() {
            // `to_ascii_lowercase` already returns an owned `String`, so no
            // extra clone is needed for the map key.
            minimal_syntaxes
                .by_name
                .insert(syntax.name.to_ascii_lowercase(), current_index);
        }

        // Pass a plain description of the asset, like the other `asset_*`
        // call sites do.
        let serialized_syntax_set = asset_to_contents(
            &minimal_syntax_set,
            &format!("minimal syntax set {}", current_index),
            COMPRESS_SERIALIZED_MINIMAL_SYNTAXES,
        )?;

        // Add last so that it ends up at `current_index`
        minimal_syntaxes
            .serialized_syntax_sets
            .push(serialized_syntax_set);
    }

    Ok(minimal_syntaxes)
}
/// Analyzes dependencies between syntaxes in a [SyntaxSetBuilder].
/// From that, it builds independent [SyntaxSet]s.
fn build_independent_syntax_sets(
/// From that, it builds minimal [SyntaxSet]s.
fn build_minimal_syntax_sets(
syntax_set_builder: &'_ SyntaxSetBuilder,
) -> impl Iterator<Item = SyntaxSet> + '_ {
let syntaxes = syntax_set_builder.syntaxes();
@ -170,7 +218,7 @@ fn build_independent_syntax_sets(
// Build the data structures we need for dependency resolution
let (syntax_to_dependencies, dependency_to_syntax) = generate_maps(syntaxes);
// Create one independent SyntaxSet from each (non-hidden) SyntaxDefinition
// Create one minimal SyntaxSet from each (non-hidden) SyntaxDefinition
syntaxes.iter().filter_map(move |syntax| {
if syntax.hidden {
return None;
@ -178,7 +226,15 @@ fn build_independent_syntax_sets(
let mut builder = SyntaxSetDependencyBuilder::new();
builder.add_with_dependencies(syntax, &syntax_to_dependencies, &dependency_to_syntax);
Some(builder.build())
let syntax_set = builder.build();
if std::env::var("BAT_PRINT_SYNTAX_DEPENDENCIES").is_ok() {
// To trigger this code, run:
// BAT_PRINT_SYNTAX_DEPENDENCIES=1 cargo run -- cache --build --source assets --blank --target /tmp
print_syntax_set_names(&syntax_set);
}
Some(syntax_set)
})
}