diff --git a/crates/nu-cli/src/cli.rs b/crates/nu-cli/src/cli.rs index f9e195d87..18c3eb8c9 100644 --- a/crates/nu-cli/src/cli.rs +++ b/crates/nu-cli/src/cli.rs @@ -1,5 +1,5 @@ use crate::commands::classified::block::run_block; -use crate::commands::classified::external::{MaybeTextCodec, StringOrBinary}; +use crate::commands::classified::maybe_text_codec::{MaybeTextCodec, StringOrBinary}; use crate::commands::plugin::JsonRpc; use crate::commands::plugin::{PluginCommand, PluginSink}; use crate::commands::whole_stream_command; @@ -953,7 +953,7 @@ pub async fn process_line( let input_stream = if redirect_stdin { let file = futures::io::AllowStdIo::new(std::io::stdin()); - let stream = FramedRead::new(file, MaybeTextCodec).map(|line| { + let stream = FramedRead::new(file, MaybeTextCodec::default()).map(|line| { if let Ok(line) = line { match line { StringOrBinary::String(s) => Ok(Value { diff --git a/crates/nu-cli/src/commands.rs b/crates/nu-cli/src/commands.rs index 7ea072010..43f4feddc 100644 --- a/crates/nu-cli/src/commands.rs +++ b/crates/nu-cli/src/commands.rs @@ -20,6 +20,7 @@ pub(crate) mod clip; pub(crate) mod command; pub(crate) mod compact; pub(crate) mod config; +pub(crate) mod constants; pub(crate) mod count; pub(crate) mod cp; pub(crate) mod date; diff --git a/crates/nu-cli/src/commands/classified/external.rs b/crates/nu-cli/src/commands/classified/external.rs index 554a394ed..a9953d81d 100644 --- a/crates/nu-cli/src/commands/classified/external.rs +++ b/crates/nu-cli/src/commands/classified/external.rs @@ -1,3 +1,4 @@ +use crate::commands::classified::maybe_text_codec::{MaybeTextCodec, StringOrBinary}; use crate::evaluate::evaluate_baseline_expr; use crate::futures::ThreadedReceiver; use crate::prelude::*; @@ -7,9 +8,7 @@ use std::ops::Deref; use std::process::{Command, Stdio}; use std::sync::mpsc; -use bytes::{BufMut, Bytes, BytesMut}; use futures::executor::block_on_stream; -// use futures::stream::StreamExt; use futures_codec::FramedRead; use log::trace; @@ -18,70 +17,6 @@ use nu_protocol::hir::ExternalCommand; use nu_protocol::{Primitive, Scope, ShellTypeName, UntaggedValue, Value}; use nu_source::Tag; -pub enum StringOrBinary { - String(String), - Binary(Vec), -} -pub struct MaybeTextCodec; - -impl futures_codec::Encoder for MaybeTextCodec { - type Item = StringOrBinary; - type Error = std::io::Error; - - fn encode(&mut self, item: Self::Item, dst: &mut BytesMut) -> Result<(), Self::Error> { - match item { - StringOrBinary::String(s) => { - dst.reserve(s.len()); - dst.put(s.as_bytes()); - Ok(()) - } - StringOrBinary::Binary(b) => { - dst.reserve(b.len()); - dst.put(Bytes::from(b)); - Ok(()) - } - } - } -} - -impl futures_codec::Decoder for MaybeTextCodec { - type Item = StringOrBinary; - type Error = std::io::Error; - - fn decode(&mut self, src: &mut BytesMut) -> Result, Self::Error> { - let v: Vec = src.to_vec(); - match String::from_utf8(v) { - Ok(s) => { - src.clear(); - if s.is_empty() { - Ok(None) - } else { - Ok(Some(StringOrBinary::String(s))) - } - } - Err(err) => { - // Note: the longest UTF-8 character per Unicode spec is currently 6 bytes. If we fail somewhere earlier than the last 6 bytes, - // we know that we're failing to understand the string encoding and not just seeing a partial character. When this happens, let's - // fall back to assuming it's a binary buffer. - if src.is_empty() { - Ok(None) - } else if src.len() > 6 && (src.len() - err.utf8_error().valid_up_to() > 6) { - // Fall back to assuming binary - let buf = src.to_vec(); - src.clear(); - Ok(Some(StringOrBinary::Binary(buf))) - } else { - // Looks like a utf-8 string, so let's assume that - let buf = src.split_to(err.utf8_error().valid_up_to() + 1); - String::from_utf8(buf.to_vec()) - .map(|x| Some(StringOrBinary::String(x))) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e)) - } - } - } - } -} - pub(crate) async fn run_external_command( command: ExternalCommand, context: &mut Context, @@ -319,7 +254,7 @@ fn spawn( }; let file = futures::io::AllowStdIo::new(stdout); - let stream = FramedRead::new(file, MaybeTextCodec); + let stream = FramedRead::new(file, MaybeTextCodec::default()); for line in block_on_stream(stream) { match line { @@ -373,7 +308,7 @@ fn spawn( } let file = futures::io::AllowStdIo::new(stderr); - let err_stream = FramedRead::new(file, MaybeTextCodec); + let err_stream = FramedRead::new(file, MaybeTextCodec::default()); for err_line in block_on_stream(err_stream) { match err_line { diff --git a/crates/nu-cli/src/commands/classified/maybe_text_codec.rs b/crates/nu-cli/src/commands/classified/maybe_text_codec.rs new file mode 100644 index 000000000..e7d11cfd2 --- /dev/null +++ b/crates/nu-cli/src/commands/classified/maybe_text_codec.rs @@ -0,0 +1,103 @@ +use bytes::{BufMut, Bytes, BytesMut}; + +use nu_errors::ShellError; + +extern crate encoding_rs; +use encoding_rs::{CoderResult, Decoder, Encoding, UTF_8}; + +const OUTPUT_BUFFER_SIZE: usize = 8192; + +pub enum StringOrBinary { + String(String), + Binary(Vec), +} + +pub struct MaybeTextCodec { + decoder: Decoder, +} + +impl MaybeTextCodec { + // The constructor takes an Option<&'static Encoding>, because an absence of an encoding indicates that we want BOM sniffing enabled + pub fn new(encoding: Option<&'static Encoding>) -> Self { + let decoder = match encoding { + Some(e) => e.new_decoder_with_bom_removal(), + None => UTF_8.new_decoder(), + }; + MaybeTextCodec { decoder } + } +} + +impl Default for MaybeTextCodec { + // The default MaybeTextCodec uses a UTF_8 decoder + fn default() -> Self { + MaybeTextCodec { + decoder: UTF_8.new_decoder(), + } + } +} + +impl futures_codec::Encoder for MaybeTextCodec { + type Item = StringOrBinary; + type Error = std::io::Error; + + fn encode(&mut self, item: Self::Item, dst: &mut BytesMut) -> Result<(), Self::Error> { + match item { + StringOrBinary::String(s) => { + dst.reserve(s.len()); + dst.put(s.as_bytes()); + Ok(()) + } + StringOrBinary::Binary(b) => { + dst.reserve(b.len()); + dst.put(Bytes::from(b)); + Ok(()) + } + } + } +} + +// TODO: Write some tests +impl futures_codec::Decoder for MaybeTextCodec { + type Item = StringOrBinary; + type Error = ShellError; + + fn decode(&mut self, src: &mut BytesMut) -> Result, Self::Error> { + if src.is_empty() { + return Ok(None); + } + + let mut s = String::with_capacity(OUTPUT_BUFFER_SIZE); + + let (res, read, replacements) = self.decoder.decode_to_string(src, &mut s, false); + // If we had to make replacements when converting to utf8, fallback to binary + if replacements { + return Ok(Some(StringOrBinary::Binary(src.to_vec()))); + } + + match res { + CoderResult::InputEmpty => { + src.clear(); + Ok(Some(StringOrBinary::String(s))) + } + CoderResult::OutputFull => { + // If the original buffer size is too small, + // We continue to allocate new Strings and append them to the result until the input buffer is smaller than the allocated String + let mut starting_index = read; + loop { + let mut more = String::with_capacity(OUTPUT_BUFFER_SIZE); + let (res, read, _replacements) = + self.decoder + .decode_to_string(&src[starting_index..], &mut more, false); + s.push_str(&more); + // Our input buffer is smaller than out allocated String, we can stop now + if let CoderResult::InputEmpty = res { + break; + } + starting_index += read; + } + src.clear(); + Ok(Some(StringOrBinary::String(s))) + } + } + } +} diff --git a/crates/nu-cli/src/commands/classified/mod.rs b/crates/nu-cli/src/commands/classified/mod.rs index 67f25150b..17db882a7 100644 --- a/crates/nu-cli/src/commands/classified/mod.rs +++ b/crates/nu-cli/src/commands/classified/mod.rs @@ -3,6 +3,7 @@ mod dynamic; pub(crate) mod expr; pub(crate) mod external; pub(crate) mod internal; +pub(crate) mod maybe_text_codec; #[allow(unused_imports)] pub(crate) use dynamic::Command as DynamicCommand; diff --git a/crates/nu-cli/src/commands/constants.rs b/crates/nu-cli/src/commands/constants.rs new file mode 100644 index 000000000..0b7ca91a5 --- /dev/null +++ b/crates/nu-cli/src/commands/constants.rs @@ -0,0 +1,358 @@ +pub const BAT_LANGUAGES: &[&str] = &[ + "as", + "csv", + "tsv", + "applescript", + "script editor", + "s", + "S", + "adoc", + "asciidoc", + "asc", + "asa", + "yasm", + "nasm", + "asm", + "inc", + "mac", + "awk", + "bat", + "cmd", + "bib", + "sh", + "bash", + "zsh", + ".bash_aliases", + ".bash_completions", + ".bash_functions", + ".bash_login", + ".bash_logout", + ".bash_profile", + ".bash_variables", + ".bashrc", + ".profile", + ".textmate_init", + ".zshrc", + "PKGBUILD", + ".ebuild", + ".eclass", + "c", + "h", + "cs", + "csx", + "cpp", + "cc", + "cp", + "cxx", + "c++", + "C", + "h", + "hh", + "hpp", + "hxx", + "h++", + "inl", + "ipp", + "cabal", + "clj", + "cljc", + "cljs", + "edn", + "CMakeLists.txt", + "cmake", + "h.in", + "hh.in", + "hpp.in", + "hxx.in", + "h++.in", + "CMakeCache.txt", + "cr", + "css", + "css.erb", + "css.liquid", + "d", + "di", + "dart", + "diff", + "patch", + "Dockerfile", + "dockerfile", + "ex", + "exs", + "elm", + "erl", + "hrl", + "Emakefile", + "emakefile", + "fs", + "fsi", + "fsx", + "fs", + "fsi", + "fsx", + "fish", + "attributes", + "gitattributes", + ".gitattributes", + "COMMIT_EDITMSG", + "MERGE_MSG", + "TAG_EDITMSG", + "gitconfig", + ".gitconfig", + ".gitmodules", + "exclude", + "gitignore", + ".gitignore", + ".git", + "gitlog", + "git-rebase-todo", + "go", + "dot", + "DOT", + "gv", + "groovy", + "gvy", + "gradle", + "Jenkinsfile", + "hs", + "hs", + "hsc", + "show-nonprintable", + "html", + "htm", + "shtml", + "xhtml", + "asp", + "html.eex", + "yaws", + "rails", + "rhtml", + "erb", + "html.erb", + "adp", + "twig", + "html.twig", + "ini", + "INI", + "INF", + "reg", + "REG", + "lng", + "cfg", + "CFG", + "desktop", + "url", + "URL", + ".editorconfig", + ".hgrc", + "hgrc", + "java", + "bsh", + "properties", + "jsp", + "js", + "htc", + "js", + "jsx", + "babel", + "es6", + "js.erb", + "json", + "sublime-settings", + "sublime-menu", + "sublime-keymap", + "sublime-mousemap", + "sublime-theme", + "sublime-build", + "sublime-project", + "sublime-completions", + "sublime-commands", + "sublime-macro", + "sublime-color-scheme", + "ipynb", + "Pipfile.lock", + "jsonnet", + "libsonnet", + "libjsonnet", + "jl", + "kt", + "kts", + "tex", + "ltx", + "less", + "css.less", + "lisp", + "cl", + "clisp", + "l", + "mud", + "el", + "scm", + "ss", + "lsp", + "fasl", + "lhs", + "lua", + "make", + "GNUmakefile", + "makefile", + "Makefile", + "makefile.am", + "Makefile.am", + "makefile.in", + "Makefile.in", + "OCamlMakefile", + "mak", + "mk", + "md", + "mdown", + "markdown", + "markdn", + "matlab", + "build", + "nix", + "m", + "h", + "mm", + "M", + "h", + "ml", + "mli", + "mll", + "mly", + "pas", + "p", + "dpr", + "pl", + "pm", + "pod", + "t", + "PL", + "php", + "php3", + "php4", + "php5", + "php7", + "phps", + "phpt", + "phtml", + "txt", + "ps1", + "psm1", + "psd1", + "proto", + "protodevel", + "pb.txt", + "proto.text", + "textpb", + "pbtxt", + "prototxt", + "pp", + "epp", + "purs", + "py", + "py3", + "pyw", + "pyi", + "pyx", + "pyx.in", + "pxd", + "pxd.in", + "pxi", + "pxi.in", + "rpy", + "cpy", + "SConstruct", + "Sconstruct", + "sconstruct", + "SConscript", + "gyp", + "gypi", + "Snakefile", + "wscript", + "R", + "r", + "s", + "S", + "Rprofile", + "rd", + "re", + "rst", + "rest", + "robot", + "rb", + "Appfile", + "Appraisals", + "Berksfile", + "Brewfile", + "capfile", + "cgi", + "Cheffile", + "config.ru", + "Deliverfile", + "Fastfile", + "fcgi", + "Gemfile", + "gemspec", + "Guardfile", + "irbrc", + "jbuilder", + "Podfile", + "podspec", + "prawn", + "rabl", + "rake", + "Rakefile", + "Rantfile", + "rbx", + "rjs", + "ruby.rail", + "Scanfile", + "simplecov", + "Snapfile", + "thor", + "Thorfile", + "Vagrantfile", + "haml", + "sass", + "rxml", + "builder", + "rs", + "scala", + "sbt", + "sql", + "ddl", + "dml", + "erbsql", + "sql.erb", + "swift", + "log", + "tcl", + "tf", + "tfvars", + "hcl", + "sty", + "cls", + "textile", + "toml", + "tml", + "Cargo.lock", + "Gopkg.lock", + "Pipfile", + "ts", + "tsx", + "varlink", + "vim", + ".vimrc", + "xml", + "xsd", + "xslt", + "tld", + "dtml", + "rss", + "opml", + "svg", + "yaml", + "yml", + "sublime-syntax", +]; diff --git a/crates/nu-cli/src/commands/enter.rs b/crates/nu-cli/src/commands/enter.rs index 572427eb7..28142fc8a 100644 --- a/crates/nu-cli/src/commands/enter.rs +++ b/crates/nu-cli/src/commands/enter.rs @@ -121,21 +121,16 @@ async fn enter( let full_path = std::path::PathBuf::from(cwd); - let (file_extension, contents, contents_tag) = crate::commands::open::fetch( + let (file_extension, tagged_contents) = crate::commands::open::fetch( &full_path, &PathBuf::from(location_clone), tag.span, - match encoding { - Some(e) => e.to_string(), - _ => "".to_string(), - }, + encoding, ) .await?; - match contents { + match tagged_contents.value { UntaggedValue::Primitive(Primitive::String(_)) => { - let tagged_contents = contents.into_value(&contents_tag); - if let Some(extension) = file_extension { let command_name = format!("from {}", extension); if let Some(converter) = registry.get_command(&command_name) { @@ -156,18 +151,18 @@ async fn enter( scope: scope.clone(), }, }; + let tag = tagged_contents.tag.clone(); let mut result = converter .run(new_args.with_input(vec![tagged_contents]), ®istry) .await?; let result_vec: Vec> = result.drain_vec().await; - Ok(futures::stream::iter(result_vec.into_iter().map( move |res| match res { Ok(ReturnSuccess::Value(Value { value, .. })) => Ok( ReturnSuccess::Action(CommandAction::EnterValueShell(Value { value, - tag: contents_tag.clone(), + tag: tag.clone(), })), ), x => x, @@ -185,13 +180,9 @@ async fn enter( ))) } } - _ => { - let tagged_contents = contents.into_value(contents_tag); - - Ok(OutputStream::one(ReturnSuccess::action( - CommandAction::EnterValueShell(tagged_contents), - ))) - } + _ => Ok(OutputStream::one(ReturnSuccess::action( + CommandAction::EnterValueShell(tagged_contents), + ))), } } } diff --git a/crates/nu-cli/src/commands/open.rs b/crates/nu-cli/src/commands/open.rs index 8bdc0a59c..957f70c29 100644 --- a/crates/nu-cli/src/commands/open.rs +++ b/crates/nu-cli/src/commands/open.rs @@ -1,15 +1,17 @@ +use crate::commands::classified::maybe_text_codec::{MaybeTextCodec, StringOrBinary}; use crate::commands::WholeStreamCommand; use crate::prelude::*; +use futures_codec::FramedRead; use nu_errors::ShellError; -use nu_protocol::{CommandAction, ReturnSuccess, Signature, SyntaxShape, UntaggedValue}; +use nu_protocol::{CommandAction, ReturnSuccess, Signature, SyntaxShape, UntaggedValue, Value}; use nu_source::{AnchorLocation, Span, Tagged}; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; extern crate encoding_rs; +use crate::commands::constants::BAT_LANGUAGES; use encoding_rs::*; +use futures::prelude::*; +use log::debug; use std::fs::File; -use std::io::BufWriter; -use std::io::Read; -use std::io::Write; pub struct Open; @@ -81,23 +83,25 @@ documentation link at https://docs.rs/encoding_rs/0.8.23/encoding_rs/#statics"# } } -pub fn get_encoding(opt: Option) -> &'static Encoding { +pub fn get_encoding(opt: Option>) -> Result<&'static Encoding, ShellError> { match opt { - None => UTF_8, - Some(label) => match Encoding::for_label((&label).as_bytes()) { - None => { - //print!("{} is not a known encoding label. Trying UTF-8.", label); - //std::process::exit(-2); - get_encoding(Some("utf-8".to_string())) - } - Some(encoding) => encoding, + None => Ok(UTF_8), + Some(label) => match Encoding::for_label((&label.item).as_bytes()) { + None => Err(ShellError::labeled_error( + format!( + r#"{} is not a valid encoding, refer to https://docs.rs/encoding_rs/0.8.23/encoding_rs/#statics for a valid list of encodings"#, + label.item + ), + "invalid encoding", + label.span(), + )), + Some(encoding) => Ok(encoding), }, } } async fn open(args: CommandArgs, registry: &CommandRegistry) -> Result { let cwd = PathBuf::from(args.shell_manager.path()); - let full_path = cwd; let registry = registry.clone(); let ( @@ -108,329 +112,135 @@ async fn open(args: CommandArgs, registry: &CommandRegistry) -> Result e.to_string(), - _ => "".to_string(), - }; - let result = fetch(&full_path, &path.item, path.tag.span, enc).await; - let (file_extension, contents, contents_tag) = result?; + // TODO: Remove once Streams are supported everywhere! + // As a short term workaround for getting AutoConvert and Bat functionality (Those don't currently support Streams) - let file_extension = if raw.item { + // Check if the extension has a "from *" command OR "bat" supports syntax highlighting + // AND the user doesn't want the raw output + // In these cases, we will collect the Stream + let ext = if raw.item { None } else { - // If the extension could not be determined via mimetype, try to use the path - // extension. Some file types do not declare their mimetypes (such as bson files). - file_extension.or_else(|| path.extension().map(|x| x.to_string_lossy().to_string())) + path.extension() + .map(|name| name.to_string_lossy().to_string()) }; - let tagged_contents = contents.into_value(&contents_tag); - - if let Some(extension) = file_extension { - Ok(OutputStream::one(ReturnSuccess::action( - CommandAction::AutoConvert(tagged_contents, extension), - ))) - } else { - Ok(OutputStream::one(ReturnSuccess::value(tagged_contents))) + if let Some(ext) = ext { + // Check if we have a conversion command + if let Some(_command) = registry.get_command(&format!("from {}", ext)) { + let (_, tagged_contents) = crate::commands::open::fetch( + &cwd, + &PathBuf::from(&path.item), + path.tag.span, + encoding, + ) + .await?; + return Ok(OutputStream::one(ReturnSuccess::action( + CommandAction::AutoConvert(tagged_contents, ext), + ))); + } + // Check if bat does syntax highlighting + if BAT_LANGUAGES.contains(&ext.as_ref()) { + let (_, tagged_contents) = crate::commands::open::fetch( + &cwd, + &PathBuf::from(&path.item), + path.tag.span, + encoding, + ) + .await?; + return Ok(OutputStream::one(ReturnSuccess::value(tagged_contents))); + } } + + // Normal Streaming operation + let with_encoding = if encoding.is_none() { + None + } else { + Some(get_encoding(encoding)?) + }; + let f = File::open(&path).map_err(|e| { + ShellError::labeled_error( + format!("Error opening file: {:?}", e), + "Error opening file", + path.span(), + ) + })?; + let async_reader = futures::io::AllowStdIo::new(f); + let sob_stream = FramedRead::new(async_reader, MaybeTextCodec::new(with_encoding)) + .map_err(|e| ShellError::unexpected(format!("AsyncRead failed in open function: {:?}", e))) + .into_stream(); + + let final_stream = sob_stream.map(|x| match x { + Ok(StringOrBinary::String(s)) => { + ReturnSuccess::value(UntaggedValue::string(s).into_untagged_value()) + } + Ok(StringOrBinary::Binary(b)) => ReturnSuccess::value( + UntaggedValue::binary(b.into_iter().collect()).into_untagged_value(), + ), + Err(se) => Err(se), + }); + + Ok(OutputStream::new(final_stream)) } +// Note that we do not output a Stream in "fetch" since it is only used by "enter" command +// Which we expect to use a concrete Value a not a Stream pub async fn fetch( cwd: &PathBuf, location: &PathBuf, span: Span, - encoding: String, -) -> Result<(Option, UntaggedValue, Tag), ShellError> { + encoding_choice: Option>, +) -> Result<(Option, Value), ShellError> { + // TODO: I don't understand the point of this? Maybe for better error reporting let mut cwd = cwd.clone(); - let output_encoding: &Encoding = get_encoding(Some("utf-8".to_string())); - let input_encoding: &Encoding = get_encoding(Some(encoding.clone())); - let mut decoder = input_encoding.new_decoder(); - let mut encoder = output_encoding.new_encoder(); - let mut _file: File; - let buf = Vec::new(); - let mut bufwriter = BufWriter::new(buf); - - cwd.push(Path::new(location)); - if let Ok(cwd) = dunce::canonicalize(&cwd) { - if !encoding.is_empty() { - // use the encoding string - match File::open(&Path::new(&cwd)) { - Ok(mut _file) => { - convert_via_utf8( - &mut decoder, - &mut encoder, - &mut _file, - &mut bufwriter, - false, - ); - //bufwriter.flush()?; - Ok(( - cwd.extension() - .map(|name| name.to_string_lossy().to_string()), - UntaggedValue::string(String::from_utf8_lossy(&bufwriter.buffer())), - Tag { - span, - anchor: Some(AnchorLocation::File(cwd.to_string_lossy().to_string())), - }, - )) - } - Err(_) => Err(ShellError::labeled_error( - format!("Cannot open {:?} for reading.", &cwd), - "file not found", - span, - )), - } - } else { - // Do the old stuff - match std::fs::read(&cwd) { - Ok(bytes) => match std::str::from_utf8(&bytes) { - Ok(s) => Ok(( - cwd.extension() - .map(|name| name.to_string_lossy().to_string()), - UntaggedValue::string(s), - Tag { - span, - anchor: Some(AnchorLocation::File(cwd.to_string_lossy().to_string())), - }, - )), - Err(_) => { - //Non utf8 data. - match (bytes.get(0), bytes.get(1)) { - (Some(x), Some(y)) if *x == 0xff && *y == 0xfe => { - // Possibly UTF-16 little endian - let utf16 = read_le_u16(&bytes[2..]); - - if let Some(utf16) = utf16 { - match std::string::String::from_utf16(&utf16) { - Ok(s) => Ok(( - cwd.extension() - .map(|name| name.to_string_lossy().to_string()), - UntaggedValue::string(s), - Tag { - span, - anchor: Some(AnchorLocation::File( - cwd.to_string_lossy().to_string(), - )), - }, - )), - Err(_) => Ok(( - None, - UntaggedValue::binary(bytes), - Tag { - span, - anchor: Some(AnchorLocation::File( - cwd.to_string_lossy().to_string(), - )), - }, - )), - } - } else { - Ok(( - None, - UntaggedValue::binary(bytes), - Tag { - span, - anchor: Some(AnchorLocation::File( - cwd.to_string_lossy().to_string(), - )), - }, - )) - } - } - (Some(x), Some(y)) if *x == 0xfe && *y == 0xff => { - // Possibly UTF-16 big endian - let utf16 = read_be_u16(&bytes[2..]); - - if let Some(utf16) = utf16 { - match std::string::String::from_utf16(&utf16) { - Ok(s) => Ok(( - cwd.extension() - .map(|name| name.to_string_lossy().to_string()), - UntaggedValue::string(s), - Tag { - span, - anchor: Some(AnchorLocation::File( - cwd.to_string_lossy().to_string(), - )), - }, - )), - Err(_) => Ok(( - None, - UntaggedValue::binary(bytes), - Tag { - span, - anchor: Some(AnchorLocation::File( - cwd.to_string_lossy().to_string(), - )), - }, - )), - } - } else { - Ok(( - None, - UntaggedValue::binary(bytes), - Tag { - span, - anchor: Some(AnchorLocation::File( - cwd.to_string_lossy().to_string(), - )), - }, - )) - } - } - _ => Ok(( - None, - UntaggedValue::binary(bytes), - Tag { - span, - anchor: Some(AnchorLocation::File( - cwd.to_string_lossy().to_string(), - )), - }, - )), - } - } - }, - Err(_) => Err(ShellError::labeled_error( - format!("Cannot open {:?} for reading.", &cwd), - "file not found", - span, - )), - } - } - } else { - Err(ShellError::labeled_error( - format!("Cannot open {:?} for reading.", &cwd), - "file not found", + cwd.push(location); + let nice_location = dunce::canonicalize(&cwd).map_err(|e| { + ShellError::labeled_error( + format!("Cannot canonicalize file {:?} because {:?}", &cwd, e), + "Cannot canonicalize", span, - )) - } -} + ) + })?; -fn convert_via_utf8( - decoder: &mut Decoder, - encoder: &mut Encoder, - read: &mut dyn Read, - write: &mut dyn Write, - last: bool, -) { - let mut input_buffer = [0u8; 2048]; - let mut intermediate_buffer_bytes = [0u8; 4096]; - // Is there a safe way to create a stack-allocated &mut str? - let mut intermediate_buffer: &mut str = - //unsafe { std::mem::transmute(&mut intermediate_buffer_bytes[..]) }; - std::str::from_utf8_mut(&mut intermediate_buffer_bytes[..]).expect("error with from_utf8_mut"); - let mut output_buffer = [0u8; 4096]; - let mut current_input_ended = false; - while !current_input_ended { - match read.read(&mut input_buffer) { - Err(_) => { - print!("Error reading input."); - //std::process::exit(-5); - } - Ok(decoder_input_end) => { - current_input_ended = decoder_input_end == 0; - let input_ended = last && current_input_ended; - let mut decoder_input_start = 0usize; - loop { - let (decoder_result, decoder_read, decoder_written, _) = decoder.decode_to_str( - &input_buffer[decoder_input_start..decoder_input_end], - &mut intermediate_buffer, - input_ended, - ); - decoder_input_start += decoder_read; + // The extension may be used in AutoConvert later on + let ext = location + .extension() + .map(|name| name.to_string_lossy().to_string()); - let last_output = if input_ended { - match decoder_result { - CoderResult::InputEmpty => true, - CoderResult::OutputFull => false, - } - } else { - false - }; + // The tag that will used when returning a Value + let file_tag = Tag { + span, + anchor: Some(AnchorLocation::File( + nice_location.to_string_lossy().to_string(), + )), + }; - // Regardless of whether the intermediate buffer got full - // or the input buffer was exhausted, let's process what's - // in the intermediate buffer. + let res = std::fs::read(location)?; - if encoder.encoding() == UTF_8 { - // If the target is UTF-8, optimize out the encoder. - if write - .write_all(&intermediate_buffer.as_bytes()[..decoder_written]) - .is_err() - { - print!("Error writing output."); - //std::process::exit(-7); - } - } else { - let mut encoder_input_start = 0usize; - loop { - let (encoder_result, encoder_read, encoder_written, _) = encoder - .encode_from_utf8( - &intermediate_buffer[encoder_input_start..decoder_written], - &mut output_buffer, - last_output, - ); - encoder_input_start += encoder_read; - if write.write_all(&output_buffer[..encoder_written]).is_err() { - print!("Error writing output."); - //std::process::exit(-6); - } - match encoder_result { - CoderResult::InputEmpty => { - break; - } - CoderResult::OutputFull => { - continue; - } - } - } - } - - // Now let's see if we should read again or process the - // rest of the current input buffer. - match decoder_result { - CoderResult::InputEmpty => { - break; - } - CoderResult::OutputFull => { - continue; - } - } - } - } - } - } -} - -fn read_le_u16(input: &[u8]) -> Option> { - if input.len() % 2 != 0 || input.len() < 2 { - None + // If no encoding is provided we try to guess the encoding to read the file with + let encoding = if encoding_choice.is_none() { + UTF_8 } else { - let mut result = vec![]; - let mut pos = 0; - while pos < input.len() { - result.push(u16::from_le_bytes([input[pos], input[pos + 1]])); - pos += 2; - } + get_encoding(encoding_choice.clone())? + }; - Some(result) - } -} - -fn read_be_u16(input: &[u8]) -> Option> { - if input.len() % 2 != 0 || input.len() < 2 { - None + // If the user specified an encoding, then do not do BOM sniffing + let decoded_res = if encoding_choice.is_some() { + let (cow_res, _replacements) = encoding.decode_with_bom_removal(&res); + cow_res } else { - let mut result = vec![]; - let mut pos = 0; - while pos < input.len() { - result.push(u16::from_be_bytes([input[pos], input[pos + 1]])); - pos += 2; + // Otherwise, use the default UTF-8 encoder with BOM sniffing + let (cow_res, actual_encoding, replacements) = encoding.decode(&res); + // If we had to use replacement characters then fallback to binary + if replacements { + return Ok((ext, UntaggedValue::binary(res).into_value(file_tag))); } - - Some(result) - } + debug!("Decoded using {:?}", actual_encoding); + cow_res + }; + let v = UntaggedValue::string(decoded_res.to_string()).into_value(file_tag); + Ok((ext, v)) } #[cfg(test)] diff --git a/crates/nu-cli/tests/commands/enter.rs b/crates/nu-cli/tests/commands/enter.rs index bbacd5a1f..3156f7541 100644 --- a/crates/nu-cli/tests/commands/enter.rs +++ b/crates/nu-cli/tests/commands/enter.rs @@ -80,7 +80,6 @@ fn errors_if_file_not_found() { "enter i_dont_exist.csv" ); - //assert!(actual.err.contains("File could not be opened")); - assert!(actual.err.contains("file not found")); + assert!(actual.err.contains("Cannot canonicalize")); }) } diff --git a/crates/nu-cli/tests/commands/open.rs b/crates/nu-cli/tests/commands/open.rs index b24eb1dca..22a91173c 100644 --- a/crates/nu-cli/tests/commands/open.rs +++ b/crates/nu-cli/tests/commands/open.rs @@ -224,7 +224,11 @@ fn errors_if_file_not_found() { cwd: "tests/fixtures/formats", "open i_dont_exist.txt" ); - - //assert!(actual.err.contains("File could not be opened")); - assert!(actual.err.contains("Cannot open")); + let expected = "Cannot canonicalize"; + assert!( + actual.err.contains(expected), + "Error:\n{}\ndoes not contain{}", + actual.err, + expected + ); }