diff --git a/crates/tabby-common/assets/languages.toml b/crates/tabby-common/assets/languages.toml index baa22bccddcb..7fd025322e26 100644 --- a/crates/tabby-common/assets/languages.toml +++ b/crates/tabby-common/assets/languages.toml @@ -1,10 +1,12 @@ [[config]] languages = ["python"] +exts = ["py"] line_comment = "#" top_level_keywords = ["def", "from", "class", "import"] [[config]] languages = ["rust"] +exts = ["rs"] line_comment = "//" top_level_keywords = [ "fn", @@ -21,6 +23,7 @@ top_level_keywords = [ [[config]] languages = ["java"] +exts = ["java"] line_comment = "//" top_level_keywords = [ "abstract", @@ -43,6 +46,7 @@ top_level_keywords = [ [[config]] languages = ["kotlin"] +exts = ["kt", "kts"] line_comment = "//" top_level_keywords = [ "abstract", @@ -66,7 +70,8 @@ top_level_keywords = [ ] [[config]] -languages = ["javascript", "typescript", "javascriptreact", "typescriptreact"] +languages = ["javascript-typescript", "javascript", "typescript", "javascriptreact", "typescriptreact"] +exts = ["js", "ts", "jsx", "tsx", "mjs", "mts"] line_comment = "//" top_level_keywords = [ "abstract", @@ -86,6 +91,7 @@ top_level_keywords = [ [[config]] languages = ["go"] +exts = ["go"] line_comment = "//" top_level_keywords = [ "func", @@ -100,6 +106,7 @@ top_level_keywords = [ [[config]] languages = ["ruby"] +exts = ["rb"] line_comment = "#" top_level_keywords = [ "begin", @@ -116,6 +123,7 @@ top_level_keywords = [ [[config]] languages = ["c"] +exts = ["c", "h"] line_comment = "//" top_level_keywords = [ "const", @@ -132,6 +140,7 @@ top_level_keywords = [ [[config]] languages = ["cpp"] +exts= ["cpp", "hpp", "c++", "h++", "cc", "hh", "C", "H", "tcc"] line_comment = "//" top_level_keywords = [ "auto", @@ -153,6 +162,7 @@ top_level_keywords = [ [[config]] languages = ["csharp"] +exts = ["cs"] line_comment = "//" top_level_keywords = [ "class", @@ -169,9 +179,9 @@ top_level_keywords = [ "async", ] - [[config]] languages = ["php"] +exts = ["php", "php3", "php4", "php5", "phps", "phpt"] line_comment = "//" top_level_keywords = [ "abstract", @@ -198,9 +208,9 @@ top_level_keywords = [ "use", ] - [[config]] languages = ["solidity"] +exts = ["sol"] line_comment = "//" top_level_keywords = [ "contract", @@ -212,3 +222,63 @@ top_level_keywords = [ "function", "type", ] + +[[config]] +languages = ["css"] +exts = ["css"] + +[[config]] +languages = ["dockerfile"] +exts = ["Dockerfile"] + +[[config]] +languages = ["haskell"] +exts = ["hs"] + +[[config]] +languages = ["html"] +exts = ["html"] + +[[config]] +languages = ["julia"] +exts = ["jl"] + +[[config]] +languages = ["lua"] +exts = ["lua"] + +[[config]] +languages = ["makefile"] +exts = ["Makefile"] + +[[config]] +languages = ["markdown"] +exts = ["md", "markdown"] + +[[config]] +languages = ["perl"] +exts = ["pl", "pm", "pod", "perl"] + +[[config]] +languages = ["powershell"] +exts = ["ps1", "psd1", "psm1"] + +[[config]] +languages = ["sql"] +exts = ["sql"] + +[[config]] +languages = ["scala"] +exts = ["scala"] + +[[config]] +languages = ["shellscript"] +exts = ["sh", "bash", "command", "zsh"] + +[[config]] +languages = ["tex"] +exts = ["tex"] + +[[config]] +languages = ["vb"] +exts = ["vb"] \ No newline at end of file diff --git a/crates/tabby-common/src/languages.rs b/crates/tabby-common/src/languages.rs index b5f9c8bdb2d0..625f7459e02e 100644 --- a/crates/tabby-common/src/languages.rs +++ b/crates/tabby-common/src/languages.rs @@ -1,3 +1,5 @@ +use std::{collections::HashMap, ffi::OsStr}; + use lazy_static::lazy_static; use serde::Deserialize; @@ -29,45 +31,88 @@ struct ConfigList { #[derive(Deserialize, Debug)] pub struct Language { languages: Vec, - top_level_keywords: Vec, + exts: Vec, - pub line_comment: String, + top_level_keywords: Option>, + pub line_comment: Option, } impl Language { pub fn get_stop_words(&self) -> Vec { let mut out = vec![]; - out.push(format!("\n{}", self.line_comment)); - for word in &self.top_level_keywords { - out.push(format!("\n{}", word)); - } for x in DEFAULT.iter() { out.push((*x).to_owned()); } + if let Some(line_comment) = &self.line_comment { + out.push(format!("\n{}", line_comment)); + }; + + if let Some(top_level_keywords) = &self.top_level_keywords { + for word in top_level_keywords { + out.push(format!("\n{}", word)); + } + }; + out } - pub fn get_hashkey(&self) -> String { - self.languages[0].clone() + pub fn language(&'static self) -> &'static str { + self.languages[0].as_str() } } lazy_static! { static ref CONFIG: ConfigList = serdeconv::from_toml_str(include_str!("../assets/languages.toml")).unwrap(); + static ref LANGUAGE_CONFIG_MAPPING: HashMap<&'static str, &'static Language> = { + let mut map = HashMap::new(); + for c in &CONFIG.config { + for l in &c.languages { + assert!( + !map.contains_key(l.as_str()), + "Duplicate language found: {}", + l + ); + map.insert(l.as_str(), c); + } + } + map + }; + static ref EXTS_LANGUAGE_MAPPING: HashMap<&'static str, &'static str> = { + let mut map = HashMap::new(); + for c in &CONFIG.config { + for e in &c.exts { + for l in &c.languages { + assert!( + !map.contains_key(e.as_str()), + "Duplicate extension found: {}", + e + ); + map.insert(e.as_str(), l.as_str()); + } + } + } + map + }; pub static ref UNKNOWN_LANGUAGE: Language = Language { languages: vec!["unknown".to_owned()], - line_comment: "".to_owned(), - top_level_keywords: vec![], + line_comment: Some("".into()), + top_level_keywords: Some(vec![]), + exts: vec![], }; } pub fn get_language(language: &str) -> &'static Language { - CONFIG - .config - .iter() - .find(|c| c.languages.iter().any(|x| x == language)) - .unwrap_or(&UNKNOWN_LANGUAGE) + if let Some(lang) = LANGUAGE_CONFIG_MAPPING.get(language) { + lang + } else { + &UNKNOWN_LANGUAGE + } +} + +pub fn get_language_by_ext(ext: &OsStr) -> Option<&'static Language> { + let ext = ext.to_str()?; + EXTS_LANGUAGE_MAPPING.get(ext).map(|x| get_language(x)) } diff --git a/crates/tabby-inference/src/decoding.rs b/crates/tabby-inference/src/decoding.rs index 33cefd49ab60..320991cae947 100644 --- a/crates/tabby-inference/src/decoding.rs +++ b/crates/tabby-inference/src/decoding.rs @@ -42,7 +42,7 @@ impl StopConditionFactory { if stop_words.is_empty() { None } else { - let hashkey = language.get_hashkey(); + let hashkey = language.language().to_owned(); let mut trie = self.stop_trie_cache.get(&hashkey); if trie.is_none() { self.stop_trie_cache diff --git a/crates/tabby-scheduler/src/dataset/mod.rs b/crates/tabby-scheduler/src/dataset/mod.rs index 6bbd37c02201..6cfd50a93299 100644 --- a/crates/tabby-scheduler/src/dataset/mod.rs +++ b/crates/tabby-scheduler/src/dataset/mod.rs @@ -1,8 +1,6 @@ mod deps; use std::{ - collections::HashMap, - ffi::OsStr, fs::{self, read_to_string}, io::{IsTerminal, Write}, }; @@ -11,10 +9,10 @@ use anyhow::{anyhow, Result}; use file_rotate::{compression::Compression, suffix::AppendCount, ContentLimit, FileRotate}; use ignore::{DirEntry, Walk}; use kdam::BarExt; -use lazy_static::lazy_static; use serde_jsonlines::WriteExt; use tabby_common::{ config::RepositoryConfig, + languages::get_language_by_ext, path::{dataset_dir, dependency_file}, DependencyFile, SourceFile, }; @@ -48,13 +46,14 @@ impl RepositoryExt for RepositoryConfig { .path() .strip_prefix(basedir.as_path()) .expect("Paths always begin with the prefix"); - let language = get_language( + let language = get_language_by_ext( relative_path .extension() .ok_or_else(|| anyhow!("Unknown file extension for {relative_path:?}"))?, ) .ok_or_else(|| anyhow!("Unknown language for {relative_path:?}"))? - .to_owned(); + .to_owned() + .language(); match read_to_string(entry.path()) { Ok(file_content) => { let source_file = SourceFile { @@ -64,8 +63,8 @@ impl RepositoryExt for RepositoryConfig { max_line_length: metrics::max_line_length(&file_content), avg_line_length: metrics::avg_line_length(&file_content), alphanum_fraction: metrics::alphanum_fraction(&file_content), - tags: code.find_tags(&language, &file_content), - language, + tags: code.find_tags(language, &file_content), + language: language.into(), }; writer.write_json_lines([source_file.clone()])?; } @@ -83,14 +82,13 @@ impl RepositoryExt for RepositoryConfig { } } -fn get_language(ext: &OsStr) -> Option<&str> { - let ext = ext.to_str().unwrap_or(""); - EXTENSION_LANGUAGE.get(ext).copied() -} - fn is_source_code(entry: &DirEntry) -> bool { if entry.file_type().is_some_and(|x| x.is_file()) { - entry.path().extension().and_then(get_language).is_some() + entry + .path() + .extension() + .and_then(get_language_by_ext) + .is_some() } else { false } @@ -155,53 +153,3 @@ mod metrics { } } } - -lazy_static! { - static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = { - HashMap::from([ - ("c", vec!["c", "h"]), - ("csharp", vec!["cs"]), - ( - "cpp", - vec!["cpp", "hpp", "c++", "h++", "cc", "hh", "C", "H", "tcc"], - ), - ("css", vec!["css"]), - ("dockerfile", vec!["Dockerfile"]), - ("go", vec!["go"]), - ("haskell", vec!["hs"]), - ("html", vec!["html"]), - ("java", vec!["java"]), - ("kotlin", vec!["kt", "kts"]), - ("julia", vec!["jl"]), - ("lua", vec!["lua"]), - ("makefile", vec!["Makefile"]), - ("markdown", vec!["md", "markdown"]), - ("php", vec!["php", "php3", "php4", "php5", "phps", "phpt"]), - ("perl", vec!["pl", "pm", "pod", "perl"]), - ("powershell", vec!["ps1", "psd1", "psm1"]), - ("python", vec!["py"]), - ("ruby", vec!["rb"]), - ("rust", vec!["rs"]), - ("solidity", vec!["sol"]), - ("sql", vec!["sql"]), - ("scala", vec!["scala"]), - ("shellscript", vec!["sh", "bash", "command", "zsh"]), - ( - "javascript-typescript", - vec!["ts", "mts", "js", "mjs", "jsx", "tsx"], - ), - ("tex", vec!["tex"]), - ("vb", vec!["vb"]), - ]) - }; - static ref EXTENSION_LANGUAGE: HashMap<&'static str, &'static str> = { - let mut map = HashMap::new(); - for (lang, exts) in &*LANGUAGE_EXTENSION { - for ext in exts { - map.insert(*ext, *lang); - } - } - - map - }; -} diff --git a/crates/tabby/src/services/completion/completion_prompt.rs b/crates/tabby/src/services/completion/completion_prompt.rs index f456738e7bd5..c0ed4d1c6767 100644 --- a/crates/tabby/src/services/completion/completion_prompt.rs +++ b/crates/tabby/src/services/completion/completion_prompt.rs @@ -99,7 +99,10 @@ fn build_prefix(language: &str, prefix: &str, snippets: &[Snippet]) -> String { return prefix.to_owned(); } - let comment_char = &get_language(language).line_comment; + let Some(comment_char) = &get_language(language).line_comment else { + return prefix.to_owned(); + }; + let mut lines: Vec = vec![]; for (i, snippet) in snippets.iter().enumerate() {