Skip to content

Commit

Permalink
Identify comments with independent query files
Browse files Browse the repository at this point in the history
Instead of using ad hoc logic to identify comment nodes, this commit
uses a separate query file.
nbacquey committed Jan 13, 2025

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
1 parent e965cd7 commit 7a31f23
Showing 20 changed files with 256 additions and 40 deletions.
1 change: 1 addition & 0 deletions examples/client-app/src/main.rs
Original file line number Diff line number Diff line change
@@ -26,6 +26,7 @@ async fn main() {
let language: Language = Language {
name: "json".to_owned(),
query: TopiaryQuery::new(&grammar, query).unwrap(),
comment_query: None,
grammar,
indent: None,
};
78 changes: 70 additions & 8 deletions topiary-cli/src/io.rs
Original file line number Diff line number Diff line change
@@ -113,6 +113,7 @@ pub struct InputFile<'cfg> {
source: InputSource,
language: &'cfg topiary_config::language::Language,
query: QuerySource,
comment_query: Option<QuerySource>,
}

impl<'cfg> InputFile<'cfg> {
@@ -125,9 +126,21 @@ impl<'cfg> InputFile<'cfg> {
};
let query = TopiaryQuery::new(&grammar, &contents)?;

// Can't use `map` because of closures, async, and Result.
let comment_contents = match &self.comment_query {
Some(QuerySource::Path(query)) => Some(tokio::fs::read_to_string(query).await?),
Some(QuerySource::BuiltIn(contents)) => Some(contents.to_owned()),
None => None,
};
let comment_query = match comment_contents {
Some(c) => Some(TopiaryQuery::new(&grammar, &c)?),
None => None,
};

Ok(Language {
name: self.language.name.clone(),
query,
comment_query,
grammar,
indent: self.language().config.indent.clone(),
})
@@ -178,17 +191,18 @@ impl<'cfg, 'i> Inputs<'cfg> {
InputFrom::Stdin(language_name, query) => {
vec![(|| {
let language = config.get_language(&language_name)?;
let query_source: QuerySource = match query {
let (query, comment_query) = match query {
// The user specified a query file
Some(p) => p,
Some(p) => (p, None),
// The user did not specify a file, try the default locations
None => to_query_from_language(language)?,
};

Ok(InputFile {
source: InputSource::Stdin,
language,
query: query_source,
query,
comment_query,
})
})()]
}
@@ -197,12 +211,13 @@ impl<'cfg, 'i> Inputs<'cfg> {
.into_iter()
.map(|path| {
let language = config.detect(&path)?;
let query: QuerySource = to_query_from_language(language)?;
let (query, comment_query) = to_query_from_language(language)?;

Ok(InputFile {
source: InputSource::Disk(path, None),
language,
query,
comment_query,
})
})
.collect(),
@@ -212,16 +227,21 @@ impl<'cfg, 'i> Inputs<'cfg> {
}
}

fn to_query_from_language(language: &topiary_config::language::Language) -> CLIResult<QuerySource> {
let query: QuerySource = match language.find_query_file() {
Ok(p) => p.into(),
fn to_query_from_language(
language: &topiary_config::language::Language,
) -> CLIResult<(QuerySource, Option<QuerySource>)> {
let query: (QuerySource, Option<QuerySource>) = match language.find_query_file() {
Ok((path, comment_path)) => (path.into(), comment_path.map(|p| p.into())),
// For some reason, Topiary could not find any
// matching file in a default location. As a final attempt, try the
// builtin ones. Store the error, return that if we
// fail to find anything, because the builtin error might be unexpected.
Err(e) => {
log::warn!("No query files found in any of the expected locations. Falling back to compile-time included files.");
to_query(&language.name).map_err(|_| e)?
(
to_query(&language.name).map_err(|_| e)?,
to_comment_query(&language.name)?,
)
}
};
Ok(query)
@@ -364,3 +384,45 @@ where
)),
}
}

fn to_comment_query<T>(name: T) -> CLIResult<Option<QuerySource>>
where
T: AsRef<str> + fmt::Display,
{
match name.as_ref() {
#[cfg(feature = "bash")]
"bash" => Ok(Some(topiary_queries::bash_comment().into())),

#[cfg(feature = "css")]
"css" => Ok(Some(topiary_queries::css_comment().into())),

#[cfg(feature = "json")]
"json" => Ok(None),

#[cfg(feature = "nickel")]
"nickel" => Ok(Some(topiary_queries::nickel_comment().into())),

#[cfg(feature = "ocaml")]
"ocaml" => Ok(Some(topiary_queries::ocaml_comment().into())),

#[cfg(feature = "ocaml_interface")]
"ocaml_interface" => Ok(Some(topiary_queries::ocaml_interface_comment().into())),

#[cfg(feature = "ocamllex")]
"ocamllex" => Ok(Some(topiary_queries::ocamllex_comment().into())),

#[cfg(feature = "rust")]
"rust" => Ok(Some(topiary_queries::rust_comment().into())),

#[cfg(feature = "toml")]
"toml" => Ok(Some(topiary_queries::toml_comment().into())),

#[cfg(feature = "tree_sitter_query")]
"tree_sitter_query" => Ok(Some(topiary_queries::tree_sitter_query_comment().into())),

name => Err(TopiaryError::Bin(
format!("The specified language is unsupported: {}", name),
Some(CLIError::UnsupportedLanguage(name.to_string())),
)),
}
}
18 changes: 14 additions & 4 deletions topiary-config/src/language.rs
Original file line number Diff line number Diff line change
@@ -81,8 +81,10 @@ impl Language {
}

#[cfg(not(target_arch = "wasm32"))]
pub fn find_query_file(&self) -> TopiaryConfigResult<PathBuf> {
let basename = PathBuf::from(self.name.as_str()).with_extension("scm");
pub fn find_query_file(&self) -> TopiaryConfigResult<(PathBuf, Option<PathBuf>)> {
let name = self.name.clone();
let basename = PathBuf::from(&name).with_extension("scm"); // "clang.scm"
let comment_basename = PathBuf::from(&name).with_extension("comment.scm"); // "clang.comment.scm"

#[rustfmt::skip]
let potentials: [Option<PathBuf>; 4] = [
@@ -92,12 +94,20 @@ impl Language {
Some(PathBuf::from("../topiary-queries/queries")),
];

potentials
let query_file = potentials
.into_iter()
.flatten()
.map(|path| path.join(&basename))
.find(|path| path.exists())
.ok_or_else(|| TopiaryConfigError::QueryFileNotFound(basename))
.ok_or_else(|| TopiaryConfigError::QueryFileNotFound(basename))?;

let comment_query_file = query_file.parent().unwrap().join(comment_basename);

if comment_query_file.exists() {
Ok((query_file, Some(comment_query_file)))
} else {
Ok((query_file, None))
}
}

#[cfg(not(target_arch = "wasm32"))]
5 changes: 5 additions & 0 deletions topiary-core/benches/benchmark.rs
Original file line number Diff line number Diff line change
@@ -7,6 +7,8 @@ use topiary_core::{formatter, Language, Operation, TopiaryQuery};
async fn format() {
let input = fs::read_to_string("../topiary-cli/tests/samples/input/ocaml.ml").unwrap();
let query_content = fs::read_to_string("../topiary-queries/queries/ocaml.scm").unwrap();
let comment_query_content =
fs::read_to_string("../topiary-queries/queries/ocaml.comment.scm").unwrap();
let ocaml = tree_sitter_ocaml::LANGUAGE_OCAML;

let mut input = input.as_bytes();
@@ -15,6 +17,9 @@ async fn format() {
let language: Language = Language {
name: "ocaml".to_owned(),
query: TopiaryQuery::new(&ocaml.clone().into(), &query_content).unwrap(),
comment_query: Some(
TopiaryQuery::new(&ocaml.clone().into(), &comment_query_content).unwrap(),
),
grammar: ocaml.into(),
indent: None,
};
53 changes: 31 additions & 22 deletions topiary-core/src/comments.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::collections::HashSet;

use topiary_tree_sitter_facade::{InputEdit, Language, Node, Tree};

use crate::{
@@ -63,11 +65,6 @@ impl Diff<InputSection> for InputSection {
}
}

// TODO: allow the users to manually identify comments. Maybe with a separate query file?
fn is_comment(node: &Node) -> bool {
node.is_extra() && node.kind().to_string().contains("comment")
}

fn into_edit(node: &Node<'_>) -> InputEdit {
InputEdit::new(
node.start_byte(),
@@ -82,10 +79,11 @@ fn into_edit(node: &Node<'_>) -> InputEdit {
fn find_comments(
node: Node,
input: &str,
comment_ids: &HashSet<usize>,
comments: &mut Vec<(InputEdit, AnchoredComment)>,
) -> FormatterResult<()> {
if is_comment(&node) {
let commented = find_anchor(&node, input)?;
if comment_ids.contains(&node.id()) {
let commented = find_anchor(&node, input, comment_ids)?;
// Build the corresponding InputEdit:
// - If the comment is not alone on its line, return its bounds
// - If the comment is alone on its line, return the bounds of all its line
@@ -150,7 +148,7 @@ fn find_comments(
} else {
let mut walker = node.walk();
for child in node.children(&mut walker) {
find_comments(child, input, comments)?;
find_comments(child, input, comment_ids, comments)?;
}
Ok(())
}
@@ -259,7 +257,10 @@ fn previous_disjoint_node<'tree>(starting_node: &'tree Node<'tree>) -> Option<No
}

// TODO: if performance is an issue, use TreeCursor to navigate the tree
fn next_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'tree>> {
fn next_non_comment_leaf<'tree>(
starting_node: Node<'tree>,
comment_ids: &HashSet<usize>,
) -> Option<Node<'tree>> {
let mut node: Node<'tree> = starting_node;
loop {
// get the next leaf:
@@ -275,7 +276,7 @@ fn next_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'tree
}
Some(sibling) => {
node = sibling;
if is_comment(&node) {
if comment_ids.contains(&node.id()) {
// get the following sibling
continue;
} else {
@@ -287,14 +288,14 @@ fn next_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'tree
// 2) get the leftmost leaf of the sibling.
// If we encounter a comment, we stop. We'll get back to 1) after the loop
while let Some(child) = node.child(0) {
if is_comment(&child) {
if comment_ids.contains(&child.id()) {
break;
} else {
node = child
}
}
// check if the leaf is a comment. If it is not, start over again.
if is_comment(&node) {
if comment_ids.contains(&node.id()) {
continue;
} else {
return Some(node);
@@ -303,7 +304,10 @@ fn next_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'tree
}

// TODO: if performance is an issue, use TreeCursor to navigate the tree
fn previous_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'tree>> {
fn previous_non_comment_leaf<'tree>(
starting_node: Node<'tree>,
comment_ids: &HashSet<usize>,
) -> Option<Node<'tree>> {
let mut node: Node<'tree> = starting_node;
loop {
// get the previous leaf:
@@ -320,7 +324,7 @@ fn previous_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'
}
Some(sibling) => {
node = sibling;
if is_comment(&node) {
if comment_ids.contains(&node.id()) {
// get the previous sibling
continue;
} else {
@@ -338,14 +342,14 @@ fn previous_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'
node.child(node.child_count() - 1)
}
} {
if is_comment(&child) {
if comment_ids.contains(&child.id()) {
break;
} else {
node = child
}
}
// check if the leaf is a comment. If it is not, start over again.
if is_comment(&node) {
if comment_ids.contains(&node.id()) {
continue;
} else {
return Some(node);
@@ -360,7 +364,11 @@ fn previous_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'
// If there is no such node, we anchor to the first non-comment sibling node
// in the other direction.
#[allow(clippy::collapsible_else_if)]
fn find_anchor<'tree>(node: &'tree Node<'tree>, input: &str) -> FormatterResult<Commented> {
fn find_anchor<'tree>(
node: &'tree Node<'tree>,
input: &str,
comment_ids: &HashSet<usize>,
) -> FormatterResult<Commented> {
let point = node.start_position();
let mut lines = input.lines();
let prefix = lines
@@ -377,7 +385,7 @@ fn find_anchor<'tree>(node: &'tree Node<'tree>, input: &str) -> FormatterResult<
)
})?;
if prefix.trim_start() == "" {
if let Some(anchor) = next_non_comment_leaf(node.clone()) {
if let Some(anchor) = next_non_comment_leaf(node.clone(), comment_ids) {
let prev = previous_disjoint_node(node);
let next = next_disjoint_node(node);
Ok(Commented::CommentedAfter {
@@ -389,17 +397,17 @@ fn find_anchor<'tree>(node: &'tree Node<'tree>, input: &str) -> FormatterResult<
.map(|prev| prev.end_position().row() + 1 < node.start_position().row())
.unwrap_or(false),
})
} else if let Some(anchor) = previous_non_comment_leaf(node.clone()) {
} else if let Some(anchor) = previous_non_comment_leaf(node.clone(), comment_ids) {
Ok(Commented::CommentedBefore((&anchor).into()))
} else {
Err(FormatterError::CommentOrphaned(
node.utf8_text(input.as_bytes())?.to_string(),
))
}
} else {
if let Some(anchor) = previous_non_comment_leaf(node.clone()) {
if let Some(anchor) = previous_non_comment_leaf(node.clone(), comment_ids) {
Ok(Commented::CommentedBefore((&anchor).into()))
} else if let Some(anchor) = next_non_comment_leaf(node.clone()) {
} else if let Some(anchor) = next_non_comment_leaf(node.clone(), comment_ids) {
let prev = previous_disjoint_node(node);
let next = next_disjoint_node(node);
Ok(Commented::CommentedAfter {
@@ -445,14 +453,15 @@ pub struct SeparatedInput {
pub fn extract_comments<'a>(
tree: &'a Tree,
input: &'a str,
comment_ids: HashSet<usize>,
grammar: &Language,
tolerate_parsing_errors: bool,
) -> FormatterResult<SeparatedInput> {
let mut anchors: Vec<(InputEdit, AnchoredComment)> = Vec::new();
let mut anchored_comments: Vec<AnchoredComment> = Vec::new();
let mut new_input: String = input.to_string();
let mut new_tree: Tree = tree.clone();
find_comments(tree.root_node(), input, &mut anchors)?;
find_comments(tree.root_node(), input, &comment_ids, &mut anchors)?;
anchors.sort_by_key(|(node, _)| node.start_byte());
let mut edits: Vec<InputEdit> = Vec::new();
// for each (comment, anchor) pair in reverse order, we:
3 changes: 3 additions & 0 deletions topiary-core/src/language.rs
Original file line number Diff line number Diff line change
@@ -13,6 +13,9 @@ pub struct Language {
/// The Query Topiary will use to get the formating captures, must be
/// present. The topiary engine does not include any formatting queries.
pub query: TopiaryQuery,
/// The Query Topiary will use to determine which nodes are comments.
/// When missing, ther ewill be no separate comment processing.
pub comment_query: Option<TopiaryQuery>,
/// The tree-sitter Language. Topiary will use this Language for parsing.
pub grammar: topiary_tree_sitter_facade::Language,
/// The indentation string used for that particular language. Defaults to " "
Loading

0 comments on commit 7a31f23

Please sign in to comment.