/ src / parser.rs
parser.rs
  1  use serde::{Deserialize, Serialize};
  2  use tree_sitter::{Language, Node, Parser, Tree};
  3  
  4  /// Byte range span within the source text.
  5  #[derive(Debug, Serialize, Deserialize)]
  6  pub struct Span {
  7  	pub start: usize,
  8  	pub end: usize,
  9  }
 10  
 11  impl From<Node<'_>> for Span {
 12  	fn from(node: Node) -> Self {
 13  		Self { start: node.start_byte(), end: node.end_byte() }
 14  	}
 15  }
 16  
 17  /// Output structure representing a node in the syntax tree.
 18  #[derive(Debug, Serialize, Deserialize)]
 19  pub struct Output {
 20  	/// The node kind
 21  	pub kind: String,
 22  
 23  	/// The original text content of the node
 24  	pub original: String,
 25  
 26  	/// The text content
 27  	pub content: String,
 28  
 29  	/// The byte range span
 30  	pub span: Span,
 31  
 32  	/// Child nodes
 33  	#[serde(skip_serializing_if = "Vec::is_empty", default)]
 34  	pub children: Vec<Output>,
 35  }
 36  
 37  impl Output {
 38  	pub fn process<F>(&mut self, processor: &F) -> &mut Self
 39  	where
 40  		F: Fn(&mut Output),
 41  	{
 42  		for child in &mut self.children {
 43  			child.process(processor);
 44  		}
 45  
 46  		processor(self);
 47  		self
 48  	}
 49  }
 50  
 51  /// A parsed document with its syntax tree.
 52  pub struct Parsed<T> {
 53  	tree: T,
 54  	source: String,
 55  	walker: fn(&T, &str) -> Output,
 56  }
 57  
 58  impl<T> Parsed<T> {
 59  	/// Converts a `Node` into an `Output` structure recursively.
 60  	pub fn to_output(&self) -> Output {
 61  		(self.walker)(&self.tree, &self.source)
 62  	}
 63  
 64  	/// Serializes an output tree to a JSON string.
 65  	pub fn to_json(&self) -> Result<String, serde_json::Error> {
 66  		serde_json::to_string_pretty(&self.to_output())
 67  	}
 68  }
 69  
 70  /// Generic `tree-sitter` parser for any parseable language.
 71  pub struct Treesitter {
 72  	parser: Parser,
 73  	walker: fn(&Tree, &str) -> Output,
 74  }
 75  
 76  impl Treesitter {
 77  	/// Creates a new parser for the specified language.
 78  	pub fn new(language: impl Into<Language>) -> Self {
 79  		Self::with_walker(language, Self::default_walker)
 80  	}
 81  
 82  	/// Creates a new parser with a walker.
 83  	pub fn with_walker(
 84  		language: impl Into<Language>,
 85  		walker: fn(&Tree, &str) -> Output,
 86  	) -> Self {
 87  		let mut parser = Parser::new();
 88  		parser
 89  			.set_language(&language.into())
 90  			.expect("Treesitter language not found");
 91  
 92  		Self { parser, walker }
 93  	}
 94  
 95  	/// Parses the given source into a format for programmatic input.
 96  	pub fn parse(&mut self, source: String) -> Option<Parsed<Tree>> {
 97  		let tree = self.parser.parse(&source, None)?;
 98  		Some(Parsed { tree, source, walker: self.walker })
 99  	}
100  
101  	fn default_walker(tree: &Tree, source: &str) -> Output {
102  		fn walk(node: Node, src: &str) -> Output {
103  			let original = node.utf8_text(src.as_bytes()).unwrap().to_string();
104  
105  			let mut cursor = node.walk();
106  			let children =
107  				node.children(&mut cursor).map(|n| walk(n, src)).collect();
108  
109  			Output {
110  				kind: node.kind().to_string(),
111  				content: original.clone(),
112  				original,
113  				span: Span::from(node),
114  				children,
115  			}
116  		}
117  
118  		walk(tree.root_node(), source)
119  	}
120  }
121  
#[cfg(test)]
mod tests {
	use super::*;

	/// Markdown fixture: a level-one section holding a paragraph and a nested
	/// level-two section.
	const SOURCE: &str = "# Foo\nBar\n\n## Baz\nFoo Bar\n";

	/// Parses `source` with the Markdown grammar and the default walker.
	fn parse_markdown(source: &str) -> Parsed<Tree> {
		Treesitter::new(tree_sitter_md::LANGUAGE)
			.parse(source.to_string())
			.unwrap()
	}

	/// Checks `kind` and `original` of `node`; `content` is compared only
	/// when an expectation is supplied.
	fn assert_node(
		node: &Output,
		kind: &str,
		original: &str,
		content: Option<&str>,
	) {
		assert_eq!(node.kind, kind);
		assert_eq!(node.original, original);

		if let Some(expected) = content {
			assert_eq!(node.content, expected);
		}
	}

	/// Processor that upper-cases the content of ATX headings.
	fn uppercase_headings_processor(output: &mut Output) {
		if output.kind != "atx_heading" {
			return;
		}

		output.content = output.content.to_uppercase();
	}

	/// Processor that rewrites `http://` to `https://` in inline nodes.
	fn protocol_processor(output: &mut Output) {
		let has_plain_http =
			output.kind == "inline" && output.content.contains("http://");

		if has_plain_http {
			output.content = output.content.replace("http://", "https://");
		}
	}

	#[test]
	fn test_parsing() {
		let root = parse_markdown(SOURCE).to_output();
		assert_node(&root, "document", SOURCE, Some(SOURCE));

		// The top-level section spans the whole fixture.
		let outer = &root.children[0];
		assert_node(
			outer,
			"section",
			"# Foo\nBar\n\n## Baz\nFoo Bar\n",
			None,
		);

		let outer_heading = &outer.children[0];
		assert_node(outer_heading, "atx_heading", "# Foo\n", Some("# Foo\n"));

		let outer_paragraph = &outer.children[1];
		assert_node(outer_paragraph, "paragraph", "Bar\n", Some("Bar\n"));

		// The level-two heading opens a section nested inside the first one.
		let inner = &outer.children[2];
		assert_node(inner, "section", "## Baz\nFoo Bar\n", None);

		let inner_heading = &inner.children[0];
		assert_node(
			inner_heading,
			"atx_heading",
			"## Baz\n",
			Some("## Baz\n"),
		);

		let inner_paragraph = &inner.children[1];
		assert_node(
			inner_paragraph,
			"paragraph",
			"Foo Bar\n",
			Some("Foo Bar\n"),
		);
	}

	#[test]
	fn test_serialization() {
		// Round-trip through JSON and check the root node survives intact.
		let json = parse_markdown(SOURCE).to_json().unwrap();
		let restored: Output = serde_json::from_str(&json).unwrap();

		assert_node(&restored, "document", SOURCE, Some(SOURCE));
	}

	#[test]
	fn test_uppercase_headings_processor() {
		let mut root = parse_markdown(SOURCE).to_output();
		root.process(&uppercase_headings_processor);

		// `original` stays as parsed; only `content` is upper-cased.
		let heading = &root.children[0].children[0];
		assert_node(heading, "atx_heading", "# Foo\n", Some("# FOO\n"));
	}

	#[test]
	fn test_protocol_processor() {
		let source = "# Foo\n\n[Link](http://example.com)";

		let mut root = parse_markdown(source).to_output();
		root.process(&uppercase_headings_processor)
			.process(&protocol_processor);

		let section = &root.children[0];

		let heading = &section.children[0];
		assert_node(heading, "atx_heading", "# Foo\n", Some("# FOO\n"));

		let paragraph = &section.children[1];
		assert_node(paragraph, "paragraph", "[Link](http://example.com)", None);

		// Only the inline node's `content` has its protocol rewritten.
		let inline = &paragraph.children[0];
		assert_node(
			inline,
			"inline",
			"[Link](http://example.com)",
			Some("[Link](https://example.com)"),
		);
	}
}