// parser.rs
1 use serde::{Deserialize, Serialize}; 2 use tree_sitter::{Language, Node, Parser, Tree}; 3 4 /// Byte range span within the source text. 5 #[derive(Debug, Serialize, Deserialize)] 6 pub struct Span { 7 pub start: usize, 8 pub end: usize, 9 } 10 11 impl From<Node<'_>> for Span { 12 fn from(node: Node) -> Self { 13 Self { start: node.start_byte(), end: node.end_byte() } 14 } 15 } 16 17 /// Output structure representing a node in the syntax tree. 18 #[derive(Debug, Serialize, Deserialize)] 19 pub struct Output { 20 /// The node kind 21 pub kind: String, 22 23 /// The original text content of the node 24 pub original: String, 25 26 /// The text content 27 pub content: String, 28 29 /// The byte range span 30 pub span: Span, 31 32 /// Child nodes 33 #[serde(skip_serializing_if = "Vec::is_empty", default)] 34 pub children: Vec<Output>, 35 } 36 37 impl Output { 38 pub fn process<F>(&mut self, processor: &F) -> &mut Self 39 where 40 F: Fn(&mut Output), 41 { 42 for child in &mut self.children { 43 child.process(processor); 44 } 45 46 processor(self); 47 self 48 } 49 } 50 51 /// A parsed document with its syntax tree. 52 pub struct Parsed<T> { 53 tree: T, 54 source: String, 55 walker: fn(&T, &str) -> Output, 56 } 57 58 impl<T> Parsed<T> { 59 /// Converts a `Node` into an `Output` structure recursively. 60 pub fn to_output(&self) -> Output { 61 (self.walker)(&self.tree, &self.source) 62 } 63 64 /// Serializes an output tree to a JSON string. 65 pub fn to_json(&self) -> Result<String, serde_json::Error> { 66 serde_json::to_string_pretty(&self.to_output()) 67 } 68 } 69 70 /// Generic `tree-sitter` parser for any parseable language. 71 pub struct Treesitter { 72 parser: Parser, 73 walker: fn(&Tree, &str) -> Output, 74 } 75 76 impl Treesitter { 77 /// Creates a new parser for the specified language. 78 pub fn new(language: impl Into<Language>) -> Self { 79 Self::with_walker(language, Self::default_walker) 80 } 81 82 /// Creates a new parser with a walker. 
83 pub fn with_walker( 84 language: impl Into<Language>, 85 walker: fn(&Tree, &str) -> Output, 86 ) -> Self { 87 let mut parser = Parser::new(); 88 parser 89 .set_language(&language.into()) 90 .expect("Treesitter language not found"); 91 92 Self { parser, walker } 93 } 94 95 /// Parses the given source into a format for programmatic input. 96 pub fn parse(&mut self, source: String) -> Option<Parsed<Tree>> { 97 let tree = self.parser.parse(&source, None)?; 98 Some(Parsed { tree, source, walker: self.walker }) 99 } 100 101 fn default_walker(tree: &Tree, source: &str) -> Output { 102 fn walk(node: Node, src: &str) -> Output { 103 let original = node.utf8_text(src.as_bytes()).unwrap().to_string(); 104 105 let mut cursor = node.walk(); 106 let children = 107 node.children(&mut cursor).map(|n| walk(n, src)).collect(); 108 109 Output { 110 kind: node.kind().to_string(), 111 content: original.clone(), 112 original, 113 span: Span::from(node), 114 children, 115 } 116 } 117 118 walk(tree.root_node(), source) 119 } 120 } 121 122 #[cfg(test)] 123 mod tests { 124 use super::*; 125 126 const SOURCE: &str = r"# Foo 127 Bar 128 129 ## Baz 130 Foo Bar 131 "; 132 133 fn assert_node( 134 node: &Output, 135 kind: &str, 136 original: &str, 137 content: Option<&str>, 138 ) { 139 assert_eq!(node.kind, kind); 140 assert_eq!(node.original, original); 141 142 if let Some(content) = content { 143 assert_eq!(node.content, content); 144 } 145 } 146 147 fn uppercase_headings_processor(output: &mut Output) { 148 if output.kind == "atx_heading" { 149 output.content = output.content.to_uppercase(); 150 } 151 } 152 153 fn protocol_processor(output: &mut Output) { 154 if output.kind == "inline" && output.content.contains("http://") { 155 output.content = output.content.replace("http://", "https://"); 156 } 157 } 158 159 #[test] 160 fn test_parsing() { 161 let doc = Treesitter::new(tree_sitter_md::LANGUAGE) 162 .parse(SOURCE.to_string()) 163 .unwrap(); 164 165 let output = doc.to_output(); 166 
assert_node(&output, "document", SOURCE, Some(SOURCE)); 167 168 let section = &output.children[0]; 169 assert_node( 170 section, 171 "section", 172 "# Foo\nBar\n\n## Baz\nFoo Bar\n", 173 None, 174 ); 175 176 let heading = §ion.children[0]; 177 assert_node(heading, "atx_heading", "# Foo\n", Some("# Foo\n")); 178 179 let paragraph = §ion.children[1]; 180 assert_node(paragraph, "paragraph", "Bar\n", Some("Bar\n")); 181 182 let nested = §ion.children[2]; 183 assert_node(nested, "section", "## Baz\nFoo Bar\n", None); 184 185 let nested_heading = &nested.children[0]; 186 assert_node( 187 nested_heading, 188 "atx_heading", 189 "## Baz\n", 190 Some("## Baz\n"), 191 ); 192 193 let nested_paragraph = &nested.children[1]; 194 assert_node( 195 nested_paragraph, 196 "paragraph", 197 "Foo Bar\n", 198 Some("Foo Bar\n"), 199 ); 200 } 201 202 #[test] 203 fn test_serialization() { 204 let doc = Treesitter::new(tree_sitter_md::LANGUAGE) 205 .parse(SOURCE.to_string()) 206 .unwrap(); 207 208 let json = doc.to_json().unwrap(); 209 let parsed: Output = serde_json::from_str(&json).unwrap(); 210 211 assert_node(&parsed, "document", SOURCE, Some(SOURCE)); 212 } 213 214 #[test] 215 fn test_uppercase_headings_processor() { 216 let doc = Treesitter::new(tree_sitter_md::LANGUAGE) 217 .parse(SOURCE.to_string()) 218 .unwrap(); 219 220 let mut output = doc.to_output(); 221 output.process(&uppercase_headings_processor); 222 223 let section = &output.children[0]; 224 let heading = §ion.children[0]; 225 226 assert_node(heading, "atx_heading", "# Foo\n", Some("# FOO\n")); 227 } 228 229 #[test] 230 fn test_protocol_processor() { 231 let source = "# Foo\n\n[Link](http://example.com)"; 232 let doc = Treesitter::new(tree_sitter_md::LANGUAGE) 233 .parse(source.to_string()) 234 .unwrap(); 235 236 let mut output = doc.to_output(); 237 output 238 .process(&uppercase_headings_processor) 239 .process(&protocol_processor); 240 241 let section = &output.children[0]; 242 243 let heading = §ion.children[0]; 244 
assert_node(heading, "atx_heading", "# Foo\n", Some("# FOO\n")); 245 246 let paragraph = §ion.children[1]; 247 assert_node(paragraph, "paragraph", "[Link](http://example.com)", None); 248 249 let inline = ¶graph.children[0]; 250 assert_node( 251 inline, 252 "inline", 253 "[Link](http://example.com)", 254 Some("[Link](https://example.com)"), 255 ); 256 } 257 }