html_lexer.lua
-- File: html_lexer.lua

--[[
Copyright (C) 2024 chmod777

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License version 3 as published by the
Free Software Foundation.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License along
with this program. If not, see <https://www.gnu.org/licenses/>.
]]

-- Kinds of lexemes returned by the lexer's `next` method.
-- NOTE: deliberately left as a global, exactly as before, so that code which
-- refers to `LexemeType` without going through this module's return value
-- keeps working. It is also exported in the returned module table.
LexemeType = {
  OPEN = 1,         -- "<"
  CLOSE = 2,        -- ">"
  END_OPEN = 3,     -- "</"
  SELF_CLOSING = 4, -- "/>"
  EQ = 5,           -- "="
  SINGLE_QUOTE = 6, -- '
  DOUBLE_QUOTE = 7, -- "
  WHITESPACE = 8,   -- run of ' ', \t, \n, \r or \b
  IDENTIFIER = 9,   -- any other run of characters, e.g. a tag name
}

local Lexer = {}

--- True when `c` is one of the characters the lexer groups into WHITESPACE.
local function is_whitespace(c)
  return c == ' ' or c == '\n' or c == '\t' or c == '\r' or c == '\b'
end

--- True when `c` is a character that starts a lexeme of its own.
local function is_special(c)
  return c == '<' or c == '>' or c == '/' or c == '=' or c == '"' or c == "'"
end

--- True when `c` may continue an IDENTIFIER run.
local function is_identifier_char(c)
  return not is_special(c) and not is_whitespace(c)
end

--- Create a lexer over `source`.
-- The returned table carries `source`, `index` (1-based position of the
-- cursor) and the methods `next`, `advance`, `current_char` and `next_char`.
-- The methods are closures over the instance, so they act on it regardless of
-- the receiver they are called with.
-- @param source string to tokenize
-- @return the new lexer instance
function Lexer:new(source)
  local this = {}
  this.source = source
  this.index = 1

  -- True when a character exists after the cursor and satisfies `pred`.
  -- `next_char` yields "" (never nil) past the end of the source, so the
  -- empty string has to be treated as end of input here; otherwise a run that
  -- reaches the end of the source would never terminate.
  local function next_matches(pred)
    local c = this:next_char()
    return c ~= nil and c ~= '' and pred(c)
  end

  -- Consume the character under the cursor plus every following character
  -- accepted by `pred`; return the consumed text and leave the cursor on the
  -- first character after the run.
  local function take_run(pred)
    local start = this.index
    while next_matches(pred) do
      this:advance()
    end
    local text = this.source:sub(start, this.index)
    this:advance()
    return text
  end

  --- Read the next lexeme.
  -- @return a `LexemeType` value and the matched text, or nil once the cursor
  --   is past the end of the source
  function this:next()
    local current = this:current_char()
    if current == nil then
      return nil
    end

    if current == '<' then
      this:advance()
      if this:current_char() == '/' then
        this:advance()
        return LexemeType.END_OPEN, '</'
      end
      return LexemeType.OPEN, current
    elseif current == '/' and this:next_char() == '>' then
      this:advance()
      this:advance()
      -- Return exactly the two matched characters; slicing up to the
      -- already-advanced cursor would also include the character after "/>".
      return LexemeType.SELF_CLOSING, '/>'
    elseif current == '>' then
      this:advance()
      return LexemeType.CLOSE, current
    elseif current == '=' then
      this:advance()
      return LexemeType.EQ, current
    elseif current == "'" then
      this:advance()
      return LexemeType.SINGLE_QUOTE, current
    elseif current == '"' then
      this:advance()
      return LexemeType.DOUBLE_QUOTE, current
    elseif is_whitespace(current) then
      return LexemeType.WHITESPACE, take_run(is_whitespace)
    else
      -- A '/' that is not followed by '>' also ends up here and becomes the
      -- first character of an IDENTIFIER.
      return LexemeType.IDENTIFIER, take_run(is_identifier_char)
    end
  end

  --- Move the cursor one character forward.
  function this:advance()
    this.index = this.index + 1
  end

  --- Character under the cursor, or nil when the cursor is past the end.
  function this:current_char()
    if this.index > #this.source then
      return nil
    end
    return this.source:sub(this.index, this.index)
  end

  --- Character right after the cursor.
  -- Returns the empty string (not nil) when there is no such character,
  -- because `string.sub` yields "" for an out-of-range position.
  function this:next_char()
    return this.source:sub(this.index + 1, this.index + 1)
  end

  return this
end

-- Kept local: a global named `module` would overwrite Lua 5.1's built-in
-- `module` function.
local M = {
  LexemeType = LexemeType,
  Lexer = Lexer,
}

return M