Html5Normalizer.cpp
1 #include "Html5Normalizer.h" 2 3 #include <SDCardManager.h> 4 5 #include <algorithm> 6 #include <cctype> 7 8 namespace html5 { 9 10 namespace { 11 12 // HTML5 void elements that cannot have closing tags (lowercase for case-insensitive matching) 13 constexpr const char* VOID_ELEMENTS[] = {"img", "br", "hr", "input", "meta", "link", "area", 14 "base", "col", "embed", "param", "source", "track", "wbr"}; 15 constexpr size_t VOID_ELEMENT_COUNT = sizeof(VOID_ELEMENTS) / sizeof(VOID_ELEMENTS[0]); 16 constexpr size_t MAX_TAG_NAME_LENGTH = 8; 17 constexpr size_t BUFFER_SIZE = 512; 18 19 enum class State { Normal, InTagStart, InTagName, InTagAttrs, InQuote, InClosingTagName, InClosingTagRest }; 20 21 char toLowerAscii(char c) { return (c >= 'A' && c <= 'Z') ? static_cast<char>(c + ('a' - 'A')) : c; } 22 23 bool isVoidElement(const char* name, size_t len) { 24 for (size_t i = 0; i < VOID_ELEMENT_COUNT; i++) { 25 const char* ve = VOID_ELEMENTS[i]; 26 size_t veLen = 0; 27 while (ve[veLen] != '\0') veLen++; 28 if (len == veLen) { 29 bool match = true; 30 for (size_t j = 0; j < len && match; j++) { 31 if (toLowerAscii(name[j]) != ve[j]) match = false; 32 } 33 if (match) return true; 34 } 35 } 36 return false; 37 } 38 39 } // namespace 40 41 bool normalizeVoidElements(const std::string& inputPath, const std::string& outputPath) { 42 FsFile inFile, outFile; 43 44 if (!SdMan.openFileForRead("H5N", inputPath, inFile)) { 45 return false; 46 } 47 48 if (!SdMan.openFileForWrite("H5N", outputPath, outFile)) { 49 inFile.close(); 50 return false; 51 } 52 53 State state = State::Normal; 54 char tagName[MAX_TAG_NAME_LENGTH + 1] = {0}; 55 size_t tagNameLen = 0; 56 char closingTagWhitespace[8] = {0}; // Buffer for whitespace in closing tags 57 size_t closingTagWsLen = 0; 58 bool isCurrentTagVoid = false; 59 char quoteChar = 0; 60 char prevChar = 0; 61 62 uint8_t readBuffer[BUFFER_SIZE]; 63 uint8_t writeBuffer[BUFFER_SIZE + 64]; // Extra space for insertions 64 size_t writePos = 0; 65 66 auto flushWrite = [&]() -> bool { 67 if (writePos > 0) { 68 if (outFile.write(writeBuffer, writePos) != writePos) { 69 return false; 70 } 71 writePos = 0; 72 } 73 return true; 74 }; 75 76 auto writeChar = [&](char c) -> bool { 77 writeBuffer[writePos++] = static_cast<uint8_t>(c); 78 if (writePos >= BUFFER_SIZE) { 79 return flushWrite(); 80 } 81 return true; 82 }; 83 84 while (inFile.available()) { 85 int bytesRead = inFile.read(readBuffer, BUFFER_SIZE); 86 if (bytesRead <= 0) break; 87 88 for (int i = 0; i < bytesRead; i++) { 89 char c = static_cast<char>(readBuffer[i]); 90 91 switch (state) { 92 case State::Normal: 93 if (c == '<') { 94 state = State::InTagStart; 95 tagNameLen = 0; 96 isCurrentTagVoid = false; 97 // Don't write '<' yet - might need to skip if it's a void element closing tag 98 } else { 99 if (!writeChar(c)) goto error; 100 } 101 break; 102 103 case State::InTagStart: 104 if (c == '/') { 105 // Closing tag - need to check if it's a void element 106 state = State::InClosingTagName; 107 tagNameLen = 0; 108 closingTagWsLen = 0; 109 // Don't write '</' yet - buffer it in case we need to skip 110 } else if (c == '!' || c == '?') { 111 // Comment or processing instruction - skip normalization 112 state = State::Normal; 113 if (!writeChar('<')) goto error; 114 if (!writeChar(c)) goto error; 115 } else if (std::isalpha(static_cast<unsigned char>(c))) { 116 state = State::InTagName; 117 tagName[0] = c; 118 tagNameLen = 1; 119 if (!writeChar('<')) goto error; 120 if (!writeChar(c)) goto error; 121 } else { 122 state = State::Normal; 123 if (!writeChar('<')) goto error; 124 if (!writeChar(c)) goto error; 125 } 126 break; 127 128 case State::InTagName: 129 if (std::isalnum(static_cast<unsigned char>(c)) || c == '-' || c == ':') { 130 if (tagNameLen < MAX_TAG_NAME_LENGTH) { 131 tagName[tagNameLen++] = c; 132 } 133 if (!writeChar(c)) goto error; 134 } else { 135 // End of tag name 136 tagName[tagNameLen] = '\0'; 137 isCurrentTagVoid = isVoidElement(tagName, tagNameLen); 138 139 if (c == '>') { 140 // Tag ends immediately after name 141 if (isCurrentTagVoid && prevChar != '/') { 142 if (!writeChar(' ')) goto error; 143 if (!writeChar('/')) goto error; 144 } 145 if (!writeChar(c)) goto error; 146 state = State::Normal; 147 } else if (std::isspace(static_cast<unsigned char>(c))) { 148 state = State::InTagAttrs; 149 if (!writeChar(c)) goto error; 150 } else if (c == '/') { 151 // Self-closing indicator 152 if (!writeChar(c)) goto error; 153 state = State::InTagAttrs; 154 } else { 155 // Unexpected character 156 if (!writeChar(c)) goto error; 157 state = State::Normal; 158 } 159 } 160 break; 161 162 case State::InTagAttrs: 163 if (c == '"' || c == '\'') { 164 state = State::InQuote; 165 quoteChar = c; 166 if (!writeChar(c)) goto error; 167 } else if (c == '>') { 168 // End of tag - insert self-closing if needed 169 if (isCurrentTagVoid && prevChar != '/') { 170 if (!writeChar(' ')) goto error; 171 if (!writeChar('/')) goto error; 172 } 173 if (!writeChar(c)) goto error; 174 state = State::Normal; 175 } else { 176 if (!writeChar(c)) goto error; 177 } 178 break; 179 180 case State::InQuote: 181 if (c == quoteChar) { 182 state = State::InTagAttrs; 183 } 184 if (!writeChar(c)) goto error; 185 break; 186 187 case State::InClosingTagName: 188 if (std::isalnum(static_cast<unsigned char>(c)) || c == '-' || c == ':') { 189 if (tagNameLen < MAX_TAG_NAME_LENGTH) { 190 tagName[tagNameLen++] = c; 191 } else { 192 // Tag too long to be void - flush buffer and passthrough 193 if (!writeChar('<')) goto error; 194 if (!writeChar('/')) goto error; 195 for (size_t j = 0; j < tagNameLen; j++) { 196 if (!writeChar(tagName[j])) goto error; 197 } 198 if (!writeChar(c)) goto error; 199 state = State::InClosingTagRest; 200 } 201 } else if (c == '>') { 202 // End of closing tag - check if it's a void element 203 tagName[tagNameLen] = '\0'; 204 if (isVoidElement(tagName, tagNameLen)) { 205 // Skip the entire closing tag (don't output anything) 206 } else { 207 // Not a void element - output the buffered "</tagname>" with any whitespace 208 if (!writeChar('<')) goto error; 209 if (!writeChar('/')) goto error; 210 for (size_t j = 0; j < tagNameLen; j++) { 211 if (!writeChar(tagName[j])) goto error; 212 } 213 for (size_t j = 0; j < closingTagWsLen; j++) { 214 if (!writeChar(closingTagWhitespace[j])) goto error; 215 } 216 if (!writeChar('>')) goto error; 217 } 218 state = State::Normal; 219 } else if (std::isspace(static_cast<unsigned char>(c))) { 220 // Whitespace before '>' in closing tag (unusual but valid) 221 // Buffer it in case we need to replay for non-void elements 222 if (closingTagWsLen < sizeof(closingTagWhitespace)) { 223 closingTagWhitespace[closingTagWsLen++] = c; 224 } 225 } else { 226 // Unexpected character - output what we have and return to normal 227 if (!writeChar('<')) goto error; 228 if (!writeChar('/')) goto error; 229 for (size_t j = 0; j < tagNameLen; j++) { 230 if (!writeChar(tagName[j])) goto error; 231 } 232 if (!writeChar(c)) goto error; 233 state = State::Normal; 234 } 235 break; 236 237 case State::InClosingTagRest: 238 if (!writeChar(c)) goto error; 239 if (c == '>') { 240 state = State::Normal; 241 } 242 break; 243 } 244 245 prevChar = c; 246 } 247 } 248 249 // Handle EOF - flush any buffered but uncommitted content 250 if (state == State::InTagStart) { 251 // We saw '<' but nothing else 252 if (!writeChar('<')) goto error; 253 } else if (state == State::InClosingTagName) { 254 // We were in the middle of a closing tag - output what we have 255 if (!writeChar('<')) goto error; 256 if (!writeChar('/')) goto error; 257 for (size_t j = 0; j < tagNameLen; j++) { 258 if (!writeChar(tagName[j])) goto error; 259 } 260 for (size_t j = 0; j < closingTagWsLen; j++) { 261 if (!writeChar(closingTagWhitespace[j])) goto error; 262 } 263 } 264 265 if (!flushWrite()) goto error; 266 267 inFile.close(); 268 outFile.close(); 269 return true; 270 271 error: 272 inFile.close(); 273 outFile.close(); 274 SdMan.remove(outputPath.c_str()); 275 return false; 276 } 277 278 } // namespace html5