/ lib / Html5 / Html5Normalizer.cpp
Html5Normalizer.cpp
  1  #include "Html5Normalizer.h"
  2  
  3  #include <SDCardManager.h>
  4  
  5  #include <algorithm>
  6  #include <cctype>
  7  
  8  namespace html5 {
  9  
 10  namespace {
 11  
 12  // HTML5 void elements that cannot have closing tags (lowercase for case-insensitive matching)
 13  constexpr const char* VOID_ELEMENTS[] = {"img",  "br",  "hr",    "input", "meta",   "link",  "area",
 14                                           "base", "col", "embed", "param", "source", "track", "wbr"};
 15  constexpr size_t VOID_ELEMENT_COUNT = sizeof(VOID_ELEMENTS) / sizeof(VOID_ELEMENTS[0]);
 16  constexpr size_t MAX_TAG_NAME_LENGTH = 8;
 17  constexpr size_t BUFFER_SIZE = 512;
 18  
 19  enum class State { Normal, InTagStart, InTagName, InTagAttrs, InQuote, InClosingTagName, InClosingTagRest };
 20  
 21  char toLowerAscii(char c) { return (c >= 'A' && c <= 'Z') ? static_cast<char>(c + ('a' - 'A')) : c; }
 22  
 23  bool isVoidElement(const char* name, size_t len) {
 24    for (size_t i = 0; i < VOID_ELEMENT_COUNT; i++) {
 25      const char* ve = VOID_ELEMENTS[i];
 26      size_t veLen = 0;
 27      while (ve[veLen] != '\0') veLen++;
 28      if (len == veLen) {
 29        bool match = true;
 30        for (size_t j = 0; j < len && match; j++) {
 31          if (toLowerAscii(name[j]) != ve[j]) match = false;
 32        }
 33        if (match) return true;
 34      }
 35    }
 36    return false;
 37  }
 38  
 39  }  // namespace
 40  
 41  bool normalizeVoidElements(const std::string& inputPath, const std::string& outputPath) {
 42    FsFile inFile, outFile;
 43  
 44    if (!SdMan.openFileForRead("H5N", inputPath, inFile)) {
 45      return false;
 46    }
 47  
 48    if (!SdMan.openFileForWrite("H5N", outputPath, outFile)) {
 49      inFile.close();
 50      return false;
 51    }
 52  
 53    State state = State::Normal;
 54    char tagName[MAX_TAG_NAME_LENGTH + 1] = {0};
 55    size_t tagNameLen = 0;
 56    char closingTagWhitespace[8] = {0};  // Buffer for whitespace in closing tags
 57    size_t closingTagWsLen = 0;
 58    bool isCurrentTagVoid = false;
 59    char quoteChar = 0;
 60    char prevChar = 0;
 61  
 62    uint8_t readBuffer[BUFFER_SIZE];
 63    uint8_t writeBuffer[BUFFER_SIZE + 64];  // Extra space for insertions
 64    size_t writePos = 0;
 65  
 66    auto flushWrite = [&]() -> bool {
 67      if (writePos > 0) {
 68        if (outFile.write(writeBuffer, writePos) != writePos) {
 69          return false;
 70        }
 71        writePos = 0;
 72      }
 73      return true;
 74    };
 75  
 76    auto writeChar = [&](char c) -> bool {
 77      writeBuffer[writePos++] = static_cast<uint8_t>(c);
 78      if (writePos >= BUFFER_SIZE) {
 79        return flushWrite();
 80      }
 81      return true;
 82    };
 83  
 84    while (inFile.available()) {
 85      int bytesRead = inFile.read(readBuffer, BUFFER_SIZE);
 86      if (bytesRead <= 0) break;
 87  
 88      for (int i = 0; i < bytesRead; i++) {
 89        char c = static_cast<char>(readBuffer[i]);
 90  
 91        switch (state) {
 92          case State::Normal:
 93            if (c == '<') {
 94              state = State::InTagStart;
 95              tagNameLen = 0;
 96              isCurrentTagVoid = false;
 97              // Don't write '<' yet - might need to skip if it's a void element closing tag
 98            } else {
 99              if (!writeChar(c)) goto error;
100            }
101            break;
102  
103          case State::InTagStart:
104            if (c == '/') {
105              // Closing tag - need to check if it's a void element
106              state = State::InClosingTagName;
107              tagNameLen = 0;
108              closingTagWsLen = 0;
109              // Don't write '</' yet - buffer it in case we need to skip
110            } else if (c == '!' || c == '?') {
111              // Comment or processing instruction - skip normalization
112              state = State::Normal;
113              if (!writeChar('<')) goto error;
114              if (!writeChar(c)) goto error;
115            } else if (std::isalpha(static_cast<unsigned char>(c))) {
116              state = State::InTagName;
117              tagName[0] = c;
118              tagNameLen = 1;
119              if (!writeChar('<')) goto error;
120              if (!writeChar(c)) goto error;
121            } else {
122              state = State::Normal;
123              if (!writeChar('<')) goto error;
124              if (!writeChar(c)) goto error;
125            }
126            break;
127  
128          case State::InTagName:
129            if (std::isalnum(static_cast<unsigned char>(c)) || c == '-' || c == ':') {
130              if (tagNameLen < MAX_TAG_NAME_LENGTH) {
131                tagName[tagNameLen++] = c;
132              }
133              if (!writeChar(c)) goto error;
134            } else {
135              // End of tag name
136              tagName[tagNameLen] = '\0';
137              isCurrentTagVoid = isVoidElement(tagName, tagNameLen);
138  
139              if (c == '>') {
140                // Tag ends immediately after name
141                if (isCurrentTagVoid && prevChar != '/') {
142                  if (!writeChar(' ')) goto error;
143                  if (!writeChar('/')) goto error;
144                }
145                if (!writeChar(c)) goto error;
146                state = State::Normal;
147              } else if (std::isspace(static_cast<unsigned char>(c))) {
148                state = State::InTagAttrs;
149                if (!writeChar(c)) goto error;
150              } else if (c == '/') {
151                // Self-closing indicator
152                if (!writeChar(c)) goto error;
153                state = State::InTagAttrs;
154              } else {
155                // Unexpected character
156                if (!writeChar(c)) goto error;
157                state = State::Normal;
158              }
159            }
160            break;
161  
162          case State::InTagAttrs:
163            if (c == '"' || c == '\'') {
164              state = State::InQuote;
165              quoteChar = c;
166              if (!writeChar(c)) goto error;
167            } else if (c == '>') {
168              // End of tag - insert self-closing if needed
169              if (isCurrentTagVoid && prevChar != '/') {
170                if (!writeChar(' ')) goto error;
171                if (!writeChar('/')) goto error;
172              }
173              if (!writeChar(c)) goto error;
174              state = State::Normal;
175            } else {
176              if (!writeChar(c)) goto error;
177            }
178            break;
179  
180          case State::InQuote:
181            if (c == quoteChar) {
182              state = State::InTagAttrs;
183            }
184            if (!writeChar(c)) goto error;
185            break;
186  
187          case State::InClosingTagName:
188            if (std::isalnum(static_cast<unsigned char>(c)) || c == '-' || c == ':') {
189              if (tagNameLen < MAX_TAG_NAME_LENGTH) {
190                tagName[tagNameLen++] = c;
191              } else {
192                // Tag too long to be void - flush buffer and passthrough
193                if (!writeChar('<')) goto error;
194                if (!writeChar('/')) goto error;
195                for (size_t j = 0; j < tagNameLen; j++) {
196                  if (!writeChar(tagName[j])) goto error;
197                }
198                if (!writeChar(c)) goto error;
199                state = State::InClosingTagRest;
200              }
201            } else if (c == '>') {
202              // End of closing tag - check if it's a void element
203              tagName[tagNameLen] = '\0';
204              if (isVoidElement(tagName, tagNameLen)) {
205                // Skip the entire closing tag (don't output anything)
206              } else {
207                // Not a void element - output the buffered "</tagname>" with any whitespace
208                if (!writeChar('<')) goto error;
209                if (!writeChar('/')) goto error;
210                for (size_t j = 0; j < tagNameLen; j++) {
211                  if (!writeChar(tagName[j])) goto error;
212                }
213                for (size_t j = 0; j < closingTagWsLen; j++) {
214                  if (!writeChar(closingTagWhitespace[j])) goto error;
215                }
216                if (!writeChar('>')) goto error;
217              }
218              state = State::Normal;
219            } else if (std::isspace(static_cast<unsigned char>(c))) {
220              // Whitespace before '>' in closing tag (unusual but valid)
221              // Buffer it in case we need to replay for non-void elements
222              if (closingTagWsLen < sizeof(closingTagWhitespace)) {
223                closingTagWhitespace[closingTagWsLen++] = c;
224              }
225            } else {
226              // Unexpected character - output what we have and return to normal
227              if (!writeChar('<')) goto error;
228              if (!writeChar('/')) goto error;
229              for (size_t j = 0; j < tagNameLen; j++) {
230                if (!writeChar(tagName[j])) goto error;
231              }
232              if (!writeChar(c)) goto error;
233              state = State::Normal;
234            }
235            break;
236  
237          case State::InClosingTagRest:
238            if (!writeChar(c)) goto error;
239            if (c == '>') {
240              state = State::Normal;
241            }
242            break;
243        }
244  
245        prevChar = c;
246      }
247    }
248  
249    // Handle EOF - flush any buffered but uncommitted content
250    if (state == State::InTagStart) {
251      // We saw '<' but nothing else
252      if (!writeChar('<')) goto error;
253    } else if (state == State::InClosingTagName) {
254      // We were in the middle of a closing tag - output what we have
255      if (!writeChar('<')) goto error;
256      if (!writeChar('/')) goto error;
257      for (size_t j = 0; j < tagNameLen; j++) {
258        if (!writeChar(tagName[j])) goto error;
259      }
260      for (size_t j = 0; j < closingTagWsLen; j++) {
261        if (!writeChar(closingTagWhitespace[j])) goto error;
262      }
263    }
264  
265    if (!flushWrite()) goto error;
266  
267    inFile.close();
268    outFile.close();
269    return true;
270  
271  error:
272    inFile.close();
273    outFile.close();
274    SdMan.remove(outputPath.c_str());
275    return false;
276  }
277  
278  }  // namespace html5