/ libxml2 / testchar.c
testchar.c
  1  /**
  2   * Test the UTF-8 decoding routines
  3   *
  4   * author: Daniel Veillard
  5   * copy: see Copyright for the status of this software.
  6   */
  7  
  8  #include <stdio.h>
  9  #include <string.h>
 10  #include <libxml/parser.h>
 11  #include <libxml/parserInternals.h>
 12  
 13  #include "buf.h"
 14  
 15  int lastError;
 16  
 17  static void errorHandler(void *unused, xmlErrorPtr err) {
 18      if ((unused == NULL) && (err != NULL) && (lastError == 0)) {
 19          lastError = err->code;
 20      }
 21  }
 22  
/*
 * Document templates for the in-document injection tests. The four 'X'
 * bytes are a placeholder area that testDocumentRanges overwrites in
 * place: it starts at offset 5 in document1 (element content) and at
 * offset 10 in document2 (attribute value).
 */
char document1[100] = "<doc>XXXX</doc>";
char document2[100] = "<doc foo='XXXX'/>";
 25  
 26  static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
 27                    int len,  char *data, int forbid1, int forbid2) {
 28      int i;
 29      xmlDocPtr res;
 30  
 31      for (i = 0;i <= 0xFF;i++) {
 32  	lastError = 0;
 33  	xmlCtxtReset(ctxt);
 34  
 35          data[0] = i;
 36  
 37  	res = xmlReadMemory(document, len, "test", NULL, 0);
 38  
 39  	if ((i == forbid1) || (i == forbid2)) {
 40  	    if ((lastError == 0) || (res != NULL))
 41  	        fprintf(stderr,
 42  		    "Failed to detect invalid char for Byte 0x%02X: %c\n",
 43  		        i, i);
 44  	}
 45  
 46  	else if ((i == '<') || (i == '&')) {
 47  	    if ((lastError == 0) || (res != NULL))
 48  	        fprintf(stderr,
 49  		    "Failed to detect illegal char %c for Byte 0x%02X\n", i, i);
 50  	}
 51  	else if (((i < 0x20) || (i >= 0x80)) &&
 52  	    (i != 0x9) && (i != 0xA) && (i != 0xD)) {
 53  	    if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL))
 54  	        fprintf(stderr,
 55  		    "Failed to detect invalid char for Byte 0x%02X\n", i);
 56  	}
 57  	else if (res == NULL) {
 58  	    fprintf(stderr,
 59  		"Failed to parse valid char for Byte 0x%02X : %c\n", i, i);
 60  	}
 61  	if (res != NULL)
 62  	    xmlFreeDoc(res);
 63      }
 64  }
 65  
 66  static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
 67                    int len,  char *data) {
 68      int i, j;
 69      xmlDocPtr res;
 70  
 71      for (i = 0x80;i <= 0xFF;i++) {
 72      for (j = 0;j <= 0xFF;j++) {
 73  	lastError = 0;
 74  	xmlCtxtReset(ctxt);
 75  
 76          data[0] = i;
 77          data[1] = j;
 78  
 79  	res = xmlReadMemory(document, len, "test", NULL, 0);
 80  
 81  	/* if first bit of first char is set, then second bit must too */
 82  	if ((i & 0x80) && ((i & 0x40) == 0)) {
 83  	    if ((lastError == 0) || (res != NULL))
 84  		fprintf(stderr,
 85  		"Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
 86  			i, j);
 87  	}
 88  
 89  	/*
 90  	 * if first bit of first char is set, then second char first
 91  	 * bits must be 10
 92  	 */
 93  	else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
 94  	    if ((lastError == 0) || (res != NULL))
 95  		fprintf(stderr,
 96  	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
 97  			i, j);
 98  	}
 99  
100  	/*
101  	 * if using a 2 byte encoding then the value must be greater
102  	 * than 0x80, i.e. one of bits 5 to 1 of i must be set
103  	 */
104  	else if ((i & 0x80) && ((i & 0x1E) == 0)) {
105  	    if ((lastError == 0) || (res != NULL))
106  		fprintf(stderr,
107  	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
108  			i, j);
109  	}
110  
111  	/*
112  	 * if third bit of first char is set, then the sequence would need
113  	 * at least 3 bytes, but we give only 2 !
114  	 */
115  	else if ((i & 0xE0) == 0xE0) {
116  	    if ((lastError == 0) || (res != NULL))
117  		fprintf(stderr,
118  	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
119  			i, j);
120  	}
121  
122  	/*
123  	 * We should see no error in remaning cases
124  	 */
125  	else if ((lastError != 0) || (res == NULL)) {
126  	    fprintf(stderr,
127  		"Failed to parse document for Bytes 0x%02X 0x%02X\n", i, j);
128  	}
129  	if (res != NULL)
130  	    xmlFreeDoc(res);
131      }
132      }
133  }
134  
135  /**
136   * testDocumentRanges:
137   *
138   * Test the correct UTF8 character parsing in context of XML documents
139   * Those are in-context injection tests checking the parser behaviour on
140   * edge case values at different point in content, beginning and end of
141   * CDATA in text or in attribute values.
142   */
143  
144  static void testDocumentRanges(void) {
145      xmlParserCtxtPtr ctxt;
146      char *data;
147  
148      /*
149       * Set up a parsing context using the first document as
150       * the current input source.
151       */
152      ctxt = xmlNewParserCtxt();
153      if (ctxt == NULL) {
154          fprintf(stderr, "Failed to allocate parser context\n");
155  	return;
156      }
157  
158      printf("testing 1 byte char in document: 1");
159      fflush(stdout);
160      data = &document1[5];
161      data[0] = ' ';
162      data[1] = ' ';
163      data[2] = ' ';
164      data[3] = ' ';
165      /* test 1 byte injection at beginning of area */
166      testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
167                             data, -1, -1);
168      printf(" 2");
169      fflush(stdout);
170      data[0] = ' ';
171      data[1] = ' ';
172      data[2] = ' ';
173      data[3] = ' ';
174      /* test 1 byte injection at end of area */
175      testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
176                             data + 3, -1, -1);
177  
178      printf(" 3");
179      fflush(stdout);
180      data = &document2[10];
181      data[0] = ' ';
182      data[1] = ' ';
183      data[2] = ' ';
184      data[3] = ' ';
185      /* test 1 byte injection at beginning of area */
186      testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
187                             data, '\'', -1);
188      printf(" 4");
189      fflush(stdout);
190      data[0] = ' ';
191      data[1] = ' ';
192      data[2] = ' ';
193      data[3] = ' ';
194      /* test 1 byte injection at end of area */
195      testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
196                             data + 3, '\'', -1);
197      printf(" done\n");
198  
199      printf("testing 2 byte char in document: 1");
200      fflush(stdout);
201      data = &document1[5];
202      data[0] = ' ';
203      data[1] = ' ';
204      data[2] = ' ';
205      data[3] = ' ';
206      /* test 2 byte injection at beginning of area */
207      testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
208                             data);
209      printf(" 2");
210      fflush(stdout);
211      data[0] = ' ';
212      data[1] = ' ';
213      data[2] = ' ';
214      data[3] = ' ';
215      /* test 2 byte injection at end of area */
216      testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
217                             data + 2);
218  
219      printf(" 3");
220      fflush(stdout);
221      data = &document2[10];
222      data[0] = ' ';
223      data[1] = ' ';
224      data[2] = ' ';
225      data[3] = ' ';
226      /* test 2 byte injection at beginning of area */
227      testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
228                             data);
229      printf(" 4");
230      fflush(stdout);
231      data[0] = ' ';
232      data[1] = ' ';
233      data[2] = ' ';
234      data[3] = ' ';
235      /* test 2 byte injection at end of area */
236      testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
237                             data + 2);
238      printf(" done\n");
239  
240      xmlFreeParserCtxt(ctxt);
241  }
242  
243  static void testCharRangeByte1(xmlParserCtxtPtr ctxt, char *data) {
244      int i = 0;
245      int len, c;
246  
247      data[1] = 0;
248      data[2] = 0;
249      data[3] = 0;
250      for (i = 0;i <= 0xFF;i++) {
251          data[0] = i;
252  	ctxt->charset = XML_CHAR_ENCODING_UTF8;
253  
254  	lastError = 0;
255          c = xmlCurrentChar(ctxt, &len);
256  	if ((i == 0) || (i >= 0x80)) {
257  	    /* we must see an error there */
258  	    if (lastError != XML_ERR_INVALID_CHAR)
259  	        fprintf(stderr,
260  		    "Failed to detect invalid char for Byte 0x%02X\n", i);
261  	} else if (i == 0xD) {
262  	    if ((c != 0xA) || (len != 1))
263  		fprintf(stderr, "Failed to convert char for Byte 0x%02X\n", i);
264  	} else if ((c != i) || (len != 1)) {
265  	    fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", i);
266  	}
267      }
268  }
269  
270  static void testCharRangeByte2(xmlParserCtxtPtr ctxt, char *data) {
271      int i, j;
272      int len, c;
273  
274      data[2] = 0;
275      data[3] = 0;
276      for (i = 0x80;i <= 0xFF;i++) {
277  	for (j = 0;j <= 0xFF;j++) {
278  	    data[0] = i;
279  	    data[1] = j;
280  	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
281  
282  	    lastError = 0;
283  	    c = xmlCurrentChar(ctxt, &len);
284  
285  	    /* if first bit of first char is set, then second bit must too */
286  	    if ((i & 0x80) && ((i & 0x40) == 0)) {
287  		if (lastError != XML_ERR_INVALID_CHAR)
288  		    fprintf(stderr,
289  		    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
290  		            i, j);
291  	    }
292  
293  	    /*
294  	     * if first bit of first char is set, then second char first
295  	     * bits must be 10
296  	     */
297  	    else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
298  		if (lastError != XML_ERR_INVALID_CHAR)
299  		    fprintf(stderr,
300  		"Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
301  		            i, j, c);
302  	    }
303  
304  	    /*
305  	     * if using a 2 byte encoding then the value must be greater
306  	     * than 0x80, i.e. one of bits 5 to 1 of i must be set
307  	     */
308  	    else if ((i & 0x80) && ((i & 0x1E) == 0)) {
309  		if (lastError != XML_ERR_INVALID_CHAR)
310  		    fprintf(stderr,
311  		"Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
312  		            i, j, c);
313  	    }
314  
315  	    /*
316  	     * if third bit of first char is set, then the sequence would need
317  	     * at least 3 bytes, but we give only 2 !
318  	     */
319  	    else if ((i & 0xE0) == 0xE0) {
320  		if (lastError != XML_ERR_INVALID_CHAR)
321  		    fprintf(stderr,
322  		"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
323  		            i, j);
324  	    }
325  
326              /*
327  	     * We should see no error in remaning cases
328  	     */
329  	    else if ((lastError != 0) || (len != 2)) {
330  		fprintf(stderr,
331  		    "Failed to parse char for Bytes 0x%02X 0x%02X\n", i, j);
332  	    }
333  
334              /*
335  	     * Finally check the value is right
336  	     */
337  	    else if (c != (j & 0x3F) + ((i & 0x1F) << 6)) {
338  		fprintf(stderr,
339  	"Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
340  	                i, j, ((j & 0x3F) + ((i & 0x1F) << 6)), c);
341  	    }
342          }
343      }
344  }
345  
346  static void testCharRangeByte3(xmlParserCtxtPtr ctxt, char *data) {
347      int i, j, k, K;
348      int len, c;
349      unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
350      int value;
351  
352      data[3] = 0;
353      for (i = 0xE0;i <= 0xFF;i++) {
354      for (j = 0;j <= 0xFF;j++) {
355      for (k = 0;k < 6;k++) {
356  	data[0] = i;
357  	data[1] = j;
358  	K = lows[k];
359  	data[2] = (char) K;
360  	value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
361  	ctxt->charset = XML_CHAR_ENCODING_UTF8;
362  
363  	lastError = 0;
364  	c = xmlCurrentChar(ctxt, &len);
365  
366  	/*
367  	 * if fourth bit of first char is set, then the sequence would need
368  	 * at least 4 bytes, but we give only 3 !
369  	 */
370  	if ((i & 0xF0) == 0xF0) {
371  	    if (lastError != XML_ERR_INVALID_CHAR)
372  		fprintf(stderr,
373  	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
374  			i, j, K, data[3]);
375  	}
376  
377          /*
378  	 * The second and the third bytes must start with 10
379  	 */
380  	else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
381  	    if (lastError != XML_ERR_INVALID_CHAR)
382  		fprintf(stderr,
383  	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
384  			i, j, K);
385  	}
386  
387  	/*
388  	 * if using a 3 byte encoding then the value must be greater
389  	 * than 0x800, i.e. one of bits 4 to 0 of i must be set or
390  	 * the 6th byte of data[1] must be set
391  	 */
392  	else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
393  	    if (lastError != XML_ERR_INVALID_CHAR)
394  		fprintf(stderr,
395  	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
396  			i, j, K);
397  	}
398  
399          /*
400  	 * There are values in that range that are not allowed in XML-1.0
401  	 */
402  	else if (((value > 0xD7FF) && (value <0xE000)) ||
403  	         ((value > 0xFFFD) && (value <0x10000))) {
404  	    if (lastError != XML_ERR_INVALID_CHAR)
405  		fprintf(stderr,
406  	"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
407  			value, i, j, K);
408  	}
409  
410  	/*
411  	 * We should see no error in remaining cases
412  	 */
413  	else if ((lastError != 0) || (len != 3)) {
414  	    fprintf(stderr,
415  		"Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
416  		    i, j, K);
417  	}
418  
419  	/*
420  	 * Finally check the value is right
421  	 */
422  	else if (c != value) {
423  	    fprintf(stderr,
424      "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
425  		i, j, data[2], value, c);
426  	}
427      }
428      }
429      }
430  }
431  
432  static void testCharRangeByte4(xmlParserCtxtPtr ctxt, char *data) {
433      int i, j, k, K, l, L;
434      int len, c;
435      unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
436      int value;
437  
438      data[4] = 0;
439      for (i = 0xF0;i <= 0xFF;i++) {
440      for (j = 0;j <= 0xFF;j++) {
441      for (k = 0;k < 6;k++) {
442      for (l = 0;l < 6;l++) {
443  	data[0] = i;
444  	data[1] = j;
445  	K = lows[k];
446  	data[2] = (char) K;
447  	L = lows[l];
448  	data[3] = (char) L;
449  	value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) +
450  	        ((i & 0x7) << 18);
451  	ctxt->charset = XML_CHAR_ENCODING_UTF8;
452  
453  	lastError = 0;
454  	c = xmlCurrentChar(ctxt, &len);
455  
456  	/*
457  	 * if fifth bit of first char is set, then the sequence would need
458  	 * at least 5 bytes, but we give only 4 !
459  	 */
460  	if ((i & 0xF8) == 0xF8) {
461  	    if (lastError != XML_ERR_INVALID_CHAR)
462  		fprintf(stderr,
463    "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
464  			i, j, K, data[3]);
465  	}
466  
467          /*
468  	 * The second, third and fourth bytes must start with 10
469  	 */
470  	else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
471  	         ((L & 0xC0) != 0x80)) {
472  	    if (lastError != XML_ERR_INVALID_CHAR)
473  		fprintf(stderr,
474  	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
475  			i, j, K, L);
476  	}
477  
478  	/*
479  	 * if using a 3 byte encoding then the value must be greater
480  	 * than 0x10000, i.e. one of bits 3 to 0 of i must be set or
481  	 * the 6 or 5th byte of j must be set
482  	 */
483  	else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
484  	    if (lastError != XML_ERR_INVALID_CHAR)
485  		fprintf(stderr,
486  	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
487  			i, j, K, L);
488  	}
489  
490          /*
491  	 * There are values in that range that are not allowed in XML-1.0
492  	 */
493  	else if (((value > 0xD7FF) && (value <0xE000)) ||
494  	         ((value > 0xFFFD) && (value <0x10000)) ||
495  		 (value > 0x10FFFF)) {
496  	    if (lastError != XML_ERR_INVALID_CHAR)
497  		fprintf(stderr,
498  "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
499  			value, i, j, K, L);
500  	}
501  
502  	/*
503  	 * We should see no error in remaining cases
504  	 */
505  	else if ((lastError != 0) || (len != 4)) {
506  	    fprintf(stderr,
507  		"Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
508  		    i, j, K);
509  	}
510  
511  	/*
512  	 * Finally check the value is right
513  	 */
514  	else if (c != value) {
515  	    fprintf(stderr,
516      "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
517  		i, j, data[2], value, c);
518  	}
519      }
520      }
521      }
522      }
523  }
524  
525  /**
526   * testCharRanges:
527   *
528   * Test the correct UTF8 character parsing in isolation i.e.
529   * not when parsing a full document, this is less expensive and we can
530   * cover the full range of UTF-8 chars accepted by XML-1.0
531   */
532  
533  static void testCharRanges(void) {
534      char data[5];
535      xmlParserCtxtPtr ctxt;
536      xmlParserInputBufferPtr buf;
537      xmlParserInputPtr input;
538  
539      memset(data, 0, 5);
540  
541      /*
542       * Set up a parsing context using the above data buffer as
543       * the current input source.
544       */
545      ctxt = xmlNewParserCtxt();
546      if (ctxt == NULL) {
547          fprintf(stderr, "Failed to allocate parser context\n");
548  	return;
549      }
550      buf = xmlParserInputBufferCreateStatic(data, sizeof(data),
551                                             XML_CHAR_ENCODING_NONE);
552      if (buf == NULL) {
553          fprintf(stderr, "Failed to allocate input buffer\n");
554  	goto error;
555      }
556      input = xmlNewInputStream(ctxt);
557      if (input == NULL) {
558          xmlFreeParserInputBuffer(buf);
559  	goto error;
560      }
561      input->filename = NULL;
562      input->buf = buf;
563      input->cur =
564      input->base = xmlBufContent(input->buf->buffer);
565      input->end = input->base + 4;
566      inputPush(ctxt, input);
567  
568      printf("testing char range: 1");
569      fflush(stdout);
570      testCharRangeByte1(ctxt, data);
571      printf(" 2");
572      fflush(stdout);
573      testCharRangeByte2(ctxt, data);
574      printf(" 3");
575      fflush(stdout);
576      testCharRangeByte3(ctxt, data);
577      printf(" 4");
578      fflush(stdout);
579      testCharRangeByte4(ctxt, data);
580      printf(" done\n");
581      fflush(stdout);
582  
583  error:
584      xmlFreeParserCtxt(ctxt);
585  }
586  
587  int main(void) {
588  
589      /*
590       * this initialize the library and check potential ABI mismatches
591       * between the version it was compiled for and the actual shared
592       * library used.
593       */
594      LIBXML_TEST_VERSION
595  
596      /*
597       * Catch errors separately
598       */
599  
600      xmlSetStructuredErrorFunc(NULL, errorHandler);
601  
602      /*
603       * Run the tests
604       */
605      testCharRanges();
606      testDocumentRanges();
607  
608      /*
609       * Cleanup function for the XML library.
610       */
611      xmlCleanupParser();
612      /*
613       * this is to debug memory for regression tests
614       */
615      xmlMemoryDump();
616      return(0);
617  }