xmlregexp.c
1 /* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications these include: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard@redhat.com> 15 */ 16 17 #define IN_LIBXML 18 #include "libxml.h" 19 20 #ifdef LIBXML_REGEXP_ENABLED 21 22 /* #define DEBUG_ERR */ 23 24 #include <stdio.h> 25 #include <string.h> 26 #ifdef HAVE_LIMITS_H 27 #include <limits.h> 28 #endif 29 #ifdef HAVE_STDINT_H 30 #include <stdint.h> 31 #endif 32 33 #include <libxml/tree.h> 34 #include <libxml/parserInternals.h> 35 #include <libxml/xmlregexp.h> 36 #include <libxml/xmlautomata.h> 37 #include <libxml/xmlunicode.h> 38 39 #ifndef INT_MAX 40 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */ 41 #endif 42 #ifndef SIZE_MAX 43 #define SIZE_MAX ((size_t) -1) 44 #endif 45 46 /* #define DEBUG_REGEXP_GRAPH */ 47 /* #define DEBUG_REGEXP_EXEC */ 48 /* #define DEBUG_PUSH */ 49 /* #define DEBUG_COMPACTION */ 50 51 #define MAX_PUSH 10000000 52 53 #ifdef ERROR 54 #undef ERROR 55 #endif 56 #define ERROR(str) \ 57 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 58 xmlRegexpErrCompile(ctxt, str); 59 #define NEXT ctxt->cur++ 60 #define CUR (*(ctxt->cur)) 61 #define NXT(index) (ctxt->cur[index]) 62 63 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 64 #define NEXTL(l) ctxt->cur += l; 65 #define XML_REG_STRING_SEPARATOR '|' 66 /* 67 * Need PREV to check on a '-' within a Character Group. May only be used 68 * when it's guaranteed that cur is not at the beginning of ctxt->string! 69 */ 70 #define PREV (ctxt->cur[-1]) 71 72 /** 73 * TODO: 74 * 75 * macro to flag unimplemented blocks 76 */ 77 #define TODO \ 78 xmlGenericError(xmlGenericErrorContext, \ 79 "Unimplemented block at %s:%d\n", \ 80 __FILE__, __LINE__); 81 82 /************************************************************************ 83 * * 84 * Datatypes and structures * 85 * * 86 ************************************************************************/ 87 88 /* 89 * Note: the order of the enums below is significant, do not shuffle 90 */ 91 typedef enum { 92 XML_REGEXP_EPSILON = 1, 93 XML_REGEXP_CHARVAL, 94 XML_REGEXP_RANGES, 95 XML_REGEXP_SUBREG, /* used for () sub regexps */ 96 XML_REGEXP_STRING, 97 XML_REGEXP_ANYCHAR, /* . */ 98 XML_REGEXP_ANYSPACE, /* \s */ 99 XML_REGEXP_NOTSPACE, /* \S */ 100 XML_REGEXP_INITNAME, /* \l */ 101 XML_REGEXP_NOTINITNAME, /* \L */ 102 XML_REGEXP_NAMECHAR, /* \c */ 103 XML_REGEXP_NOTNAMECHAR, /* \C */ 104 XML_REGEXP_DECIMAL, /* \d */ 105 XML_REGEXP_NOTDECIMAL, /* \D */ 106 XML_REGEXP_REALCHAR, /* \w */ 107 XML_REGEXP_NOTREALCHAR, /* \W */ 108 XML_REGEXP_LETTER = 100, 109 XML_REGEXP_LETTER_UPPERCASE, 110 XML_REGEXP_LETTER_LOWERCASE, 111 XML_REGEXP_LETTER_TITLECASE, 112 XML_REGEXP_LETTER_MODIFIER, 113 XML_REGEXP_LETTER_OTHERS, 114 XML_REGEXP_MARK, 115 XML_REGEXP_MARK_NONSPACING, 116 XML_REGEXP_MARK_SPACECOMBINING, 117 XML_REGEXP_MARK_ENCLOSING, 118 XML_REGEXP_NUMBER, 119 XML_REGEXP_NUMBER_DECIMAL, 120 XML_REGEXP_NUMBER_LETTER, 121 XML_REGEXP_NUMBER_OTHERS, 122 XML_REGEXP_PUNCT, 123 XML_REGEXP_PUNCT_CONNECTOR, 124 XML_REGEXP_PUNCT_DASH, 125 XML_REGEXP_PUNCT_OPEN, 126 XML_REGEXP_PUNCT_CLOSE, 127 XML_REGEXP_PUNCT_INITQUOTE, 128 XML_REGEXP_PUNCT_FINQUOTE, 129 XML_REGEXP_PUNCT_OTHERS, 130 XML_REGEXP_SEPAR, 131 XML_REGEXP_SEPAR_SPACE, 132 XML_REGEXP_SEPAR_LINE, 133 XML_REGEXP_SEPAR_PARA, 134 XML_REGEXP_SYMBOL, 135 XML_REGEXP_SYMBOL_MATH, 136 XML_REGEXP_SYMBOL_CURRENCY, 137 XML_REGEXP_SYMBOL_MODIFIER, 138 XML_REGEXP_SYMBOL_OTHERS, 139 XML_REGEXP_OTHER, 140 XML_REGEXP_OTHER_CONTROL, 141 XML_REGEXP_OTHER_FORMAT, 142 XML_REGEXP_OTHER_PRIVATE, 143 XML_REGEXP_OTHER_NA, 144 XML_REGEXP_BLOCK_NAME 145 } xmlRegAtomType; 146 147 typedef enum { 148 XML_REGEXP_QUANT_EPSILON = 1, 149 XML_REGEXP_QUANT_ONCE, 150 XML_REGEXP_QUANT_OPT, 151 XML_REGEXP_QUANT_MULT, 152 XML_REGEXP_QUANT_PLUS, 153 XML_REGEXP_QUANT_ONCEONLY, 154 XML_REGEXP_QUANT_ALL, 155 XML_REGEXP_QUANT_RANGE 156 } xmlRegQuantType; 157 158 typedef enum { 159 XML_REGEXP_START_STATE = 1, 160 XML_REGEXP_FINAL_STATE, 161 XML_REGEXP_TRANS_STATE, 162 XML_REGEXP_SINK_STATE, 163 XML_REGEXP_UNREACH_STATE 164 } xmlRegStateType; 165 166 typedef enum { 167 XML_REGEXP_MARK_NORMAL = 0, 168 XML_REGEXP_MARK_START, 169 XML_REGEXP_MARK_VISITED 170 } xmlRegMarkedType; 171 172 typedef struct _xmlRegRange xmlRegRange; 173 typedef xmlRegRange *xmlRegRangePtr; 174 175 struct _xmlRegRange { 176 int neg; /* 0 normal, 1 not, 2 exclude */ 177 xmlRegAtomType type; 178 int start; 179 int end; 180 xmlChar *blockName; 181 }; 182 183 typedef struct _xmlRegAtom xmlRegAtom; 184 typedef xmlRegAtom *xmlRegAtomPtr; 185 186 typedef struct _xmlAutomataState xmlRegState; 187 typedef xmlRegState *xmlRegStatePtr; 188 189 struct _xmlRegAtom { 190 int no; 191 xmlRegAtomType type; 192 xmlRegQuantType quant; 193 int min; 194 int max; 195 196 void *valuep; 197 void *valuep2; 198 int neg; 199 int codepoint; 200 xmlRegStatePtr start; 201 xmlRegStatePtr start0; 202 xmlRegStatePtr stop; 203 int maxRanges; 204 int nbRanges; 205 xmlRegRangePtr *ranges; 206 void *data; 207 }; 208 209 typedef struct _xmlRegCounter xmlRegCounter; 210 typedef xmlRegCounter *xmlRegCounterPtr; 211 212 struct _xmlRegCounter { 213 int min; 214 int max; 215 }; 216 217 typedef struct _xmlRegTrans xmlRegTrans; 218 typedef xmlRegTrans *xmlRegTransPtr; 219 220 struct _xmlRegTrans { 221 xmlRegAtomPtr atom; 222 int to; 223 int counter; 224 int count; 225 int nd; 226 }; 227 228 struct _xmlAutomataState { 229 xmlRegStateType type; 230 xmlRegMarkedType mark; 231 xmlRegMarkedType markd; 232 xmlRegMarkedType reached; 233 int no; 234 int maxTrans; 235 int nbTrans; 236 xmlRegTrans *trans; 237 /* knowing states ponting to us can speed things up */ 238 int maxTransTo; 239 int nbTransTo; 240 int *transTo; 241 }; 242 243 typedef struct _xmlAutomata xmlRegParserCtxt; 244 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 245 246 #define AM_AUTOMATA_RNG 1 247 248 struct _xmlAutomata { 249 xmlChar *string; 250 xmlChar *cur; 251 252 int error; 253 int neg; 254 255 xmlRegStatePtr start; 256 xmlRegStatePtr end; 257 xmlRegStatePtr state; 258 259 xmlRegAtomPtr atom; 260 261 int maxAtoms; 262 int nbAtoms; 263 xmlRegAtomPtr *atoms; 264 265 int maxStates; 266 int nbStates; 267 xmlRegStatePtr *states; 268 269 int maxCounters; 270 int nbCounters; 271 xmlRegCounter *counters; 272 273 int determinist; 274 int negs; 275 int flags; 276 }; 277 278 struct _xmlRegexp { 279 xmlChar *string; 280 int nbStates; 281 xmlRegStatePtr *states; 282 int nbAtoms; 283 xmlRegAtomPtr *atoms; 284 int nbCounters; 285 xmlRegCounter *counters; 286 int determinist; 287 int flags; 288 /* 289 * That's the compact form for determinists automatas 290 */ 291 int nbstates; 292 int *compact; 293 void **transdata; 294 int nbstrings; 295 xmlChar **stringMap; 296 }; 297 298 typedef struct _xmlRegExecRollback xmlRegExecRollback; 299 typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 300 301 struct _xmlRegExecRollback { 302 xmlRegStatePtr state;/* the current state */ 303 int index; /* the index in the input stack */ 304 int nextbranch; /* the next transition to explore in that state */ 305 int *counts; /* save the automata state if it has some */ 306 }; 307 308 typedef struct _xmlRegInputToken xmlRegInputToken; 309 typedef xmlRegInputToken *xmlRegInputTokenPtr; 310 311 struct _xmlRegInputToken { 312 xmlChar *value; 313 void *data; 314 }; 315 316 struct _xmlRegExecCtxt { 317 int status; /* execution status != 0 indicate an error */ 318 int determinist; /* did we find an indeterministic behaviour */ 319 xmlRegexpPtr comp; /* the compiled regexp */ 320 xmlRegExecCallbacks callback; 321 void *data; 322 323 xmlRegStatePtr state;/* the current state */ 324 int transno; /* the current transition on that state */ 325 int transcount; /* the number of chars in char counted transitions */ 326 327 /* 328 * A stack of rollback states 329 */ 330 int maxRollbacks; 331 int nbRollbacks; 332 xmlRegExecRollback *rollbacks; 333 334 /* 335 * The state of the automata if any 336 */ 337 int *counts; 338 339 /* 340 * The input stack 341 */ 342 int inputStackMax; 343 int inputStackNr; 344 int index; 345 int *charStack; 346 const xmlChar *inputString; /* when operating on characters */ 347 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 348 349 /* 350 * error handling 351 */ 352 int errStateNo; /* the error state number */ 353 xmlRegStatePtr errState; /* the error state */ 354 xmlChar *errString; /* the string raising the error */ 355 int *errCounts; /* counters at the error state */ 356 int nbPush; 357 }; 358 359 #define REGEXP_ALL_COUNTER 0x123456 360 #define REGEXP_ALL_LAX_COUNTER 0x123457 361 362 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 363 static void xmlRegFreeState(xmlRegStatePtr state); 364 static void xmlRegFreeAtom(xmlRegAtomPtr atom); 365 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); 366 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); 367 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, 368 int neg, int start, int end, const xmlChar *blockName); 369 370 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags); 371 372 /************************************************************************ 373 * * 374 * Regexp memory error handler * 375 * * 376 ************************************************************************/ 377 /** 378 * xmlRegexpErrMemory: 379 * @extra: extra information 380 * 381 * Handle an out of memory condition 382 */ 383 static void 384 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 385 { 386 const char *regexp = NULL; 387 if (ctxt != NULL) { 388 regexp = (const char *) ctxt->string; 389 ctxt->error = XML_ERR_NO_MEMORY; 390 } 391 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 392 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 393 regexp, NULL, 0, 0, 394 "Memory allocation failed : %s\n", extra); 395 } 396 397 /** 398 * xmlRegexpErrCompile: 399 * @extra: extra information 400 * 401 * Handle a compilation failure 402 */ 403 static void 404 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 405 { 406 const char *regexp = NULL; 407 int idx = 0; 408 409 if (ctxt != NULL) { 410 regexp = (const char *) ctxt->string; 411 idx = ctxt->cur - ctxt->string; 412 ctxt->error = XML_REGEXP_COMPILE_ERROR; 413 } 414 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 415 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 416 regexp, NULL, idx, 0, 417 "failed to compile: %s\n", extra); 418 } 419 420 /************************************************************************ 421 * * 422 * Allocation/Deallocation * 423 * * 424 ************************************************************************/ 425 426 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 427 428 /** 429 * xmlRegCalloc2: 430 * @dim1: size of first dimension 431 * @dim2: size of second dimension 432 * @elemSize: size of element 433 * 434 * Allocate a two-dimensional array and set all elements to zero. 435 * 436 * Returns the new array or NULL in case of error. 437 */ 438 static void* 439 xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) { 440 size_t totalSize; 441 void *ret; 442 443 /* Check for overflow */ 444 if (dim1 > SIZE_MAX / dim2 / elemSize) 445 return (NULL); 446 totalSize = dim1 * dim2 * elemSize; 447 ret = xmlMalloc(totalSize); 448 if (ret != NULL) 449 memset(ret, 0, totalSize); 450 return (ret); 451 } 452 453 /** 454 * xmlRegEpxFromParse: 455 * @ctxt: the parser context used to build it 456 * 457 * Allocate a new regexp and fill it with the result from the parser 458 * 459 * Returns the new regexp or NULL in case of error 460 */ 461 static xmlRegexpPtr 462 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 463 xmlRegexpPtr ret; 464 465 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 466 if (ret == NULL) { 467 xmlRegexpErrMemory(ctxt, "compiling regexp"); 468 return(NULL); 469 } 470 memset(ret, 0, sizeof(xmlRegexp)); 471 ret->string = ctxt->string; 472 ret->nbStates = ctxt->nbStates; 473 ret->states = ctxt->states; 474 ret->nbAtoms = ctxt->nbAtoms; 475 ret->atoms = ctxt->atoms; 476 ret->nbCounters = ctxt->nbCounters; 477 ret->counters = ctxt->counters; 478 ret->determinist = ctxt->determinist; 479 ret->flags = ctxt->flags; 480 if (ret->determinist == -1) { 481 xmlRegexpIsDeterminist(ret); 482 } 483 484 if ((ret->determinist != 0) && 485 (ret->nbCounters == 0) && 486 (ctxt->negs == 0) && 487 (ret->atoms != NULL) && 488 (ret->atoms[0] != NULL) && 489 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 490 int i, j, nbstates = 0, nbatoms = 0; 491 int *stateRemap; 492 int *stringRemap; 493 int *transitions; 494 void **transdata; 495 xmlChar **stringMap; 496 xmlChar *value; 497 498 /* 499 * Switch to a compact representation 500 * 1/ counting the effective number of states left 501 * 2/ counting the unique number of atoms, and check that 502 * they are all of the string type 503 * 3/ build a table state x atom for the transitions 504 */ 505 506 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 507 if (stateRemap == NULL) { 508 xmlRegexpErrMemory(ctxt, "compiling regexp"); 509 xmlFree(ret); 510 return(NULL); 511 } 512 for (i = 0;i < ret->nbStates;i++) { 513 if (ret->states[i] != NULL) { 514 stateRemap[i] = nbstates; 515 nbstates++; 516 } else { 517 stateRemap[i] = -1; 518 } 519 } 520 #ifdef DEBUG_COMPACTION 521 printf("Final: %d states\n", nbstates); 522 #endif 523 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 524 if (stringMap == NULL) { 525 xmlRegexpErrMemory(ctxt, "compiling regexp"); 526 xmlFree(stateRemap); 527 xmlFree(ret); 528 return(NULL); 529 } 530 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 531 if (stringRemap == NULL) { 532 xmlRegexpErrMemory(ctxt, "compiling regexp"); 533 xmlFree(stringMap); 534 xmlFree(stateRemap); 535 xmlFree(ret); 536 return(NULL); 537 } 538 for (i = 0;i < ret->nbAtoms;i++) { 539 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 540 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 541 value = ret->atoms[i]->valuep; 542 for (j = 0;j < nbatoms;j++) { 543 if (xmlStrEqual(stringMap[j], value)) { 544 stringRemap[i] = j; 545 break; 546 } 547 } 548 if (j >= nbatoms) { 549 stringRemap[i] = nbatoms; 550 stringMap[nbatoms] = xmlStrdup(value); 551 if (stringMap[nbatoms] == NULL) { 552 for (i = 0;i < nbatoms;i++) 553 xmlFree(stringMap[i]); 554 xmlFree(stringRemap); 555 xmlFree(stringMap); 556 xmlFree(stateRemap); 557 xmlFree(ret); 558 return(NULL); 559 } 560 nbatoms++; 561 } 562 } else { 563 xmlFree(stateRemap); 564 xmlFree(stringRemap); 565 for (i = 0;i < nbatoms;i++) 566 xmlFree(stringMap[i]); 567 xmlFree(stringMap); 568 xmlFree(ret); 569 return(NULL); 570 } 571 } 572 #ifdef DEBUG_COMPACTION 573 printf("Final: %d atoms\n", nbatoms); 574 #endif 575 transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1, 576 sizeof(int)); 577 if (transitions == NULL) { 578 xmlFree(stateRemap); 579 xmlFree(stringRemap); 580 xmlFree(stringMap); 581 xmlFree(ret); 582 return(NULL); 583 } 584 585 /* 586 * Allocate the transition table. The first entry for each 587 * state corresponds to the state type. 588 */ 589 transdata = NULL; 590 591 for (i = 0;i < ret->nbStates;i++) { 592 int stateno, atomno, targetno, prev; 593 xmlRegStatePtr state; 594 xmlRegTransPtr trans; 595 596 stateno = stateRemap[i]; 597 if (stateno == -1) 598 continue; 599 state = ret->states[i]; 600 601 transitions[stateno * (nbatoms + 1)] = state->type; 602 603 for (j = 0;j < state->nbTrans;j++) { 604 trans = &(state->trans[j]); 605 if ((trans->to == -1) || (trans->atom == NULL)) 606 continue; 607 atomno = stringRemap[trans->atom->no]; 608 if ((trans->atom->data != NULL) && (transdata == NULL)) { 609 transdata = (void **) xmlRegCalloc2(nbstates, nbatoms, 610 sizeof(void *)); 611 if (transdata == NULL) { 612 xmlRegexpErrMemory(ctxt, "compiling regexp"); 613 break; 614 } 615 } 616 targetno = stateRemap[trans->to]; 617 /* 618 * if the same atom can generate transitions to 2 different 619 * states then it means the automata is not determinist and 620 * the compact form can't be used ! 621 */ 622 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 623 if (prev != 0) { 624 if (prev != targetno + 1) { 625 ret->determinist = 0; 626 #ifdef DEBUG_COMPACTION 627 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 628 i, j, trans->atom->no, trans->to, atomno, targetno); 629 printf(" previous to is %d\n", prev); 630 #endif 631 if (transdata != NULL) 632 xmlFree(transdata); 633 xmlFree(transitions); 634 xmlFree(stateRemap); 635 xmlFree(stringRemap); 636 for (i = 0;i < nbatoms;i++) 637 xmlFree(stringMap[i]); 638 xmlFree(stringMap); 639 goto not_determ; 640 } 641 } else { 642 #if 0 643 printf("State %d trans %d: atom %d to %d : %d to %d\n", 644 i, j, trans->atom->no, trans->to, atomno, targetno); 645 #endif 646 transitions[stateno * (nbatoms + 1) + atomno + 1] = 647 targetno + 1; /* to avoid 0 */ 648 if (transdata != NULL) 649 transdata[stateno * nbatoms + atomno] = 650 trans->atom->data; 651 } 652 } 653 } 654 ret->determinist = 1; 655 #ifdef DEBUG_COMPACTION 656 /* 657 * Debug 658 */ 659 for (i = 0;i < nbstates;i++) { 660 for (j = 0;j < nbatoms + 1;j++) { 661 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 662 } 663 printf("\n"); 664 } 665 printf("\n"); 666 #endif 667 /* 668 * Cleanup of the old data 669 */ 670 if (ret->states != NULL) { 671 for (i = 0;i < ret->nbStates;i++) 672 xmlRegFreeState(ret->states[i]); 673 xmlFree(ret->states); 674 } 675 ret->states = NULL; 676 ret->nbStates = 0; 677 if (ret->atoms != NULL) { 678 for (i = 0;i < ret->nbAtoms;i++) 679 xmlRegFreeAtom(ret->atoms[i]); 680 xmlFree(ret->atoms); 681 } 682 ret->atoms = NULL; 683 ret->nbAtoms = 0; 684 685 ret->compact = transitions; 686 ret->transdata = transdata; 687 ret->stringMap = stringMap; 688 ret->nbstrings = nbatoms; 689 ret->nbstates = nbstates; 690 xmlFree(stateRemap); 691 xmlFree(stringRemap); 692 } 693 not_determ: 694 ctxt->string = NULL; 695 ctxt->nbStates = 0; 696 ctxt->states = NULL; 697 ctxt->nbAtoms = 0; 698 ctxt->atoms = NULL; 699 ctxt->nbCounters = 0; 700 ctxt->counters = NULL; 701 return(ret); 702 } 703 704 /** 705 * xmlRegNewParserCtxt: 706 * @string: the string to parse 707 * 708 * Allocate a new regexp parser context 709 * 710 * Returns the new context or NULL in case of error 711 */ 712 static xmlRegParserCtxtPtr 713 xmlRegNewParserCtxt(const xmlChar *string) { 714 xmlRegParserCtxtPtr ret; 715 716 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 717 if (ret == NULL) 718 return(NULL); 719 memset(ret, 0, sizeof(xmlRegParserCtxt)); 720 if (string != NULL) 721 ret->string = xmlStrdup(string); 722 ret->cur = ret->string; 723 ret->neg = 0; 724 ret->negs = 0; 725 ret->error = 0; 726 ret->determinist = -1; 727 return(ret); 728 } 729 730 /** 731 * xmlRegNewRange: 732 * @ctxt: the regexp parser context 733 * @neg: is that negative 734 * @type: the type of range 735 * @start: the start codepoint 736 * @end: the end codepoint 737 * 738 * Allocate a new regexp range 739 * 740 * Returns the new range or NULL in case of error 741 */ 742 static xmlRegRangePtr 743 xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 744 int neg, xmlRegAtomType type, int start, int end) { 745 xmlRegRangePtr ret; 746 747 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 748 if (ret == NULL) { 749 xmlRegexpErrMemory(ctxt, "allocating range"); 750 return(NULL); 751 } 752 ret->neg = neg; 753 ret->type = type; 754 ret->start = start; 755 ret->end = end; 756 return(ret); 757 } 758 759 /** 760 * xmlRegFreeRange: 761 * @range: the regexp range 762 * 763 * Free a regexp range 764 */ 765 static void 766 xmlRegFreeRange(xmlRegRangePtr range) { 767 if (range == NULL) 768 return; 769 770 if (range->blockName != NULL) 771 xmlFree(range->blockName); 772 xmlFree(range); 773 } 774 775 /** 776 * xmlRegCopyRange: 777 * @range: the regexp range 778 * 779 * Copy a regexp range 780 * 781 * Returns the new copy or NULL in case of error. 782 */ 783 static xmlRegRangePtr 784 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) { 785 xmlRegRangePtr ret; 786 787 if (range == NULL) 788 return(NULL); 789 790 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start, 791 range->end); 792 if (ret == NULL) 793 return(NULL); 794 if (range->blockName != NULL) { 795 ret->blockName = xmlStrdup(range->blockName); 796 if (ret->blockName == NULL) { 797 xmlRegexpErrMemory(ctxt, "allocating range"); 798 xmlRegFreeRange(ret); 799 return(NULL); 800 } 801 } 802 return(ret); 803 } 804 805 /** 806 * xmlRegNewAtom: 807 * @ctxt: the regexp parser context 808 * @type: the type of atom 809 * 810 * Allocate a new atom 811 * 812 * Returns the new atom or NULL in case of error 813 */ 814 static xmlRegAtomPtr 815 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 816 xmlRegAtomPtr ret; 817 818 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 819 if (ret == NULL) { 820 xmlRegexpErrMemory(ctxt, "allocating atom"); 821 return(NULL); 822 } 823 memset(ret, 0, sizeof(xmlRegAtom)); 824 ret->type = type; 825 ret->quant = XML_REGEXP_QUANT_ONCE; 826 ret->min = 0; 827 ret->max = 0; 828 return(ret); 829 } 830 831 /** 832 * xmlRegFreeAtom: 833 * @atom: the regexp atom 834 * 835 * Free a regexp atom 836 */ 837 static void 838 xmlRegFreeAtom(xmlRegAtomPtr atom) { 839 int i; 840 841 if (atom == NULL) 842 return; 843 844 for (i = 0;i < atom->nbRanges;i++) 845 xmlRegFreeRange(atom->ranges[i]); 846 if (atom->ranges != NULL) 847 xmlFree(atom->ranges); 848 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) 849 xmlFree(atom->valuep); 850 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) 851 xmlFree(atom->valuep2); 852 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) 853 xmlFree(atom->valuep); 854 xmlFree(atom); 855 } 856 857 /** 858 * xmlRegCopyAtom: 859 * @ctxt: the regexp parser context 860 * @atom: the oiginal atom 861 * 862 * Allocate a new regexp range 863 * 864 * Returns the new atom or NULL in case of error 865 */ 866 static xmlRegAtomPtr 867 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 868 xmlRegAtomPtr ret; 869 870 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 871 if (ret == NULL) { 872 xmlRegexpErrMemory(ctxt, "copying atom"); 873 return(NULL); 874 } 875 memset(ret, 0, sizeof(xmlRegAtom)); 876 ret->type = atom->type; 877 ret->quant = atom->quant; 878 ret->min = atom->min; 879 ret->max = atom->max; 880 if (atom->nbRanges > 0) { 881 int i; 882 883 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) * 884 atom->nbRanges); 885 if (ret->ranges == NULL) { 886 xmlRegexpErrMemory(ctxt, "copying atom"); 887 goto error; 888 } 889 for (i = 0;i < atom->nbRanges;i++) { 890 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]); 891 if (ret->ranges[i] == NULL) 892 goto error; 893 ret->nbRanges = i + 1; 894 } 895 } 896 return(ret); 897 898 error: 899 xmlRegFreeAtom(ret); 900 return(NULL); 901 } 902 903 static xmlRegStatePtr 904 xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 905 xmlRegStatePtr ret; 906 907 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 908 if (ret == NULL) { 909 xmlRegexpErrMemory(ctxt, "allocating state"); 910 return(NULL); 911 } 912 memset(ret, 0, sizeof(xmlRegState)); 913 ret->type = XML_REGEXP_TRANS_STATE; 914 ret->mark = XML_REGEXP_MARK_NORMAL; 915 return(ret); 916 } 917 918 /** 919 * xmlRegFreeState: 920 * @state: the regexp state 921 * 922 * Free a regexp state 923 */ 924 static void 925 xmlRegFreeState(xmlRegStatePtr state) { 926 if (state == NULL) 927 return; 928 929 if (state->trans != NULL) 930 xmlFree(state->trans); 931 if (state->transTo != NULL) 932 xmlFree(state->transTo); 933 xmlFree(state); 934 } 935 936 /** 937 * xmlRegFreeParserCtxt: 938 * @ctxt: the regexp parser context 939 * 940 * Free a regexp parser context 941 */ 942 static void 943 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 944 int i; 945 if (ctxt == NULL) 946 return; 947 948 if (ctxt->string != NULL) 949 xmlFree(ctxt->string); 950 if (ctxt->states != NULL) { 951 for (i = 0;i < ctxt->nbStates;i++) 952 xmlRegFreeState(ctxt->states[i]); 953 xmlFree(ctxt->states); 954 } 955 if (ctxt->atoms != NULL) { 956 for (i = 0;i < ctxt->nbAtoms;i++) 957 xmlRegFreeAtom(ctxt->atoms[i]); 958 xmlFree(ctxt->atoms); 959 } 960 if (ctxt->counters != NULL) 961 xmlFree(ctxt->counters); 962 xmlFree(ctxt); 963 } 964 965 /************************************************************************ 966 * * 967 * Display of Data structures * 968 * * 969 ************************************************************************/ 970 971 static void 972 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 973 switch (type) { 974 case XML_REGEXP_EPSILON: 975 fprintf(output, "epsilon "); break; 976 case XML_REGEXP_CHARVAL: 977 fprintf(output, "charval "); break; 978 case XML_REGEXP_RANGES: 979 fprintf(output, "ranges "); break; 980 case XML_REGEXP_SUBREG: 981 fprintf(output, "subexpr "); break; 982 case XML_REGEXP_STRING: 983 fprintf(output, "string "); break; 984 case XML_REGEXP_ANYCHAR: 985 fprintf(output, "anychar "); break; 986 case XML_REGEXP_ANYSPACE: 987 fprintf(output, "anyspace "); break; 988 case XML_REGEXP_NOTSPACE: 989 fprintf(output, "notspace "); break; 990 case XML_REGEXP_INITNAME: 991 fprintf(output, "initname "); break; 992 case XML_REGEXP_NOTINITNAME: 993 fprintf(output, "notinitname "); break; 994 case XML_REGEXP_NAMECHAR: 995 fprintf(output, "namechar "); break; 996 case XML_REGEXP_NOTNAMECHAR: 997 fprintf(output, "notnamechar "); break; 998 case XML_REGEXP_DECIMAL: 999 fprintf(output, "decimal "); break; 1000 case XML_REGEXP_NOTDECIMAL: 1001 fprintf(output, "notdecimal "); break; 1002 case XML_REGEXP_REALCHAR: 1003 fprintf(output, "realchar "); break; 1004 case XML_REGEXP_NOTREALCHAR: 1005 fprintf(output, "notrealchar "); break; 1006 case XML_REGEXP_LETTER: 1007 fprintf(output, "LETTER "); break; 1008 case XML_REGEXP_LETTER_UPPERCASE: 1009 fprintf(output, "LETTER_UPPERCASE "); break; 1010 case XML_REGEXP_LETTER_LOWERCASE: 1011 fprintf(output, "LETTER_LOWERCASE "); break; 1012 case XML_REGEXP_LETTER_TITLECASE: 1013 fprintf(output, "LETTER_TITLECASE "); break; 1014 case XML_REGEXP_LETTER_MODIFIER: 1015 fprintf(output, "LETTER_MODIFIER "); break; 1016 case XML_REGEXP_LETTER_OTHERS: 1017 fprintf(output, "LETTER_OTHERS "); break; 1018 case XML_REGEXP_MARK: 1019 fprintf(output, "MARK "); break; 1020 case XML_REGEXP_MARK_NONSPACING: 1021 fprintf(output, "MARK_NONSPACING "); break; 1022 case XML_REGEXP_MARK_SPACECOMBINING: 1023 fprintf(output, "MARK_SPACECOMBINING "); break; 1024 case XML_REGEXP_MARK_ENCLOSING: 1025 fprintf(output, "MARK_ENCLOSING "); break; 1026 case XML_REGEXP_NUMBER: 1027 fprintf(output, "NUMBER "); break; 1028 case XML_REGEXP_NUMBER_DECIMAL: 1029 fprintf(output, "NUMBER_DECIMAL "); break; 1030 case XML_REGEXP_NUMBER_LETTER: 1031 fprintf(output, "NUMBER_LETTER "); break; 1032 case XML_REGEXP_NUMBER_OTHERS: 1033 fprintf(output, "NUMBER_OTHERS "); break; 1034 case XML_REGEXP_PUNCT: 1035 fprintf(output, "PUNCT "); break; 1036 case XML_REGEXP_PUNCT_CONNECTOR: 1037 fprintf(output, "PUNCT_CONNECTOR "); break; 1038 case XML_REGEXP_PUNCT_DASH: 1039 fprintf(output, "PUNCT_DASH "); break; 1040 case XML_REGEXP_PUNCT_OPEN: 1041 fprintf(output, "PUNCT_OPEN "); break; 1042 case XML_REGEXP_PUNCT_CLOSE: 1043 fprintf(output, "PUNCT_CLOSE "); break; 1044 case XML_REGEXP_PUNCT_INITQUOTE: 1045 fprintf(output, "PUNCT_INITQUOTE "); break; 1046 case XML_REGEXP_PUNCT_FINQUOTE: 1047 fprintf(output, "PUNCT_FINQUOTE "); break; 1048 case XML_REGEXP_PUNCT_OTHERS: 1049 fprintf(output, "PUNCT_OTHERS "); break; 1050 case XML_REGEXP_SEPAR: 1051 fprintf(output, "SEPAR "); break; 1052 case XML_REGEXP_SEPAR_SPACE: 1053 fprintf(output, "SEPAR_SPACE "); break; 1054 case XML_REGEXP_SEPAR_LINE: 1055 fprintf(output, "SEPAR_LINE "); break; 1056 case XML_REGEXP_SEPAR_PARA: 1057 fprintf(output, "SEPAR_PARA "); break; 1058 case XML_REGEXP_SYMBOL: 1059 fprintf(output, "SYMBOL "); break; 1060 case XML_REGEXP_SYMBOL_MATH: 1061 fprintf(output, "SYMBOL_MATH "); break; 1062 case XML_REGEXP_SYMBOL_CURRENCY: 1063 fprintf(output, "SYMBOL_CURRENCY "); break; 1064 case XML_REGEXP_SYMBOL_MODIFIER: 1065 fprintf(output, "SYMBOL_MODIFIER "); break; 1066 case XML_REGEXP_SYMBOL_OTHERS: 1067 fprintf(output, "SYMBOL_OTHERS "); break; 1068 case XML_REGEXP_OTHER: 1069 fprintf(output, "OTHER "); break; 1070 case XML_REGEXP_OTHER_CONTROL: 1071 fprintf(output, "OTHER_CONTROL "); break; 1072 case XML_REGEXP_OTHER_FORMAT: 1073 fprintf(output, "OTHER_FORMAT "); break; 1074 case XML_REGEXP_OTHER_PRIVATE: 1075 fprintf(output, "OTHER_PRIVATE "); break; 1076 case XML_REGEXP_OTHER_NA: 1077 fprintf(output, "OTHER_NA "); break; 1078 case XML_REGEXP_BLOCK_NAME: 1079 fprintf(output, "BLOCK "); break; 1080 } 1081 } 1082 1083 static void 1084 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 1085 switch (type) { 1086 case XML_REGEXP_QUANT_EPSILON: 1087 fprintf(output, "epsilon "); break; 1088 case XML_REGEXP_QUANT_ONCE: 1089 fprintf(output, "once "); break; 1090 case XML_REGEXP_QUANT_OPT: 1091 fprintf(output, "? "); break; 1092 case XML_REGEXP_QUANT_MULT: 1093 fprintf(output, "* "); break; 1094 case XML_REGEXP_QUANT_PLUS: 1095 fprintf(output, "+ "); break; 1096 case XML_REGEXP_QUANT_RANGE: 1097 fprintf(output, "range "); break; 1098 case XML_REGEXP_QUANT_ONCEONLY: 1099 fprintf(output, "onceonly "); break; 1100 case XML_REGEXP_QUANT_ALL: 1101 fprintf(output, "all "); break; 1102 } 1103 } 1104 static void 1105 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 1106 fprintf(output, " range: "); 1107 if (range->neg) 1108 fprintf(output, "negative "); 1109 xmlRegPrintAtomType(output, range->type); 1110 fprintf(output, "%c - %c\n", range->start, range->end); 1111 } 1112 1113 static void 1114 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 1115 fprintf(output, " atom: "); 1116 if (atom == NULL) { 1117 fprintf(output, "NULL\n"); 1118 return; 1119 } 1120 if (atom->neg) 1121 fprintf(output, "not "); 1122 xmlRegPrintAtomType(output, atom->type); 1123 xmlRegPrintQuantType(output, atom->quant); 1124 if (atom->quant == XML_REGEXP_QUANT_RANGE) 1125 fprintf(output, "%d-%d ", atom->min, atom->max); 1126 if (atom->type == XML_REGEXP_STRING) 1127 fprintf(output, "'%s' ", (char *) atom->valuep); 1128 if (atom->type == XML_REGEXP_CHARVAL) 1129 fprintf(output, "char %c\n", atom->codepoint); 1130 else if (atom->type == XML_REGEXP_RANGES) { 1131 int i; 1132 fprintf(output, "%d entries\n", atom->nbRanges); 1133 for (i = 0; i < atom->nbRanges;i++) 1134 xmlRegPrintRange(output, atom->ranges[i]); 1135 } else if (atom->type == XML_REGEXP_SUBREG) { 1136 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 1137 } else { 1138 fprintf(output, "\n"); 1139 } 1140 } 1141 1142 static void 1143 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 1144 fprintf(output, " trans: "); 1145 if (trans == NULL) { 1146 fprintf(output, "NULL\n"); 1147 return; 1148 } 1149 if (trans->to < 0) { 1150 fprintf(output, "removed\n"); 1151 return; 1152 } 1153 if (trans->nd != 0) { 1154 if (trans->nd == 2) 1155 fprintf(output, "last not determinist, "); 1156 else 1157 fprintf(output, "not determinist, "); 1158 } 1159 if (trans->counter >= 0) { 1160 fprintf(output, "counted %d, ", trans->counter); 1161 } 1162 if (trans->count == REGEXP_ALL_COUNTER) { 1163 fprintf(output, "all transition, "); 1164 } else if (trans->count >= 0) { 1165 fprintf(output, "count based %d, ", trans->count); 1166 } 1167 if (trans->atom == NULL) { 1168 fprintf(output, "epsilon to %d\n", trans->to); 1169 return; 1170 } 1171 if (trans->atom->type == XML_REGEXP_CHARVAL) 1172 fprintf(output, "char %c ", trans->atom->codepoint); 1173 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1174 } 1175 1176 static void 1177 xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1178 int i; 1179 1180 fprintf(output, " state: "); 1181 if (state == NULL) { 1182 fprintf(output, "NULL\n"); 1183 return; 1184 } 1185 if (state->type == XML_REGEXP_START_STATE) 1186 fprintf(output, "START "); 1187 if (state->type == XML_REGEXP_FINAL_STATE) 1188 fprintf(output, "FINAL "); 1189 1190 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1191 for (i = 0;i < state->nbTrans; i++) { 1192 xmlRegPrintTrans(output, &(state->trans[i])); 1193 } 1194 } 1195 1196 #ifdef DEBUG_REGEXP_GRAPH 1197 static void 1198 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1199 int i; 1200 1201 fprintf(output, " ctxt: "); 1202 if (ctxt == NULL) { 1203 fprintf(output, "NULL\n"); 1204 return; 1205 } 1206 fprintf(output, "'%s' ", ctxt->string); 1207 if (ctxt->error) 1208 fprintf(output, "error "); 1209 if (ctxt->neg) 1210 fprintf(output, "neg "); 1211 fprintf(output, "\n"); 1212 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1213 for (i = 0;i < ctxt->nbAtoms; i++) { 1214 fprintf(output, " %02d ", i); 1215 xmlRegPrintAtom(output, ctxt->atoms[i]); 1216 } 1217 if (ctxt->atom != NULL) { 1218 fprintf(output, "current atom:\n"); 1219 xmlRegPrintAtom(output, ctxt->atom); 1220 } 1221 fprintf(output, "%d states:", ctxt->nbStates); 1222 if (ctxt->start != NULL) 1223 fprintf(output, " start: %d", ctxt->start->no); 1224 if (ctxt->end != NULL) 1225 fprintf(output, " end: %d", ctxt->end->no); 1226 fprintf(output, "\n"); 1227 for (i = 0;i < ctxt->nbStates; i++) { 1228 xmlRegPrintState(output, ctxt->states[i]); 1229 } 1230 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1231 for (i = 0;i < ctxt->nbCounters; i++) { 1232 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1233 ctxt->counters[i].max); 1234 } 1235 } 1236 #endif 1237 1238 /************************************************************************ 1239 * * 1240 * Finite Automata structures manipulations * 1241 * * 1242 ************************************************************************/ 1243 1244 static void 1245 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1246 int neg, xmlRegAtomType type, int start, int end, 1247 xmlChar *blockName) { 1248 xmlRegRangePtr range; 1249 1250 if (atom == NULL) { 1251 ERROR("add range: atom is NULL"); 1252 return; 1253 } 1254 if (atom->type != XML_REGEXP_RANGES) { 1255 ERROR("add range: atom is not ranges"); 1256 return; 1257 } 1258 if (atom->maxRanges == 0) { 1259 atom->maxRanges = 4; 1260 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1261 sizeof(xmlRegRangePtr)); 1262 if (atom->ranges == NULL) { 1263 xmlRegexpErrMemory(ctxt, "adding ranges"); 1264 atom->maxRanges = 0; 1265 return; 1266 } 1267 } else if (atom->nbRanges >= atom->maxRanges) { 1268 xmlRegRangePtr *tmp; 1269 atom->maxRanges *= 2; 1270 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1271 sizeof(xmlRegRangePtr)); 1272 if (tmp == NULL) { 1273 xmlRegexpErrMemory(ctxt, "adding ranges"); 1274 atom->maxRanges /= 2; 1275 return; 1276 } 1277 atom->ranges = tmp; 1278 } 1279 range = xmlRegNewRange(ctxt, neg, type, start, end); 1280 if (range == NULL) 1281 return; 1282 range->blockName = blockName; 1283 atom->ranges[atom->nbRanges++] = range; 1284 1285 } 1286 1287 static int 1288 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1289 if (ctxt->maxCounters == 0) { 1290 ctxt->maxCounters = 4; 1291 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1292 sizeof(xmlRegCounter)); 1293 if (ctxt->counters == NULL) { 1294 xmlRegexpErrMemory(ctxt, "allocating counter"); 1295 ctxt->maxCounters = 0; 1296 return(-1); 1297 } 1298 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1299 xmlRegCounter *tmp; 1300 ctxt->maxCounters *= 2; 1301 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1302 sizeof(xmlRegCounter)); 1303 if (tmp == NULL) { 1304 xmlRegexpErrMemory(ctxt, "allocating counter"); 1305 ctxt->maxCounters /= 2; 1306 return(-1); 1307 } 1308 ctxt->counters = tmp; 1309 } 1310 ctxt->counters[ctxt->nbCounters].min = -1; 1311 ctxt->counters[ctxt->nbCounters].max = -1; 1312 return(ctxt->nbCounters++); 1313 } 1314 1315 static int 1316 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1317 if (atom == NULL) { 1318 ERROR("atom push: atom is NULL"); 1319 return(-1); 1320 } 1321 if (ctxt->maxAtoms == 0) { 1322 ctxt->maxAtoms = 4; 1323 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1324 sizeof(xmlRegAtomPtr)); 1325 if (ctxt->atoms == NULL) { 1326 xmlRegexpErrMemory(ctxt, "pushing atom"); 1327 ctxt->maxAtoms = 0; 1328 return(-1); 1329 } 1330 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1331 xmlRegAtomPtr *tmp; 1332 ctxt->maxAtoms *= 2; 1333 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1334 sizeof(xmlRegAtomPtr)); 1335 if (tmp == NULL) { 1336 xmlRegexpErrMemory(ctxt, "allocating counter"); 1337 ctxt->maxAtoms /= 2; 1338 return(-1); 1339 } 1340 ctxt->atoms = tmp; 1341 } 1342 atom->no = ctxt->nbAtoms; 1343 ctxt->atoms[ctxt->nbAtoms++] = atom; 1344 return(0); 1345 } 1346 1347 static void 1348 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, 1349 int from) { 1350 if (target->maxTransTo == 0) { 1351 target->maxTransTo = 8; 1352 target->transTo = (int *) xmlMalloc(target->maxTransTo * 1353 sizeof(int)); 1354 if (target->transTo == NULL) { 1355 xmlRegexpErrMemory(ctxt, "adding transition"); 1356 target->maxTransTo = 0; 1357 return; 1358 } 1359 } else if (target->nbTransTo >= target->maxTransTo) { 1360 int *tmp; 1361 target->maxTransTo *= 2; 1362 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * 1363 sizeof(int)); 1364 if (tmp == NULL) { 1365 xmlRegexpErrMemory(ctxt, "adding transition"); 1366 target->maxTransTo /= 2; 1367 return; 1368 } 1369 target->transTo = tmp; 1370 } 1371 target->transTo[target->nbTransTo] = from; 1372 target->nbTransTo++; 1373 } 1374 1375 static void 1376 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1377 xmlRegAtomPtr atom, xmlRegStatePtr target, 1378 int counter, int count) { 1379 1380 int nrtrans; 1381 1382 if (state == NULL) { 1383 ERROR("add state: state is NULL"); 1384 return; 1385 } 1386 if (target == NULL) { 1387 ERROR("add state: target is NULL"); 1388 return; 1389 } 1390 /* 1391 * Other routines follow the philosophy 'When in doubt, add a transition' 1392 * so we check here whether such a transition is already present and, if 1393 * so, silently ignore this request. 1394 */ 1395 1396 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { 1397 xmlRegTransPtr trans = &(state->trans[nrtrans]); 1398 if ((trans->atom == atom) && 1399 (trans->to == target->no) && 1400 (trans->counter == counter) && 1401 (trans->count == count)) { 1402 #ifdef DEBUG_REGEXP_GRAPH 1403 printf("Ignoring duplicate transition from %d to %d\n", 1404 state->no, target->no); 1405 #endif 1406 return; 1407 } 1408 } 1409 1410 if (state->maxTrans == 0) { 1411 state->maxTrans = 8; 1412 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1413 sizeof(xmlRegTrans)); 1414 if (state->trans == NULL) { 1415 xmlRegexpErrMemory(ctxt, "adding transition"); 1416 state->maxTrans = 0; 1417 return; 1418 } 1419 } else if (state->nbTrans >= state->maxTrans) { 1420 xmlRegTrans *tmp; 1421 state->maxTrans *= 2; 1422 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1423 sizeof(xmlRegTrans)); 1424 if (tmp == NULL) { 1425 xmlRegexpErrMemory(ctxt, "adding transition"); 1426 state->maxTrans /= 2; 1427 return; 1428 } 1429 state->trans = tmp; 1430 } 1431 #ifdef DEBUG_REGEXP_GRAPH 1432 printf("Add trans from %d to %d ", state->no, target->no); 1433 if (count == REGEXP_ALL_COUNTER) 1434 printf("all transition\n"); 1435 else if (count >= 0) 1436 printf("count based %d\n", count); 1437 else if (counter >= 0) 1438 printf("counted %d\n", counter); 1439 else if (atom == NULL) 1440 printf("epsilon transition\n"); 1441 else if (atom != NULL) 1442 xmlRegPrintAtom(stdout, atom); 1443 #endif 1444 1445 state->trans[state->nbTrans].atom = atom; 1446 state->trans[state->nbTrans].to = target->no; 1447 state->trans[state->nbTrans].counter = counter; 1448 state->trans[state->nbTrans].count = count; 1449 state->trans[state->nbTrans].nd = 0; 1450 state->nbTrans++; 1451 xmlRegStateAddTransTo(ctxt, target, state->no); 1452 } 1453 1454 static int 1455 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1456 if (state == NULL) return(-1); 1457 if (ctxt->maxStates == 0) { 1458 ctxt->maxStates = 4; 1459 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1460 sizeof(xmlRegStatePtr)); 1461 if (ctxt->states == NULL) { 1462 xmlRegexpErrMemory(ctxt, "adding state"); 1463 ctxt->maxStates = 0; 1464 return(-1); 1465 } 1466 } else if (ctxt->nbStates >= ctxt->maxStates) { 1467 xmlRegStatePtr *tmp; 1468 ctxt->maxStates *= 2; 1469 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1470 sizeof(xmlRegStatePtr)); 1471 if (tmp == NULL) { 1472 xmlRegexpErrMemory(ctxt, "adding state"); 1473 ctxt->maxStates /= 2; 1474 return(-1); 1475 } 1476 ctxt->states = tmp; 1477 } 1478 state->no = ctxt->nbStates; 1479 ctxt->states[ctxt->nbStates++] = state; 1480 return(0); 1481 } 1482 1483 /** 1484 * xmlFAGenerateAllTransition: 1485 * @ctxt: a regexp parser context 1486 * @from: the from state 1487 * @to: the target state or NULL for building a new one 1488 * @lax: 1489 * 1490 */ 1491 static void 1492 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1493 xmlRegStatePtr from, xmlRegStatePtr to, 1494 int lax) { 1495 if (to == NULL) { 1496 to = xmlRegNewState(ctxt); 1497 xmlRegStatePush(ctxt, to); 1498 ctxt->state = to; 1499 } 1500 if (lax) 1501 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1502 else 1503 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1504 } 1505 1506 /** 1507 * xmlFAGenerateEpsilonTransition: 1508 * @ctxt: a regexp parser context 1509 * @from: the from state 1510 * @to: the target state or NULL for building a new one 1511 * 1512 */ 1513 static void 1514 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1515 xmlRegStatePtr from, xmlRegStatePtr to) { 1516 if (to == NULL) { 1517 to = xmlRegNewState(ctxt); 1518 xmlRegStatePush(ctxt, to); 1519 ctxt->state = to; 1520 } 1521 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1522 } 1523 1524 /** 1525 * xmlFAGenerateCountedEpsilonTransition: 1526 * @ctxt: a regexp parser context 1527 * @from: the from state 1528 * @to: the target state or NULL for building a new one 1529 * counter: the counter for that transition 1530 * 1531 */ 1532 static void 1533 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1534 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1535 if (to == NULL) { 1536 to = xmlRegNewState(ctxt); 1537 xmlRegStatePush(ctxt, to); 1538 ctxt->state = to; 1539 } 1540 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1541 } 1542 1543 /** 1544 * xmlFAGenerateCountedTransition: 1545 * @ctxt: a regexp parser context 1546 * @from: the from state 1547 * @to: the target state or NULL for building a new one 1548 * counter: the counter for that transition 1549 * 1550 */ 1551 static void 1552 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1553 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1554 if (to == NULL) { 1555 to = xmlRegNewState(ctxt); 1556 xmlRegStatePush(ctxt, to); 1557 ctxt->state = to; 1558 } 1559 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1560 } 1561 1562 /** 1563 * xmlFAGenerateTransitions: 1564 * @ctxt: a regexp parser context 1565 * @from: the from state 1566 * @to: the target state or NULL for building a new one 1567 * @atom: the atom generating the transition 1568 * 1569 * Returns 0 if success and -1 in case of error. 1570 */ 1571 static int 1572 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1573 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1574 xmlRegStatePtr end; 1575 int nullable = 0; 1576 1577 if (atom == NULL) { 1578 ERROR("genrate transition: atom == NULL"); 1579 return(-1); 1580 } 1581 if (atom->type == XML_REGEXP_SUBREG) { 1582 /* 1583 * this is a subexpression handling one should not need to 1584 * create a new node except for XML_REGEXP_QUANT_RANGE. 1585 */ 1586 if (xmlRegAtomPush(ctxt, atom) < 0) { 1587 return(-1); 1588 } 1589 if ((to != NULL) && (atom->stop != to) && 1590 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1591 /* 1592 * Generate an epsilon transition to link to the target 1593 */ 1594 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1595 #ifdef DV 1596 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && 1597 (atom->quant != XML_REGEXP_QUANT_ONCE)) { 1598 to = xmlRegNewState(ctxt); 1599 xmlRegStatePush(ctxt, to); 1600 ctxt->state = to; 1601 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1602 #endif 1603 } 1604 switch (atom->quant) { 1605 case XML_REGEXP_QUANT_OPT: 1606 atom->quant = XML_REGEXP_QUANT_ONCE; 1607 /* 1608 * transition done to the state after end of atom. 1609 * 1. set transition from atom start to new state 1610 * 2. set transition from atom end to this state. 1611 */ 1612 if (to == NULL) { 1613 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); 1614 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, 1615 ctxt->state); 1616 } else { 1617 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to); 1618 } 1619 break; 1620 case XML_REGEXP_QUANT_MULT: 1621 atom->quant = XML_REGEXP_QUANT_ONCE; 1622 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1623 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1624 break; 1625 case XML_REGEXP_QUANT_PLUS: 1626 atom->quant = XML_REGEXP_QUANT_ONCE; 1627 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1628 break; 1629 case XML_REGEXP_QUANT_RANGE: { 1630 int counter; 1631 xmlRegStatePtr inter, newstate; 1632 1633 /* 1634 * create the final state now if needed 1635 */ 1636 if (to != NULL) { 1637 newstate = to; 1638 } else { 1639 newstate = xmlRegNewState(ctxt); 1640 xmlRegStatePush(ctxt, newstate); 1641 } 1642 1643 /* 1644 * The principle here is to use counted transition 1645 * to avoid explosion in the number of states in the 1646 * graph. This is clearly more complex but should not 1647 * be exploitable at runtime. 1648 */ 1649 if ((atom->min == 0) && (atom->start0 == NULL)) { 1650 xmlRegAtomPtr copy; 1651 /* 1652 * duplicate a transition based on atom to count next 1653 * occurences after 1. We cannot loop to atom->start 1654 * directly because we need an epsilon transition to 1655 * newstate. 1656 */ 1657 /* ???? For some reason it seems we never reach that 1658 case, I suppose this got optimized out before when 1659 building the automata */ 1660 copy = xmlRegCopyAtom(ctxt, atom); 1661 if (copy == NULL) 1662 return(-1); 1663 copy->quant = XML_REGEXP_QUANT_ONCE; 1664 copy->min = 0; 1665 copy->max = 0; 1666 1667 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy) 1668 < 0) 1669 return(-1); 1670 inter = ctxt->state; 1671 counter = xmlRegGetCounter(ctxt); 1672 ctxt->counters[counter].min = atom->min - 1; 1673 ctxt->counters[counter].max = atom->max - 1; 1674 /* count the number of times we see it again */ 1675 xmlFAGenerateCountedEpsilonTransition(ctxt, inter, 1676 atom->stop, counter); 1677 /* allow a way out based on the count */ 1678 xmlFAGenerateCountedTransition(ctxt, inter, 1679 newstate, counter); 1680 /* and also allow a direct exit for 0 */ 1681 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1682 newstate); 1683 } else { 1684 /* 1685 * either we need the atom at least once or there 1686 * is an atom->start0 allowing to easilly plug the 1687 * epsilon transition. 1688 */ 1689 counter = xmlRegGetCounter(ctxt); 1690 ctxt->counters[counter].min = atom->min - 1; 1691 ctxt->counters[counter].max = atom->max - 1; 1692 /* count the number of times we see it again */ 1693 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1694 atom->start, counter); 1695 /* allow a way out based on the count */ 1696 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1697 newstate, counter); 1698 /* and if needed allow a direct exit for 0 */ 1699 if (atom->min == 0) 1700 xmlFAGenerateEpsilonTransition(ctxt, atom->start0, 1701 newstate); 1702 1703 } 1704 atom->min = 0; 1705 atom->max = 0; 1706 atom->quant = XML_REGEXP_QUANT_ONCE; 1707 ctxt->state = newstate; 1708 } 1709 default: 1710 break; 1711 } 1712 return(0); 1713 } 1714 if ((atom->min == 0) && (atom->max == 0) && 1715 (atom->quant == XML_REGEXP_QUANT_RANGE)) { 1716 /* 1717 * we can discard the atom and generate an epsilon transition instead 1718 */ 1719 if (to == NULL) { 1720 to = xmlRegNewState(ctxt); 1721 if (to != NULL) 1722 xmlRegStatePush(ctxt, to); 1723 else { 1724 return(-1); 1725 } 1726 } 1727 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1728 ctxt->state = to; 1729 xmlRegFreeAtom(atom); 1730 return(0); 1731 } 1732 if (to == NULL) { 1733 to = xmlRegNewState(ctxt); 1734 if (to != NULL) 1735 xmlRegStatePush(ctxt, to); 1736 else { 1737 return(-1); 1738 } 1739 } 1740 end = to; 1741 if ((atom->quant == XML_REGEXP_QUANT_MULT) || 1742 (atom->quant == XML_REGEXP_QUANT_PLUS)) { 1743 /* 1744 * Do not pollute the target state by adding transitions from 1745 * it as it is likely to be the shared target of multiple branches. 1746 * So isolate with an epsilon transition. 1747 */ 1748 xmlRegStatePtr tmp; 1749 1750 tmp = xmlRegNewState(ctxt); 1751 if (tmp != NULL) 1752 xmlRegStatePush(ctxt, tmp); 1753 else { 1754 return(-1); 1755 } 1756 xmlFAGenerateEpsilonTransition(ctxt, tmp, to); 1757 to = tmp; 1758 } 1759 if (xmlRegAtomPush(ctxt, atom) < 0) { 1760 return(-1); 1761 } 1762 if ((atom->quant == XML_REGEXP_QUANT_RANGE) && 1763 (atom->min == 0) && (atom->max > 0)) { 1764 nullable = 1; 1765 atom->min = 1; 1766 if (atom->max == 1) 1767 atom->quant = XML_REGEXP_QUANT_OPT; 1768 } 1769 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1770 ctxt->state = end; 1771 switch (atom->quant) { 1772 case XML_REGEXP_QUANT_OPT: 1773 atom->quant = XML_REGEXP_QUANT_ONCE; 1774 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1775 break; 1776 case XML_REGEXP_QUANT_MULT: 1777 atom->quant = XML_REGEXP_QUANT_ONCE; 1778 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1779 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1780 break; 1781 case XML_REGEXP_QUANT_PLUS: 1782 atom->quant = XML_REGEXP_QUANT_ONCE; 1783 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1784 break; 1785 case XML_REGEXP_QUANT_RANGE: 1786 if (nullable) 1787 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1788 break; 1789 default: 1790 break; 1791 } 1792 return(0); 1793 } 1794 1795 /** 1796 * xmlFAReduceEpsilonTransitions: 1797 * @ctxt: a regexp parser context 1798 * @fromnr: the from state 1799 * @tonr: the to state 1800 * @counter: should that transition be associated to a counted 1801 * 1802 */ 1803 static void 1804 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1805 int tonr, int counter) { 1806 int transnr; 1807 xmlRegStatePtr from; 1808 xmlRegStatePtr to; 1809 1810 #ifdef DEBUG_REGEXP_GRAPH 1811 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1812 #endif 1813 from = ctxt->states[fromnr]; 1814 if (from == NULL) 1815 return; 1816 to = ctxt->states[tonr]; 1817 if (to == NULL) 1818 return; 1819 if ((to->mark == XML_REGEXP_MARK_START) || 1820 (to->mark == XML_REGEXP_MARK_VISITED)) 1821 return; 1822 1823 to->mark = XML_REGEXP_MARK_VISITED; 1824 if (to->type == XML_REGEXP_FINAL_STATE) { 1825 #ifdef DEBUG_REGEXP_GRAPH 1826 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1827 #endif 1828 from->type = XML_REGEXP_FINAL_STATE; 1829 } 1830 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1831 if (to->trans[transnr].to < 0) 1832 continue; 1833 if (to->trans[transnr].atom == NULL) { 1834 /* 1835 * Don't remove counted transitions 1836 * Don't loop either 1837 */ 1838 if (to->trans[transnr].to != fromnr) { 1839 if (to->trans[transnr].count >= 0) { 1840 int newto = to->trans[transnr].to; 1841 1842 xmlRegStateAddTrans(ctxt, from, NULL, 1843 ctxt->states[newto], 1844 -1, to->trans[transnr].count); 1845 } else { 1846 #ifdef DEBUG_REGEXP_GRAPH 1847 printf("Found epsilon trans %d from %d to %d\n", 1848 transnr, tonr, to->trans[transnr].to); 1849 #endif 1850 if (to->trans[transnr].counter >= 0) { 1851 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1852 to->trans[transnr].to, 1853 to->trans[transnr].counter); 1854 } else { 1855 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1856 to->trans[transnr].to, 1857 counter); 1858 } 1859 } 1860 } 1861 } else { 1862 int newto = to->trans[transnr].to; 1863 1864 if (to->trans[transnr].counter >= 0) { 1865 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1866 ctxt->states[newto], 1867 to->trans[transnr].counter, -1); 1868 } else { 1869 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1870 ctxt->states[newto], counter, -1); 1871 } 1872 } 1873 } 1874 to->mark = XML_REGEXP_MARK_NORMAL; 1875 } 1876 1877 /** 1878 * xmlFAEliminateSimpleEpsilonTransitions: 1879 * @ctxt: a regexp parser context 1880 * 1881 * Eliminating general epsilon transitions can get costly in the general 1882 * algorithm due to the large amount of generated new transitions and 1883 * associated comparisons. However for simple epsilon transition used just 1884 * to separate building blocks when generating the automata this can be 1885 * reduced to state elimination: 1886 * - if there exists an epsilon from X to Y 1887 * - if there is no other transition from X 1888 * then X and Y are semantically equivalent and X can be eliminated 1889 * If X is the start state then make Y the start state, else replace the 1890 * target of all transitions to X by transitions to Y. 1891 */ 1892 static void 1893 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1894 int statenr, i, j, newto; 1895 xmlRegStatePtr state, tmp; 1896 1897 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1898 state = ctxt->states[statenr]; 1899 if (state == NULL) 1900 continue; 1901 if (state->nbTrans != 1) 1902 continue; 1903 if (state->type == XML_REGEXP_UNREACH_STATE) 1904 continue; 1905 /* is the only transition out a basic transition */ 1906 if ((state->trans[0].atom == NULL) && 1907 (state->trans[0].to >= 0) && 1908 (state->trans[0].to != statenr) && 1909 (state->trans[0].counter < 0) && 1910 (state->trans[0].count < 0)) { 1911 newto = state->trans[0].to; 1912 1913 if (state->type == XML_REGEXP_START_STATE) { 1914 #ifdef DEBUG_REGEXP_GRAPH 1915 printf("Found simple epsilon trans from start %d to %d\n", 1916 statenr, newto); 1917 #endif 1918 } else { 1919 #ifdef DEBUG_REGEXP_GRAPH 1920 printf("Found simple epsilon trans from %d to %d\n", 1921 statenr, newto); 1922 #endif 1923 for (i = 0;i < state->nbTransTo;i++) { 1924 tmp = ctxt->states[state->transTo[i]]; 1925 for (j = 0;j < tmp->nbTrans;j++) { 1926 if (tmp->trans[j].to == statenr) { 1927 #ifdef DEBUG_REGEXP_GRAPH 1928 printf("Changed transition %d on %d to go to %d\n", 1929 j, tmp->no, newto); 1930 #endif 1931 tmp->trans[j].to = -1; 1932 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom, 1933 ctxt->states[newto], 1934 tmp->trans[j].counter, 1935 tmp->trans[j].count); 1936 } 1937 } 1938 } 1939 if (state->type == XML_REGEXP_FINAL_STATE) 1940 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; 1941 /* eliminate the transition completely */ 1942 state->nbTrans = 0; 1943 1944 state->type = XML_REGEXP_UNREACH_STATE; 1945 1946 } 1947 1948 } 1949 } 1950 } 1951 /** 1952 * xmlFAEliminateEpsilonTransitions: 1953 * @ctxt: a regexp parser context 1954 * 1955 */ 1956 static void 1957 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1958 int statenr, transnr; 1959 xmlRegStatePtr state; 1960 int has_epsilon; 1961 1962 if (ctxt->states == NULL) return; 1963 1964 /* 1965 * Eliminate simple epsilon transition and the associated unreachable 1966 * states. 1967 */ 1968 xmlFAEliminateSimpleEpsilonTransitions(ctxt); 1969 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1970 state = ctxt->states[statenr]; 1971 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) { 1972 #ifdef DEBUG_REGEXP_GRAPH 1973 printf("Removed unreachable state %d\n", statenr); 1974 #endif 1975 xmlRegFreeState(state); 1976 ctxt->states[statenr] = NULL; 1977 } 1978 } 1979 1980 has_epsilon = 0; 1981 1982 /* 1983 * Build the completed transitions bypassing the epsilons 1984 * Use a marking algorithm to avoid loops 1985 * Mark sink states too. 1986 * Process from the latests states backward to the start when 1987 * there is long cascading epsilon chains this minimize the 1988 * recursions and transition compares when adding the new ones 1989 */ 1990 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) { 1991 state = ctxt->states[statenr]; 1992 if (state == NULL) 1993 continue; 1994 if ((state->nbTrans == 0) && 1995 (state->type != XML_REGEXP_FINAL_STATE)) { 1996 state->type = XML_REGEXP_SINK_STATE; 1997 } 1998 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1999 if ((state->trans[transnr].atom == NULL) && 2000 (state->trans[transnr].to >= 0)) { 2001 if (state->trans[transnr].to == statenr) { 2002 state->trans[transnr].to = -1; 2003 #ifdef DEBUG_REGEXP_GRAPH 2004 printf("Removed loopback epsilon trans %d on %d\n", 2005 transnr, statenr); 2006 #endif 2007 } else if (state->trans[transnr].count < 0) { 2008 int newto = state->trans[transnr].to; 2009 2010 #ifdef DEBUG_REGEXP_GRAPH 2011 printf("Found epsilon trans %d from %d to %d\n", 2012 transnr, statenr, newto); 2013 #endif 2014 has_epsilon = 1; 2015 state->trans[transnr].to = -2; 2016 state->mark = XML_REGEXP_MARK_START; 2017 xmlFAReduceEpsilonTransitions(ctxt, statenr, 2018 newto, state->trans[transnr].counter); 2019 state->mark = XML_REGEXP_MARK_NORMAL; 2020 #ifdef DEBUG_REGEXP_GRAPH 2021 } else { 2022 printf("Found counted transition %d on %d\n", 2023 transnr, statenr); 2024 #endif 2025 } 2026 } 2027 } 2028 } 2029 /* 2030 * Eliminate the epsilon transitions 2031 */ 2032 if (has_epsilon) { 2033 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2034 state = ctxt->states[statenr]; 2035 if (state == NULL) 2036 continue; 2037 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2038 xmlRegTransPtr trans = &(state->trans[transnr]); 2039 if ((trans->atom == NULL) && 2040 (trans->count < 0) && 2041 (trans->to >= 0)) { 2042 trans->to = -1; 2043 } 2044 } 2045 } 2046 } 2047 2048 /* 2049 * Use this pass to detect unreachable states too 2050 */ 2051 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2052 state = ctxt->states[statenr]; 2053 if (state != NULL) 2054 state->reached = XML_REGEXP_MARK_NORMAL; 2055 } 2056 state = ctxt->states[0]; 2057 if (state != NULL) 2058 state->reached = XML_REGEXP_MARK_START; 2059 while (state != NULL) { 2060 xmlRegStatePtr target = NULL; 2061 state->reached = XML_REGEXP_MARK_VISITED; 2062 /* 2063 * Mark all states reachable from the current reachable state 2064 */ 2065 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2066 if ((state->trans[transnr].to >= 0) && 2067 ((state->trans[transnr].atom != NULL) || 2068 (state->trans[transnr].count >= 0))) { 2069 int newto = state->trans[transnr].to; 2070 2071 if (ctxt->states[newto] == NULL) 2072 continue; 2073 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 2074 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 2075 target = ctxt->states[newto]; 2076 } 2077 } 2078 } 2079 2080 /* 2081 * find the next accessible state not explored 2082 */ 2083 if (target == NULL) { 2084 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 2085 state = ctxt->states[statenr]; 2086 if ((state != NULL) && (state->reached == 2087 XML_REGEXP_MARK_START)) { 2088 target = state; 2089 break; 2090 } 2091 } 2092 } 2093 state = target; 2094 } 2095 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2096 state = ctxt->states[statenr]; 2097 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 2098 #ifdef DEBUG_REGEXP_GRAPH 2099 printf("Removed unreachable state %d\n", statenr); 2100 #endif 2101 xmlRegFreeState(state); 2102 ctxt->states[statenr] = NULL; 2103 } 2104 } 2105 2106 } 2107 2108 static int 2109 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { 2110 int ret = 0; 2111 2112 if ((range1->type == XML_REGEXP_RANGES) || 2113 (range2->type == XML_REGEXP_RANGES) || 2114 (range2->type == XML_REGEXP_SUBREG) || 2115 (range1->type == XML_REGEXP_SUBREG) || 2116 (range1->type == XML_REGEXP_STRING) || 2117 (range2->type == XML_REGEXP_STRING)) 2118 return(-1); 2119 2120 /* put them in order */ 2121 if (range1->type > range2->type) { 2122 xmlRegRangePtr tmp; 2123 2124 tmp = range1; 2125 range1 = range2; 2126 range2 = tmp; 2127 } 2128 if ((range1->type == XML_REGEXP_ANYCHAR) || 2129 (range2->type == XML_REGEXP_ANYCHAR)) { 2130 ret = 1; 2131 } else if ((range1->type == XML_REGEXP_EPSILON) || 2132 (range2->type == XML_REGEXP_EPSILON)) { 2133 return(0); 2134 } else if (range1->type == range2->type) { 2135 if (range1->type != XML_REGEXP_CHARVAL) 2136 ret = 1; 2137 else if ((range1->end < range2->start) || 2138 (range2->end < range1->start)) 2139 ret = 0; 2140 else 2141 ret = 1; 2142 } else if (range1->type == XML_REGEXP_CHARVAL) { 2143 int codepoint; 2144 int neg = 0; 2145 2146 /* 2147 * just check all codepoints in the range for acceptance, 2148 * this is usually way cheaper since done only once at 2149 * compilation than testing over and over at runtime or 2150 * pushing too many states when evaluating. 2151 */ 2152 if (((range1->neg == 0) && (range2->neg != 0)) || 2153 ((range1->neg != 0) && (range2->neg == 0))) 2154 neg = 1; 2155 2156 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { 2157 ret = xmlRegCheckCharacterRange(range2->type, codepoint, 2158 0, range2->start, range2->end, 2159 range2->blockName); 2160 if (ret < 0) 2161 return(-1); 2162 if (((neg == 1) && (ret == 0)) || 2163 ((neg == 0) && (ret == 1))) 2164 return(1); 2165 } 2166 return(0); 2167 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || 2168 (range2->type == XML_REGEXP_BLOCK_NAME)) { 2169 if (range1->type == range2->type) { 2170 ret = xmlStrEqual(range1->blockName, range2->blockName); 2171 } else { 2172 /* 2173 * comparing a block range with anything else is way 2174 * too costly, and maintining the table is like too much 2175 * memory too, so let's force the automata to save state 2176 * here. 2177 */ 2178 return(1); 2179 } 2180 } else if ((range1->type < XML_REGEXP_LETTER) || 2181 (range2->type < XML_REGEXP_LETTER)) { 2182 if ((range1->type == XML_REGEXP_ANYSPACE) && 2183 (range2->type == XML_REGEXP_NOTSPACE)) 2184 ret = 0; 2185 else if ((range1->type == XML_REGEXP_INITNAME) && 2186 (range2->type == XML_REGEXP_NOTINITNAME)) 2187 ret = 0; 2188 else if ((range1->type == XML_REGEXP_NAMECHAR) && 2189 (range2->type == XML_REGEXP_NOTNAMECHAR)) 2190 ret = 0; 2191 else if ((range1->type == XML_REGEXP_DECIMAL) && 2192 (range2->type == XML_REGEXP_NOTDECIMAL)) 2193 ret = 0; 2194 else if ((range1->type == XML_REGEXP_REALCHAR) && 2195 (range2->type == XML_REGEXP_NOTREALCHAR)) 2196 ret = 0; 2197 else { 2198 /* same thing to limit complexity */ 2199 return(1); 2200 } 2201 } else { 2202 ret = 0; 2203 /* range1->type < range2->type here */ 2204 switch (range1->type) { 2205 case XML_REGEXP_LETTER: 2206 /* all disjoint except in the subgroups */ 2207 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || 2208 (range2->type == XML_REGEXP_LETTER_LOWERCASE) || 2209 (range2->type == XML_REGEXP_LETTER_TITLECASE) || 2210 (range2->type == XML_REGEXP_LETTER_MODIFIER) || 2211 (range2->type == XML_REGEXP_LETTER_OTHERS)) 2212 ret = 1; 2213 break; 2214 case XML_REGEXP_MARK: 2215 if ((range2->type == XML_REGEXP_MARK_NONSPACING) || 2216 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || 2217 (range2->type == XML_REGEXP_MARK_ENCLOSING)) 2218 ret = 1; 2219 break; 2220 case XML_REGEXP_NUMBER: 2221 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || 2222 (range2->type == XML_REGEXP_NUMBER_LETTER) || 2223 (range2->type == XML_REGEXP_NUMBER_OTHERS)) 2224 ret = 1; 2225 break; 2226 case XML_REGEXP_PUNCT: 2227 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || 2228 (range2->type == XML_REGEXP_PUNCT_DASH) || 2229 (range2->type == XML_REGEXP_PUNCT_OPEN) || 2230 (range2->type == XML_REGEXP_PUNCT_CLOSE) || 2231 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || 2232 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || 2233 (range2->type == XML_REGEXP_PUNCT_OTHERS)) 2234 ret = 1; 2235 break; 2236 case XML_REGEXP_SEPAR: 2237 if ((range2->type == XML_REGEXP_SEPAR_SPACE) || 2238 (range2->type == XML_REGEXP_SEPAR_LINE) || 2239 (range2->type == XML_REGEXP_SEPAR_PARA)) 2240 ret = 1; 2241 break; 2242 case XML_REGEXP_SYMBOL: 2243 if ((range2->type == XML_REGEXP_SYMBOL_MATH) || 2244 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || 2245 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || 2246 (range2->type == XML_REGEXP_SYMBOL_OTHERS)) 2247 ret = 1; 2248 break; 2249 case XML_REGEXP_OTHER: 2250 if ((range2->type == XML_REGEXP_OTHER_CONTROL) || 2251 (range2->type == XML_REGEXP_OTHER_FORMAT) || 2252 (range2->type == XML_REGEXP_OTHER_PRIVATE)) 2253 ret = 1; 2254 break; 2255 default: 2256 if ((range2->type >= XML_REGEXP_LETTER) && 2257 (range2->type < XML_REGEXP_BLOCK_NAME)) 2258 ret = 0; 2259 else { 2260 /* safety net ! */ 2261 return(1); 2262 } 2263 } 2264 } 2265 if (((range1->neg == 0) && (range2->neg != 0)) || 2266 ((range1->neg != 0) && (range2->neg == 0))) 2267 ret = !ret; 2268 return(ret); 2269 } 2270 2271 /** 2272 * xmlFACompareAtomTypes: 2273 * @type1: an atom type 2274 * @type2: an atom type 2275 * 2276 * Compares two atoms type to check whether they intersect in some ways, 2277 * this is used by xmlFACompareAtoms only 2278 * 2279 * Returns 1 if they may intersect and 0 otherwise 2280 */ 2281 static int 2282 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { 2283 if ((type1 == XML_REGEXP_EPSILON) || 2284 (type1 == XML_REGEXP_CHARVAL) || 2285 (type1 == XML_REGEXP_RANGES) || 2286 (type1 == XML_REGEXP_SUBREG) || 2287 (type1 == XML_REGEXP_STRING) || 2288 (type1 == XML_REGEXP_ANYCHAR)) 2289 return(1); 2290 if ((type2 == XML_REGEXP_EPSILON) || 2291 (type2 == XML_REGEXP_CHARVAL) || 2292 (type2 == XML_REGEXP_RANGES) || 2293 (type2 == XML_REGEXP_SUBREG) || 2294 (type2 == XML_REGEXP_STRING) || 2295 (type2 == XML_REGEXP_ANYCHAR)) 2296 return(1); 2297 2298 if (type1 == type2) return(1); 2299 2300 /* simplify subsequent compares by making sure type1 < type2 */ 2301 if (type1 > type2) { 2302 xmlRegAtomType tmp = type1; 2303 type1 = type2; 2304 type2 = tmp; 2305 } 2306 switch (type1) { 2307 case XML_REGEXP_ANYSPACE: /* \s */ 2308 /* can't be a letter, number, mark, pontuation, symbol */ 2309 if ((type2 == XML_REGEXP_NOTSPACE) || 2310 ((type2 >= XML_REGEXP_LETTER) && 2311 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2312 ((type2 >= XML_REGEXP_NUMBER) && 2313 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2314 ((type2 >= XML_REGEXP_MARK) && 2315 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2316 ((type2 >= XML_REGEXP_PUNCT) && 2317 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2318 ((type2 >= XML_REGEXP_SYMBOL) && 2319 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) 2320 ) return(0); 2321 break; 2322 case XML_REGEXP_NOTSPACE: /* \S */ 2323 break; 2324 case XML_REGEXP_INITNAME: /* \l */ 2325 /* can't be a number, mark, separator, pontuation, symbol or other */ 2326 if ((type2 == XML_REGEXP_NOTINITNAME) || 2327 ((type2 >= XML_REGEXP_NUMBER) && 2328 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2329 ((type2 >= XML_REGEXP_MARK) && 2330 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2331 ((type2 >= XML_REGEXP_SEPAR) && 2332 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2333 ((type2 >= XML_REGEXP_PUNCT) && 2334 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2335 ((type2 >= XML_REGEXP_SYMBOL) && 2336 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2337 ((type2 >= XML_REGEXP_OTHER) && 2338 (type2 <= XML_REGEXP_OTHER_NA)) 2339 ) return(0); 2340 break; 2341 case XML_REGEXP_NOTINITNAME: /* \L */ 2342 break; 2343 case XML_REGEXP_NAMECHAR: /* \c */ 2344 /* can't be a mark, separator, pontuation, symbol or other */ 2345 if ((type2 == XML_REGEXP_NOTNAMECHAR) || 2346 ((type2 >= XML_REGEXP_MARK) && 2347 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2348 ((type2 >= XML_REGEXP_PUNCT) && 2349 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2350 ((type2 >= XML_REGEXP_SEPAR) && 2351 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2352 ((type2 >= XML_REGEXP_SYMBOL) && 2353 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2354 ((type2 >= XML_REGEXP_OTHER) && 2355 (type2 <= XML_REGEXP_OTHER_NA)) 2356 ) return(0); 2357 break; 2358 case XML_REGEXP_NOTNAMECHAR: /* \C */ 2359 break; 2360 case XML_REGEXP_DECIMAL: /* \d */ 2361 /* can't be a letter, mark, separator, pontuation, symbol or other */ 2362 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2363 (type2 == XML_REGEXP_REALCHAR) || 2364 ((type2 >= XML_REGEXP_LETTER) && 2365 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2366 ((type2 >= XML_REGEXP_MARK) && 2367 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2368 ((type2 >= XML_REGEXP_PUNCT) && 2369 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2370 ((type2 >= XML_REGEXP_SEPAR) && 2371 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2372 ((type2 >= XML_REGEXP_SYMBOL) && 2373 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2374 ((type2 >= XML_REGEXP_OTHER) && 2375 (type2 <= XML_REGEXP_OTHER_NA)) 2376 )return(0); 2377 break; 2378 case XML_REGEXP_NOTDECIMAL: /* \D */ 2379 break; 2380 case XML_REGEXP_REALCHAR: /* \w */ 2381 /* can't be a mark, separator, pontuation, symbol or other */ 2382 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2383 ((type2 >= XML_REGEXP_MARK) && 2384 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2385 ((type2 >= XML_REGEXP_PUNCT) && 2386 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2387 ((type2 >= XML_REGEXP_SEPAR) && 2388 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2389 ((type2 >= XML_REGEXP_SYMBOL) && 2390 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2391 ((type2 >= XML_REGEXP_OTHER) && 2392 (type2 <= XML_REGEXP_OTHER_NA)) 2393 )return(0); 2394 break; 2395 case XML_REGEXP_NOTREALCHAR: /* \W */ 2396 break; 2397 /* 2398 * at that point we know both type 1 and type2 are from 2399 * character categories are ordered and are different, 2400 * it becomes simple because this is a partition 2401 */ 2402 case XML_REGEXP_LETTER: 2403 if (type2 <= XML_REGEXP_LETTER_OTHERS) 2404 return(1); 2405 return(0); 2406 case XML_REGEXP_LETTER_UPPERCASE: 2407 case XML_REGEXP_LETTER_LOWERCASE: 2408 case XML_REGEXP_LETTER_TITLECASE: 2409 case XML_REGEXP_LETTER_MODIFIER: 2410 case XML_REGEXP_LETTER_OTHERS: 2411 return(0); 2412 case XML_REGEXP_MARK: 2413 if (type2 <= XML_REGEXP_MARK_ENCLOSING) 2414 return(1); 2415 return(0); 2416 case XML_REGEXP_MARK_NONSPACING: 2417 case XML_REGEXP_MARK_SPACECOMBINING: 2418 case XML_REGEXP_MARK_ENCLOSING: 2419 return(0); 2420 case XML_REGEXP_NUMBER: 2421 if (type2 <= XML_REGEXP_NUMBER_OTHERS) 2422 return(1); 2423 return(0); 2424 case XML_REGEXP_NUMBER_DECIMAL: 2425 case XML_REGEXP_NUMBER_LETTER: 2426 case XML_REGEXP_NUMBER_OTHERS: 2427 return(0); 2428 case XML_REGEXP_PUNCT: 2429 if (type2 <= XML_REGEXP_PUNCT_OTHERS) 2430 return(1); 2431 return(0); 2432 case XML_REGEXP_PUNCT_CONNECTOR: 2433 case XML_REGEXP_PUNCT_DASH: 2434 case XML_REGEXP_PUNCT_OPEN: 2435 case XML_REGEXP_PUNCT_CLOSE: 2436 case XML_REGEXP_PUNCT_INITQUOTE: 2437 case XML_REGEXP_PUNCT_FINQUOTE: 2438 case XML_REGEXP_PUNCT_OTHERS: 2439 return(0); 2440 case XML_REGEXP_SEPAR: 2441 if (type2 <= XML_REGEXP_SEPAR_PARA) 2442 return(1); 2443 return(0); 2444 case XML_REGEXP_SEPAR_SPACE: 2445 case XML_REGEXP_SEPAR_LINE: 2446 case XML_REGEXP_SEPAR_PARA: 2447 return(0); 2448 case XML_REGEXP_SYMBOL: 2449 if (type2 <= XML_REGEXP_SYMBOL_OTHERS) 2450 return(1); 2451 return(0); 2452 case XML_REGEXP_SYMBOL_MATH: 2453 case XML_REGEXP_SYMBOL_CURRENCY: 2454 case XML_REGEXP_SYMBOL_MODIFIER: 2455 case XML_REGEXP_SYMBOL_OTHERS: 2456 return(0); 2457 case XML_REGEXP_OTHER: 2458 if (type2 <= XML_REGEXP_OTHER_NA) 2459 return(1); 2460 return(0); 2461 case XML_REGEXP_OTHER_CONTROL: 2462 case XML_REGEXP_OTHER_FORMAT: 2463 case XML_REGEXP_OTHER_PRIVATE: 2464 case XML_REGEXP_OTHER_NA: 2465 return(0); 2466 default: 2467 break; 2468 } 2469 return(1); 2470 } 2471 2472 /** 2473 * xmlFAEqualAtoms: 2474 * @atom1: an atom 2475 * @atom2: an atom 2476 * @deep: if not set only compare string pointers 2477 * 2478 * Compares two atoms to check whether they are the same exactly 2479 * this is used to remove equivalent transitions 2480 * 2481 * Returns 1 if same and 0 otherwise 2482 */ 2483 static int 2484 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2485 int ret = 0; 2486 2487 if (atom1 == atom2) 2488 return(1); 2489 if ((atom1 == NULL) || (atom2 == NULL)) 2490 return(0); 2491 2492 if (atom1->type != atom2->type) 2493 return(0); 2494 switch (atom1->type) { 2495 case XML_REGEXP_EPSILON: 2496 ret = 0; 2497 break; 2498 case XML_REGEXP_STRING: 2499 if (!deep) 2500 ret = (atom1->valuep == atom2->valuep); 2501 else 2502 ret = xmlStrEqual((xmlChar *)atom1->valuep, 2503 (xmlChar *)atom2->valuep); 2504 break; 2505 case XML_REGEXP_CHARVAL: 2506 ret = (atom1->codepoint == atom2->codepoint); 2507 break; 2508 case XML_REGEXP_RANGES: 2509 /* too hard to do in the general case */ 2510 ret = 0; 2511 default: 2512 break; 2513 } 2514 return(ret); 2515 } 2516 2517 /** 2518 * xmlFACompareAtoms: 2519 * @atom1: an atom 2520 * @atom2: an atom 2521 * @deep: if not set only compare string pointers 2522 * 2523 * Compares two atoms to check whether they intersect in some ways, 2524 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only 2525 * 2526 * Returns 1 if yes and 0 otherwise 2527 */ 2528 static int 2529 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { 2530 int ret = 1; 2531 2532 if (atom1 == atom2) 2533 return(1); 2534 if ((atom1 == NULL) || (atom2 == NULL)) 2535 return(0); 2536 2537 if ((atom1->type == XML_REGEXP_ANYCHAR) || 2538 (atom2->type == XML_REGEXP_ANYCHAR)) 2539 return(1); 2540 2541 if (atom1->type > atom2->type) { 2542 xmlRegAtomPtr tmp; 2543 tmp = atom1; 2544 atom1 = atom2; 2545 atom2 = tmp; 2546 } 2547 if (atom1->type != atom2->type) { 2548 ret = xmlFACompareAtomTypes(atom1->type, atom2->type); 2549 /* if they can't intersect at the type level break now */ 2550 if (ret == 0) 2551 return(0); 2552 } 2553 switch (atom1->type) { 2554 case XML_REGEXP_STRING: 2555 if (!deep) 2556 ret = (atom1->valuep != atom2->valuep); 2557 else 2558 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, 2559 (xmlChar *)atom2->valuep); 2560 break; 2561 case XML_REGEXP_EPSILON: 2562 goto not_determinist; 2563 case XML_REGEXP_CHARVAL: 2564 if (atom2->type == XML_REGEXP_CHARVAL) { 2565 ret = (atom1->codepoint == atom2->codepoint); 2566 } else { 2567 ret = xmlRegCheckCharacter(atom2, atom1->codepoint); 2568 if (ret < 0) 2569 ret = 1; 2570 } 2571 break; 2572 case XML_REGEXP_RANGES: 2573 if (atom2->type == XML_REGEXP_RANGES) { 2574 int i, j, res; 2575 xmlRegRangePtr r1, r2; 2576 2577 /* 2578 * need to check that none of the ranges eventually matches 2579 */ 2580 for (i = 0;i < atom1->nbRanges;i++) { 2581 for (j = 0;j < atom2->nbRanges;j++) { 2582 r1 = atom1->ranges[i]; 2583 r2 = atom2->ranges[j]; 2584 res = xmlFACompareRanges(r1, r2); 2585 if (res == 1) { 2586 ret = 1; 2587 goto done; 2588 } 2589 } 2590 } 2591 ret = 0; 2592 } 2593 break; 2594 default: 2595 goto not_determinist; 2596 } 2597 done: 2598 if (atom1->neg != atom2->neg) { 2599 ret = !ret; 2600 } 2601 if (ret == 0) 2602 return(0); 2603 not_determinist: 2604 return(1); 2605 } 2606 2607 /** 2608 * xmlFARecurseDeterminism: 2609 * @ctxt: a regexp parser context 2610 * 2611 * Check whether the associated regexp is determinist, 2612 * should be called after xmlFAEliminateEpsilonTransitions() 2613 * 2614 */ 2615 static int 2616 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 2617 int to, xmlRegAtomPtr atom) { 2618 int ret = 1; 2619 int res; 2620 int transnr, nbTrans; 2621 xmlRegTransPtr t1; 2622 int deep = 1; 2623 2624 if (state == NULL) 2625 return(ret); 2626 if (state->markd == XML_REGEXP_MARK_VISITED) 2627 return(ret); 2628 2629 if (ctxt->flags & AM_AUTOMATA_RNG) 2630 deep = 0; 2631 2632 /* 2633 * don't recurse on transitions potentially added in the course of 2634 * the elimination. 2635 */ 2636 nbTrans = state->nbTrans; 2637 for (transnr = 0;transnr < nbTrans;transnr++) { 2638 t1 = &(state->trans[transnr]); 2639 /* 2640 * check transitions conflicting with the one looked at 2641 */ 2642 if (t1->atom == NULL) { 2643 if (t1->to < 0) 2644 continue; 2645 state->markd = XML_REGEXP_MARK_VISITED; 2646 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2647 to, atom); 2648 state->markd = 0; 2649 if (res == 0) { 2650 ret = 0; 2651 /* t1->nd = 1; */ 2652 } 2653 continue; 2654 } 2655 if (t1->to != to) 2656 continue; 2657 if (xmlFACompareAtoms(t1->atom, atom, deep)) { 2658 ret = 0; 2659 /* mark the transition as non-deterministic */ 2660 t1->nd = 1; 2661 } 2662 } 2663 return(ret); 2664 } 2665 2666 /** 2667 * xmlFAComputesDeterminism: 2668 * @ctxt: a regexp parser context 2669 * 2670 * Check whether the associated regexp is determinist, 2671 * should be called after xmlFAEliminateEpsilonTransitions() 2672 * 2673 */ 2674 static int 2675 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 2676 int statenr, transnr; 2677 xmlRegStatePtr state; 2678 xmlRegTransPtr t1, t2, last; 2679 int i; 2680 int ret = 1; 2681 int deep = 1; 2682 2683 #ifdef DEBUG_REGEXP_GRAPH 2684 printf("xmlFAComputesDeterminism\n"); 2685 xmlRegPrintCtxt(stdout, ctxt); 2686 #endif 2687 if (ctxt->determinist != -1) 2688 return(ctxt->determinist); 2689 2690 if (ctxt->flags & AM_AUTOMATA_RNG) 2691 deep = 0; 2692 2693 /* 2694 * First cleanup the automata removing cancelled transitions 2695 */ 2696 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2697 state = ctxt->states[statenr]; 2698 if (state == NULL) 2699 continue; 2700 if (state->nbTrans < 2) 2701 continue; 2702 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2703 t1 = &(state->trans[transnr]); 2704 /* 2705 * Determinism checks in case of counted or all transitions 2706 * will have to be handled separately 2707 */ 2708 if (t1->atom == NULL) { 2709 /* t1->nd = 1; */ 2710 continue; 2711 } 2712 if (t1->to == -1) /* eliminated */ 2713 continue; 2714 for (i = 0;i < transnr;i++) { 2715 t2 = &(state->trans[i]); 2716 if (t2->to == -1) /* eliminated */ 2717 continue; 2718 if (t2->atom != NULL) { 2719 if (t1->to == t2->to) { 2720 /* 2721 * Here we use deep because we want to keep the 2722 * transitions which indicate a conflict 2723 */ 2724 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) && 2725 (t1->counter == t2->counter) && 2726 (t1->count == t2->count)) 2727 t2->to = -1; /* eliminated */ 2728 } 2729 } 2730 } 2731 } 2732 } 2733 2734 /* 2735 * Check for all states that there aren't 2 transitions 2736 * with the same atom and a different target. 2737 */ 2738 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2739 state = ctxt->states[statenr]; 2740 if (state == NULL) 2741 continue; 2742 if (state->nbTrans < 2) 2743 continue; 2744 last = NULL; 2745 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2746 t1 = &(state->trans[transnr]); 2747 /* 2748 * Determinism checks in case of counted or all transitions 2749 * will have to be handled separately 2750 */ 2751 if (t1->atom == NULL) { 2752 continue; 2753 } 2754 if (t1->to == -1) /* eliminated */ 2755 continue; 2756 for (i = 0;i < transnr;i++) { 2757 t2 = &(state->trans[i]); 2758 if (t2->to == -1) /* eliminated */ 2759 continue; 2760 if (t2->atom != NULL) { 2761 /* 2762 * But here we don't use deep because we want to 2763 * find transitions which indicate a conflict 2764 */ 2765 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) { 2766 ret = 0; 2767 /* mark the transitions as non-deterministic ones */ 2768 t1->nd = 1; 2769 t2->nd = 1; 2770 last = t1; 2771 } 2772 } else if (t1->to != -1) { 2773 /* 2774 * do the closure in case of remaining specific 2775 * epsilon transitions like choices or all 2776 */ 2777 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2778 t2->to, t2->atom); 2779 /* don't shortcut the computation so all non deterministic 2780 transition get marked down 2781 if (ret == 0) 2782 return(0); 2783 */ 2784 if (ret == 0) { 2785 t1->nd = 1; 2786 /* t2->nd = 1; */ 2787 last = t1; 2788 } 2789 } 2790 } 2791 /* don't shortcut the computation so all non deterministic 2792 transition get marked down 2793 if (ret == 0) 2794 break; */ 2795 } 2796 2797 /* 2798 * mark specifically the last non-deterministic transition 2799 * from a state since there is no need to set-up rollback 2800 * from it 2801 */ 2802 if (last != NULL) { 2803 last->nd = 2; 2804 } 2805 2806 /* don't shortcut the computation so all non deterministic 2807 transition get marked down 2808 if (ret == 0) 2809 break; */ 2810 } 2811 2812 ctxt->determinist = ret; 2813 return(ret); 2814 } 2815 2816 /************************************************************************ 2817 * * 2818 * Routines to check input against transition atoms * 2819 * * 2820 ************************************************************************/ 2821 2822 static int 2823 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 2824 int start, int end, const xmlChar *blockName) { 2825 int ret = 0; 2826 2827 switch (type) { 2828 case XML_REGEXP_STRING: 2829 case XML_REGEXP_SUBREG: 2830 case XML_REGEXP_RANGES: 2831 case XML_REGEXP_EPSILON: 2832 return(-1); 2833 case XML_REGEXP_ANYCHAR: 2834 ret = ((codepoint != '\n') && (codepoint != '\r')); 2835 break; 2836 case XML_REGEXP_CHARVAL: 2837 ret = ((codepoint >= start) && (codepoint <= end)); 2838 break; 2839 case XML_REGEXP_NOTSPACE: 2840 neg = !neg; 2841 case XML_REGEXP_ANYSPACE: 2842 ret = ((codepoint == '\n') || (codepoint == '\r') || 2843 (codepoint == '\t') || (codepoint == ' ')); 2844 break; 2845 case XML_REGEXP_NOTINITNAME: 2846 neg = !neg; 2847 case XML_REGEXP_INITNAME: 2848 ret = (IS_LETTER(codepoint) || 2849 (codepoint == '_') || (codepoint == ':')); 2850 break; 2851 case XML_REGEXP_NOTNAMECHAR: 2852 neg = !neg; 2853 case XML_REGEXP_NAMECHAR: 2854 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 2855 (codepoint == '.') || (codepoint == '-') || 2856 (codepoint == '_') || (codepoint == ':') || 2857 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 2858 break; 2859 case XML_REGEXP_NOTDECIMAL: 2860 neg = !neg; 2861 case XML_REGEXP_DECIMAL: 2862 ret = xmlUCSIsCatNd(codepoint); 2863 break; 2864 case XML_REGEXP_REALCHAR: 2865 neg = !neg; 2866 case XML_REGEXP_NOTREALCHAR: 2867 ret = xmlUCSIsCatP(codepoint); 2868 if (ret == 0) 2869 ret = xmlUCSIsCatZ(codepoint); 2870 if (ret == 0) 2871 ret = xmlUCSIsCatC(codepoint); 2872 break; 2873 case XML_REGEXP_LETTER: 2874 ret = xmlUCSIsCatL(codepoint); 2875 break; 2876 case XML_REGEXP_LETTER_UPPERCASE: 2877 ret = xmlUCSIsCatLu(codepoint); 2878 break; 2879 case XML_REGEXP_LETTER_LOWERCASE: 2880 ret = xmlUCSIsCatLl(codepoint); 2881 break; 2882 case XML_REGEXP_LETTER_TITLECASE: 2883 ret = xmlUCSIsCatLt(codepoint); 2884 break; 2885 case XML_REGEXP_LETTER_MODIFIER: 2886 ret = xmlUCSIsCatLm(codepoint); 2887 break; 2888 case XML_REGEXP_LETTER_OTHERS: 2889 ret = xmlUCSIsCatLo(codepoint); 2890 break; 2891 case XML_REGEXP_MARK: 2892 ret = xmlUCSIsCatM(codepoint); 2893 break; 2894 case XML_REGEXP_MARK_NONSPACING: 2895 ret = xmlUCSIsCatMn(codepoint); 2896 break; 2897 case XML_REGEXP_MARK_SPACECOMBINING: 2898 ret = xmlUCSIsCatMc(codepoint); 2899 break; 2900 case XML_REGEXP_MARK_ENCLOSING: 2901 ret = xmlUCSIsCatMe(codepoint); 2902 break; 2903 case XML_REGEXP_NUMBER: 2904 ret = xmlUCSIsCatN(codepoint); 2905 break; 2906 case XML_REGEXP_NUMBER_DECIMAL: 2907 ret = xmlUCSIsCatNd(codepoint); 2908 break; 2909 case XML_REGEXP_NUMBER_LETTER: 2910 ret = xmlUCSIsCatNl(codepoint); 2911 break; 2912 case XML_REGEXP_NUMBER_OTHERS: 2913 ret = xmlUCSIsCatNo(codepoint); 2914 break; 2915 case XML_REGEXP_PUNCT: 2916 ret = xmlUCSIsCatP(codepoint); 2917 break; 2918 case XML_REGEXP_PUNCT_CONNECTOR: 2919 ret = xmlUCSIsCatPc(codepoint); 2920 break; 2921 case XML_REGEXP_PUNCT_DASH: 2922 ret = xmlUCSIsCatPd(codepoint); 2923 break; 2924 case XML_REGEXP_PUNCT_OPEN: 2925 ret = xmlUCSIsCatPs(codepoint); 2926 break; 2927 case XML_REGEXP_PUNCT_CLOSE: 2928 ret = xmlUCSIsCatPe(codepoint); 2929 break; 2930 case XML_REGEXP_PUNCT_INITQUOTE: 2931 ret = xmlUCSIsCatPi(codepoint); 2932 break; 2933 case XML_REGEXP_PUNCT_FINQUOTE: 2934 ret = xmlUCSIsCatPf(codepoint); 2935 break; 2936 case XML_REGEXP_PUNCT_OTHERS: 2937 ret = xmlUCSIsCatPo(codepoint); 2938 break; 2939 case XML_REGEXP_SEPAR: 2940 ret = xmlUCSIsCatZ(codepoint); 2941 break; 2942 case XML_REGEXP_SEPAR_SPACE: 2943 ret = xmlUCSIsCatZs(codepoint); 2944 break; 2945 case XML_REGEXP_SEPAR_LINE: 2946 ret = xmlUCSIsCatZl(codepoint); 2947 break; 2948 case XML_REGEXP_SEPAR_PARA: 2949 ret = xmlUCSIsCatZp(codepoint); 2950 break; 2951 case XML_REGEXP_SYMBOL: 2952 ret = xmlUCSIsCatS(codepoint); 2953 break; 2954 case XML_REGEXP_SYMBOL_MATH: 2955 ret = xmlUCSIsCatSm(codepoint); 2956 break; 2957 case XML_REGEXP_SYMBOL_CURRENCY: 2958 ret = xmlUCSIsCatSc(codepoint); 2959 break; 2960 case XML_REGEXP_SYMBOL_MODIFIER: 2961 ret = xmlUCSIsCatSk(codepoint); 2962 break; 2963 case XML_REGEXP_SYMBOL_OTHERS: 2964 ret = xmlUCSIsCatSo(codepoint); 2965 break; 2966 case XML_REGEXP_OTHER: 2967 ret = xmlUCSIsCatC(codepoint); 2968 break; 2969 case XML_REGEXP_OTHER_CONTROL: 2970 ret = xmlUCSIsCatCc(codepoint); 2971 break; 2972 case XML_REGEXP_OTHER_FORMAT: 2973 ret = xmlUCSIsCatCf(codepoint); 2974 break; 2975 case XML_REGEXP_OTHER_PRIVATE: 2976 ret = xmlUCSIsCatCo(codepoint); 2977 break; 2978 case XML_REGEXP_OTHER_NA: 2979 /* ret = xmlUCSIsCatCn(codepoint); */ 2980 /* Seems it doesn't exist anymore in recent Unicode releases */ 2981 ret = 0; 2982 break; 2983 case XML_REGEXP_BLOCK_NAME: 2984 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 2985 break; 2986 } 2987 if (neg) 2988 return(!ret); 2989 return(ret); 2990 } 2991 2992 static int 2993 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 2994 int i, ret = 0; 2995 xmlRegRangePtr range; 2996 2997 if ((atom == NULL) || (!IS_CHAR(codepoint))) 2998 return(-1); 2999 3000 switch (atom->type) { 3001 case XML_REGEXP_SUBREG: 3002 case XML_REGEXP_EPSILON: 3003 return(-1); 3004 case XML_REGEXP_CHARVAL: 3005 return(codepoint == atom->codepoint); 3006 case XML_REGEXP_RANGES: { 3007 int accept = 0; 3008 3009 for (i = 0;i < atom->nbRanges;i++) { 3010 range = atom->ranges[i]; 3011 if (range->neg == 2) { 3012 ret = xmlRegCheckCharacterRange(range->type, codepoint, 3013 0, range->start, range->end, 3014 range->blockName); 3015 if (ret != 0) 3016 return(0); /* excluded char */ 3017 } else if (range->neg) { 3018 ret = xmlRegCheckCharacterRange(range->type, codepoint, 3019 0, range->start, range->end, 3020 range->blockName); 3021 if (ret == 0) 3022 accept = 1; 3023 else 3024 return(0); 3025 } else { 3026 ret = xmlRegCheckCharacterRange(range->type, codepoint, 3027 0, range->start, range->end, 3028 range->blockName); 3029 if (ret != 0) 3030 accept = 1; /* might still be excluded */ 3031 } 3032 } 3033 return(accept); 3034 } 3035 case XML_REGEXP_STRING: 3036 printf("TODO: XML_REGEXP_STRING\n"); 3037 return(-1); 3038 case XML_REGEXP_ANYCHAR: 3039 case XML_REGEXP_ANYSPACE: 3040 case XML_REGEXP_NOTSPACE: 3041 case XML_REGEXP_INITNAME: 3042 case XML_REGEXP_NOTINITNAME: 3043 case XML_REGEXP_NAMECHAR: 3044 case XML_REGEXP_NOTNAMECHAR: 3045 case XML_REGEXP_DECIMAL: 3046 case XML_REGEXP_NOTDECIMAL: 3047 case XML_REGEXP_REALCHAR: 3048 case XML_REGEXP_NOTREALCHAR: 3049 case XML_REGEXP_LETTER: 3050 case XML_REGEXP_LETTER_UPPERCASE: 3051 case XML_REGEXP_LETTER_LOWERCASE: 3052 case XML_REGEXP_LETTER_TITLECASE: 3053 case XML_REGEXP_LETTER_MODIFIER: 3054 case XML_REGEXP_LETTER_OTHERS: 3055 case XML_REGEXP_MARK: 3056 case XML_REGEXP_MARK_NONSPACING: 3057 case XML_REGEXP_MARK_SPACECOMBINING: 3058 case XML_REGEXP_MARK_ENCLOSING: 3059 case XML_REGEXP_NUMBER: 3060 case XML_REGEXP_NUMBER_DECIMAL: 3061 case XML_REGEXP_NUMBER_LETTER: 3062 case XML_REGEXP_NUMBER_OTHERS: 3063 case XML_REGEXP_PUNCT: 3064 case XML_REGEXP_PUNCT_CONNECTOR: 3065 case XML_REGEXP_PUNCT_DASH: 3066 case XML_REGEXP_PUNCT_OPEN: 3067 case XML_REGEXP_PUNCT_CLOSE: 3068 case XML_REGEXP_PUNCT_INITQUOTE: 3069 case XML_REGEXP_PUNCT_FINQUOTE: 3070 case XML_REGEXP_PUNCT_OTHERS: 3071 case XML_REGEXP_SEPAR: 3072 case XML_REGEXP_SEPAR_SPACE: 3073 case XML_REGEXP_SEPAR_LINE: 3074 case XML_REGEXP_SEPAR_PARA: 3075 case XML_REGEXP_SYMBOL: 3076 case XML_REGEXP_SYMBOL_MATH: 3077 case XML_REGEXP_SYMBOL_CURRENCY: 3078 case XML_REGEXP_SYMBOL_MODIFIER: 3079 case XML_REGEXP_SYMBOL_OTHERS: 3080 case XML_REGEXP_OTHER: 3081 case XML_REGEXP_OTHER_CONTROL: 3082 case XML_REGEXP_OTHER_FORMAT: 3083 case XML_REGEXP_OTHER_PRIVATE: 3084 case XML_REGEXP_OTHER_NA: 3085 case XML_REGEXP_BLOCK_NAME: 3086 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 3087 (const xmlChar *)atom->valuep); 3088 if (atom->neg) 3089 ret = !ret; 3090 break; 3091 } 3092 return(ret); 3093 } 3094 3095 /************************************************************************ 3096 * * 3097 * Saving and restoring state of an execution context * 3098 * * 3099 ************************************************************************/ 3100 3101 #ifdef DEBUG_REGEXP_EXEC 3102 static void 3103 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 3104 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 3105 if (exec->inputStack != NULL) { 3106 int i; 3107 printf(": "); 3108 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 3109 printf("%s ", (const char *) 3110 exec->inputStack[exec->inputStackNr - (i + 1)].value); 3111 } else { 3112 printf(": %s", &(exec->inputString[exec->index])); 3113 } 3114 printf("\n"); 3115 } 3116 #endif 3117 3118 static void 3119 xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 3120 #ifdef DEBUG_REGEXP_EXEC 3121 printf("saving "); 3122 exec->transno++; 3123 xmlFARegDebugExec(exec); 3124 exec->transno--; 3125 #endif 3126 #ifdef MAX_PUSH 3127 if (exec->nbPush > MAX_PUSH) { 3128 return; 3129 } 3130 exec->nbPush++; 3131 #endif 3132 3133 if (exec->maxRollbacks == 0) { 3134 exec->maxRollbacks = 4; 3135 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 3136 sizeof(xmlRegExecRollback)); 3137 if (exec->rollbacks == NULL) { 3138 xmlRegexpErrMemory(NULL, "saving regexp"); 3139 exec->maxRollbacks = 0; 3140 return; 3141 } 3142 memset(exec->rollbacks, 0, 3143 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3144 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 3145 xmlRegExecRollback *tmp; 3146 int len = exec->maxRollbacks; 3147 3148 exec->maxRollbacks *= 2; 3149 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 3150 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 3151 if (tmp == NULL) { 3152 xmlRegexpErrMemory(NULL, "saving regexp"); 3153 exec->maxRollbacks /= 2; 3154 return; 3155 } 3156 exec->rollbacks = tmp; 3157 tmp = &exec->rollbacks[len]; 3158 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 3159 } 3160 exec->rollbacks[exec->nbRollbacks].state = exec->state; 3161 exec->rollbacks[exec->nbRollbacks].index = exec->index; 3162 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 3163 if (exec->comp->nbCounters > 0) { 3164 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3165 exec->rollbacks[exec->nbRollbacks].counts = (int *) 3166 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 3167 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3168 xmlRegexpErrMemory(NULL, "saving regexp"); 3169 exec->status = -5; 3170 return; 3171 } 3172 } 3173 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 3174 exec->comp->nbCounters * sizeof(int)); 3175 } 3176 exec->nbRollbacks++; 3177 } 3178 3179 static void 3180 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 3181 if (exec->nbRollbacks <= 0) { 3182 exec->status = -1; 3183 #ifdef DEBUG_REGEXP_EXEC 3184 printf("rollback failed on empty stack\n"); 3185 #endif 3186 return; 3187 } 3188 exec->nbRollbacks--; 3189 exec->state = exec->rollbacks[exec->nbRollbacks].state; 3190 exec->index = exec->rollbacks[exec->nbRollbacks].index; 3191 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 3192 if (exec->comp->nbCounters > 0) { 3193 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 3194 fprintf(stderr, "exec save: allocation failed"); 3195 exec->status = -6; 3196 return; 3197 } 3198 if (exec->counts) { 3199 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 3200 exec->comp->nbCounters * sizeof(int)); 3201 } 3202 } 3203 3204 #ifdef DEBUG_REGEXP_EXEC 3205 printf("restored "); 3206 xmlFARegDebugExec(exec); 3207 #endif 3208 } 3209 3210 /************************************************************************ 3211 * * 3212 * Verifier, running an input against a compiled regexp * 3213 * * 3214 ************************************************************************/ 3215 3216 static int 3217 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 3218 xmlRegExecCtxt execval; 3219 xmlRegExecCtxtPtr exec = &execval; 3220 int ret, codepoint = 0, len, deter; 3221 3222 exec->inputString = content; 3223 exec->index = 0; 3224 exec->nbPush = 0; 3225 exec->determinist = 1; 3226 exec->maxRollbacks = 0; 3227 exec->nbRollbacks = 0; 3228 exec->rollbacks = NULL; 3229 exec->status = 0; 3230 exec->comp = comp; 3231 exec->state = comp->states[0]; 3232 exec->transno = 0; 3233 exec->transcount = 0; 3234 exec->inputStack = NULL; 3235 exec->inputStackMax = 0; 3236 if (comp->nbCounters > 0) { 3237 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 3238 if (exec->counts == NULL) { 3239 xmlRegexpErrMemory(NULL, "running regexp"); 3240 return(-1); 3241 } 3242 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 3243 } else 3244 exec->counts = NULL; 3245 while ((exec->status == 0) && (exec->state != NULL) && 3246 ((exec->inputString[exec->index] != 0) || 3247 ((exec->state != NULL) && 3248 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3249 xmlRegTransPtr trans; 3250 xmlRegAtomPtr atom; 3251 3252 /* 3253 * If end of input on non-terminal state, rollback, however we may 3254 * still have epsilon like transition for counted transitions 3255 * on counters, in that case don't break too early. Additionally, 3256 * if we are working on a range like "AB{0,2}", where B is not present, 3257 * we don't want to break. 3258 */ 3259 len = 1; 3260 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) { 3261 /* 3262 * if there is a transition, we must check if 3263 * atom allows minOccurs of 0 3264 */ 3265 if (exec->transno < exec->state->nbTrans) { 3266 trans = &exec->state->trans[exec->transno]; 3267 if (trans->to >=0) { 3268 atom = trans->atom; 3269 if (!((atom->min == 0) && (atom->max > 0))) 3270 goto rollback; 3271 } 3272 } else 3273 goto rollback; 3274 } 3275 3276 exec->transcount = 0; 3277 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3278 trans = &exec->state->trans[exec->transno]; 3279 if (trans->to < 0) 3280 continue; 3281 atom = trans->atom; 3282 ret = 0; 3283 deter = 1; 3284 if (trans->count >= 0) { 3285 int count; 3286 xmlRegCounterPtr counter; 3287 3288 if (exec->counts == NULL) { 3289 exec->status = -1; 3290 goto error; 3291 } 3292 /* 3293 * A counted transition. 3294 */ 3295 3296 count = exec->counts[trans->count]; 3297 counter = &exec->comp->counters[trans->count]; 3298 #ifdef DEBUG_REGEXP_EXEC 3299 printf("testing count %d: val %d, min %d, max %d\n", 3300 trans->count, count, counter->min, counter->max); 3301 #endif 3302 ret = ((count >= counter->min) && (count <= counter->max)); 3303 if ((ret) && (counter->min != counter->max)) 3304 deter = 0; 3305 } else if (atom == NULL) { 3306 fprintf(stderr, "epsilon transition left at runtime\n"); 3307 exec->status = -2; 3308 break; 3309 } else if (exec->inputString[exec->index] != 0) { 3310 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3311 ret = xmlRegCheckCharacter(atom, codepoint); 3312 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) { 3313 xmlRegStatePtr to = comp->states[trans->to]; 3314 3315 /* 3316 * this is a multiple input sequence 3317 * If there is a counter associated increment it now. 3318 * before potentially saving and rollback 3319 * do not increment if the counter is already over the 3320 * maximum limit in which case get to next transition 3321 */ 3322 if (trans->counter >= 0) { 3323 xmlRegCounterPtr counter; 3324 3325 if ((exec->counts == NULL) || 3326 (exec->comp == NULL) || 3327 (exec->comp->counters == NULL)) { 3328 exec->status = -1; 3329 goto error; 3330 } 3331 counter = &exec->comp->counters[trans->counter]; 3332 if (exec->counts[trans->counter] >= counter->max) 3333 continue; /* for loop on transitions */ 3334 3335 #ifdef DEBUG_REGEXP_EXEC 3336 printf("Increasing count %d\n", trans->counter); 3337 #endif 3338 exec->counts[trans->counter]++; 3339 } 3340 if (exec->state->nbTrans > exec->transno + 1) { 3341 xmlFARegExecSave(exec); 3342 } 3343 exec->transcount = 1; 3344 do { 3345 /* 3346 * Try to progress as much as possible on the input 3347 */ 3348 if (exec->transcount == atom->max) { 3349 break; 3350 } 3351 exec->index += len; 3352 /* 3353 * End of input: stop here 3354 */ 3355 if (exec->inputString[exec->index] == 0) { 3356 exec->index -= len; 3357 break; 3358 } 3359 if (exec->transcount >= atom->min) { 3360 int transno = exec->transno; 3361 xmlRegStatePtr state = exec->state; 3362 3363 /* 3364 * The transition is acceptable save it 3365 */ 3366 exec->transno = -1; /* trick */ 3367 exec->state = to; 3368 xmlFARegExecSave(exec); 3369 exec->transno = transno; 3370 exec->state = state; 3371 } 3372 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3373 len); 3374 ret = xmlRegCheckCharacter(atom, codepoint); 3375 exec->transcount++; 3376 } while (ret == 1); 3377 if (exec->transcount < atom->min) 3378 ret = 0; 3379 3380 /* 3381 * If the last check failed but one transition was found 3382 * possible, rollback 3383 */ 3384 if (ret < 0) 3385 ret = 0; 3386 if (ret == 0) { 3387 goto rollback; 3388 } 3389 if (trans->counter >= 0) { 3390 if (exec->counts == NULL) { 3391 exec->status = -1; 3392 goto error; 3393 } 3394 #ifdef DEBUG_REGEXP_EXEC 3395 printf("Decreasing count %d\n", trans->counter); 3396 #endif 3397 exec->counts[trans->counter]--; 3398 } 3399 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { 3400 /* 3401 * we don't match on the codepoint, but minOccurs of 0 3402 * says that's ok. Setting len to 0 inhibits stepping 3403 * over the codepoint. 3404 */ 3405 exec->transcount = 1; 3406 len = 0; 3407 ret = 1; 3408 } 3409 } else if ((atom->min == 0) && (atom->max > 0)) { 3410 /* another spot to match when minOccurs is 0 */ 3411 exec->transcount = 1; 3412 len = 0; 3413 ret = 1; 3414 } 3415 if (ret == 1) { 3416 if ((trans->nd == 1) || 3417 ((trans->count >= 0) && (deter == 0) && 3418 (exec->state->nbTrans > exec->transno + 1))) { 3419 #ifdef DEBUG_REGEXP_EXEC 3420 if (trans->nd == 1) 3421 printf("Saving on nd transition atom %d for %c at %d\n", 3422 trans->atom->no, codepoint, exec->index); 3423 else 3424 printf("Saving on counted transition count %d for %c at %d\n", 3425 trans->count, codepoint, exec->index); 3426 #endif 3427 xmlFARegExecSave(exec); 3428 } 3429 if (trans->counter >= 0) { 3430 xmlRegCounterPtr counter; 3431 3432 /* make sure we don't go over the counter maximum value */ 3433 if ((exec->counts == NULL) || 3434 (exec->comp == NULL) || 3435 (exec->comp->counters == NULL)) { 3436 exec->status = -1; 3437 goto error; 3438 } 3439 counter = &exec->comp->counters[trans->counter]; 3440 if (exec->counts[trans->counter] >= counter->max) 3441 continue; /* for loop on transitions */ 3442 #ifdef DEBUG_REGEXP_EXEC 3443 printf("Increasing count %d\n", trans->counter); 3444 #endif 3445 exec->counts[trans->counter]++; 3446 } 3447 if ((trans->count >= 0) && 3448 (trans->count < REGEXP_ALL_COUNTER)) { 3449 if (exec->counts == NULL) { 3450 exec->status = -1; 3451 goto error; 3452 } 3453 #ifdef DEBUG_REGEXP_EXEC 3454 printf("resetting count %d on transition\n", 3455 trans->count); 3456 #endif 3457 exec->counts[trans->count] = 0; 3458 } 3459 #ifdef DEBUG_REGEXP_EXEC 3460 printf("entering state %d\n", trans->to); 3461 #endif 3462 exec->state = comp->states[trans->to]; 3463 exec->transno = 0; 3464 if (trans->atom != NULL) { 3465 exec->index += len; 3466 } 3467 goto progress; 3468 } else if (ret < 0) { 3469 exec->status = -4; 3470 break; 3471 } 3472 } 3473 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3474 rollback: 3475 /* 3476 * Failed to find a way out 3477 */ 3478 exec->determinist = 0; 3479 #ifdef DEBUG_REGEXP_EXEC 3480 printf("rollback from state %d on %d:%c\n", exec->state->no, 3481 codepoint,codepoint); 3482 #endif 3483 xmlFARegExecRollBack(exec); 3484 } 3485 progress: 3486 continue; 3487 } 3488 error: 3489 if (exec->rollbacks != NULL) { 3490 if (exec->counts != NULL) { 3491 int i; 3492 3493 for (i = 0;i < exec->maxRollbacks;i++) 3494 if (exec->rollbacks[i].counts != NULL) 3495 xmlFree(exec->rollbacks[i].counts); 3496 } 3497 xmlFree(exec->rollbacks); 3498 } 3499 if (exec->state == NULL) 3500 return(-1); 3501 if (exec->counts != NULL) 3502 xmlFree(exec->counts); 3503 if (exec->status == 0) 3504 return(1); 3505 if (exec->status == -1) { 3506 if (exec->nbPush > MAX_PUSH) 3507 return(-1); 3508 return(0); 3509 } 3510 return(exec->status); 3511 } 3512 3513 /************************************************************************ 3514 * * 3515 * Progressive interface to the verifier one atom at a time * 3516 * * 3517 ************************************************************************/ 3518 #ifdef DEBUG_ERR 3519 static void testerr(xmlRegExecCtxtPtr exec); 3520 #endif 3521 3522 /** 3523 * xmlRegNewExecCtxt: 3524 * @comp: a precompiled regular expression 3525 * @callback: a callback function used for handling progresses in the 3526 * automata matching phase 3527 * @data: the context data associated to the callback in this context 3528 * 3529 * Build a context used for progressive evaluation of a regexp. 3530 * 3531 * Returns the new context 3532 */ 3533 xmlRegExecCtxtPtr 3534 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 3535 xmlRegExecCtxtPtr exec; 3536 3537 if (comp == NULL) 3538 return(NULL); 3539 if ((comp->compact == NULL) && (comp->states == NULL)) 3540 return(NULL); 3541 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 3542 if (exec == NULL) { 3543 xmlRegexpErrMemory(NULL, "creating execution context"); 3544 return(NULL); 3545 } 3546 memset(exec, 0, sizeof(xmlRegExecCtxt)); 3547 exec->inputString = NULL; 3548 exec->index = 0; 3549 exec->determinist = 1; 3550 exec->maxRollbacks = 0; 3551 exec->nbRollbacks = 0; 3552 exec->rollbacks = NULL; 3553 exec->status = 0; 3554 exec->comp = comp; 3555 if (comp->compact == NULL) 3556 exec->state = comp->states[0]; 3557 exec->transno = 0; 3558 exec->transcount = 0; 3559 exec->callback = callback; 3560 exec->data = data; 3561 if (comp->nbCounters > 0) { 3562 /* 3563 * For error handling, exec->counts is allocated twice the size 3564 * the second half is used to store the data in case of rollback 3565 */ 3566 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int) 3567 * 2); 3568 if (exec->counts == NULL) { 3569 xmlRegexpErrMemory(NULL, "creating execution context"); 3570 xmlFree(exec); 3571 return(NULL); 3572 } 3573 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2); 3574 exec->errCounts = &exec->counts[comp->nbCounters]; 3575 } else { 3576 exec->counts = NULL; 3577 exec->errCounts = NULL; 3578 } 3579 exec->inputStackMax = 0; 3580 exec->inputStackNr = 0; 3581 exec->inputStack = NULL; 3582 exec->errStateNo = -1; 3583 exec->errString = NULL; 3584 exec->nbPush = 0; 3585 return(exec); 3586 } 3587 3588 /** 3589 * xmlRegFreeExecCtxt: 3590 * @exec: a regular expression evaulation context 3591 * 3592 * Free the structures associated to a regular expression evaulation context. 3593 */ 3594 void 3595 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 3596 if (exec == NULL) 3597 return; 3598 3599 if (exec->rollbacks != NULL) { 3600 if (exec->counts != NULL) { 3601 int i; 3602 3603 for (i = 0;i < exec->maxRollbacks;i++) 3604 if (exec->rollbacks[i].counts != NULL) 3605 xmlFree(exec->rollbacks[i].counts); 3606 } 3607 xmlFree(exec->rollbacks); 3608 } 3609 if (exec->counts != NULL) 3610 xmlFree(exec->counts); 3611 if (exec->inputStack != NULL) { 3612 int i; 3613 3614 for (i = 0;i < exec->inputStackNr;i++) { 3615 if (exec->inputStack[i].value != NULL) 3616 xmlFree(exec->inputStack[i].value); 3617 } 3618 xmlFree(exec->inputStack); 3619 } 3620 if (exec->errString != NULL) 3621 xmlFree(exec->errString); 3622 xmlFree(exec); 3623 } 3624 3625 static void 3626 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3627 void *data) { 3628 #ifdef DEBUG_PUSH 3629 printf("saving value: %d:%s\n", exec->inputStackNr, value); 3630 #endif 3631 if (exec->inputStackMax == 0) { 3632 exec->inputStackMax = 4; 3633 exec->inputStack = (xmlRegInputTokenPtr) 3634 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 3635 if (exec->inputStack == NULL) { 3636 xmlRegexpErrMemory(NULL, "pushing input string"); 3637 exec->inputStackMax = 0; 3638 return; 3639 } 3640 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 3641 xmlRegInputTokenPtr tmp; 3642 3643 exec->inputStackMax *= 2; 3644 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 3645 exec->inputStackMax * sizeof(xmlRegInputToken)); 3646 if (tmp == NULL) { 3647 xmlRegexpErrMemory(NULL, "pushing input string"); 3648 exec->inputStackMax /= 2; 3649 return; 3650 } 3651 exec->inputStack = tmp; 3652 } 3653 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 3654 exec->inputStack[exec->inputStackNr].data = data; 3655 exec->inputStackNr++; 3656 exec->inputStack[exec->inputStackNr].value = NULL; 3657 exec->inputStack[exec->inputStackNr].data = NULL; 3658 } 3659 3660 /** 3661 * xmlRegStrEqualWildcard: 3662 * @expStr: the string to be evaluated 3663 * @valStr: the validation string 3664 * 3665 * Checks if both strings are equal or have the same content. "*" 3666 * can be used as a wildcard in @valStr; "|" is used as a seperator of 3667 * substrings in both @expStr and @valStr. 3668 * 3669 * Returns 1 if the comparison is satisfied and the number of substrings 3670 * is equal, 0 otherwise. 3671 */ 3672 3673 static int 3674 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { 3675 if (expStr == valStr) return(1); 3676 if (expStr == NULL) return(0); 3677 if (valStr == NULL) return(0); 3678 do { 3679 /* 3680 * Eval if we have a wildcard for the current item. 3681 */ 3682 if (*expStr != *valStr) { 3683 /* if one of them starts with a wildcard make valStr be it */ 3684 if (*valStr == '*') { 3685 const xmlChar *tmp; 3686 3687 tmp = valStr; 3688 valStr = expStr; 3689 expStr = tmp; 3690 } 3691 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { 3692 do { 3693 if (*valStr == XML_REG_STRING_SEPARATOR) 3694 break; 3695 valStr++; 3696 } while (*valStr != 0); 3697 continue; 3698 } else 3699 return(0); 3700 } 3701 expStr++; 3702 valStr++; 3703 } while (*valStr != 0); 3704 if (*expStr != 0) 3705 return (0); 3706 else 3707 return (1); 3708 } 3709 3710 /** 3711 * xmlRegCompactPushString: 3712 * @exec: a regexp execution context 3713 * @comp: the precompiled exec with a compact table 3714 * @value: a string token input 3715 * @data: data associated to the token to reuse in callbacks 3716 * 3717 * Push one input token in the execution context 3718 * 3719 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3720 * a negative value in case of error. 3721 */ 3722 static int 3723 xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 3724 xmlRegexpPtr comp, 3725 const xmlChar *value, 3726 void *data) { 3727 int state = exec->index; 3728 int i, target; 3729 3730 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 3731 return(-1); 3732 3733 if (value == NULL) { 3734 /* 3735 * are we at a final state ? 3736 */ 3737 if (comp->compact[state * (comp->nbstrings + 1)] == 3738 XML_REGEXP_FINAL_STATE) 3739 return(1); 3740 return(0); 3741 } 3742 3743 #ifdef DEBUG_PUSH 3744 printf("value pushed: %s\n", value); 3745 #endif 3746 3747 /* 3748 * Examine all outside transitions from current state 3749 */ 3750 for (i = 0;i < comp->nbstrings;i++) { 3751 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3752 if ((target > 0) && (target <= comp->nbstates)) { 3753 target--; /* to avoid 0 */ 3754 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { 3755 exec->index = target; 3756 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 3757 exec->callback(exec->data, value, 3758 comp->transdata[state * comp->nbstrings + i], data); 3759 } 3760 #ifdef DEBUG_PUSH 3761 printf("entering state %d\n", target); 3762 #endif 3763 if (comp->compact[target * (comp->nbstrings + 1)] == 3764 XML_REGEXP_SINK_STATE) 3765 goto error; 3766 3767 if (comp->compact[target * (comp->nbstrings + 1)] == 3768 XML_REGEXP_FINAL_STATE) 3769 return(1); 3770 return(0); 3771 } 3772 } 3773 } 3774 /* 3775 * Failed to find an exit transition out from current state for the 3776 * current token 3777 */ 3778 #ifdef DEBUG_PUSH 3779 printf("failed to find a transition for %s on state %d\n", value, state); 3780 #endif 3781 error: 3782 if (exec->errString != NULL) 3783 xmlFree(exec->errString); 3784 exec->errString = xmlStrdup(value); 3785 exec->errStateNo = state; 3786 exec->status = -1; 3787 #ifdef DEBUG_ERR 3788 testerr(exec); 3789 #endif 3790 return(-1); 3791 } 3792 3793 /** 3794 * xmlRegExecPushStringInternal: 3795 * @exec: a regexp execution context or NULL to indicate the end 3796 * @value: a string token input 3797 * @data: data associated to the token to reuse in callbacks 3798 * @compound: value was assembled from 2 strings 3799 * 3800 * Push one input token in the execution context 3801 * 3802 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3803 * a negative value in case of error. 3804 */ 3805 static int 3806 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, 3807 void *data, int compound) { 3808 xmlRegTransPtr trans; 3809 xmlRegAtomPtr atom; 3810 int ret; 3811 int final = 0; 3812 int progress = 1; 3813 3814 if (exec == NULL) 3815 return(-1); 3816 if (exec->comp == NULL) 3817 return(-1); 3818 if (exec->status != 0) 3819 return(exec->status); 3820 3821 if (exec->comp->compact != NULL) 3822 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 3823 3824 if (value == NULL) { 3825 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3826 return(1); 3827 final = 1; 3828 } 3829 3830 #ifdef DEBUG_PUSH 3831 printf("value pushed: %s\n", value); 3832 #endif 3833 /* 3834 * If we have an active rollback stack push the new value there 3835 * and get back to where we were left 3836 */ 3837 if ((value != NULL) && (exec->inputStackNr > 0)) { 3838 xmlFARegExecSaveInputString(exec, value, data); 3839 value = exec->inputStack[exec->index].value; 3840 data = exec->inputStack[exec->index].data; 3841 #ifdef DEBUG_PUSH 3842 printf("value loaded: %s\n", value); 3843 #endif 3844 } 3845 3846 while ((exec->status == 0) && 3847 ((value != NULL) || 3848 ((final == 1) && 3849 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3850 3851 /* 3852 * End of input on non-terminal state, rollback, however we may 3853 * still have epsilon like transition for counted transitions 3854 * on counters, in that case don't break too early. 3855 */ 3856 if ((value == NULL) && (exec->counts == NULL)) 3857 goto rollback; 3858 3859 exec->transcount = 0; 3860 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3861 trans = &exec->state->trans[exec->transno]; 3862 if (trans->to < 0) 3863 continue; 3864 atom = trans->atom; 3865 ret = 0; 3866 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3867 int i; 3868 int count; 3869 xmlRegTransPtr t; 3870 xmlRegCounterPtr counter; 3871 3872 ret = 0; 3873 3874 #ifdef DEBUG_PUSH 3875 printf("testing all lax %d\n", trans->count); 3876 #endif 3877 /* 3878 * Check all counted transitions from the current state 3879 */ 3880 if ((value == NULL) && (final)) { 3881 ret = 1; 3882 } else if (value != NULL) { 3883 for (i = 0;i < exec->state->nbTrans;i++) { 3884 t = &exec->state->trans[i]; 3885 if ((t->counter < 0) || (t == trans)) 3886 continue; 3887 counter = &exec->comp->counters[t->counter]; 3888 count = exec->counts[t->counter]; 3889 if ((count < counter->max) && 3890 (t->atom != NULL) && 3891 (xmlStrEqual(value, t->atom->valuep))) { 3892 ret = 0; 3893 break; 3894 } 3895 if ((count >= counter->min) && 3896 (count < counter->max) && 3897 (t->atom != NULL) && 3898 (xmlStrEqual(value, t->atom->valuep))) { 3899 ret = 1; 3900 break; 3901 } 3902 } 3903 } 3904 } else if (trans->count == REGEXP_ALL_COUNTER) { 3905 int i; 3906 int count; 3907 xmlRegTransPtr t; 3908 xmlRegCounterPtr counter; 3909 3910 ret = 1; 3911 3912 #ifdef DEBUG_PUSH 3913 printf("testing all %d\n", trans->count); 3914 #endif 3915 /* 3916 * Check all counted transitions from the current state 3917 */ 3918 for (i = 0;i < exec->state->nbTrans;i++) { 3919 t = &exec->state->trans[i]; 3920 if ((t->counter < 0) || (t == trans)) 3921 continue; 3922 counter = &exec->comp->counters[t->counter]; 3923 count = exec->counts[t->counter]; 3924 if ((count < counter->min) || (count > counter->max)) { 3925 ret = 0; 3926 break; 3927 } 3928 } 3929 } else if (trans->count >= 0) { 3930 int count; 3931 xmlRegCounterPtr counter; 3932 3933 /* 3934 * A counted transition. 3935 */ 3936 3937 count = exec->counts[trans->count]; 3938 counter = &exec->comp->counters[trans->count]; 3939 #ifdef DEBUG_PUSH 3940 printf("testing count %d: val %d, min %d, max %d\n", 3941 trans->count, count, counter->min, counter->max); 3942 #endif 3943 ret = ((count >= counter->min) && (count <= counter->max)); 3944 } else if (atom == NULL) { 3945 fprintf(stderr, "epsilon transition left at runtime\n"); 3946 exec->status = -2; 3947 break; 3948 } else if (value != NULL) { 3949 ret = xmlRegStrEqualWildcard(atom->valuep, value); 3950 if (atom->neg) { 3951 ret = !ret; 3952 if (!compound) 3953 ret = 0; 3954 } 3955 if ((ret == 1) && (trans->counter >= 0)) { 3956 xmlRegCounterPtr counter; 3957 int count; 3958 3959 count = exec->counts[trans->counter]; 3960 counter = &exec->comp->counters[trans->counter]; 3961 if (count >= counter->max) 3962 ret = 0; 3963 } 3964 3965 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3966 xmlRegStatePtr to = exec->comp->states[trans->to]; 3967 3968 /* 3969 * this is a multiple input sequence 3970 */ 3971 if (exec->state->nbTrans > exec->transno + 1) { 3972 if (exec->inputStackNr <= 0) { 3973 xmlFARegExecSaveInputString(exec, value, data); 3974 } 3975 xmlFARegExecSave(exec); 3976 } 3977 exec->transcount = 1; 3978 do { 3979 /* 3980 * Try to progress as much as possible on the input 3981 */ 3982 if (exec->transcount == atom->max) { 3983 break; 3984 } 3985 exec->index++; 3986 value = exec->inputStack[exec->index].value; 3987 data = exec->inputStack[exec->index].data; 3988 #ifdef DEBUG_PUSH 3989 printf("value loaded: %s\n", value); 3990 #endif 3991 3992 /* 3993 * End of input: stop here 3994 */ 3995 if (value == NULL) { 3996 exec->index --; 3997 break; 3998 } 3999 if (exec->transcount >= atom->min) { 4000 int transno = exec->transno; 4001 xmlRegStatePtr state = exec->state; 4002 4003 /* 4004 * The transition is acceptable save it 4005 */ 4006 exec->transno = -1; /* trick */ 4007 exec->state = to; 4008 if (exec->inputStackNr <= 0) { 4009 xmlFARegExecSaveInputString(exec, value, data); 4010 } 4011 xmlFARegExecSave(exec); 4012 exec->transno = transno; 4013 exec->state = state; 4014 } 4015 ret = xmlStrEqual(value, atom->valuep); 4016 exec->transcount++; 4017 } while (ret == 1); 4018 if (exec->transcount < atom->min) 4019 ret = 0; 4020 4021 /* 4022 * If the last check failed but one transition was found 4023 * possible, rollback 4024 */ 4025 if (ret < 0) 4026 ret = 0; 4027 if (ret == 0) { 4028 goto rollback; 4029 } 4030 } 4031 } 4032 if (ret == 1) { 4033 if ((exec->callback != NULL) && (atom != NULL) && 4034 (data != NULL)) { 4035 exec->callback(exec->data, atom->valuep, 4036 atom->data, data); 4037 } 4038 if (exec->state->nbTrans > exec->transno + 1) { 4039 if (exec->inputStackNr <= 0) { 4040 xmlFARegExecSaveInputString(exec, value, data); 4041 } 4042 xmlFARegExecSave(exec); 4043 } 4044 if (trans->counter >= 0) { 4045 #ifdef DEBUG_PUSH 4046 printf("Increasing count %d\n", trans->counter); 4047 #endif 4048 exec->counts[trans->counter]++; 4049 } 4050 if ((trans->count >= 0) && 4051 (trans->count < REGEXP_ALL_COUNTER)) { 4052 #ifdef DEBUG_REGEXP_EXEC 4053 printf("resetting count %d on transition\n", 4054 trans->count); 4055 #endif 4056 exec->counts[trans->count] = 0; 4057 } 4058 #ifdef DEBUG_PUSH 4059 printf("entering state %d\n", trans->to); 4060 #endif 4061 if ((exec->comp->states[trans->to] != NULL) && 4062 (exec->comp->states[trans->to]->type == 4063 XML_REGEXP_SINK_STATE)) { 4064 /* 4065 * entering a sink state, save the current state as error 4066 * state. 4067 */ 4068 if (exec->errString != NULL) 4069 xmlFree(exec->errString); 4070 exec->errString = xmlStrdup(value); 4071 exec->errState = exec->state; 4072 memcpy(exec->errCounts, exec->counts, 4073 exec->comp->nbCounters * sizeof(int)); 4074 } 4075 exec->state = exec->comp->states[trans->to]; 4076 exec->transno = 0; 4077 if (trans->atom != NULL) { 4078 if (exec->inputStack != NULL) { 4079 exec->index++; 4080 if (exec->index < exec->inputStackNr) { 4081 value = exec->inputStack[exec->index].value; 4082 data = exec->inputStack[exec->index].data; 4083 #ifdef DEBUG_PUSH 4084 printf("value loaded: %s\n", value); 4085 #endif 4086 } else { 4087 value = NULL; 4088 data = NULL; 4089 #ifdef DEBUG_PUSH 4090 printf("end of input\n"); 4091 #endif 4092 } 4093 } else { 4094 value = NULL; 4095 data = NULL; 4096 #ifdef DEBUG_PUSH 4097 printf("end of input\n"); 4098 #endif 4099 } 4100 } 4101 goto progress; 4102 } else if (ret < 0) { 4103 exec->status = -4; 4104 break; 4105 } 4106 } 4107 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4108 rollback: 4109 /* 4110 * if we didn't yet rollback on the current input 4111 * store the current state as the error state. 4112 */ 4113 if ((progress) && (exec->state != NULL) && 4114 (exec->state->type != XML_REGEXP_SINK_STATE)) { 4115 progress = 0; 4116 if (exec->errString != NULL) 4117 xmlFree(exec->errString); 4118 exec->errString = xmlStrdup(value); 4119 exec->errState = exec->state; 4120 if (exec->comp->nbCounters) 4121 memcpy(exec->errCounts, exec->counts, 4122 exec->comp->nbCounters * sizeof(int)); 4123 } 4124 4125 /* 4126 * Failed to find a way out 4127 */ 4128 exec->determinist = 0; 4129 xmlFARegExecRollBack(exec); 4130 if ((exec->inputStack != NULL ) && (exec->status == 0)) { 4131 value = exec->inputStack[exec->index].value; 4132 data = exec->inputStack[exec->index].data; 4133 #ifdef DEBUG_PUSH 4134 printf("value loaded: %s\n", value); 4135 #endif 4136 } 4137 } 4138 continue; 4139 progress: 4140 progress = 1; 4141 continue; 4142 } 4143 if (exec->status == 0) { 4144 return(exec->state->type == XML_REGEXP_FINAL_STATE); 4145 } 4146 #ifdef DEBUG_ERR 4147 if (exec->status < 0) { 4148 testerr(exec); 4149 } 4150 #endif 4151 return(exec->status); 4152 } 4153 4154 /** 4155 * xmlRegExecPushString: 4156 * @exec: a regexp execution context or NULL to indicate the end 4157 * @value: a string token input 4158 * @data: data associated to the token to reuse in callbacks 4159 * 4160 * Push one input token in the execution context 4161 * 4162 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4163 * a negative value in case of error. 4164 */ 4165 int 4166 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 4167 void *data) { 4168 return(xmlRegExecPushStringInternal(exec, value, data, 0)); 4169 } 4170 4171 /** 4172 * xmlRegExecPushString2: 4173 * @exec: a regexp execution context or NULL to indicate the end 4174 * @value: the first string token input 4175 * @value2: the second string token input 4176 * @data: data associated to the token to reuse in callbacks 4177 * 4178 * Push one input token in the execution context 4179 * 4180 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 4181 * a negative value in case of error. 4182 */ 4183 int 4184 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 4185 const xmlChar *value2, void *data) { 4186 xmlChar buf[150]; 4187 int lenn, lenp, ret; 4188 xmlChar *str; 4189 4190 if (exec == NULL) 4191 return(-1); 4192 if (exec->comp == NULL) 4193 return(-1); 4194 if (exec->status != 0) 4195 return(exec->status); 4196 4197 if (value2 == NULL) 4198 return(xmlRegExecPushString(exec, value, data)); 4199 4200 lenn = strlen((char *) value2); 4201 lenp = strlen((char *) value); 4202 4203 if (150 < lenn + lenp + 2) { 4204 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 4205 if (str == NULL) { 4206 exec->status = -1; 4207 return(-1); 4208 } 4209 } else { 4210 str = buf; 4211 } 4212 memcpy(&str[0], value, lenp); 4213 str[lenp] = XML_REG_STRING_SEPARATOR; 4214 memcpy(&str[lenp + 1], value2, lenn); 4215 str[lenn + lenp + 1] = 0; 4216 4217 if (exec->comp->compact != NULL) 4218 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 4219 else 4220 ret = xmlRegExecPushStringInternal(exec, str, data, 1); 4221 4222 if (str != buf) 4223 xmlFree(str); 4224 return(ret); 4225 } 4226 4227 /** 4228 * xmlRegExecGetValues: 4229 * @exec: a regexp execution context 4230 * @err: error extraction or normal one 4231 * @nbval: pointer to the number of accepted values IN/OUT 4232 * @nbneg: return number of negative transitions 4233 * @values: pointer to the array of acceptable values 4234 * @terminal: return value if this was a terminal state 4235 * 4236 * Extract informations from the regexp execution, internal routine to 4237 * implement xmlRegExecNextValues() and xmlRegExecErrInfo() 4238 * 4239 * Returns: 0 in case of success or -1 in case of error. 4240 */ 4241 static int 4242 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, 4243 int *nbval, int *nbneg, 4244 xmlChar **values, int *terminal) { 4245 int maxval; 4246 int nb = 0; 4247 4248 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || 4249 (values == NULL) || (*nbval <= 0)) 4250 return(-1); 4251 4252 maxval = *nbval; 4253 *nbval = 0; 4254 *nbneg = 0; 4255 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) { 4256 xmlRegexpPtr comp; 4257 int target, i, state; 4258 4259 comp = exec->comp; 4260 4261 if (err) { 4262 if (exec->errStateNo == -1) return(-1); 4263 state = exec->errStateNo; 4264 } else { 4265 state = exec->index; 4266 } 4267 if (terminal != NULL) { 4268 if (comp->compact[state * (comp->nbstrings + 1)] == 4269 XML_REGEXP_FINAL_STATE) 4270 *terminal = 1; 4271 else 4272 *terminal = 0; 4273 } 4274 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4275 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4276 if ((target > 0) && (target <= comp->nbstates) && 4277 (comp->compact[(target - 1) * (comp->nbstrings + 1)] != 4278 XML_REGEXP_SINK_STATE)) { 4279 values[nb++] = comp->stringMap[i]; 4280 (*nbval)++; 4281 } 4282 } 4283 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4284 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4285 if ((target > 0) && (target <= comp->nbstates) && 4286 (comp->compact[(target - 1) * (comp->nbstrings + 1)] == 4287 XML_REGEXP_SINK_STATE)) { 4288 values[nb++] = comp->stringMap[i]; 4289 (*nbneg)++; 4290 } 4291 } 4292 } else { 4293 int transno; 4294 xmlRegTransPtr trans; 4295 xmlRegAtomPtr atom; 4296 xmlRegStatePtr state; 4297 4298 if (terminal != NULL) { 4299 if (exec->state->type == XML_REGEXP_FINAL_STATE) 4300 *terminal = 1; 4301 else 4302 *terminal = 0; 4303 } 4304 4305 if (err) { 4306 if (exec->errState == NULL) return(-1); 4307 state = exec->errState; 4308 } else { 4309 if (exec->state == NULL) return(-1); 4310 state = exec->state; 4311 } 4312 for (transno = 0; 4313 (transno < state->nbTrans) && (nb < maxval); 4314 transno++) { 4315 trans = &state->trans[transno]; 4316 if (trans->to < 0) 4317 continue; 4318 atom = trans->atom; 4319 if ((atom == NULL) || (atom->valuep == NULL)) 4320 continue; 4321 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4322 /* this should not be reached but ... */ 4323 TODO; 4324 } else if (trans->count == REGEXP_ALL_COUNTER) { 4325 /* this should not be reached but ... */ 4326 TODO; 4327 } else if (trans->counter >= 0) { 4328 xmlRegCounterPtr counter = NULL; 4329 int count; 4330 4331 if (err) 4332 count = exec->errCounts[trans->counter]; 4333 else 4334 count = exec->counts[trans->counter]; 4335 if (exec->comp != NULL) 4336 counter = &exec->comp->counters[trans->counter]; 4337 if ((counter == NULL) || (count < counter->max)) { 4338 if (atom->neg) 4339 values[nb++] = (xmlChar *) atom->valuep2; 4340 else 4341 values[nb++] = (xmlChar *) atom->valuep; 4342 (*nbval)++; 4343 } 4344 } else { 4345 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) && 4346 (exec->comp->states[trans->to]->type != 4347 XML_REGEXP_SINK_STATE)) { 4348 if (atom->neg) 4349 values[nb++] = (xmlChar *) atom->valuep2; 4350 else 4351 values[nb++] = (xmlChar *) atom->valuep; 4352 (*nbval)++; 4353 } 4354 } 4355 } 4356 for (transno = 0; 4357 (transno < state->nbTrans) && (nb < maxval); 4358 transno++) { 4359 trans = &state->trans[transno]; 4360 if (trans->to < 0) 4361 continue; 4362 atom = trans->atom; 4363 if ((atom == NULL) || (atom->valuep == NULL)) 4364 continue; 4365 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4366 continue; 4367 } else if (trans->count == REGEXP_ALL_COUNTER) { 4368 continue; 4369 } else if (trans->counter >= 0) { 4370 continue; 4371 } else { 4372 if ((exec->comp->states[trans->to] != NULL) && 4373 (exec->comp->states[trans->to]->type == 4374 XML_REGEXP_SINK_STATE)) { 4375 if (atom->neg) 4376 values[nb++] = (xmlChar *) atom->valuep2; 4377 else 4378 values[nb++] = (xmlChar *) atom->valuep; 4379 (*nbneg)++; 4380 } 4381 } 4382 } 4383 } 4384 return(0); 4385 } 4386 4387 /** 4388 * xmlRegExecNextValues: 4389 * @exec: a regexp execution context 4390 * @nbval: pointer to the number of accepted values IN/OUT 4391 * @nbneg: return number of negative transitions 4392 * @values: pointer to the array of acceptable values 4393 * @terminal: return value if this was a terminal state 4394 * 4395 * Extract informations from the regexp execution, 4396 * the parameter @values must point to an array of @nbval string pointers 4397 * on return nbval will contain the number of possible strings in that 4398 * state and the @values array will be updated with them. The string values 4399 * returned will be freed with the @exec context and don't need to be 4400 * deallocated. 4401 * 4402 * Returns: 0 in case of success or -1 in case of error. 4403 */ 4404 int 4405 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg, 4406 xmlChar **values, int *terminal) { 4407 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal)); 4408 } 4409 4410 /** 4411 * xmlRegExecErrInfo: 4412 * @exec: a regexp execution context generating an error 4413 * @string: return value for the error string 4414 * @nbval: pointer to the number of accepted values IN/OUT 4415 * @nbneg: return number of negative transitions 4416 * @values: pointer to the array of acceptable values 4417 * @terminal: return value if this was a terminal state 4418 * 4419 * Extract error informations from the regexp execution, the parameter 4420 * @string will be updated with the value pushed and not accepted, 4421 * the parameter @values must point to an array of @nbval string pointers 4422 * on return nbval will contain the number of possible strings in that 4423 * state and the @values array will be updated with them. The string values 4424 * returned will be freed with the @exec context and don't need to be 4425 * deallocated. 4426 * 4427 * Returns: 0 in case of success or -1 in case of error. 4428 */ 4429 int 4430 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string, 4431 int *nbval, int *nbneg, xmlChar **values, int *terminal) { 4432 if (exec == NULL) 4433 return(-1); 4434 if (string != NULL) { 4435 if (exec->status != 0) 4436 *string = exec->errString; 4437 else 4438 *string = NULL; 4439 } 4440 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal)); 4441 } 4442 4443 #ifdef DEBUG_ERR 4444 static void testerr(xmlRegExecCtxtPtr exec) { 4445 const xmlChar *string; 4446 xmlChar *values[5]; 4447 int nb = 5; 4448 int nbneg; 4449 int terminal; 4450 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal); 4451 } 4452 #endif 4453 4454 #if 0 4455 static int 4456 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 4457 xmlRegTransPtr trans; 4458 xmlRegAtomPtr atom; 4459 int ret; 4460 int codepoint, len; 4461 4462 if (exec == NULL) 4463 return(-1); 4464 if (exec->status != 0) 4465 return(exec->status); 4466 4467 while ((exec->status == 0) && 4468 ((exec->inputString[exec->index] != 0) || 4469 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 4470 4471 /* 4472 * End of input on non-terminal state, rollback, however we may 4473 * still have epsilon like transition for counted transitions 4474 * on counters, in that case don't break too early. 4475 */ 4476 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 4477 goto rollback; 4478 4479 exec->transcount = 0; 4480 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 4481 trans = &exec->state->trans[exec->transno]; 4482 if (trans->to < 0) 4483 continue; 4484 atom = trans->atom; 4485 ret = 0; 4486 if (trans->count >= 0) { 4487 int count; 4488 xmlRegCounterPtr counter; 4489 4490 /* 4491 * A counted transition. 4492 */ 4493 4494 count = exec->counts[trans->count]; 4495 counter = &exec->comp->counters[trans->count]; 4496 #ifdef DEBUG_REGEXP_EXEC 4497 printf("testing count %d: val %d, min %d, max %d\n", 4498 trans->count, count, counter->min, counter->max); 4499 #endif 4500 ret = ((count >= counter->min) && (count <= counter->max)); 4501 } else if (atom == NULL) { 4502 fprintf(stderr, "epsilon transition left at runtime\n"); 4503 exec->status = -2; 4504 break; 4505 } else if (exec->inputString[exec->index] != 0) { 4506 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 4507 ret = xmlRegCheckCharacter(atom, codepoint); 4508 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 4509 xmlRegStatePtr to = exec->comp->states[trans->to]; 4510 4511 /* 4512 * this is a multiple input sequence 4513 */ 4514 if (exec->state->nbTrans > exec->transno + 1) { 4515 xmlFARegExecSave(exec); 4516 } 4517 exec->transcount = 1; 4518 do { 4519 /* 4520 * Try to progress as much as possible on the input 4521 */ 4522 if (exec->transcount == atom->max) { 4523 break; 4524 } 4525 exec->index += len; 4526 /* 4527 * End of input: stop here 4528 */ 4529 if (exec->inputString[exec->index] == 0) { 4530 exec->index -= len; 4531 break; 4532 } 4533 if (exec->transcount >= atom->min) { 4534 int transno = exec->transno; 4535 xmlRegStatePtr state = exec->state; 4536 4537 /* 4538 * The transition is acceptable save it 4539 */ 4540 exec->transno = -1; /* trick */ 4541 exec->state = to; 4542 xmlFARegExecSave(exec); 4543 exec->transno = transno; 4544 exec->state = state; 4545 } 4546 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 4547 len); 4548 ret = xmlRegCheckCharacter(atom, codepoint); 4549 exec->transcount++; 4550 } while (ret == 1); 4551 if (exec->transcount < atom->min) 4552 ret = 0; 4553 4554 /* 4555 * If the last check failed but one transition was found 4556 * possible, rollback 4557 */ 4558 if (ret < 0) 4559 ret = 0; 4560 if (ret == 0) { 4561 goto rollback; 4562 } 4563 } 4564 } 4565 if (ret == 1) { 4566 if (exec->state->nbTrans > exec->transno + 1) { 4567 xmlFARegExecSave(exec); 4568 } 4569 /* 4570 * restart count for expressions like this ((abc){2})* 4571 */ 4572 if (trans->count >= 0) { 4573 #ifdef DEBUG_REGEXP_EXEC 4574 printf("Reset count %d\n", trans->count); 4575 #endif 4576 exec->counts[trans->count] = 0; 4577 } 4578 if (trans->counter >= 0) { 4579 #ifdef DEBUG_REGEXP_EXEC 4580 printf("Increasing count %d\n", trans->counter); 4581 #endif 4582 exec->counts[trans->counter]++; 4583 } 4584 #ifdef DEBUG_REGEXP_EXEC 4585 printf("entering state %d\n", trans->to); 4586 #endif 4587 exec->state = exec->comp->states[trans->to]; 4588 exec->transno = 0; 4589 if (trans->atom != NULL) { 4590 exec->index += len; 4591 } 4592 goto progress; 4593 } else if (ret < 0) { 4594 exec->status = -4; 4595 break; 4596 } 4597 } 4598 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4599 rollback: 4600 /* 4601 * Failed to find a way out 4602 */ 4603 exec->determinist = 0; 4604 xmlFARegExecRollBack(exec); 4605 } 4606 progress: 4607 continue; 4608 } 4609 } 4610 #endif 4611 /************************************************************************ 4612 * * 4613 * Parser for the Schemas Datatype Regular Expressions * 4614 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 4615 * * 4616 ************************************************************************/ 4617 4618 /** 4619 * xmlFAIsChar: 4620 * @ctxt: a regexp parser context 4621 * 4622 * [10] Char ::= [^.\?*+()|#x5B#x5D] 4623 */ 4624 static int 4625 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 4626 int cur; 4627 int len; 4628 4629 cur = CUR_SCHAR(ctxt->cur, len); 4630 if ((cur == '.') || (cur == '\\') || (cur == '?') || 4631 (cur == '*') || (cur == '+') || (cur == '(') || 4632 (cur == ')') || (cur == '|') || (cur == 0x5B) || 4633 (cur == 0x5D) || (cur == 0)) 4634 return(-1); 4635 return(cur); 4636 } 4637 4638 /** 4639 * xmlFAParseCharProp: 4640 * @ctxt: a regexp parser context 4641 * 4642 * [27] charProp ::= IsCategory | IsBlock 4643 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 4644 * Separators | Symbols | Others 4645 * [29] Letters ::= 'L' [ultmo]? 4646 * [30] Marks ::= 'M' [nce]? 4647 * [31] Numbers ::= 'N' [dlo]? 4648 * [32] Punctuation ::= 'P' [cdseifo]? 4649 * [33] Separators ::= 'Z' [slp]? 4650 * [34] Symbols ::= 'S' [mcko]? 4651 * [35] Others ::= 'C' [cfon]? 4652 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 4653 */ 4654 static void 4655 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 4656 int cur; 4657 xmlRegAtomType type = (xmlRegAtomType) 0; 4658 xmlChar *blockName = NULL; 4659 4660 cur = CUR; 4661 if (cur == 'L') { 4662 NEXT; 4663 cur = CUR; 4664 if (cur == 'u') { 4665 NEXT; 4666 type = XML_REGEXP_LETTER_UPPERCASE; 4667 } else if (cur == 'l') { 4668 NEXT; 4669 type = XML_REGEXP_LETTER_LOWERCASE; 4670 } else if (cur == 't') { 4671 NEXT; 4672 type = XML_REGEXP_LETTER_TITLECASE; 4673 } else if (cur == 'm') { 4674 NEXT; 4675 type = XML_REGEXP_LETTER_MODIFIER; 4676 } else if (cur == 'o') { 4677 NEXT; 4678 type = XML_REGEXP_LETTER_OTHERS; 4679 } else { 4680 type = XML_REGEXP_LETTER; 4681 } 4682 } else if (cur == 'M') { 4683 NEXT; 4684 cur = CUR; 4685 if (cur == 'n') { 4686 NEXT; 4687 /* nonspacing */ 4688 type = XML_REGEXP_MARK_NONSPACING; 4689 } else if (cur == 'c') { 4690 NEXT; 4691 /* spacing combining */ 4692 type = XML_REGEXP_MARK_SPACECOMBINING; 4693 } else if (cur == 'e') { 4694 NEXT; 4695 /* enclosing */ 4696 type = XML_REGEXP_MARK_ENCLOSING; 4697 } else { 4698 /* all marks */ 4699 type = XML_REGEXP_MARK; 4700 } 4701 } else if (cur == 'N') { 4702 NEXT; 4703 cur = CUR; 4704 if (cur == 'd') { 4705 NEXT; 4706 /* digital */ 4707 type = XML_REGEXP_NUMBER_DECIMAL; 4708 } else if (cur == 'l') { 4709 NEXT; 4710 /* letter */ 4711 type = XML_REGEXP_NUMBER_LETTER; 4712 } else if (cur == 'o') { 4713 NEXT; 4714 /* other */ 4715 type = XML_REGEXP_NUMBER_OTHERS; 4716 } else { 4717 /* all numbers */ 4718 type = XML_REGEXP_NUMBER; 4719 } 4720 } else if (cur == 'P') { 4721 NEXT; 4722 cur = CUR; 4723 if (cur == 'c') { 4724 NEXT; 4725 /* connector */ 4726 type = XML_REGEXP_PUNCT_CONNECTOR; 4727 } else if (cur == 'd') { 4728 NEXT; 4729 /* dash */ 4730 type = XML_REGEXP_PUNCT_DASH; 4731 } else if (cur == 's') { 4732 NEXT; 4733 /* open */ 4734 type = XML_REGEXP_PUNCT_OPEN; 4735 } else if (cur == 'e') { 4736 NEXT; 4737 /* close */ 4738 type = XML_REGEXP_PUNCT_CLOSE; 4739 } else if (cur == 'i') { 4740 NEXT; 4741 /* initial quote */ 4742 type = XML_REGEXP_PUNCT_INITQUOTE; 4743 } else if (cur == 'f') { 4744 NEXT; 4745 /* final quote */ 4746 type = XML_REGEXP_PUNCT_FINQUOTE; 4747 } else if (cur == 'o') { 4748 NEXT; 4749 /* other */ 4750 type = XML_REGEXP_PUNCT_OTHERS; 4751 } else { 4752 /* all punctuation */ 4753 type = XML_REGEXP_PUNCT; 4754 } 4755 } else if (cur == 'Z') { 4756 NEXT; 4757 cur = CUR; 4758 if (cur == 's') { 4759 NEXT; 4760 /* space */ 4761 type = XML_REGEXP_SEPAR_SPACE; 4762 } else if (cur == 'l') { 4763 NEXT; 4764 /* line */ 4765 type = XML_REGEXP_SEPAR_LINE; 4766 } else if (cur == 'p') { 4767 NEXT; 4768 /* paragraph */ 4769 type = XML_REGEXP_SEPAR_PARA; 4770 } else { 4771 /* all separators */ 4772 type = XML_REGEXP_SEPAR; 4773 } 4774 } else if (cur == 'S') { 4775 NEXT; 4776 cur = CUR; 4777 if (cur == 'm') { 4778 NEXT; 4779 type = XML_REGEXP_SYMBOL_MATH; 4780 /* math */ 4781 } else if (cur == 'c') { 4782 NEXT; 4783 type = XML_REGEXP_SYMBOL_CURRENCY; 4784 /* currency */ 4785 } else if (cur == 'k') { 4786 NEXT; 4787 type = XML_REGEXP_SYMBOL_MODIFIER; 4788 /* modifiers */ 4789 } else if (cur == 'o') { 4790 NEXT; 4791 type = XML_REGEXP_SYMBOL_OTHERS; 4792 /* other */ 4793 } else { 4794 /* all symbols */ 4795 type = XML_REGEXP_SYMBOL; 4796 } 4797 } else if (cur == 'C') { 4798 NEXT; 4799 cur = CUR; 4800 if (cur == 'c') { 4801 NEXT; 4802 /* control */ 4803 type = XML_REGEXP_OTHER_CONTROL; 4804 } else if (cur == 'f') { 4805 NEXT; 4806 /* format */ 4807 type = XML_REGEXP_OTHER_FORMAT; 4808 } else if (cur == 'o') { 4809 NEXT; 4810 /* private use */ 4811 type = XML_REGEXP_OTHER_PRIVATE; 4812 } else if (cur == 'n') { 4813 NEXT; 4814 /* not assigned */ 4815 type = XML_REGEXP_OTHER_NA; 4816 } else { 4817 /* all others */ 4818 type = XML_REGEXP_OTHER; 4819 } 4820 } else if (cur == 'I') { 4821 const xmlChar *start; 4822 NEXT; 4823 cur = CUR; 4824 if (cur != 's') { 4825 ERROR("IsXXXX expected"); 4826 return; 4827 } 4828 NEXT; 4829 start = ctxt->cur; 4830 cur = CUR; 4831 if (((cur >= 'a') && (cur <= 'z')) || 4832 ((cur >= 'A') && (cur <= 'Z')) || 4833 ((cur >= '0') && (cur <= '9')) || 4834 (cur == 0x2D)) { 4835 NEXT; 4836 cur = CUR; 4837 while (((cur >= 'a') && (cur <= 'z')) || 4838 ((cur >= 'A') && (cur <= 'Z')) || 4839 ((cur >= '0') && (cur <= '9')) || 4840 (cur == 0x2D)) { 4841 NEXT; 4842 cur = CUR; 4843 } 4844 } 4845 type = XML_REGEXP_BLOCK_NAME; 4846 blockName = xmlStrndup(start, ctxt->cur - start); 4847 } else { 4848 ERROR("Unknown char property"); 4849 return; 4850 } 4851 if (ctxt->atom == NULL) { 4852 ctxt->atom = xmlRegNewAtom(ctxt, type); 4853 if (ctxt->atom != NULL) 4854 ctxt->atom->valuep = blockName; 4855 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4856 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4857 type, 0, 0, blockName); 4858 } 4859 } 4860 4861 /** 4862 * xmlFAParseCharClassEsc: 4863 * @ctxt: a regexp parser context 4864 * 4865 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 4866 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 4867 * [25] catEsc ::= '\p{' charProp '}' 4868 * [26] complEsc ::= '\P{' charProp '}' 4869 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 4870 */ 4871 static void 4872 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 4873 int cur; 4874 4875 if (CUR == '.') { 4876 if (ctxt->atom == NULL) { 4877 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 4878 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4879 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4880 XML_REGEXP_ANYCHAR, 0, 0, NULL); 4881 } 4882 NEXT; 4883 return; 4884 } 4885 if (CUR != '\\') { 4886 ERROR("Escaped sequence: expecting \\"); 4887 return; 4888 } 4889 NEXT; 4890 cur = CUR; 4891 if (cur == 'p') { 4892 NEXT; 4893 if (CUR != '{') { 4894 ERROR("Expecting '{'"); 4895 return; 4896 } 4897 NEXT; 4898 xmlFAParseCharProp(ctxt); 4899 if (CUR != '}') { 4900 ERROR("Expecting '}'"); 4901 return; 4902 } 4903 NEXT; 4904 } else if (cur == 'P') { 4905 NEXT; 4906 if (CUR != '{') { 4907 ERROR("Expecting '{'"); 4908 return; 4909 } 4910 NEXT; 4911 xmlFAParseCharProp(ctxt); 4912 if (ctxt->atom != NULL) 4913 ctxt->atom->neg = 1; 4914 if (CUR != '}') { 4915 ERROR("Expecting '}'"); 4916 return; 4917 } 4918 NEXT; 4919 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 4920 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 4921 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 4922 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 4923 (cur == 0x5E)) { 4924 if (ctxt->atom == NULL) { 4925 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4926 if (ctxt->atom != NULL) { 4927 switch (cur) { 4928 case 'n': 4929 ctxt->atom->codepoint = '\n'; 4930 break; 4931 case 'r': 4932 ctxt->atom->codepoint = '\r'; 4933 break; 4934 case 't': 4935 ctxt->atom->codepoint = '\t'; 4936 break; 4937 default: 4938 ctxt->atom->codepoint = cur; 4939 } 4940 } 4941 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4942 switch (cur) { 4943 case 'n': 4944 cur = '\n'; 4945 break; 4946 case 'r': 4947 cur = '\r'; 4948 break; 4949 case 't': 4950 cur = '\t'; 4951 break; 4952 } 4953 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4954 XML_REGEXP_CHARVAL, cur, cur, NULL); 4955 } 4956 NEXT; 4957 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 4958 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 4959 (cur == 'w') || (cur == 'W')) { 4960 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 4961 4962 switch (cur) { 4963 case 's': 4964 type = XML_REGEXP_ANYSPACE; 4965 break; 4966 case 'S': 4967 type = XML_REGEXP_NOTSPACE; 4968 break; 4969 case 'i': 4970 type = XML_REGEXP_INITNAME; 4971 break; 4972 case 'I': 4973 type = XML_REGEXP_NOTINITNAME; 4974 break; 4975 case 'c': 4976 type = XML_REGEXP_NAMECHAR; 4977 break; 4978 case 'C': 4979 type = XML_REGEXP_NOTNAMECHAR; 4980 break; 4981 case 'd': 4982 type = XML_REGEXP_DECIMAL; 4983 break; 4984 case 'D': 4985 type = XML_REGEXP_NOTDECIMAL; 4986 break; 4987 case 'w': 4988 type = XML_REGEXP_REALCHAR; 4989 break; 4990 case 'W': 4991 type = XML_REGEXP_NOTREALCHAR; 4992 break; 4993 } 4994 NEXT; 4995 if (ctxt->atom == NULL) { 4996 ctxt->atom = xmlRegNewAtom(ctxt, type); 4997 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4998 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4999 type, 0, 0, NULL); 5000 } 5001 } else { 5002 ERROR("Wrong escape sequence, misuse of character '\\'"); 5003 } 5004 } 5005 5006 /** 5007 * xmlFAParseCharRange: 5008 * @ctxt: a regexp parser context 5009 * 5010 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 5011 * [18] seRange ::= charOrEsc '-' charOrEsc 5012 * [20] charOrEsc ::= XmlChar | SingleCharEsc 5013 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 5014 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 5015 */ 5016 static void 5017 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 5018 int cur, len; 5019 int start = -1; 5020 int end = -1; 5021 5022 if (CUR == '\0') { 5023 ERROR("Expecting ']'"); 5024 return; 5025 } 5026 5027 cur = CUR; 5028 if (cur == '\\') { 5029 NEXT; 5030 cur = CUR; 5031 switch (cur) { 5032 case 'n': start = 0xA; break; 5033 case 'r': start = 0xD; break; 5034 case 't': start = 0x9; break; 5035 case '\\': case '|': case '.': case '-': case '^': case '?': 5036 case '*': case '+': case '{': case '}': case '(': case ')': 5037 case '[': case ']': 5038 start = cur; break; 5039 default: 5040 ERROR("Invalid escape value"); 5041 return; 5042 } 5043 end = start; 5044 len = 1; 5045 } else if ((cur != 0x5B) && (cur != 0x5D)) { 5046 end = start = CUR_SCHAR(ctxt->cur, len); 5047 } else { 5048 ERROR("Expecting a char range"); 5049 return; 5050 } 5051 /* 5052 * Since we are "inside" a range, we can assume ctxt->cur is past 5053 * the start of ctxt->string, and PREV should be safe 5054 */ 5055 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) { 5056 NEXTL(len); 5057 return; 5058 } 5059 NEXTL(len); 5060 cur = CUR; 5061 if ((cur != '-') || (NXT(1) == ']')) { 5062 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5063 XML_REGEXP_CHARVAL, start, end, NULL); 5064 return; 5065 } 5066 NEXT; 5067 cur = CUR; 5068 if (cur == '\\') { 5069 NEXT; 5070 cur = CUR; 5071 switch (cur) { 5072 case 'n': end = 0xA; break; 5073 case 'r': end = 0xD; break; 5074 case 't': end = 0x9; break; 5075 case '\\': case '|': case '.': case '-': case '^': case '?': 5076 case '*': case '+': case '{': case '}': case '(': case ')': 5077 case '[': case ']': 5078 end = cur; break; 5079 default: 5080 ERROR("Invalid escape value"); 5081 return; 5082 } 5083 len = 1; 5084 } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) { 5085 end = CUR_SCHAR(ctxt->cur, len); 5086 } else { 5087 ERROR("Expecting the end of a char range"); 5088 return; 5089 } 5090 5091 /* TODO check that the values are acceptable character ranges for XML */ 5092 if (end < start) { 5093 ERROR("End of range is before start of range"); 5094 } else { 5095 NEXTL(len); 5096 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 5097 XML_REGEXP_CHARVAL, start, end, NULL); 5098 } 5099 return; 5100 } 5101 5102 /** 5103 * xmlFAParsePosCharGroup: 5104 * @ctxt: a regexp parser context 5105 * 5106 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 5107 */ 5108 static void 5109 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 5110 do { 5111 if (CUR == '\\') { 5112 xmlFAParseCharClassEsc(ctxt); 5113 } else { 5114 xmlFAParseCharRange(ctxt); 5115 } 5116 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 5117 (CUR != 0) && (ctxt->error == 0)); 5118 } 5119 5120 /** 5121 * xmlFAParseCharGroup: 5122 * @ctxt: a regexp parser context 5123 * 5124 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 5125 * [15] negCharGroup ::= '^' posCharGroup 5126 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 5127 * [12] charClassExpr ::= '[' charGroup ']' 5128 */ 5129 static void 5130 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 5131 int n = ctxt->neg; 5132 while ((CUR != ']') && (ctxt->error == 0)) { 5133 if (CUR == '^') { 5134 int neg = ctxt->neg; 5135 5136 NEXT; 5137 ctxt->neg = !ctxt->neg; 5138 xmlFAParsePosCharGroup(ctxt); 5139 ctxt->neg = neg; 5140 } else if ((CUR == '-') && (NXT(1) == '[')) { 5141 int neg = ctxt->neg; 5142 ctxt->neg = 2; 5143 NEXT; /* eat the '-' */ 5144 NEXT; /* eat the '[' */ 5145 xmlFAParseCharGroup(ctxt); 5146 if (CUR == ']') { 5147 NEXT; 5148 } else { 5149 ERROR("charClassExpr: ']' expected"); 5150 break; 5151 } 5152 ctxt->neg = neg; 5153 break; 5154 } else if (CUR != ']') { 5155 xmlFAParsePosCharGroup(ctxt); 5156 } 5157 } 5158 ctxt->neg = n; 5159 } 5160 5161 /** 5162 * xmlFAParseCharClass: 5163 * @ctxt: a regexp parser context 5164 * 5165 * [11] charClass ::= charClassEsc | charClassExpr 5166 * [12] charClassExpr ::= '[' charGroup ']' 5167 */ 5168 static void 5169 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 5170 if (CUR == '[') { 5171 NEXT; 5172 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 5173 if (ctxt->atom == NULL) 5174 return; 5175 xmlFAParseCharGroup(ctxt); 5176 if (CUR == ']') { 5177 NEXT; 5178 } else { 5179 ERROR("xmlFAParseCharClass: ']' expected"); 5180 } 5181 } else { 5182 xmlFAParseCharClassEsc(ctxt); 5183 } 5184 } 5185 5186 /** 5187 * xmlFAParseQuantExact: 5188 * @ctxt: a regexp parser context 5189 * 5190 * [8] QuantExact ::= [0-9]+ 5191 * 5192 * Returns 0 if success or -1 in case of error 5193 */ 5194 static int 5195 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 5196 int ret = 0; 5197 int ok = 0; 5198 int overflow = 0; 5199 5200 while ((CUR >= '0') && (CUR <= '9')) { 5201 if (ret > INT_MAX / 10) { 5202 overflow = 1; 5203 } else { 5204 int digit = CUR - '0'; 5205 5206 ret *= 10; 5207 if (ret > INT_MAX - digit) 5208 overflow = 1; 5209 else 5210 ret += digit; 5211 } 5212 ok = 1; 5213 NEXT; 5214 } 5215 if ((ok != 1) || (overflow == 1)) { 5216 return(-1); 5217 } 5218 return(ret); 5219 } 5220 5221 /** 5222 * xmlFAParseQuantifier: 5223 * @ctxt: a regexp parser context 5224 * 5225 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 5226 * [5] quantity ::= quantRange | quantMin | QuantExact 5227 * [6] quantRange ::= QuantExact ',' QuantExact 5228 * [7] quantMin ::= QuantExact ',' 5229 * [8] QuantExact ::= [0-9]+ 5230 */ 5231 static int 5232 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 5233 int cur; 5234 5235 cur = CUR; 5236 if ((cur == '?') || (cur == '*') || (cur == '+')) { 5237 if (ctxt->atom != NULL) { 5238 if (cur == '?') 5239 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 5240 else if (cur == '*') 5241 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 5242 else if (cur == '+') 5243 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 5244 } 5245 NEXT; 5246 return(1); 5247 } 5248 if (cur == '{') { 5249 int min = 0, max = 0; 5250 5251 NEXT; 5252 cur = xmlFAParseQuantExact(ctxt); 5253 if (cur >= 0) 5254 min = cur; 5255 else { 5256 ERROR("Improper quantifier"); 5257 } 5258 if (CUR == ',') { 5259 NEXT; 5260 if (CUR == '}') 5261 max = INT_MAX; 5262 else { 5263 cur = xmlFAParseQuantExact(ctxt); 5264 if (cur >= 0) 5265 max = cur; 5266 else { 5267 ERROR("Improper quantifier"); 5268 } 5269 } 5270 } 5271 if (CUR == '}') { 5272 NEXT; 5273 } else { 5274 ERROR("Unterminated quantifier"); 5275 } 5276 if (max == 0) 5277 max = min; 5278 if (ctxt->atom != NULL) { 5279 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 5280 ctxt->atom->min = min; 5281 ctxt->atom->max = max; 5282 } 5283 return(1); 5284 } 5285 return(0); 5286 } 5287 5288 /** 5289 * xmlFAParseAtom: 5290 * @ctxt: a regexp parser context 5291 * 5292 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 5293 */ 5294 static int 5295 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 5296 int codepoint, len; 5297 5298 codepoint = xmlFAIsChar(ctxt); 5299 if (codepoint > 0) { 5300 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 5301 if (ctxt->atom == NULL) 5302 return(-1); 5303 codepoint = CUR_SCHAR(ctxt->cur, len); 5304 ctxt->atom->codepoint = codepoint; 5305 NEXTL(len); 5306 return(1); 5307 } else if (CUR == '|') { 5308 return(0); 5309 } else if (CUR == 0) { 5310 return(0); 5311 } else if (CUR == ')') { 5312 return(0); 5313 } else if (CUR == '(') { 5314 xmlRegStatePtr start, oldend, start0; 5315 5316 NEXT; 5317 /* 5318 * this extra Epsilon transition is needed if we count with 0 allowed 5319 * unfortunately this can't be known at that point 5320 */ 5321 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5322 start0 = ctxt->state; 5323 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5324 start = ctxt->state; 5325 oldend = ctxt->end; 5326 ctxt->end = NULL; 5327 ctxt->atom = NULL; 5328 xmlFAParseRegExp(ctxt, 0); 5329 if (CUR == ')') { 5330 NEXT; 5331 } else { 5332 ERROR("xmlFAParseAtom: expecting ')'"); 5333 } 5334 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 5335 if (ctxt->atom == NULL) 5336 return(-1); 5337 ctxt->atom->start = start; 5338 ctxt->atom->start0 = start0; 5339 ctxt->atom->stop = ctxt->state; 5340 ctxt->end = oldend; 5341 return(1); 5342 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 5343 xmlFAParseCharClass(ctxt); 5344 return(1); 5345 } 5346 return(0); 5347 } 5348 5349 /** 5350 * xmlFAParsePiece: 5351 * @ctxt: a regexp parser context 5352 * 5353 * [3] piece ::= atom quantifier? 5354 */ 5355 static int 5356 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 5357 int ret; 5358 5359 ctxt->atom = NULL; 5360 ret = xmlFAParseAtom(ctxt); 5361 if (ret == 0) 5362 return(0); 5363 if (ctxt->atom == NULL) { 5364 ERROR("internal: no atom generated"); 5365 } 5366 xmlFAParseQuantifier(ctxt); 5367 return(1); 5368 } 5369 5370 /** 5371 * xmlFAParseBranch: 5372 * @ctxt: a regexp parser context 5373 * @to: optional target to the end of the branch 5374 * 5375 * @to is used to optimize by removing duplicate path in automata 5376 * in expressions like (a|b)(c|d) 5377 * 5378 * [2] branch ::= piece* 5379 */ 5380 static int 5381 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { 5382 xmlRegStatePtr previous; 5383 int ret; 5384 5385 previous = ctxt->state; 5386 ret = xmlFAParsePiece(ctxt); 5387 if (ret != 0) { 5388 if (xmlFAGenerateTransitions(ctxt, previous, 5389 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5390 return(-1); 5391 previous = ctxt->state; 5392 ctxt->atom = NULL; 5393 } 5394 while ((ret != 0) && (ctxt->error == 0)) { 5395 ret = xmlFAParsePiece(ctxt); 5396 if (ret != 0) { 5397 if (xmlFAGenerateTransitions(ctxt, previous, 5398 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5399 return(-1); 5400 previous = ctxt->state; 5401 ctxt->atom = NULL; 5402 } 5403 } 5404 return(0); 5405 } 5406 5407 /** 5408 * xmlFAParseRegExp: 5409 * @ctxt: a regexp parser context 5410 * @top: is this the top-level expression ? 5411 * 5412 * [1] regExp ::= branch ( '|' branch )* 5413 */ 5414 static void 5415 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 5416 xmlRegStatePtr start, end; 5417 5418 /* if not top start should have been generated by an epsilon trans */ 5419 start = ctxt->state; 5420 ctxt->end = NULL; 5421 xmlFAParseBranch(ctxt, NULL); 5422 if (top) { 5423 #ifdef DEBUG_REGEXP_GRAPH 5424 printf("State %d is final\n", ctxt->state->no); 5425 #endif 5426 ctxt->state->type = XML_REGEXP_FINAL_STATE; 5427 } 5428 if (CUR != '|') { 5429 ctxt->end = ctxt->state; 5430 return; 5431 } 5432 end = ctxt->state; 5433 while ((CUR == '|') && (ctxt->error == 0)) { 5434 NEXT; 5435 if (CUR == 0) { 5436 ERROR("expecting a branch after |") 5437 return; 5438 } 5439 ctxt->state = start; 5440 ctxt->end = NULL; 5441 xmlFAParseBranch(ctxt, end); 5442 } 5443 if (!top) { 5444 ctxt->state = end; 5445 ctxt->end = end; 5446 } 5447 } 5448 5449 /************************************************************************ 5450 * * 5451 * The basic API * 5452 * * 5453 ************************************************************************/ 5454 5455 /** 5456 * xmlRegexpPrint: 5457 * @output: the file for the output debug 5458 * @regexp: the compiled regexp 5459 * 5460 * Print the content of the compiled regular expression 5461 */ 5462 void 5463 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 5464 int i; 5465 5466 if (output == NULL) 5467 return; 5468 fprintf(output, " regexp: "); 5469 if (regexp == NULL) { 5470 fprintf(output, "NULL\n"); 5471 return; 5472 } 5473 fprintf(output, "'%s' ", regexp->string); 5474 fprintf(output, "\n"); 5475 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 5476 for (i = 0;i < regexp->nbAtoms; i++) { 5477 fprintf(output, " %02d ", i); 5478 xmlRegPrintAtom(output, regexp->atoms[i]); 5479 } 5480 fprintf(output, "%d states:", regexp->nbStates); 5481 fprintf(output, "\n"); 5482 for (i = 0;i < regexp->nbStates; i++) { 5483 xmlRegPrintState(output, regexp->states[i]); 5484 } 5485 fprintf(output, "%d counters:\n", regexp->nbCounters); 5486 for (i = 0;i < regexp->nbCounters; i++) { 5487 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 5488 regexp->counters[i].max); 5489 } 5490 } 5491 5492 /** 5493 * xmlRegexpCompile: 5494 * @regexp: a regular expression string 5495 * 5496 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 5497 * Appendix F and builds an automata suitable for testing strings against 5498 * that regular expression 5499 * 5500 * Returns the compiled expression or NULL in case of error 5501 */ 5502 xmlRegexpPtr 5503 xmlRegexpCompile(const xmlChar *regexp) { 5504 xmlRegexpPtr ret; 5505 xmlRegParserCtxtPtr ctxt; 5506 5507 ctxt = xmlRegNewParserCtxt(regexp); 5508 if (ctxt == NULL) 5509 return(NULL); 5510 5511 /* initialize the parser */ 5512 ctxt->end = NULL; 5513 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5514 xmlRegStatePush(ctxt, ctxt->start); 5515 5516 /* parse the expression building an automata */ 5517 xmlFAParseRegExp(ctxt, 1); 5518 if (CUR != 0) { 5519 ERROR("xmlFAParseRegExp: extra characters"); 5520 } 5521 if (ctxt->error != 0) { 5522 xmlRegFreeParserCtxt(ctxt); 5523 return(NULL); 5524 } 5525 ctxt->end = ctxt->state; 5526 ctxt->start->type = XML_REGEXP_START_STATE; 5527 ctxt->end->type = XML_REGEXP_FINAL_STATE; 5528 5529 /* remove the Epsilon except for counted transitions */ 5530 xmlFAEliminateEpsilonTransitions(ctxt); 5531 5532 5533 if (ctxt->error != 0) { 5534 xmlRegFreeParserCtxt(ctxt); 5535 return(NULL); 5536 } 5537 ret = xmlRegEpxFromParse(ctxt); 5538 xmlRegFreeParserCtxt(ctxt); 5539 return(ret); 5540 } 5541 5542 /** 5543 * xmlRegexpExec: 5544 * @comp: the compiled regular expression 5545 * @content: the value to check against the regular expression 5546 * 5547 * Check if the regular expression generates the value 5548 * 5549 * Returns 1 if it matches, 0 if not and a negative value in case of error 5550 */ 5551 int 5552 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 5553 if ((comp == NULL) || (content == NULL)) 5554 return(-1); 5555 return(xmlFARegExec(comp, content)); 5556 } 5557 5558 /** 5559 * xmlRegexpIsDeterminist: 5560 * @comp: the compiled regular expression 5561 * 5562 * Check if the regular expression is determinist 5563 * 5564 * Returns 1 if it yes, 0 if not and a negative value in case of error 5565 */ 5566 int 5567 xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 5568 xmlAutomataPtr am; 5569 int ret; 5570 5571 if (comp == NULL) 5572 return(-1); 5573 if (comp->determinist != -1) 5574 return(comp->determinist); 5575 5576 am = xmlNewAutomata(); 5577 if (am->states != NULL) { 5578 int i; 5579 5580 for (i = 0;i < am->nbStates;i++) 5581 xmlRegFreeState(am->states[i]); 5582 xmlFree(am->states); 5583 } 5584 am->nbAtoms = comp->nbAtoms; 5585 am->atoms = comp->atoms; 5586 am->nbStates = comp->nbStates; 5587 am->states = comp->states; 5588 am->determinist = -1; 5589 am->flags = comp->flags; 5590 ret = xmlFAComputesDeterminism(am); 5591 am->atoms = NULL; 5592 am->states = NULL; 5593 xmlFreeAutomata(am); 5594 comp->determinist = ret; 5595 return(ret); 5596 } 5597 5598 /** 5599 * xmlRegFreeRegexp: 5600 * @regexp: the regexp 5601 * 5602 * Free a regexp 5603 */ 5604 void 5605 xmlRegFreeRegexp(xmlRegexpPtr regexp) { 5606 int i; 5607 if (regexp == NULL) 5608 return; 5609 5610 if (regexp->string != NULL) 5611 xmlFree(regexp->string); 5612 if (regexp->states != NULL) { 5613 for (i = 0;i < regexp->nbStates;i++) 5614 xmlRegFreeState(regexp->states[i]); 5615 xmlFree(regexp->states); 5616 } 5617 if (regexp->atoms != NULL) { 5618 for (i = 0;i < regexp->nbAtoms;i++) 5619 xmlRegFreeAtom(regexp->atoms[i]); 5620 xmlFree(regexp->atoms); 5621 } 5622 if (regexp->counters != NULL) 5623 xmlFree(regexp->counters); 5624 if (regexp->compact != NULL) 5625 xmlFree(regexp->compact); 5626 if (regexp->transdata != NULL) 5627 xmlFree(regexp->transdata); 5628 if (regexp->stringMap != NULL) { 5629 for (i = 0; i < regexp->nbstrings;i++) 5630 xmlFree(regexp->stringMap[i]); 5631 xmlFree(regexp->stringMap); 5632 } 5633 5634 xmlFree(regexp); 5635 } 5636 5637 #ifdef LIBXML_AUTOMATA_ENABLED 5638 /************************************************************************ 5639 * * 5640 * The Automata interface * 5641 * * 5642 ************************************************************************/ 5643 5644 /** 5645 * xmlNewAutomata: 5646 * 5647 * Create a new automata 5648 * 5649 * Returns the new object or NULL in case of failure 5650 */ 5651 xmlAutomataPtr 5652 xmlNewAutomata(void) { 5653 xmlAutomataPtr ctxt; 5654 5655 ctxt = xmlRegNewParserCtxt(NULL); 5656 if (ctxt == NULL) 5657 return(NULL); 5658 5659 /* initialize the parser */ 5660 ctxt->end = NULL; 5661 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5662 if (ctxt->start == NULL) { 5663 xmlFreeAutomata(ctxt); 5664 return(NULL); 5665 } 5666 ctxt->start->type = XML_REGEXP_START_STATE; 5667 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 5668 xmlRegFreeState(ctxt->start); 5669 xmlFreeAutomata(ctxt); 5670 return(NULL); 5671 } 5672 ctxt->flags = 0; 5673 5674 return(ctxt); 5675 } 5676 5677 /** 5678 * xmlFreeAutomata: 5679 * @am: an automata 5680 * 5681 * Free an automata 5682 */ 5683 void 5684 xmlFreeAutomata(xmlAutomataPtr am) { 5685 if (am == NULL) 5686 return; 5687 xmlRegFreeParserCtxt(am); 5688 } 5689 5690 /** 5691 * xmlAutomataSetFlags: 5692 * @am: an automata 5693 * @flags: a set of internal flags 5694 * 5695 * Set some flags on the automata 5696 */ 5697 void 5698 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) { 5699 if (am == NULL) 5700 return; 5701 am->flags |= flags; 5702 } 5703 5704 /** 5705 * xmlAutomataGetInitState: 5706 * @am: an automata 5707 * 5708 * Initial state lookup 5709 * 5710 * Returns the initial state of the automata 5711 */ 5712 xmlAutomataStatePtr 5713 xmlAutomataGetInitState(xmlAutomataPtr am) { 5714 if (am == NULL) 5715 return(NULL); 5716 return(am->start); 5717 } 5718 5719 /** 5720 * xmlAutomataSetFinalState: 5721 * @am: an automata 5722 * @state: a state in this automata 5723 * 5724 * Makes that state a final state 5725 * 5726 * Returns 0 or -1 in case of error 5727 */ 5728 int 5729 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 5730 if ((am == NULL) || (state == NULL)) 5731 return(-1); 5732 state->type = XML_REGEXP_FINAL_STATE; 5733 return(0); 5734 } 5735 5736 /** 5737 * xmlAutomataNewTransition: 5738 * @am: an automata 5739 * @from: the starting point of the transition 5740 * @to: the target point of the transition or NULL 5741 * @token: the input string associated to that transition 5742 * @data: data passed to the callback function if the transition is activated 5743 * 5744 * If @to is NULL, this creates first a new target state in the automata 5745 * and then adds a transition from the @from state to the target state 5746 * activated by the value of @token 5747 * 5748 * Returns the target state or NULL in case of error 5749 */ 5750 xmlAutomataStatePtr 5751 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 5752 xmlAutomataStatePtr to, const xmlChar *token, 5753 void *data) { 5754 xmlRegAtomPtr atom; 5755 5756 if ((am == NULL) || (from == NULL) || (token == NULL)) 5757 return(NULL); 5758 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5759 if (atom == NULL) 5760 return(NULL); 5761 atom->data = data; 5762 atom->valuep = xmlStrdup(token); 5763 5764 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5765 xmlRegFreeAtom(atom); 5766 return(NULL); 5767 } 5768 if (to == NULL) 5769 return(am->state); 5770 return(to); 5771 } 5772 5773 /** 5774 * xmlAutomataNewTransition2: 5775 * @am: an automata 5776 * @from: the starting point of the transition 5777 * @to: the target point of the transition or NULL 5778 * @token: the first input string associated to that transition 5779 * @token2: the second input string associated to that transition 5780 * @data: data passed to the callback function if the transition is activated 5781 * 5782 * If @to is NULL, this creates first a new target state in the automata 5783 * and then adds a transition from the @from state to the target state 5784 * activated by the value of @token 5785 * 5786 * Returns the target state or NULL in case of error 5787 */ 5788 xmlAutomataStatePtr 5789 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5790 xmlAutomataStatePtr to, const xmlChar *token, 5791 const xmlChar *token2, void *data) { 5792 xmlRegAtomPtr atom; 5793 5794 if ((am == NULL) || (from == NULL) || (token == NULL)) 5795 return(NULL); 5796 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5797 if (atom == NULL) 5798 return(NULL); 5799 atom->data = data; 5800 if ((token2 == NULL) || (*token2 == 0)) { 5801 atom->valuep = xmlStrdup(token); 5802 } else { 5803 int lenn, lenp; 5804 xmlChar *str; 5805 5806 lenn = strlen((char *) token2); 5807 lenp = strlen((char *) token); 5808 5809 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5810 if (str == NULL) { 5811 xmlRegFreeAtom(atom); 5812 return(NULL); 5813 } 5814 memcpy(&str[0], token, lenp); 5815 str[lenp] = '|'; 5816 memcpy(&str[lenp + 1], token2, lenn); 5817 str[lenn + lenp + 1] = 0; 5818 5819 atom->valuep = str; 5820 } 5821 5822 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5823 xmlRegFreeAtom(atom); 5824 return(NULL); 5825 } 5826 if (to == NULL) 5827 return(am->state); 5828 return(to); 5829 } 5830 5831 /** 5832 * xmlAutomataNewNegTrans: 5833 * @am: an automata 5834 * @from: the starting point of the transition 5835 * @to: the target point of the transition or NULL 5836 * @token: the first input string associated to that transition 5837 * @token2: the second input string associated to that transition 5838 * @data: data passed to the callback function if the transition is activated 5839 * 5840 * If @to is NULL, this creates first a new target state in the automata 5841 * and then adds a transition from the @from state to the target state 5842 * activated by any value except (@token,@token2) 5843 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow 5844 # the semantic of XSD ##other 5845 * 5846 * Returns the target state or NULL in case of error 5847 */ 5848 xmlAutomataStatePtr 5849 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5850 xmlAutomataStatePtr to, const xmlChar *token, 5851 const xmlChar *token2, void *data) { 5852 xmlRegAtomPtr atom; 5853 xmlChar err_msg[200]; 5854 5855 if ((am == NULL) || (from == NULL) || (token == NULL)) 5856 return(NULL); 5857 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5858 if (atom == NULL) 5859 return(NULL); 5860 atom->data = data; 5861 atom->neg = 1; 5862 if ((token2 == NULL) || (*token2 == 0)) { 5863 atom->valuep = xmlStrdup(token); 5864 } else { 5865 int lenn, lenp; 5866 xmlChar *str; 5867 5868 lenn = strlen((char *) token2); 5869 lenp = strlen((char *) token); 5870 5871 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5872 if (str == NULL) { 5873 xmlRegFreeAtom(atom); 5874 return(NULL); 5875 } 5876 memcpy(&str[0], token, lenp); 5877 str[lenp] = '|'; 5878 memcpy(&str[lenp + 1], token2, lenn); 5879 str[lenn + lenp + 1] = 0; 5880 5881 atom->valuep = str; 5882 } 5883 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); 5884 err_msg[199] = 0; 5885 atom->valuep2 = xmlStrdup(err_msg); 5886 5887 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5888 xmlRegFreeAtom(atom); 5889 return(NULL); 5890 } 5891 am->negs++; 5892 if (to == NULL) 5893 return(am->state); 5894 return(to); 5895 } 5896 5897 /** 5898 * xmlAutomataNewCountTrans2: 5899 * @am: an automata 5900 * @from: the starting point of the transition 5901 * @to: the target point of the transition or NULL 5902 * @token: the input string associated to that transition 5903 * @token2: the second input string associated to that transition 5904 * @min: the minimum successive occurences of token 5905 * @max: the maximum successive occurences of token 5906 * @data: data associated to the transition 5907 * 5908 * If @to is NULL, this creates first a new target state in the automata 5909 * and then adds a transition from the @from state to the target state 5910 * activated by a succession of input of value @token and @token2 and 5911 * whose number is between @min and @max 5912 * 5913 * Returns the target state or NULL in case of error 5914 */ 5915 xmlAutomataStatePtr 5916 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5917 xmlAutomataStatePtr to, const xmlChar *token, 5918 const xmlChar *token2, 5919 int min, int max, void *data) { 5920 xmlRegAtomPtr atom; 5921 int counter; 5922 5923 if ((am == NULL) || (from == NULL) || (token == NULL)) 5924 return(NULL); 5925 if (min < 0) 5926 return(NULL); 5927 if ((max < min) || (max < 1)) 5928 return(NULL); 5929 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5930 if (atom == NULL) 5931 return(NULL); 5932 if ((token2 == NULL) || (*token2 == 0)) { 5933 atom->valuep = xmlStrdup(token); 5934 } else { 5935 int lenn, lenp; 5936 xmlChar *str; 5937 5938 lenn = strlen((char *) token2); 5939 lenp = strlen((char *) token); 5940 5941 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5942 if (str == NULL) { 5943 xmlRegFreeAtom(atom); 5944 return(NULL); 5945 } 5946 memcpy(&str[0], token, lenp); 5947 str[lenp] = '|'; 5948 memcpy(&str[lenp + 1], token2, lenn); 5949 str[lenn + lenp + 1] = 0; 5950 5951 atom->valuep = str; 5952 } 5953 atom->data = data; 5954 if (min == 0) 5955 atom->min = 1; 5956 else 5957 atom->min = min; 5958 atom->max = max; 5959 5960 /* 5961 * associate a counter to the transition. 5962 */ 5963 counter = xmlRegGetCounter(am); 5964 am->counters[counter].min = min; 5965 am->counters[counter].max = max; 5966 5967 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5968 if (to == NULL) { 5969 to = xmlRegNewState(am); 5970 xmlRegStatePush(am, to); 5971 } 5972 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5973 xmlRegAtomPush(am, atom); 5974 am->state = to; 5975 5976 if (to == NULL) 5977 to = am->state; 5978 if (to == NULL) 5979 return(NULL); 5980 if (min == 0) 5981 xmlFAGenerateEpsilonTransition(am, from, to); 5982 return(to); 5983 } 5984 5985 /** 5986 * xmlAutomataNewCountTrans: 5987 * @am: an automata 5988 * @from: the starting point of the transition 5989 * @to: the target point of the transition or NULL 5990 * @token: the input string associated to that transition 5991 * @min: the minimum successive occurences of token 5992 * @max: the maximum successive occurences of token 5993 * @data: data associated to the transition 5994 * 5995 * If @to is NULL, this creates first a new target state in the automata 5996 * and then adds a transition from the @from state to the target state 5997 * activated by a succession of input of value @token and whose number 5998 * is between @min and @max 5999 * 6000 * Returns the target state or NULL in case of error 6001 */ 6002 xmlAutomataStatePtr 6003 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6004 xmlAutomataStatePtr to, const xmlChar *token, 6005 int min, int max, void *data) { 6006 xmlRegAtomPtr atom; 6007 int counter; 6008 6009 if ((am == NULL) || (from == NULL) || (token == NULL)) 6010 return(NULL); 6011 if (min < 0) 6012 return(NULL); 6013 if ((max < min) || (max < 1)) 6014 return(NULL); 6015 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6016 if (atom == NULL) 6017 return(NULL); 6018 atom->valuep = xmlStrdup(token); 6019 atom->data = data; 6020 if (min == 0) 6021 atom->min = 1; 6022 else 6023 atom->min = min; 6024 atom->max = max; 6025 6026 /* 6027 * associate a counter to the transition. 6028 */ 6029 counter = xmlRegGetCounter(am); 6030 am->counters[counter].min = min; 6031 am->counters[counter].max = max; 6032 6033 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6034 if (to == NULL) { 6035 to = xmlRegNewState(am); 6036 xmlRegStatePush(am, to); 6037 } 6038 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6039 xmlRegAtomPush(am, atom); 6040 am->state = to; 6041 6042 if (to == NULL) 6043 to = am->state; 6044 if (to == NULL) 6045 return(NULL); 6046 if (min == 0) 6047 xmlFAGenerateEpsilonTransition(am, from, to); 6048 return(to); 6049 } 6050 6051 /** 6052 * xmlAutomataNewOnceTrans2: 6053 * @am: an automata 6054 * @from: the starting point of the transition 6055 * @to: the target point of the transition or NULL 6056 * @token: the input string associated to that transition 6057 * @token2: the second input string associated to that transition 6058 * @min: the minimum successive occurences of token 6059 * @max: the maximum successive occurences of token 6060 * @data: data associated to the transition 6061 * 6062 * If @to is NULL, this creates first a new target state in the automata 6063 * and then adds a transition from the @from state to the target state 6064 * activated by a succession of input of value @token and @token2 and whose 6065 * number is between @min and @max, moreover that transition can only be 6066 * crossed once. 6067 * 6068 * Returns the target state or NULL in case of error 6069 */ 6070 xmlAutomataStatePtr 6071 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 6072 xmlAutomataStatePtr to, const xmlChar *token, 6073 const xmlChar *token2, 6074 int min, int max, void *data) { 6075 xmlRegAtomPtr atom; 6076 int counter; 6077 6078 if ((am == NULL) || (from == NULL) || (token == NULL)) 6079 return(NULL); 6080 if (min < 1) 6081 return(NULL); 6082 if ((max < min) || (max < 1)) 6083 return(NULL); 6084 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6085 if (atom == NULL) 6086 return(NULL); 6087 if ((token2 == NULL) || (*token2 == 0)) { 6088 atom->valuep = xmlStrdup(token); 6089 } else { 6090 int lenn, lenp; 6091 xmlChar *str; 6092 6093 lenn = strlen((char *) token2); 6094 lenp = strlen((char *) token); 6095 6096 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 6097 if (str == NULL) { 6098 xmlRegFreeAtom(atom); 6099 return(NULL); 6100 } 6101 memcpy(&str[0], token, lenp); 6102 str[lenp] = '|'; 6103 memcpy(&str[lenp + 1], token2, lenn); 6104 str[lenn + lenp + 1] = 0; 6105 6106 atom->valuep = str; 6107 } 6108 atom->data = data; 6109 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6110 atom->min = min; 6111 atom->max = max; 6112 /* 6113 * associate a counter to the transition. 6114 */ 6115 counter = xmlRegGetCounter(am); 6116 am->counters[counter].min = 1; 6117 am->counters[counter].max = 1; 6118 6119 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6120 if (to == NULL) { 6121 to = xmlRegNewState(am); 6122 xmlRegStatePush(am, to); 6123 } 6124 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6125 xmlRegAtomPush(am, atom); 6126 am->state = to; 6127 return(to); 6128 } 6129 6130 6131 6132 /** 6133 * xmlAutomataNewOnceTrans: 6134 * @am: an automata 6135 * @from: the starting point of the transition 6136 * @to: the target point of the transition or NULL 6137 * @token: the input string associated to that transition 6138 * @min: the minimum successive occurences of token 6139 * @max: the maximum successive occurences of token 6140 * @data: data associated to the transition 6141 * 6142 * If @to is NULL, this creates first a new target state in the automata 6143 * and then adds a transition from the @from state to the target state 6144 * activated by a succession of input of value @token and whose number 6145 * is between @min and @max, moreover that transition can only be crossed 6146 * once. 6147 * 6148 * Returns the target state or NULL in case of error 6149 */ 6150 xmlAutomataStatePtr 6151 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6152 xmlAutomataStatePtr to, const xmlChar *token, 6153 int min, int max, void *data) { 6154 xmlRegAtomPtr atom; 6155 int counter; 6156 6157 if ((am == NULL) || (from == NULL) || (token == NULL)) 6158 return(NULL); 6159 if (min < 1) 6160 return(NULL); 6161 if ((max < min) || (max < 1)) 6162 return(NULL); 6163 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 6164 if (atom == NULL) 6165 return(NULL); 6166 atom->valuep = xmlStrdup(token); 6167 atom->data = data; 6168 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 6169 atom->min = min; 6170 atom->max = max; 6171 /* 6172 * associate a counter to the transition. 6173 */ 6174 counter = xmlRegGetCounter(am); 6175 am->counters[counter].min = 1; 6176 am->counters[counter].max = 1; 6177 6178 /* xmlFAGenerateTransitions(am, from, to, atom); */ 6179 if (to == NULL) { 6180 to = xmlRegNewState(am); 6181 xmlRegStatePush(am, to); 6182 } 6183 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 6184 xmlRegAtomPush(am, atom); 6185 am->state = to; 6186 return(to); 6187 } 6188 6189 /** 6190 * xmlAutomataNewState: 6191 * @am: an automata 6192 * 6193 * Create a new disconnected state in the automata 6194 * 6195 * Returns the new state or NULL in case of error 6196 */ 6197 xmlAutomataStatePtr 6198 xmlAutomataNewState(xmlAutomataPtr am) { 6199 xmlAutomataStatePtr to; 6200 6201 if (am == NULL) 6202 return(NULL); 6203 to = xmlRegNewState(am); 6204 xmlRegStatePush(am, to); 6205 return(to); 6206 } 6207 6208 /** 6209 * xmlAutomataNewEpsilon: 6210 * @am: an automata 6211 * @from: the starting point of the transition 6212 * @to: the target point of the transition or NULL 6213 * 6214 * If @to is NULL, this creates first a new target state in the automata 6215 * and then adds an epsilon transition from the @from state to the 6216 * target state 6217 * 6218 * Returns the target state or NULL in case of error 6219 */ 6220 xmlAutomataStatePtr 6221 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 6222 xmlAutomataStatePtr to) { 6223 if ((am == NULL) || (from == NULL)) 6224 return(NULL); 6225 xmlFAGenerateEpsilonTransition(am, from, to); 6226 if (to == NULL) 6227 return(am->state); 6228 return(to); 6229 } 6230 6231 /** 6232 * xmlAutomataNewAllTrans: 6233 * @am: an automata 6234 * @from: the starting point of the transition 6235 * @to: the target point of the transition or NULL 6236 * @lax: allow to transition if not all all transitions have been activated 6237 * 6238 * If @to is NULL, this creates first a new target state in the automata 6239 * and then adds a an ALL transition from the @from state to the 6240 * target state. That transition is an epsilon transition allowed only when 6241 * all transitions from the @from node have been activated. 6242 * 6243 * Returns the target state or NULL in case of error 6244 */ 6245 xmlAutomataStatePtr 6246 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6247 xmlAutomataStatePtr to, int lax) { 6248 if ((am == NULL) || (from == NULL)) 6249 return(NULL); 6250 xmlFAGenerateAllTransition(am, from, to, lax); 6251 if (to == NULL) 6252 return(am->state); 6253 return(to); 6254 } 6255 6256 /** 6257 * xmlAutomataNewCounter: 6258 * @am: an automata 6259 * @min: the minimal value on the counter 6260 * @max: the maximal value on the counter 6261 * 6262 * Create a new counter 6263 * 6264 * Returns the counter number or -1 in case of error 6265 */ 6266 int 6267 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 6268 int ret; 6269 6270 if (am == NULL) 6271 return(-1); 6272 6273 ret = xmlRegGetCounter(am); 6274 if (ret < 0) 6275 return(-1); 6276 am->counters[ret].min = min; 6277 am->counters[ret].max = max; 6278 return(ret); 6279 } 6280 6281 /** 6282 * xmlAutomataNewCountedTrans: 6283 * @am: an automata 6284 * @from: the starting point of the transition 6285 * @to: the target point of the transition or NULL 6286 * @counter: the counter associated to that transition 6287 * 6288 * If @to is NULL, this creates first a new target state in the automata 6289 * and then adds an epsilon transition from the @from state to the target state 6290 * which will increment the counter provided 6291 * 6292 * Returns the target state or NULL in case of error 6293 */ 6294 xmlAutomataStatePtr 6295 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6296 xmlAutomataStatePtr to, int counter) { 6297 if ((am == NULL) || (from == NULL) || (counter < 0)) 6298 return(NULL); 6299 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 6300 if (to == NULL) 6301 return(am->state); 6302 return(to); 6303 } 6304 6305 /** 6306 * xmlAutomataNewCounterTrans: 6307 * @am: an automata 6308 * @from: the starting point of the transition 6309 * @to: the target point of the transition or NULL 6310 * @counter: the counter associated to that transition 6311 * 6312 * If @to is NULL, this creates first a new target state in the automata 6313 * and then adds an epsilon transition from the @from state to the target state 6314 * which will be allowed only if the counter is within the right range. 6315 * 6316 * Returns the target state or NULL in case of error 6317 */ 6318 xmlAutomataStatePtr 6319 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6320 xmlAutomataStatePtr to, int counter) { 6321 if ((am == NULL) || (from == NULL) || (counter < 0)) 6322 return(NULL); 6323 xmlFAGenerateCountedTransition(am, from, to, counter); 6324 if (to == NULL) 6325 return(am->state); 6326 return(to); 6327 } 6328 6329 /** 6330 * xmlAutomataCompile: 6331 * @am: an automata 6332 * 6333 * Compile the automata into a Reg Exp ready for being executed. 6334 * The automata should be free after this point. 6335 * 6336 * Returns the compiled regexp or NULL in case of error 6337 */ 6338 xmlRegexpPtr 6339 xmlAutomataCompile(xmlAutomataPtr am) { 6340 xmlRegexpPtr ret; 6341 6342 if ((am == NULL) || (am->error != 0)) return(NULL); 6343 xmlFAEliminateEpsilonTransitions(am); 6344 /* xmlFAComputesDeterminism(am); */ 6345 ret = xmlRegEpxFromParse(am); 6346 6347 return(ret); 6348 } 6349 6350 /** 6351 * xmlAutomataIsDeterminist: 6352 * @am: an automata 6353 * 6354 * Checks if an automata is determinist. 6355 * 6356 * Returns 1 if true, 0 if not, and -1 in case of error 6357 */ 6358 int 6359 xmlAutomataIsDeterminist(xmlAutomataPtr am) { 6360 int ret; 6361 6362 if (am == NULL) 6363 return(-1); 6364 6365 ret = xmlFAComputesDeterminism(am); 6366 return(ret); 6367 } 6368 #endif /* LIBXML_AUTOMATA_ENABLED */ 6369 6370 #ifdef LIBXML_EXPR_ENABLED 6371 /************************************************************************ 6372 * * 6373 * Formal Expression handling code * 6374 * * 6375 ************************************************************************/ 6376 /************************************************************************ 6377 * * 6378 * Expression handling context * 6379 * * 6380 ************************************************************************/ 6381 6382 struct _xmlExpCtxt { 6383 xmlDictPtr dict; 6384 xmlExpNodePtr *table; 6385 int size; 6386 int nbElems; 6387 int nb_nodes; 6388 int maxNodes; 6389 const char *expr; 6390 const char *cur; 6391 int nb_cons; 6392 int tabSize; 6393 }; 6394 6395 /** 6396 * xmlExpNewCtxt: 6397 * @maxNodes: the maximum number of nodes 6398 * @dict: optional dictionary to use internally 6399 * 6400 * Creates a new context for manipulating expressions 6401 * 6402 * Returns the context or NULL in case of error 6403 */ 6404 xmlExpCtxtPtr 6405 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) { 6406 xmlExpCtxtPtr ret; 6407 int size = 256; 6408 6409 if (maxNodes <= 4096) 6410 maxNodes = 4096; 6411 6412 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt)); 6413 if (ret == NULL) 6414 return(NULL); 6415 memset(ret, 0, sizeof(xmlExpCtxt)); 6416 ret->size = size; 6417 ret->nbElems = 0; 6418 ret->maxNodes = maxNodes; 6419 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr)); 6420 if (ret->table == NULL) { 6421 xmlFree(ret); 6422 return(NULL); 6423 } 6424 memset(ret->table, 0, size * sizeof(xmlExpNodePtr)); 6425 if (dict == NULL) { 6426 ret->dict = xmlDictCreate(); 6427 if (ret->dict == NULL) { 6428 xmlFree(ret->table); 6429 xmlFree(ret); 6430 return(NULL); 6431 } 6432 } else { 6433 ret->dict = dict; 6434 xmlDictReference(ret->dict); 6435 } 6436 return(ret); 6437 } 6438 6439 /** 6440 * xmlExpFreeCtxt: 6441 * @ctxt: an expression context 6442 * 6443 * Free an expression context 6444 */ 6445 void 6446 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) { 6447 if (ctxt == NULL) 6448 return; 6449 xmlDictFree(ctxt->dict); 6450 if (ctxt->table != NULL) 6451 xmlFree(ctxt->table); 6452 xmlFree(ctxt); 6453 } 6454 6455 /************************************************************************ 6456 * * 6457 * Structure associated to an expression node * 6458 * * 6459 ************************************************************************/ 6460 #define MAX_NODES 10000 6461 6462 /* #define DEBUG_DERIV */ 6463 6464 /* 6465 * TODO: 6466 * - Wildcards 6467 * - public API for creation 6468 * 6469 * Started 6470 * - regression testing 6471 * 6472 * Done 6473 * - split into module and test tool 6474 * - memleaks 6475 */ 6476 6477 typedef enum { 6478 XML_EXP_NILABLE = (1 << 0) 6479 } xmlExpNodeInfo; 6480 6481 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE) 6482 6483 struct _xmlExpNode { 6484 unsigned char type;/* xmlExpNodeType */ 6485 unsigned char info;/* OR of xmlExpNodeInfo */ 6486 unsigned short key; /* the hash key */ 6487 unsigned int ref; /* The number of references */ 6488 int c_max; /* the maximum length it can consume */ 6489 xmlExpNodePtr exp_left; 6490 xmlExpNodePtr next;/* the next node in the hash table or free list */ 6491 union { 6492 struct { 6493 int f_min; 6494 int f_max; 6495 } count; 6496 struct { 6497 xmlExpNodePtr f_right; 6498 } children; 6499 const xmlChar *f_str; 6500 } field; 6501 }; 6502 6503 #define exp_min field.count.f_min 6504 #define exp_max field.count.f_max 6505 /* #define exp_left field.children.f_left */ 6506 #define exp_right field.children.f_right 6507 #define exp_str field.f_str 6508 6509 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type); 6510 static xmlExpNode forbiddenExpNode = { 6511 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6512 }; 6513 xmlExpNodePtr forbiddenExp = &forbiddenExpNode; 6514 static xmlExpNode emptyExpNode = { 6515 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6516 }; 6517 xmlExpNodePtr emptyExp = &emptyExpNode; 6518 6519 /************************************************************************ 6520 * * 6521 * The custom hash table for unicity and canonicalization * 6522 * of sub-expressions pointers * 6523 * * 6524 ************************************************************************/ 6525 /* 6526 * xmlExpHashNameComputeKey: 6527 * Calculate the hash key for a token 6528 */ 6529 static unsigned short 6530 xmlExpHashNameComputeKey(const xmlChar *name) { 6531 unsigned short value = 0L; 6532 char ch; 6533 6534 if (name != NULL) { 6535 value += 30 * (*name); 6536 while ((ch = *name++) != 0) { 6537 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); 6538 } 6539 } 6540 return (value); 6541 } 6542 6543 /* 6544 * xmlExpHashComputeKey: 6545 * Calculate the hash key for a compound expression 6546 */ 6547 static unsigned short 6548 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left, 6549 xmlExpNodePtr right) { 6550 unsigned long value; 6551 unsigned short ret; 6552 6553 switch (type) { 6554 case XML_EXP_SEQ: 6555 value = left->key; 6556 value += right->key; 6557 value *= 3; 6558 ret = (unsigned short) value; 6559 break; 6560 case XML_EXP_OR: 6561 value = left->key; 6562 value += right->key; 6563 value *= 7; 6564 ret = (unsigned short) value; 6565 break; 6566 case XML_EXP_COUNT: 6567 value = left->key; 6568 value += right->key; 6569 ret = (unsigned short) value; 6570 break; 6571 default: 6572 ret = 0; 6573 } 6574 return(ret); 6575 } 6576 6577 6578 static xmlExpNodePtr 6579 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) { 6580 xmlExpNodePtr ret; 6581 6582 if (ctxt->nb_nodes >= MAX_NODES) 6583 return(NULL); 6584 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode)); 6585 if (ret == NULL) 6586 return(NULL); 6587 memset(ret, 0, sizeof(xmlExpNode)); 6588 ret->type = type; 6589 ret->next = NULL; 6590 ctxt->nb_nodes++; 6591 ctxt->nb_cons++; 6592 return(ret); 6593 } 6594 6595 /** 6596 * xmlExpHashGetEntry: 6597 * @table: the hash table 6598 * 6599 * Get the unique entry from the hash table. The entry is created if 6600 * needed. @left and @right are consumed, i.e. their ref count will 6601 * be decremented by the operation. 6602 * 6603 * Returns the pointer or NULL in case of error 6604 */ 6605 static xmlExpNodePtr 6606 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, 6607 xmlExpNodePtr left, xmlExpNodePtr right, 6608 const xmlChar *name, int min, int max) { 6609 unsigned short kbase, key; 6610 xmlExpNodePtr entry; 6611 xmlExpNodePtr insert; 6612 6613 if (ctxt == NULL) 6614 return(NULL); 6615 6616 /* 6617 * Check for duplicate and insertion location. 6618 */ 6619 if (type == XML_EXP_ATOM) { 6620 kbase = xmlExpHashNameComputeKey(name); 6621 } else if (type == XML_EXP_COUNT) { 6622 /* COUNT reduction rule 1 */ 6623 /* a{1} -> a */ 6624 if (min == max) { 6625 if (min == 1) { 6626 return(left); 6627 } 6628 if (min == 0) { 6629 xmlExpFree(ctxt, left); 6630 return(emptyExp); 6631 } 6632 } 6633 if (min < 0) { 6634 xmlExpFree(ctxt, left); 6635 return(forbiddenExp); 6636 } 6637 if (max == -1) 6638 kbase = min + 79; 6639 else 6640 kbase = max - min; 6641 kbase += left->key; 6642 } else if (type == XML_EXP_OR) { 6643 /* Forbid reduction rules */ 6644 if (left->type == XML_EXP_FORBID) { 6645 xmlExpFree(ctxt, left); 6646 return(right); 6647 } 6648 if (right->type == XML_EXP_FORBID) { 6649 xmlExpFree(ctxt, right); 6650 return(left); 6651 } 6652 6653 /* OR reduction rule 1 */ 6654 /* a | a reduced to a */ 6655 if (left == right) { 6656 left->ref--; 6657 return(left); 6658 } 6659 /* OR canonicalization rule 1 */ 6660 /* linearize (a | b) | c into a | (b | c) */ 6661 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) { 6662 xmlExpNodePtr tmp = left; 6663 left = right; 6664 right = tmp; 6665 } 6666 /* OR reduction rule 2 */ 6667 /* a | (a | b) and b | (a | b) are reduced to a | b */ 6668 if (right->type == XML_EXP_OR) { 6669 if ((left == right->exp_left) || 6670 (left == right->exp_right)) { 6671 xmlExpFree(ctxt, left); 6672 return(right); 6673 } 6674 } 6675 /* OR canonicalization rule 2 */ 6676 /* linearize (a | b) | c into a | (b | c) */ 6677 if (left->type == XML_EXP_OR) { 6678 xmlExpNodePtr tmp; 6679 6680 /* OR canonicalization rule 2 */ 6681 if ((left->exp_right->type != XML_EXP_OR) && 6682 (left->exp_right->key < left->exp_left->key)) { 6683 tmp = left->exp_right; 6684 left->exp_right = left->exp_left; 6685 left->exp_left = tmp; 6686 } 6687 left->exp_right->ref++; 6688 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right, 6689 NULL, 0, 0); 6690 left->exp_left->ref++; 6691 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp, 6692 NULL, 0, 0); 6693 6694 xmlExpFree(ctxt, left); 6695 return(tmp); 6696 } 6697 if (right->type == XML_EXP_OR) { 6698 /* Ordering in the tree */ 6699 /* C | (A | B) -> A | (B | C) */ 6700 if (left->key > right->exp_right->key) { 6701 xmlExpNodePtr tmp; 6702 right->exp_right->ref++; 6703 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right, 6704 left, NULL, 0, 0); 6705 right->exp_left->ref++; 6706 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6707 tmp, NULL, 0, 0); 6708 xmlExpFree(ctxt, right); 6709 return(tmp); 6710 } 6711 /* Ordering in the tree */ 6712 /* B | (A | C) -> A | (B | C) */ 6713 if (left->key > right->exp_left->key) { 6714 xmlExpNodePtr tmp; 6715 right->exp_right->ref++; 6716 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, 6717 right->exp_right, NULL, 0, 0); 6718 right->exp_left->ref++; 6719 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6720 tmp, NULL, 0, 0); 6721 xmlExpFree(ctxt, right); 6722 return(tmp); 6723 } 6724 } 6725 /* we know both types are != XML_EXP_OR here */ 6726 else if (left->key > right->key) { 6727 xmlExpNodePtr tmp = left; 6728 left = right; 6729 right = tmp; 6730 } 6731 kbase = xmlExpHashComputeKey(type, left, right); 6732 } else if (type == XML_EXP_SEQ) { 6733 /* Forbid reduction rules */ 6734 if (left->type == XML_EXP_FORBID) { 6735 xmlExpFree(ctxt, right); 6736 return(left); 6737 } 6738 if (right->type == XML_EXP_FORBID) { 6739 xmlExpFree(ctxt, left); 6740 return(right); 6741 } 6742 /* Empty reduction rules */ 6743 if (right->type == XML_EXP_EMPTY) { 6744 return(left); 6745 } 6746 if (left->type == XML_EXP_EMPTY) { 6747 return(right); 6748 } 6749 kbase = xmlExpHashComputeKey(type, left, right); 6750 } else 6751 return(NULL); 6752 6753 key = kbase % ctxt->size; 6754 if (ctxt->table[key] != NULL) { 6755 for (insert = ctxt->table[key]; insert != NULL; 6756 insert = insert->next) { 6757 if ((insert->key == kbase) && 6758 (insert->type == type)) { 6759 if (type == XML_EXP_ATOM) { 6760 if (name == insert->exp_str) { 6761 insert->ref++; 6762 return(insert); 6763 } 6764 } else if (type == XML_EXP_COUNT) { 6765 if ((insert->exp_min == min) && (insert->exp_max == max) && 6766 (insert->exp_left == left)) { 6767 insert->ref++; 6768 left->ref--; 6769 return(insert); 6770 } 6771 } else if ((insert->exp_left == left) && 6772 (insert->exp_right == right)) { 6773 insert->ref++; 6774 left->ref--; 6775 right->ref--; 6776 return(insert); 6777 } 6778 } 6779 } 6780 } 6781 6782 entry = xmlExpNewNode(ctxt, type); 6783 if (entry == NULL) 6784 return(NULL); 6785 entry->key = kbase; 6786 if (type == XML_EXP_ATOM) { 6787 entry->exp_str = name; 6788 entry->c_max = 1; 6789 } else if (type == XML_EXP_COUNT) { 6790 entry->exp_min = min; 6791 entry->exp_max = max; 6792 entry->exp_left = left; 6793 if ((min == 0) || (IS_NILLABLE(left))) 6794 entry->info |= XML_EXP_NILABLE; 6795 if (max < 0) 6796 entry->c_max = -1; 6797 else 6798 entry->c_max = max * entry->exp_left->c_max; 6799 } else { 6800 entry->exp_left = left; 6801 entry->exp_right = right; 6802 if (type == XML_EXP_OR) { 6803 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right))) 6804 entry->info |= XML_EXP_NILABLE; 6805 if ((entry->exp_left->c_max == -1) || 6806 (entry->exp_right->c_max == -1)) 6807 entry->c_max = -1; 6808 else if (entry->exp_left->c_max > entry->exp_right->c_max) 6809 entry->c_max = entry->exp_left->c_max; 6810 else 6811 entry->c_max = entry->exp_right->c_max; 6812 } else { 6813 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right))) 6814 entry->info |= XML_EXP_NILABLE; 6815 if ((entry->exp_left->c_max == -1) || 6816 (entry->exp_right->c_max == -1)) 6817 entry->c_max = -1; 6818 else 6819 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max; 6820 } 6821 } 6822 entry->ref = 1; 6823 if (ctxt->table[key] != NULL) 6824 entry->next = ctxt->table[key]; 6825 6826 ctxt->table[key] = entry; 6827 ctxt->nbElems++; 6828 6829 return(entry); 6830 } 6831 6832 /** 6833 * xmlExpFree: 6834 * @ctxt: the expression context 6835 * @exp: the expression 6836 * 6837 * Dereference the expression 6838 */ 6839 void 6840 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) { 6841 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp)) 6842 return; 6843 exp->ref--; 6844 if (exp->ref == 0) { 6845 unsigned short key; 6846 6847 /* Unlink it first from the hash table */ 6848 key = exp->key % ctxt->size; 6849 if (ctxt->table[key] == exp) { 6850 ctxt->table[key] = exp->next; 6851 } else { 6852 xmlExpNodePtr tmp; 6853 6854 tmp = ctxt->table[key]; 6855 while (tmp != NULL) { 6856 if (tmp->next == exp) { 6857 tmp->next = exp->next; 6858 break; 6859 } 6860 tmp = tmp->next; 6861 } 6862 } 6863 6864 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) { 6865 xmlExpFree(ctxt, exp->exp_left); 6866 xmlExpFree(ctxt, exp->exp_right); 6867 } else if (exp->type == XML_EXP_COUNT) { 6868 xmlExpFree(ctxt, exp->exp_left); 6869 } 6870 xmlFree(exp); 6871 ctxt->nb_nodes--; 6872 } 6873 } 6874 6875 /** 6876 * xmlExpRef: 6877 * @exp: the expression 6878 * 6879 * Increase the reference count of the expression 6880 */ 6881 void 6882 xmlExpRef(xmlExpNodePtr exp) { 6883 if (exp != NULL) 6884 exp->ref++; 6885 } 6886 6887 /** 6888 * xmlExpNewAtom: 6889 * @ctxt: the expression context 6890 * @name: the atom name 6891 * @len: the atom name length in byte (or -1); 6892 * 6893 * Get the atom associated to this name from that context 6894 * 6895 * Returns the node or NULL in case of error 6896 */ 6897 xmlExpNodePtr 6898 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) { 6899 if ((ctxt == NULL) || (name == NULL)) 6900 return(NULL); 6901 name = xmlDictLookup(ctxt->dict, name, len); 6902 if (name == NULL) 6903 return(NULL); 6904 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0)); 6905 } 6906 6907 /** 6908 * xmlExpNewOr: 6909 * @ctxt: the expression context 6910 * @left: left expression 6911 * @right: right expression 6912 * 6913 * Get the atom associated to the choice @left | @right 6914 * Note that @left and @right are consumed in the operation, to keep 6915 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6916 * this is true even in case of failure (unless ctxt == NULL). 6917 * 6918 * Returns the node or NULL in case of error 6919 */ 6920 xmlExpNodePtr 6921 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6922 if (ctxt == NULL) 6923 return(NULL); 6924 if ((left == NULL) || (right == NULL)) { 6925 xmlExpFree(ctxt, left); 6926 xmlExpFree(ctxt, right); 6927 return(NULL); 6928 } 6929 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0)); 6930 } 6931 6932 /** 6933 * xmlExpNewSeq: 6934 * @ctxt: the expression context 6935 * @left: left expression 6936 * @right: right expression 6937 * 6938 * Get the atom associated to the sequence @left , @right 6939 * Note that @left and @right are consumed in the operation, to keep 6940 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6941 * this is true even in case of failure (unless ctxt == NULL). 6942 * 6943 * Returns the node or NULL in case of error 6944 */ 6945 xmlExpNodePtr 6946 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6947 if (ctxt == NULL) 6948 return(NULL); 6949 if ((left == NULL) || (right == NULL)) { 6950 xmlExpFree(ctxt, left); 6951 xmlExpFree(ctxt, right); 6952 return(NULL); 6953 } 6954 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0)); 6955 } 6956 6957 /** 6958 * xmlExpNewRange: 6959 * @ctxt: the expression context 6960 * @subset: the expression to be repeated 6961 * @min: the lower bound for the repetition 6962 * @max: the upper bound for the repetition, -1 means infinite 6963 * 6964 * Get the atom associated to the range (@subset){@min, @max} 6965 * Note that @subset is consumed in the operation, to keep 6966 * an handle on it use xmlExpRef() and use xmlExpFree() to release it, 6967 * this is true even in case of failure (unless ctxt == NULL). 6968 * 6969 * Returns the node or NULL in case of error 6970 */ 6971 xmlExpNodePtr 6972 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) { 6973 if (ctxt == NULL) 6974 return(NULL); 6975 if ((subset == NULL) || (min < 0) || (max < -1) || 6976 ((max >= 0) && (min > max))) { 6977 xmlExpFree(ctxt, subset); 6978 return(NULL); 6979 } 6980 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset, 6981 NULL, NULL, min, max)); 6982 } 6983 6984 /************************************************************************ 6985 * * 6986 * Public API for operations on expressions * 6987 * * 6988 ************************************************************************/ 6989 6990 static int 6991 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6992 const xmlChar**list, int len, int nb) { 6993 int tmp, tmp2; 6994 tail: 6995 switch (exp->type) { 6996 case XML_EXP_EMPTY: 6997 return(0); 6998 case XML_EXP_ATOM: 6999 for (tmp = 0;tmp < nb;tmp++) 7000 if (list[tmp] == exp->exp_str) 7001 return(0); 7002 if (nb >= len) 7003 return(-2); 7004 list[nb] = exp->exp_str; 7005 return(1); 7006 case XML_EXP_COUNT: 7007 exp = exp->exp_left; 7008 goto tail; 7009 case XML_EXP_SEQ: 7010 case XML_EXP_OR: 7011 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb); 7012 if (tmp < 0) 7013 return(tmp); 7014 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len, 7015 nb + tmp); 7016 if (tmp2 < 0) 7017 return(tmp2); 7018 return(tmp + tmp2); 7019 } 7020 return(-1); 7021 } 7022 7023 /** 7024 * xmlExpGetLanguage: 7025 * @ctxt: the expression context 7026 * @exp: the expression 7027 * @langList: where to store the tokens 7028 * @len: the allocated length of @list 7029 * 7030 * Find all the strings used in @exp and store them in @list 7031 * 7032 * Returns the number of unique strings found, -1 in case of errors and 7033 * -2 if there is more than @len strings 7034 */ 7035 int 7036 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7037 const xmlChar**langList, int len) { 7038 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0)) 7039 return(-1); 7040 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0)); 7041 } 7042 7043 static int 7044 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7045 const xmlChar**list, int len, int nb) { 7046 int tmp, tmp2; 7047 tail: 7048 switch (exp->type) { 7049 case XML_EXP_FORBID: 7050 return(0); 7051 case XML_EXP_EMPTY: 7052 return(0); 7053 case XML_EXP_ATOM: 7054 for (tmp = 0;tmp < nb;tmp++) 7055 if (list[tmp] == exp->exp_str) 7056 return(0); 7057 if (nb >= len) 7058 return(-2); 7059 list[nb] = exp->exp_str; 7060 return(1); 7061 case XML_EXP_COUNT: 7062 exp = exp->exp_left; 7063 goto tail; 7064 case XML_EXP_SEQ: 7065 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7066 if (tmp < 0) 7067 return(tmp); 7068 if (IS_NILLABLE(exp->exp_left)) { 7069 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7070 nb + tmp); 7071 if (tmp2 < 0) 7072 return(tmp2); 7073 tmp += tmp2; 7074 } 7075 return(tmp); 7076 case XML_EXP_OR: 7077 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 7078 if (tmp < 0) 7079 return(tmp); 7080 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 7081 nb + tmp); 7082 if (tmp2 < 0) 7083 return(tmp2); 7084 return(tmp + tmp2); 7085 } 7086 return(-1); 7087 } 7088 7089 /** 7090 * xmlExpGetStart: 7091 * @ctxt: the expression context 7092 * @exp: the expression 7093 * @tokList: where to store the tokens 7094 * @len: the allocated length of @list 7095 * 7096 * Find all the strings that appears at the start of the languages 7097 * accepted by @exp and store them in @list. E.g. for (a, b) | c 7098 * it will return the list [a, c] 7099 * 7100 * Returns the number of unique strings found, -1 in case of errors and 7101 * -2 if there is more than @len strings 7102 */ 7103 int 7104 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7105 const xmlChar**tokList, int len) { 7106 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0)) 7107 return(-1); 7108 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0)); 7109 } 7110 7111 /** 7112 * xmlExpIsNillable: 7113 * @exp: the expression 7114 * 7115 * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce 7116 * 7117 * Returns 1 if nillable, 0 if not and -1 in case of error 7118 */ 7119 int 7120 xmlExpIsNillable(xmlExpNodePtr exp) { 7121 if (exp == NULL) 7122 return(-1); 7123 return(IS_NILLABLE(exp) != 0); 7124 } 7125 7126 static xmlExpNodePtr 7127 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str) 7128 { 7129 xmlExpNodePtr ret; 7130 7131 switch (exp->type) { 7132 case XML_EXP_EMPTY: 7133 return(forbiddenExp); 7134 case XML_EXP_FORBID: 7135 return(forbiddenExp); 7136 case XML_EXP_ATOM: 7137 if (exp->exp_str == str) { 7138 #ifdef DEBUG_DERIV 7139 printf("deriv atom: equal => Empty\n"); 7140 #endif 7141 ret = emptyExp; 7142 } else { 7143 #ifdef DEBUG_DERIV 7144 printf("deriv atom: mismatch => forbid\n"); 7145 #endif 7146 /* TODO wildcards here */ 7147 ret = forbiddenExp; 7148 } 7149 return(ret); 7150 case XML_EXP_OR: { 7151 xmlExpNodePtr tmp; 7152 7153 #ifdef DEBUG_DERIV 7154 printf("deriv or: => or(derivs)\n"); 7155 #endif 7156 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7157 if (tmp == NULL) { 7158 return(NULL); 7159 } 7160 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7161 if (ret == NULL) { 7162 xmlExpFree(ctxt, tmp); 7163 return(NULL); 7164 } 7165 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, 7166 NULL, 0, 0); 7167 return(ret); 7168 } 7169 case XML_EXP_SEQ: 7170 #ifdef DEBUG_DERIV 7171 printf("deriv seq: starting with left\n"); 7172 #endif 7173 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7174 if (ret == NULL) { 7175 return(NULL); 7176 } else if (ret == forbiddenExp) { 7177 if (IS_NILLABLE(exp->exp_left)) { 7178 #ifdef DEBUG_DERIV 7179 printf("deriv seq: left failed but nillable\n"); 7180 #endif 7181 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 7182 } 7183 } else { 7184 #ifdef DEBUG_DERIV 7185 printf("deriv seq: left match => sequence\n"); 7186 #endif 7187 exp->exp_right->ref++; 7188 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right, 7189 NULL, 0, 0); 7190 } 7191 return(ret); 7192 case XML_EXP_COUNT: { 7193 int min, max; 7194 xmlExpNodePtr tmp; 7195 7196 if (exp->exp_max == 0) 7197 return(forbiddenExp); 7198 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 7199 if (ret == NULL) 7200 return(NULL); 7201 if (ret == forbiddenExp) { 7202 #ifdef DEBUG_DERIV 7203 printf("deriv count: pattern mismatch => forbid\n"); 7204 #endif 7205 return(ret); 7206 } 7207 if (exp->exp_max == 1) 7208 return(ret); 7209 if (exp->exp_max < 0) /* unbounded */ 7210 max = -1; 7211 else 7212 max = exp->exp_max - 1; 7213 if (exp->exp_min > 0) 7214 min = exp->exp_min - 1; 7215 else 7216 min = 0; 7217 exp->exp_left->ref++; 7218 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL, 7219 NULL, min, max); 7220 if (ret == emptyExp) { 7221 #ifdef DEBUG_DERIV 7222 printf("deriv count: match to empty => new count\n"); 7223 #endif 7224 return(tmp); 7225 } 7226 #ifdef DEBUG_DERIV 7227 printf("deriv count: match => sequence with new count\n"); 7228 #endif 7229 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp, 7230 NULL, 0, 0)); 7231 } 7232 } 7233 return(NULL); 7234 } 7235 7236 /** 7237 * xmlExpStringDerive: 7238 * @ctxt: the expression context 7239 * @exp: the expression 7240 * @str: the string 7241 * @len: the string len in bytes if available 7242 * 7243 * Do one step of Brzozowski derivation of the expression @exp with 7244 * respect to the input string 7245 * 7246 * Returns the resulting expression or NULL in case of internal error 7247 */ 7248 xmlExpNodePtr 7249 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7250 const xmlChar *str, int len) { 7251 const xmlChar *input; 7252 7253 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) { 7254 return(NULL); 7255 } 7256 /* 7257 * check the string is in the dictionary, if yes use an interned 7258 * copy, otherwise we know it's not an acceptable input 7259 */ 7260 input = xmlDictExists(ctxt->dict, str, len); 7261 if (input == NULL) { 7262 return(forbiddenExp); 7263 } 7264 return(xmlExpStringDeriveInt(ctxt, exp, input)); 7265 } 7266 7267 static int 7268 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) { 7269 int ret = 1; 7270 7271 if (sub->c_max == -1) { 7272 if (exp->c_max != -1) 7273 ret = 0; 7274 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) { 7275 ret = 0; 7276 } 7277 #if 0 7278 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp))) 7279 ret = 0; 7280 #endif 7281 return(ret); 7282 } 7283 7284 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7285 xmlExpNodePtr sub); 7286 /** 7287 * xmlExpDivide: 7288 * @ctxt: the expressions context 7289 * @exp: the englobing expression 7290 * @sub: the subexpression 7291 * @mult: the multiple expression 7292 * @remain: the remain from the derivation of the multiple 7293 * 7294 * Check if exp is a multiple of sub, i.e. if there is a finite number n 7295 * so that sub{n} subsume exp 7296 * 7297 * Returns the multiple value if successful, 0 if it is not a multiple 7298 * and -1 in case of internel error. 7299 */ 7300 7301 static int 7302 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub, 7303 xmlExpNodePtr *mult, xmlExpNodePtr *remain) { 7304 int i; 7305 xmlExpNodePtr tmp, tmp2; 7306 7307 if (mult != NULL) *mult = NULL; 7308 if (remain != NULL) *remain = NULL; 7309 if (exp->c_max == -1) return(0); 7310 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0); 7311 7312 for (i = 1;i <= exp->c_max;i++) { 7313 sub->ref++; 7314 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7315 sub, NULL, NULL, i, i); 7316 if (tmp == NULL) { 7317 return(-1); 7318 } 7319 if (!xmlExpCheckCard(tmp, exp)) { 7320 xmlExpFree(ctxt, tmp); 7321 continue; 7322 } 7323 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp); 7324 if (tmp2 == NULL) { 7325 xmlExpFree(ctxt, tmp); 7326 return(-1); 7327 } 7328 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) { 7329 if (remain != NULL) 7330 *remain = tmp2; 7331 else 7332 xmlExpFree(ctxt, tmp2); 7333 if (mult != NULL) 7334 *mult = tmp; 7335 else 7336 xmlExpFree(ctxt, tmp); 7337 #ifdef DEBUG_DERIV 7338 printf("Divide succeeded %d\n", i); 7339 #endif 7340 return(i); 7341 } 7342 xmlExpFree(ctxt, tmp); 7343 xmlExpFree(ctxt, tmp2); 7344 } 7345 #ifdef DEBUG_DERIV 7346 printf("Divide failed\n"); 7347 #endif 7348 return(0); 7349 } 7350 7351 /** 7352 * xmlExpExpDeriveInt: 7353 * @ctxt: the expressions context 7354 * @exp: the englobing expression 7355 * @sub: the subexpression 7356 * 7357 * Try to do a step of Brzozowski derivation but at a higher level 7358 * the input being a subexpression. 7359 * 7360 * Returns the resulting expression or NULL in case of internal error 7361 */ 7362 static xmlExpNodePtr 7363 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7364 xmlExpNodePtr ret, tmp, tmp2, tmp3; 7365 const xmlChar **tab; 7366 int len, i; 7367 7368 /* 7369 * In case of equality and if the expression can only consume a finite 7370 * amount, then the derivation is empty 7371 */ 7372 if ((exp == sub) && (exp->c_max >= 0)) { 7373 #ifdef DEBUG_DERIV 7374 printf("Equal(exp, sub) and finite -> Empty\n"); 7375 #endif 7376 return(emptyExp); 7377 } 7378 /* 7379 * decompose sub sequence first 7380 */ 7381 if (sub->type == XML_EXP_EMPTY) { 7382 #ifdef DEBUG_DERIV 7383 printf("Empty(sub) -> Empty\n"); 7384 #endif 7385 exp->ref++; 7386 return(exp); 7387 } 7388 if (sub->type == XML_EXP_SEQ) { 7389 #ifdef DEBUG_DERIV 7390 printf("Seq(sub) -> decompose\n"); 7391 #endif 7392 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7393 if (tmp == NULL) 7394 return(NULL); 7395 if (tmp == forbiddenExp) 7396 return(tmp); 7397 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right); 7398 xmlExpFree(ctxt, tmp); 7399 return(ret); 7400 } 7401 if (sub->type == XML_EXP_OR) { 7402 #ifdef DEBUG_DERIV 7403 printf("Or(sub) -> decompose\n"); 7404 #endif 7405 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7406 if (tmp == forbiddenExp) 7407 return(tmp); 7408 if (tmp == NULL) 7409 return(NULL); 7410 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right); 7411 if ((ret == NULL) || (ret == forbiddenExp)) { 7412 xmlExpFree(ctxt, tmp); 7413 return(ret); 7414 } 7415 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0)); 7416 } 7417 if (!xmlExpCheckCard(exp, sub)) { 7418 #ifdef DEBUG_DERIV 7419 printf("CheckCard(exp, sub) failed -> Forbid\n"); 7420 #endif 7421 return(forbiddenExp); 7422 } 7423 switch (exp->type) { 7424 case XML_EXP_EMPTY: 7425 if (sub == emptyExp) 7426 return(emptyExp); 7427 #ifdef DEBUG_DERIV 7428 printf("Empty(exp) -> Forbid\n"); 7429 #endif 7430 return(forbiddenExp); 7431 case XML_EXP_FORBID: 7432 #ifdef DEBUG_DERIV 7433 printf("Forbid(exp) -> Forbid\n"); 7434 #endif 7435 return(forbiddenExp); 7436 case XML_EXP_ATOM: 7437 if (sub->type == XML_EXP_ATOM) { 7438 /* TODO: handle wildcards */ 7439 if (exp->exp_str == sub->exp_str) { 7440 #ifdef DEBUG_DERIV 7441 printf("Atom match -> Empty\n"); 7442 #endif 7443 return(emptyExp); 7444 } 7445 #ifdef DEBUG_DERIV 7446 printf("Atom mismatch -> Forbid\n"); 7447 #endif 7448 return(forbiddenExp); 7449 } 7450 if ((sub->type == XML_EXP_COUNT) && 7451 (sub->exp_max == 1) && 7452 (sub->exp_left->type == XML_EXP_ATOM)) { 7453 /* TODO: handle wildcards */ 7454 if (exp->exp_str == sub->exp_left->exp_str) { 7455 #ifdef DEBUG_DERIV 7456 printf("Atom match -> Empty\n"); 7457 #endif 7458 return(emptyExp); 7459 } 7460 #ifdef DEBUG_DERIV 7461 printf("Atom mismatch -> Forbid\n"); 7462 #endif 7463 return(forbiddenExp); 7464 } 7465 #ifdef DEBUG_DERIV 7466 printf("Compex exp vs Atom -> Forbid\n"); 7467 #endif 7468 return(forbiddenExp); 7469 case XML_EXP_SEQ: 7470 /* try to get the sequence consumed only if possible */ 7471 if (xmlExpCheckCard(exp->exp_left, sub)) { 7472 /* See if the sequence can be consumed directly */ 7473 #ifdef DEBUG_DERIV 7474 printf("Seq trying left only\n"); 7475 #endif 7476 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7477 if ((ret != forbiddenExp) && (ret != NULL)) { 7478 #ifdef DEBUG_DERIV 7479 printf("Seq trying left only worked\n"); 7480 #endif 7481 /* 7482 * TODO: assumption here that we are determinist 7483 * i.e. we won't get to a nillable exp left 7484 * subset which could be matched by the right 7485 * part too. 7486 * e.g.: (a | b)+,(a | c) and 'a+,a' 7487 */ 7488 exp->exp_right->ref++; 7489 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7490 exp->exp_right, NULL, 0, 0)); 7491 } 7492 #ifdef DEBUG_DERIV 7493 } else { 7494 printf("Seq: left too short\n"); 7495 #endif 7496 } 7497 /* Try instead to decompose */ 7498 if (sub->type == XML_EXP_COUNT) { 7499 int min, max; 7500 7501 #ifdef DEBUG_DERIV 7502 printf("Seq: sub is a count\n"); 7503 #endif 7504 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7505 if (ret == NULL) 7506 return(NULL); 7507 if (ret != forbiddenExp) { 7508 #ifdef DEBUG_DERIV 7509 printf("Seq , Count match on left\n"); 7510 #endif 7511 if (sub->exp_max < 0) 7512 max = -1; 7513 else 7514 max = sub->exp_max -1; 7515 if (sub->exp_min > 0) 7516 min = sub->exp_min -1; 7517 else 7518 min = 0; 7519 exp->exp_right->ref++; 7520 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7521 exp->exp_right, NULL, 0, 0); 7522 if (tmp == NULL) 7523 return(NULL); 7524 7525 sub->exp_left->ref++; 7526 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7527 sub->exp_left, NULL, NULL, min, max); 7528 if (tmp2 == NULL) { 7529 xmlExpFree(ctxt, tmp); 7530 return(NULL); 7531 } 7532 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7533 xmlExpFree(ctxt, tmp); 7534 xmlExpFree(ctxt, tmp2); 7535 return(ret); 7536 } 7537 } 7538 /* we made no progress on structured operations */ 7539 break; 7540 case XML_EXP_OR: 7541 #ifdef DEBUG_DERIV 7542 printf("Or , trying both side\n"); 7543 #endif 7544 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7545 if (ret == NULL) 7546 return(NULL); 7547 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub); 7548 if (tmp == NULL) { 7549 xmlExpFree(ctxt, ret); 7550 return(NULL); 7551 } 7552 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0)); 7553 case XML_EXP_COUNT: { 7554 int min, max; 7555 7556 if (sub->type == XML_EXP_COUNT) { 7557 /* 7558 * Try to see if the loop is completely subsumed 7559 */ 7560 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7561 if (tmp == NULL) 7562 return(NULL); 7563 if (tmp == forbiddenExp) { 7564 int mult; 7565 7566 #ifdef DEBUG_DERIV 7567 printf("Count, Count inner don't subsume\n"); 7568 #endif 7569 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left, 7570 NULL, &tmp); 7571 if (mult <= 0) { 7572 #ifdef DEBUG_DERIV 7573 printf("Count, Count not multiple => forbidden\n"); 7574 #endif 7575 return(forbiddenExp); 7576 } 7577 if (sub->exp_max == -1) { 7578 max = -1; 7579 if (exp->exp_max == -1) { 7580 if (exp->exp_min <= sub->exp_min * mult) 7581 min = 0; 7582 else 7583 min = exp->exp_min - sub->exp_min * mult; 7584 } else { 7585 #ifdef DEBUG_DERIV 7586 printf("Count, Count finite can't subsume infinite\n"); 7587 #endif 7588 xmlExpFree(ctxt, tmp); 7589 return(forbiddenExp); 7590 } 7591 } else { 7592 if (exp->exp_max == -1) { 7593 #ifdef DEBUG_DERIV 7594 printf("Infinite loop consume mult finite loop\n"); 7595 #endif 7596 if (exp->exp_min > sub->exp_min * mult) { 7597 max = -1; 7598 min = exp->exp_min - sub->exp_min * mult; 7599 } else { 7600 max = -1; 7601 min = 0; 7602 } 7603 } else { 7604 if (exp->exp_max < sub->exp_max * mult) { 7605 #ifdef DEBUG_DERIV 7606 printf("loops max mult mismatch => forbidden\n"); 7607 #endif 7608 xmlExpFree(ctxt, tmp); 7609 return(forbiddenExp); 7610 } 7611 if (sub->exp_max * mult > exp->exp_min) 7612 min = 0; 7613 else 7614 min = exp->exp_min - sub->exp_max * mult; 7615 max = exp->exp_max - sub->exp_max * mult; 7616 } 7617 } 7618 } else if (!IS_NILLABLE(tmp)) { 7619 /* 7620 * TODO: loop here to try to grow if working on finite 7621 * blocks. 7622 */ 7623 #ifdef DEBUG_DERIV 7624 printf("Count, Count remain not nillable => forbidden\n"); 7625 #endif 7626 xmlExpFree(ctxt, tmp); 7627 return(forbiddenExp); 7628 } else if (sub->exp_max == -1) { 7629 if (exp->exp_max == -1) { 7630 if (exp->exp_min <= sub->exp_min) { 7631 #ifdef DEBUG_DERIV 7632 printf("Infinite loops Okay => COUNT(0,Inf)\n"); 7633 #endif 7634 max = -1; 7635 min = 0; 7636 } else { 7637 #ifdef DEBUG_DERIV 7638 printf("Infinite loops min => Count(X,Inf)\n"); 7639 #endif 7640 max = -1; 7641 min = exp->exp_min - sub->exp_min; 7642 } 7643 } else if (exp->exp_min > sub->exp_min) { 7644 #ifdef DEBUG_DERIV 7645 printf("loops min mismatch 1 => forbidden ???\n"); 7646 #endif 7647 xmlExpFree(ctxt, tmp); 7648 return(forbiddenExp); 7649 } else { 7650 max = -1; 7651 min = 0; 7652 } 7653 } else { 7654 if (exp->exp_max == -1) { 7655 #ifdef DEBUG_DERIV 7656 printf("Infinite loop consume finite loop\n"); 7657 #endif 7658 if (exp->exp_min > sub->exp_min) { 7659 max = -1; 7660 min = exp->exp_min - sub->exp_min; 7661 } else { 7662 max = -1; 7663 min = 0; 7664 } 7665 } else { 7666 if (exp->exp_max < sub->exp_max) { 7667 #ifdef DEBUG_DERIV 7668 printf("loops max mismatch => forbidden\n"); 7669 #endif 7670 xmlExpFree(ctxt, tmp); 7671 return(forbiddenExp); 7672 } 7673 if (sub->exp_max > exp->exp_min) 7674 min = 0; 7675 else 7676 min = exp->exp_min - sub->exp_max; 7677 max = exp->exp_max - sub->exp_max; 7678 } 7679 } 7680 #ifdef DEBUG_DERIV 7681 printf("loops match => SEQ(COUNT())\n"); 7682 #endif 7683 exp->exp_left->ref++; 7684 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7685 NULL, NULL, min, max); 7686 if (tmp2 == NULL) { 7687 return(NULL); 7688 } 7689 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7690 NULL, 0, 0); 7691 return(ret); 7692 } 7693 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7694 if (tmp == NULL) 7695 return(NULL); 7696 if (tmp == forbiddenExp) { 7697 #ifdef DEBUG_DERIV 7698 printf("loop mismatch => forbidden\n"); 7699 #endif 7700 return(forbiddenExp); 7701 } 7702 if (exp->exp_min > 0) 7703 min = exp->exp_min - 1; 7704 else 7705 min = 0; 7706 if (exp->exp_max < 0) 7707 max = -1; 7708 else 7709 max = exp->exp_max - 1; 7710 7711 #ifdef DEBUG_DERIV 7712 printf("loop match => SEQ(COUNT())\n"); 7713 #endif 7714 exp->exp_left->ref++; 7715 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7716 NULL, NULL, min, max); 7717 if (tmp2 == NULL) 7718 return(NULL); 7719 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7720 NULL, 0, 0); 7721 return(ret); 7722 } 7723 } 7724 7725 #ifdef DEBUG_DERIV 7726 printf("Fallback to derivative\n"); 7727 #endif 7728 if (IS_NILLABLE(sub)) { 7729 if (!(IS_NILLABLE(exp))) 7730 return(forbiddenExp); 7731 else 7732 ret = emptyExp; 7733 } else 7734 ret = NULL; 7735 /* 7736 * here the structured derivation made no progress so 7737 * we use the default token based derivation to force one more step 7738 */ 7739 if (ctxt->tabSize == 0) 7740 ctxt->tabSize = 40; 7741 7742 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize * 7743 sizeof(const xmlChar *)); 7744 if (tab == NULL) { 7745 return(NULL); 7746 } 7747 7748 /* 7749 * collect all the strings accepted by the subexpression on input 7750 */ 7751 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7752 while (len < 0) { 7753 const xmlChar **temp; 7754 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 * 7755 sizeof(const xmlChar *)); 7756 if (temp == NULL) { 7757 xmlFree((xmlChar **) tab); 7758 return(NULL); 7759 } 7760 tab = temp; 7761 ctxt->tabSize *= 2; 7762 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7763 } 7764 for (i = 0;i < len;i++) { 7765 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]); 7766 if ((tmp == NULL) || (tmp == forbiddenExp)) { 7767 xmlExpFree(ctxt, ret); 7768 xmlFree((xmlChar **) tab); 7769 return(tmp); 7770 } 7771 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]); 7772 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) { 7773 xmlExpFree(ctxt, tmp); 7774 xmlExpFree(ctxt, ret); 7775 xmlFree((xmlChar **) tab); 7776 return(tmp); 7777 } 7778 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7779 xmlExpFree(ctxt, tmp); 7780 xmlExpFree(ctxt, tmp2); 7781 7782 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) { 7783 xmlExpFree(ctxt, ret); 7784 xmlFree((xmlChar **) tab); 7785 return(tmp3); 7786 } 7787 7788 if (ret == NULL) 7789 ret = tmp3; 7790 else { 7791 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0); 7792 if (ret == NULL) { 7793 xmlFree((xmlChar **) tab); 7794 return(NULL); 7795 } 7796 } 7797 } 7798 xmlFree((xmlChar **) tab); 7799 return(ret); 7800 } 7801 7802 /** 7803 * xmlExpExpDerive: 7804 * @ctxt: the expressions context 7805 * @exp: the englobing expression 7806 * @sub: the subexpression 7807 * 7808 * Evaluates the expression resulting from @exp consuming a sub expression @sub 7809 * Based on algebraic derivation and sometimes direct Brzozowski derivation 7810 * it usually tatkes less than linear time and can handle expressions generating 7811 * infinite languages. 7812 * 7813 * Returns the resulting expression or NULL in case of internal error, the 7814 * result must be freed 7815 */ 7816 xmlExpNodePtr 7817 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7818 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7819 return(NULL); 7820 7821 /* 7822 * O(1) speedups 7823 */ 7824 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7825 #ifdef DEBUG_DERIV 7826 printf("Sub nillable and not exp : can't subsume\n"); 7827 #endif 7828 return(forbiddenExp); 7829 } 7830 if (xmlExpCheckCard(exp, sub) == 0) { 7831 #ifdef DEBUG_DERIV 7832 printf("sub generate longuer sequances than exp : can't subsume\n"); 7833 #endif 7834 return(forbiddenExp); 7835 } 7836 return(xmlExpExpDeriveInt(ctxt, exp, sub)); 7837 } 7838 7839 /** 7840 * xmlExpSubsume: 7841 * @ctxt: the expressions context 7842 * @exp: the englobing expression 7843 * @sub: the subexpression 7844 * 7845 * Check whether @exp accepts all the languages accexpted by @sub 7846 * the input being a subexpression. 7847 * 7848 * Returns 1 if true 0 if false and -1 in case of failure. 7849 */ 7850 int 7851 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7852 xmlExpNodePtr tmp; 7853 7854 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7855 return(-1); 7856 7857 /* 7858 * TODO: speedup by checking the language of sub is a subset of the 7859 * language of exp 7860 */ 7861 /* 7862 * O(1) speedups 7863 */ 7864 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7865 #ifdef DEBUG_DERIV 7866 printf("Sub nillable and not exp : can't subsume\n"); 7867 #endif 7868 return(0); 7869 } 7870 if (xmlExpCheckCard(exp, sub) == 0) { 7871 #ifdef DEBUG_DERIV 7872 printf("sub generate longuer sequances than exp : can't subsume\n"); 7873 #endif 7874 return(0); 7875 } 7876 tmp = xmlExpExpDeriveInt(ctxt, exp, sub); 7877 #ifdef DEBUG_DERIV 7878 printf("Result derivation :\n"); 7879 PRINT_EXP(tmp); 7880 #endif 7881 if (tmp == NULL) 7882 return(-1); 7883 if (tmp == forbiddenExp) 7884 return(0); 7885 if (tmp == emptyExp) 7886 return(1); 7887 if ((tmp != NULL) && (IS_NILLABLE(tmp))) { 7888 xmlExpFree(ctxt, tmp); 7889 return(1); 7890 } 7891 xmlExpFree(ctxt, tmp); 7892 return(0); 7893 } 7894 7895 /************************************************************************ 7896 * * 7897 * Parsing expression * 7898 * * 7899 ************************************************************************/ 7900 7901 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt); 7902 7903 #undef CUR 7904 #define CUR (*ctxt->cur) 7905 #undef NEXT 7906 #define NEXT ctxt->cur++; 7907 #undef IS_BLANK 7908 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) 7909 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++; 7910 7911 static int 7912 xmlExpParseNumber(xmlExpCtxtPtr ctxt) { 7913 int ret = 0; 7914 7915 SKIP_BLANKS 7916 if (CUR == '*') { 7917 NEXT 7918 return(-1); 7919 } 7920 if ((CUR < '0') || (CUR > '9')) 7921 return(-1); 7922 while ((CUR >= '0') && (CUR <= '9')) { 7923 ret = ret * 10 + (CUR - '0'); 7924 NEXT 7925 } 7926 return(ret); 7927 } 7928 7929 static xmlExpNodePtr 7930 xmlExpParseOr(xmlExpCtxtPtr ctxt) { 7931 const char *base; 7932 xmlExpNodePtr ret; 7933 const xmlChar *val; 7934 7935 SKIP_BLANKS 7936 base = ctxt->cur; 7937 if (*ctxt->cur == '(') { 7938 NEXT 7939 ret = xmlExpParseExpr(ctxt); 7940 SKIP_BLANKS 7941 if (*ctxt->cur != ')') { 7942 fprintf(stderr, "unbalanced '(' : %s\n", base); 7943 xmlExpFree(ctxt, ret); 7944 return(NULL); 7945 } 7946 NEXT; 7947 SKIP_BLANKS 7948 goto parse_quantifier; 7949 } 7950 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') && 7951 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') && 7952 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}')) 7953 NEXT; 7954 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base); 7955 if (val == NULL) 7956 return(NULL); 7957 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0); 7958 if (ret == NULL) 7959 return(NULL); 7960 SKIP_BLANKS 7961 parse_quantifier: 7962 if (CUR == '{') { 7963 int min, max; 7964 7965 NEXT 7966 min = xmlExpParseNumber(ctxt); 7967 if (min < 0) { 7968 xmlExpFree(ctxt, ret); 7969 return(NULL); 7970 } 7971 SKIP_BLANKS 7972 if (CUR == ',') { 7973 NEXT 7974 max = xmlExpParseNumber(ctxt); 7975 SKIP_BLANKS 7976 } else 7977 max = min; 7978 if (CUR != '}') { 7979 xmlExpFree(ctxt, ret); 7980 return(NULL); 7981 } 7982 NEXT 7983 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7984 min, max); 7985 SKIP_BLANKS 7986 } else if (CUR == '?') { 7987 NEXT 7988 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7989 0, 1); 7990 SKIP_BLANKS 7991 } else if (CUR == '+') { 7992 NEXT 7993 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7994 1, -1); 7995 SKIP_BLANKS 7996 } else if (CUR == '*') { 7997 NEXT 7998 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7999 0, -1); 8000 SKIP_BLANKS 8001 } 8002 return(ret); 8003 } 8004 8005 8006 static xmlExpNodePtr 8007 xmlExpParseSeq(xmlExpCtxtPtr ctxt) { 8008 xmlExpNodePtr ret, right; 8009 8010 ret = xmlExpParseOr(ctxt); 8011 SKIP_BLANKS 8012 while (CUR == '|') { 8013 NEXT 8014 right = xmlExpParseOr(ctxt); 8015 if (right == NULL) { 8016 xmlExpFree(ctxt, ret); 8017 return(NULL); 8018 } 8019 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0); 8020 if (ret == NULL) 8021 return(NULL); 8022 } 8023 return(ret); 8024 } 8025 8026 static xmlExpNodePtr 8027 xmlExpParseExpr(xmlExpCtxtPtr ctxt) { 8028 xmlExpNodePtr ret, right; 8029 8030 ret = xmlExpParseSeq(ctxt); 8031 SKIP_BLANKS 8032 while (CUR == ',') { 8033 NEXT 8034 right = xmlExpParseSeq(ctxt); 8035 if (right == NULL) { 8036 xmlExpFree(ctxt, ret); 8037 return(NULL); 8038 } 8039 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0); 8040 if (ret == NULL) 8041 return(NULL); 8042 } 8043 return(ret); 8044 } 8045 8046 /** 8047 * xmlExpParse: 8048 * @ctxt: the expressions context 8049 * @expr: the 0 terminated string 8050 * 8051 * Minimal parser for regexps, it understand the following constructs 8052 * - string terminals 8053 * - choice operator | 8054 * - sequence operator , 8055 * - subexpressions (...) 8056 * - usual cardinality operators + * and ? 8057 * - finite sequences { min, max } 8058 * - infinite sequences { min, * } 8059 * There is minimal checkings made especially no checking on strings values 8060 * 8061 * Returns a new expression or NULL in case of failure 8062 */ 8063 xmlExpNodePtr 8064 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) { 8065 xmlExpNodePtr ret; 8066 8067 ctxt->expr = expr; 8068 ctxt->cur = expr; 8069 8070 ret = xmlExpParseExpr(ctxt); 8071 SKIP_BLANKS 8072 if (*ctxt->cur != 0) { 8073 xmlExpFree(ctxt, ret); 8074 return(NULL); 8075 } 8076 return(ret); 8077 } 8078 8079 static void 8080 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) { 8081 xmlExpNodePtr c; 8082 8083 if (expr == NULL) return; 8084 if (glob) xmlBufferWriteChar(buf, "("); 8085 switch (expr->type) { 8086 case XML_EXP_EMPTY: 8087 xmlBufferWriteChar(buf, "empty"); 8088 break; 8089 case XML_EXP_FORBID: 8090 xmlBufferWriteChar(buf, "forbidden"); 8091 break; 8092 case XML_EXP_ATOM: 8093 xmlBufferWriteCHAR(buf, expr->exp_str); 8094 break; 8095 case XML_EXP_SEQ: 8096 c = expr->exp_left; 8097 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8098 xmlExpDumpInt(buf, c, 1); 8099 else 8100 xmlExpDumpInt(buf, c, 0); 8101 xmlBufferWriteChar(buf, " , "); 8102 c = expr->exp_right; 8103 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8104 xmlExpDumpInt(buf, c, 1); 8105 else 8106 xmlExpDumpInt(buf, c, 0); 8107 break; 8108 case XML_EXP_OR: 8109 c = expr->exp_left; 8110 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8111 xmlExpDumpInt(buf, c, 1); 8112 else 8113 xmlExpDumpInt(buf, c, 0); 8114 xmlBufferWriteChar(buf, " | "); 8115 c = expr->exp_right; 8116 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8117 xmlExpDumpInt(buf, c, 1); 8118 else 8119 xmlExpDumpInt(buf, c, 0); 8120 break; 8121 case XML_EXP_COUNT: { 8122 char rep[40]; 8123 8124 c = expr->exp_left; 8125 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 8126 xmlExpDumpInt(buf, c, 1); 8127 else 8128 xmlExpDumpInt(buf, c, 0); 8129 if ((expr->exp_min == 0) && (expr->exp_max == 1)) { 8130 rep[0] = '?'; 8131 rep[1] = 0; 8132 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) { 8133 rep[0] = '*'; 8134 rep[1] = 0; 8135 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) { 8136 rep[0] = '+'; 8137 rep[1] = 0; 8138 } else if (expr->exp_max == expr->exp_min) { 8139 snprintf(rep, 39, "{%d}", expr->exp_min); 8140 } else if (expr->exp_max < 0) { 8141 snprintf(rep, 39, "{%d,inf}", expr->exp_min); 8142 } else { 8143 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max); 8144 } 8145 rep[39] = 0; 8146 xmlBufferWriteChar(buf, rep); 8147 break; 8148 } 8149 default: 8150 fprintf(stderr, "Error in tree\n"); 8151 } 8152 if (glob) 8153 xmlBufferWriteChar(buf, ")"); 8154 } 8155 /** 8156 * xmlExpDump: 8157 * @buf: a buffer to receive the output 8158 * @expr: the compiled expression 8159 * 8160 * Serialize the expression as compiled to the buffer 8161 */ 8162 void 8163 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) { 8164 if ((buf == NULL) || (expr == NULL)) 8165 return; 8166 xmlExpDumpInt(buf, expr, 0); 8167 } 8168 8169 /** 8170 * xmlExpMaxToken: 8171 * @expr: a compiled expression 8172 * 8173 * Indicate the maximum number of input a expression can accept 8174 * 8175 * Returns the maximum length or -1 in case of error 8176 */ 8177 int 8178 xmlExpMaxToken(xmlExpNodePtr expr) { 8179 if (expr == NULL) 8180 return(-1); 8181 return(expr->c_max); 8182 } 8183 8184 /** 8185 * xmlExpCtxtNbNodes: 8186 * @ctxt: an expression context 8187 * 8188 * Debugging facility provides the number of allocated nodes at a that point 8189 * 8190 * Returns the number of nodes in use or -1 in case of error 8191 */ 8192 int 8193 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) { 8194 if (ctxt == NULL) 8195 return(-1); 8196 return(ctxt->nb_nodes); 8197 } 8198 8199 /** 8200 * xmlExpCtxtNbCons: 8201 * @ctxt: an expression context 8202 * 8203 * Debugging facility provides the number of allocated nodes over lifetime 8204 * 8205 * Returns the number of nodes ever allocated or -1 in case of error 8206 */ 8207 int 8208 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) { 8209 if (ctxt == NULL) 8210 return(-1); 8211 return(ctxt->nb_cons); 8212 } 8213 8214 #endif /* LIBXML_EXPR_ENABLED */ 8215 #define bottom_xmlregexp 8216 #include "elfgcchack.h" 8217 #endif /* LIBXML_REGEXP_ENABLED */