Lexer.java
1 // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML 2 // http://sourceforge.org/projects/htmlparser 3 // Copyright (C) 2004 Derrick Oswald 4 // 5 // Revision Control Information 6 // 7 // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v $ 8 // $Author: derrickoswald $ 9 // $Date: 2006/03/19 21:26:32 $ 10 // $Revision: 1.44 $ 11 // 12 // This library is free software; you can redistribute it and/or 13 // modify it under the terms of the GNU Lesser General Public 14 // License as published by the Free Software Foundation; either 15 // version 2.1 of the License, or (at your option) any later version. 16 // 17 // This library is distributed in the hope that it will be useful, 18 // but WITHOUT ANY WARRANTY; without even the implied warranty of 19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 // Lesser General Public License for more details. 21 // 22 // You should have received a copy of the GNU Lesser General Public 23 // License along with this library; if not, write to the Free Software 24 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 // 26 27 package org.htmlparser.lexer; 28 29 import java.io.Serializable; 30 import java.net.MalformedURLException; 31 import java.net.URLConnection; 32 import java.util.Vector; 33 34 import org.htmlparser.Node; 35 import org.htmlparser.NodeFactory; 36 import org.htmlparser.Remark; 37 import org.htmlparser.Text; 38 import org.htmlparser.Tag; 39 import org.htmlparser.http.ConnectionManager; 40 import org.htmlparser.nodes.RemarkNode; 41 import org.htmlparser.nodes.TextNode; 42 import org.htmlparser.nodes.TagNode; 43 import org.htmlparser.util.ParserException; 44 45 /** 46 * This class parses the HTML stream into nodes. 
* There are three major types of nodes (lexemes):
 * <ul>
 * <li>Remark</li>
 * <li>Text</li>
 * <li>Tag</li>
 * </ul>
 * Each time <code>nextNode()</code> is called, another node is returned until
 * the stream is exhausted, and <code>null</code> is returned.
 * <p>
 * The lexer acts as its own default {@link NodeFactory}; a different factory
 * can be installed with <code>setNodeFactory()</code>.
 */
public class Lexer
    implements
        Serializable,
        NodeFactory
{
    /**
     * The page lexemes are retrieved from.
     * Assigned through <code>setPage()</code>, which rejects <code>null</code>.
     */
    protected Page mPage;

    /**
     * The current position on the page.
     * Assigned through <code>setCursor()</code>, which rejects <code>null</code>.
     */
    protected Cursor mCursor;

    /**
     * The factory for new nodes.
     * The constructors install the lexer itself as the factory.
     */
    protected NodeFactory mFactory;

    /**
     * Line number to trigger on.
     * This is tested on each <code>nextNode()</code> call, as a debugging aid.
     * Alter this value and set a breakpoint on the guarded statement.
     * Remember, these line numbers are zero based, while most editors are
     * one based.
     * The value <code>-1</code> disables the check.
     * Note that this field is static and therefore shared by all lexers.
     * @see #nextNode
     */
    protected static int mDebugLineTrigger = -1;

    /**
     * Creates a new instance of a Lexer over an empty page.
     */
    public Lexer ()
    {
        this (new Page (""));
    }

    /**
     * Creates a new instance of a Lexer.
     * The cursor is placed at offset zero of the page and the lexer is
     * installed as its own node factory.
     * @param page The page with HTML text.
     */
    public Lexer (Page page)
    {
        setPage (page);
        setCursor (new Cursor (page, 0));
        setNodeFactory (this);
    }

    /**
     * Creates a new instance of a Lexer.
     * @param text The text to parse.
     */
    public Lexer (String text)
    {
        this (new Page (text));
    }

    /**
     * Creates a new instance of a Lexer.
     * @param connection The url to parse.
     * @exception ParserException If an error occurs opening the connection.
     */
    public Lexer (URLConnection connection)
        throws
            ParserException
    {
        this (new Page (connection));
    }

    /**
     * Reset the lexer to start parsing from the beginning again.
     * The underlying components are reset such that the next call to
     * <code>nextNode()</code> will return the first lexeme on the page.
130 */ 131 public void reset () 132 { 133 getPage ().reset (); 134 setCursor (new Cursor (getPage (), 0)); 135 } 136 137 /** 138 * Get the page this lexer is working on. 139 * @return The page that nodes are being read from. 140 */ 141 public Page getPage () 142 { 143 return (mPage); 144 } 145 146 /** 147 * Set the page this lexer is working on. 148 * @param page The page that nodes will be read from. 149 */ 150 public void setPage (Page page) 151 { 152 if (null == page) 153 throw new IllegalArgumentException ("page cannot be null"); 154 // todo: sanity checks 155 mPage = page; 156 } 157 158 /** 159 * Get the current scanning position. 160 * @return The lexer's cursor position. 161 */ 162 public Cursor getCursor () 163 { 164 return (mCursor); 165 } 166 167 /** 168 * Set the current scanning position. 169 * @param cursor The lexer's new cursor position. 170 */ 171 public void setCursor (Cursor cursor) 172 { 173 if (null == cursor) 174 throw new IllegalArgumentException ("cursor cannot be null"); 175 // todo: sanity checks 176 mCursor = cursor; 177 } 178 179 /** 180 * Get the current node factory. 181 * @return The lexer's node factory. 182 */ 183 public NodeFactory getNodeFactory () 184 { 185 return (mFactory); 186 } 187 188 /** 189 * Set the current node factory. 190 * @param factory The node factory to be used by the lexer. 191 */ 192 public void setNodeFactory (NodeFactory factory) 193 { 194 if (null == factory) 195 throw new IllegalArgumentException ("node factory cannot be null"); 196 mFactory = factory; 197 } 198 199 /** 200 * Get the current cursor position. 201 * @return The current character offset into the source. 202 */ 203 public int getPosition () 204 { 205 return (getCursor ().getPosition ()); 206 } 207 208 /** 209 * Set the current cursor position. 210 * @param position The new character offset into the source. 
211 */ 212 public void setPosition (int position) 213 { 214 // todo: sanity checks 215 getCursor ().setPosition (position); 216 } 217 218 /** 219 * Get the current line number. 220 * @return The line number the lexer's working on. 221 */ 222 public int getCurrentLineNumber () 223 { 224 return (getPage ().row (getCursor ())); 225 } 226 227 /** 228 * Get the current line. 229 * @return The string the lexer's working on. 230 */ 231 public String getCurrentLine () 232 { 233 return (getPage ().getLine (getCursor ())); 234 } 235 236 /** 237 * Get the next node from the source. 238 * @return A Remark, Text or Tag, or <code>null</code> if no 239 * more lexemes are present. 240 * @exception ParserException If there is a problem with the 241 * underlying page. 242 */ 243 public Node nextNode () 244 throws 245 ParserException 246 { 247 return nextNode (false); 248 } 249 250 /** 251 * Get the next node from the source. 252 * @param quotesmart If <code>true</code>, strings ignore quoted contents. 253 * @return A Remark, Text or Tag, or <code>null</code> if no 254 * more lexemes are present. 255 * @exception ParserException If there is a problem with the 256 * underlying page. 257 */ 258 public Node nextNode (boolean quotesmart) 259 throws 260 ParserException 261 { 262 int start; 263 char ch; 264 Node ret; 265 266 // debugging suppport 267 if (-1 != mDebugLineTrigger) 268 { 269 Page page = getPage (); 270 int lineno = page.row (mCursor); 271 if (mDebugLineTrigger < lineno) 272 mDebugLineTrigger = lineno + 1; // trigger on next line too 273 } 274 start = mCursor.getPosition (); 275 ch = mPage.getCharacter (mCursor); 276 switch (ch) 277 { 278 case Page.EOF: 279 ret = null; 280 break; 281 case '<': 282 ch = mPage.getCharacter (mCursor); 283 if (Page.EOF == ch) 284 ret = makeString (start, mCursor.getPosition ()); 285 else if ('%' == ch) 286 { 287 mCursor.retreat (); 288 ret = parseJsp (start); 289 } 290 else if ('?' 
== ch) 291 { 292 mCursor.retreat (); 293 ret = parsePI (start); 294 } 295 else if ('/' == ch || '%' == ch || Character.isLetter (ch)) 296 { 297 mCursor.retreat (); 298 ret = parseTag (start); 299 } 300 else if ('!' == ch) 301 { 302 ch = mPage.getCharacter (mCursor); 303 if (Page.EOF == ch) 304 ret = makeString (start, mCursor.getPosition ()); 305 else 306 { 307 if ('>' == ch) // handle <!> 308 ret = makeRemark (start, mCursor.getPosition ()); 309 else 310 { 311 mCursor.retreat (); // remark/tag need this char 312 if ('-' == ch) 313 ret = parseRemark (start, quotesmart); 314 else 315 { 316 mCursor.retreat (); // tag needs prior one too 317 ret = parseTag (start); 318 } 319 } 320 } 321 } 322 else 323 ret = parseString (start, quotesmart); 324 break; 325 default: 326 mCursor.retreat (); // string needs to see leading foreslash 327 ret = parseString (start, quotesmart); 328 break; 329 } 330 331 return (ret); 332 } 333 334 /** 335 * Advance the cursor through a JIS escape sequence. 336 * @param cursor A cursor positioned within the escape sequence. 337 * @exception ParserException If a problem occurs reading from the source. 338 */ 339 protected void scanJIS (Cursor cursor) 340 throws 341 ParserException 342 { 343 boolean done; 344 char ch; 345 int state; 346 347 done = false; 348 state = 0; 349 while (!done) 350 { 351 ch = mPage.getCharacter (cursor); 352 if (Page.EOF == ch) 353 done = true; 354 else 355 switch (state) 356 { 357 case 0: 358 if (0x1b == ch) // escape 359 state = 1; 360 break; 361 case 1: 362 if ('(' == ch) 363 state = 2; 364 else 365 state = 0; 366 break; 367 case 2: 368 if ('J' == ch) 369 done = true; 370 else 371 state = 0; 372 break; 373 default: 374 throw new IllegalStateException ("state " + state); 375 } 376 } 377 } 378 379 /** 380 * Parse a string node. 381 * Scan characters until "</", "<%", "<!" or < followed by a 382 * letter is encountered, or the input stream is exhausted, in which 383 * case <code>null</code> is returned. 
384 * @param start The position at which to start scanning. 385 * @param quotesmart If <code>true</code>, strings ignore quoted contents. 386 * @return The parsed node. 387 * @exception ParserException If a problem occurs reading from the source. 388 */ 389 protected Node parseString (int start, boolean quotesmart) 390 throws 391 ParserException 392 { 393 boolean done; 394 char ch; 395 char quote; 396 397 done = false; 398 quote = 0; 399 while (!done) 400 { 401 ch = mPage.getCharacter (mCursor); 402 if (Page.EOF == ch) 403 done = true; 404 else if (0x1b == ch) // escape 405 { 406 ch = mPage.getCharacter (mCursor); 407 if (Page.EOF == ch) 408 done = true; 409 else if ('$' == ch) 410 { 411 ch = mPage.getCharacter (mCursor); 412 if (Page.EOF == ch) 413 done = true; 414 else if ('B' == ch) 415 scanJIS (mCursor); 416 else 417 { 418 mCursor.retreat (); 419 mCursor.retreat (); 420 } 421 } 422 else 423 mCursor.retreat (); 424 } 425 else if (quotesmart && (0 == quote) 426 && (('\'' == ch) || ('"' == ch))) 427 quote = ch; // enter quoted state 428 // patch from Gernot Fricke to handle escaped closing quote 429 else if (quotesmart && (0 != quote) && ('\\' == ch)) 430 { 431 ch = mPage.getCharacter (mCursor); // try to consume escape 432 if ((Page.EOF != ch) 433 && ('\\' != ch) // escaped backslash 434 && (ch != quote)) // escaped quote character 435 // ( reflects ["] or ['] whichever opened the quotation) 436 mCursor.retreat(); // unconsume char if char not an escape 437 } 438 else if (quotesmart && (ch == quote)) 439 quote = 0; // exit quoted state 440 else if (quotesmart && (0 == quote) && (ch == '/')) 441 { 442 // handle multiline and double slash comments (with a quote) 443 // in script like: 444 // I can't handle single quotations. 
445 ch = mPage.getCharacter (mCursor); 446 if (Page.EOF == ch) 447 done = true; 448 else if ('/' == ch) 449 { 450 do 451 ch = mPage.getCharacter (mCursor); 452 while ((Page.EOF != ch) && ('\n' != ch)); 453 } 454 else if ('*' == ch) 455 { 456 do 457 { 458 do 459 ch = mPage.getCharacter (mCursor); 460 while ((Page.EOF != ch) && ('*' != ch)); 461 ch = mPage.getCharacter (mCursor); 462 if (ch == '*') 463 mCursor.retreat (); 464 } 465 while ((Page.EOF != ch) && ('/' != ch)); 466 } 467 else 468 mCursor.retreat (); 469 } 470 else if ((0 == quote) && ('<' == ch)) 471 { 472 ch = mPage.getCharacter (mCursor); 473 if (Page.EOF == ch) 474 done = true; 475 // the order of these tests might be optimized for speed: 476 else if ('/' == ch || Character.isLetter (ch) 477 || '!' == ch || '%' == ch || '?' == ch) 478 { 479 done = true; 480 mCursor.retreat (); 481 mCursor.retreat (); 482 } 483 else 484 { 485 // it's not a tag, so keep going, but check for quotes 486 mCursor.retreat (); 487 } 488 } 489 } 490 491 return (makeString (start, mCursor.getPosition ())); 492 } 493 494 /** 495 * Create a string node based on the current cursor and the one provided. 496 * @param start The starting point of the node. 497 * @param end The ending point of the node. 498 * @exception ParserException If the nodefactory creation of the text 499 * node fails. 500 * @return The new Text node. 501 */ 502 protected Node makeString (int start, int end) 503 throws 504 ParserException 505 { 506 int length; 507 Node ret; 508 509 length = end - start; 510 if (0 != length) 511 // got some characters 512 ret = getNodeFactory ().createStringNode ( 513 this.getPage (), start, end); 514 else 515 ret = null; 516 517 return (ret); 518 } 519 520 /** 521 * Generate a whitespace 'attribute', 522 * @param attributes The list so far. 523 * @param bookmarks The array of positions. 
524 */ 525 private void whitespace (Vector attributes, int[] bookmarks) 526 { 527 if (bookmarks[1] > bookmarks[0]) 528 attributes.addElement (new PageAttribute ( 529 mPage, -1, -1, bookmarks[0], bookmarks[1], (char)0)); 530 } 531 532 /** 533 * Generate a standalone attribute -- font. 534 * @param attributes The list so far. 535 * @param bookmarks The array of positions. 536 */ 537 private void standalone (Vector attributes, int[] bookmarks) 538 { 539 attributes.addElement (new PageAttribute ( 540 mPage, bookmarks[1], bookmarks[2], -1, -1, (char)0)); 541 } 542 543 /** 544 * Generate an empty attribute -- color=. 545 * @param attributes The list so far. 546 * @param bookmarks The array of positions. 547 */ 548 private void empty (Vector attributes, int[] bookmarks) 549 { 550 attributes.addElement (new PageAttribute ( 551 mPage, bookmarks[1], bookmarks[2], bookmarks[2] + 1, -1, (char)0)); 552 } 553 554 /** 555 * Generate an unquoted attribute -- size=1. 556 * @param attributes The list so far. 557 * @param bookmarks The array of positions. 558 */ 559 private void naked (Vector attributes, int[] bookmarks) 560 { 561 attributes.addElement (new PageAttribute ( 562 mPage, bookmarks[1], bookmarks[2], bookmarks[3], 563 bookmarks[4], (char)0)); 564 } 565 566 /** 567 * Generate an single quoted attribute -- width='100%'. 568 * @param attributes The list so far. 569 * @param bookmarks The array of positions. 570 */ 571 private void single_quote (Vector attributes, int[] bookmarks) 572 { 573 attributes.addElement (new PageAttribute ( 574 mPage, bookmarks[1], bookmarks[2], bookmarks[4] + 1, 575 bookmarks[5], '\'')); 576 } 577 578 /** 579 * Generate an double quoted attribute -- CONTENT="Test Development". 580 * @param attributes The list so far. 581 * @param bookmarks The array of positions. 
582 */ 583 private void double_quote (Vector attributes, int[] bookmarks) 584 { 585 attributes.addElement (new PageAttribute ( 586 mPage, bookmarks[1], bookmarks[2], bookmarks[5] + 1, 587 bookmarks[6], '"')); 588 } 589 590 /** 591 * Parse a tag. 592 * Parse the name and attributes from a start tag.<p> 593 * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2"> 594 * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a> 595 * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2<p> 596 * <cite> 597 * 3.2.2 Attributes<p> 598 * Elements may have associated properties, called attributes, which may 599 * have values (by default, or set by authors or scripts). Attribute/value 600 * pairs appear before the final ">" of an element's start tag. Any number 601 * of (legal) attribute value pairs, separated by spaces, may appear in an 602 * element's start tag. They may appear in any order.<p> 603 * In this example, the id attribute is set for an H1 element: 604 * <code> 605 * <H1 id="section1"> 606 * </code> 607 * This is an identified heading thanks to the id attribute 608 * <code> 609 * </H1> 610 * </code> 611 * By default, SGML requires that all attribute values be delimited using 612 * either double quotation marks (ASCII decimal 34) or single quotation 613 * marks (ASCII decimal 39). Single quote marks can be included within the 614 * attribute value when the value is delimited by double quote marks, and 615 * vice versa. Authors may also use numeric character references to 616 * represent double quotes (&#34;) and single quotes (&#39;). 617 * For doublequotes authors can also use the character entity reference 618 * &quot;.<p> 619 * In certain cases, authors may specify the value of an attribute without 620 * any quotation marks. The attribute value may only contain letters 621 * (a-z and A-Z), digits (0-9), hyphens (ASCII decimal 45), 622 * periods (ASCII decimal 46), underscores (ASCII decimal 95), 623 * and colons (ASCII decimal 58). 
We recommend using quotation marks even 624 * when it is possible to eliminate them.<p> 625 * Attribute names are always case-insensitive.<p> 626 * Attribute values are generally case-insensitive. The definition of each 627 * attribute in the reference manual indicates whether its value is 628 * case-insensitive.<p> 629 * All the attributes defined by this specification are listed in the 630 * attribute index.<p> 631 * </cite> 632 * <p> 633 * This method uses a state machine with the following states: 634 * <ol> 635 * <li>state 0 - outside of any attribute</li> 636 * <li>state 1 - within attributre name</li> 637 * <li>state 2 - equals hit</li> 638 * <li>state 3 - within naked attribute value.</li> 639 * <li>state 4 - within single quoted attribute value</li> 640 * <li>state 5 - within double quoted attribute value</li> 641 * <li>state 6 - whitespaces after attribute name could lead to state 2 (=)or state 0</li> 642 * </ol> 643 * <p> 644 * The starting point for the various components is stored in an array 645 * of integers that match the initiation point for the states one-for-one, 646 * i.e. bookmarks[0] is where state 0 began, bookmarks[1] is where state 1 647 * began, etc. 648 * Attributes are stored in a <code>Vector</code> having 649 * one slot for each whitespace or attribute/value pair. 650 * The first slot is for attribute name (kind of like a standalone attribute). 651 * @param start The position at which to start scanning. 652 * @return The parsed tag. 653 * @exception ParserException If a problem occurs reading from the source. 
*/
    protected Node parseTag (int start)
        throws
            ParserException
    {
        boolean done;
        char ch;
        int state;
        int[] bookmarks;
        Vector attributes;

        done = false;
        attributes = new Vector ();
        state = 0;
        // one slot per state plus one, since bookmarks[state + 1] is written
        // on every pass and the highest state is 6
        bookmarks = new int[8];
        bookmarks[0] = mCursor.getPosition ();
        while (!done)
        {
            // remember where the character about to be read sits; as long as
            // the state does not change this slot tracks the end of the
            // component being scanned
            bookmarks[state + 1] = mCursor.getPosition ();
            ch = mPage.getCharacter (mCursor);
            switch (state)
            {
                case 0: // outside of any attribute
                    if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch))
                    {
                        if ('<' == ch)
                        {
                            // don't consume the opening angle
                            mCursor.retreat ();
                            bookmarks[state + 1] = mCursor.getPosition ();
                        }
                        // flush any whitespace collected so far and finish
                        whitespace (attributes, bookmarks);
                        done = true;
                    }
                    else if (!Character.isWhitespace (ch))
                    {
                        // first character of an attribute name
                        whitespace (attributes, bookmarks);
                        state = 1;
                    }
                    break;
                case 1: // within attribute name
                    if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch))
                    {
                        if ('<' == ch)
                        {
                            // don't consume the opening angle
                            mCursor.retreat ();
                            bookmarks[state + 1] = mCursor.getPosition ();
                        }
                        standalone (attributes, bookmarks);
                        done = true;
                    }
                    else if (Character.isWhitespace (ch))
                    {
                        // whitespaces might be followed by next attribute or an equal sign
                        // see Bug #891058 Bug in lexer.
                        bookmarks[6] = bookmarks[2]; // setting the bookmark[0] is done in state 6 if applicable
                        state = 6;
                    }
                    else if ('=' == ch)
                        state = 2;
                    break;
                case 2: // equals hit
                    if ((Page.EOF == ch) || ('>' == ch))
                    {
                        empty (attributes, bookmarks);
                        done = true;
                    }
                    else if ('\'' == ch)
                    {
                        state = 4;
                        // state 4 starts at the opening quote
                        bookmarks[4] = bookmarks[3];
                    }
                    else if ('"' == ch)
                    {
                        state = 5;
                        // state 5 starts at the opening quote
                        bookmarks[5] = bookmarks[3];
                    }
                    else if (Character.isWhitespace (ch))
                    {
                        // collect white spaces after "=" into the assignment string;
                        // do nothing
                        // see Bug #891058 Bug in lexer.
                    }
                    else
                        state = 3;
                    break;
                case 3: // within naked attribute value
                    if ((Page.EOF == ch) || ('>' == ch))
                    {
                        naked (attributes, bookmarks);
                        done = true;
                    }
                    else if (Character.isWhitespace (ch))
                    {
                        naked (attributes, bookmarks);
                        // the whitespace that ended the value starts state 0
                        bookmarks[0] = bookmarks[4];
                        state = 0;
                    }
                    break;
                case 4: // within single quoted attribute value
                    if (Page.EOF == ch)
                    {
                        single_quote (attributes, bookmarks);
                        done = true; // complain?
                    }
                    else if ('\'' == ch)
                    {
                        single_quote (attributes, bookmarks);
                        // state 0 resumes just past the closing quote
                        bookmarks[0] = bookmarks[5] + 1;
                        state = 0;
                    }
                    break;
                case 5: // within double quoted attribute value
                    if (Page.EOF == ch)
                    {
                        double_quote (attributes, bookmarks);
                        done = true; // complain?
                    }
                    else if ('"' == ch)
                    {
                        double_quote (attributes, bookmarks);
                        // state 0 resumes just past the closing quote
                        bookmarks[0] = bookmarks[6] + 1;
                        state = 0;
                    }
                    break;
                // patch for lexer state correction by
                // Gernot Fricke
                // See Bug # 891058 Bug in lexer.
                case 6: // undecided for state 0 or 2
                        // we have read white spaces after an attribute name
                    if (Page.EOF == ch)
                    {
                        // same as last else clause
                        standalone (attributes, bookmarks);
                        bookmarks[0]=bookmarks[6];
                        mCursor.retreat();
                        state=0;
                    }
                    else if (Character.isWhitespace (ch))
                    {
                        // proceed
                    }
                    else if ('=' == ch) // yepp. the white spaces belonged to the equal.
                    {
                        bookmarks[2] = bookmarks[6];
                        bookmarks[3] = bookmarks[7];
                        state=2;
                    }
                    else
                    {
                        // white spaces were not ended by equal
                        // meaning the attribute was a stand alone attribute
                        // now: create the stand alone attribute and rewind
                        // the cursor to the end of the white spaces
                        // and restart scanning as whitespace attribute.
                        standalone (attributes, bookmarks);
                        bookmarks[0]=bookmarks[6];
                        mCursor.retreat();
                        state=0;
                    }
                    break;
                default:
                    throw new IllegalStateException ("how the fuck did we get in state " + state);
            }
        }

        return (makeTag (start, mCursor.getPosition (), attributes));
    }

    /**
     * Create a tag node based on the current cursor and the one provided.
     * A range of exactly one character is turned into a text node instead.
     * @param start The starting point of the node.
     * @param end The ending point of the node.
     * @param attributes The attributes parsed from the tag.
     * @exception ParserException If the nodefactory creation of the tag node fails.
     * @return The new Tag node, or <code>null</code> if the range is empty.
     */
    protected Node makeTag (int start, int end, Vector attributes)
        throws
            ParserException
    {
        int length;
        Node ret;

        length = end - start;
        if (0 != length)
        {   // return tag based on second character, '/', '%', Letter (ch), '!'
            if (2 > length)
                // this is an error
                return (makeString (start, end));
            ret = getNodeFactory ().createTagNode (this.getPage (), start, end, attributes);
        }
        else
            ret = null;

        return (ret);
    }

    /**
     * Parse a comment.
     * Parse a remark markup.<p>
     * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4">
     * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a>
     * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4<p>
     * <cite>
     * 3.2.4 Comments<p>
     * HTML comments have the following syntax:<p>
     * <code>
     * &lt;!-- this is a comment --&gt;<p>
     * &lt;!-- and so is this one,<p>
     * which occupies more than one line --&gt;<p>
     * </code>
     * White space is not permitted between the markup declaration
     * open delimiter("&lt;!") and the comment open delimiter ("--"),
     * but is permitted between the comment close delimiter ("--") and
     * the markup declaration close delimiter ("&gt;").
871 * A common error is to include a string of hyphens ("---") within a comment. 872 * Authors should avoid putting two or more adjacent hyphens inside comments. 873 * Information that appears between comments has no special meaning 874 * (e.g., character references are not interpreted as such). 875 * Note that comments are markup.<p> 876 * </cite> 877 * <p> 878 * This method uses a state machine with the following states: 879 * <ol> 880 * <li>state 0 - prior to the first open delimiter</li> 881 * <li>state 1 - prior to the second open delimiter</li> 882 * <li>state 2 - prior to the first closing delimiter</li> 883 * <li>state 3 - prior to the second closing delimiter</li> 884 * <li>state 4 - prior to the terminating ></li> 885 * </ol> 886 * <p> 887 * All comment text (everything excluding the < and >), is included 888 * in the remark text. 889 * We allow terminators like --!> even though this isn't part of the spec. 890 * @param start The position at which to start scanning. 891 * @param quotesmart If <code>true</code>, strings ignore quoted contents. 892 * @return The parsed node. 893 * @exception ParserException If a problem occurs reading from the source. 
894 */ 895 protected Node parseRemark (int start, boolean quotesmart) 896 throws 897 ParserException 898 { 899 boolean done; 900 char ch; 901 int state; 902 903 done = false; 904 state = 0; 905 while (!done) 906 { 907 ch = mPage.getCharacter (mCursor); 908 if (Page.EOF == ch) 909 done = true; 910 else 911 switch (state) 912 { 913 case 0: // prior to the first open delimiter 914 if ('>' == ch) 915 done = true; 916 if ('-' == ch) 917 state = 1; 918 else 919 return (parseString (start, quotesmart)); 920 break; 921 case 1: // prior to the second open delimiter 922 if ('-' == ch) 923 { 924 // handle <!--> because netscape does 925 ch = mPage.getCharacter (mCursor); 926 if (Page.EOF == ch) 927 done = true; 928 else if ('>' == ch) 929 done = true; 930 else 931 { 932 mCursor.retreat (); 933 state = 2; 934 } 935 } 936 else 937 return (parseString (start, quotesmart)); 938 break; 939 case 2: // prior to the first closing delimiter 940 if ('-' == ch) 941 state = 3; 942 else if (Page.EOF == ch) 943 return (parseString (start, quotesmart)); // no terminator 944 break; 945 case 3: // prior to the second closing delimiter 946 if ('-' == ch) 947 state = 4; 948 else 949 state = 2; 950 break; 951 case 4: // prior to the terminating > 952 if ('>' == ch) 953 done = true; 954 else if (('!' == ch) || ('-' == ch) || Character.isWhitespace (ch)) 955 { 956 // stay in state 4 957 } 958 else 959 state = 2; 960 break; 961 default: 962 throw new IllegalStateException ("how the fuck did we get in state " + state); 963 } 964 } 965 966 return (makeRemark (start, mCursor.getPosition ())); 967 } 968 969 /** 970 * Create a remark node based on the current cursor and the one provided. 971 * @param start The starting point of the node. 972 * @param end The ending point of the node. 973 * @exception ParserException If the nodefactory creation of the remark node fails. 974 * @return The new Remark node. 
975 */ 976 protected Node makeRemark (int start, int end) 977 throws 978 ParserException 979 { 980 int length; 981 Node ret; 982 983 length = end - start; 984 if (0 != length) 985 { // return tag based on second character, '/', '%', Letter (ch), '!' 986 if (2 > length) 987 // this is an error 988 return (makeString (start, end)); 989 ret = getNodeFactory ().createRemarkNode (this.getPage (), start, end); 990 } 991 else 992 ret = null; 993 994 return (ret); 995 } 996 997 /** 998 * Parse a java server page node. 999 * Scan characters until "%>" is encountered, or the input stream is 1000 * exhausted, in which case <code>null</code> is returned. 1001 * @param start The position at which to start scanning. 1002 * @return The parsed node. 1003 * @exception ParserException If a problem occurs reading from the source. 1004 */ 1005 protected Node parseJsp (int start) 1006 throws 1007 ParserException 1008 { 1009 boolean done; 1010 char ch; 1011 int state; 1012 Vector attributes; 1013 int code; 1014 1015 done = false; 1016 state = 0; 1017 code = 0; 1018 attributes = new Vector (); 1019 // <%xyz%> 1020 // 012223d 1021 // <%=xyz%> 1022 // 0122223d 1023 // <%@xyz%d 1024 // 0122223d 1025 while (!done) 1026 { 1027 ch = mPage.getCharacter (mCursor); 1028 switch (state) 1029 { 1030 case 0: // prior to the percent 1031 switch (ch) 1032 { 1033 case '%': // <% 1034 state = 1; 1035 break; 1036 // case Page.EOF: // <\0 1037 // case '>': // <> 1038 default: 1039 done = true; 1040 break; 1041 } 1042 break; 1043 case 1: // prior to the optional qualifier 1044 switch (ch) 1045 { 1046 case Page.EOF: // <%\0 1047 case '>': // <%> 1048 done = true; 1049 break; 1050 case '=': // <%= 1051 case '@': // <%@ 1052 code = mCursor.getPosition (); 1053 attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0)); 1054 state = 2; 1055 break; 1056 default: // <%x 1057 code = mCursor.getPosition () - 1; 1058 attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, 
(char)0)); 1059 state = 2; 1060 break; 1061 } 1062 break; 1063 case 2: // prior to the closing percent 1064 switch (ch) 1065 { 1066 case Page.EOF: // <%x\0 1067 case '>': // <%x> 1068 done = true; 1069 break; 1070 case '\'': 1071 case '"':// <%???" 1072 state = ch; 1073 break; 1074 case '%': // <%???% 1075 state = 3; 1076 break; 1077 case '/': // // or /* 1078 ch = mPage.getCharacter (mCursor); 1079 if (ch == '/') 1080 { // find the \n or \r 1081 while(true) 1082 { 1083 ch = mPage.getCharacter (mCursor); 1084 if (ch == Page.EOF) 1085 { 1086 done = true; 1087 break; 1088 } 1089 else if (ch == '\n' || ch == '\r') 1090 { 1091 break; 1092 } 1093 } 1094 } 1095 else if (ch == '*') 1096 { 1097 do 1098 { 1099 do 1100 ch = mPage.getCharacter (mCursor); 1101 while ((Page.EOF != ch) && ('*' != ch)); 1102 ch = mPage.getCharacter (mCursor); 1103 if (ch == '*') 1104 mCursor.retreat (); 1105 } 1106 while ((Page.EOF != ch) && ('/' != ch)); 1107 } 1108 else 1109 { 1110 mCursor.retreat (); 1111 } 1112 break; 1113 default: // <%???x 1114 break; 1115 } 1116 break; 1117 case 3: 1118 switch (ch) 1119 { 1120 case Page.EOF: // <%x??%\0 1121 done = true; 1122 break; 1123 case '>': 1124 state = 4; 1125 done = true; 1126 break; 1127 default: // <%???%x 1128 state = 2; 1129 break; 1130 } 1131 break; 1132 case '"': 1133 switch (ch) 1134 { 1135 case Page.EOF: // <%x??"\0 1136 done = true; 1137 break; 1138 case '"': 1139 state = 2; 1140 break; 1141 default: // <%???'??x 1142 break; 1143 } 1144 break; 1145 case '\'': 1146 switch (ch) 1147 { 1148 case Page.EOF: // <%x??'\0 1149 done = true; 1150 break; 1151 case '\'': 1152 state = 2; 1153 break; 1154 default: // <%???"??x 1155 break; 1156 } 1157 break; 1158 default: 1159 throw new IllegalStateException ("how the fuck did we get in state " + state); 1160 } 1161 } 1162 1163 if (4 == state) // normal exit 1164 { 1165 if (0 != code) 1166 { 1167 state = mCursor.getPosition () - 2; // reuse state 1168 attributes.addElement (new PageAttribute (mPage, 
code, state, -1, -1, (char)0)); 1169 attributes.addElement (new PageAttribute (mPage, state, state + 1, -1, -1, (char)0)); 1170 } 1171 else 1172 throw new IllegalStateException ("jsp with no code!"); 1173 } 1174 else 1175 return (parseString (start, true)); // hmmm, true? 1176 1177 return (makeTag (start, mCursor.getPosition (), attributes)); 1178 } 1179 1180 /** 1181 * Parse an XML processing instruction. 1182 * Scan characters until "?>" is encountered, or the input stream is 1183 * exhausted, in which case <code>null</code> is returned. 1184 * @param start The position at which to start scanning. 1185 * @return The parsed node. 1186 * @exception ParserException If a problem occurs reading from the source. 1187 */ 1188 protected Node parsePI (int start) 1189 throws 1190 ParserException 1191 { 1192 boolean done; 1193 char ch; 1194 int state; 1195 Vector attributes; 1196 int code; 1197 1198 done = false; 1199 state = 0; 1200 code = 0; 1201 attributes = new Vector (); 1202 // <?xyz?> 1203 // 011112d 1204 while (!done) 1205 { 1206 ch = mPage.getCharacter (mCursor); 1207 switch (state) 1208 { 1209 case 0: // prior to the question mark 1210 switch (ch) 1211 { 1212 case '?': // <? 1213 code = mCursor.getPosition (); 1214 attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0)); 1215 state = 1; 1216 break; 1217 // case Page.EOF: // <\0 1218 // case '>': // <> 1219 default: 1220 done = true; 1221 break; 1222 } 1223 break; 1224 case 1: // prior to the closing question mark 1225 switch (ch) 1226 { 1227 case Page.EOF: // <?x\0 1228 case '>': // <?x> 1229 done = true; 1230 break; 1231 case '\'': 1232 case '"':// <?..." 1233 state = ch; 1234 break; 1235 case '?': // <?...? 
1236 state = 2; 1237 break; 1238 default: // <?...x 1239 break; 1240 } 1241 break; 1242 case 2: 1243 switch (ch) 1244 { 1245 case Page.EOF: // <?x..?\0 1246 done = true; 1247 break; 1248 case '>': 1249 state = 3; 1250 done = true; 1251 break; 1252 default: // <?...?x 1253 state = 1; 1254 break; 1255 } 1256 break; 1257 case '"': 1258 switch (ch) 1259 { 1260 case Page.EOF: // <?x.."\0 1261 done = true; 1262 break; 1263 case '"': 1264 state = 1; 1265 break; 1266 default: // <?...'.x 1267 break; 1268 } 1269 break; 1270 case '\'': 1271 switch (ch) 1272 { 1273 case Page.EOF: // <?x..'\0 1274 done = true; 1275 break; 1276 case '\'': 1277 state = 1; 1278 break; 1279 default: // <?..."..x 1280 break; 1281 } 1282 break; 1283 default: 1284 throw new IllegalStateException ("how the fuck did we get in state " + state); 1285 } 1286 } 1287 1288 if (3 == state) // normal exit 1289 { 1290 if (0 != code) 1291 { 1292 state = mCursor.getPosition () - 2; // reuse state 1293 attributes.addElement (new PageAttribute (mPage, code, state, -1, -1, (char)0)); 1294 attributes.addElement (new PageAttribute (mPage, state, state + 1, -1, -1, (char)0)); 1295 } 1296 else 1297 throw new IllegalStateException ("processing instruction with no content"); 1298 } 1299 else 1300 return (parseString (start, true)); // hmmm, true? 1301 1302 return (makeTag (start, mCursor.getPosition (), attributes)); 1303 } 1304 1305 /** 1306 * Return CDATA as a text node. 
1307 * According to appendix <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data"> 1308 * B.3.2 Specifying non-HTML data</a> of the 1309 * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>:<br> 1310 * <quote> 1311 * <b>Element content</b><br> 1312 * When script or style data is the content of an element (SCRIPT and STYLE), 1313 * the data begins immediately after the element start tag and ends at the 1314 * first ETAGO ("</") delimiter followed by a name start character ([a-zA-Z]); 1315 * note that this may not be the element's end tag. 1316 * Authors should therefore escape "</" within the content. Escape mechanisms 1317 * are specific to each scripting or style sheet language. 1318 * </quote> 1319 * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none. 1320 * @exception ParserException If a problem occurs reading from the source. 1321 */ 1322 public Node parseCDATA () 1323 throws 1324 ParserException 1325 { 1326 return (parseCDATA (false)); 1327 } 1328 1329 /** 1330 * Return CDATA as a text node. 1331 * Slightly less rigid than {@link #parseCDATA()} this method provides for 1332 * parsing CDATA that may contain quoted strings that have embedded 1333 * ETAGO ("</") delimiters and skips single and multiline comments. 1334 * @param quotesmart If <code>true</code> the strict definition of CDATA is 1335 * extended to allow for single or double quoted ETAGO ("</") sequences. 1336 * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none. 1337 * @see #parseCDATA() 1338 * @exception ParserException If a problem occurs reading from the source. 
1339 */ 1340 public Node parseCDATA (boolean quotesmart) 1341 throws 1342 ParserException 1343 { 1344 int start; 1345 int state; 1346 boolean done; 1347 char quote; 1348 char ch; 1349 int end; 1350 boolean comment; 1351 1352 start = mCursor.getPosition (); 1353 state = 0; 1354 done = false; 1355 quote = 0; 1356 comment = false; 1357 1358 while (!done) 1359 { 1360 ch = mPage.getCharacter (mCursor); 1361 switch (state) 1362 { 1363 case 0: // prior to ETAGO 1364 switch (ch) 1365 { 1366 case Page.EOF: 1367 done = true; 1368 break; 1369 case '\'': 1370 if (quotesmart && !comment) 1371 if (0 == quote) 1372 quote = '\''; // enter quoted state 1373 else if ('\'' == quote) 1374 quote = 0; // exit quoted state 1375 break; 1376 case '"': 1377 if (quotesmart && !comment) 1378 if (0 == quote) 1379 quote = '"'; // enter quoted state 1380 else if ('"' == quote) 1381 quote = 0; // exit quoted state 1382 break; 1383 case '\\': 1384 if (quotesmart) 1385 if (0 != quote) 1386 { 1387 ch = mPage.getCharacter (mCursor); // try to consume escaped character 1388 if (Page.EOF == ch) 1389 done = true; 1390 else if ( (ch != '\\') && (ch != quote)) 1391 mCursor.retreat (); // unconsume char if character was not an escapable char. 
1392 } 1393 break; 1394 case '/': 1395 if (quotesmart) 1396 if (0 == quote) 1397 { 1398 // handle multiline and double slash comments (with a quote) 1399 ch = mPage.getCharacter (mCursor); 1400 if (Page.EOF == ch) 1401 done = true; 1402 else if ('/' == ch) 1403 comment = true; 1404 else if ('*' == ch) 1405 { 1406 do 1407 { 1408 do 1409 ch = mPage.getCharacter (mCursor); 1410 while ((Page.EOF != ch) && ('*' != ch)); 1411 ch = mPage.getCharacter (mCursor); 1412 if (ch == '*') 1413 mCursor.retreat (); 1414 } 1415 while ((Page.EOF != ch) && ('/' != ch)); 1416 } 1417 else 1418 mCursor.retreat (); 1419 } 1420 break; 1421 case '\n': 1422 comment = false; 1423 break; 1424 case '<': 1425 if (quotesmart) 1426 { 1427 if (0 == quote) 1428 state = 1; 1429 } 1430 else 1431 state = 1; 1432 break; 1433 default: 1434 break; 1435 } 1436 break; 1437 case 1: // < 1438 switch (ch) 1439 { 1440 case Page.EOF: 1441 done = true; 1442 break; 1443 case '/': 1444 state = 2; 1445 break; 1446 case '!': 1447 ch = mPage.getCharacter (mCursor); 1448 if (Page.EOF == ch) 1449 done = true; 1450 else if ('-' == ch) 1451 { 1452 ch = mPage.getCharacter (mCursor); 1453 if (Page.EOF == ch) 1454 done = true; 1455 else if ('-' == ch) 1456 state = 3; 1457 else 1458 state = 0; 1459 } 1460 else 1461 state = 0; 1462 break; 1463 default: 1464 state = 0; 1465 break; 1466 } 1467 break; 1468 case 2: // </ 1469 comment = false; 1470 if (Page.EOF == ch) 1471 done = true; 1472 else if (Character.isLetter (ch)) 1473 { 1474 done = true; 1475 // back up to the start of ETAGO 1476 mCursor.retreat (); 1477 mCursor.retreat (); 1478 mCursor.retreat (); 1479 } 1480 else 1481 state = 0; 1482 break; 1483 case 3: // <! 
1484 comment = false; 1485 if (Page.EOF == ch) 1486 done = true; 1487 else if ('-' == ch) 1488 { 1489 ch = mPage.getCharacter (mCursor); 1490 if (Page.EOF == ch) 1491 done = true; 1492 else if ('-' == ch) 1493 { 1494 ch = mPage.getCharacter (mCursor); 1495 if (Page.EOF == ch) 1496 done = true; 1497 else if ('>' == ch) 1498 state = 0; 1499 else 1500 { 1501 mCursor.retreat (); 1502 mCursor.retreat (); 1503 } 1504 } 1505 else 1506 mCursor.retreat (); 1507 } 1508 break; 1509 default: 1510 throw new IllegalStateException ("how the fuck did we get in state " + state); 1511 } 1512 } 1513 end = mCursor.getPosition (); 1514 1515 return (makeString (start, end)); 1516 } 1517 1518 // 1519 // NodeFactory interface 1520 // 1521 1522 /** 1523 * Create a new string node. 1524 * @param page The page the node is on. 1525 * @param start The beginning position of the string. 1526 * @param end The ending positiong of the string. 1527 * @return The created Text node. 1528 */ 1529 public Text createStringNode (Page page, int start, int end) 1530 { 1531 return (new TextNode (page, start, end)); 1532 } 1533 1534 /** 1535 * Create a new remark node. 1536 * @param page The page the node is on. 1537 * @param start The beginning position of the remark. 1538 * @param end The ending positiong of the remark. 1539 * @return The created Remark node. 1540 */ 1541 public Remark createRemarkNode (Page page, int start, int end) 1542 { 1543 return (new RemarkNode (page, start, end)); 1544 } 1545 1546 /** 1547 * Create a new tag node. 1548 * Note that the attributes vector contains at least one element, 1549 * which is the tag name (standalone attribute) at position zero. 1550 * This can be used to decide which type of node to create, or 1551 * gate other processing that may be appropriate. 1552 * @param page The page the node is on. 1553 * @param start The beginning position of the tag. 1554 * @param end The ending positiong of the tag. 1555 * @param attributes The attributes contained in this tag. 
* @return The created Tag node.
     */
    public Tag createTagNode (Page page, int start, int end, Vector attributes)
    {
        final Tag tag = new TagNode (page, start, end, attributes);

        return tag;
    }

    /**
     * Command line entry point.
     * Opens the URL given as the first argument and prints every node the
     * lexer produces, one per line. With no arguments a usage line is
     * printed instead. A <code>ParserException</code> raised while lexing is
     * reported on standard output along with the message of its underlying
     * throwable, if any.
     * @param args [0] The URL to parse.
     * @exception MalformedURLException If the provided URL cannot be resolved.
     * @exception ParserException If the parse fails.
     */
    public static void main (String[] args)
        throws
            MalformedURLException,
            ParserException
    {
        if (args.length <= 0)
        {
            System.out.println ("usage: java -jar htmllexer.jar <url>");
            return;
        }

        try
        {
            ConnectionManager manager = Page.getConnectionManager ();
            Lexer lexer = new Lexer (manager.openConnection (args[0]));
            for (Node node = lexer.nextNode (false); null != node; node = lexer.nextNode (false))
            {
                System.out.println (node.toString ());
            }
        }
        catch (ParserException pe)
        {
            System.out.println (pe.getMessage ());
            if (null != pe.getThrowable ())
            {
                System.out.println (pe.getThrowable ().getMessage ());
            }
        }
    }
}