Attribute.java
1 // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML 2 // http://sourceforge.org/projects/htmlparser 3 // Copyright (C) 2004 Derrick Oswald 4 // 5 // Revision Control Information 6 // 7 // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Attribute.java,v $ 8 // $Author: derrickoswald $ 9 // $Date: 2005/11/15 02:09:10 $ 10 // $Revision: 1.8 $ 11 // 12 // This library is free software; you can redistribute it and/or 13 // modify it under the terms of the GNU Lesser General Public 14 // License as published by the Free Software Foundation; either 15 // version 2.1 of the License, or (at your option) any later version. 16 // 17 // This library is distributed in the hope that it will be useful, 18 // but WITHOUT ANY WARRANTY; without even the implied warranty of 19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 // Lesser General Public License for more details. 21 // 22 // You should have received a copy of the GNU Lesser General Public 23 // License along with this library; if not, write to the Free Software 24 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 // 26 27 package org.htmlparser; 28 29 import java.io.Serializable; 30 31 /** 32 * An attribute within a tag. 33 * Holds the name, assignment string, value and quote character. 34 * <p> 35 * This class was made deliberately simple. Except for 36 * {@link #setRawValue RawValue}, the properties are completely orthogonal, 37 * that is: each property is independant of the others. This means you have 38 * enough rope here to hang yourself, and it's very easy to create 39 * malformed HTML. Where it's obvious, warnings and notes have been provided 40 * in the setters javadocs, but it is up to you -- the programmer -- 41 * to ensure that the contents of the four fields will yield valid HTML 42 * (if that's what you want). 43 * <p> 44 * Be especially mindful of quotes and assignment strings. These are handled 45 * by the constructors where it's obvious, but in general, you need to set 46 * them explicitly when building an attribute. For example to construct 47 * the attribute <b><code>label="A multi word value."</code></b> you could use: 48 * <pre> 49 * attribute = new Attribute (); 50 * attribute.setName ("label"); 51 * attribute.setAssignment ("="); 52 * attribute.setValue ("A multi word value."); 53 * attribute.setQuote ('"'); 54 * </pre> 55 * or 56 * <pre> 57 * attribute = new Attribute (); 58 * attribute.setName ("label"); 59 * attribute.setAssignment ("="); 60 * attribute.setRawValue ("A multi word value."); 61 * </pre> 62 * or 63 * <pre> 64 * attribute = new Attribute ("label", "A multi word value."); 65 * </pre> 66 * Note that the assignment value and quoting need to be set separately when 67 * building the attribute from scratch using the properties. 68 * <p> 69 * <table width="100.0%" align="Center" border="1"> 70 * <caption>Valid States for Attributes.</caption> 71 * <tr> 72 * <th align="Center">Description</th> 73 * <th align="Center">toString()</th> 74 * <th align="Center">Name</th> 75 * <th align="Center">Assignment</th> 76 * <th align="Center">Value</th> 77 * <th align="Center">Quote</th> 78 * </tr> 79 * <tr> 80 * <td align="Center">whitespace attribute</td> 81 * <td align="Center">value</td> 82 * <td align="Center"><code>null</code></td> 83 * <td align="Center"><code>null</code></td> 84 * <td align="Center">"value"</td> 85 * <td align="Center"><code>0</code></td> 86 * </tr> 87 * <tr> 88 * <td align="Center">standalone attribute</td> 89 * <td align="Center">name</td> 90 * <td align="Center">"name"</td> 91 * <td align="Center"><code>null</code></td> 92 * <td align="Center"><code>null</code></td> 93 * <td align="Center"><code>0</code></td> 94 * </tr> 95 * <tr> 96 * <td align="Center">empty attribute</td> 97 * <td align="Center">name=</td> 98 * <td align="Center">"name"</td> 99 * <td align="Center">"="</td> 100 * <td align="Center"><code>null</code></td> 101 * <td align="Center"><code>0</code></td> 102 * </tr> 103 * <tr> 104 * <td align="Center">empty single quoted attribute</td> 105 * <td align="Center">name=''</td> 106 * <td align="Center">"name"</td> 107 * <td align="Center">"="</td> 108 * <td align="Center"><code>null</code></td> 109 * <td align="Center"><code>'</code></td> 110 * </tr> 111 * <tr> 112 * <td align="Center">empty double quoted attribute</td> 113 * <td align="Center">name=""</td> 114 * <td align="Center">"name"</td> 115 * <td align="Center">"="</td> 116 * <td align="Center"><code>null</code></td> 117 * <td align="Center"><code>"</code></td> 118 * </tr> 119 * <tr> 120 * <td align="Center">naked attribute</td> 121 * <td align="Center">name=value</td> 122 * <td align="Center">"name"</td> 123 * <td align="Center">"="</td> 124 * <td align="Center">"value"</td> 125 * <td align="Center"><code>0</code></td> 126 * </tr> 127 * <tr> 128 * <td align="Center">single quoted attribute</td> 129 * <td align="Center">name='value'</td> 130 * <td align="Center">"name"</td> 131 * <td align="Center">"="</td> 132 * <td align="Center">"value"</td> 133 * <td align="Center"><code>'</code></td> 134 * </tr> 135 * <tr> 136 * <td align="Center">double quoted attribute</td> 137 * <td align="Center">name="value"</td> 138 * <td align="Center">"name"</td> 139 * <td align="Center">"="</td> 140 * <td align="Center">"value"</td> 141 * <td align="Center"><code>"</code></td> 142 * </tr> 143 * </table> 144 * <br>In words: 145 * <br>If Name is null, and Assignment is null, and Quote is zero, 146 * it's whitepace and Value has the whitespace text -- value 147 * <br>If Name is not null, and both Assignment and Value are null 148 * it's a standalone attribute -- name 149 * <br>If Name is not null, and Assignment is an equals sign, and Quote is zero 150 * it's an empty attribute -- name= 151 * <br>If Name is not null, and Assignment is an equals sign, 152 * and Value is "" or null, and Quote is ' 153 * it's an empty single quoted attribute -- name='' 154 * <br>If Name is not null, and Assignment is an equals sign, 155 * and Value is "" or null, and Quote is " 156 * it's an empty double quoted attribute -- name="" 157 * <br>If Name is not null, and Assignment is an equals sign, 158 * and Value is something, and Quote is zero 159 * it's a naked attribute -- name=value 160 * <br>If Name is not null, and Assignment is an equals sign, 161 * and Value is something, and Quote is ' 162 * it's a single quoted attribute -- name='value' 163 * <br>If Name is not null, and Assignment is an equals sign, 164 * and Value is something, and Quote is " 165 * it's a double quoted attribute -- name="value" 166 * <br>All other states are invalid HTML. 167 * <p> 168 * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2"> 169 * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a> 170 * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2:<p> 171 * <cite> 172 * 3.2.2 Attributes<p> 173 * Elements may have associated properties, called attributes, which may 174 * have values (by default, or set by authors or scripts). Attribute/value 175 * pairs appear before the final ">" of an element's start tag. Any number 176 * of (legal) attribute value pairs, separated by spaces, may appear in an 177 * element's start tag. They may appear in any order.<p> 178 * In this example, the id attribute is set for an H1 element: 179 * <pre> 180 * <code> 181 * {@.html 182 * <H1 id="section1"> 183 * This is an identified heading thanks to the id attribute 184 * </H1>} 185 * </code> 186 * </pre> 187 * By default, SGML requires that all attribute values be delimited using 188 * either double quotation marks (ASCII decimal 34) or single quotation 189 * marks (ASCII decimal 39). Single quote marks can be included within the 190 * attribute value when the value is delimited by double quote marks, and 191 * vice versa. Authors may also use numeric character references to 192 * represent double quotes (&#34;) and single quotes (&#39;). 193 * For doublequotes authors can also use the character entity reference 194 * &quot;.<p> 195 * In certain cases, authors may specify the value of an attribute without 196 * any quotation marks. The attribute value may only contain letters 197 * (a-z and A-Z), digits (0-9), hyphens (ASCII decimal 45), 198 * periods (ASCII decimal 46), underscores (ASCII decimal 95), 199 * and colons (ASCII decimal 58). We recommend using quotation marks even 200 * when it is possible to eliminate them.<p> 201 * Attribute names are always case-insensitive.<p> 202 * Attribute values are generally case-insensitive. The definition of each 203 * attribute in the reference manual indicates whether its value is 204 * case-insensitive.<p> 205 * All the attributes defined by this specification are listed in the 206 * <a href="http://www.w3.org/TR/html4/index/attributes.html">attribute 207 * index</a>.<p> 208 * </cite> 209 * <p> 210 */ 211 public class Attribute 212 implements 213 Serializable 214 { 215 /** 216 * The name of this attribute. 217 * The part before the equals sign, or the stand-alone attribute. 218 * This will be <code>null</code> if the attribute is whitespace. 219 */ 220 protected String mName; 221 222 /** 223 * The assignment string of the attribute. 224 * The equals sign. 225 * This will be <code>null</code> if the attribute is a 226 * stand-alone attribute. 227 */ 228 protected String mAssignment; 229 230 /** 231 * The value of the attribute. 232 * The part after the equals sign. 233 * This will be <code>null</code> if the attribute is an empty or 234 * stand-alone attribute. 235 */ 236 protected String mValue; 237 238 /** 239 * The quote, if any, surrounding the value of the attribute, if any. 240 * This will be zero if there are no quotes around the value. 241 */ 242 protected char mQuote; 243 244 /** 245 * Create an attribute with the name, assignment, value and quote given. 246 * If the quote value is zero, assigns the value using {@link #setRawValue} 247 * which sets the quote character to a proper value if necessary. 248 * @param name The name of this attribute. 249 * @param assignment The assignment string of this attribute. 250 * @param value The value of this attribute. 251 * @param quote The quote around the value of this attribute. 252 */ 253 public Attribute (String name, String assignment, String value, char quote) 254 { 255 setName (name); 256 setAssignment (assignment); 257 if (0 == quote) 258 setRawValue (value); 259 else 260 { 261 setValue (value); 262 setQuote (quote); 263 } 264 } 265 266 /** 267 * Create an attribute with the name, value and quote given. 268 * Uses an equals sign as the assignment string if the value is not 269 * <code>null</code>, and calls {@link #setRawValue} to get the 270 * correct quoting if <code>quote</code> is zero. 271 * @param name The name of this attribute. 272 * @param value The value of this attribute. 273 * @param quote The quote around the value of this attribute. 274 */ 275 public Attribute (String name, String value, char quote) 276 { 277 this (name, (null == value ? "" : "="), value, quote); 278 } 279 280 /** 281 * Create a whitespace attribute with the value given. 282 * @param value The value of this attribute. 283 * @exception IllegalArgumentException if the value contains other than 284 * whitespace. To set a real value use {@link #Attribute(String,String)}. 285 */ 286 public Attribute (String value) 287 throws 288 IllegalArgumentException 289 { 290 if (0 != value.trim ().length ()) 291 throw new IllegalArgumentException ("non whitespace value"); 292 else 293 { 294 setName (null); 295 setAssignment (null); 296 setValue (value); 297 setQuote ((char)0); 298 } 299 } 300 301 /** 302 * Create an attribute with the name and value given. 303 * Uses an equals sign as the assignment string if the value is not 304 * <code>null</code>, and calls {@link #setRawValue} to get the 305 * correct quoting. 306 * @param name The name of this attribute. 307 * @param value The value of this attribute. 308 */ 309 public Attribute (String name, String value) 310 { 311 this (name, (null == value ? "" : "="), value, (char)0); 312 } 313 314 /** 315 * Create an attribute with the name, assignment string and value given. 316 * Calls {@link #setRawValue} to get the correct quoting. 317 * @param name The name of this attribute. 318 * @param assignment The assignment string of this attribute. 319 * @param value The value of this attribute. 320 */ 321 public Attribute (String name, String assignment, String value) 322 { 323 this (name, assignment, value, (char)0); 324 } 325 326 /** 327 * Create an empty attribute. 328 * This will provide "" from the {@link #toString} and 329 * {@link #toString(StringBuffer)} methods. 330 */ 331 public Attribute () 332 { 333 this (null, null, null, (char)0); 334 } 335 336 /** 337 * Get the name of this attribute. 338 * The part before the equals sign, or the contents of the 339 * stand-alone attribute. 340 * @return The name, or <code>null</code> if it's just a whitepace 341 * 'attribute'. 342 * @see #setName 343 */ 344 public String getName () 345 { 346 return (mName); 347 } 348 349 /** 350 * Get the name of this attribute. 351 * @param buffer The buffer to place the name in. 352 * @see #getName() 353 * @see #setName 354 */ 355 public void getName (StringBuffer buffer) 356 { 357 if (null != mName) 358 buffer.append (mName); 359 } 360 361 /** 362 * Set the name of this attribute. 363 * Set the part before the equals sign, or the contents of the 364 * stand-alone attribute. 365 * <em>WARNING:</em> Setting this to <code>null</code> can result in 366 * malformed HTML if the assignment string is not <code>null</code>. 367 * @param name The new name. 368 * @see #getName 369 * @see #getName(StringBuffer) 370 */ 371 public void setName (String name) 372 { 373 mName = name; 374 } 375 376 /** 377 * Get the assignment string of this attribute. 378 * This is usually just an equals sign, but in poorly formed attributes it 379 * can include whitespace on either or both sides of an equals sign. 380 * @return The assignment string. 381 * @see #setAssignment 382 */ 383 public String getAssignment () 384 { 385 return (mAssignment); 386 } 387 388 /** 389 * Get the assignment string of this attribute. 390 * @param buffer The buffer to place the assignment string in. 391 * @see #getAssignment() 392 * @see #setAssignment 393 */ 394 public void getAssignment (StringBuffer buffer) 395 { 396 if (null != mAssignment) 397 buffer.append (mAssignment); 398 } 399 400 /** 401 * Set the assignment string of this attribute. 402 * <em>WARNING:</em> Setting this property to other than an equals sign 403 * or <code>null</code> will result in malformed HTML. In the case of a 404 * <code>null</code>, the {@link #setValue value} should also be set to 405 * <code>null</code>. 406 * @param assignment The new assignment string. 407 * @see #getAssignment 408 * @see #getAssignment(StringBuffer) 409 */ 410 public void setAssignment (String assignment) 411 { 412 mAssignment = assignment; 413 } 414 415 /** 416 * Get the value of the attribute. 417 * The part after the equals sign, or the text if it's just a whitepace 418 * 'attribute'. 419 * <em>NOTE:</em> This does not include any quotes that may have enclosed 420 * the value when it was read. To get the un-stripped value use 421 * {@link #getRawValue}. 422 * @return The value, or <code>null</code> if it's a stand-alone or 423 * empty attribute, or the text if it's just a whitepace 'attribute'. 424 * @see #setValue 425 */ 426 public String getValue () 427 { 428 return (mValue); 429 } 430 431 /** 432 * Get the value of the attribute. 433 * @param buffer The buffer to place the value in. 434 * @see #getValue() 435 * @see #setValue 436 */ 437 public void getValue (StringBuffer buffer) 438 { 439 if (null != mValue) 440 buffer.append (mValue); 441 } 442 443 /** 444 * Set the value of the attribute. 445 * The part after the equals sign, or the text if it's a whitepace 446 * 'attribute'. 447 * <em>WARNING:</em> Setting this property to a value that needs to be 448 * quoted without also setting the quote character will result in malformed 449 * HTML. 450 * @param value The new value. 451 * @see #getValue 452 * @see #getValue(StringBuffer) 453 */ 454 public void setValue (String value) 455 { 456 mValue = value; 457 } 458 459 /** 460 * Get the quote, if any, surrounding the value of the attribute, if any. 461 * @return Either ' or " if the attribute value was quoted, or zero 462 * if there are no quotes around it. 463 * @see #setQuote 464 */ 465 public char getQuote () 466 { 467 return (mQuote); 468 } 469 470 /** 471 * Get the quote, if any, surrounding the value of the attribute, if any. 472 * @param buffer The buffer to place the quote in. 473 * @see #getQuote() 474 * @see #setQuote 475 */ 476 public void getQuote (StringBuffer buffer) 477 { 478 if (0 != mQuote) 479 buffer.append (mQuote); 480 } 481 482 /** 483 * Set the quote surrounding the value of the attribute. 484 * <em>WARNING:</em> Setting this property to zero will result in malformed 485 * HTML if the {@link #getValue value} needs to be quoted (i.e. contains 486 * whitespace). 487 * @param quote The new quote value. 488 * @see #getQuote 489 * @see #getQuote(StringBuffer) 490 */ 491 public void setQuote (char quote) 492 { 493 mQuote = quote; 494 } 495 496 /** 497 * Get the raw value of the attribute. 498 * The part after the equals sign, or the text if it's just a whitepace 499 * 'attribute'. This includes the quotes around the value if any. 500 * @return The value, or <code>null</code> if it's a stand-alone attribute, 501 * or the text if it's just a whitepace 'attribute'. 502 * @see #setRawValue 503 */ 504 public String getRawValue () 505 { 506 char quote; 507 StringBuffer buffer; 508 String ret; 509 510 if (isValued ()) 511 { 512 quote = getQuote (); 513 if (0 != quote) 514 { 515 buffer = new StringBuffer (); // todo: what is the value length? 516 buffer.append (quote); 517 getValue (buffer); 518 buffer.append (quote); 519 ret = buffer.toString (); 520 } 521 else 522 ret = getValue (); 523 } 524 else 525 ret = null; 526 527 return (ret); 528 } 529 530 /** 531 * Get the raw value of the attribute. 532 * The part after the equals sign, or the text if it's just a whitepace 533 * 'attribute'. This includes the quotes around the value if any. 534 * @param buffer The string buffer to append the attribute value to. 535 * @see #getRawValue() 536 * @see #setRawValue 537 */ 538 public void getRawValue (StringBuffer buffer) 539 { 540 getQuote (buffer); 541 getValue (buffer); 542 getQuote (buffer); 543 } 544 545 /** 546 * Set the value of the attribute and the quote character. 547 * If the value is pure whitespace, assign it 'as is' and reset the 548 * quote character. If not, check for leading and trailing double or 549 * single quotes, and if found use this as the quote character and 550 * the inner contents of <code>value</code> as the real value. 551 * Otherwise, examine the string to determine if quotes are needed 552 * and an appropriate quote character if so. This may involve changing 553 * double quotes within the string to character references. 554 * @param value The new value. 555 * @see #getRawValue 556 * @see #getRawValue(StringBuffer) 557 */ 558 public void setRawValue (String value) 559 { 560 char ch; 561 boolean needed; 562 boolean singleq; 563 boolean doubleq; 564 String ref; 565 StringBuffer buffer; 566 char quote; 567 568 quote = 0; 569 if ((null != value) && (0 != value.trim ().length ())) 570 { 571 if (value.startsWith ("'") && value.endsWith ("'") 572 && (2 <= value.length ())) 573 { 574 quote = '\''; 575 value = value.substring (1, value.length () - 1); 576 } 577 else if (value.startsWith ("\"") && value.endsWith ("\"") 578 && (2 <= value.length ())) 579 { 580 quote = '"'; 581 value = value.substring (1, value.length () - 1); 582 } 583 else 584 { 585 // first determine if there's whitespace in the value 586 // and while we're at it find a suitable quote character 587 needed = false; 588 singleq = true; 589 doubleq = true; 590 for (int i = 0; i < value.length (); i++) 591 { 592 ch = value.charAt (i); 593 if ('\'' == ch) 594 { 595 singleq = false; 596 needed = true; 597 } 598 else if ('"' == ch) 599 { 600 doubleq = false; 601 needed = true; 602 } 603 else if (!('-' == ch) && !('.' == ch) && !('_' == ch) 604 && !(':' == ch) && !Character.isLetterOrDigit (ch)) 605 { 606 needed = true; 607 } 608 } 609 610 // now apply quoting 611 if (needed) 612 { 613 if (doubleq) 614 quote = '"'; 615 else if (singleq) 616 quote = '\''; 617 else 618 { 619 // uh-oh, we need to convert some quotes into character 620 // references, so convert all double quotes into " 621 quote = '"'; 622 ref = """; // Translate.encode (quote); 623 // JDK 1.4: value = value.replaceAll ("\"", ref); 624 buffer = new StringBuffer ( 625 value.length() * (ref.length () - 1)); 626 for (int i = 0; i < value.length (); i++) 627 { 628 ch = value.charAt (i); 629 if (quote == ch) 630 buffer.append (ref); 631 else 632 buffer.append (ch); 633 } 634 value = buffer.toString (); 635 } 636 } 637 } 638 } 639 setValue (value); 640 setQuote (quote); 641 } 642 643 /** 644 * Predicate to determine if this attribute is whitespace. 645 * @return <code>true</code> if this attribute is whitespace, 646 * <code>false</code> if it is a real attribute. 647 */ 648 public boolean isWhitespace () 649 { 650 return (null == getName ()); 651 } 652 653 /** 654 * Predicate to determine if this attribute has no equals sign (or value). 655 * @return <code>true</code> if this attribute is a standalone attribute. 656 * <code>false</code> if has an equals sign. 657 */ 658 public boolean isStandAlone () 659 { 660 return ((null != getName ()) && (null == getAssignment ())); 661 } 662 663 /** 664 * Predicate to determine if this attribute has an equals sign but no value. 665 * @return <code>true</code> if this attribute is an empty attribute. 666 * <code>false</code> if has an equals sign and a value. 667 */ 668 public boolean isEmpty () 669 { 670 return ((null != getAssignment ()) && (null == getValue ())); 671 } 672 673 /** 674 * Predicate to determine if this attribute has a value. 675 * @return <code>true</code> if this attribute has a value. 676 * <code>false</code> if it is empty or standalone. 677 */ 678 public boolean isValued () 679 { 680 return (null != getValue ()); 681 } 682 683 /** 684 * Get the length of the string value of this attribute. 685 * @return The number of characters required to express this attribute. 686 */ 687 public int getLength () 688 { 689 String name; 690 String assignment; 691 String value; 692 char quote; 693 int ret; 694 695 ret = 0; 696 name = getName (); 697 if (null != name) 698 ret += name.length (); 699 assignment = getAssignment (); 700 if (null != assignment) 701 ret += assignment.length (); 702 value = getValue (); 703 if (null != value) 704 ret += value.length (); 705 quote = getQuote (); 706 if (0 != quote) 707 ret += 2; 708 709 return (ret); 710 } 711 712 /** 713 * Get a text representation of this attribute. 714 * Suitable for insertion into a tag, the output is one of 715 * the forms: 716 * <code> 717 * <pre> 718 * value 719 * name 720 * name= 721 * name=value 722 * name='value' 723 * name="value" 724 * </pre> 725 * </code> 726 * @return A string that can be used within a tag. 727 */ 728 public String toString () 729 { 730 int length; 731 StringBuffer ret; 732 733 // get the size to avoid extra StringBuffer allocations 734 length = getLength (); 735 ret = new StringBuffer (length); 736 toString (ret); 737 738 return (ret.toString ()); 739 } 740 741 /** 742 * Get a text representation of this attribute. 743 * @param buffer The accumulator for placing the text into. 744 * @see #toString() 745 */ 746 public void toString (StringBuffer buffer) 747 { 748 getName (buffer); 749 getAssignment (buffer); 750 getRawValue (buffer); 751 } 752 753 }