URI.java
1 /* 2 * $HeadURL: https://svn.apache.org/repos/asf/jakarta/httpcomponents/oac.hc3x/tags/HTTPCLIENT_3_1/src/java/org/apache/commons/httpclient/URI.java $ 3 * $Revision: 564973 $ 4 * $Date: 2007-08-11 22:51:47 +0200 (Sat, 11 Aug 2007) $ 5 * 6 * ==================================================================== 7 * 8 * Licensed to the Apache Software Foundation (ASF) under one or more 9 * contributor license agreements. See the NOTICE file distributed with 10 * this work for additional information regarding copyright ownership. 11 * The ASF licenses this file to You under the Apache License, Version 2.0 12 * (the "License"); you may not use this file except in compliance with 13 * the License. You may obtain a copy of the License at 14 * 15 * http://www.apache.org/licenses/LICENSE-2.0 16 * 17 * Unless required by applicable law or agreed to in writing, software 18 * distributed under the License is distributed on an "AS IS" BASIS, 19 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 20 * See the License for the specific language governing permissions and 21 * limitations under the License. 22 * ==================================================================== 23 * 24 * This software consists of voluntary contributions made by many 25 * individuals on behalf of the Apache Software Foundation. For more 26 * information on the Apache Software Foundation, please see 27 * <http://www.apache.org/>. 
28 * 29 */ 30 31 package org.apache.commons.httpclient; 32 33 import java.io.IOException; 34 import java.io.ObjectInputStream; 35 import java.io.ObjectOutputStream; 36 import java.io.Serializable; 37 import java.util.Arrays; 38 import java.util.Locale; 39 import java.util.BitSet; 40 import java.util.Hashtable; 41 42 import org.apache.commons.codec.DecoderException; 43 import org.apache.commons.codec.net.URLCodec; 44 import org.apache.commons.httpclient.util.EncodingUtil; 45 46 /** 47 * The interface for the URI(Uniform Resource Identifiers) version of RFC 2396. 48 * This class has the purpose of supporting the parsing of a URI reference to 49 * extend any specific protocols, the character encoding of the protocol to 50 * be transported and the charset of the document. 51 * <p> 52 * A URI is always in an "escaped" form, since escaping or unescaping a 53 * completed URI might change its semantics. 54 * <p> 55 * Implementers should be careful not to escape or unescape the same string 56 * more than once, since unescaping an already unescaped string might lead to 57 * misinterpreting a percent data character as another escaped character, 58 * or vice versa in the case of escaping an already escaped string. 59 * <p> 60 * In order to avoid these problems, data types are used as follows: 61 * <p><blockquote><pre> 62 * URI character sequence: char 63 * octet sequence: byte 64 * original character sequence: String 65 * </pre></blockquote><p> 66 * 67 * So, a URI is a sequence of characters as an array of a char type, which 68 * is not always represented as a sequence of octets as an array of byte. 69 * <p> 70 * 71 * URI Syntactic Components 72 * <p><blockquote><pre> 73 * - In general, written as follows: 74 * Absolute URI = <scheme>:<scheme-specific-part> 75 * Generic URI = <scheme>://<authority><path>?<query> 76 * 77 * - Syntax 78 * absoluteURI = scheme ":" ( hier_part | opaque_part ) 79 * hier_part = ( net_path | abs_path ) [ "?" 
query ]
 *   net_path      = "//" authority [ abs_path ]
 *   abs_path      = "/"  path_segments
 * </pre></blockquote><p>
 *
 * The following examples illustrate URI that are in common use.
 * <pre>
 * ftp://ftp.is.co.za/rfc/rfc1808.txt
 *    -- ftp scheme for File Transfer Protocol services
 * gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles
 *    -- gopher scheme for Gopher and Gopher+ Protocol services
 * http://www.math.uio.no/faq/compression-faq/part1.html
 *    -- http scheme for Hypertext Transfer Protocol services
 * mailto:mduerst@ifi.unizh.ch
 *    -- mailto scheme for electronic mail addresses
 * news:comp.infosystems.www.servers.unix
 *    -- news scheme for USENET news groups and articles
 * telnet://melvyl.ucop.edu/
 *    -- telnet scheme for interactive services via the TELNET Protocol
 * </pre>
 * Please, notice that there are many modifications from URL(RFC 1738) and
 * relative URL(RFC 1808).
 * <p>
 * <b>The expressions for a URI</b>
 * <p><pre>
 * For escaped URI forms
 *  - URI(char[]) // constructor
 *  - char[] getRawXxx() // method
 *  - String getEscapedXxx() // method
 *  - String toString() // method
 * <p>
 * For unescaped URI forms
 *  - URI(String) // constructor
 *  - String getXXX() // method
 * </pre><p>
 *
 * @author <a href="mailto:jericho@apache.org">Sung-Gu</a>
 * @author <a href="mailto:mbowler@GargoyleSoftware.com">Mike Bowler</a>
 * @version $Revision: 564973 $ $Date: 2002/03/14 15:14:01
 */
public class URI implements Cloneable, Comparable, Serializable {


    // ----------------------------------------------------------- Constructors

    /**
     * Creates an instance for internal use without parsing anything.
     * The constructor body is empty, so every component keeps the value
     * given by its field declaration.
     */
    protected URI() {
    }

    /**
     * Construct a URI from a string with the given charset. The input string can
     * be either in escaped or unescaped form.
131 * 132 * @param s URI character sequence 133 * @param escaped <tt>true</tt> if URI character sequence is in escaped form. 134 * <tt>false</tt> otherwise. 135 * @param charset the charset string to do escape encoding, if required 136 * 137 * @throws URIException If the URI cannot be created. 138 * @throws NullPointerException if input string is <code>null</code> 139 * 140 * @see #getProtocolCharset 141 * 142 * @since 3.0 143 */ 144 public URI(String s, boolean escaped, String charset) 145 throws URIException, NullPointerException { 146 protocolCharset = charset; 147 parseUriReference(s, escaped); 148 } 149 150 /** 151 * Construct a URI from a string with the given charset. The input string can 152 * be either in escaped or unescaped form. 153 * 154 * @param s URI character sequence 155 * @param escaped <tt>true</tt> if URI character sequence is in escaped form. 156 * <tt>false</tt> otherwise. 157 * 158 * @throws URIException If the URI cannot be created. 159 * @throws NullPointerException if input string is <code>null</code> 160 * 161 * @see #getProtocolCharset 162 * 163 * @since 3.0 164 */ 165 public URI(String s, boolean escaped) 166 throws URIException, NullPointerException { 167 parseUriReference(s, escaped); 168 } 169 170 /** 171 * Construct a URI as an escaped form of a character array with the given 172 * charset. 173 * 174 * @param escaped the URI character sequence 175 * @param charset the charset string to do escape encoding 176 * @throws URIException If the URI cannot be created. 177 * @throws NullPointerException if <code>escaped</code> is <code>null</code> 178 * @see #getProtocolCharset 179 * 180 * @deprecated Use #URI(String, boolean, String) 181 */ 182 public URI(char[] escaped, String charset) 183 throws URIException, NullPointerException { 184 protocolCharset = charset; 185 parseUriReference(new String(escaped), true); 186 } 187 188 189 /** 190 * Construct a URI as an escaped form of a character array. 
191 * An URI can be placed within double-quotes or angle brackets like 192 * "http://test.com/" and <http://test.com/> 193 * 194 * @param escaped the URI character sequence 195 * @throws URIException If the URI cannot be created. 196 * @throws NullPointerException if <code>escaped</code> is <code>null</code> 197 * @see #getDefaultProtocolCharset 198 * 199 * @deprecated Use #URI(String, boolean) 200 */ 201 public URI(char[] escaped) 202 throws URIException, NullPointerException { 203 parseUriReference(new String(escaped), true); 204 } 205 206 207 /** 208 * Construct a URI from the given string with the given charset. 209 * 210 * @param original the string to be represented to URI character sequence 211 * It is one of absoluteURI and relativeURI. 212 * @param charset the charset string to do escape encoding 213 * @throws URIException If the URI cannot be created. 214 * @see #getProtocolCharset 215 * 216 * @deprecated Use #URI(String, boolean, String) 217 */ 218 public URI(String original, String charset) throws URIException { 219 protocolCharset = charset; 220 parseUriReference(original, false); 221 } 222 223 224 /** 225 * Construct a URI from the given string. 226 * <p><blockquote><pre> 227 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] 228 * </pre></blockquote><p> 229 * An URI can be placed within double-quotes or angle brackets like 230 * "http://test.com/" and <http://test.com/> 231 * 232 * @param original the string to be represented to URI character sequence 233 * It is one of absoluteURI and relativeURI. 234 * @throws URIException If the URI cannot be created. 235 * @see #getDefaultProtocolCharset 236 * 237 * @deprecated Use #URI(String, boolean) 238 */ 239 public URI(String original) throws URIException { 240 parseUriReference(original, false); 241 } 242 243 244 /** 245 * Construct a general URI from the given components. 
246 * <p><blockquote><pre> 247 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] 248 * absoluteURI = scheme ":" ( hier_part | opaque_part ) 249 * opaque_part = uric_no_slash *uric 250 * </pre></blockquote><p> 251 * It's for absolute URI = <scheme>:<scheme-specific-part># 252 * <fragment>. 253 * 254 * @param scheme the scheme string 255 * @param schemeSpecificPart scheme_specific_part 256 * @param fragment the fragment string 257 * @throws URIException If the URI cannot be created. 258 * @see #getDefaultProtocolCharset 259 */ 260 public URI(String scheme, String schemeSpecificPart, String fragment) 261 throws URIException { 262 263 // validate and contruct the URI character sequence 264 if (scheme == null) { 265 throw new URIException(URIException.PARSING, "scheme required"); 266 } 267 char[] s = scheme.toLowerCase().toCharArray(); 268 if (validate(s, URI.scheme)) { 269 _scheme = s; // is_absoluteURI 270 } else { 271 throw new URIException(URIException.PARSING, "incorrect scheme"); 272 } 273 _opaque = encode(schemeSpecificPart, allowed_opaque_part, 274 getProtocolCharset()); 275 // Set flag 276 _is_opaque_part = true; 277 _fragment = fragment == null ? null : fragment.toCharArray(); 278 setURI(); 279 } 280 281 282 /** 283 * Construct a general URI from the given components. 284 * <p><blockquote><pre> 285 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] 286 * absoluteURI = scheme ":" ( hier_part | opaque_part ) 287 * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ] 288 * hier_part = ( net_path | abs_path ) [ "?" query ] 289 * </pre></blockquote><p> 290 * It's for absolute URI = <scheme>:<path>?<query>#< 291 * fragment> and relative URI = <path>?<query>#<fragment 292 * >. 
293 * 294 * @param scheme the scheme string 295 * @param authority the authority string 296 * @param path the path string 297 * @param query the query string 298 * @param fragment the fragment string 299 * @throws URIException If the new URI cannot be created. 300 * @see #getDefaultProtocolCharset 301 */ 302 public URI(String scheme, String authority, String path, String query, 303 String fragment) throws URIException { 304 305 // validate and contruct the URI character sequence 306 StringBuffer buff = new StringBuffer(); 307 if (scheme != null) { 308 buff.append(scheme); 309 buff.append(':'); 310 } 311 if (authority != null) { 312 buff.append("//"); 313 buff.append(authority); 314 } 315 if (path != null) { // accept empty path 316 if ((scheme != null || authority != null) 317 && !path.startsWith("/")) { 318 throw new URIException(URIException.PARSING, 319 "abs_path requested"); 320 } 321 buff.append(path); 322 } 323 if (query != null) { 324 buff.append('?'); 325 buff.append(query); 326 } 327 if (fragment != null) { 328 buff.append('#'); 329 buff.append(fragment); 330 } 331 parseUriReference(buff.toString(), false); 332 } 333 334 335 /** 336 * Construct a general URI from the given components. 337 * 338 * @param scheme the scheme string 339 * @param userinfo the userinfo string 340 * @param host the host string 341 * @param port the port number 342 * @throws URIException If the new URI cannot be created. 343 * @see #getDefaultProtocolCharset 344 */ 345 public URI(String scheme, String userinfo, String host, int port) 346 throws URIException { 347 348 this(scheme, userinfo, host, port, null, null, null); 349 } 350 351 352 /** 353 * Construct a general URI from the given components. 354 * 355 * @param scheme the scheme string 356 * @param userinfo the userinfo string 357 * @param host the host string 358 * @param port the port number 359 * @param path the path string 360 * @throws URIException If the new URI cannot be created. 
361 * @see #getDefaultProtocolCharset 362 */ 363 public URI(String scheme, String userinfo, String host, int port, 364 String path) throws URIException { 365 366 this(scheme, userinfo, host, port, path, null, null); 367 } 368 369 370 /** 371 * Construct a general URI from the given components. 372 * 373 * @param scheme the scheme string 374 * @param userinfo the userinfo string 375 * @param host the host string 376 * @param port the port number 377 * @param path the path string 378 * @param query the query string 379 * @throws URIException If the new URI cannot be created. 380 * @see #getDefaultProtocolCharset 381 */ 382 public URI(String scheme, String userinfo, String host, int port, 383 String path, String query) throws URIException { 384 385 this(scheme, userinfo, host, port, path, query, null); 386 } 387 388 389 /** 390 * Construct a general URI from the given components. 391 * 392 * @param scheme the scheme string 393 * @param userinfo the userinfo string 394 * @param host the host string 395 * @param port the port number 396 * @param path the path string 397 * @param query the query string 398 * @param fragment the fragment string 399 * @throws URIException If the new URI cannot be created. 400 * @see #getDefaultProtocolCharset 401 */ 402 public URI(String scheme, String userinfo, String host, int port, 403 String path, String query, String fragment) throws URIException { 404 405 this(scheme, (host == null) ? null 406 : ((userinfo != null) ? userinfo + '@' : "") + host 407 + ((port != -1) ? ":" + port : ""), path, query, fragment); 408 } 409 410 411 /** 412 * Construct a general URI from the given components. 413 * 414 * @param scheme the scheme string 415 * @param host the host string 416 * @param path the path string 417 * @param fragment the fragment string 418 * @throws URIException If the new URI cannot be created. 
* @see #getDefaultProtocolCharset
     */
    public URI(String scheme, String host, String path, String fragment)
        throws URIException {

        // same as the five-argument form with a null query; the host is
        // passed in the authority position
        this(scheme, host, path, null, fragment);
    }


    /**
     * Construct a general URI with the given relative URI string. The
     * string is first turned into a URI with the deprecated
     * single-argument string constructor and then resolved against the base.
     *
     * @param base the base URI
     * @param relative the relative URI string
     * @throws URIException If the new URI cannot be created.
     *
     * @deprecated Use {@link #URI(URI, String, boolean)}
     */
    public URI(URI base, String relative) throws URIException {
        this(base, new URI(relative));
    }


    /**
     * Construct a general URI with the given relative URI string. The
     * string is first turned into a URI of its own, honouring the
     * <code>escaped</code> flag, and then resolved against the base.
     *
     * @param base the base URI
     * @param relative the relative URI string
     * @param escaped <tt>true</tt> if URI character sequence is in escaped form.
     *                <tt>false</tt> otherwise.
     *
     * @throws URIException If the new URI cannot be created.
     *
     * @since 3.0
     */
    public URI(URI base, String relative, boolean escaped) throws URIException {
        this(base, new URI(relative, escaped));
    }


    /**
     * Construct a general URI with the given relative URI.
     * <p><blockquote><pre>
     *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
     *   relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
     * </pre></blockquote><p>
     * Resolving Relative References to Absolute Form.
466 * 467 * <strong>Examples of Resolving Relative URI References</strong> 468 * 469 * Within an object with a well-defined base URI of 470 * <p><blockquote><pre> 471 * http://a/b/c/d;p?q 472 * </pre></blockquote><p> 473 * the relative URI would be resolved as follows: 474 * 475 * Normal Examples 476 * 477 * <p><blockquote><pre> 478 * g:h = g:h 479 * g = http://a/b/c/g 480 * ./g = http://a/b/c/g 481 * g/ = http://a/b/c/g/ 482 * /g = http://a/g 483 * //g = http://g 484 * ?y = http://a/b/c/?y 485 * g?y = http://a/b/c/g?y 486 * #s = (current document)#s 487 * g#s = http://a/b/c/g#s 488 * g?y#s = http://a/b/c/g?y#s 489 * ;x = http://a/b/c/;x 490 * g;x = http://a/b/c/g;x 491 * g;x?y#s = http://a/b/c/g;x?y#s 492 * . = http://a/b/c/ 493 * ./ = http://a/b/c/ 494 * .. = http://a/b/ 495 * ../ = http://a/b/ 496 * ../g = http://a/b/g 497 * ../.. = http://a/ 498 * ../../ = http://a/ 499 * ../../g = http://a/g 500 * </pre></blockquote><p> 501 * 502 * Some URI schemes do not allow a hierarchical syntax matching the 503 * <hier_part> syntax, and thus cannot use relative references. 504 * 505 * @param base the base URI 506 * @param relative the relative URI 507 * @throws URIException If the new URI cannot be created. 
508 */ 509 public URI(URI base, URI relative) throws URIException { 510 511 if (base._scheme == null) { 512 throw new URIException(URIException.PARSING, "base URI required"); 513 } 514 if (base._scheme != null) { 515 this._scheme = base._scheme; 516 this._authority = base._authority; 517 this._is_net_path = base._is_net_path; 518 } 519 if (base._is_opaque_part || relative._is_opaque_part) { 520 this._scheme = base._scheme; 521 this._is_opaque_part = base._is_opaque_part 522 || relative._is_opaque_part; 523 this._opaque = relative._opaque; 524 this._fragment = relative._fragment; 525 this.setURI(); 526 return; 527 } 528 boolean schemesEqual = Arrays.equals(base._scheme,relative._scheme); 529 if (relative._scheme != null 530 && (!schemesEqual || relative._authority != null)) { 531 this._scheme = relative._scheme; 532 this._is_net_path = relative._is_net_path; 533 this._authority = relative._authority; 534 if (relative._is_server) { 535 this._is_server = relative._is_server; 536 this._userinfo = relative._userinfo; 537 this._host = relative._host; 538 this._port = relative._port; 539 } else if (relative._is_reg_name) { 540 this._is_reg_name = relative._is_reg_name; 541 } 542 this._is_abs_path = relative._is_abs_path; 543 this._is_rel_path = relative._is_rel_path; 544 this._path = relative._path; 545 } else if (base._authority != null && relative._scheme == null) { 546 this._is_net_path = base._is_net_path; 547 this._authority = base._authority; 548 if (base._is_server) { 549 this._is_server = base._is_server; 550 this._userinfo = base._userinfo; 551 this._host = base._host; 552 this._port = base._port; 553 } else if (base._is_reg_name) { 554 this._is_reg_name = base._is_reg_name; 555 } 556 } 557 if (relative._authority != null) { 558 this._is_net_path = relative._is_net_path; 559 this._authority = relative._authority; 560 if (relative._is_server) { 561 this._is_server = relative._is_server; 562 this._userinfo = relative._userinfo; 563 this._host = relative._host; 564 
this._port = relative._port; 565 } else if (relative._is_reg_name) { 566 this._is_reg_name = relative._is_reg_name; 567 } 568 this._is_abs_path = relative._is_abs_path; 569 this._is_rel_path = relative._is_rel_path; 570 this._path = relative._path; 571 } 572 // resolve the path and query if necessary 573 if (relative._authority == null 574 && (relative._scheme == null || schemesEqual)) { 575 if ((relative._path == null || relative._path.length == 0) 576 && relative._query == null) { 577 // handle a reference to the current document, see RFC 2396 578 // section 5.2 step 2 579 this._path = base._path; 580 this._query = base._query; 581 } else { 582 this._path = resolvePath(base._path, relative._path); 583 } 584 } 585 // base._query removed 586 if (relative._query != null) { 587 this._query = relative._query; 588 } 589 // base._fragment removed 590 if (relative._fragment != null) { 591 this._fragment = relative._fragment; 592 } 593 this.setURI(); 594 // reparse the newly built URI, this will ensure that all flags are set correctly. 595 // TODO there must be a better way to do this 596 parseUriReference(new String(_uri), true); 597 } 598 599 // --------------------------------------------------- Instance Variables 600 601 /** Version ID for serialization */ 602 static final long serialVersionUID = 604752400577948726L; 603 604 605 /** 606 * Cache the hash code for this URI. 607 */ 608 protected int hash = 0; 609 610 611 /** 612 * This Uniform Resource Identifier (URI). 613 * The URI is always in an "escaped" form, since escaping or unescaping 614 * a completed URI might change its semantics. 615 */ 616 protected char[] _uri = null; 617 618 619 /** 620 * The charset of the protocol used by this URI instance. 621 */ 622 protected String protocolCharset = null; 623 624 625 /** 626 * The default charset of the protocol. RFC 2277, 2396 627 */ 628 protected static String defaultProtocolCharset = "UTF-8"; 629 630 631 /** 632 * The default charset of the document. 
RFC 2277, 2396 633 * The platform's charset is used for the document by default. 634 */ 635 protected static String defaultDocumentCharset = null; 636 protected static String defaultDocumentCharsetByLocale = null; 637 protected static String defaultDocumentCharsetByPlatform = null; 638 // Static initializer for defaultDocumentCharset 639 static { 640 Locale locale = Locale.getDefault(); 641 // in order to support backward compatiblity 642 if (locale != null) { 643 defaultDocumentCharsetByLocale = 644 LocaleToCharsetMap.getCharset(locale); 645 // set the default document charset 646 defaultDocumentCharset = defaultDocumentCharsetByLocale; 647 } 648 // in order to support platform encoding 649 try { 650 defaultDocumentCharsetByPlatform = System.getProperty("file.encoding"); 651 } catch (SecurityException ignore) { 652 } 653 if (defaultDocumentCharset == null) { 654 // set the default document charset 655 defaultDocumentCharset = defaultDocumentCharsetByPlatform; 656 } 657 } 658 659 660 /** 661 * The scheme. 662 */ 663 protected char[] _scheme = null; 664 665 666 /** 667 * The opaque. 668 */ 669 protected char[] _opaque = null; 670 671 672 /** 673 * The authority. 674 */ 675 protected char[] _authority = null; 676 677 678 /** 679 * The userinfo. 680 */ 681 protected char[] _userinfo = null; 682 683 684 /** 685 * The host. 686 */ 687 protected char[] _host = null; 688 689 690 /** 691 * The port. 692 */ 693 protected int _port = -1; 694 695 696 /** 697 * The path. 698 */ 699 protected char[] _path = null; 700 701 702 /** 703 * The query. 704 */ 705 protected char[] _query = null; 706 707 708 /** 709 * The fragment. 710 */ 711 protected char[] _fragment = null; 712 713 714 /** 715 * The root path. 
716 */ 717 protected static final char[] rootPath = { '/' }; 718 719 // ---------------------- Generous characters for each component validation 720 721 /** 722 * The percent "%" character always has the reserved purpose of being the 723 * escape indicator, it must be escaped as "%25" in order to be used as 724 * data within a URI. 725 */ 726 protected static final BitSet percent = new BitSet(256); 727 // Static initializer for percent 728 static { 729 percent.set('%'); 730 } 731 732 733 /** 734 * BitSet for digit. 735 * <p><blockquote><pre> 736 * digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | 737 * "8" | "9" 738 * </pre></blockquote><p> 739 */ 740 protected static final BitSet digit = new BitSet(256); 741 // Static initializer for digit 742 static { 743 for (int i = '0'; i <= '9'; i++) { 744 digit.set(i); 745 } 746 } 747 748 749 /** 750 * BitSet for alpha. 751 * <p><blockquote><pre> 752 * alpha = lowalpha | upalpha 753 * </pre></blockquote><p> 754 */ 755 protected static final BitSet alpha = new BitSet(256); 756 // Static initializer for alpha 757 static { 758 for (int i = 'a'; i <= 'z'; i++) { 759 alpha.set(i); 760 } 761 for (int i = 'A'; i <= 'Z'; i++) { 762 alpha.set(i); 763 } 764 } 765 766 767 /** 768 * BitSet for alphanum (join of alpha & digit). 769 * <p><blockquote><pre> 770 * alphanum = alpha | digit 771 * </pre></blockquote><p> 772 */ 773 protected static final BitSet alphanum = new BitSet(256); 774 // Static initializer for alphanum 775 static { 776 alphanum.or(alpha); 777 alphanum.or(digit); 778 } 779 780 781 /** 782 * BitSet for hex. 
783 * <p><blockquote><pre> 784 * hex = digit | "A" | "B" | "C" | "D" | "E" | "F" | 785 * "a" | "b" | "c" | "d" | "e" | "f" 786 * </pre></blockquote><p> 787 */ 788 protected static final BitSet hex = new BitSet(256); 789 // Static initializer for hex 790 static { 791 hex.or(digit); 792 for (int i = 'a'; i <= 'f'; i++) { 793 hex.set(i); 794 } 795 for (int i = 'A'; i <= 'F'; i++) { 796 hex.set(i); 797 } 798 } 799 800 801 /** 802 * BitSet for escaped. 803 * <p><blockquote><pre> 804 * escaped = "%" hex hex 805 * </pre></blockquote><p> 806 */ 807 protected static final BitSet escaped = new BitSet(256); 808 // Static initializer for escaped 809 static { 810 escaped.or(percent); 811 escaped.or(hex); 812 } 813 814 815 /** 816 * BitSet for mark. 817 * <p><blockquote><pre> 818 * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | 819 * "(" | ")" 820 * </pre></blockquote><p> 821 */ 822 protected static final BitSet mark = new BitSet(256); 823 // Static initializer for mark 824 static { 825 mark.set('-'); 826 mark.set('_'); 827 mark.set('.'); 828 mark.set('!'); 829 mark.set('~'); 830 mark.set('*'); 831 mark.set('\''); 832 mark.set('('); 833 mark.set(')'); 834 } 835 836 837 /** 838 * Data characters that are allowed in a URI but do not have a reserved 839 * purpose are called unreserved. 840 * <p><blockquote><pre> 841 * unreserved = alphanum | mark 842 * </pre></blockquote><p> 843 */ 844 protected static final BitSet unreserved = new BitSet(256); 845 // Static initializer for unreserved 846 static { 847 unreserved.or(alphanum); 848 unreserved.or(mark); 849 } 850 851 852 /** 853 * BitSet for reserved. 854 * <p><blockquote><pre> 855 * reserved = ";" | "/" | "?" 
| ":" | "@" | "&" | "=" | "+" | 856 * "$" | "," 857 * </pre></blockquote><p> 858 */ 859 protected static final BitSet reserved = new BitSet(256); 860 // Static initializer for reserved 861 static { 862 reserved.set(';'); 863 reserved.set('/'); 864 reserved.set('?'); 865 reserved.set(':'); 866 reserved.set('@'); 867 reserved.set('&'); 868 reserved.set('='); 869 reserved.set('+'); 870 reserved.set('$'); 871 reserved.set(','); 872 } 873 874 875 /** 876 * BitSet for uric. 877 * <p><blockquote><pre> 878 * uric = reserved | unreserved | escaped 879 * </pre></blockquote><p> 880 */ 881 protected static final BitSet uric = new BitSet(256); 882 // Static initializer for uric 883 static { 884 uric.or(reserved); 885 uric.or(unreserved); 886 uric.or(escaped); 887 } 888 889 890 /** 891 * BitSet for fragment (alias for uric). 892 * <p><blockquote><pre> 893 * fragment = *uric 894 * </pre></blockquote><p> 895 */ 896 protected static final BitSet fragment = uric; 897 898 899 /** 900 * BitSet for query (alias for uric). 901 * <p><blockquote><pre> 902 * query = *uric 903 * </pre></blockquote><p> 904 */ 905 protected static final BitSet query = uric; 906 907 908 /** 909 * BitSet for pchar. 910 * <p><blockquote><pre> 911 * pchar = unreserved | escaped | 912 * ":" | "@" | "&" | "=" | "+" | "$" | "," 913 * </pre></blockquote><p> 914 */ 915 protected static final BitSet pchar = new BitSet(256); 916 // Static initializer for pchar 917 static { 918 pchar.or(unreserved); 919 pchar.or(escaped); 920 pchar.set(':'); 921 pchar.set('@'); 922 pchar.set('&'); 923 pchar.set('='); 924 pchar.set('+'); 925 pchar.set('$'); 926 pchar.set(','); 927 } 928 929 930 /** 931 * BitSet for param (alias for pchar). 932 * <p><blockquote><pre> 933 * param = *pchar 934 * </pre></blockquote><p> 935 */ 936 protected static final BitSet param = pchar; 937 938 939 /** 940 * BitSet for segment. 
941 * <p><blockquote><pre> 942 * segment = *pchar *( ";" param ) 943 * </pre></blockquote><p> 944 */ 945 protected static final BitSet segment = new BitSet(256); 946 // Static initializer for segment 947 static { 948 segment.or(pchar); 949 segment.set(';'); 950 segment.or(param); 951 } 952 953 954 /** 955 * BitSet for path segments. 956 * <p><blockquote><pre> 957 * path_segments = segment *( "/" segment ) 958 * </pre></blockquote><p> 959 */ 960 protected static final BitSet path_segments = new BitSet(256); 961 // Static initializer for path_segments 962 static { 963 path_segments.set('/'); 964 path_segments.or(segment); 965 } 966 967 968 /** 969 * URI absolute path. 970 * <p><blockquote><pre> 971 * abs_path = "/" path_segments 972 * </pre></blockquote><p> 973 */ 974 protected static final BitSet abs_path = new BitSet(256); 975 // Static initializer for abs_path 976 static { 977 abs_path.set('/'); 978 abs_path.or(path_segments); 979 } 980 981 982 /** 983 * URI bitset for encoding typical non-slash characters. 984 * <p><blockquote><pre> 985 * uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" | 986 * "&" | "=" | "+" | "$" | "," 987 * </pre></blockquote><p> 988 */ 989 protected static final BitSet uric_no_slash = new BitSet(256); 990 // Static initializer for uric_no_slash 991 static { 992 uric_no_slash.or(unreserved); 993 uric_no_slash.or(escaped); 994 uric_no_slash.set(';'); 995 uric_no_slash.set('?'); 996 uric_no_slash.set(';'); 997 uric_no_slash.set('@'); 998 uric_no_slash.set('&'); 999 uric_no_slash.set('='); 1000 uric_no_slash.set('+'); 1001 uric_no_slash.set('$'); 1002 uric_no_slash.set(','); 1003 } 1004 1005 1006 /** 1007 * URI bitset that combines uric_no_slash and uric. 1008 * <p><blockquote><pre> 1009 * opaque_part = uric_no_slash *uric 1010 * </pre></blockquote><p> 1011 */ 1012 protected static final BitSet opaque_part = new BitSet(256); 1013 // Static initializer for opaque_part 1014 static { 1015 // it's generous. 
because first character must not include a slash 1016 opaque_part.or(uric_no_slash); 1017 opaque_part.or(uric); 1018 } 1019 1020 1021 /** 1022 * URI bitset that combines absolute path and opaque part. 1023 * <p><blockquote><pre> 1024 * path = [ abs_path | opaque_part ] 1025 * </pre></blockquote><p> 1026 */ 1027 protected static final BitSet path = new BitSet(256); 1028 // Static initializer for path 1029 static { 1030 path.or(abs_path); 1031 path.or(opaque_part); 1032 } 1033 1034 1035 /** 1036 * Port, a logical alias for digit. 1037 */ 1038 protected static final BitSet port = digit; 1039 1040 1041 /** 1042 * Bitset that combines digit and dot fo IPv$address. 1043 * <p><blockquote><pre> 1044 * IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit 1045 * </pre></blockquote><p> 1046 */ 1047 protected static final BitSet IPv4address = new BitSet(256); 1048 // Static initializer for IPv4address 1049 static { 1050 IPv4address.or(digit); 1051 IPv4address.set('.'); 1052 } 1053 1054 1055 /** 1056 * RFC 2373. 1057 * <p><blockquote><pre> 1058 * IPv6address = hexpart [ ":" IPv4address ] 1059 * </pre></blockquote><p> 1060 */ 1061 protected static final BitSet IPv6address = new BitSet(256); 1062 // Static initializer for IPv6address reference 1063 static { 1064 IPv6address.or(hex); // hexpart 1065 IPv6address.set(':'); 1066 IPv6address.or(IPv4address); 1067 } 1068 1069 1070 /** 1071 * RFC 2732, 2373. 1072 * <p><blockquote><pre> 1073 * IPv6reference = "[" IPv6address "]" 1074 * </pre></blockquote><p> 1075 */ 1076 protected static final BitSet IPv6reference = new BitSet(256); 1077 // Static initializer for IPv6reference 1078 static { 1079 IPv6reference.set('['); 1080 IPv6reference.or(IPv6address); 1081 IPv6reference.set(']'); 1082 } 1083 1084 1085 /** 1086 * BitSet for toplabel. 
1087 * <p><blockquote><pre> 1088 * toplabel = alpha | alpha *( alphanum | "-" ) alphanum 1089 * </pre></blockquote><p> 1090 */ 1091 protected static final BitSet toplabel = new BitSet(256); 1092 // Static initializer for toplabel 1093 static { 1094 toplabel.or(alphanum); 1095 toplabel.set('-'); 1096 } 1097 1098 1099 /** 1100 * BitSet for domainlabel. 1101 * <p><blockquote><pre> 1102 * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum 1103 * </pre></blockquote><p> 1104 */ 1105 protected static final BitSet domainlabel = toplabel; 1106 1107 1108 /** 1109 * BitSet for hostname. 1110 * <p><blockquote><pre> 1111 * hostname = *( domainlabel "." ) toplabel [ "." ] 1112 * </pre></blockquote><p> 1113 */ 1114 protected static final BitSet hostname = new BitSet(256); 1115 // Static initializer for hostname 1116 static { 1117 hostname.or(toplabel); 1118 // hostname.or(domainlabel); 1119 hostname.set('.'); 1120 } 1121 1122 1123 /** 1124 * BitSet for host. 1125 * <p><blockquote><pre> 1126 * host = hostname | IPv4address | IPv6reference 1127 * </pre></blockquote><p> 1128 */ 1129 protected static final BitSet host = new BitSet(256); 1130 // Static initializer for host 1131 static { 1132 host.or(hostname); 1133 // host.or(IPv4address); 1134 host.or(IPv6reference); // IPv4address 1135 } 1136 1137 1138 /** 1139 * BitSet for hostport. 1140 * <p><blockquote><pre> 1141 * hostport = host [ ":" port ] 1142 * </pre></blockquote><p> 1143 */ 1144 protected static final BitSet hostport = new BitSet(256); 1145 // Static initializer for hostport 1146 static { 1147 hostport.or(host); 1148 hostport.set(':'); 1149 hostport.or(port); 1150 } 1151 1152 1153 /** 1154 * Bitset for userinfo. 
/**
 * Characters that may appear in the userinfo component.
 * <p><blockquote><pre>
 * userinfo      = *( unreserved | escaped |
 *                    ";" | ":" | "&amp;" | "=" | "+" | "$" | "," )
 * </pre></blockquote><p>
 */
protected static final BitSet userinfo = new BitSet(256);
// the explicitly listed punctuation, then the two character classes
static {
    userinfo.set(',');
    userinfo.set('$');
    userinfo.set('+');
    userinfo.set('=');
    userinfo.set('&');
    userinfo.set(':');
    userinfo.set(';');
    userinfo.or(escaped);
    userinfo.or(unreserved);
}


/**
 * Characters usable inside one part of the userinfo component, such as the
 * user name or the password: the userinfo set with the characters that act
 * as delimiters within the authority removed.
 */
public static final BitSet within_userinfo = new BitSet(256);
// start from userinfo, then drop the delimiters (removal must come last)
static {
    within_userinfo.or(userinfo);
    within_userinfo.clear('/');
    within_userinfo.clear('?');
    within_userinfo.clear('@');
    within_userinfo.clear(':');
    within_userinfo.clear(';'); // reserved within authority
}


/**
 * Characters that may appear in a server-based naming authority.
 * <p><blockquote><pre>
 * server        = [ [ userinfo "@" ] hostport ]
 * </pre></blockquote><p>
 */
protected static final BitSet server = new BitSet(256);
// hostport, the '@' separator and the userinfo characters
static {
    server.or(hostport);
    server.set('@');
    server.or(userinfo);
}


/**
 * Characters that may appear in a registry-based naming authority.
 * <p><blockquote><pre>
 * reg_name      = 1*( unreserved | escaped | "$" | "," |
 *                     ";" | ":" | "@" | "&amp;" | "=" | "+" )
 * </pre></blockquote><p>
 */
protected static final BitSet reg_name = new BitSet(256);
// the explicitly listed punctuation, then the two character classes
static {
    reg_name.set('+');
    reg_name.set('=');
    reg_name.set('&');
    reg_name.set('@');
    reg_name.set(':');
    reg_name.set(';');
    reg_name.set(',');
    reg_name.set('$');
    reg_name.or(escaped);
    reg_name.or(unreserved);
}
/**
 * Characters that may appear in the authority component, whichever of the
 * two naming styles is used.
 * <p><blockquote><pre>
 * authority     = server | reg_name
 * </pre></blockquote><p>
 */
protected static final BitSet authority = new BitSet(256);
// union of both authority alternatives
static {
    authority.or(reg_name);
    authority.or(server);
}


/**
 * Characters that may appear in a scheme name.
 * <p><blockquote><pre>
 * scheme        = alpha *( alpha | digit | "+" | "-" | "." )
 * </pre></blockquote><p>
 */
protected static final BitSet scheme = new BitSet(256);
// the three punctuation marks, then digits and letters
static {
    scheme.set('.');
    scheme.set('-');
    scheme.set('+');
    scheme.or(digit);
    scheme.or(alpha);
}


/**
 * Characters that may appear in the first segment of a relative path.
 * <p><blockquote><pre>
 * rel_segment   = 1*( unreserved | escaped |
 *                     ";" | "@" | "&amp;" | "=" | "+" | "$" | "," )
 * </pre></blockquote><p>
 */
protected static final BitSet rel_segment = new BitSet(256);
// the explicitly listed punctuation, then the two character classes
static {
    rel_segment.set(',');
    rel_segment.set('$');
    rel_segment.set('+');
    rel_segment.set('=');
    rel_segment.set('&');
    rel_segment.set('@');
    rel_segment.set(';');
    rel_segment.or(escaped);
    rel_segment.or(unreserved);
}


/**
 * Characters that may appear in a relative path.
 * <p><blockquote><pre>
 * rel_path      = rel_segment [ abs_path ]
 * </pre></blockquote><p>
 */
protected static final BitSet rel_path = new BitSet(256);
// optional absolute-path tail plus the leading segment
static {
    rel_path.or(abs_path);
    rel_path.or(rel_segment);
}
/**
 * Characters that may appear in a network path.
 * <p><blockquote><pre>
 * net_path      = "//" authority [ abs_path ]
 * </pre></blockquote><p>
 */
protected static final BitSet net_path = new BitSet(256);
// optional absolute path, the authority, and the '/' of the "//" prefix
static {
    net_path.or(abs_path);
    net_path.or(authority);
    net_path.set('/');
}


/**
 * Characters that may appear in the hierarchical part of an absolute URI.
 * <p><blockquote><pre>
 * hier_part     = ( net_path | abs_path ) [ "?" query ]
 * </pre></blockquote><p>
 */
protected static final BitSet hier_part = new BitSet(256);
// '?' is not set explicitly; the original author notes that it is
// already included by the sets combined here
static {
    hier_part.or(query);
    hier_part.or(abs_path);
    hier_part.or(net_path);
}


/**
 * Characters that may appear in a relative URI.
 * <p><blockquote><pre>
 * relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
 * </pre></blockquote><p>
 */
protected static final BitSet relativeURI = new BitSet(256);
// '?' is not set explicitly; the original author notes that it is
// already included by the sets combined here
static {
    relativeURI.or(query);
    relativeURI.or(rel_path);
    relativeURI.or(abs_path);
    relativeURI.or(net_path);
}


/**
 * Characters that may appear in an absolute URI.
 * <p><blockquote><pre>
 * absoluteURI   = scheme ":" ( hier_part | opaque_part )
 * </pre></blockquote><p>
 */
protected static final BitSet absoluteURI = new BitSet(256);
// both scheme-specific alternatives, the ':' separator and the scheme
static {
    absoluteURI.or(opaque_part);
    absoluteURI.or(hier_part);
    absoluteURI.set(':');
    absoluteURI.or(scheme);
}
/**
 * Characters that may appear in a URI reference.
 * <p><blockquote><pre>
 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 * </pre></blockquote><p>
 */
protected static final BitSet URI_reference = new BitSet(256);
// fragment, the '#' separator, and both URI forms
static {
    URI_reference.or(fragment);
    URI_reference.set('#');
    URI_reference.or(relativeURI);
    URI_reference.or(absoluteURI);
}

// ---------------------------- Characters disallowed within the URI syntax
// The excluded US-ASCII characters are the control characters, the space,
// the delimiters and the "unwise" characters.

/**
 * The control characters: 0x00 through 0x1F, and 0x7F.
 */
public static final BitSet control = new BitSet(256);
// DEL first, then the range 0x1F down to 0x00
static {
    control.set(0x7F);
    for (int code = 0x1F; code >= 0; code--) {
        control.set(code);
    }
}

/**
 * The space character (0x20).
 */
public static final BitSet space = new BitSet(256);
// a single bit
static {
    space.set(0x20);
}


/**
 * The delimiter characters: angle brackets, hash, percent and double quote.
 */
public static final BitSet delims = new BitSet(256);
// five delimiter characters
static {
    delims.set('"');
    delims.set('%');
    delims.set('#');
    delims.set('>');
    delims.set('<');
}


/**
 * The "unwise" characters: braces, vertical bar, backslash, caret,
 * square brackets and backquote.
 */
public static final BitSet unwise = new BitSet(256);
// eight unwise characters
static {
    unwise.set('`');
    unwise.set(']');
    unwise.set('[');
    unwise.set('^');
    unwise.set('\\');
    unwise.set('|');
    unwise.set('}');
    unwise.set('{');
}


/**
 * Characters that are disallowed in a rel_path before escaping: every uric
 * character that is not a rel_path character.
 */
public static final BitSet disallowed_rel_path = new BitSet(256);
// uric minus rel_path; the subtraction has to come second
static {
    disallowed_rel_path.or(uric);
    disallowed_rel_path.andNot(rel_path);
}
/**
 * Characters that are disallowed in an opaque_part before escaping: every
 * uric character that is not an opaque_part character.
 */
public static final BitSet disallowed_opaque_part = new BitSet(256);
// Static initializer for disallowed_opaque_part
// (uric minus opaque_part; the subtraction has to come second)
static {
    disallowed_opaque_part.or(uric);
    disallowed_opaque_part.andNot(opaque_part);
}

// ----------------------- Characters allowed within and for each component
// Each allowed_* set below is the matching grammar set with '%' removed.
// These sets are handed to encode() as the characters left unescaped.

/**
 * Those characters that are allowed for the authority component:
 * the authority set without the percent sign.
 */
public static final BitSet allowed_authority = new BitSet(256);
// Static initializer for allowed_authority
static {
    allowed_authority.or(authority);
    allowed_authority.clear('%');
}


/**
 * Those characters that are allowed for the opaque_part:
 * the opaque_part set without the percent sign.
 */
public static final BitSet allowed_opaque_part = new BitSet(256);
// Static initializer for allowed_opaque_part
static {
    allowed_opaque_part.or(opaque_part);
    allowed_opaque_part.clear('%');
}


/**
 * Those characters that are allowed for the reg_name:
 * the reg_name set without the percent sign.
 */
public static final BitSet allowed_reg_name = new BitSet(256);
// Static initializer for allowed_reg_name
static {
    allowed_reg_name.or(reg_name);
    // allowed_reg_name.andNot(percent);
    allowed_reg_name.clear('%');
}


/**
 * Those characters that are allowed for the userinfo component:
 * the userinfo set without the percent sign.
 */
public static final BitSet allowed_userinfo = new BitSet(256);
// Static initializer for allowed_userinfo
static {
    allowed_userinfo.or(userinfo);
    // allowed_userinfo.andNot(percent);
    allowed_userinfo.clear('%');
}
/**
 * Characters allowed inside one part of the userinfo component: the
 * within_userinfo set without the percent sign.
 */
public static final BitSet allowed_within_userinfo = new BitSet(256);
// copy within_userinfo, then drop '%'
static {
    allowed_within_userinfo.or(within_userinfo);
    allowed_within_userinfo.clear('%');
}


/**
 * Characters allowed for the IPv6reference component: the IPv6reference
 * set with the enclosing brackets '[' and ']' excluded.
 */
public static final BitSet allowed_IPv6reference = new BitSet(256);
// copy IPv6reference, then drop the two brackets
static {
    allowed_IPv6reference.or(IPv6reference);
    allowed_IPv6reference.clear(']');
    allowed_IPv6reference.clear('[');
}


/**
 * Characters allowed for the host component: the hostname characters plus
 * the IPv6 reference characters, again without '[' and ']'.
 */
public static final BitSet allowed_host = new BitSet(256);
// union of the bracket-free IPv6 set and the hostname set
static {
    allowed_host.or(allowed_IPv6reference);
    allowed_host.or(hostname);
}


/**
 * Characters allowed within a part of the authority component: the server
 * and reg_name characters with the delimiters ';', ':', '@', '?' and '/'
 * removed.
 */
public static final BitSet allowed_within_authority = new BitSet(256);
// build the union first, then remove the delimiters
static {
    allowed_within_authority.or(reg_name);
    allowed_within_authority.or(server);
    allowed_within_authority.clear('/');
    allowed_within_authority.clear('?');
    allowed_within_authority.clear('@');
    allowed_within_authority.clear(':');
    allowed_within_authority.clear(';');
}
/**
 * Characters allowed for the abs_path: the abs_path set with the members
 * of the percent set and the plus sign removed. The slash needs no
 * explicit set(); the original author notes it is already included.
 */
public static final BitSet allowed_abs_path = new BitSet(256);
// copy abs_path, then take out '+' and the percent set
static {
    allowed_abs_path.or(abs_path);
    allowed_abs_path.clear('+');
    allowed_abs_path.andNot(percent);
}


/**
 * Characters allowed for the rel_path: the rel_path set without the
 * percent sign and the plus sign.
 */
public static final BitSet allowed_rel_path = new BitSet(256);
// copy rel_path, then take out '+' and '%'
static {
    allowed_rel_path.or(rel_path);
    allowed_rel_path.clear('+');
    allowed_rel_path.clear('%');
}


/**
 * Characters allowed within a part of the path: the abs_path set without
 * the characters '/', ';', '=' and '?'.
 */
public static final BitSet allowed_within_path = new BitSet(256);
// copy abs_path, then take out the four delimiters
static {
    allowed_within_path.or(abs_path);
    allowed_within_path.clear('?');
    allowed_within_path.clear('=');
    allowed_within_path.clear(';');
    allowed_within_path.clear('/');
}


/**
 * Characters allowed for the query component: the uric set without the
 * percent sign.
 */
public static final BitSet allowed_query = new BitSet(256);
// copy uric, then drop '%'
static {
    allowed_query.or(uric);
    allowed_query.clear('%');
}


/**
 * Characters allowed within a part of the query component: the
 * allowed_query set with every reserved character removed.
 */
public static final BitSet allowed_within_query = new BitSet(256);
// copy allowed_query, then subtract the reserved set
static {
    allowed_within_query.or(allowed_query);
    allowed_within_query.andNot(reserved);
}
/**
 * Those characters that are allowed for the fragment component: the uric
 * set without the percent sign. parseUriReference passes this set to
 * encode() when the fragment of an unescaped URI is encoded.
 */
public static final BitSet allowed_fragment = new BitSet(256);
// Static initializer for allowed_fragment
static {
    allowed_fragment.or(uric);
    allowed_fragment.clear('%');
}

// ------------------------------------------- Flags for this URI-reference

// These flags record which grammar alternatives matched while parsing.
// They are written by parseUriReference and parseAuthority.

// URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
// absoluteURI   = scheme ":" ( hier_part | opaque_part )
// Set by parseUriReference when the character after the optional scheme
// is '/'.
protected boolean _is_hier_part;
// Set by parseUriReference when a non-absolute path fails the rel_path
// check but passes the opaque_part check; setURI then emits _opaque
// instead of _path.
protected boolean _is_opaque_part;
// relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
// hier_part     = ( net_path | abs_path ) [ "?" query ]
// Set by parseUriReference after an authority following "//" was parsed;
// setURI emits "//" and the authority only when this is true.
protected boolean _is_net_path;
// Set by parseUriReference when the path starts with '/'.
protected boolean _is_abs_path;
// Set by parseUriReference when a non-absolute path passes the rel_path
// check.
protected boolean _is_rel_path;
// net_path      = "//" authority [ abs_path ]
// authority     = server | reg_name
// Set by parseAuthority when the host part is neither a valid IPv4address
// nor a valid hostname (and no IPv6 reference was found).
protected boolean _is_reg_name;
// Set by parseAuthority whenever the authority is not a reg_name.
protected boolean _is_server; // = _has_server
// server        = [ [ userinfo "@" ] hostport ]
// host          = hostname | IPv4address | IPv6reference
// Exactly one of the next three is set by parseAuthority for a server
// authority, depending on which host form was recognised.
protected boolean _is_hostname;
protected boolean _is_IPv4address;
protected boolean _is_IPv6reference;

// ------------------------------------------ Character and escape encoding
1661 * <p> 1662 * Conversion from the local filesystem character set to UTF-8 will 1663 * normally involve a two step process. First convert the local character 1664 * set to the UCS; then convert the UCS to UTF-8. 1665 * The first step in the process can be performed by maintaining a mapping 1666 * table that includes the local character set code and the corresponding 1667 * UCS code. 1668 * The next step is to convert the UCS character code to the UTF-8 encoding. 1669 * <p> 1670 * Mapping between vendor codepages can be done in a very similar manner 1671 * as described above. 1672 * <p> 1673 * The only time escape encodings can allowedly be made is when a URI is 1674 * being created from its component parts. The escape and validate methods 1675 * are internally performed within this method. 1676 * 1677 * @param original the original character sequence 1678 * @param allowed those characters that are allowed within a component 1679 * @param charset the protocol charset 1680 * @return URI character sequence 1681 * @throws URIException null component or unsupported character encoding 1682 */ 1683 1684 protected static char[] encode(String original, BitSet allowed, 1685 String charset) throws URIException { 1686 if (original == null) { 1687 throw new IllegalArgumentException("Original string may not be null"); 1688 } 1689 if (allowed == null) { 1690 throw new IllegalArgumentException("Allowed bitset may not be null"); 1691 } 1692 byte[] rawdata = URLCodec.encodeUrl(allowed, EncodingUtil.getBytes(original, charset)); 1693 return EncodingUtil.getAsciiString(rawdata).toCharArray(); 1694 } 1695 1696 /** 1697 * Decodes URI encoded string. 
1698 * 1699 * This is a two mapping, one from URI characters to octets, and 1700 * subsequently a second from octets to original characters: 1701 * <p><blockquote><pre> 1702 * URI character sequence->octet sequence->original character sequence 1703 * </pre></blockquote><p> 1704 * 1705 * A URI must be separated into its components before the escaped 1706 * characters within those components can be allowedly decoded. 1707 * <p> 1708 * Notice that there is a chance that URI characters that are non UTF-8 1709 * may be parsed as valid UTF-8. A recent non-scientific analysis found 1710 * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a 1711 * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0% 1712 * false reading. 1713 * <p> 1714 * The percent "%" character always has the reserved purpose of being 1715 * the escape indicator, it must be escaped as "%25" in order to be used 1716 * as data within a URI. 1717 * <p> 1718 * The unescape method is internally performed within this method. 1719 * 1720 * @param component the URI character sequence 1721 * @param charset the protocol charset 1722 * @return original character sequence 1723 * @throws URIException incomplete trailing escape pattern or unsupported 1724 * character encoding 1725 */ 1726 protected static String decode(char[] component, String charset) 1727 throws URIException { 1728 if (component == null) { 1729 throw new IllegalArgumentException("Component array of chars may not be null"); 1730 } 1731 return decode(new String(component), charset); 1732 } 1733 1734 /** 1735 * Decodes URI encoded string. 
1736 * 1737 * This is a two mapping, one from URI characters to octets, and 1738 * subsequently a second from octets to original characters: 1739 * <p><blockquote><pre> 1740 * URI character sequence->octet sequence->original character sequence 1741 * </pre></blockquote><p> 1742 * 1743 * A URI must be separated into its components before the escaped 1744 * characters within those components can be allowedly decoded. 1745 * <p> 1746 * Notice that there is a chance that URI characters that are non UTF-8 1747 * may be parsed as valid UTF-8. A recent non-scientific analysis found 1748 * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a 1749 * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0% 1750 * false reading. 1751 * <p> 1752 * The percent "%" character always has the reserved purpose of being 1753 * the escape indicator, it must be escaped as "%25" in order to be used 1754 * as data within a URI. 1755 * <p> 1756 * The unescape method is internally performed within this method. 1757 * 1758 * @param component the URI character sequence 1759 * @param charset the protocol charset 1760 * @return original character sequence 1761 * @throws URIException incomplete trailing escape pattern or unsupported 1762 * character encoding 1763 * 1764 * @since 3.0 1765 */ 1766 protected static String decode(String component, String charset) 1767 throws URIException { 1768 if (component == null) { 1769 throw new IllegalArgumentException("Component array of chars may not be null"); 1770 } 1771 byte[] rawdata = null; 1772 try { 1773 rawdata = URLCodec.decodeUrl(EncodingUtil.getAsciiBytes(component)); 1774 } catch (DecoderException e) { 1775 throw new URIException(e.getMessage()); 1776 } 1777 return EncodingUtil.getString(rawdata, charset); 1778 } 1779 /** 1780 * Pre-validate the unescaped URI string within a specific component. 
1781 * 1782 * @param component the component string within the component 1783 * @param disallowed those characters disallowed within the component 1784 * @return if true, it doesn't have the disallowed characters 1785 * if false, the component is undefined or an incorrect one 1786 */ 1787 protected boolean prevalidate(String component, BitSet disallowed) { 1788 // prevalidate the given component by disallowed characters 1789 if (component == null) { 1790 return false; // undefined 1791 } 1792 char[] target = component.toCharArray(); 1793 for (int i = 0; i < target.length; i++) { 1794 if (disallowed.get(target[i])) { 1795 return false; 1796 } 1797 } 1798 return true; 1799 } 1800 1801 1802 /** 1803 * Validate the URI characters within a specific component. 1804 * The component must be performed after escape encoding. Or it doesn't 1805 * include escaped characters. 1806 * 1807 * @param component the characters sequence within the component 1808 * @param generous those characters that are allowed within a component 1809 * @return if true, it's the correct URI character sequence 1810 */ 1811 protected boolean validate(char[] component, BitSet generous) { 1812 // validate each component by generous characters 1813 return validate(component, 0, -1, generous); 1814 } 1815 1816 1817 /** 1818 * Validate the URI characters within a specific component. 1819 * The component must be performed after escape encoding. Or it doesn't 1820 * include escaped characters. 1821 * <p> 1822 * It's not that much strict, generous. The strict validation might be 1823 * performed before being called this method. 
1824 * 1825 * @param component the characters sequence within the component 1826 * @param soffset the starting offset of the given component 1827 * @param eoffset the ending offset of the given component 1828 * if -1, it means the length of the component 1829 * @param generous those characters that are allowed within a component 1830 * @return if true, it's the correct URI character sequence 1831 */ 1832 protected boolean validate(char[] component, int soffset, int eoffset, 1833 BitSet generous) { 1834 // validate each component by generous characters 1835 if (eoffset == -1) { 1836 eoffset = component.length - 1; 1837 } 1838 for (int i = soffset; i <= eoffset; i++) { 1839 if (!generous.get(component[i])) { 1840 return false; 1841 } 1842 } 1843 return true; 1844 } 1845 1846 1847 /** 1848 * In order to avoid any possilbity of conflict with non-ASCII characters, 1849 * Parse a URI reference as a <code>String</code> with the character 1850 * encoding of the local system or the document. 1851 * <p> 1852 * The following line is the regular expression for breaking-down a URI 1853 * reference into its components. 1854 * <p><blockquote><pre> 1855 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 1856 * 12 3 4 5 6 7 8 9 1857 * </pre></blockquote><p> 1858 * For example, matching the above expression to 1859 * http://jakarta.apache.org/ietf/uri/#Related 1860 * results in the following subexpression matches: 1861 * <p><blockquote><pre> 1862 * $1 = http: 1863 * scheme = $2 = http 1864 * $3 = //jakarta.apache.org 1865 * authority = $4 = jakarta.apache.org 1866 * path = $5 = /ietf/uri/ 1867 * $6 = <undefined> 1868 * query = $7 = <undefined> 1869 * $8 = #Related 1870 * fragment = $9 = Related 1871 * </pre></blockquote><p> 1872 * 1873 * @param original the original character sequence 1874 * @param escaped <code>true</code> if <code>original</code> is escaped 1875 * @throws URIException If an error occurs. 
1876 */ 1877 protected void parseUriReference(String original, boolean escaped) 1878 throws URIException { 1879 1880 // validate and contruct the URI character sequence 1881 if (original == null) { 1882 throw new URIException("URI-Reference required"); 1883 } 1884 1885 /* @ 1886 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 1887 */ 1888 String tmp = original.trim(); 1889 1890 /* 1891 * The length of the string sequence of characters. 1892 * It may not be equal to the length of the byte array. 1893 */ 1894 int length = tmp.length(); 1895 1896 /* 1897 * Remove the delimiters like angle brackets around an URI. 1898 */ 1899 if (length > 0) { 1900 char[] firstDelimiter = { tmp.charAt(0) }; 1901 if (validate(firstDelimiter, delims)) { 1902 if (length >= 2) { 1903 char[] lastDelimiter = { tmp.charAt(length - 1) }; 1904 if (validate(lastDelimiter, delims)) { 1905 tmp = tmp.substring(1, length - 1); 1906 length = length - 2; 1907 } 1908 } 1909 } 1910 } 1911 1912 /* 1913 * The starting index 1914 */ 1915 int from = 0; 1916 1917 /* 1918 * The test flag whether the URI is started from the path component. 1919 */ 1920 boolean isStartedFromPath = false; 1921 int atColon = tmp.indexOf(':'); 1922 int atSlash = tmp.indexOf('/'); 1923 if ((atColon <= 0 && !tmp.startsWith("//")) 1924 || (atSlash >= 0 && atSlash < atColon)) { 1925 isStartedFromPath = true; 1926 } 1927 1928 /* 1929 * <p><blockquote><pre> 1930 * @@@@@@@@ 1931 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 1932 * </pre></blockquote><p> 1933 */ 1934 int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from); 1935 if (at == -1) { 1936 at = 0; 1937 } 1938 1939 /* 1940 * Parse the scheme. 1941 * <p><blockquote><pre> 1942 * scheme = $2 = http 1943 * @ 1944 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 
1945 * </pre></blockquote><p> 1946 */ 1947 if (at > 0 && at < length && tmp.charAt(at) == ':') { 1948 char[] target = tmp.substring(0, at).toLowerCase().toCharArray(); 1949 if (validate(target, scheme)) { 1950 _scheme = target; 1951 } else { 1952 throw new URIException("incorrect scheme"); 1953 } 1954 from = ++at; 1955 } 1956 1957 /* 1958 * Parse the authority component. 1959 * <p><blockquote><pre> 1960 * authority = $4 = jakarta.apache.org 1961 * @@ 1962 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 1963 * </pre></blockquote><p> 1964 */ 1965 // Reset flags 1966 _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false; 1967 if (0 <= at && at < length && tmp.charAt(at) == '/') { 1968 // Set flag 1969 _is_hier_part = true; 1970 if (at + 2 < length && tmp.charAt(at + 1) == '/' 1971 && !isStartedFromPath) { 1972 // the temporary index to start the search from 1973 int next = indexFirstOf(tmp, "/?#", at + 2); 1974 if (next == -1) { 1975 next = (tmp.substring(at + 2).length() == 0) ? at + 2 1976 : tmp.length(); 1977 } 1978 parseAuthority(tmp.substring(at + 2, next), escaped); 1979 from = at = next; 1980 // Set flag 1981 _is_net_path = true; 1982 } 1983 if (from == at) { 1984 // Set flag 1985 _is_abs_path = true; 1986 } 1987 } 1988 1989 /* 1990 * Parse the path component. 1991 * <p><blockquote><pre> 1992 * path = $5 = /ietf/uri/ 1993 * @@@@@@ 1994 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 
1995 * </pre></blockquote><p> 1996 */ 1997 if (from < length) { 1998 // rel_path = rel_segment [ abs_path ] 1999 int next = indexFirstOf(tmp, "?#", from); 2000 if (next == -1) { 2001 next = tmp.length(); 2002 } 2003 if (!_is_abs_path) { 2004 if (!escaped 2005 && prevalidate(tmp.substring(from, next), disallowed_rel_path) 2006 || escaped 2007 && validate(tmp.substring(from, next).toCharArray(), rel_path)) { 2008 // Set flag 2009 _is_rel_path = true; 2010 } else if (!escaped 2011 && prevalidate(tmp.substring(from, next), disallowed_opaque_part) 2012 || escaped 2013 && validate(tmp.substring(from, next).toCharArray(), opaque_part)) { 2014 // Set flag 2015 _is_opaque_part = true; 2016 } else { 2017 // the path component may be empty 2018 _path = null; 2019 } 2020 } 2021 String s = tmp.substring(from, next); 2022 if (escaped) { 2023 setRawPath(s.toCharArray()); 2024 } else { 2025 setPath(s); 2026 } 2027 at = next; 2028 } 2029 2030 // set the charset to do escape encoding 2031 String charset = getProtocolCharset(); 2032 2033 /* 2034 * Parse the query component. 2035 * <p><blockquote><pre> 2036 * query = $7 = <undefined> 2037 * @@@@@@@@@ 2038 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 2039 * </pre></blockquote><p> 2040 */ 2041 if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') { 2042 int next = tmp.indexOf('#', at + 1); 2043 if (next == -1) { 2044 next = tmp.length(); 2045 } 2046 if (escaped) { 2047 _query = tmp.substring(at + 1, next).toCharArray(); 2048 if (!validate(_query, uric)) { 2049 throw new URIException("Invalid query"); 2050 } 2051 } else { 2052 _query = encode(tmp.substring(at + 1, next), allowed_query, charset); 2053 } 2054 at = next; 2055 } 2056 2057 /* 2058 * Parse the fragment component. 2059 * <p><blockquote><pre> 2060 * fragment = $9 = Related 2061 * @@@@@@@@ 2062 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 
2063 * </pre></blockquote><p> 2064 */ 2065 if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') { 2066 if (at + 1 == length) { // empty fragment 2067 _fragment = "".toCharArray(); 2068 } else { 2069 _fragment = (escaped) ? tmp.substring(at + 1).toCharArray() 2070 : encode(tmp.substring(at + 1), allowed_fragment, charset); 2071 } 2072 } 2073 2074 // set this URI. 2075 setURI(); 2076 } 2077 2078 2079 /** 2080 * Get the earlier index that to be searched for the first occurrance in 2081 * one of any of the given string. 2082 * 2083 * @param s the string to be indexed 2084 * @param delims the delimiters used to index 2085 * @return the earlier index if there are delimiters 2086 */ 2087 protected int indexFirstOf(String s, String delims) { 2088 return indexFirstOf(s, delims, -1); 2089 } 2090 2091 2092 /** 2093 * Get the earlier index that to be searched for the first occurrance in 2094 * one of any of the given string. 2095 * 2096 * @param s the string to be indexed 2097 * @param delims the delimiters used to index 2098 * @param offset the from index 2099 * @return the earlier index if there are delimiters 2100 */ 2101 protected int indexFirstOf(String s, String delims, int offset) { 2102 if (s == null || s.length() == 0) { 2103 return -1; 2104 } 2105 if (delims == null || delims.length() == 0) { 2106 return -1; 2107 } 2108 // check boundaries 2109 if (offset < 0) { 2110 offset = 0; 2111 } else if (offset > s.length()) { 2112 return -1; 2113 } 2114 // s is never null 2115 int min = s.length(); 2116 char[] delim = delims.toCharArray(); 2117 for (int i = 0; i < delim.length; i++) { 2118 int at = s.indexOf(delim[i], offset); 2119 if (at >= 0 && at < min) { 2120 min = at; 2121 } 2122 } 2123 return (min == s.length()) ? -1 : min; 2124 } 2125 2126 2127 /** 2128 * Get the earlier index that to be searched for the first occurrance in 2129 * one of any of the given array. 
2130 * 2131 * @param s the character array to be indexed 2132 * @param delim the delimiter used to index 2133 * @return the ealier index if there are a delimiter 2134 */ 2135 protected int indexFirstOf(char[] s, char delim) { 2136 return indexFirstOf(s, delim, 0); 2137 } 2138 2139 2140 /** 2141 * Get the earlier index that to be searched for the first occurrance in 2142 * one of any of the given array. 2143 * 2144 * @param s the character array to be indexed 2145 * @param delim the delimiter used to index 2146 * @param offset The offset. 2147 * @return the ealier index if there is a delimiter 2148 */ 2149 protected int indexFirstOf(char[] s, char delim, int offset) { 2150 if (s == null || s.length == 0) { 2151 return -1; 2152 } 2153 // check boundaries 2154 if (offset < 0) { 2155 offset = 0; 2156 } else if (offset > s.length) { 2157 return -1; 2158 } 2159 for (int i = offset; i < s.length; i++) { 2160 if (s[i] == delim) { 2161 return i; 2162 } 2163 } 2164 return -1; 2165 } 2166 2167 2168 /** 2169 * Parse the authority component. 2170 * 2171 * @param original the original character sequence of authority component 2172 * @param escaped <code>true</code> if <code>original</code> is escaped 2173 * @throws URIException If an error occurs. 2174 */ 2175 protected void parseAuthority(String original, boolean escaped) 2176 throws URIException { 2177 2178 // Reset flags 2179 _is_reg_name = _is_server = 2180 _is_hostname = _is_IPv4address = _is_IPv6reference = false; 2181 2182 // set the charset to do escape encoding 2183 String charset = getProtocolCharset(); 2184 2185 boolean hasPort = true; 2186 int from = 0; 2187 int next = original.indexOf('@'); 2188 if (next != -1) { // neither -1 and 0 2189 // each protocol extented from URI supports the specific userinfo 2190 _userinfo = (escaped) ? 
original.substring(0, next).toCharArray() 2191 : encode(original.substring(0, next), allowed_userinfo, 2192 charset); 2193 from = next + 1; 2194 } 2195 next = original.indexOf('[', from); 2196 if (next >= from) { 2197 next = original.indexOf(']', from); 2198 if (next == -1) { 2199 throw new URIException(URIException.PARSING, "IPv6reference"); 2200 } else { 2201 next++; 2202 } 2203 // In IPv6reference, '[', ']' should be excluded 2204 _host = (escaped) ? original.substring(from, next).toCharArray() 2205 : encode(original.substring(from, next), allowed_IPv6reference, 2206 charset); 2207 // Set flag 2208 _is_IPv6reference = true; 2209 } else { // only for !_is_IPv6reference 2210 next = original.indexOf(':', from); 2211 if (next == -1) { 2212 next = original.length(); 2213 hasPort = false; 2214 } 2215 // REMINDME: it doesn't need the pre-validation 2216 _host = original.substring(from, next).toCharArray(); 2217 if (validate(_host, IPv4address)) { 2218 // Set flag 2219 _is_IPv4address = true; 2220 } else if (validate(_host, hostname)) { 2221 // Set flag 2222 _is_hostname = true; 2223 } else { 2224 // Set flag 2225 _is_reg_name = true; 2226 } 2227 } 2228 if (_is_reg_name) { 2229 // Reset flags for a server-based naming authority 2230 _is_server = _is_hostname = _is_IPv4address = 2231 _is_IPv6reference = false; 2232 // set a registry-based naming authority 2233 if (escaped) { 2234 _authority = original.toCharArray(); 2235 if (!validate(_authority, reg_name)) { 2236 throw new URIException("Invalid authority"); 2237 } 2238 } else { 2239 _authority = encode(original, allowed_reg_name, charset); 2240 } 2241 } else { 2242 if (original.length() - 1 > next && hasPort 2243 && original.charAt(next) == ':') { // not empty 2244 from = next + 1; 2245 try { 2246 _port = Integer.parseInt(original.substring(from)); 2247 } catch (NumberFormatException error) { 2248 throw new URIException(URIException.PARSING, 2249 "invalid port number"); 2250 } 2251 } 2252 // set a server-based naming 
authority 2253 StringBuffer buf = new StringBuffer(); 2254 if (_userinfo != null) { // has_userinfo 2255 buf.append(_userinfo); 2256 buf.append('@'); 2257 } 2258 if (_host != null) { 2259 buf.append(_host); 2260 if (_port != -1) { 2261 buf.append(':'); 2262 buf.append(_port); 2263 } 2264 } 2265 _authority = buf.toString().toCharArray(); 2266 // Set flag 2267 _is_server = true; 2268 } 2269 } 2270 2271 2272 /** 2273 * Once it's parsed successfully, set this URI. 2274 * 2275 * @see #getRawURI 2276 */ 2277 protected void setURI() { 2278 // set _uri 2279 StringBuffer buf = new StringBuffer(); 2280 // ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 2281 if (_scheme != null) { 2282 buf.append(_scheme); 2283 buf.append(':'); 2284 } 2285 if (_is_net_path) { 2286 buf.append("//"); 2287 if (_authority != null) { // has_authority 2288 buf.append(_authority); 2289 } 2290 } 2291 if (_opaque != null && _is_opaque_part) { 2292 buf.append(_opaque); 2293 } else if (_path != null) { 2294 // _is_hier_part or _is_relativeURI 2295 if (_path.length != 0) { 2296 buf.append(_path); 2297 } 2298 } 2299 if (_query != null) { // has_query 2300 buf.append('?'); 2301 buf.append(_query); 2302 } 2303 // ignore the fragment identifier 2304 _uri = buf.toString().toCharArray(); 2305 hash = 0; 2306 } 2307 2308 // ----------------------------------------------------------- Test methods 2309 2310 2311 /** 2312 * Tell whether or not this URI is absolute. 2313 * 2314 * @return true iif this URI is absoluteURI 2315 */ 2316 public boolean isAbsoluteURI() { 2317 return (_scheme != null); 2318 } 2319 2320 2321 /** 2322 * Tell whether or not this URI is relative. 2323 * 2324 * @return true iif this URI is relativeURI 2325 */ 2326 public boolean isRelativeURI() { 2327 return (_scheme == null); 2328 } 2329 2330 2331 /** 2332 * Tell whether or not the absoluteURI of this URI is hier_part. 
2333 * 2334 * @return true iif the absoluteURI is hier_part 2335 */ 2336 public boolean isHierPart() { 2337 return _is_hier_part; 2338 } 2339 2340 2341 /** 2342 * Tell whether or not the absoluteURI of this URI is opaque_part. 2343 * 2344 * @return true iif the absoluteURI is opaque_part 2345 */ 2346 public boolean isOpaquePart() { 2347 return _is_opaque_part; 2348 } 2349 2350 2351 /** 2352 * Tell whether or not the relativeURI or heir_part of this URI is net_path. 2353 * It's the same function as the has_authority() method. 2354 * 2355 * @return true iif the relativeURI or heir_part is net_path 2356 * @see #hasAuthority 2357 */ 2358 public boolean isNetPath() { 2359 return _is_net_path || (_authority != null); 2360 } 2361 2362 2363 /** 2364 * Tell whether or not the relativeURI or hier_part of this URI is abs_path. 2365 * 2366 * @return true iif the relativeURI or hier_part is abs_path 2367 */ 2368 public boolean isAbsPath() { 2369 return _is_abs_path; 2370 } 2371 2372 2373 /** 2374 * Tell whether or not the relativeURI of this URI is rel_path. 2375 * 2376 * @return true iif the relativeURI is rel_path 2377 */ 2378 public boolean isRelPath() { 2379 return _is_rel_path; 2380 } 2381 2382 2383 /** 2384 * Tell whether or not this URI has authority. 2385 * It's the same function as the is_net_path() method. 2386 * 2387 * @return true iif this URI has authority 2388 * @see #isNetPath 2389 */ 2390 public boolean hasAuthority() { 2391 return (_authority != null) || _is_net_path; 2392 } 2393 2394 /** 2395 * Tell whether or not the authority component of this URI is reg_name. 2396 * 2397 * @return true iif the authority component is reg_name 2398 */ 2399 public boolean isRegName() { 2400 return _is_reg_name; 2401 } 2402 2403 2404 /** 2405 * Tell whether or not the authority component of this URI is server. 
2406 * 2407 * @return true iif the authority component is server 2408 */ 2409 public boolean isServer() { 2410 return _is_server; 2411 } 2412 2413 2414 /** 2415 * Tell whether or not this URI has userinfo. 2416 * 2417 * @return true iif this URI has userinfo 2418 */ 2419 public boolean hasUserinfo() { 2420 return (_userinfo != null); 2421 } 2422 2423 2424 /** 2425 * Tell whether or not the host part of this URI is hostname. 2426 * 2427 * @return true iif the host part is hostname 2428 */ 2429 public boolean isHostname() { 2430 return _is_hostname; 2431 } 2432 2433 2434 /** 2435 * Tell whether or not the host part of this URI is IPv4address. 2436 * 2437 * @return true iif the host part is IPv4address 2438 */ 2439 public boolean isIPv4address() { 2440 return _is_IPv4address; 2441 } 2442 2443 2444 /** 2445 * Tell whether or not the host part of this URI is IPv6reference. 2446 * 2447 * @return true iif the host part is IPv6reference 2448 */ 2449 public boolean isIPv6reference() { 2450 return _is_IPv6reference; 2451 } 2452 2453 2454 /** 2455 * Tell whether or not this URI has query. 2456 * 2457 * @return true iif this URI has query 2458 */ 2459 public boolean hasQuery() { 2460 return (_query != null); 2461 } 2462 2463 2464 /** 2465 * Tell whether or not this URI has fragment. 2466 * 2467 * @return true iif this URI has fragment 2468 */ 2469 public boolean hasFragment() { 2470 return (_fragment != null); 2471 } 2472 2473 2474 // ---------------------------------------------------------------- Charset 2475 2476 2477 /** 2478 * Set the default charset of the protocol. 2479 * <p> 2480 * The character set used to store files SHALL remain a local decision and 2481 * MAY depend on the capability of local operating systems. Prior to the 2482 * exchange of URIs they SHOULD be converted into a ISO/IEC 10646 format 2483 * and UTF-8 encoded. 
This approach, while allowing international exchange 2484 * of URIs, will still allow backward compatibility with older systems 2485 * because the code set positions for ASCII characters are identical to the 2486 * one byte sequence in UTF-8. 2487 * <p> 2488 * An individual URI scheme may require a single charset, define a default 2489 * charset, or provide a way to indicate the charset used. 2490 * 2491 * <p> 2492 * Always all the time, the setter method is always succeeded and throws 2493 * <code>DefaultCharsetChanged</code> exception. 2494 * 2495 * So API programmer must follow the following way: 2496 * <code><pre> 2497 * import org.apache.util.URI$DefaultCharsetChanged; 2498 * . 2499 * . 2500 * . 2501 * try { 2502 * URI.setDefaultProtocolCharset("UTF-8"); 2503 * } catch (DefaultCharsetChanged cc) { 2504 * // CASE 1: the exception could be ignored, when it is set by user 2505 * if (cc.getReasonCode() == DefaultCharsetChanged.PROTOCOL_CHARSET) { 2506 * // CASE 2: let user know the default protocol charset changed 2507 * } else { 2508 * // CASE 2: let user know the default document charset changed 2509 * } 2510 * } 2511 * </pre></code> 2512 * 2513 * The API programmer is responsible to set the correct charset. 2514 * And each application should remember its own charset to support. 2515 * 2516 * @param charset the default charset for each protocol 2517 * @throws DefaultCharsetChanged default charset changed 2518 */ 2519 public static void setDefaultProtocolCharset(String charset) 2520 throws DefaultCharsetChanged { 2521 2522 defaultProtocolCharset = charset; 2523 throw new DefaultCharsetChanged(DefaultCharsetChanged.PROTOCOL_CHARSET, 2524 "the default protocol charset changed"); 2525 } 2526 2527 2528 /** 2529 * Get the default charset of the protocol. 2530 * <p> 2531 * An individual URI scheme may require a single charset, define a default 2532 * charset, or provide a way to indicate the charset used. 
2533 * <p> 2534 * To work globally either requires support of a number of character sets 2535 * and to be able to convert between them, or the use of a single preferred 2536 * character set. 2537 * For support of global compatibility it is STRONGLY RECOMMENDED that 2538 * clients and servers use UTF-8 encoding when exchanging URIs. 2539 * 2540 * @return the default charset string 2541 */ 2542 public static String getDefaultProtocolCharset() { 2543 return defaultProtocolCharset; 2544 } 2545 2546 2547 /** 2548 * Get the protocol charset used by this current URI instance. 2549 * It was set by the constructor for this instance. If it was not set by 2550 * contructor, it will return the default protocol charset. 2551 * 2552 * @return the protocol charset string 2553 * @see #getDefaultProtocolCharset 2554 */ 2555 public String getProtocolCharset() { 2556 return (protocolCharset != null) 2557 ? protocolCharset 2558 : defaultProtocolCharset; 2559 } 2560 2561 2562 /** 2563 * Set the default charset of the document. 2564 * <p> 2565 * Notice that it will be possible to contain mixed characters (e.g. 2566 * ftp://host/KoreanNamespace/ChineseResource). To handle the Bi-directional 2567 * display of these character sets, the protocol charset could be simply 2568 * used again. Because it's not yet implemented that the insertion of BIDI 2569 * control characters at different points during composition is extracted. 2570 * <p> 2571 * 2572 * Always all the time, the setter method is always succeeded and throws 2573 * <code>DefaultCharsetChanged</code> exception. 2574 * 2575 * So API programmer must follow the following way: 2576 * <code><pre> 2577 * import org.apache.util.URI$DefaultCharsetChanged; 2578 * . 2579 * . 2580 * . 
2581 * try { 2582 * URI.setDefaultDocumentCharset("EUC-KR"); 2583 * } catch (DefaultCharsetChanged cc) { 2584 * // CASE 1: the exception could be ignored, when it is set by user 2585 * if (cc.getReasonCode() == DefaultCharsetChanged.DOCUMENT_CHARSET) { 2586 * // CASE 2: let user know the default document charset changed 2587 * } else { 2588 * // CASE 2: let user know the default protocol charset changed 2589 * } 2590 * } 2591 * </pre></code> 2592 * 2593 * The API programmer is responsible to set the correct charset. 2594 * And each application should remember its own charset to support. 2595 * 2596 * @param charset the default charset for the document 2597 * @throws DefaultCharsetChanged default charset changed 2598 */ 2599 public static void setDefaultDocumentCharset(String charset) 2600 throws DefaultCharsetChanged { 2601 2602 defaultDocumentCharset = charset; 2603 throw new DefaultCharsetChanged(DefaultCharsetChanged.DOCUMENT_CHARSET, 2604 "the default document charset changed"); 2605 } 2606 2607 2608 /** 2609 * Get the recommended default charset of the document. 2610 * 2611 * @return the default charset string 2612 */ 2613 public static String getDefaultDocumentCharset() { 2614 return defaultDocumentCharset; 2615 } 2616 2617 2618 /** 2619 * Get the default charset of the document by locale. 2620 * 2621 * @return the default charset string by locale 2622 */ 2623 public static String getDefaultDocumentCharsetByLocale() { 2624 return defaultDocumentCharsetByLocale; 2625 } 2626 2627 2628 /** 2629 * Get the default charset of the document by platform. 2630 * 2631 * @return the default charset string by platform 2632 */ 2633 public static String getDefaultDocumentCharsetByPlatform() { 2634 return defaultDocumentCharsetByPlatform; 2635 } 2636 2637 // ------------------------------------------------------------- The scheme 2638 2639 /** 2640 * Get the scheme. 
2641 * 2642 * @return the scheme 2643 */ 2644 public char[] getRawScheme() { 2645 return _scheme; 2646 } 2647 2648 2649 /** 2650 * Get the scheme. 2651 * 2652 * @return the scheme 2653 * null if undefined scheme 2654 */ 2655 public String getScheme() { 2656 return (_scheme == null) ? null : new String(_scheme); 2657 } 2658 2659 // ---------------------------------------------------------- The authority 2660 2661 /** 2662 * Set the authority. It can be one type of server, hostport, hostname, 2663 * IPv4address, IPv6reference and reg_name. 2664 * <p><blockquote><pre> 2665 * authority = server | reg_name 2666 * </pre></blockquote><p> 2667 * 2668 * @param escapedAuthority the raw escaped authority 2669 * @throws URIException If {@link 2670 * #parseAuthority(java.lang.String,boolean)} fails 2671 * @throws NullPointerException null authority 2672 */ 2673 public void setRawAuthority(char[] escapedAuthority) 2674 throws URIException, NullPointerException { 2675 2676 parseAuthority(new String(escapedAuthority), true); 2677 setURI(); 2678 } 2679 2680 2681 /** 2682 * Set the authority. It can be one type of server, hostport, hostname, 2683 * IPv4address, IPv6reference and reg_name. 2684 * Note that there is no setAuthority method by the escape encoding reason. 2685 * 2686 * @param escapedAuthority the escaped authority string 2687 * @throws URIException If {@link 2688 * #parseAuthority(java.lang.String,boolean)} fails 2689 */ 2690 public void setEscapedAuthority(String escapedAuthority) 2691 throws URIException { 2692 2693 parseAuthority(escapedAuthority, true); 2694 setURI(); 2695 } 2696 2697 2698 /** 2699 * Get the raw-escaped authority. 2700 * 2701 * @return the raw-escaped authority 2702 */ 2703 public char[] getRawAuthority() { 2704 return _authority; 2705 } 2706 2707 2708 /** 2709 * Get the escaped authority. 2710 * 2711 * @return the escaped authority 2712 */ 2713 public String getEscapedAuthority() { 2714 return (_authority == null) ? 
null : new String(_authority); 2715 } 2716 2717 2718 /** 2719 * Get the authority. 2720 * 2721 * @return the authority 2722 * @throws URIException If {@link #decode} fails 2723 */ 2724 public String getAuthority() throws URIException { 2725 return (_authority == null) ? null : decode(_authority, 2726 getProtocolCharset()); 2727 } 2728 2729 // ----------------------------------------------------------- The userinfo 2730 2731 /** 2732 * Get the raw-escaped userinfo. 2733 * 2734 * @return the raw-escaped userinfo 2735 * @see #getAuthority 2736 */ 2737 public char[] getRawUserinfo() { 2738 return _userinfo; 2739 } 2740 2741 2742 /** 2743 * Get the escaped userinfo. 2744 * 2745 * @return the escaped userinfo 2746 * @see #getAuthority 2747 */ 2748 public String getEscapedUserinfo() { 2749 return (_userinfo == null) ? null : new String(_userinfo); 2750 } 2751 2752 2753 /** 2754 * Get the userinfo. 2755 * 2756 * @return the userinfo 2757 * @throws URIException If {@link #decode} fails 2758 * @see #getAuthority 2759 */ 2760 public String getUserinfo() throws URIException { 2761 return (_userinfo == null) ? null : decode(_userinfo, 2762 getProtocolCharset()); 2763 } 2764 2765 // --------------------------------------------------------------- The host 2766 2767 /** 2768 * Get the host. 2769 * <p><blockquote><pre> 2770 * host = hostname | IPv4address | IPv6reference 2771 * </pre></blockquote><p> 2772 * 2773 * @return the host 2774 * @see #getAuthority 2775 */ 2776 public char[] getRawHost() { 2777 return _host; 2778 } 2779 2780 2781 /** 2782 * Get the host. 
2783 * <p><blockquote><pre> 2784 * host = hostname | IPv4address | IPv6reference 2785 * </pre></blockquote><p> 2786 * 2787 * @return the host 2788 * @throws URIException If {@link #decode} fails 2789 * @see #getAuthority 2790 */ 2791 public String getHost() throws URIException { 2792 if (_host != null) { 2793 return decode(_host, getProtocolCharset()); 2794 } else { 2795 return null; 2796 } 2797 } 2798 2799 // --------------------------------------------------------------- The port 2800 2801 /** 2802 * Get the port. In order to get the specfic default port, the specific 2803 * protocol-supported class extended from the URI class should be used. 2804 * It has the server-based naming authority. 2805 * 2806 * @return the port 2807 * if -1, it has the default port for the scheme or the server-based 2808 * naming authority is not supported in the specific URI. 2809 */ 2810 public int getPort() { 2811 return _port; 2812 } 2813 2814 // --------------------------------------------------------------- The path 2815 2816 /** 2817 * Set the raw-escaped path. 
2818 * 2819 * @param escapedPath the path character sequence 2820 * @throws URIException encoding error or not proper for initial instance 2821 * @see #encode 2822 */ 2823 public void setRawPath(char[] escapedPath) throws URIException { 2824 if (escapedPath == null || escapedPath.length == 0) { 2825 _path = _opaque = escapedPath; 2826 setURI(); 2827 return; 2828 } 2829 // remove the fragment identifier 2830 escapedPath = removeFragmentIdentifier(escapedPath); 2831 if (_is_net_path || _is_abs_path) { 2832 if (escapedPath[0] != '/') { 2833 throw new URIException(URIException.PARSING, 2834 "not absolute path"); 2835 } 2836 if (!validate(escapedPath, abs_path)) { 2837 throw new URIException(URIException.ESCAPING, 2838 "escaped absolute path not valid"); 2839 } 2840 _path = escapedPath; 2841 } else if (_is_rel_path) { 2842 int at = indexFirstOf(escapedPath, '/'); 2843 if (at == 0) { 2844 throw new URIException(URIException.PARSING, "incorrect path"); 2845 } 2846 if (at > 0 && !validate(escapedPath, 0, at - 1, rel_segment) 2847 && !validate(escapedPath, at, -1, abs_path) 2848 || at < 0 && !validate(escapedPath, 0, -1, rel_segment)) { 2849 2850 throw new URIException(URIException.ESCAPING, 2851 "escaped relative path not valid"); 2852 } 2853 _path = escapedPath; 2854 } else if (_is_opaque_part) { 2855 if (!uric_no_slash.get(escapedPath[0]) 2856 && !validate(escapedPath, 1, -1, uric)) { 2857 throw new URIException(URIException.ESCAPING, 2858 "escaped opaque part not valid"); 2859 } 2860 _opaque = escapedPath; 2861 } else { 2862 throw new URIException(URIException.PARSING, "incorrect path"); 2863 } 2864 setURI(); 2865 } 2866 2867 2868 /** 2869 * Set the escaped path. 
2870 * 2871 * @param escapedPath the escaped path string 2872 * @throws URIException encoding error or not proper for initial instance 2873 * @see #encode 2874 */ 2875 public void setEscapedPath(String escapedPath) throws URIException { 2876 if (escapedPath == null) { 2877 _path = _opaque = null; 2878 setURI(); 2879 return; 2880 } 2881 setRawPath(escapedPath.toCharArray()); 2882 } 2883 2884 2885 /** 2886 * Set the path. 2887 * 2888 * @param path the path string 2889 * @throws URIException set incorrectly or fragment only 2890 * @see #encode 2891 */ 2892 public void setPath(String path) throws URIException { 2893 2894 if (path == null || path.length() == 0) { 2895 _path = _opaque = (path == null) ? null : path.toCharArray(); 2896 setURI(); 2897 return; 2898 } 2899 // set the charset to do escape encoding 2900 String charset = getProtocolCharset(); 2901 2902 if (_is_net_path || _is_abs_path) { 2903 _path = encode(path, allowed_abs_path, charset); 2904 } else if (_is_rel_path) { 2905 StringBuffer buff = new StringBuffer(path.length()); 2906 int at = path.indexOf('/'); 2907 if (at == 0) { // never 0 2908 throw new URIException(URIException.PARSING, 2909 "incorrect relative path"); 2910 } 2911 if (at > 0) { 2912 buff.append(encode(path.substring(0, at), allowed_rel_path, 2913 charset)); 2914 buff.append(encode(path.substring(at), allowed_abs_path, 2915 charset)); 2916 } else { 2917 buff.append(encode(path, allowed_rel_path, charset)); 2918 } 2919 _path = buff.toString().toCharArray(); 2920 } else if (_is_opaque_part) { 2921 StringBuffer buf = new StringBuffer(); 2922 buf.insert(0, encode(path.substring(0, 1), uric_no_slash, charset)); 2923 buf.insert(1, encode(path.substring(1), uric, charset)); 2924 _opaque = buf.toString().toCharArray(); 2925 } else { 2926 throw new URIException(URIException.PARSING, "incorrect path"); 2927 } 2928 setURI(); 2929 } 2930 2931 2932 /** 2933 * Resolve the base and relative path. 
2934 * 2935 * @param basePath a character array of the basePath 2936 * @param relPath a character array of the relPath 2937 * @return the resolved path 2938 * @throws URIException no more higher path level to be resolved 2939 */ 2940 protected char[] resolvePath(char[] basePath, char[] relPath) 2941 throws URIException { 2942 2943 // REMINDME: paths are never null 2944 String base = (basePath == null) ? "" : new String(basePath); 2945 2946 // _path could be empty 2947 if (relPath == null || relPath.length == 0) { 2948 return normalize(basePath); 2949 } else if (relPath[0] == '/') { 2950 return normalize(relPath); 2951 } else { 2952 int at = base.lastIndexOf('/'); 2953 if (at != -1) { 2954 basePath = base.substring(0, at + 1).toCharArray(); 2955 } 2956 StringBuffer buff = new StringBuffer(base.length() 2957 + relPath.length); 2958 buff.append((at != -1) ? base.substring(0, at + 1) : "/"); 2959 buff.append(relPath); 2960 return normalize(buff.toString().toCharArray()); 2961 } 2962 } 2963 2964 2965 /** 2966 * Get the raw-escaped current hierarchy level in the given path. 2967 * If the last namespace is a collection, the slash mark ('/') should be 2968 * ended with at the last character of the path string. 
2969 * 2970 * @param path the path 2971 * @return the current hierarchy level 2972 * @throws URIException no hierarchy level 2973 */ 2974 protected char[] getRawCurrentHierPath(char[] path) throws URIException { 2975 2976 if (_is_opaque_part) { 2977 throw new URIException(URIException.PARSING, "no hierarchy level"); 2978 } 2979 if (path == null) { 2980 throw new URIException(URIException.PARSING, "empty path"); 2981 } 2982 String buff = new String(path); 2983 int first = buff.indexOf('/'); 2984 int last = buff.lastIndexOf('/'); 2985 if (last == 0) { 2986 return rootPath; 2987 } else if (first != last && last != -1) { 2988 return buff.substring(0, last).toCharArray(); 2989 } 2990 // FIXME: it could be a document on the server side 2991 return path; 2992 } 2993 2994 2995 /** 2996 * Get the raw-escaped current hierarchy level. 2997 * 2998 * @return the raw-escaped current hierarchy level 2999 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. 3000 */ 3001 public char[] getRawCurrentHierPath() throws URIException { 3002 return (_path == null) ? null : getRawCurrentHierPath(_path); 3003 } 3004 3005 3006 /** 3007 * Get the escaped current hierarchy level. 3008 * 3009 * @return the escaped current hierarchy level 3010 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. 3011 */ 3012 public String getEscapedCurrentHierPath() throws URIException { 3013 char[] path = getRawCurrentHierPath(); 3014 return (path == null) ? null : new String(path); 3015 } 3016 3017 3018 /** 3019 * Get the current hierarchy level. 3020 * 3021 * @return the current hierarchy level 3022 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. 3023 * @see #decode 3024 */ 3025 public String getCurrentHierPath() throws URIException { 3026 char[] path = getRawCurrentHierPath(); 3027 return (path == null) ? null : decode(path, getProtocolCharset()); 3028 } 3029 3030 3031 /** 3032 * Get the level above the this hierarchy level. 
3033 * 3034 * @return the raw above hierarchy level 3035 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. 3036 */ 3037 public char[] getRawAboveHierPath() throws URIException { 3038 char[] path = getRawCurrentHierPath(); 3039 return (path == null) ? null : getRawCurrentHierPath(path); 3040 } 3041 3042 3043 /** 3044 * Get the level above the this hierarchy level. 3045 * 3046 * @return the raw above hierarchy level 3047 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. 3048 */ 3049 public String getEscapedAboveHierPath() throws URIException { 3050 char[] path = getRawAboveHierPath(); 3051 return (path == null) ? null : new String(path); 3052 } 3053 3054 3055 /** 3056 * Get the level above the this hierarchy level. 3057 * 3058 * @return the above hierarchy level 3059 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. 3060 * @see #decode 3061 */ 3062 public String getAboveHierPath() throws URIException { 3063 char[] path = getRawAboveHierPath(); 3064 return (path == null) ? null : decode(path, getProtocolCharset()); 3065 } 3066 3067 3068 /** 3069 * Get the raw-escaped path. 3070 * <p><blockquote><pre> 3071 * path = [ abs_path | opaque_part ] 3072 * </pre></blockquote><p> 3073 * 3074 * @return the raw-escaped path 3075 */ 3076 public char[] getRawPath() { 3077 return _is_opaque_part ? _opaque : _path; 3078 } 3079 3080 3081 /** 3082 * Get the escaped path. 3083 * <p><blockquote><pre> 3084 * path = [ abs_path | opaque_part ] 3085 * abs_path = "/" path_segments 3086 * opaque_part = uric_no_slash *uric 3087 * </pre></blockquote><p> 3088 * 3089 * @return the escaped path string 3090 */ 3091 public String getEscapedPath() { 3092 char[] path = getRawPath(); 3093 return (path == null) ? null : new String(path); 3094 } 3095 3096 3097 /** 3098 * Get the path. 
3099 * <p><blockquote><pre> 3100 * path = [ abs_path | opaque_part ] 3101 * </pre></blockquote><p> 3102 * @return the path string 3103 * @throws URIException If {@link #decode} fails. 3104 * @see #decode 3105 */ 3106 public String getPath() throws URIException { 3107 char[] path = getRawPath(); 3108 return (path == null) ? null : decode(path, getProtocolCharset()); 3109 } 3110 3111 3112 /** 3113 * Get the raw-escaped basename of the path. 3114 * 3115 * @return the raw-escaped basename 3116 */ 3117 public char[] getRawName() { 3118 if (_path == null) { 3119 return null; 3120 } 3121 3122 int at = 0; 3123 for (int i = _path.length - 1; i >= 0; i--) { 3124 if (_path[i] == '/') { 3125 at = i + 1; 3126 break; 3127 } 3128 } 3129 int len = _path.length - at; 3130 char[] basename = new char[len]; 3131 System.arraycopy(_path, at, basename, 0, len); 3132 return basename; 3133 } 3134 3135 3136 /** 3137 * Get the escaped basename of the path. 3138 * 3139 * @return the escaped basename string 3140 */ 3141 public String getEscapedName() { 3142 char[] basename = getRawName(); 3143 return (basename == null) ? null : new String(basename); 3144 } 3145 3146 3147 /** 3148 * Get the basename of the path. 3149 * 3150 * @return the basename string 3151 * @throws URIException incomplete trailing escape pattern or unsupported 3152 * character encoding 3153 * @see #decode 3154 */ 3155 public String getName() throws URIException { 3156 char[] basename = getRawName(); 3157 return (basename == null) ? null : decode(getRawName(), 3158 getProtocolCharset()); 3159 } 3160 3161 // ----------------------------------------------------- The path and query 3162 3163 /** 3164 * Get the raw-escaped path and query. 
3165 * 3166 * @return the raw-escaped path and query 3167 */ 3168 public char[] getRawPathQuery() { 3169 3170 if (_path == null && _query == null) { 3171 return null; 3172 } 3173 StringBuffer buff = new StringBuffer(); 3174 if (_path != null) { 3175 buff.append(_path); 3176 } 3177 if (_query != null) { 3178 buff.append('?'); 3179 buff.append(_query); 3180 } 3181 return buff.toString().toCharArray(); 3182 } 3183 3184 3185 /** 3186 * Get the escaped query. 3187 * 3188 * @return the escaped path and query string 3189 */ 3190 public String getEscapedPathQuery() { 3191 char[] rawPathQuery = getRawPathQuery(); 3192 return (rawPathQuery == null) ? null : new String(rawPathQuery); 3193 } 3194 3195 3196 /** 3197 * Get the path and query. 3198 * 3199 * @return the path and query string. 3200 * @throws URIException incomplete trailing escape pattern or unsupported 3201 * character encoding 3202 * @see #decode 3203 */ 3204 public String getPathQuery() throws URIException { 3205 char[] rawPathQuery = getRawPathQuery(); 3206 return (rawPathQuery == null) ? null : decode(rawPathQuery, 3207 getProtocolCharset()); 3208 } 3209 3210 // -------------------------------------------------------------- The query 3211 3212 /** 3213 * Set the raw-escaped query. 3214 * 3215 * @param escapedQuery the raw-escaped query 3216 * @throws URIException escaped query not valid 3217 */ 3218 public void setRawQuery(char[] escapedQuery) throws URIException { 3219 if (escapedQuery == null || escapedQuery.length == 0) { 3220 _query = escapedQuery; 3221 setURI(); 3222 return; 3223 } 3224 // remove the fragment identifier 3225 escapedQuery = removeFragmentIdentifier(escapedQuery); 3226 if (!validate(escapedQuery, query)) { 3227 throw new URIException(URIException.ESCAPING, 3228 "escaped query not valid"); 3229 } 3230 _query = escapedQuery; 3231 setURI(); 3232 } 3233 3234 3235 /** 3236 * Set the escaped query string. 
3237 * 3238 * @param escapedQuery the escaped query string 3239 * @throws URIException escaped query not valid 3240 */ 3241 public void setEscapedQuery(String escapedQuery) throws URIException { 3242 if (escapedQuery == null) { 3243 _query = null; 3244 setURI(); 3245 return; 3246 } 3247 setRawQuery(escapedQuery.toCharArray()); 3248 } 3249 3250 3251 /** 3252 * Set the query. 3253 * <p> 3254 * When a query string is not misunderstood the reserved special characters 3255 * ("&", "=", "+", ",", and "$") within a query component, it is 3256 * recommended to use in encoding the whole query with this method. 3257 * <p> 3258 * The additional APIs for the special purpose using by the reserved 3259 * special characters used in each protocol are implemented in each protocol 3260 * classes inherited from <code>URI</code>. So refer to the same-named APIs 3261 * implemented in each specific protocol instance. 3262 * 3263 * @param query the query string. 3264 * @throws URIException incomplete trailing escape pattern or unsupported 3265 * character encoding 3266 * @see #encode 3267 */ 3268 public void setQuery(String query) throws URIException { 3269 if (query == null || query.length() == 0) { 3270 _query = (query == null) ? null : query.toCharArray(); 3271 setURI(); 3272 return; 3273 } 3274 setRawQuery(encode(query, allowed_query, getProtocolCharset())); 3275 } 3276 3277 3278 /** 3279 * Get the raw-escaped query. 3280 * 3281 * @return the raw-escaped query 3282 */ 3283 public char[] getRawQuery() { 3284 return _query; 3285 } 3286 3287 3288 /** 3289 * Get the escaped query. 3290 * 3291 * @return the escaped query string 3292 */ 3293 public String getEscapedQuery() { 3294 return (_query == null) ? null : new String(_query); 3295 } 3296 3297 3298 /** 3299 * Get the query. 3300 * 3301 * @return the query string. 
3302 * @throws URIException incomplete trailing escape pattern or unsupported 3303 * character encoding 3304 * @see #decode 3305 */ 3306 public String getQuery() throws URIException { 3307 return (_query == null) ? null : decode(_query, getProtocolCharset()); 3308 } 3309 3310 // ----------------------------------------------------------- The fragment 3311 3312 /** 3313 * Set the raw-escaped fragment. 3314 * 3315 * @param escapedFragment the raw-escaped fragment 3316 * @throws URIException escaped fragment not valid 3317 */ 3318 public void setRawFragment(char[] escapedFragment) throws URIException { 3319 if (escapedFragment == null || escapedFragment.length == 0) { 3320 _fragment = escapedFragment; 3321 hash = 0; 3322 return; 3323 } 3324 if (!validate(escapedFragment, fragment)) { 3325 throw new URIException(URIException.ESCAPING, 3326 "escaped fragment not valid"); 3327 } 3328 _fragment = escapedFragment; 3329 hash = 0; 3330 } 3331 3332 3333 /** 3334 * Set the escaped fragment string. 3335 * 3336 * @param escapedFragment the escaped fragment string 3337 * @throws URIException escaped fragment not valid 3338 */ 3339 public void setEscapedFragment(String escapedFragment) throws URIException { 3340 if (escapedFragment == null) { 3341 _fragment = null; 3342 hash = 0; 3343 return; 3344 } 3345 setRawFragment(escapedFragment.toCharArray()); 3346 } 3347 3348 3349 /** 3350 * Set the fragment. 3351 * 3352 * @param fragment the fragment string. 3353 * @throws URIException If an error occurs. 3354 */ 3355 public void setFragment(String fragment) throws URIException { 3356 if (fragment == null || fragment.length() == 0) { 3357 _fragment = (fragment == null) ? null : fragment.toCharArray(); 3358 hash = 0; 3359 return; 3360 } 3361 _fragment = encode(fragment, allowed_fragment, getProtocolCharset()); 3362 hash = 0; 3363 } 3364 3365 3366 /** 3367 * Get the raw-escaped fragment. 
3368 * <p> 3369 * The optional fragment identifier is not part of a URI, but is often used 3370 * in conjunction with a URI. 3371 * <p> 3372 * The format and interpretation of fragment identifiers is dependent on 3373 * the media type [RFC2046] of the retrieval result. 3374 * <p> 3375 * A fragment identifier is only meaningful when a URI reference is 3376 * intended for retrieval and the result of that retrieval is a document 3377 * for which the identified fragment is consistently defined. 3378 * 3379 * @return the raw-escaped fragment 3380 */ 3381 public char[] getRawFragment() { 3382 return _fragment; 3383 } 3384 3385 3386 /** 3387 * Get the escaped fragment. 3388 * 3389 * @return the escaped fragment string 3390 */ 3391 public String getEscapedFragment() { 3392 return (_fragment == null) ? null : new String(_fragment); 3393 } 3394 3395 3396 /** 3397 * Get the fragment. 3398 * 3399 * @return the fragment string 3400 * @throws URIException incomplete trailing escape pattern or unsupported 3401 * character encoding 3402 * @see #decode 3403 */ 3404 public String getFragment() throws URIException { 3405 return (_fragment == null) ? null : decode(_fragment, 3406 getProtocolCharset()); 3407 } 3408 3409 // ------------------------------------------------------------- Utilities 3410 3411 /** 3412 * Remove the fragment identifier of the given component. 3413 * 3414 * @param component the component that a fragment may be included 3415 * @return the component that the fragment identifier is removed 3416 */ 3417 protected char[] removeFragmentIdentifier(char[] component) { 3418 if (component == null) { 3419 return null; 3420 } 3421 int lastIndex = new String(component).indexOf('#'); 3422 if (lastIndex != -1) { 3423 component = new String(component).substring(0, 3424 lastIndex).toCharArray(); 3425 } 3426 return component; 3427 } 3428 3429 3430 /** 3431 * Normalize the given hier path part. 
3432 * 3433 * <p>Algorithm taken from URI reference parser at 3434 * http://www.apache.org/~fielding/uri/rev-2002/issues.html. 3435 * 3436 * @param path the path to normalize 3437 * @return the normalized path 3438 * @throws URIException no more higher path level to be normalized 3439 */ 3440 protected char[] normalize(char[] path) throws URIException { 3441 3442 if (path == null) { 3443 return null; 3444 } 3445 3446 String normalized = new String(path); 3447 3448 // If the buffer begins with "./" or "../", the "." or ".." is removed. 3449 if (normalized.startsWith("./")) { 3450 normalized = normalized.substring(1); 3451 } else if (normalized.startsWith("../")) { 3452 normalized = normalized.substring(2); 3453 } else if (normalized.startsWith("..")) { 3454 normalized = normalized.substring(2); 3455 } 3456 3457 // All occurrences of "/./" in the buffer are replaced with "/" 3458 int index = -1; 3459 while ((index = normalized.indexOf("/./")) != -1) { 3460 normalized = normalized.substring(0, index) + normalized.substring(index + 2); 3461 } 3462 3463 // If the buffer ends with "/.", the "." is removed. 3464 if (normalized.endsWith("/.")) { 3465 normalized = normalized.substring(0, normalized.length() - 1); 3466 } 3467 3468 int startIndex = 0; 3469 3470 // All occurrences of "/<segment>/../" in the buffer, where ".." 3471 // and <segment> are complete path segments, are iteratively replaced 3472 // with "/" in order from left to right until no matching pattern remains. 3473 // If the buffer ends with "/<segment>/..", that is also replaced 3474 // with "/". Note that <segment> may be empty. 
3475 while ((index = normalized.indexOf("/../", startIndex)) != -1) { 3476 int slashIndex = normalized.lastIndexOf('/', index - 1); 3477 if (slashIndex >= 0) { 3478 normalized = normalized.substring(0, slashIndex) + normalized.substring(index + 3); 3479 } else { 3480 startIndex = index + 3; 3481 } 3482 } 3483 if (normalized.endsWith("/..")) { 3484 int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4); 3485 if (slashIndex >= 0) { 3486 normalized = normalized.substring(0, slashIndex + 1); 3487 } 3488 } 3489 3490 // All prefixes of "<segment>/../" in the buffer, where ".." 3491 // and <segment> are complete path segments, are iteratively replaced 3492 // with "/" in order from left to right until no matching pattern remains. 3493 // If the buffer ends with "<segment>/..", that is also replaced 3494 // with "/". Note that <segment> may be empty. 3495 while ((index = normalized.indexOf("/../")) != -1) { 3496 int slashIndex = normalized.lastIndexOf('/', index - 1); 3497 if (slashIndex >= 0) { 3498 break; 3499 } else { 3500 normalized = normalized.substring(index + 3); 3501 } 3502 } 3503 if (normalized.endsWith("/..")) { 3504 int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4); 3505 if (slashIndex < 0) { 3506 normalized = "/"; 3507 } 3508 } 3509 3510 return normalized.toCharArray(); 3511 } 3512 3513 3514 /** 3515 * Normalizes the path part of this URI. Normalization is only meant to be performed on 3516 * URIs with an absolute path. Calling this method on a relative path URI will have no 3517 * effect. 3518 * 3519 * @throws URIException no more higher path level to be normalized 3520 * 3521 * @see #isAbsPath() 3522 */ 3523 public void normalize() throws URIException { 3524 if (isAbsPath()) { 3525 _path = normalize(_path); 3526 setURI(); 3527 } 3528 } 3529 3530 3531 /** 3532 * Test if the first array is equal to the second array. 
3533 * 3534 * @param first the first character array 3535 * @param second the second character array 3536 * @return true if they're equal 3537 */ 3538 protected boolean equals(char[] first, char[] second) { 3539 3540 if (first == null && second == null) { 3541 return true; 3542 } 3543 if (first == null || second == null) { 3544 return false; 3545 } 3546 if (first.length != second.length) { 3547 return false; 3548 } 3549 for (int i = 0; i < first.length; i++) { 3550 if (first[i] != second[i]) { 3551 return false; 3552 } 3553 } 3554 return true; 3555 } 3556 3557 3558 /** 3559 * Test an object if this URI is equal to another. 3560 * 3561 * @param obj an object to compare 3562 * @return true if two URI objects are equal 3563 */ 3564 public boolean equals(Object obj) { 3565 3566 // normalize and test each components 3567 if (obj == this) { 3568 return true; 3569 } 3570 if (!(obj instanceof URI)) { 3571 return false; 3572 } 3573 URI another = (URI) obj; 3574 // scheme 3575 if (!equals(_scheme, another._scheme)) { 3576 return false; 3577 } 3578 // is_opaque_part or is_hier_part? and opaque 3579 if (!equals(_opaque, another._opaque)) { 3580 return false; 3581 } 3582 // is_hier_part 3583 // has_authority 3584 if (!equals(_authority, another._authority)) { 3585 return false; 3586 } 3587 // path 3588 if (!equals(_path, another._path)) { 3589 return false; 3590 } 3591 // has_query 3592 if (!equals(_query, another._query)) { 3593 return false; 3594 } 3595 // has_fragment? should be careful of the only fragment case. 3596 if (!equals(_fragment, another._fragment)) { 3597 return false; 3598 } 3599 return true; 3600 } 3601 3602 // ---------------------------------------------------------- Serialization 3603 3604 /** 3605 * Write the content of this URI. 3606 * 3607 * @param oos the object-output stream 3608 * @throws IOException If an IO problem occurs. 
3609 */ 3610 private void writeObject(ObjectOutputStream oos) 3611 throws IOException { 3612 3613 oos.defaultWriteObject(); 3614 } 3615 3616 3617 /** 3618 * Read a URI. 3619 * 3620 * @param ois the object-input stream 3621 * @throws ClassNotFoundException If one of the classes specified in the 3622 * input stream cannot be found. 3623 * @throws IOException If an IO problem occurs. 3624 */ 3625 private void readObject(ObjectInputStream ois) 3626 throws ClassNotFoundException, IOException { 3627 3628 ois.defaultReadObject(); 3629 } 3630 3631 // -------------------------------------------------------------- Hash code 3632 3633 /** 3634 * Return a hash code for this URI. 3635 * 3636 * @return a has code value for this URI 3637 */ 3638 public int hashCode() { 3639 if (hash == 0) { 3640 char[] c = _uri; 3641 if (c != null) { 3642 for (int i = 0, len = c.length; i < len; i++) { 3643 hash = 31 * hash + c[i]; 3644 } 3645 } 3646 c = _fragment; 3647 if (c != null) { 3648 for (int i = 0, len = c.length; i < len; i++) { 3649 hash = 31 * hash + c[i]; 3650 } 3651 } 3652 } 3653 return hash; 3654 } 3655 3656 // ------------------------------------------------------------- Comparison 3657 3658 /** 3659 * Compare this URI to another object. 3660 * 3661 * @param obj the object to be compared. 3662 * @return 0, if it's same, 3663 * -1, if failed, first being compared with in the authority component 3664 * @throws ClassCastException not URI argument 3665 */ 3666 public int compareTo(Object obj) throws ClassCastException { 3667 3668 URI another = (URI) obj; 3669 if (!equals(_authority, another.getRawAuthority())) { 3670 return -1; 3671 } 3672 return toString().compareTo(another.toString()); 3673 } 3674 3675 // ------------------------------------------------------------------ Clone 3676 3677 /** 3678 * Create and return a copy of this object, the URI-reference containing 3679 * the userinfo component. 
Notice that the whole URI-reference including 3680 * the userinfo component counld not be gotten as a <code>String</code>. 3681 * <p> 3682 * To copy the identical <code>URI</code> object including the userinfo 3683 * component, it should be used. 3684 * 3685 * @return a clone of this instance 3686 */ 3687 public synchronized Object clone() throws CloneNotSupportedException { 3688 3689 URI instance = (URI) super.clone(); 3690 3691 instance._uri = _uri; 3692 instance._scheme = _scheme; 3693 instance._opaque = _opaque; 3694 instance._authority = _authority; 3695 instance._userinfo = _userinfo; 3696 instance._host = _host; 3697 instance._port = _port; 3698 instance._path = _path; 3699 instance._query = _query; 3700 instance._fragment = _fragment; 3701 // the charset to do escape encoding for this instance 3702 instance.protocolCharset = protocolCharset; 3703 // flags 3704 instance._is_hier_part = _is_hier_part; 3705 instance._is_opaque_part = _is_opaque_part; 3706 instance._is_net_path = _is_net_path; 3707 instance._is_abs_path = _is_abs_path; 3708 instance._is_rel_path = _is_rel_path; 3709 instance._is_reg_name = _is_reg_name; 3710 instance._is_server = _is_server; 3711 instance._is_hostname = _is_hostname; 3712 instance._is_IPv4address = _is_IPv4address; 3713 instance._is_IPv6reference = _is_IPv6reference; 3714 3715 return instance; 3716 } 3717 3718 // ------------------------------------------------------------ Get the URI 3719 3720 /** 3721 * It can be gotten the URI character sequence. It's raw-escaped. 3722 * For the purpose of the protocol to be transported, it will be useful. 3723 * <p> 3724 * It is clearly unwise to use a URL that contains a password which is 3725 * intended to be secret. In particular, the use of a password within 3726 * the 'userinfo' component of a URL is strongly disrecommended except 3727 * in those rare cases where the 'password' parameter is intended to be 3728 * public. 
3729 * <p> 3730 * When you want to get each part of the userinfo, you need to use the 3731 * specific methods in the specific URL. It depends on the specific URL. 3732 * 3733 * @return the URI character sequence 3734 */ 3735 public char[] getRawURI() { 3736 return _uri; 3737 } 3738 3739 3740 /** 3741 * It can be gotten the URI character sequence. It's escaped. 3742 * For the purpose of the protocol to be transported, it will be useful. 3743 * 3744 * @return the escaped URI string 3745 */ 3746 public String getEscapedURI() { 3747 return (_uri == null) ? null : new String(_uri); 3748 } 3749 3750 3751 /** 3752 * It can be gotten the URI character sequence. 3753 * 3754 * @return the original URI string 3755 * @throws URIException incomplete trailing escape pattern or unsupported 3756 * character encoding 3757 * @see #decode 3758 */ 3759 public String getURI() throws URIException { 3760 return (_uri == null) ? null : decode(_uri, getProtocolCharset()); 3761 } 3762 3763 3764 /** 3765 * Get the URI reference character sequence. 3766 * 3767 * @return the URI reference character sequence 3768 */ 3769 public char[] getRawURIReference() { 3770 if (_fragment == null) { 3771 return _uri; 3772 } 3773 if (_uri == null) { 3774 return _fragment; 3775 } 3776 // if _uri != null && _fragment != null 3777 String uriReference = new String(_uri) + "#" + new String(_fragment); 3778 return uriReference.toCharArray(); 3779 } 3780 3781 3782 /** 3783 * Get the escaped URI reference string. 3784 * 3785 * @return the escaped URI reference string 3786 */ 3787 public String getEscapedURIReference() { 3788 char[] uriReference = getRawURIReference(); 3789 return (uriReference == null) ? null : new String(uriReference); 3790 } 3791 3792 3793 /** 3794 * Get the original URI reference string. 3795 * 3796 * @return the original URI reference string 3797 * @throws URIException If {@link #decode} fails. 
3798 */ 3799 public String getURIReference() throws URIException { 3800 char[] uriReference = getRawURIReference(); 3801 return (uriReference == null) ? null : decode(uriReference, 3802 getProtocolCharset()); 3803 } 3804 3805 3806 /** 3807 * Get the escaped URI string. 3808 * <p> 3809 * On the document, the URI-reference form is only used without the userinfo 3810 * component like http://jakarta.apache.org/ by the security reason. 3811 * But the URI-reference form with the userinfo component could be parsed. 3812 * <p> 3813 * In other words, this URI and any its subclasses must not expose the 3814 * URI-reference expression with the userinfo component like 3815 * http://user:password@hostport/restricted_zone.<br> 3816 * It means that the API client programmer should extract each user and 3817 * password to access manually. Probably it will be supported in the each 3818 * subclass, however, not a whole URI-reference expression. 3819 * 3820 * @return the escaped URI string 3821 * @see #clone() 3822 */ 3823 public String toString() { 3824 return getEscapedURI(); 3825 } 3826 3827 3828 // ------------------------------------------------------------ Inner class 3829 3830 /** 3831 * The charset-changed normal operation to represent to be required to 3832 * alert to user the fact the default charset is changed. 3833 */ 3834 public static class DefaultCharsetChanged extends RuntimeException { 3835 3836 // ------------------------------------------------------- constructors 3837 3838 /** 3839 * The constructor with a reason string and its code arguments. 3840 * 3841 * @param reasonCode the reason code 3842 * @param reason the reason 3843 */ 3844 public DefaultCharsetChanged(int reasonCode, String reason) { 3845 super(reason); 3846 this.reason = reason; 3847 this.reasonCode = reasonCode; 3848 } 3849 3850 // ---------------------------------------------------------- constants 3851 3852 /** No specified reason code. 
*/ 3853 public static final int UNKNOWN = 0; 3854 3855 /** Protocol charset changed. */ 3856 public static final int PROTOCOL_CHARSET = 1; 3857 3858 /** Document charset changed. */ 3859 public static final int DOCUMENT_CHARSET = 2; 3860 3861 // ------------------------------------------------- instance variables 3862 3863 /** The reason code. */ 3864 private int reasonCode; 3865 3866 /** The reason message. */ 3867 private String reason; 3868 3869 // ------------------------------------------------------------ methods 3870 3871 /** 3872 * Get the reason code. 3873 * 3874 * @return the reason code 3875 */ 3876 public int getReasonCode() { 3877 return reasonCode; 3878 } 3879 3880 /** 3881 * Get the reason message. 3882 * 3883 * @return the reason message 3884 */ 3885 public String getReason() { 3886 return reason; 3887 } 3888 3889 } 3890 3891 3892 /** 3893 * A mapping to determine the (somewhat arbitrarily) preferred charset for a 3894 * given locale. Supports all locales recognized in JDK 1.1. 3895 * <p> 3896 * The distribution of this class is Servlets.com. It was originally 3897 * written by Jason Hunter [jhunter at acm.org] and used by with permission. 
*/
    public static class LocaleToCharsetMap {

        /**
         * A mapping of locale name to charset. Keys are either a bare
         * language code ("ja") or a language_COUNTRY pair ("zh_TW").
         */
        private static final Hashtable LOCALE_TO_CHARSET_MAP;
        static {
            LOCALE_TO_CHARSET_MAP = new Hashtable();
            LOCALE_TO_CHARSET_MAP.put("ar", "ISO-8859-6");
            LOCALE_TO_CHARSET_MAP.put("be", "ISO-8859-5");
            LOCALE_TO_CHARSET_MAP.put("bg", "ISO-8859-5");
            LOCALE_TO_CHARSET_MAP.put("ca", "ISO-8859-1");
            LOCALE_TO_CHARSET_MAP.put("cs", "ISO-8859-2");
            LOCALE_TO_CHARSET_MAP.put("da", "ISO-8859-1");
            LOCALE_TO_CHARSET_MAP.put("de", "ISO-8859-1");
            LOCALE_TO_CHARSET_MAP.put("el", "ISO-8859-7");
            LOCALE_TO_CHARSET_MAP.put("en", "ISO-8859-1");
            LOCALE_TO_CHARSET_MAP.put("es", "ISO-8859-1");
            LOCALE_TO_CHARSET_MAP.put("et", "ISO-8859-1");
            LOCALE_TO_CHARSET_MAP.put("fi", "ISO-8859-1");
            LOCALE_TO_CHARSET_MAP.put("fr", "ISO-8859-1");
            LOCALE_TO_CHARSET_MAP.put("hr", "ISO-8859-2");
            LOCALE_TO_CHARSET_MAP.put("hu", "ISO-8859-2");
            LOCALE_TO_CHARSET_MAP.put("is", "ISO-8859-1");
            LOCALE_TO_CHARSET_MAP.put("it", "ISO-8859-1");
            // Hebrew: "iw" is the legacy code reported by older JDKs, "he" is
            // the current ISO 639 code reported by newer ones.
            LOCALE_TO_CHARSET_MAP.put("iw", "ISO-8859-8");
            LOCALE_TO_CHARSET_MAP.put("he", "ISO-8859-8");
            LOCALE_TO_CHARSET_MAP.put("ja", "Shift_JIS");
            LOCALE_TO_CHARSET_MAP.put("ko", "EUC-KR");
            LOCALE_TO_CHARSET_MAP.put("lt", "ISO-8859-2");
            LOCALE_TO_CHARSET_MAP.put("lv", "ISO-8859-2");
            LOCALE_TO_CHARSET_MAP.put("mk", "ISO-8859-5");
            LOCALE_TO_CHARSET_MAP.put("nl", "ISO-8859-1");
            LOCALE_TO_CHARSET_MAP.put("no", "ISO-8859-1");
            LOCALE_TO_CHARSET_MAP.put("pl", "ISO-8859-2");
            LOCALE_TO_CHARSET_MAP.put("pt", "ISO-8859-1");
            LOCALE_TO_CHARSET_MAP.put("ro", "ISO-8859-2");
            LOCALE_TO_CHARSET_MAP.put("ru", "ISO-8859-5");
            LOCALE_TO_CHARSET_MAP.put("sh", "ISO-8859-5");
            LOCALE_TO_CHARSET_MAP.put("sk", "ISO-8859-2");
            LOCALE_TO_CHARSET_MAP.put("sl", "ISO-8859-2");
            LOCALE_TO_CHARSET_MAP.put("sq", "ISO-8859-2");
            LOCALE_TO_CHARSET_MAP.put("sr", "ISO-8859-5");
            LOCALE_TO_CHARSET_MAP.put("sv", "ISO-8859-1");
            LOCALE_TO_CHARSET_MAP.put("tr", "ISO-8859-9");
            LOCALE_TO_CHARSET_MAP.put("uk", "ISO-8859-5");
            LOCALE_TO_CHARSET_MAP.put("zh", "GB2312");
            LOCALE_TO_CHARSET_MAP.put("zh_TW", "Big5");
        }

        /**
         * Get the preferred charset for the given locale.
         * <p>
         * Lookup order: the full locale name as returned by
         * <code>Locale.toString()</code>, then <code>language_COUNTRY</code>
         * (so that a variant or script suffix does not hide a country-specific
         * entry such as "zh_TW"), then the language alone.
         *
         * @param locale the locale; must not be <code>null</code>
         * @return the preferred charset or null if the locale is not
         * recognized.
         */
        public static String getCharset(Locale locale) {
            // try for a full name match (may include country and variant)
            String charset =
                (String) LOCALE_TO_CHARSET_MAP.get(locale.toString());
            if (charset != null) {
                return charset;
            }

            String language = locale.getLanguage();
            String country = locale.getCountry();

            // next, ignore any variant/script and try language plus country
            if (country.length() > 0) {
                charset = (String) LOCALE_TO_CHARSET_MAP.get(
                    language + "_" + country);
                if (charset != null) {
                    return charset;
                }
            }

            // finally, try just the language
            charset = (String) LOCALE_TO_CHARSET_MAP.get(language);
            return charset;  // may be null
        }

    }

}