rt/emul/compact/src/main/java/java/net/URI.java
author Jaroslav Tulach <jaroslav.tulach@apidesign.org>
Thu, 31 Oct 2013 11:23:54 +0100
changeset 1398 9926996eca2d
parent 1259 d257b7a37635
permissions -rw-r--r--
Implementing URLConnection
     1 /*
     2  * Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4  *
     5  * This code is free software; you can redistribute it and/or modify it
     6  * under the terms of the GNU General Public License version 2 only, as
     7  * published by the Free Software Foundation.  Oracle designates this
     8  * particular file as subject to the "Classpath" exception as provided
     9  * by Oracle in the LICENSE file that accompanied this code.
    10  *
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    14  * version 2 for more details (a copy is included in the LICENSE file that
    15  * accompanied this code).
    16  *
    17  * You should have received a copy of the GNU General Public License version
    18  * 2 along with this work; if not, write to the Free Software Foundation,
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    20  *
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    22  * or visit www.oracle.com if you need additional information or have any
    23  * questions.
    24  */
    25 
    26 package java.net;
    27 
    28 import java.io.IOException;
    29 import java.io.InvalidObjectException;
    30 import java.io.ObjectInputStream;
    31 import java.io.ObjectOutputStream;
    32 import java.io.Serializable;
    33 
    34 import java.lang.Character;             // for javadoc
    35 import java.lang.NullPointerException;  // for javadoc
    36 
    37 
    38 /**
    39  * Represents a Uniform Resource Identifier (URI) reference.
    40  *
    41  * <p> Aside from some minor deviations noted below, an instance of this
    42  * class represents a URI reference as defined by
    43  * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
    44  * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a
    45  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
    46  * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format
    47  * also supports scope_ids. The syntax and usage of scope_ids is described
    48  * <a href="Inet6Address.html#scoped">here</a>.
    49  * This class provides constructors for creating URI instances from
    50  * their components or by parsing their string forms, methods for accessing the
    51  * various components of an instance, and methods for normalizing, resolving,
    52  * and relativizing URI instances.  Instances of this class are immutable.
    53  *
    54  *
    55  * <h4> URI syntax and components </h4>
    56  *
    57  * At the highest level a URI reference (hereinafter simply "URI") in string
    58  * form has the syntax
    59  *
    60  * <blockquote>
    61  * [<i>scheme</i><tt><b>:</b></tt>]<i>scheme-specific-part</i>[<tt><b>#</b></tt><i>fragment</i>]
    62  * </blockquote>
    63  *
    64  * where square brackets [...] delineate optional components and the characters
    65  * <tt><b>:</b></tt> and <tt><b>#</b></tt> stand for themselves.
    66  *
    67  * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is
    68  * said to be <i>relative</i>.  URIs are also classified according to whether
    69  * they are <i>opaque</i> or <i>hierarchical</i>.
    70  *
    71  * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does
    72  * not begin with a slash character (<tt>'/'</tt>).  Opaque URIs are not
    73  * subject to further parsing.  Some examples of opaque URIs are:
    74  *
    75  * <blockquote><table cellpadding=0 cellspacing=0 summary="layout">
    76  * <tr><td><tt>mailto:java-net@java.sun.com</tt></td></tr>
    77  * <tr><td><tt>news:comp.lang.java</tt></td></tr>
    78  * <tr><td><tt>urn:isbn:096139210x</tt></td></tr>
    79  * </table></blockquote>
    80  *
    81  * <p> A <i>hierarchical</i> URI is either an absolute URI whose
    82  * scheme-specific part begins with a slash character, or a relative URI, that
    83  * is, a URI that does not specify a scheme.  Some examples of hierarchical
    84  * URIs are:
    85  *
    86  * <blockquote>
    87  * <tt>http://java.sun.com/j2se/1.3/</tt><br>
    88  * <tt>docs/guide/collections/designfaq.html#28</tt><br>
    89  * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java</tt><br>
    90  * <tt>file:///~/calendar</tt>
    91  * </blockquote>
    92  *
    93  * <p> A hierarchical URI is subject to further parsing according to the syntax
    94  *
    95  * <blockquote>
    96  * [<i>scheme</i><tt><b>:</b></tt>][<tt><b>//</b></tt><i>authority</i>][<i>path</i>][<tt><b>?</b></tt><i>query</i>][<tt><b>#</b></tt><i>fragment</i>]
    97  * </blockquote>
    98  *
    99  * where the characters <tt><b>:</b></tt>, <tt><b>/</b></tt>,
   100  * <tt><b>?</b></tt>, and <tt><b>#</b></tt> stand for themselves.  The
   101  * scheme-specific part of a hierarchical URI consists of the characters
   102  * between the scheme and fragment components.
   103  *
   104  * <p> The authority component of a hierarchical URI is, if specified, either
   105  * <i>server-based</i> or <i>registry-based</i>.  A server-based authority
   106  * parses according to the familiar syntax
   107  *
   108  * <blockquote>
   109  * [<i>user-info</i><tt><b>@</b></tt>]<i>host</i>[<tt><b>:</b></tt><i>port</i>]
   110  * </blockquote>
   111  *
   112  * where the characters <tt><b>@</b></tt> and <tt><b>:</b></tt> stand for
   113  * themselves.  Nearly all URI schemes currently in use are server-based.  An
   114  * authority component that does not parse in this way is considered to be
   115  * registry-based.
   116  *
   117  * <p> The path component of a hierarchical URI is itself said to be absolute
   118  * if it begins with a slash character (<tt>'/'</tt>); otherwise it is
   119  * relative.  The path of a hierarchical URI that is either absolute or
   120  * specifies an authority is always absolute.
   121  *
   122  * <p> All told, then, a URI instance has the following nine components:
   123  *
   124  * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment">
   125  * <tr><th><i>Component</i></th><th><i>Type</i></th></tr>
   126  * <tr><td>scheme</td><td><tt>String</tt></td></tr>
   127  * <tr><td>scheme-specific-part&nbsp;&nbsp;&nbsp;&nbsp;</td><td><tt>String</tt></td></tr>
   128  * <tr><td>authority</td><td><tt>String</tt></td></tr>
   129  * <tr><td>user-info</td><td><tt>String</tt></td></tr>
   130  * <tr><td>host</td><td><tt>String</tt></td></tr>
   131  * <tr><td>port</td><td><tt>int</tt></td></tr>
   132  * <tr><td>path</td><td><tt>String</tt></td></tr>
   133  * <tr><td>query</td><td><tt>String</tt></td></tr>
   134  * <tr><td>fragment</td><td><tt>String</tt></td></tr>
   135  * </table></blockquote>
   136  *
   137  * In a given instance any particular component is either <i>undefined</i> or
   138  * <i>defined</i> with a distinct value.  Undefined string components are
   139  * represented by <tt>null</tt>, while undefined integer components are
   140  * represented by <tt>-1</tt>.  A string component may be defined to have the
   141  * empty string as its value; this is not equivalent to that component being
   142  * undefined.
   143  *
   144  * <p> Whether a particular component is or is not defined in an instance
   145  * depends upon the type of the URI being represented.  An absolute URI has a
   146  * scheme component.  An opaque URI has a scheme, a scheme-specific part, and
   147  * possibly a fragment, but has no other components.  A hierarchical URI always
   148  * has a path (though it may be empty) and a scheme-specific-part (which at
   149  * least contains the path), and may have any of the other components.  If the
   150  * authority component is present and is server-based then the host component
   151  * will be defined and the user-information and port components may be defined.
   152  *
   153  *
   154  * <h4> Operations on URI instances </h4>
   155  *
   156  * The key operations supported by this class are those of
   157  * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.
   158  *
   159  * <p> <i>Normalization</i> is the process of removing unnecessary <tt>"."</tt>
   160  * and <tt>".."</tt> segments from the path component of a hierarchical URI.
   161  * Each <tt>"."</tt> segment is simply removed.  A <tt>".."</tt> segment is
   162  * removed only if it is preceded by a non-<tt>".."</tt> segment.
   163  * Normalization has no effect upon opaque URIs.
   164  *
   165  * <p> <i>Resolution</i> is the process of resolving one URI against another,
   166  * <i>base</i> URI.  The resulting URI is constructed from components of both
   167  * URIs in the manner specified by RFC&nbsp;2396, taking components from the
   168  * base URI for those not specified in the original.  For hierarchical URIs,
   169  * the path of the original is resolved against the path of the base and then
   170  * normalized.  The result, for example, of resolving
   171  *
   172  * <blockquote>
   173  * <tt>docs/guide/collections/designfaq.html#28&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt>(1)
   174  * </blockquote>
   175  *
   176  * against the base URI <tt>http://java.sun.com/j2se/1.3/</tt> is the result
   177  * URI
   178  *
   179  * <blockquote>
   180  * <tt>http://java.sun.com/j2se/1.3/docs/guide/collections/designfaq.html#28</tt>
   181  * </blockquote>
   182  *
   183  * Resolving the relative URI
   184  *
   185  * <blockquote>
   186  * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java&nbsp;&nbsp;&nbsp;&nbsp;</tt>(2)
   187  * </blockquote>
   188  *
   189  * against this result yields, in turn,
   190  *
   191  * <blockquote>
   192  * <tt>http://java.sun.com/j2se/1.3/demo/jfc/SwingSet2/src/SwingSet2.java</tt>
   193  * </blockquote>
   194  *
   195  * Resolution of both absolute and relative URIs, and of both absolute and
   196  * relative paths in the case of hierarchical URIs, is supported.  Resolving
   197  * the URI <tt>file:///~/calendar</tt> against any other URI simply yields the
   198  * original URI, since it is absolute.  Resolving the relative URI (2) above
   199  * against the relative base URI (1) yields the normalized, but still relative,
   200  * URI
   201  *
   202  * <blockquote>
   203  * <tt>demo/jfc/SwingSet2/src/SwingSet2.java</tt>
   204  * </blockquote>
   205  *
   206  * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any
   207  * two normalized URIs <i>u</i> and&nbsp;<i>v</i>,
   208  *
   209  * <blockquote>
   210  *   <i>u</i><tt>.relativize(</tt><i>u</i><tt>.resolve(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt>&nbsp;&nbsp;and<br>
   211  *   <i>u</i><tt>.resolve(</tt><i>u</i><tt>.relativize(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt>&nbsp;&nbsp;.<br>
   212  * </blockquote>
   213  *
   214  * This operation is often useful when constructing a document containing URIs
   215  * that must be made relative to the base URI of the document wherever
   216  * possible.  For example, relativizing the URI
   217  *
   218  * <blockquote>
   219  * <tt>http://java.sun.com/j2se/1.3/docs/guide/index.html</tt>
   220  * </blockquote>
   221  *
   222  * against the base URI
   223  *
   224  * <blockquote>
   225  * <tt>http://java.sun.com/j2se/1.3</tt>
   226  * </blockquote>
   227  *
   228  * yields the relative URI <tt>docs/guide/index.html</tt>.
   229  *
   230  *
   231  * <h4> Character categories </h4>
   232  *
   233  * RFC&nbsp;2396 specifies precisely which characters are permitted in the
   234  * various components of a URI reference.  The following categories, most of
   235  * which are taken from that specification, are used below to describe these
   236  * constraints:
   237  *
   238  * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other">
   239  *   <tr><th valign=top><i>alpha</i></th>
   240  *       <td>The US-ASCII alphabetic characters,
   241  *        <tt>'A'</tt>&nbsp;through&nbsp;<tt>'Z'</tt>
   242  *        and <tt>'a'</tt>&nbsp;through&nbsp;<tt>'z'</tt></td></tr>
   243  *   <tr><th valign=top><i>digit</i></th>
   244  *       <td>The US-ASCII decimal digit characters,
   245  *       <tt>'0'</tt>&nbsp;through&nbsp;<tt>'9'</tt></td></tr>
   246  *   <tr><th valign=top><i>alphanum</i></th>
   247  *       <td>All <i>alpha</i> and <i>digit</i> characters</td></tr>
   248  *   <tr><th valign=top><i>unreserved</i>&nbsp;&nbsp;&nbsp;&nbsp;</th>
   249  *       <td>All <i>alphanum</i> characters together with those in the string
   250  *        <tt>"_-!.~'()*"</tt></td></tr>
   251  *   <tr><th valign=top><i>punct</i></th>
   252  *       <td>The characters in the string <tt>",;:$&amp;+="</tt></td></tr>
   253  *   <tr><th valign=top><i>reserved</i></th>
   254  *       <td>All <i>punct</i> characters together with those in the string
   255  *        <tt>"?/[]@"</tt></td></tr>
   256  *   <tr><th valign=top><i>escaped</i></th>
   257  *       <td>Escaped octets, that is, triplets consisting of the percent
   258  *           character (<tt>'%'</tt>) followed by two hexadecimal digits
   259  *           (<tt>'0'</tt>-<tt>'9'</tt>, <tt>'A'</tt>-<tt>'F'</tt>, and
   260  *           <tt>'a'</tt>-<tt>'f'</tt>)</td></tr>
   261  *   <tr><th valign=top><i>other</i></th>
   262  *       <td>The Unicode characters that are not in the US-ASCII character set,
   263  *           are not control characters (according to the {@link
   264  *           java.lang.Character#isISOControl(char) Character.isISOControl}
   265  *           method), and are not space characters (according to the {@link
   266  *           java.lang.Character#isSpaceChar(char) Character.isSpaceChar}
   267  *           method)&nbsp;&nbsp;<i>(<b>Deviation from RFC 2396</b>, which is
   268  *           limited to US-ASCII)</i></td></tr>
   269  * </table></blockquote>
   270  *
   271  * <p><a name="legal-chars"></a> The set of all legal URI characters consists of
   272  * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i>
   273  * characters.
   274  *
   275  *
   276  * <h4> Escaped octets, quotation, encoding, and decoding </h4>
   277  *
   278  * RFC 2396 allows escaped octets to appear in the user-info, path, query, and
   279  * fragment components.  Escaping serves two purposes in URIs:
   280  *
   281  * <ul>
   282  *
   283  *   <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to
   284  *   conform strictly to RFC&nbsp;2396 by not containing any <i>other</i>
   285  *   characters.  </p></li>
   286  *
   287  *   <li><p> To <i>quote</i> characters that are otherwise illegal in a
   288  *   component.  The user-info, path, query, and fragment components differ
   289  *   slightly in terms of which characters are considered legal and illegal.
   290  *   </p></li>
   291  *
   292  * </ul>
   293  *
   294  * These purposes are served in this class by three related operations:
   295  *
   296  * <ul>
   297  *
   298  *   <li><p><a name="encode"></a> A character is <i>encoded</i> by replacing it
   299  *   with the sequence of escaped octets that represent that character in the
   300  *   UTF-8 character set.  The Euro currency symbol (<tt>'&#92;u20AC'</tt>),
   301  *   for example, is encoded as <tt>"%E2%82%AC"</tt>.  <i>(<b>Deviation from
   302  *   RFC&nbsp;2396</b>, which does not specify any particular character
   303  *   set.)</i> </p></li>
   304  *
   305  *   <li><p><a name="quote"></a> An illegal character is <i>quoted</i> simply by
   306  *   encoding it.  The space character, for example, is quoted by replacing it
   307  *   with <tt>"%20"</tt>.  UTF-8 contains US-ASCII, hence for US-ASCII
   308  *   characters this transformation has exactly the effect required by
   309  *   RFC&nbsp;2396. </p></li>
   310  *
   311  *   <li><p><a name="decode"></a>
   312  *   A sequence of escaped octets is <i>decoded</i> by
   313  *   replacing it with the sequence of characters that it represents in the
   314  *   UTF-8 character set.  UTF-8 contains US-ASCII, hence decoding has the
   315  *   effect of de-quoting any quoted US-ASCII characters as well as that of
   316  *   decoding any encoded non-US-ASCII characters.  If a <a
   317  *   href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs
   318  *   when decoding the escaped octets then the erroneous octets are replaced by
   319  *   <tt>'&#92;uFFFD'</tt>, the Unicode replacement character.  </p></li>
   320  *
   321  * </ul>
   322  *
   323  * These operations are exposed in the constructors and methods of this class
   324  * as follows:
   325  *
   326  * <ul>
   327  *
   328  *   <li><p> The {@link #URI(java.lang.String) <code>single-argument
   329  *   constructor</code>} requires any illegal characters in its argument to be
   330  *   quoted and preserves any escaped octets and <i>other</i> characters that
   331  *   are present.  </p></li>
   332  *
   333  *   <li><p> The {@link
   334  *   #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String)
   335  *   <code>multi-argument constructors</code>} quote illegal characters as
   336  *   required by the components in which they appear.  The percent character
   337  *   (<tt>'%'</tt>) is always quoted by these constructors.  Any <i>other</i>
   338  *   characters are preserved.  </p></li>
   339  *
   340  *   <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath()
   341  *   getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment()
   342  *   getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link
   343  *   #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the
   344  *   values of their corresponding components in raw form, without interpreting
   345  *   any escaped octets.  The strings returned by these methods may contain
   346  *   both escaped octets and <i>other</i> characters, and will not contain any
   347  *   illegal characters.  </p></li>
   348  *
   349  *   <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath()
   350  *   getPath}, {@link #getQuery() getQuery}, {@link #getFragment()
   351  *   getFragment}, {@link #getAuthority() getAuthority}, and {@link
   352  *   #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped
   353  *   octets in their corresponding components.  The strings returned by these
   354  *   methods may contain both <i>other</i> characters and illegal characters,
   355  *   and will not contain any escaped octets.  </p></li>
   356  *
   357  *   <li><p> The {@link #toString() toString} method returns a URI string with
   358  *   all necessary quotation but which may contain <i>other</i> characters.
   359  *   </p></li>
   360  *
   361  *   <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully
   362  *   quoted and encoded URI string that does not contain any <i>other</i>
   363  *   characters.  </p></li>
   364  *
   365  * </ul>
   366  *
   367  *
   368  * <h4> Identities </h4>
   369  *
   370  * For any URI <i>u</i>, it is always the case that
   371  *
   372  * <blockquote>
   373  * <tt>new URI(</tt><i>u</i><tt>.toString()).equals(</tt><i>u</i><tt>)</tt>&nbsp;.
   374  * </blockquote>
   375  *
   376  * For any URI <i>u</i> that does not contain redundant syntax such as two
   377  * slashes before an empty authority (as in <tt>file:///tmp/</tt>&nbsp;) or a
   378  * colon following a host name but no port (as in
   379  * <tt>http://java.sun.com:</tt>&nbsp;), and that does not encode characters
   380  * except those that must be quoted, the following identities also hold:
   381  *
   382  * <blockquote>
   383  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
   384  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getSchemeSpecificPart(),<br>
   385  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>
   386  * .equals(</tt><i>u</i><tt>)</tt>
   387  * </blockquote>
   388  *
   389  * in all cases,
   390  *
   391  * <blockquote>
   392  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
   393  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getAuthority(),<br>
   394  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getPath(),&nbsp;</tt><i>u</i><tt>.getQuery(),<br>
   395  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>
   396  * .equals(</tt><i>u</i><tt>)</tt>
   397  * </blockquote>
   398  *
   399  * if <i>u</i> is hierarchical, and
   400  *
   401  * <blockquote>
   402  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
   403  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getUserInfo(),&nbsp;</tt><i>u</i><tt>.getHost(),&nbsp;</tt><i>u</i><tt>.getPort(),<br>
   404  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getPath(),&nbsp;</tt><i>u</i><tt>.getQuery(),<br>
   405  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>
   406  * .equals(</tt><i>u</i><tt>)</tt>
   407  * </blockquote>
   408  *
   409  * if <i>u</i> is hierarchical and has either no authority or a server-based
   410  * authority.
   411  *
   412  *
   413  * <h4> URIs, URLs, and URNs </h4>
   414  *
   415  * A URI is a uniform resource <i>identifier</i> while a URL is a uniform
   416  * resource <i>locator</i>.  Hence every URL is a URI, abstractly speaking, but
   417  * not every URI is a URL.  This is because there is another subcategory of
   418  * URIs, uniform resource <i>names</i> (URNs), which name resources but do not
   419  * specify how to locate them.  The <tt>mailto</tt>, <tt>news</tt>, and
   420  * <tt>isbn</tt> URIs shown above are examples of URNs.
   421  *
   422  * <p> The conceptual distinction between URIs and URLs is reflected in the
   423  * differences between this class and the {@link URL} class.
   424  *
   425  * <p> An instance of this class represents a URI reference in the syntactic
   426  * sense defined by RFC&nbsp;2396.  A URI may be either absolute or relative.
   427  * A URI string is parsed according to the generic syntax without regard to the
   428  * scheme, if any, that it specifies.  No lookup of the host, if any, is
   429  * performed, and no scheme-dependent stream handler is constructed.  Equality,
   430  * hashing, and comparison are defined strictly in terms of the character
   431  * content of the instance.  In other words, a URI instance is little more than
   432  * a structured string that supports the syntactic, scheme-independent
   433  * operations of comparison, normalization, resolution, and relativization.
   434  *
   435  * <p> An instance of the {@link URL} class, by contrast, represents the
   436  * syntactic components of a URL together with some of the information required
   437  * to access the resource that it describes.  A URL must be absolute, that is,
   438  * it must always specify a scheme.  A URL string is parsed according to its
   439  * scheme.  A stream handler is always established for a URL, and in fact it is
   440  * impossible to create a URL instance for a scheme for which no handler is
   441  * available.  Equality and hashing depend upon both the scheme and the
   442  * Internet address of the host, if any; comparison is not defined.  In other
   443  * words, a URL is a structured string that supports the syntactic operation of
   444  * resolution as well as the network I/O operations of looking up the host and
   445  * opening a connection to the specified resource.
   446  *
   447  *
   448  * @author Mark Reinhold
   449  * @since 1.4
   450  *
   451  * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC&nbsp;2279: UTF-8, a
   452  * transformation format of ISO 10646</i></a>, <br><a
   453  * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6 Addressing
   454  * Architecture</i></a>, <br><a
   455  * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
   456  * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a
   457  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
   458  * Literal IPv6 Addresses in URLs</i></a>, <br><a
   459  * href="URISyntaxException.html">URISyntaxException</a>
   460  */
   461 
   462 public final class URI
   463     implements Comparable<URI>, Serializable
   464 {
   465 
   466     // Note: Comments containing the word "ASSERT" indicate places where a
   467     // throw of an InternalError should be replaced by an appropriate assertion
   468     // statement once asserts are enabled in the build.
   469 
   470     static final long serialVersionUID = -6052424284110960213L;
   471 
   472 
   473     // -- Properties and components of this instance --
   474 
   475     // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]
   476     private transient String scheme;            // null ==> relative URI
   477     private transient String fragment;          // null ==> undefined
   478 
   479     // Hierarchical URI components: [//<authority>]<path>[?<query>]
   480     private transient String authority;         // Registry or server
   481 
   482     // Server-based authority: [<userInfo>@]<host>[:<port>]
   483     private transient String userInfo;          // null ==> undefined
   484     private transient String host;              // null ==> registry-based
   485     private transient int port = -1;            // -1 ==> undefined
   486 
   487     // Remaining components of hierarchical URIs
   488     private transient String path;              // null ==> opaque
   489     private transient String query;             // null ==> undefined
   490 
   491     // The remaining fields may be computed on demand
   492 
   493     private volatile transient String schemeSpecificPart;
   494     private volatile transient int hash;        // Zero ==> undefined
   495     // NOTE(review): presumably caches of the decoded component strings; null until computed -- confirm
   496     private volatile transient String decodedUserInfo = null;
   497     private volatile transient String decodedAuthority = null;
   498     private volatile transient String decodedPath = null;
   499     private volatile transient String decodedQuery = null;
   500     private volatile transient String decodedFragment = null;
   501     private volatile transient String decodedSchemeSpecificPart = null;
   502 
   503     /**
   504      * The string form of this URI.
   505      *
   506      * @serial
   507      */
   508     private volatile String string;             // The only serializable field
   509 
   510 
   511 
   512     // -- Constructors and factories --
   513 
   514     private URI() { }                           // Used internally; assigns no fields, so every component starts undefined
   515 
   516     /**
   517      * Constructs a URI by parsing the given string.
   518      *
   519      * <p> This constructor parses the given string exactly as specified by the
   520      * grammar in <a
   521      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
   522      * Appendix&nbsp;A, <b><i>except for the following deviations:</i></b> </p>
   523      *
   524      * <ul type=disc>
   525      *
   526      *   <li><p> An empty authority component is permitted as long as it is
   527      *   followed by a non-empty path, a query component, or a fragment
   528      *   component.  This allows the parsing of URIs such as
   529      *   <tt>"file:///foo/bar"</tt>, which seems to be the intent of
   530      *   RFC&nbsp;2396 although the grammar does not permit it.  If the
   531      *   authority component is empty then the user-information, host, and port
   532      *   components are undefined. </p></li>
   533      *
   534      *   <li><p> Empty relative paths are permitted; this seems to be the
   535      *   intent of RFC&nbsp;2396 although the grammar does not permit it.  The
   536      *   primary consequence of this deviation is that a standalone fragment
   537      *   such as <tt>"#foo"</tt> parses as a relative URI with an empty path
   538      *   and the given fragment, and can be usefully <a
   539      *   href="#resolve-frag">resolved</a> against a base URI. </p></li>
   540      *
   541      *   <li><p> IPv4 addresses in host components are parsed rigorously, as
   542      *   specified by <a
   543      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>: Each
   544      *   element of a dotted-quad address must contain no more than three
   545      *   decimal digits.  Each element is further constrained to have a value
   546      *   no greater than 255. </p></li>
   547      *
   548      *   <li> <p> Hostnames in host components that comprise only a single
   549      *   domain label are permitted to start with an <i>alphanum</i>
   550      *   character. This seems to be the intent of <a
   551      *   href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
   552      *   section&nbsp;3.2.2 although the grammar does not permit it. The
   553      *   consequence of this deviation is that the authority component of a
   554      *   hierarchical URI such as <tt>s://123</tt>, will parse as a server-based
   555      *   authority. </p></li>
   556      *
   557      *   <li><p> IPv6 addresses are permitted for the host component.  An IPv6
   558      *   address must be enclosed in square brackets (<tt>'['</tt> and
   559      *   <tt>']'</tt>) as specified by <a
   560      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>.  The
   561      *   IPv6 address itself must parse according to <a
   562      *   href="http://www.ietf.org/rfc/rfc2373.txt">RFC&nbsp;2373</a>.  IPv6
   563      *   addresses are further constrained to describe no more than sixteen
   564      *   bytes of address information, a constraint implicit in RFC&nbsp;2373
   565      *   but not expressible in the grammar. </p></li>
   566      *
   567      *   <li><p> Characters in the <i>other</i> category are permitted wherever
   568      *   RFC&nbsp;2396 permits <i>escaped</i> octets, that is, in the
   569      *   user-information, path, query, and fragment components, as well as in
   570      *   the authority component if the authority is registry-based.  This
   571      *   allows URIs to contain Unicode characters beyond those in the US-ASCII
   572      *   character set. </p></li>
   573      *
   574      * </ul>
   575      *
   576      * @param  str   The string to be parsed into a URI
   577      *
   578      * @throws  NullPointerException
   579      *          If <tt>str</tt> is <tt>null</tt>
   580      *
   581      * @throws  URISyntaxException
   582      *          If the given string violates RFC&nbsp;2396, as augmented
   583      *          by the above deviations
   584      */
   585     public URI(String str) throws URISyntaxException {   // assigns no fields itself; parsing is delegated to Parser
   586         new Parser(str).parse(false);   // NOTE(review): meaning of the 'false' flag is defined by Parser.parse, not visible here -- confirm
   587     }
   588 
   589     /**
   590      * Constructs a hierarchical URI from the given components.
   591      *
   592      * <p> If a scheme is given then the path, if also given, must either be
   593      * empty or begin with a slash character (<tt>'/'</tt>).  Otherwise a
   594      * component of the new URI may be left undefined by passing <tt>null</tt>
   595      * for the corresponding parameter or, in the case of the <tt>port</tt>
   596      * parameter, by passing <tt>-1</tt>.
   597      *
   598      * <p> This constructor first builds a URI string from the given components
   599      * according to the rules specified in <a
   600      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
   601      * section&nbsp;5.2, step&nbsp;7: </p>
   602      *
   603      * <ol>
   604      *
   605      *   <li><p> Initially, the result string is empty. </p></li>
   606      *
   607      *   <li><p> If a scheme is given then it is appended to the result,
   608      *   followed by a colon character (<tt>':'</tt>).  </p></li>
   609      *
   610      *   <li><p> If user information, a host, or a port are given then the
   611      *   string <tt>"//"</tt> is appended.  </p></li>
   612      *
   613      *   <li><p> If user information is given then it is appended, followed by
   614      *   a commercial-at character (<tt>'@'</tt>).  Any character not in the
   615      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
   616      *   categories is <a href="#quote">quoted</a>.  </p></li>
   617      *
   618      *   <li><p> If a host is given then it is appended.  If the host is a
   619      *   literal IPv6 address but is not enclosed in square brackets
   620      *   (<tt>'['</tt> and <tt>']'</tt>) then the square brackets are added.
   621      *   </p></li>
   622      *
   623      *   <li><p> If a port number is given then a colon character
   624      *   (<tt>':'</tt>) is appended, followed by the port number in decimal.
   625      *   </p></li>
   626      *
   627      *   <li><p> If a path is given then it is appended.  Any character not in
   628      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
   629      *   categories, and not equal to the slash character (<tt>'/'</tt>) or the
   630      *   commercial-at character (<tt>'@'</tt>), is quoted.  </p></li>
   631      *
   632      *   <li><p> If a query is given then a question-mark character
   633      *   (<tt>'?'</tt>) is appended, followed by the query.  Any character that
   634      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
   635      *   </p></li>
   636      *
   637      *   <li><p> Finally, if a fragment is given then a hash character
   638      *   (<tt>'#'</tt>) is appended, followed by the fragment.  Any character
   639      *   that is not a legal URI character is quoted.  </p></li>
   640      *
   641      * </ol>
   642      *
   643      * <p> The resulting URI string is then parsed as if by invoking the {@link
   644      * #URI(String)} constructor and then invoking the {@link
   645      * #parseServerAuthority()} method upon the result; this may cause a {@link
   646      * URISyntaxException} to be thrown.  </p>
   647      *
   648      * @param   scheme    Scheme name
   649      * @param   userInfo  User name and authorization information
   650      * @param   host      Host name
   651      * @param   port      Port number
   652      * @param   path      Path
   653      * @param   query     Query
   654      * @param   fragment  Fragment
   655      *
   656      * @throws URISyntaxException
   657      *         If both a scheme and a path are given but the path is relative,
   658      *         if the URI string constructed from the given components violates
   659      *         RFC&nbsp;2396, or if the authority component of the string is
   660      *         present but cannot be parsed as a server-based authority
   661      */
   662     public URI(String scheme,
   663                String userInfo, String host, int port,
   664                String path, String query, String fragment)
   665         throws URISyntaxException
   666     {
   667         String s = toString(scheme, null,
   668                             null, userInfo, host, port,
   669                             path, query, fragment);
   670         checkPath(s, scheme, path);
   671         new Parser(s).parse(true);
   672     }
   673 
   674     /**
   675      * Constructs a hierarchical URI from the given components.
   676      *
   677      * <p> If a scheme is given then the path, if also given, must either be
   678      * empty or begin with a slash character (<tt>'/'</tt>).  Otherwise a
   679      * component of the new URI may be left undefined by passing <tt>null</tt>
   680      * for the corresponding parameter.
   681      *
   682      * <p> This constructor first builds a URI string from the given components
   683      * according to the rules specified in <a
   684      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
   685      * section&nbsp;5.2, step&nbsp;7: </p>
   686      *
   687      * <ol>
   688      *
   689      *   <li><p> Initially, the result string is empty.  </p></li>
   690      *
   691      *   <li><p> If a scheme is given then it is appended to the result,
   692      *   followed by a colon character (<tt>':'</tt>).  </p></li>
   693      *
   694      *   <li><p> If an authority is given then the string <tt>"//"</tt> is
   695      *   appended, followed by the authority.  If the authority contains a
   696      *   literal IPv6 address then the address must be enclosed in square
   697      *   brackets (<tt>'['</tt> and <tt>']'</tt>).  Any character not in the
   698      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
   699      *   categories, and not equal to the commercial-at character
   700      *   (<tt>'@'</tt>), is <a href="#quote">quoted</a>.  </p></li>
   701      *
   702      *   <li><p> If a path is given then it is appended.  Any character not in
   703      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
   704      *   categories, and not equal to the slash character (<tt>'/'</tt>) or the
   705      *   commercial-at character (<tt>'@'</tt>), is quoted.  </p></li>
   706      *
   707      *   <li><p> If a query is given then a question-mark character
   708      *   (<tt>'?'</tt>) is appended, followed by the query.  Any character that
   709      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
   710      *   </p></li>
   711      *
   712      *   <li><p> Finally, if a fragment is given then a hash character
   713      *   (<tt>'#'</tt>) is appended, followed by the fragment.  Any character
   714      *   that is not a legal URI character is quoted.  </p></li>
   715      *
   716      * </ol>
   717      *
   718      * <p> The resulting URI string is then parsed as if by invoking the {@link
   719      * #URI(String)} constructor and then invoking the {@link
   720      * #parseServerAuthority()} method upon the result; this may cause a {@link
   721      * URISyntaxException} to be thrown.  </p>
   722      *
   723      * @param   scheme     Scheme name
   724      * @param   authority  Authority
   725      * @param   path       Path
   726      * @param   query      Query
   727      * @param   fragment   Fragment
   728      *
   729      * @throws URISyntaxException
   730      *         If both a scheme and a path are given but the path is relative,
   731      *         if the URI string constructed from the given components violates
   732      *         RFC&nbsp;2396, or if the authority component of the string is
   733      *         present but cannot be parsed as a server-based authority
   734      */
   735     public URI(String scheme,
   736                String authority,
   737                String path, String query, String fragment)
   738         throws URISyntaxException
   739     {
   740         String s = toString(scheme, null,
   741                             authority, null, null, -1,
   742                             path, query, fragment);
   743         checkPath(s, scheme, path);
   744         new Parser(s).parse(false);
   745     }
   746 
   747     /**
   748      * Constructs a hierarchical URI from the given components.
   749      *
   750      * <p> A component may be left undefined by passing <tt>null</tt>.
   751      *
   752      * <p> This convenience constructor works as if by invoking the
   753      * seven-argument constructor as follows:
   754      *
   755      * <blockquote><tt>
   756      * new&nbsp;{@link #URI(String, String, String, int, String, String, String)
   757      * URI}(scheme,&nbsp;null,&nbsp;host,&nbsp;-1,&nbsp;path,&nbsp;null,&nbsp;fragment);
   758      * </tt></blockquote>
   759      *
   760      * @param   scheme    Scheme name
   761      * @param   host      Host name
   762      * @param   path      Path
   763      * @param   fragment  Fragment
   764      *
   765      * @throws  URISyntaxException
   766      *          If the URI string constructed from the given components
   767      *          violates RFC&nbsp;2396
   768      */
   769     public URI(String scheme, String host, String path, String fragment)
   770         throws URISyntaxException
   771     {
   772         this(scheme, null, host, -1, path, null, fragment);
   773     }
   774 
   775     /**
   776      * Constructs a URI from the given components.
   777      *
   778      * <p> A component may be left undefined by passing <tt>null</tt>.
   779      *
   780      * <p> This constructor first builds a URI in string form using the given
   781      * components as follows:  </p>
   782      *
   783      * <ol>
   784      *
   785      *   <li><p> Initially, the result string is empty.  </p></li>
   786      *
   787      *   <li><p> If a scheme is given then it is appended to the result,
   788      *   followed by a colon character (<tt>':'</tt>).  </p></li>
   789      *
   790      *   <li><p> If a scheme-specific part is given then it is appended.  Any
   791      *   character that is not a <a href="#legal-chars">legal URI character</a>
   792      *   is <a href="#quote">quoted</a>.  </p></li>
   793      *
   794      *   <li><p> Finally, if a fragment is given then a hash character
   795      *   (<tt>'#'</tt>) is appended to the string, followed by the fragment.
   796      *   Any character that is not a legal URI character is quoted.  </p></li>
   797      *
   798      * </ol>
   799      *
   800      * <p> The resulting URI string is then parsed in order to create the new
   801      * URI instance as if by invoking the {@link #URI(String)} constructor;
   802      * this may cause a {@link URISyntaxException} to be thrown.  </p>
   803      *
   804      * @param   scheme    Scheme name
   805      * @param   ssp       Scheme-specific part
   806      * @param   fragment  Fragment
   807      *
   808      * @throws  URISyntaxException
   809      *          If the URI string constructed from the given components
   810      *          violates RFC&nbsp;2396
   811      */
   812     public URI(String scheme, String ssp, String fragment)
   813         throws URISyntaxException
   814     {
   815         new Parser(toString(scheme, ssp,
   816                             null, null, null, -1,
   817                             null, null, fragment))
   818             .parse(false);
   819     }
   820 
   821     /**
   822      * Creates a URI by parsing the given string.
   823      *
   824      * <p> This convenience factory method works as if by invoking the {@link
   825      * #URI(String)} constructor; any {@link URISyntaxException} thrown by the
   826      * constructor is caught and wrapped in a new {@link
   827      * IllegalArgumentException} object, which is then thrown.
   828      *
   829      * <p> This method is provided for use in situations where it is known that
   830      * the given string is a legal URI, for example for URI constants declared
   831      * within a program, and so it would be considered a programming error
   832      * for the string not to parse as such.  The constructors, which throw
   833      * {@link URISyntaxException} directly, should be used in situations where a
   834      * URI is being constructed from user input or from some other source that
   835      * may be prone to errors.  </p>
   836      *
   837      * @param  str   The string to be parsed into a URI
   838      * @return The new URI
   839      *
   840      * @throws  NullPointerException
   841      *          If <tt>str</tt> is <tt>null</tt>
   842      *
   843      * @throws  IllegalArgumentException
   844      *          If the given string violates RFC&nbsp;2396
   845      */
   846     public static URI create(String str) {
   847         try {
   848             return new URI(str);
   849         } catch (URISyntaxException x) {
   850             throw new IllegalArgumentException(x.getMessage(), x);
   851         }
   852     }
   853 
   854 
   855     // -- Operations --
   856 
   857     /**
   858      * Attempts to parse this URI's authority component, if defined, into
   859      * user-information, host, and port components.
   860      *
   861      * <p> If this URI's authority component has already been recognized as
   862      * being server-based then it will already have been parsed into
   863      * user-information, host, and port components.  In this case, or if this
   864      * URI has no authority component, this method simply returns this URI.
   865      *
   866      * <p> Otherwise this method attempts once more to parse the authority
   867      * component into user-information, host, and port components, and throws
   868      * an exception describing why the authority component could not be parsed
   869      * in that way.
   870      *
   871      * <p> This method is provided because the generic URI syntax specified in
   872      * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
   873      * cannot always distinguish a malformed server-based authority from a
   874      * legitimate registry-based authority.  It must therefore treat some
   875      * instances of the former as instances of the latter.  The authority
   876      * component in the URI string <tt>"//foo:bar"</tt>, for example, is not a
   877      * legal server-based authority but it is legal as a registry-based
   878      * authority.
   879      *
   880      * <p> In many common situations, for example when working with URIs that are
   881      * known to be either URNs or URLs, the hierarchical URIs being used will
   882      * always be server-based.  They therefore must either be parsed as such or
   883      * treated as an error.  In these cases a statement such as
   884      *
   885      * <blockquote>
   886      * <tt>URI </tt><i>u</i><tt> = new URI(str).parseServerAuthority();</tt>
   887      * </blockquote>
   888      *
   889      * <p> can be used to ensure that <i>u</i> always refers to a URI that, if
   890      * it has an authority component, has a server-based authority with proper
   891      * user-information, host, and port components.  Invoking this method also
   892      * ensures that if the authority could not be parsed in that way then an
   893      * appropriate diagnostic message can be issued based upon the exception
   894      * that is thrown. </p>
   895      *
   896      * @return  A URI whose authority field has been parsed
   897      *          as a server-based authority
   898      *
   899      * @throws  URISyntaxException
   900      *          If the authority component of this URI is defined
   901      *          but cannot be parsed as a server-based authority
   902      *          according to RFC&nbsp;2396
   903      */
   904     public URI parseServerAuthority()
   905         throws URISyntaxException
   906     {
   907         // We could be clever and cache the error message and index from the
   908         // exception thrown during the original parse, but that would require
   909         // either more fields or a more-obscure representation.
   910         if ((host != null) || (authority == null))
   911             return this;
   912         defineString();
   913         new Parser(string).parse(true);
   914         return this;
   915     }
   916 
   917     /**
   918      * Normalizes this URI's path.
   919      *
   920      * <p> If this URI is opaque, or if its path is already in normal form,
   921      * then this URI is returned.  Otherwise a new URI is constructed that is
   922      * identical to this URI except that its path is computed by normalizing
   923      * this URI's path in a manner consistent with <a
   924      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
   925      * section&nbsp;5.2, step&nbsp;6, sub-steps&nbsp;c through&nbsp;f; that is:
   926      * </p>
   927      *
   928      * <ol>
   929      *
   930      *   <li><p> All <tt>"."</tt> segments are removed. </p></li>
   931      *
   932      *   <li><p> If a <tt>".."</tt> segment is preceded by a non-<tt>".."</tt>
   933      *   segment then both of these segments are removed.  This step is
   934      *   repeated until it is no longer applicable. </p></li>
   935      *
   936      *   <li><p> If the path is relative, and if its first segment contains a
   937      *   colon character (<tt>':'</tt>), then a <tt>"."</tt> segment is
   938      *   prepended.  This prevents a relative URI with a path such as
   939      *   <tt>"a:b/c/d"</tt> from later being re-parsed as an opaque URI with a
   940      *   scheme of <tt>"a"</tt> and a scheme-specific part of <tt>"b/c/d"</tt>.
   941      *   <b><i>(Deviation from RFC&nbsp;2396)</i></b> </p></li>
   942      *
   943      * </ol>
   944      *
   945      * <p> A normalized path will begin with one or more <tt>".."</tt> segments
   946      * if there were insufficient non-<tt>".."</tt> segments preceding them to
   947      * allow their removal.  A normalized path will begin with a <tt>"."</tt>
   948      * segment if one was inserted by step 3 above.  Otherwise, a normalized
   949      * path will not contain any <tt>"."</tt> or <tt>".."</tt> segments. </p>
   950      *
   951      * @return  A URI equivalent to this URI,
   952      *          but whose path is in normal form
   953      */
   954     public URI normalize() {
   955         return normalize(this);
   956     }
   957 
   958     /**
   959      * Resolves the given URI against this URI.
   960      *
   961      * <p> If the given URI is already absolute, or if this URI is opaque, then
   962      * the given URI is returned.
   963      *
   964      * <p><a name="resolve-frag"></a> If the given URI's fragment component is
   965      * defined, its path component is empty, and its scheme, authority, and
   966      * query components are undefined, then a URI with the given fragment but
   967      * with all other components equal to those of this URI is returned.  This
   968      * allows a URI representing a standalone fragment reference, such as
   969      * <tt>"#foo"</tt>, to be usefully resolved against a base URI.
   970      *
   971      * <p> Otherwise this method constructs a new hierarchical URI in a manner
   972      * consistent with <a
   973      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
   974      * section&nbsp;5.2; that is: </p>
   975      *
   976      * <ol>
   977      *
   978      *   <li><p> A new URI is constructed with this URI's scheme and the given
   979      *   URI's query and fragment components. </p></li>
   980      *
   981      *   <li><p> If the given URI has an authority component then the new URI's
   982      *   authority and path are taken from the given URI. </p></li>
   983      *
   984      *   <li><p> Otherwise the new URI's authority component is copied from
   985      *   this URI, and its path is computed as follows: </p>
   986      *
   987      *   <ol type=a>
   988      *
   989      *     <li><p> If the given URI's path is absolute then the new URI's path
   990      *     is taken from the given URI. </p></li>
   991      *
   992      *     <li><p> Otherwise the given URI's path is relative, and so the new
   993      *     URI's path is computed by resolving the path of the given URI
   994      *     against the path of this URI.  This is done by concatenating all but
   995      *     the last segment of this URI's path, if any, with the given URI's
   996      *     path and then normalizing the result as if by invoking the {@link
   997      *     #normalize() normalize} method. </p></li>
   998      *
   999      *   </ol></li>
  1000      *
  1001      * </ol>
  1002      *
  1003      * <p> The result of this method is absolute if, and only if, either this
  1004      * URI is absolute or the given URI is absolute.  </p>
  1005      *
  1006      * @param  uri  The URI to be resolved against this URI
  1007      * @return The resulting URI
  1008      *
  1009      * @throws  NullPointerException
  1010      *          If <tt>uri</tt> is <tt>null</tt>
  1011      */
  1012     public URI resolve(URI uri) {
  1013         return resolve(this, uri);
  1014     }
  1015 
  1016     /**
  1017      * Constructs a new URI by parsing the given string and then resolving it
  1018      * against this URI.
  1019      *
  1020      * <p> This convenience method works as if invoking it were equivalent to
  1021      * evaluating the expression <tt>{@link #resolve(java.net.URI)
  1022      * resolve}(URI.{@link #create(String) create}(str))</tt>. </p>
  1023      *
  1024      * @param  str   The string to be parsed into a URI
  1025      * @return The resulting URI
  1026      *
  1027      * @throws  NullPointerException
  1028      *          If <tt>str</tt> is <tt>null</tt>
  1029      *
  1030      * @throws  IllegalArgumentException
  1031      *          If the given string violates RFC&nbsp;2396
  1032      */
  1033     public URI resolve(String str) {
  1034         return resolve(URI.create(str));
  1035     }
  1036 
  1037     /**
  1038      * Relativizes the given URI against this URI.
  1039      *
  1040      * <p> The relativization of the given URI against this URI is computed as
  1041      * follows: </p>
  1042      *
  1043      * <ol>
  1044      *
  1045      *   <li><p> If either this URI or the given URI are opaque, or if the
  1046      *   scheme and authority components of the two URIs are not identical, or
  1047      *   if the path of this URI is not a prefix of the path of the given URI,
  1048      *   then the given URI is returned. </p></li>
  1049      *
  1050      *   <li><p> Otherwise a new relative hierarchical URI is constructed with
  1051      *   query and fragment components taken from the given URI and with a path
  1052      *   component computed by removing this URI's path from the beginning of
  1053      *   the given URI's path. </p></li>
  1054      *
  1055      * </ol>
  1056      *
  1057      * @param  uri  The URI to be relativized against this URI
  1058      * @return The resulting URI
  1059      *
  1060      * @throws  NullPointerException
  1061      *          If <tt>uri</tt> is <tt>null</tt>
  1062      */
  1063     public URI relativize(URI uri) {
  1064         return relativize(this, uri);
  1065     }
  1066 
  1067     /**
  1068      * Constructs a URL from this URI.
  1069      *
  1070      * <p> This convenience method works as if invoking it were equivalent to
  1071      * evaluating the expression <tt>new&nbsp;URL(this.toString())</tt> after
  1072      * first checking that this URI is absolute. </p>
  1073      *
  1074      * @return  A URL constructed from this URI
  1075      *
  1076      * @throws  IllegalArgumentException
  1077      *          If this URI is not absolute
  1078      *
  1079      * @throws  MalformedURLException
  1080      *          If a protocol handler for the URL could not be found,
  1081      *          or if some other error occurred while constructing the URL
  1082      */
  1083     public URL toURL()
  1084         throws MalformedURLException {
  1085         if (!isAbsolute())
  1086             throw new IllegalArgumentException("URI is not absolute");
  1087         return new URL(toString());
  1088     }
  1089 
  1090     // -- Component access methods --
  1091 
  1092     /**
  1093      * Returns the scheme component of this URI.
  1094      *
  1095      * <p> The scheme component of a URI, if defined, only contains characters
  1096      * in the <i>alphanum</i> category and in the string <tt>"-.+"</tt>.  A
  1097      * scheme always starts with an <i>alpha</i> character. <p>
  1098      *
  1099      * The scheme component of a URI cannot contain escaped octets, hence this
  1100      * method does not perform any decoding.
  1101      *
  1102      * @return  The scheme component of this URI,
  1103      *          or <tt>null</tt> if the scheme is undefined
  1104      */
  1105     public String getScheme() {
  1106         return scheme;
  1107     }
  1108 
  1109     /**
  1110      * Tells whether or not this URI is absolute.
  1111      *
  1112      * <p> A URI is absolute if, and only if, it has a scheme component. </p>
  1113      *
  1114      * @return  <tt>true</tt> if, and only if, this URI is absolute
  1115      */
  1116     public boolean isAbsolute() {
  1117         return scheme != null;
  1118     }
  1119 
  1120     /**
  1121      * Tells whether or not this URI is opaque.
  1122      *
  1123      * <p> A URI is opaque if, and only if, it is absolute and its
  1124      * scheme-specific part does not begin with a slash character ('/').
  1125      * An opaque URI has a scheme, a scheme-specific part, and possibly
  1126      * a fragment; all other components are undefined. </p>
  1127      *
  1128      * @return  <tt>true</tt> if, and only if, this URI is opaque
  1129      */
  1130     public boolean isOpaque() {
  1131         return path == null;
  1132     }
  1133 
  1134     /**
  1135      * Returns the raw scheme-specific part of this URI.  The scheme-specific
  1136      * part is never undefined, though it may be empty.
  1137      *
  1138      * <p> The scheme-specific part of a URI only contains legal URI
  1139      * characters. </p>
  1140      *
  1141      * @return  The raw scheme-specific part of this URI
  1142      *          (never <tt>null</tt>)
  1143      */
  1144     public String getRawSchemeSpecificPart() {
  1145         defineSchemeSpecificPart();
  1146         return schemeSpecificPart;
  1147     }
  1148 
  1149     /**
  1150      * Returns the decoded scheme-specific part of this URI.
  1151      *
  1152      * <p> The string returned by this method is equal to that returned by the
  1153      * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method
  1154      * except that all sequences of escaped octets are <a
  1155      * href="#decode">decoded</a>.  </p>
  1156      *
  1157      * @return  The decoded scheme-specific part of this URI
  1158      *          (never <tt>null</tt>)
  1159      */
  1160     public String getSchemeSpecificPart() {
  1161         if (decodedSchemeSpecificPart == null)
  1162             decodedSchemeSpecificPart = decode(getRawSchemeSpecificPart());
  1163         return decodedSchemeSpecificPart;
  1164     }
  1165 
  1166     /**
  1167      * Returns the raw authority component of this URI.
  1168      *
  1169      * <p> The authority component of a URI, if defined, only contains the
  1170      * commercial-at character (<tt>'@'</tt>) and characters in the
  1171      * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i>
  1172      * categories.  If the authority is server-based then it is further
  1173      * constrained to have valid user-information, host, and port
  1174      * components. </p>
  1175      *
  1176      * @return  The raw authority component of this URI,
  1177      *          or <tt>null</tt> if the authority is undefined
  1178      */
  1179     public String getRawAuthority() {
  1180         return authority;
  1181     }
  1182 
  1183     /**
  1184      * Returns the decoded authority component of this URI.
  1185      *
  1186      * <p> The string returned by this method is equal to that returned by the
  1187      * {@link #getRawAuthority() getRawAuthority} method except that all
  1188      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
  1189      *
  1190      * @return  The decoded authority component of this URI,
  1191      *          or <tt>null</tt> if the authority is undefined
  1192      */
  1193     public String getAuthority() {
  1194         if (decodedAuthority == null)
  1195             decodedAuthority = decode(authority);
  1196         return decodedAuthority;
  1197     }
  1198 
  1199     /**
  1200      * Returns the raw user-information component of this URI.
  1201      *
  1202      * <p> The user-information component of a URI, if defined, only contains
  1203      * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and
  1204      * <i>other</i> categories. </p>
  1205      *
  1206      * @return  The raw user-information component of this URI,
  1207      *          or <tt>null</tt> if the user information is undefined
  1208      */
  1209     public String getRawUserInfo() {
  1210         return userInfo;
  1211     }
  1212 
  1213     /**
  1214      * Returns the decoded user-information component of this URI.
  1215      *
  1216      * <p> The string returned by this method is equal to that returned by the
  1217      * {@link #getRawUserInfo() getRawUserInfo} method except that all
  1218      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
  1219      *
  1220      * @return  The decoded user-information component of this URI,
  1221      *          or <tt>null</tt> if the user information is undefined
  1222      */
  1223     public String getUserInfo() {
  1224         if ((decodedUserInfo == null) && (userInfo != null))
  1225             decodedUserInfo = decode(userInfo);
  1226         return decodedUserInfo;
  1227     }
  1228 
  1229     /**
  1230      * Returns the host component of this URI.
  1231      *
  1232      * <p> The host component of a URI, if defined, will have one of the
  1233      * following forms: </p>
  1234      *
  1235      * <ul type=disc>
  1236      *
  1237      *   <li><p> A domain name consisting of one or more <i>labels</i>
  1238      *   separated by period characters (<tt>'.'</tt>), optionally followed by
  1239      *   a period character.  Each label consists of <i>alphanum</i> characters
  1240      *   as well as hyphen characters (<tt>'-'</tt>), though hyphens never
  1241      *   occur as the first or last characters in a label. The rightmost
  1242      *   label of a domain name consisting of two or more labels, begins
  1243      *   with an <i>alpha</i> character. </p></li>
  1244      *
  1245      *   <li><p> A dotted-quad IPv4 address of the form
  1246      *   <i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+</tt>,
  1247      *   where no <i>digit</i> sequence is longer than three characters and no
  1248      *   sequence has a value larger than 255. </p></li>
  1249      *
  1250      *   <li><p> An IPv6 address enclosed in square brackets (<tt>'['</tt> and
  1251      *   <tt>']'</tt>) and consisting of hexadecimal digits, colon characters
  1252      *   (<tt>':'</tt>), and possibly an embedded IPv4 address.  The full
  1253      *   syntax of IPv6 addresses is specified in <a
  1254      *   href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6
  1255      *   Addressing Architecture</i></a>.  </p></li>
  1256      *
  1257      * </ul>
  1258      *
  1259      * The host component of a URI cannot contain escaped octets, hence this
  1260      * method does not perform any decoding.
  1261      *
  1262      * @return  The host component of this URI,
  1263      *          or <tt>null</tt> if the host is undefined
  1264      */
  1265     public String getHost() {
  1266         return host;
  1267     }
  1268 
  1269     /**
  1270      * Returns the port number of this URI.
  1271      *
  1272      * <p> The port component of a URI, if defined, is a non-negative
  1273      * integer. </p>
  1274      *
  1275      * @return  The port component of this URI,
  1276      *          or <tt>-1</tt> if the port is undefined
  1277      */
  1278     public int getPort() {
  1279         return port;
  1280     }
  1281 
  1282     /**
  1283      * Returns the raw path component of this URI.
  1284      *
  1285      * <p> The path component of a URI, if defined, only contains the slash
  1286      * character (<tt>'/'</tt>), the commercial-at character (<tt>'@'</tt>),
  1287      * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,
  1288      * and <i>other</i> categories. </p>
  1289      *
  1290      * @return  The path component of this URI,
  1291      *          or <tt>null</tt> if the path is undefined
  1292      */
  1293     public String getRawPath() {
  1294         return path;
  1295     }
  1296 
  1297     /**
  1298      * Returns the decoded path component of this URI.
  1299      *
  1300      * <p> The string returned by this method is equal to that returned by the
  1301      * {@link #getRawPath() getRawPath} method except that all sequences of
  1302      * escaped octets are <a href="#decode">decoded</a>.  </p>
  1303      *
  1304      * @return  The decoded path component of this URI,
  1305      *          or <tt>null</tt> if the path is undefined
  1306      */
  1307     public String getPath() {
  1308         if ((decodedPath == null) && (path != null))
  1309             decodedPath = decode(path);
  1310         return decodedPath;
  1311     }
  1312 
  1313     /**
  1314      * Returns the raw query component of this URI.
  1315      *
  1316      * <p> The query component of a URI, if defined, only contains legal URI
  1317      * characters. </p>
  1318      *
  1319      * @return  The raw query component of this URI,
  1320      *          or <tt>null</tt> if the query is undefined
  1321      */
  1322     public String getRawQuery() {
  1323         return query;
  1324     }
  1325 
  1326     /**
  1327      * Returns the decoded query component of this URI.
  1328      *
  1329      * <p> The string returned by this method is equal to that returned by the
  1330      * {@link #getRawQuery() getRawQuery} method except that all sequences of
  1331      * escaped octets are <a href="#decode">decoded</a>.  </p>
  1332      *
  1333      * @return  The decoded query component of this URI,
  1334      *          or <tt>null</tt> if the query is undefined
  1335      */
  1336     public String getQuery() {
  1337         if ((decodedQuery == null) && (query != null))
  1338             decodedQuery = decode(query);
  1339         return decodedQuery;
  1340     }
  1341 
  1342     /**
  1343      * Returns the raw fragment component of this URI.
  1344      *
  1345      * <p> The fragment component of a URI, if defined, only contains legal URI
  1346      * characters. </p>
  1347      *
  1348      * @return  The raw fragment component of this URI,
  1349      *          or <tt>null</tt> if the fragment is undefined
  1350      */
  1351     public String getRawFragment() {
  1352         return fragment;
  1353     }
  1354 
  1355     /**
  1356      * Returns the decoded fragment component of this URI.
  1357      *
  1358      * <p> The string returned by this method is equal to that returned by the
  1359      * {@link #getRawFragment() getRawFragment} method except that all
  1360      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
  1361      *
  1362      * @return  The decoded fragment component of this URI,
  1363      *          or <tt>null</tt> if the fragment is undefined
  1364      */
  1365     public String getFragment() {
  1366         if ((decodedFragment == null) && (fragment != null))
  1367             decodedFragment = decode(fragment);
  1368         return decodedFragment;
  1369     }
  1370 
  1371 
  1372     // -- Equality, comparison, hash code, toString, and serialization --
  1373 
  1374     /**
  1375      * Tests this URI for equality with another object.
  1376      *
  1377      * <p> If the given object is not a URI then this method immediately
  1378      * returns <tt>false</tt>.
  1379      *
  1380      * <p> For two URIs to be considered equal requires that either both are
  1381      * opaque or both are hierarchical.  Their schemes must either both be
  1382      * undefined or else be equal without regard to case. Their fragments
  1383      * must either both be undefined or else be equal.
  1384      *
  1385      * <p> For two opaque URIs to be considered equal, their scheme-specific
  1386      * parts must be equal.
  1387      *
  1388      * <p> For two hierarchical URIs to be considered equal, their paths must
  1389      * be equal and their queries must either both be undefined or else be
  1390      * equal.  Their authorities must either both be undefined, or both be
  1391      * registry-based, or both be server-based.  If their authorities are
  1392      * defined and are registry-based, then they must be equal.  If their
  1393      * authorities are defined and are server-based, then their hosts must be
  1394      * equal without regard to case, their port numbers must be equal, and
  1395      * their user-information components must be equal.
  1396      *
  1397      * <p> When testing the user-information, path, query, fragment, authority,
  1398      * or scheme-specific parts of two URIs for equality, the raw forms rather
  1399      * than the encoded forms of these components are compared and the
  1400      * hexadecimal digits of escaped octets are compared without regard to
  1401      * case.
  1402      *
  1403      * <p> This method satisfies the general contract of the {@link
  1404      * java.lang.Object#equals(Object) Object.equals} method. </p>
  1405      *
  1406      * @param   ob   The object to which this object is to be compared
  1407      *
  1408      * @return  <tt>true</tt> if, and only if, the given object is a URI that
  1409      *          is identical to this URI
  1410      */
  1411     public boolean equals(Object ob) {
  1412         if (ob == this)
  1413             return true;
  1414         if (!(ob instanceof URI))
  1415             return false;
  1416         URI that = (URI)ob;
  1417         if (this.isOpaque() != that.isOpaque()) return false;
  1418         if (!equalIgnoringCase(this.scheme, that.scheme)) return false;
  1419         if (!equal(this.fragment, that.fragment)) return false;
  1420 
  1421         // Opaque
  1422         if (this.isOpaque())
  1423             return equal(this.schemeSpecificPart, that.schemeSpecificPart);
  1424 
  1425         // Hierarchical
  1426         if (!equal(this.path, that.path)) return false;
  1427         if (!equal(this.query, that.query)) return false;
  1428 
  1429         // Authorities
  1430         if (this.authority == that.authority) return true;
  1431         if (this.host != null) {
  1432             // Server-based
  1433             if (!equal(this.userInfo, that.userInfo)) return false;
  1434             if (!equalIgnoringCase(this.host, that.host)) return false;
  1435             if (this.port != that.port) return false;
  1436         } else if (this.authority != null) {
  1437             // Registry-based
  1438             if (!equal(this.authority, that.authority)) return false;
  1439         } else if (this.authority != that.authority) {
  1440             return false;
  1441         }
  1442 
  1443         return true;
  1444     }
  1445 
  1446     /**
  1447      * Returns a hash-code value for this URI.  The hash code is based upon all
  1448      * of the URI's components, and satisfies the general contract of the
  1449      * {@link java.lang.Object#hashCode() Object.hashCode} method.
  1450      *
  1451      * @return  A hash-code value for this URI
  1452      */
  1453     public int hashCode() {
  1454         if (hash != 0)
  1455             return hash;
  1456         int h = hashIgnoringCase(0, scheme);
  1457         h = hash(h, fragment);
  1458         if (isOpaque()) {
  1459             h = hash(h, schemeSpecificPart);
  1460         } else {
  1461             h = hash(h, path);
  1462             h = hash(h, query);
  1463             if (host != null) {
  1464                 h = hash(h, userInfo);
  1465                 h = hashIgnoringCase(h, host);
  1466                 h += 1949 * port;
  1467             } else {
  1468                 h = hash(h, authority);
  1469             }
  1470         }
  1471         hash = h;
  1472         return h;
  1473     }
  1474 
  1475     /**
  1476      * Compares this URI to another object, which must be a URI.
  1477      *
  1478      * <p> When comparing corresponding components of two URIs, if one
  1479      * component is undefined but the other is defined then the first is
  1480      * considered to be less than the second.  Unless otherwise noted, string
  1481      * components are ordered according to their natural, case-sensitive
  1482      * ordering as defined by the {@link java.lang.String#compareTo(Object)
  1483      * String.compareTo} method.  String components that are subject to
  1484      * encoding are compared by comparing their raw forms rather than their
  1485      * encoded forms.
  1486      *
  1487      * <p> The ordering of URIs is defined as follows: </p>
  1488      *
  1489      * <ul type=disc>
  1490      *
  1491      *   <li><p> Two URIs with different schemes are ordered according to the
  1492      *   ordering of their schemes, without regard to case. </p></li>
  1493      *
  1494      *   <li><p> A hierarchical URI is considered to be less than an opaque URI
  1495      *   with an identical scheme. </p></li>
  1496      *
  1497      *   <li><p> Two opaque URIs with identical schemes are ordered according
  1498      *   to the ordering of their scheme-specific parts. </p></li>
  1499      *
  1500      *   <li><p> Two opaque URIs with identical schemes and scheme-specific
  1501      *   parts are ordered according to the ordering of their
  1502      *   fragments. </p></li>
  1503      *
  1504      *   <li><p> Two hierarchical URIs with identical schemes are ordered
  1505      *   according to the ordering of their authority components: </p>
  1506      *
  1507      *   <ul type=disc>
  1508      *
  1509      *     <li><p> If both authority components are server-based then the URIs
  1510      *     are ordered according to their user-information components; if these
  1511      *     components are identical then the URIs are ordered according to the
  1512      *     ordering of their hosts, without regard to case; if the hosts are
  1513      *     identical then the URIs are ordered according to the ordering of
  1514      *     their ports. </p></li>
  1515      *
  1516      *     <li><p> If one or both authority components are registry-based then
  1517      *     the URIs are ordered according to the ordering of their authority
  1518      *     components. </p></li>
  1519      *
  1520      *   </ul></li>
  1521      *
  1522      *   <li><p> Finally, two hierarchical URIs with identical schemes and
  1523      *   authority components are ordered according to the ordering of their
  1524      *   paths; if their paths are identical then they are ordered according to
  1525      *   the ordering of their queries; if the queries are identical then they
  1526      *   are ordered according to the order of their fragments. </p></li>
  1527      *
  1528      * </ul>
  1529      *
  1530      * <p> This method satisfies the general contract of the {@link
  1531      * java.lang.Comparable#compareTo(Object) Comparable.compareTo}
  1532      * method. </p>
  1533      *
  1534      * @param   that
  1535      *          The object to which this URI is to be compared
  1536      *
  1537      * @return  A negative integer, zero, or a positive integer as this URI is
  1538      *          less than, equal to, or greater than the given URI
  1539      *
  1540      * @throws  ClassCastException
  1541      *          If the given object is not a URI
  1542      */
  1543     public int compareTo(URI that) {
  1544         int c;
  1545 
  1546         if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0)
  1547             return c;
  1548 
  1549         if (this.isOpaque()) {
  1550             if (that.isOpaque()) {
  1551                 // Both opaque
  1552                 if ((c = compare(this.schemeSpecificPart,
  1553                                  that.schemeSpecificPart)) != 0)
  1554                     return c;
  1555                 return compare(this.fragment, that.fragment);
  1556             }
  1557             return +1;                  // Opaque > hierarchical
  1558         } else if (that.isOpaque()) {
  1559             return -1;                  // Hierarchical < opaque
  1560         }
  1561 
  1562         // Hierarchical
  1563         if ((this.host != null) && (that.host != null)) {
  1564             // Both server-based
  1565             if ((c = compare(this.userInfo, that.userInfo)) != 0)
  1566                 return c;
  1567             if ((c = compareIgnoringCase(this.host, that.host)) != 0)
  1568                 return c;
  1569             if ((c = this.port - that.port) != 0)
  1570                 return c;
  1571         } else {
  1572             // If one or both authorities are registry-based then we simply
  1573             // compare them in the usual, case-sensitive way.  If one is
  1574             // registry-based and one is server-based then the strings are
  1575             // guaranteed to be unequal, hence the comparison will never return
  1576             // zero and the compareTo and equals methods will remain
  1577             // consistent.
  1578             if ((c = compare(this.authority, that.authority)) != 0) return c;
  1579         }
  1580 
  1581         if ((c = compare(this.path, that.path)) != 0) return c;
  1582         if ((c = compare(this.query, that.query)) != 0) return c;
  1583         return compare(this.fragment, that.fragment);
  1584     }
  1585 
  1586     /**
  1587      * Returns the content of this URI as a string.
  1588      *
  1589      * <p> If this URI was created by invoking one of the constructors in this
  1590      * class then a string equivalent to the original input string, or to the
  1591      * string computed from the originally-given components, as appropriate, is
  1592      * returned.  Otherwise this URI was created by normalization, resolution,
  1593      * or relativization, and so a string is constructed from this URI's
  1594      * components according to the rules specified in <a
  1595      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
  1596      * section&nbsp;5.2, step&nbsp;7. </p>
  1597      *
  1598      * @return  The string form of this URI
  1599      */
  1600     public String toString() {
  1601         defineString();
  1602         return string;
  1603     }
  1604 
  1605     /**
  1606      * Returns the content of this URI as a US-ASCII string.
  1607      *
  1608      * <p> If this URI does not contain any characters in the <i>other</i>
  1609      * category then an invocation of this method will return the same value as
  1610      * an invocation of the {@link #toString() toString} method.  Otherwise
  1611      * this method works as if by invoking that method and then <a
  1612      * href="#encode">encoding</a> the result.  </p>
  1613      *
  1614      * @return  The string form of this URI, encoded as needed
  1615      *          so that it only contains characters in the US-ASCII
  1616      *          charset
  1617      */
  1618     public String toASCIIString() {
  1619         defineString();
  1620         return encode(string);
  1621     }
  1622 
  1623 
  1624     // -- Serialization support --
  1625 
  1626     /**
  1627      * Saves the content of this URI to the given serial stream.
  1628      *
  1629      * <p> The only serializable field of a URI instance is its <tt>string</tt>
  1630      * field.  That field is given a value, if it does not have one already,
  1631      * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()}
  1632      * method of the given object-output stream is invoked. </p>
  1633      *
  1634      * @param  os  The object-output stream to which this object
  1635      *             is to be written
  1636      */
  1637     private void writeObject(ObjectOutputStream os)
  1638         throws IOException
  1639     {
  1640         defineString();
  1641         os.defaultWriteObject();        // Writes the string field only
  1642     }
  1643 
  1644     /**
  1645      * Reconstitutes a URI from the given serial stream.
  1646      *
  1647      * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is
  1648      * invoked to read the value of the <tt>string</tt> field.  The result is
  1649      * then parsed in the usual way.
  1650      *
  1651      * @param  is  The object-input stream from which this object
  1652      *             is being read
  1653      */
  1654     private void readObject(ObjectInputStream is)
  1655         throws ClassNotFoundException, IOException
  1656     {
  1657         port = -1;                      // Argh
  1658         is.defaultReadObject();
  1659         try {
  1660             new Parser(string).parse(false);
  1661         } catch (URISyntaxException x) {
  1662             IOException y = new InvalidObjectException("Invalid URI");
  1663             y.initCause(x);
  1664             throw y;
  1665         }
  1666     }
  1667 
  1668 
  1669     // -- End of public methods --
  1670 
  1671 
  1672     // -- Utility methods for string-field comparison and hashing --
  1673 
  1674     // These methods return appropriate values for null string arguments,
  1675     // thereby simplifying the equals, hashCode, and compareTo methods.
  1676     //
  1677     // The case-ignoring methods should only be applied to strings whose
  1678     // characters are all known to be US-ASCII.  Because of this restriction,
  1679     // these methods are faster than the similar methods in the String class.
  1680 
  1681     // US-ASCII only
  1682     private static int toLower(char c) {
  1683         if ((c >= 'A') && (c <= 'Z'))
  1684             return c + ('a' - 'A');
  1685         return c;
  1686     }
  1687 
  1688     private static boolean equal(String s, String t) {
  1689         if (s == t) return true;
  1690         if ((s != null) && (t != null)) {
  1691             if (s.length() != t.length())
  1692                 return false;
  1693             if (s.indexOf('%') < 0)
  1694                 return s.equals(t);
  1695             int n = s.length();
  1696             for (int i = 0; i < n;) {
  1697                 char c = s.charAt(i);
  1698                 char d = t.charAt(i);
  1699                 if (c != '%') {
  1700                     if (c != d)
  1701                         return false;
  1702                     i++;
  1703                     continue;
  1704                 }
  1705                 i++;
  1706                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
  1707                     return false;
  1708                 i++;
  1709                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
  1710                     return false;
  1711                 i++;
  1712             }
  1713             return true;
  1714         }
  1715         return false;
  1716     }
  1717 
  1718     // US-ASCII only
  1719     private static boolean equalIgnoringCase(String s, String t) {
  1720         if (s == t) return true;
  1721         if ((s != null) && (t != null)) {
  1722             int n = s.length();
  1723             if (t.length() != n)
  1724                 return false;
  1725             for (int i = 0; i < n; i++) {
  1726                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
  1727                     return false;
  1728             }
  1729             return true;
  1730         }
  1731         return false;
  1732     }
  1733 
  1734     private static int hash(int hash, String s) {
  1735         if (s == null) return hash;
  1736         return hash * 127 + s.hashCode();
  1737     }
  1738 
  1739     // US-ASCII only
  1740     private static int hashIgnoringCase(int hash, String s) {
  1741         if (s == null) return hash;
  1742         int h = hash;
  1743         int n = s.length();
  1744         for (int i = 0; i < n; i++)
  1745             h = 31 * h + toLower(s.charAt(i));
  1746         return h;
  1747     }
  1748 
  1749     private static int compare(String s, String t) {
  1750         if (s == t) return 0;
  1751         if (s != null) {
  1752             if (t != null)
  1753                 return s.compareTo(t);
  1754             else
  1755                 return +1;
  1756         } else {
  1757             return -1;
  1758         }
  1759     }
  1760 
  1761     // US-ASCII only
  1762     private static int compareIgnoringCase(String s, String t) {
  1763         if (s == t) return 0;
  1764         if (s != null) {
  1765             if (t != null) {
  1766                 int sn = s.length();
  1767                 int tn = t.length();
  1768                 int n = sn < tn ? sn : tn;
  1769                 for (int i = 0; i < n; i++) {
  1770                     int c = toLower(s.charAt(i)) - toLower(t.charAt(i));
  1771                     if (c != 0)
  1772                         return c;
  1773                 }
  1774                 return sn - tn;
  1775             }
  1776             return +1;
  1777         } else {
  1778             return -1;
  1779         }
  1780     }
  1781 
  1782 
  1783     // -- String construction --
  1784 
  1785     // If a scheme is given then the path, if given, must be absolute
  1786     //
  1787     private static void checkPath(String s, String scheme, String path)
  1788         throws URISyntaxException
  1789     {
  1790         if (scheme != null) {
  1791             if ((path != null)
  1792                 && ((path.length() > 0) && (path.charAt(0) != '/')))
  1793                 throw new URISyntaxException(s,
  1794                                              "Relative path in absolute URI");
  1795         }
  1796     }
  1797 
  1798     private void appendAuthority(StringBuffer sb,
  1799                                  String authority,
  1800                                  String userInfo,
  1801                                  String host,
  1802                                  int port)
  1803     {
  1804         if (host != null) {
  1805             sb.append("//");
  1806             if (userInfo != null) {
  1807                 sb.append(quote(userInfo, L_USERINFO, H_USERINFO));
  1808                 sb.append('@');
  1809             }
  1810             boolean needBrackets = ((host.indexOf(':') >= 0)
  1811                                     && !host.startsWith("[")
  1812                                     && !host.endsWith("]"));
  1813             if (needBrackets) sb.append('[');
  1814             sb.append(host);
  1815             if (needBrackets) sb.append(']');
  1816             if (port != -1) {
  1817                 sb.append(':');
  1818                 sb.append(port);
  1819             }
  1820         } else if (authority != null) {
  1821             sb.append("//");
  1822             if (authority.startsWith("[")) {
  1823                 // authority should (but may not) contain an embedded IPv6 address
  1824                 int end = authority.indexOf("]");
  1825                 String doquote = authority, dontquote = "";
  1826                 if (end != -1 && authority.indexOf(":") != -1) {
  1827                     // the authority contains an IPv6 address
  1828                     if (end == authority.length()) {
  1829                         dontquote = authority;
  1830                         doquote = "";
  1831                     } else {
  1832                         dontquote = authority.substring(0 , end + 1);
  1833                         doquote = authority.substring(end + 1);
  1834                     }
  1835                 }
  1836                 sb.append(dontquote);
  1837                 sb.append(quote(doquote,
  1838                             L_REG_NAME | L_SERVER,
  1839                             H_REG_NAME | H_SERVER));
  1840             } else {
  1841                 sb.append(quote(authority,
  1842                             L_REG_NAME | L_SERVER,
  1843                             H_REG_NAME | H_SERVER));
  1844             }
  1845         }
  1846     }
  1847 
  1848     private void appendSchemeSpecificPart(StringBuffer sb,
  1849                                           String opaquePart,
  1850                                           String authority,
  1851                                           String userInfo,
  1852                                           String host,
  1853                                           int port,
  1854                                           String path,
  1855                                           String query)
  1856     {
  1857         if (opaquePart != null) {
  1858             /* check if SSP begins with an IPv6 address
  1859              * because we must not quote a literal IPv6 address
  1860              */
  1861             if (opaquePart.startsWith("//[")) {
  1862                 int end =  opaquePart.indexOf("]");
  1863                 if (end != -1 && opaquePart.indexOf(":")!=-1) {
  1864                     String doquote, dontquote;
  1865                     if (end == opaquePart.length()) {
  1866                         dontquote = opaquePart;
  1867                         doquote = "";
  1868                     } else {
  1869                         dontquote = opaquePart.substring(0,end+1);
  1870                         doquote = opaquePart.substring(end+1);
  1871                     }
  1872                     sb.append (dontquote);
  1873                     sb.append(quote(doquote, L_URIC, H_URIC));
  1874                 }
  1875             } else {
  1876                 sb.append(quote(opaquePart, L_URIC, H_URIC));
  1877             }
  1878         } else {
  1879             appendAuthority(sb, authority, userInfo, host, port);
  1880             if (path != null)
  1881                 sb.append(quote(path, L_PATH, H_PATH));
  1882             if (query != null) {
  1883                 sb.append('?');
  1884                 sb.append(quote(query, L_URIC, H_URIC));
  1885             }
  1886         }
  1887     }
  1888 
  1889     private void appendFragment(StringBuffer sb, String fragment) {
  1890         if (fragment != null) {
  1891             sb.append('#');
  1892             sb.append(quote(fragment, L_URIC, H_URIC));
  1893         }
  1894     }
  1895 
  1896     private String toString(String scheme,
  1897                             String opaquePart,
  1898                             String authority,
  1899                             String userInfo,
  1900                             String host,
  1901                             int port,
  1902                             String path,
  1903                             String query,
  1904                             String fragment)
  1905     {
  1906         StringBuffer sb = new StringBuffer();
  1907         if (scheme != null) {
  1908             sb.append(scheme);
  1909             sb.append(':');
  1910         }
  1911         appendSchemeSpecificPart(sb, opaquePart,
  1912                                  authority, userInfo, host, port,
  1913                                  path, query);
  1914         appendFragment(sb, fragment);
  1915         return sb.toString();
  1916     }
  1917 
  1918     private void defineSchemeSpecificPart() {
  1919         if (schemeSpecificPart != null) return;
  1920         StringBuffer sb = new StringBuffer();
  1921         appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(),
  1922                                  host, port, getPath(), getQuery());
  1923         if (sb.length() == 0) return;
  1924         schemeSpecificPart = sb.toString();
  1925     }
  1926 
  1927     private void defineString() {
  1928         if (string != null) return;
  1929 
  1930         StringBuffer sb = new StringBuffer();
  1931         if (scheme != null) {
  1932             sb.append(scheme);
  1933             sb.append(':');
  1934         }
  1935         if (isOpaque()) {
  1936             sb.append(schemeSpecificPart);
  1937         } else {
  1938             if (host != null) {
  1939                 sb.append("//");
  1940                 if (userInfo != null) {
  1941                     sb.append(userInfo);
  1942                     sb.append('@');
  1943                 }
  1944                 boolean needBrackets = ((host.indexOf(':') >= 0)
  1945                                     && !host.startsWith("[")
  1946                                     && !host.endsWith("]"));
  1947                 if (needBrackets) sb.append('[');
  1948                 sb.append(host);
  1949                 if (needBrackets) sb.append(']');
  1950                 if (port != -1) {
  1951                     sb.append(':');
  1952                     sb.append(port);
  1953                 }
  1954             } else if (authority != null) {
  1955                 sb.append("//");
  1956                 sb.append(authority);
  1957             }
  1958             if (path != null)
  1959                 sb.append(path);
  1960             if (query != null) {
  1961                 sb.append('?');
  1962                 sb.append(query);
  1963             }
  1964         }
  1965         if (fragment != null) {
  1966             sb.append('#');
  1967             sb.append(fragment);
  1968         }
  1969         string = sb.toString();
  1970     }
  1971 
  1972 
  1973     // -- Normalization, resolution, and relativization --
  1974 
  1975     // RFC2396 5.2 (6)
  1976     private static String resolvePath(String base, String child,
  1977                                       boolean absolute)
  1978     {
  1979         int i = base.lastIndexOf('/');
  1980         int cn = child.length();
  1981         String path = "";
  1982 
  1983         if (cn == 0) {
  1984             // 5.2 (6a)
  1985             if (i >= 0)
  1986                 path = base.substring(0, i + 1);
  1987         } else {
  1988             StringBuffer sb = new StringBuffer(base.length() + cn);
  1989             // 5.2 (6a)
  1990             if (i >= 0)
  1991                 sb.append(base.substring(0, i + 1));
  1992             // 5.2 (6b)
  1993             sb.append(child);
  1994             path = sb.toString();
  1995         }
  1996 
  1997         // 5.2 (6c-f)
  1998         String np = normalize(path);
  1999 
  2000         // 5.2 (6g): If the result is absolute but the path begins with "../",
  2001         // then we simply leave the path as-is
  2002 
  2003         return np;
  2004     }
  2005 
  2006     // RFC2396 5.2
  2007     private static URI resolve(URI base, URI child) {
  2008         // check if child if opaque first so that NPE is thrown
  2009         // if child is null.
  2010         if (child.isOpaque() || base.isOpaque())
  2011             return child;
  2012 
  2013         // 5.2 (2): Reference to current document (lone fragment)
  2014         if ((child.scheme == null) && (child.authority == null)
  2015             && child.path.equals("") && (child.fragment != null)
  2016             && (child.query == null)) {
  2017             if ((base.fragment != null)
  2018                 && child.fragment.equals(base.fragment)) {
  2019                 return base;
  2020             }
  2021             URI ru = new URI();
  2022             ru.scheme = base.scheme;
  2023             ru.authority = base.authority;
  2024             ru.userInfo = base.userInfo;
  2025             ru.host = base.host;
  2026             ru.port = base.port;
  2027             ru.path = base.path;
  2028             ru.fragment = child.fragment;
  2029             ru.query = base.query;
  2030             return ru;
  2031         }
  2032 
  2033         // 5.2 (3): Child is absolute
  2034         if (child.scheme != null)
  2035             return child;
  2036 
  2037         URI ru = new URI();             // Resolved URI
  2038         ru.scheme = base.scheme;
  2039         ru.query = child.query;
  2040         ru.fragment = child.fragment;
  2041 
  2042         // 5.2 (4): Authority
  2043         if (child.authority == null) {
  2044             ru.authority = base.authority;
  2045             ru.host = base.host;
  2046             ru.userInfo = base.userInfo;
  2047             ru.port = base.port;
  2048 
  2049             String cp = (child.path == null) ? "" : child.path;
  2050             if ((cp.length() > 0) && (cp.charAt(0) == '/')) {
  2051                 // 5.2 (5): Child path is absolute
  2052                 ru.path = child.path;
  2053             } else {
  2054                 // 5.2 (6): Resolve relative path
  2055                 ru.path = resolvePath(base.path, cp, base.isAbsolute());
  2056             }
  2057         } else {
  2058             ru.authority = child.authority;
  2059             ru.host = child.host;
  2060             ru.userInfo = child.userInfo;
  2061             ru.host = child.host;
  2062             ru.port = child.port;
  2063             ru.path = child.path;
  2064         }
  2065 
  2066         // 5.2 (7): Recombine (nothing to do here)
  2067         return ru;
  2068     }
  2069 
  2070     // If the given URI's path is normal then return the URI;
  2071     // o.w., return a new URI containing the normalized path.
  2072     //
  2073     private static URI normalize(URI u) {
  2074         if (u.isOpaque() || (u.path == null) || (u.path.length() == 0))
  2075             return u;
  2076 
  2077         String np = normalize(u.path);
  2078         if (np == u.path)
  2079             return u;
  2080 
  2081         URI v = new URI();
  2082         v.scheme = u.scheme;
  2083         v.fragment = u.fragment;
  2084         v.authority = u.authority;
  2085         v.userInfo = u.userInfo;
  2086         v.host = u.host;
  2087         v.port = u.port;
  2088         v.path = np;
  2089         v.query = u.query;
  2090         return v;
  2091     }
  2092 
  2093     // If both URIs are hierarchical, their scheme and authority components are
  2094     // identical, and the base path is a prefix of the child's path, then
  2095     // return a relative URI that, when resolved against the base, yields the
  2096     // child; otherwise, return the child.
  2097     //
  2098     private static URI relativize(URI base, URI child) {
  2099         // check if child if opaque first so that NPE is thrown
  2100         // if child is null.
  2101         if (child.isOpaque() || base.isOpaque())
  2102             return child;
  2103         if (!equalIgnoringCase(base.scheme, child.scheme)
  2104             || !equal(base.authority, child.authority))
  2105             return child;
  2106 
  2107         String bp = normalize(base.path);
  2108         String cp = normalize(child.path);
  2109         if (!bp.equals(cp)) {
  2110             if (!bp.endsWith("/"))
  2111                 bp = bp + "/";
  2112             if (!cp.startsWith(bp))
  2113                 return child;
  2114         }
  2115 
  2116         URI v = new URI();
  2117         v.path = cp.substring(bp.length());
  2118         v.query = child.query;
  2119         v.fragment = child.fragment;
  2120         return v;
  2121     }
  2122 
  2123 
  2124 
  2125     // -- Path normalization --
  2126 
  2127     // The following algorithm for path normalization avoids the creation of a
  2128     // string object for each segment, as well as the use of a string buffer to
  2129     // compute the final result, by using a single char array and editing it in
  2130     // place.  The array is first split into segments, replacing each slash
  2131     // with '\0' and creating a segment-index array, each element of which is
  2132     // the index of the first char in the corresponding segment.  We then walk
  2133     // through both arrays, removing ".", "..", and other segments as necessary
  2134     // by setting their entries in the index array to -1.  Finally, the two
  2135     // arrays are used to rejoin the segments and compute the final result.
  2136     //
  2137     // This code is based upon src/solaris/native/java/io/canonicalize_md.c
  2138 
  2139 
  2140     // Check the given path to see if it might need normalization.  A path
  2141     // might need normalization if it contains duplicate slashes, a "."
  2142     // segment, or a ".." segment.  Return -1 if no further normalization is
  2143     // possible, otherwise return the number of segments found.
  2144     //
  2145     // This method takes a string argument rather than a char array so that
  2146     // this test can be performed without invoking path.toCharArray().
  2147     //
  2148     static private int needsNormalization(String path) {
  2149         boolean normal = true;
  2150         int ns = 0;                     // Number of segments
  2151         int end = path.length() - 1;    // Index of last char in path
  2152         int p = 0;                      // Index of next char in path
  2153 
  2154         // Skip initial slashes
  2155         while (p <= end) {
  2156             if (path.charAt(p) != '/') break;
  2157             p++;
  2158         }
  2159         if (p > 1) normal = false;
  2160 
  2161         // Scan segments
  2162         while (p <= end) {
  2163 
  2164             // Looking at "." or ".." ?
  2165             if ((path.charAt(p) == '.')
  2166                 && ((p == end)
  2167                     || ((path.charAt(p + 1) == '/')
  2168                         || ((path.charAt(p + 1) == '.')
  2169                             && ((p + 1 == end)
  2170                                 || (path.charAt(p + 2) == '/')))))) {
  2171                 normal = false;
  2172             }
  2173             ns++;
  2174 
  2175             // Find beginning of next segment
  2176             while (p <= end) {
  2177                 if (path.charAt(p++) != '/')
  2178                     continue;
  2179 
  2180                 // Skip redundant slashes
  2181                 while (p <= end) {
  2182                     if (path.charAt(p) != '/') break;
  2183                     normal = false;
  2184                     p++;
  2185                 }
  2186 
  2187                 break;
  2188             }
  2189         }
  2190 
  2191         return normal ? -1 : ns;
  2192     }
  2193 
  2194 
  2195     // Split the given path into segments, replacing slashes with nulls and
  2196     // filling in the given segment-index array.
  2197     //
  2198     // Preconditions:
  2199     //   segs.length == Number of segments in path
  2200     //
  2201     // Postconditions:
  2202     //   All slashes in path replaced by '\0'
  2203     //   segs[i] == Index of first char in segment i (0 <= i < segs.length)
  2204     //
  2205     static private void split(char[] path, int[] segs) {
  2206         int end = path.length - 1;      // Index of last char in path
  2207         int p = 0;                      // Index of next char in path
  2208         int i = 0;                      // Index of current segment
  2209 
  2210         // Skip initial slashes
  2211         while (p <= end) {
  2212             if (path[p] != '/') break;
  2213             path[p] = '\0';
  2214             p++;
  2215         }
  2216 
  2217         while (p <= end) {
  2218 
  2219             // Note start of segment
  2220             segs[i++] = p++;
  2221 
  2222             // Find beginning of next segment
  2223             while (p <= end) {
  2224                 if (path[p++] != '/')
  2225                     continue;
  2226                 path[p - 1] = '\0';
  2227 
  2228                 // Skip redundant slashes
  2229                 while (p <= end) {
  2230                     if (path[p] != '/') break;
  2231                     path[p++] = '\0';
  2232                 }
  2233                 break;
  2234             }
  2235         }
  2236 
  2237         if (i != segs.length)
  2238             throw new InternalError();  // ASSERT
  2239     }
  2240 
  2241 
  2242     // Join the segments in the given path according to the given segment-index
  2243     // array, ignoring those segments whose index entries have been set to -1,
  2244     // and inserting slashes as needed.  Return the length of the resulting
  2245     // path.
  2246     //
  2247     // Preconditions:
  2248     //   segs[i] == -1 implies segment i is to be ignored
  2249     //   path computed by split, as above, with '\0' having replaced '/'
  2250     //
  2251     // Postconditions:
  2252     //   path[0] .. path[return value] == Resulting path
  2253     //
  2254     static private int join(char[] path, int[] segs) {
  2255         int ns = segs.length;           // Number of segments
  2256         int end = path.length - 1;      // Index of last char in path
  2257         int p = 0;                      // Index of next path char to write
  2258 
  2259         if (path[p] == '\0') {
  2260             // Restore initial slash for absolute paths
  2261             path[p++] = '/';
  2262         }
  2263 
  2264         for (int i = 0; i < ns; i++) {
  2265             int q = segs[i];            // Current segment
  2266             if (q == -1)
  2267                 // Ignore this segment
  2268                 continue;
  2269 
  2270             if (p == q) {
  2271                 // We're already at this segment, so just skip to its end
  2272                 while ((p <= end) && (path[p] != '\0'))
  2273                     p++;
  2274                 if (p <= end) {
  2275                     // Preserve trailing slash
  2276                     path[p++] = '/';
  2277                 }
  2278             } else if (p < q) {
  2279                 // Copy q down to p
  2280                 while ((q <= end) && (path[q] != '\0'))
  2281                     path[p++] = path[q++];
  2282                 if (q <= end) {
  2283                     // Preserve trailing slash
  2284                     path[p++] = '/';
  2285                 }
  2286             } else
  2287                 throw new InternalError(); // ASSERT false
  2288         }
  2289 
  2290         return p;
  2291     }
  2292 
  2293 
  2294     // Remove "." segments from the given path, and remove segment pairs
  2295     // consisting of a non-".." segment followed by a ".." segment.
  2296     //
  2297     private static void removeDots(char[] path, int[] segs) {
  2298         int ns = segs.length;
  2299         int end = path.length - 1;
  2300 
  2301         for (int i = 0; i < ns; i++) {
  2302             int dots = 0;               // Number of dots found (0, 1, or 2)
  2303 
  2304             // Find next occurrence of "." or ".."
  2305             do {
  2306                 int p = segs[i];
  2307                 if (path[p] == '.') {
  2308                     if (p == end) {
  2309                         dots = 1;
  2310                         break;
  2311                     } else if (path[p + 1] == '\0') {
  2312                         dots = 1;
  2313                         break;
  2314                     } else if ((path[p + 1] == '.')
  2315                                && ((p + 1 == end)
  2316                                    || (path[p + 2] == '\0'))) {
  2317                         dots = 2;
  2318                         break;
  2319                     }
  2320                 }
  2321                 i++;
  2322             } while (i < ns);
  2323             if ((i > ns) || (dots == 0))
  2324                 break;
  2325 
  2326             if (dots == 1) {
  2327                 // Remove this occurrence of "."
  2328                 segs[i] = -1;
  2329             } else {
  2330                 // If there is a preceding non-".." segment, remove both that
  2331                 // segment and this occurrence of ".."; otherwise, leave this
  2332                 // ".." segment as-is.
  2333                 int j;
  2334                 for (j = i - 1; j >= 0; j--) {
  2335                     if (segs[j] != -1) break;
  2336                 }
  2337                 if (j >= 0) {
  2338                     int q = segs[j];
  2339                     if (!((path[q] == '.')
  2340                           && (path[q + 1] == '.')
  2341                           && (path[q + 2] == '\0'))) {
  2342                         segs[i] = -1;
  2343                         segs[j] = -1;
  2344                     }
  2345                 }
  2346             }
  2347         }
  2348     }
  2349 
  2350 
  2351     // DEVIATION: If the normalized path is relative, and if the first
  2352     // segment could be parsed as a scheme name, then prepend a "." segment
  2353     //
  2354     private static void maybeAddLeadingDot(char[] path, int[] segs) {
  2355 
  2356         if (path[0] == '\0')
  2357             // The path is absolute
  2358             return;
  2359 
  2360         int ns = segs.length;
  2361         int f = 0;                      // Index of first segment
  2362         while (f < ns) {
  2363             if (segs[f] >= 0)
  2364                 break;
  2365             f++;
  2366         }
  2367         if ((f >= ns) || (f == 0))
  2368             // The path is empty, or else the original first segment survived,
  2369             // in which case we already know that no leading "." is needed
  2370             return;
  2371 
  2372         int p = segs[f];
  2373         while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
  2374         if (p >= path.length || path[p] == '\0')
  2375             // No colon in first segment, so no "." needed
  2376             return;
  2377 
  2378         // At this point we know that the first segment is unused,
  2379         // hence we can insert a "." segment at that position
  2380         path[0] = '.';
  2381         path[1] = '\0';
  2382         segs[0] = 0;
  2383     }
  2384 
  2385 
  2386     // Normalize the given path string.  A normal path string has no empty
  2387     // segments (i.e., occurrences of "//"), no segments equal to ".", and no
  2388     // segments equal to ".." that are preceded by a segment not equal to "..".
  2389     // In contrast to Unix-style pathname normalization, for URI paths we
  2390     // always retain trailing slashes.
  2391     //
  2392     private static String normalize(String ps) {
  2393 
  2394         // Does this path need normalization?
  2395         int ns = needsNormalization(ps);        // Number of segments
  2396         if (ns < 0)
  2397             // Nope -- just return it
  2398             return ps;
  2399 
  2400         char[] path = ps.toCharArray();         // Path in char-array form
  2401 
  2402         // Split path into segments
  2403         int[] segs = new int[ns];               // Segment-index array
  2404         split(path, segs);
  2405 
  2406         // Remove dots
  2407         removeDots(path, segs);
  2408 
  2409         // Prevent scheme-name confusion
  2410         maybeAddLeadingDot(path, segs);
  2411 
  2412         // Join the remaining segments and return the result
  2413         String s = new String(path, 0, join(path, segs));
  2414         if (s.equals(ps)) {
  2415             // string was already normalized
  2416             return ps;
  2417         }
  2418         return s;
  2419     }
  2420 
  2421 
  2422 
  2423     // -- Character classes for parsing --
  2424 
  2425     // RFC2396 precisely specifies which characters in the US-ASCII charset are
  2426     // permissible in the various components of a URI reference.  We here
  2427     // define a set of mask pairs to aid in enforcing these restrictions.  Each
  2428     // mask pair consists of two longs, a low mask and a high mask.  Taken
  2429     // together they represent a 128-bit mask, where bit i is set iff the
  2430     // character with value i is permitted.
  2431     //
  2432     // This approach is more efficient than sequentially searching arrays of
  2433     // permitted characters.  It could be made still more efficient by
  2434     // precompiling the mask information so that a character's presence in a
  2435     // given mask could be determined by a single table lookup.
  2436 
  2437     // Compute the low-order mask for the characters in the given string
  2438     private static long lowMask(String chars) {
  2439         int n = chars.length();
  2440         long m = 0;
  2441         for (int i = 0; i < n; i++) {
  2442             char c = chars.charAt(i);
  2443             if (c < 64)
  2444                 m |= (1L << c);
  2445         }
  2446         return m;
  2447     }
  2448 
  2449     // Compute the high-order mask for the characters in the given string
  2450     private static long highMask(String chars) {
  2451         int n = chars.length();
  2452         long m = 0;
  2453         for (int i = 0; i < n; i++) {
  2454             char c = chars.charAt(i);
  2455             if ((c >= 64) && (c < 128))
  2456                 m |= (1L << (c - 64));
  2457         }
  2458         return m;
  2459     }
  2460 
  2461     // Compute a low-order mask for the characters
  2462     // between first and last, inclusive
  2463     private static long lowMask(char first, char last) {
  2464         long m = 0;
  2465         int f = Math.max(Math.min(first, 63), 0);
  2466         int l = Math.max(Math.min(last, 63), 0);
  2467         for (int i = f; i <= l; i++)
  2468             m |= 1L << i;
  2469         return m;
  2470     }
  2471 
  2472     // Compute a high-order mask for the characters
  2473     // between first and last, inclusive
  2474     private static long highMask(char first, char last) {
  2475         long m = 0;
  2476         int f = Math.max(Math.min(first, 127), 64) - 64;
  2477         int l = Math.max(Math.min(last, 127), 64) - 64;
  2478         for (int i = f; i <= l; i++)
  2479             m |= 1L << i;
  2480         return m;
  2481     }
  2482 
  2483     // Tell whether the given character is permitted by the given mask pair
  2484     private static boolean match(char c, long lowMask, long highMask) {
  2485         if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches.
  2486             return false;
  2487         if (c < 64)
  2488             return ((1L << c) & lowMask) != 0;
  2489         if (c < 128)
  2490             return ((1L << (c - 64)) & highMask) != 0;
  2491         return false;
  2492     }
  2493 
  2494     // Character-class masks, in reverse order from RFC2396 because
  2495     // initializers for static fields cannot make forward references.
  2496 
  2497     // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
  2498     //            "8" | "9"
  2499     private static final long L_DIGIT = lowMask('0', '9');
  2500     private static final long H_DIGIT = 0L;
  2501 
  2502     // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
  2503     //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
  2504     //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
  2505     private static final long L_UPALPHA = 0L;
  2506     private static final long H_UPALPHA = highMask('A', 'Z');
  2507 
  2508     // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
  2509     //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
  2510     //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
  2511     private static final long L_LOWALPHA = 0L;
  2512     private static final long H_LOWALPHA = highMask('a', 'z');
  2513 
  2514     // alpha         = lowalpha | upalpha
  2515     private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
  2516     private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;
  2517 
  2518     // alphanum      = alpha | digit
  2519     private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
  2520     private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;
  2521 
  2522     // hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
  2523     //                         "a" | "b" | "c" | "d" | "e" | "f"
  2524     private static final long L_HEX = L_DIGIT;
  2525     private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');
  2526 
  2527     // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
  2528     //                 "(" | ")"
  2529     private static final long L_MARK = lowMask("-_.!~*'()");
  2530     private static final long H_MARK = highMask("-_.!~*'()");
  2531 
  2532     // unreserved    = alphanum | mark
  2533     private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
  2534     private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;
  2535 
  2536     // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
  2537     //                 "$" | "," | "[" | "]"
  2538     // Added per RFC2732: "[", "]"
  2539     private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");
  2540     private static final long H_RESERVED = highMask(";/?:@&=+$,[]");
  2541 
  2542     // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
  2543     // characters are allowed; this is handled by the scanEscape method below.
  2544     private static final long L_ESCAPED = 1L;
  2545     private static final long H_ESCAPED = 0L;
  2546 
  2547     // uric          = reserved | unreserved | escaped
  2548     private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
  2549     private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;
  2550 
  2551     // pchar         = unreserved | escaped |
  2552     //                 ":" | "@" | "&" | "=" | "+" | "$" | ","
  2553     private static final long L_PCHAR
  2554         = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");
  2555     private static final long H_PCHAR
  2556         = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");
  2557 
  2558     // All valid path characters
  2559     private static final long L_PATH = L_PCHAR | lowMask(";/");
  2560     private static final long H_PATH = H_PCHAR | highMask(";/");
  2561 
  2562     // Dash, for use in domainlabel and toplabel
  2563     private static final long L_DASH = lowMask("-");
  2564     private static final long H_DASH = highMask("-");
  2565 
  2566     // Dot, for use in hostnames
  2567     private static final long L_DOT = lowMask(".");
  2568     private static final long H_DOT = highMask(".");
  2569 
  2570     // userinfo      = *( unreserved | escaped |
  2571     //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
  2572     private static final long L_USERINFO
  2573         = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");
  2574     private static final long H_USERINFO
  2575         = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");
  2576 
  2577     // reg_name      = 1*( unreserved | escaped | "$" | "," |
  2578     //                     ";" | ":" | "@" | "&" | "=" | "+" )
  2579     private static final long L_REG_NAME
  2580         = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");
  2581     private static final long H_REG_NAME
  2582         = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");
  2583 
  2584     // All valid characters for server-based authorities
  2585     private static final long L_SERVER
  2586         = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");
  2587     private static final long H_SERVER
  2588         = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");
  2589 
  2590     // Special case of server authority that represents an IPv6 address
  2591     // In this case, a % does not signify an escape sequence
  2592     private static final long L_SERVER_PERCENT
  2593         = L_SERVER | lowMask("%");
  2594     private static final long H_SERVER_PERCENT
  2595         = H_SERVER | highMask("%");
  2596     private static final long L_LEFT_BRACKET = lowMask("[");
  2597     private static final long H_LEFT_BRACKET = highMask("[");
  2598 
  2599     // scheme        = alpha *( alpha | digit | "+" | "-" | "." )
  2600     private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-.");
  2601     private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-.");
  2602 
  2603     // uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
  2604     //                 "&" | "=" | "+" | "$" | ","
  2605     private static final long L_URIC_NO_SLASH
  2606         = L_UNRESERVED | L_ESCAPED | lowMask(";?:@&=+$,");
  2607     private static final long H_URIC_NO_SLASH
  2608         = H_UNRESERVED | H_ESCAPED | highMask(";?:@&=+$,");
  2609 
  2610 
  2611     // -- Escaping and encoding --
  2612 
  2613     private final static char[] hexDigits = {
  2614         '0', '1', '2', '3', '4', '5', '6', '7',
  2615         '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
  2616     };
  2617 
  2618     private static void appendEscape(StringBuffer sb, byte b) {
  2619         sb.append('%');
  2620         sb.append(hexDigits[(b >> 4) & 0x0f]);
  2621         sb.append(hexDigits[(b >> 0) & 0x0f]);
  2622     }
  2623 
  2624     private static void appendEncoded(StringBuffer sb, char c) {
  2625         /*
  2626         ByteBuffer bb = null;
  2627         try {
  2628             bb = ThreadLocalCoders.encoderFor("UTF-8")
  2629                 .encode(CharBuffer.wrap("" + c));
  2630         } catch (CharacterCodingException x) {
  2631             assert false;
  2632         }
  2633         while (bb.hasRemaining()) {
  2634             int b = bb.get() & 0xff;
  2635             if (b >= 0x80)
  2636                 appendEscape(sb, (byte)b);
  2637             else
  2638                 sb.append((char)b);
  2639         }
  2640         */
  2641     }
  2642 
  2643     // Quote any characters in s that are not permitted
  2644     // by the given mask pair
  2645     //
  2646     private static String quote(String s, long lowMask, long highMask) {
  2647         int n = s.length();
  2648         StringBuffer sb = null;
  2649         boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);
  2650         for (int i = 0; i < s.length(); i++) {
  2651             char c = s.charAt(i);
  2652             if (c < '\u0080') {
  2653                 if (!match(c, lowMask, highMask)) {
  2654                     if (sb == null) {
  2655                         sb = new StringBuffer();
  2656                         sb.append(s.substring(0, i));
  2657                     }
  2658                     appendEscape(sb, (byte)c);
  2659                 } else {
  2660                     if (sb != null)
  2661                         sb.append(c);
  2662                 }
  2663             } else if (allowNonASCII
  2664                        && (Character.isSpaceChar(c)
  2665                            || Character.isISOControl(c))) {
  2666                 if (sb == null) {
  2667                     sb = new StringBuffer();
  2668                     sb.append(s.substring(0, i));
  2669                 }
  2670                 appendEncoded(sb, c);
  2671             } else {
  2672                 if (sb != null)
  2673                     sb.append(c);
  2674             }
  2675         }
  2676         return (sb == null) ? s : sb.toString();
  2677     }
  2678 
  2679     // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets,
  2680     // assuming that s is otherwise legal
  2681     //
  2682     private static String encode(String s) {
  2683         int n = s.length();
  2684         if (n == 0)
  2685             return s;
  2686 
  2687         // First check whether we actually need to encode
  2688         for (int i = 0;;) {
  2689             if (s.charAt(i) >= '\u0080')
  2690                 break;
  2691             if (++i >= n)
  2692                 return s;
  2693         }
  2694 /*
  2695         String ns = Normalizer.normalize(s, Normalizer.Form.NFC);
  2696         ByteBuffer bb = null;
  2697         try {
  2698             bb = ThreadLocalCoders.encoderFor("UTF-8")
  2699                 .encode(CharBuffer.wrap(ns));
  2700         } catch (CharacterCodingException x) {
  2701             assert false;
  2702         }
  2703 */
  2704         StringBuffer sb = new StringBuffer();
  2705         /*
  2706         while (bb.hasRemaining()) {
  2707             int b = bb.get() & 0xff;
  2708             if (b >= 0x80)
  2709                 appendEscape(sb, (byte)b);
  2710             else
  2711                 sb.append((char)b);
  2712         }
  2713         */
  2714         return sb.toString();
  2715     }
  2716 
  2717     private static int decode(char c) {
  2718         if ((c >= '0') && (c <= '9'))
  2719             return c - '0';
  2720         if ((c >= 'a') && (c <= 'f'))
  2721             return c - 'a' + 10;
  2722         if ((c >= 'A') && (c <= 'F'))
  2723             return c - 'A' + 10;
  2724         assert false;
  2725         return -1;
  2726     }
  2727 
  2728     private static byte decode(char c1, char c2) {
  2729         return (byte)(  ((decode(c1) & 0xf) << 4)
  2730                       | ((decode(c2) & 0xf) << 0));
  2731     }
  2732 
  2733     // Evaluates all escapes in s, applying UTF-8 decoding if needed.  Assumes
  2734     // that escapes are well-formed syntactically, i.e., of the form %XX.  If a
  2735     // sequence of escaped octets is not valid UTF-8 then the erroneous octets
  2736     // are replaced with '\uFFFD'.
  2737     // Exception: any "%" found between "[]" is left alone. It is an IPv6 literal
  2738     //            with a scope_id
  2739     //
  2740     private static String decode(String s) {
  2741         if (s == null)
  2742             return s;
  2743         int n = s.length();
  2744         if (n == 0)
  2745             return s;
  2746         if (s.indexOf('%') < 0)
  2747             return s;
  2748 
  2749         StringBuffer sb = new StringBuffer(n);
  2750         /*
  2751         ByteBuffer bb = ByteBuffer.allocate(n);
  2752         CharBuffer cb = CharBuffer.allocate(n);
  2753         CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8")
  2754             .onMalformedInput(CodingErrorAction.REPLACE)
  2755             .onUnmappableCharacter(CodingErrorAction.REPLACE);
  2756 
  2757         // This is not horribly efficient, but it will do for now
  2758         char c = s.charAt(0);
  2759         boolean betweenBrackets = false;
  2760 
  2761         for (int i = 0; i < n;) {
  2762             assert c == s.charAt(i);    // Loop invariant
  2763             if (c == '[') {
  2764                 betweenBrackets = true;
  2765             } else if (betweenBrackets && c == ']') {
  2766                 betweenBrackets = false;
  2767             }
  2768             if (c != '%' || betweenBrackets) {
  2769                 sb.append(c);
  2770                 if (++i >= n)
  2771                     break;
  2772                 c = s.charAt(i);
  2773                 continue;
  2774             }
  2775             bb.clear();
  2776             int ui = i;
  2777             for (;;) {
  2778                 assert (n - i >= 2);
  2779                 bb.put(decode(s.charAt(++i), s.charAt(++i)));
  2780                 if (++i >= n)
  2781                     break;
  2782                 c = s.charAt(i);
  2783                 if (c != '%')
  2784                     break;
  2785             }
  2786             bb.flip();
  2787             cb.clear();
  2788             dec.reset();
  2789             CoderResult cr = dec.decode(bb, cb, true);
  2790             assert cr.isUnderflow();
  2791             cr = dec.flush(cb);
  2792             assert cr.isUnderflow();
  2793             sb.append(cb.flip().toString());
  2794         }
  2795 */
  2796         return sb.toString();
  2797     }
  2798 
  2799 
  2800     // -- Parsing --
  2801 
  2802     // For convenience we wrap the input URI string in a new instance of the
  2803     // following internal class.  This saves always having to pass the input
  2804     // string as an argument to each internal scan/parse method.
  2805 
  2806     private class Parser {
  2807 
  2808         private String input;           // URI input string
  2809         private boolean requireServerAuthority = false;
  2810 
  2811         Parser(String s) {
  2812             input = s;
  2813             string = s;
  2814         }
  2815 
  2816         // -- Methods for throwing URISyntaxException in various ways --
  2817 
  2818         private void fail(String reason) throws URISyntaxException {
  2819             throw new URISyntaxException(input, reason);
  2820         }
  2821 
  2822         private void fail(String reason, int p) throws URISyntaxException {
  2823             throw new URISyntaxException(input, reason, p);
  2824         }
  2825 
  2826         private void failExpecting(String expected, int p)
  2827             throws URISyntaxException
  2828         {
  2829             fail("Expected " + expected, p);
  2830         }
  2831 
  2832         private void failExpecting(String expected, String prior, int p)
  2833             throws URISyntaxException
  2834         {
  2835             fail("Expected " + expected + " following " + prior, p);
  2836         }
  2837 
  2838 
  2839         // -- Simple access to the input string --
  2840 
  2841         // Return a substring of the input string
  2842         //
  2843         private String substring(int start, int end) {
  2844             return input.substring(start, end);
  2845         }
  2846 
  2847         // Return the char at position p,
  2848         // assuming that p < input.length()
  2849         //
  2850         private char charAt(int p) {
  2851             return input.charAt(p);
  2852         }
  2853 
  2854         // Tells whether start < end and, if so, whether charAt(start) == c
  2855         //
  2856         private boolean at(int start, int end, char c) {
  2857             return (start < end) && (charAt(start) == c);
  2858         }
  2859 
  2860         // Tells whether start + s.length() < end and, if so,
  2861         // whether the chars at the start position match s exactly
  2862         //
  2863         private boolean at(int start, int end, String s) {
  2864             int p = start;
  2865             int sn = s.length();
  2866             if (sn > end - p)
  2867                 return false;
  2868             int i = 0;
  2869             while (i < sn) {
  2870                 if (charAt(p++) != s.charAt(i)) {
  2871                     break;
  2872                 }
  2873                 i++;
  2874             }
  2875             return (i == sn);
  2876         }
  2877 
  2878 
  2879         // -- Scanning --
  2880 
  2881         // The various scan and parse methods that follow use a uniform
  2882         // convention of taking the current start position and end index as
  2883         // their first two arguments.  The start is inclusive while the end is
  2884         // exclusive, just as in the String class, i.e., a start/end pair
  2885         // denotes the half-open interval [start, end) of the input string.
  2886         //
  2887         // These methods never proceed past the end position.  They may return
  2888         // -1 to indicate outright failure, but more often they simply return
  2889         // the position of the first char after the last char scanned.  Thus
  2890         // a typical idiom is
  2891         //
  2892         //     int p = start;
  2893         //     int q = scan(p, end, ...);
  2894         //     if (q > p)
  2895         //         // We scanned something
  2896         //         ...;
  2897         //     else if (q == p)
  2898         //         // We scanned nothing
  2899         //         ...;
  2900         //     else if (q == -1)
  2901         //         // Something went wrong
  2902         //         ...;
  2903 
  2904 
  2905         // Scan a specific char: If the char at the given start position is
  2906         // equal to c, return the index of the next char; otherwise, return the
  2907         // start position.
  2908         //
  2909         private int scan(int start, int end, char c) {
  2910             if ((start < end) && (charAt(start) == c))
  2911                 return start + 1;
  2912             return start;
  2913         }
  2914 
  2915         // Scan forward from the given start position.  Stop at the first char
  2916         // in the err string (in which case -1 is returned), or the first char
  2917         // in the stop string (in which case the index of the preceding char is
  2918         // returned), or the end of the input string (in which case the length
  2919         // of the input string is returned).  May return the start position if
  2920         // nothing matches.
  2921         //
  2922         private int scan(int start, int end, String err, String stop) {
  2923             int p = start;
  2924             while (p < end) {
  2925                 char c = charAt(p);
  2926                 if (err.indexOf(c) >= 0)
  2927                     return -1;
  2928                 if (stop.indexOf(c) >= 0)
  2929                     break;
  2930                 p++;
  2931             }
  2932             return p;
  2933         }
  2934 
  2935         // Scan a potential escape sequence, starting at the given position,
  2936         // with the given first char (i.e., charAt(start) == c).
  2937         //
  2938         // This method assumes that if escapes are allowed then visible
  2939         // non-US-ASCII chars are also allowed.
  2940         //
  2941         private int scanEscape(int start, int n, char first)
  2942             throws URISyntaxException
  2943         {
  2944             int p = start;
  2945             char c = first;
  2946             if (c == '%') {
  2947                 // Process escape pair
  2948                 if ((p + 3 <= n)
  2949                     && match(charAt(p + 1), L_HEX, H_HEX)
  2950                     && match(charAt(p + 2), L_HEX, H_HEX)) {
  2951                     return p + 3;
  2952                 }
  2953                 fail("Malformed escape pair", p);
  2954             } else if ((c > 128)
  2955                        && !Character.isSpaceChar(c)
  2956                        && !Character.isISOControl(c)) {
  2957                 // Allow unescaped but visible non-US-ASCII chars
  2958                 return p + 1;
  2959             }
  2960             return p;
  2961         }
  2962 
  2963         // Scan chars that match the given mask pair
  2964         //
  2965         private int scan(int start, int n, long lowMask, long highMask)
  2966             throws URISyntaxException
  2967         {
  2968             int p = start;
  2969             while (p < n) {
  2970                 char c = charAt(p);
  2971                 if (match(c, lowMask, highMask)) {
  2972                     p++;
  2973                     continue;
  2974                 }
  2975                 if ((lowMask & L_ESCAPED) != 0) {
  2976                     int q = scanEscape(p, n, c);
  2977                     if (q > p) {
  2978                         p = q;
  2979                         continue;
  2980                     }
  2981                 }
  2982                 break;
  2983             }
  2984             return p;
  2985         }
  2986 
  2987         // Check that each of the chars in [start, end) matches the given mask
  2988         //
  2989         private void checkChars(int start, int end,
  2990                                 long lowMask, long highMask,
  2991                                 String what)
  2992             throws URISyntaxException
  2993         {
  2994             int p = scan(start, end, lowMask, highMask);
  2995             if (p < end)
  2996                 fail("Illegal character in " + what, p);
  2997         }
  2998 
  2999         // Check that the char at position p matches the given mask
  3000         //
  3001         private void checkChar(int p,
  3002                                long lowMask, long highMask,
  3003                                String what)
  3004             throws URISyntaxException
  3005         {
  3006             checkChars(p, p + 1, lowMask, highMask, what);
  3007         }
  3008 
  3009 
  3010         // -- Parsing --
  3011 
  3012         // [<scheme>:]<scheme-specific-part>[#<fragment>]
  3013         //
  3014         void parse(boolean rsa) throws URISyntaxException {
  3015             requireServerAuthority = rsa;
  3016             int ssp;                    // Start of scheme-specific part
  3017             int n = input.length();
  3018             int p = scan(0, n, "/?#", ":");
  3019             if ((p >= 0) && at(p, n, ':')) {
  3020                 if (p == 0)
  3021                     failExpecting("scheme name", 0);
  3022                 checkChar(0, L_ALPHA, H_ALPHA, "scheme name");
  3023                 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");
  3024                 scheme = substring(0, p);
  3025                 p++;                    // Skip ':'
  3026                 ssp = p;
  3027                 if (at(p, n, '/')) {
  3028                     p = parseHierarchical(p, n);
  3029                 } else {
  3030                     int q = scan(p, n, "", "#");
  3031                     if (q <= p)
  3032                         failExpecting("scheme-specific part", p);
  3033                     checkChars(p, q, L_URIC, H_URIC, "opaque part");
  3034                     p = q;
  3035                 }
  3036             } else {
  3037                 ssp = 0;
  3038                 p = parseHierarchical(0, n);
  3039             }
  3040             schemeSpecificPart = substring(ssp, p);
  3041             if (at(p, n, '#')) {
  3042                 checkChars(p + 1, n, L_URIC, H_URIC, "fragment");
  3043                 fragment = substring(p + 1, n);
  3044                 p = n;
  3045             }
  3046             if (p < n)
  3047                 fail("end of URI", p);
  3048         }
  3049 
  3050         // [//authority]<path>[?<query>]
  3051         //
  3052         // DEVIATION from RFC2396: We allow an empty authority component as
  3053         // long as it's followed by a non-empty path, query component, or
  3054         // fragment component.  This is so that URIs such as "file:///foo/bar"
  3055         // will parse.  This seems to be the intent of RFC2396, though the
  3056         // grammar does not permit it.  If the authority is empty then the
  3057         // userInfo, host, and port components are undefined.
  3058         //
  3059         // DEVIATION from RFC2396: We allow empty relative paths.  This seems
  3060         // to be the intent of RFC2396, but the grammar does not permit it.
  3061         // The primary consequence of this deviation is that "#f" parses as a
  3062         // relative URI with an empty path.
  3063         //
  3064         private int parseHierarchical(int start, int n)
  3065             throws URISyntaxException
  3066         {
  3067             int p = start;
  3068             if (at(p, n, '/') && at(p + 1, n, '/')) {
  3069                 p += 2;
  3070                 int q = scan(p, n, "", "/?#");
  3071                 if (q > p) {
  3072                     p = parseAuthority(p, q);
  3073                 } else if (q < n) {
  3074                     // DEVIATION: Allow empty authority prior to non-empty
  3075                     // path, query component or fragment identifier
  3076                 } else
  3077                     failExpecting("authority", p);
  3078             }
  3079             int q = scan(p, n, "", "?#"); // DEVIATION: May be empty
  3080             checkChars(p, q, L_PATH, H_PATH, "path");
  3081             path = substring(p, q);
  3082             p = q;
  3083             if (at(p, n, '?')) {
  3084                 p++;
  3085                 q = scan(p, n, "", "#");
  3086                 checkChars(p, q, L_URIC, H_URIC, "query");
  3087                 query = substring(p, q);
  3088                 p = q;
  3089             }
  3090             return p;
  3091         }
  3092 
  3093         // authority     = server | reg_name
  3094         //
  3095         // Ambiguity: An authority that is a registry name rather than a server
  3096         // might have a prefix that parses as a server.  We use the fact that
  3097         // the authority component is always followed by '/' or the end of the
  3098         // input string to resolve this: If the complete authority did not
  3099         // parse as a server then we try to parse it as a registry name.
  3100         //
  3101         private int parseAuthority(int start, int n)
  3102             throws URISyntaxException
  3103         {
  3104             int p = start;
  3105             int q = p;
  3106             URISyntaxException ex = null;
  3107 
  3108             boolean serverChars;
  3109             boolean regChars;
  3110 
  3111             if (scan(p, n, "", "]") > p) {
  3112                 // contains a literal IPv6 address, therefore % is allowed
  3113                 serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
  3114             } else {
  3115                 serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
  3116             }
  3117             regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);
  3118 
  3119             if (regChars && !serverChars) {
  3120                 // Must be a registry-based authority
  3121                 authority = substring(p, n);
  3122                 return n;
  3123             }
  3124 
  3125             if (serverChars) {
  3126                 // Might be (probably is) a server-based authority, so attempt
  3127                 // to parse it as such.  If the attempt fails, try to treat it
  3128                 // as a registry-based authority.
  3129                 try {
  3130                     q = parseServer(p, n);
  3131                     if (q < n)
  3132                         failExpecting("end of authority", q);
  3133                     authority = substring(p, n);
  3134                 } catch (URISyntaxException x) {
  3135                     // Undo results of failed parse
  3136                     userInfo = null;
  3137                     host = null;
  3138                     port = -1;
  3139                     if (requireServerAuthority) {
  3140                         // If we're insisting upon a server-based authority,
  3141                         // then just re-throw the exception
  3142                         throw x;
  3143                     } else {
  3144                         // Save the exception in case it doesn't parse as a
  3145                         // registry either
  3146                         ex = x;
  3147                         q = p;
  3148                     }
  3149                 }
  3150             }
  3151 
  3152             if (q < n) {
  3153                 if (regChars) {
  3154                     // Registry-based authority
  3155                     authority = substring(p, n);
  3156                 } else if (ex != null) {
  3157                     // Re-throw exception; it was probably due to
  3158                     // a malformed IPv6 address
  3159                     throw ex;
  3160                 } else {
  3161                     fail("Illegal character in authority", q);
  3162                 }
  3163             }
  3164 
  3165             return n;
  3166         }
  3167 
  3168 
  3169         // [<userinfo>@]<host>[:<port>]
  3170         //
  3171         private int parseServer(int start, int n)
  3172             throws URISyntaxException
  3173         {
  3174             int p = start;
  3175             int q;
  3176 
  3177             // userinfo
  3178             q = scan(p, n, "/?#", "@");
  3179             if ((q >= p) && at(q, n, '@')) {
  3180                 checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
  3181                 userInfo = substring(p, q);
  3182                 p = q + 1;              // Skip '@'
  3183             }
  3184 
  3185             // hostname, IPv4 address, or IPv6 address
  3186             if (at(p, n, '[')) {
  3187                 // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
  3188                 p++;
  3189                 q = scan(p, n, "/?#", "]");
  3190                 if ((q > p) && at(q, n, ']')) {
  3191                     // look for a "%" scope id
  3192                     int r = scan (p, q, "", "%");
  3193                     if (r > p) {
  3194                         parseIPv6Reference(p, r);
  3195                         if (r+1 == q) {
  3196                             fail ("scope id expected");
  3197                         }
  3198                         checkChars (r+1, q, L_ALPHANUM, H_ALPHANUM,
  3199                                                 "scope id");
  3200                     } else {
  3201                         parseIPv6Reference(p, q);
  3202                     }
  3203                     host = substring(p-1, q+1);
  3204                     p = q + 1;
  3205                 } else {
  3206                     failExpecting("closing bracket for IPv6 address", q);
  3207                 }
  3208             } else {
  3209                 q = parseIPv4Address(p, n);
  3210                 if (q <= p)
  3211                     q = parseHostname(p, n);
  3212                 p = q;
  3213             }
  3214 
  3215             // port
  3216             if (at(p, n, ':')) {
  3217                 p++;
  3218                 q = scan(p, n, "", "/");
  3219                 if (q > p) {
  3220                     checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
  3221                     try {
  3222                         port = Integer.parseInt(substring(p, q));
  3223                     } catch (NumberFormatException x) {
  3224                         fail("Malformed port number", p);
  3225                     }
  3226                     p = q;
  3227                 }
  3228             }
  3229             if (p < n)
  3230                 failExpecting("port number", p);
  3231 
  3232             return p;
  3233         }
  3234 
  3235         // Scan a string of decimal digits whose value fits in a byte
  3236         //
  3237         private int scanByte(int start, int n)
  3238             throws URISyntaxException
  3239         {
  3240             int p = start;
  3241             int q = scan(p, n, L_DIGIT, H_DIGIT);
  3242             if (q <= p) return q;
  3243             if (Integer.parseInt(substring(p, q)) > 255) return p;
  3244             return q;
  3245         }
  3246 
  3247         // Scan an IPv4 address.
  3248         //
  3249         // If the strict argument is true then we require that the given
  3250         // interval contain nothing besides an IPv4 address; if it is false
  3251         // then we only require that it start with an IPv4 address.
  3252         //
  3253         // If the interval does not contain or start with (depending upon the
  3254         // strict argument) a legal IPv4 address characters then we return -1
  3255         // immediately; otherwise we insist that these characters parse as a
  3256         // legal IPv4 address and throw an exception on failure.
  3257         //
  3258         // We assume that any string of decimal digits and dots must be an IPv4
  3259         // address.  It won't parse as a hostname anyway, so making that
  3260         // assumption here allows more meaningful exceptions to be thrown.
  3261         //
  3262         private int scanIPv4Address(int start, int n, boolean strict)
  3263             throws URISyntaxException
  3264         {
  3265             int p = start;
  3266             int q;
  3267             int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
  3268             if ((m <= p) || (strict && (m != n)))
  3269                 return -1;
  3270             for (;;) {
  3271                 // Per RFC2732: At most three digits per byte
  3272                 // Further constraint: Each element fits in a byte
  3273                 if ((q = scanByte(p, m)) <= p) break;   p = q;
  3274                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
  3275                 if ((q = scanByte(p, m)) <= p) break;   p = q;
  3276                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
  3277                 if ((q = scanByte(p, m)) <= p) break;   p = q;
  3278                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
  3279                 if ((q = scanByte(p, m)) <= p) break;   p = q;
  3280                 if (q < m) break;
  3281                 return q;
  3282             }
  3283             fail("Malformed IPv4 address", q);
  3284             return -1;
  3285         }
  3286 
  3287         // Take an IPv4 address: Throw an exception if the given interval
  3288         // contains anything except an IPv4 address
  3289         //
  3290         private int takeIPv4Address(int start, int n, String expected)
  3291             throws URISyntaxException
  3292         {
  3293             int p = scanIPv4Address(start, n, true);
  3294             if (p <= start)
  3295                 failExpecting(expected, start);
  3296             return p;
  3297         }
  3298 
  3299         // Attempt to parse an IPv4 address, returning -1 on failure but
  3300         // allowing the given interval to contain [:<characters>] after
  3301         // the IPv4 address.
  3302         //
  3303         private int parseIPv4Address(int start, int n) {
  3304             int p;
  3305 
  3306             try {
  3307                 p = scanIPv4Address(start, n, false);
  3308             } catch (URISyntaxException x) {
  3309                 return -1;
  3310             } catch (NumberFormatException nfe) {
  3311                 return -1;
  3312             }
  3313 
  3314             if (p > start && p < n) {
  3315                 // IPv4 address is followed by something - check that
  3316                 // it's a ":" as this is the only valid character to
  3317                 // follow an address.
  3318                 if (charAt(p) != ':') {
  3319                     p = -1;
  3320                 }
  3321             }
  3322 
  3323             if (p > start)
  3324                 host = substring(start, p);
  3325 
  3326             return p;
  3327         }
  3328 
  3329         // hostname      = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
  3330         // domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
  3331         // toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
  3332         //
  3333         private int parseHostname(int start, int n)
  3334             throws URISyntaxException
  3335         {
  3336             int p = start;
  3337             int q;
  3338             int l = -1;                 // Start of last parsed label
  3339 
  3340             do {
  3341                 // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
  3342                 q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
  3343                 if (q <= p)
  3344                     break;
  3345                 l = p;
  3346                 if (q > p) {
  3347                     p = q;
  3348                     q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
  3349                     if (q > p) {
  3350                         if (charAt(q - 1) == '-')
  3351                             fail("Illegal character in hostname", q - 1);
  3352                         p = q;
  3353                     }
  3354                 }
  3355                 q = scan(p, n, '.');
  3356                 if (q <= p)
  3357                     break;
  3358                 p = q;
  3359             } while (p < n);
  3360 
  3361             if ((p < n) && !at(p, n, ':'))
  3362                 fail("Illegal character in hostname", p);
  3363 
  3364             if (l < 0)
  3365                 failExpecting("hostname", start);
  3366 
  3367             // for a fully qualified hostname check that the rightmost
  3368             // label starts with an alpha character.
  3369             if (l > start && !match(charAt(l), L_ALPHA, H_ALPHA)) {
  3370                 fail("Illegal character in hostname", l);
  3371             }
  3372 
  3373             host = substring(start, p);
  3374             return p;
  3375         }
  3376 
  3377 
  3378         // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
  3379         //
  3380         // Bug: The grammar in RFC2373 Appendix B does not allow addresses of
  3381         // the form ::12.34.56.78, which are clearly shown in the examples
  3382         // earlier in the document.  Here is the original grammar:
  3383         //
  3384         //   IPv6address = hexpart [ ":" IPv4address ]
  3385         //   hexpart     = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
  3386         //   hexseq      = hex4 *( ":" hex4)
  3387         //   hex4        = 1*4HEXDIG
  3388         //
  3389         // We therefore use the following revised grammar:
  3390         //
  3391         //   IPv6address = hexseq [ ":" IPv4address ]
  3392         //                 | hexseq [ "::" [ hexpost ] ]
  3393         //                 | "::" [ hexpost ]
  3394         //   hexpost     = hexseq | hexseq ":" IPv4address | IPv4address
  3395         //   hexseq      = hex4 *( ":" hex4)
  3396         //   hex4        = 1*4HEXDIG
  3397         //
  3398         // This covers all and only the following cases:
  3399         //
  3400         //   hexseq
  3401         //   hexseq : IPv4address
  3402         //   hexseq ::
  3403         //   hexseq :: hexseq
  3404         //   hexseq :: hexseq : IPv4address
  3405         //   hexseq :: IPv4address
  3406         //   :: hexseq
  3407         //   :: hexseq : IPv4address
  3408         //   :: IPv4address
  3409         //   ::
  3410         //
  3411         // Additionally we constrain the IPv6 address as follows :-
  3412         //
  3413         //  i.  IPv6 addresses without compressed zeros should contain
  3414         //      exactly 16 bytes.
  3415         //
  3416         //  ii. IPv6 addresses with compressed zeros should contain
  3417         //      less than 16 bytes.
  3418 
  3419         private int ipv6byteCount = 0;
  3420 
  3421         private int parseIPv6Reference(int start, int n)
  3422             throws URISyntaxException
  3423         {
  3424             int p = start;
  3425             int q;
  3426             boolean compressedZeros = false;
  3427 
  3428             q = scanHexSeq(p, n);
  3429 
  3430             if (q > p) {
  3431                 p = q;
  3432                 if (at(p, n, "::")) {
  3433                     compressedZeros = true;
  3434                     p = scanHexPost(p + 2, n);
  3435                 } else if (at(p, n, ':')) {
  3436                     p = takeIPv4Address(p + 1,  n, "IPv4 address");
  3437                     ipv6byteCount += 4;
  3438                 }
  3439             } else if (at(p, n, "::")) {
  3440                 compressedZeros = true;
  3441                 p = scanHexPost(p + 2, n);
  3442             }
  3443             if (p < n)
  3444                 fail("Malformed IPv6 address", start);
  3445             if (ipv6byteCount > 16)
  3446                 fail("IPv6 address too long", start);
  3447             if (!compressedZeros && ipv6byteCount < 16)
  3448                 fail("IPv6 address too short", start);
  3449             if (compressedZeros && ipv6byteCount == 16)
  3450                 fail("Malformed IPv6 address", start);
  3451 
  3452             return p;
  3453         }
  3454 
  3455         private int scanHexPost(int start, int n)
  3456             throws URISyntaxException
  3457         {
  3458             int p = start;
  3459             int q;
  3460 
  3461             if (p == n)
  3462                 return p;
  3463 
  3464             q = scanHexSeq(p, n);
  3465             if (q > p) {
  3466                 p = q;
  3467                 if (at(p, n, ':')) {
  3468                     p++;
  3469                     p = takeIPv4Address(p, n, "hex digits or IPv4 address");
  3470                     ipv6byteCount += 4;
  3471                 }
  3472             } else {
  3473                 p = takeIPv4Address(p, n, "hex digits or IPv4 address");
  3474                 ipv6byteCount += 4;
  3475             }
  3476             return p;
  3477         }
  3478 
  3479         // Scan a hex sequence; return -1 if one could not be scanned
  3480         //
  3481         private int scanHexSeq(int start, int n)
  3482             throws URISyntaxException
  3483         {
  3484             int p = start;
  3485             int q;
  3486 
  3487             q = scan(p, n, L_HEX, H_HEX);
  3488             if (q <= p)
  3489                 return -1;
  3490             if (at(q, n, '.'))          // Beginning of IPv4 address
  3491                 return -1;
  3492             if (q > p + 4)
  3493                 fail("IPv6 hexadecimal digit sequence too long", p);
  3494             ipv6byteCount += 2;
  3495             p = q;
  3496             while (p < n) {
  3497                 if (!at(p, n, ':'))
  3498                     break;
  3499                 if (at(p + 1, n, ':'))
  3500                     break;              // "::"
  3501                 p++;
  3502                 q = scan(p, n, L_HEX, H_HEX);
  3503                 if (q <= p)
  3504                     failExpecting("digits for an IPv6 address", p);
  3505                 if (at(q, n, '.')) {    // Beginning of IPv4 address
  3506                     p--;
  3507                     break;
  3508                 }
  3509                 if (q > p + 4)
  3510                     fail("IPv6 hexadecimal digit sequence too long", p);
  3511                 ipv6byteCount += 2;
  3512                 p = q;
  3513             }
  3514 
  3515             return p;
  3516         }
  3517 
  3518     }
  3519 
  3520 }