rt/emul/compact/src/main/java/java/util/regex/Pattern.java
author Jaroslav Tulach <jtulach@netbeans.org>
Mon, 07 Oct 2013 16:17:21 +0200
changeset 1350 f14e9730d4e9
parent 1348 bca65655b36b
permissions -rw-r--r--
ReExp builds on top of mini emul Character
     1 /*
     2  * Copyright (c) 1999, 2011, Oracle and/or its affiliates. All rights reserved.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4  *
     5  * This code is free software; you can redistribute it and/or modify it
     6  * under the terms of the GNU General Public License version 2 only, as
     7  * published by the Free Software Foundation.  Oracle designates this
     8  * particular file as subject to the "Classpath" exception as provided
     9  * by Oracle in the LICENSE file that accompanied this code.
    10  *
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    14  * version 2 for more details (a copy is included in the LICENSE file that
    15  * accompanied this code).
    16  *
    17  * You should have received a copy of the GNU General Public License version
    18  * 2 along with this work; if not, write to the Free Software Foundation,
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    20  *
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    22  * or visit www.oracle.com if you need additional information or have any
    23  * questions.
    24  */
    25 
    26 package java.util.regex;
    27 
    28 import java.util.Locale;
    29 import java.util.Map;
    30 import java.util.ArrayList;
    31 import java.util.HashMap;
    32 import java.util.Arrays;
    33 
    34 
    35 /**
    36  * A compiled representation of a regular expression.
    37  *
    38  * <p> A regular expression, specified as a string, must first be compiled into
    39  * an instance of this class.  The resulting pattern can then be used to create
    40  * a {@link Matcher} object that can match arbitrary {@link
    41  * java.lang.CharSequence </code>character sequences<code>} against the regular
    42  * expression.  All of the state involved in performing a match resides in the
    43  * matcher, so many matchers can share the same pattern.
    44  *
    45  * <p> A typical invocation sequence is thus
    46  *
    47  * <blockquote><pre>
    48  * Pattern p = Pattern.{@link #compile compile}("a*b");
    49  * Matcher m = p.{@link #matcher matcher}("aaaaab");
    50  * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote>
    51  *
    52  * <p> A {@link #matches matches} method is defined by this class as a
    53  * convenience for when a regular expression is used just once.  This method
    54  * compiles an expression and matches an input sequence against it in a single
    55  * invocation.  The statement
    56  *
    57  * <blockquote><pre>
    58  * boolean b = Pattern.matches("a*b", "aaaaab");</pre></blockquote>
    59  *
    60  * is equivalent to the three statements above, though for repeated matches it
    61  * is less efficient since it does not allow the compiled pattern to be reused.
    62  *
    63  * <p> Instances of this class are immutable and are safe for use by multiple
    64  * concurrent threads.  Instances of the {@link Matcher} class are not safe for
    65  * such use.
    66  *
    67  *
    68  * <a name="sum">
    69  * <h4> Summary of regular-expression constructs </h4>
    70  *
    71  * <table border="0" cellpadding="1" cellspacing="0"
    72  *  summary="Regular expression constructs, and what they match">
    73  *
    74  * <tr align="left">
    75  * <th bgcolor="#CCCCFF" align="left" id="construct">Construct</th>
    76  * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>
    77  * </tr>
    78  *
    79  * <tr><th>&nbsp;</th></tr>
    80  * <tr align="left"><th colspan="2" id="characters">Characters</th></tr>
    81  *
    82  * <tr><td valign="top" headers="construct characters"><i>x</i></td>
    83  *     <td headers="matches">The character <i>x</i></td></tr>
    84  * <tr><td valign="top" headers="construct characters"><tt>\\</tt></td>
    85  *     <td headers="matches">The backslash character</td></tr>
    86  * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>n</i></td>
    87  *     <td headers="matches">The character with octal value <tt>0</tt><i>n</i>
    88  *         (0&nbsp;<tt>&lt;=</tt>&nbsp;<i>n</i>&nbsp;<tt>&lt;=</tt>&nbsp;7)</td></tr>
    89  * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>nn</i></td>
    90  *     <td headers="matches">The character with octal value <tt>0</tt><i>nn</i>
    91  *         (0&nbsp;<tt>&lt;=</tt>&nbsp;<i>n</i>&nbsp;<tt>&lt;=</tt>&nbsp;7)</td></tr>
    92  * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>mnn</i></td>
    93  *     <td headers="matches">The character with octal value <tt>0</tt><i>mnn</i>
    94  *         (0&nbsp;<tt>&lt;=</tt>&nbsp;<i>m</i>&nbsp;<tt>&lt;=</tt>&nbsp;3,
    95  *         0&nbsp;<tt>&lt;=</tt>&nbsp;<i>n</i>&nbsp;<tt>&lt;=</tt>&nbsp;7)</td></tr>
    96  * <tr><td valign="top" headers="construct characters"><tt>\x</tt><i>hh</i></td>
    97  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>hh</i></td></tr>
    98  * <tr><td valign="top" headers="construct characters"><tt>&#92;u</tt><i>hhhh</i></td>
    99  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>hhhh</i></td></tr>
   100  * <tr><td valign="top" headers="construct characters"><tt>&#92;x</tt><i>{h...h}</i></td>
   101  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>h...h</i>
   102  *         ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
   103  *         &nbsp;&lt;=&nbsp;<tt>0x</tt><i>h...h</i>&nbsp;&lt;=&nbsp
   104  *          {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
   105  * <tr><td valign="top" headers="matches"><tt>\t</tt></td>
   106  *     <td headers="matches">The tab character (<tt>'&#92;u0009'</tt>)</td></tr>
   107  * <tr><td valign="top" headers="construct characters"><tt>\n</tt></td>
   108  *     <td headers="matches">The newline (line feed) character (<tt>'&#92;u000A'</tt>)</td></tr>
   109  * <tr><td valign="top" headers="construct characters"><tt>\r</tt></td>
   110  *     <td headers="matches">The carriage-return character (<tt>'&#92;u000D'</tt>)</td></tr>
   111  * <tr><td valign="top" headers="construct characters"><tt>\f</tt></td>
   112  *     <td headers="matches">The form-feed character (<tt>'&#92;u000C'</tt>)</td></tr>
   113  * <tr><td valign="top" headers="construct characters"><tt>\a</tt></td>
   114  *     <td headers="matches">The alert (bell) character (<tt>'&#92;u0007'</tt>)</td></tr>
   115  * <tr><td valign="top" headers="construct characters"><tt>\e</tt></td>
   116  *     <td headers="matches">The escape character (<tt>'&#92;u001B'</tt>)</td></tr>
   117  * <tr><td valign="top" headers="construct characters"><tt>\c</tt><i>x</i></td>
   118  *     <td headers="matches">The control character corresponding to <i>x</i></td></tr>
   119  *
   120  * <tr><th>&nbsp;</th></tr>
   121  * <tr align="left"><th colspan="2" id="classes">Character classes</th></tr>
   122  *
   123  * <tr><td valign="top" headers="construct classes"><tt>[abc]</tt></td>
   124  *     <td headers="matches"><tt>a</tt>, <tt>b</tt>, or <tt>c</tt> (simple class)</td></tr>
   125  * <tr><td valign="top" headers="construct classes"><tt>[^abc]</tt></td>
   126  *     <td headers="matches">Any character except <tt>a</tt>, <tt>b</tt>, or <tt>c</tt> (negation)</td></tr>
   127  * <tr><td valign="top" headers="construct classes"><tt>[a-zA-Z]</tt></td>
   128  *     <td headers="matches"><tt>a</tt> through <tt>z</tt>
   129  *         or <tt>A</tt> through <tt>Z</tt>, inclusive (range)</td></tr>
   130  * <tr><td valign="top" headers="construct classes"><tt>[a-d[m-p]]</tt></td>
   131  *     <td headers="matches"><tt>a</tt> through <tt>d</tt>,
   132  *      or <tt>m</tt> through <tt>p</tt>: <tt>[a-dm-p]</tt> (union)</td></tr>
   133  * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[def]]</tt></td>
   134  *     <td headers="matches"><tt>d</tt>, <tt>e</tt>, or <tt>f</tt> (intersection)</tr>
   135  * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[^bc]]</tt></td>
   136  *     <td headers="matches"><tt>a</tt> through <tt>z</tt>,
   137  *         except for <tt>b</tt> and <tt>c</tt>: <tt>[ad-z]</tt> (subtraction)</td></tr>
   138  * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[^m-p]]</tt></td>
   139  *     <td headers="matches"><tt>a</tt> through <tt>z</tt>,
   140  *          and not <tt>m</tt> through <tt>p</tt>: <tt>[a-lq-z]</tt>(subtraction)</td></tr>
   141  * <tr><th>&nbsp;</th></tr>
   142  *
   143  * <tr align="left"><th colspan="2" id="predef">Predefined character classes</th></tr>
   144  *
   145  * <tr><td valign="top" headers="construct predef"><tt>.</tt></td>
   146  *     <td headers="matches">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>
   147  * <tr><td valign="top" headers="construct predef"><tt>\d</tt></td>
   148  *     <td headers="matches">A digit: <tt>[0-9]</tt></td></tr>
   149  * <tr><td valign="top" headers="construct predef"><tt>\D</tt></td>
   150  *     <td headers="matches">A non-digit: <tt>[^0-9]</tt></td></tr>
   151  * <tr><td valign="top" headers="construct predef"><tt>\s</tt></td>
   152  *     <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
   153  * <tr><td valign="top" headers="construct predef"><tt>\S</tt></td>
   154  *     <td headers="matches">A non-whitespace character: <tt>[^\s]</tt></td></tr>
   155  * <tr><td valign="top" headers="construct predef"><tt>\w</tt></td>
   156  *     <td headers="matches">A word character: <tt>[a-zA-Z_0-9]</tt></td></tr>
   157  * <tr><td valign="top" headers="construct predef"><tt>\W</tt></td>
   158  *     <td headers="matches">A non-word character: <tt>[^\w]</tt></td></tr>
   159  *
   160  * <tr><th>&nbsp;</th></tr>
   161  * <tr align="left"><th colspan="2" id="posix">POSIX character classes</b> (US-ASCII only)<b></th></tr>
   162  *
   163  * <tr><td valign="top" headers="construct posix"><tt>\p{Lower}</tt></td>
   164  *     <td headers="matches">A lower-case alphabetic character: <tt>[a-z]</tt></td></tr>
   165  * <tr><td valign="top" headers="construct posix"><tt>\p{Upper}</tt></td>
   166  *     <td headers="matches">An upper-case alphabetic character:<tt>[A-Z]</tt></td></tr>
   167  * <tr><td valign="top" headers="construct posix"><tt>\p{ASCII}</tt></td>
   168  *     <td headers="matches">All ASCII:<tt>[\x00-\x7F]</tt></td></tr>
   169  * <tr><td valign="top" headers="construct posix"><tt>\p{Alpha}</tt></td>
   170  *     <td headers="matches">An alphabetic character:<tt>[\p{Lower}\p{Upper}]</tt></td></tr>
   171  * <tr><td valign="top" headers="construct posix"><tt>\p{Digit}</tt></td>
   172  *     <td headers="matches">A decimal digit: <tt>[0-9]</tt></td></tr>
   173  * <tr><td valign="top" headers="construct posix"><tt>\p{Alnum}</tt></td>
   174  *     <td headers="matches">An alphanumeric character:<tt>[\p{Alpha}\p{Digit}]</tt></td></tr>
   175  * <tr><td valign="top" headers="construct posix"><tt>\p{Punct}</tt></td>
   176  *     <td headers="matches">Punctuation: One of <tt>!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~</tt></td></tr>
   177  *     <!-- <tt>[\!"#\$%&'\(\)\*\+,\-\./:;\<=\>\?@\[\\\]\^_`\{\|\}~]</tt>
   178  *          <tt>[\X21-\X2F\X31-\X40\X5B-\X60\X7B-\X7E]</tt> -->
   179  * <tr><td valign="top" headers="construct posix"><tt>\p{Graph}</tt></td>
   180  *     <td headers="matches">A visible character: <tt>[\p{Alnum}\p{Punct}]</tt></td></tr>
   181  * <tr><td valign="top" headers="construct posix"><tt>\p{Print}</tt></td>
   182  *     <td headers="matches">A printable character: <tt>[\p{Graph}\x20]</tt></td></tr>
   183  * <tr><td valign="top" headers="construct posix"><tt>\p{Blank}</tt></td>
   184  *     <td headers="matches">A space or a tab: <tt>[ \t]</tt></td></tr>
   185  * <tr><td valign="top" headers="construct posix"><tt>\p{Cntrl}</tt></td>
   186  *     <td headers="matches">A control character: <tt>[\x00-\x1F\x7F]</tt></td></tr>
   187  * <tr><td valign="top" headers="construct posix"><tt>\p{XDigit}</tt></td>
   188  *     <td headers="matches">A hexadecimal digit: <tt>[0-9a-fA-F]</tt></td></tr>
   189  * <tr><td valign="top" headers="construct posix"><tt>\p{Space}</tt></td>
   190  *     <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
   191  *
   192  * <tr><th>&nbsp;</th></tr>
   193  * <tr align="left"><th colspan="2">java.lang.Character classes (simple <a href="#jcc">java character type</a>)</th></tr>
   194  *
   195  * <tr><td valign="top"><tt>\p{javaLowerCase}</tt></td>
   196  *     <td>Equivalent to java.lang.Character.isLowerCase()</td></tr>
   197  * <tr><td valign="top"><tt>\p{javaUpperCase}</tt></td>
   198  *     <td>Equivalent to java.lang.Character.isUpperCase()</td></tr>
   199  * <tr><td valign="top"><tt>\p{javaWhitespace}</tt></td>
   200  *     <td>Equivalent to java.lang.Character.isWhitespace()</td></tr>
   201  * <tr><td valign="top"><tt>\p{javaMirrored}</tt></td>
   202  *     <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
   203  *
   204  * <tr><th>&nbsp;</th></tr>
   205  * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr>
   206  * * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td>
   207  *     <td headers="matches">A Latin&nbsp;script character (<a href="#usc">script</a>)</td></tr>
   208  * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
   209  *     <td headers="matches">A character in the Greek&nbsp;block (<a href="#ubc">block</a>)</td></tr>
   210  * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
   211  *     <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr>
   212  * <tr><td valign="top" headers="construct unicode"><tt>\p{IsAlphabetic}</tt></td>
   213  *     <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>
   214  * <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
   215  *     <td headers="matches">A currency symbol</td></tr>
   216  * <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
   217  *     <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
   218  * <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&&[^\p{Lu}]]&nbsp;</tt></td>
   219  *     <td headers="matches">Any letter except an uppercase letter (subtraction)</td></tr>
   220  *
   221  * <tr><th>&nbsp;</th></tr>
   222  * <tr align="left"><th colspan="2" id="bounds">Boundary matchers</th></tr>
   223  *
   224  * <tr><td valign="top" headers="construct bounds"><tt>^</tt></td>
   225  *     <td headers="matches">The beginning of a line</td></tr>
   226  * <tr><td valign="top" headers="construct bounds"><tt>$</tt></td>
   227  *     <td headers="matches">The end of a line</td></tr>
   228  * <tr><td valign="top" headers="construct bounds"><tt>\b</tt></td>
   229  *     <td headers="matches">A word boundary</td></tr>
   230  * <tr><td valign="top" headers="construct bounds"><tt>\B</tt></td>
   231  *     <td headers="matches">A non-word boundary</td></tr>
   232  * <tr><td valign="top" headers="construct bounds"><tt>\A</tt></td>
   233  *     <td headers="matches">The beginning of the input</td></tr>
   234  * <tr><td valign="top" headers="construct bounds"><tt>\G</tt></td>
   235  *     <td headers="matches">The end of the previous match</td></tr>
   236  * <tr><td valign="top" headers="construct bounds"><tt>\Z</tt></td>
   237  *     <td headers="matches">The end of the input but for the final
   238  *         <a href="#lt">terminator</a>, if&nbsp;any</td></tr>
   239  * <tr><td valign="top" headers="construct bounds"><tt>\z</tt></td>
   240  *     <td headers="matches">The end of the input</td></tr>
   241  *
   242  * <tr><th>&nbsp;</th></tr>
   243  * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
   244  *
   245  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>?</tt></td>
   246  *     <td headers="matches"><i>X</i>, once or not at all</td></tr>
   247  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>*</tt></td>
   248  *     <td headers="matches"><i>X</i>, zero or more times</td></tr>
   249  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>+</tt></td>
   250  *     <td headers="matches"><i>X</i>, one or more times</td></tr>
   251  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>}</tt></td>
   252  *     <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
   253  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>,}</tt></td>
   254  *     <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
   255  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}</tt></td>
   256  *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
   257  *
   258  * <tr><th>&nbsp;</th></tr>
   259  * <tr align="left"><th colspan="2" id="reluc">Reluctant quantifiers</th></tr>
   260  *
   261  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>??</tt></td>
   262  *     <td headers="matches"><i>X</i>, once or not at all</td></tr>
   263  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>*?</tt></td>
   264  *     <td headers="matches"><i>X</i>, zero or more times</td></tr>
   265  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>+?</tt></td>
   266  *     <td headers="matches"><i>X</i>, one or more times</td></tr>
   267  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>}?</tt></td>
   268  *     <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
   269  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>,}?</tt></td>
   270  *     <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
   271  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}?</tt></td>
   272  *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
   273  *
   274  * <tr><th>&nbsp;</th></tr>
   275  * <tr align="left"><th colspan="2" id="poss">Possessive quantifiers</th></tr>
   276  *
   277  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>?+</tt></td>
   278  *     <td headers="matches"><i>X</i>, once or not at all</td></tr>
   279  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>*+</tt></td>
   280  *     <td headers="matches"><i>X</i>, zero or more times</td></tr>
   281  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>++</tt></td>
   282  *     <td headers="matches"><i>X</i>, one or more times</td></tr>
   283  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>}+</tt></td>
   284  *     <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
   285  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,}+</tt></td>
   286  *     <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
   287  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}+</tt></td>
   288  *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
   289  *
   290  * <tr><th>&nbsp;</th></tr>
   291  * <tr align="left"><th colspan="2" id="logical">Logical operators</th></tr>
   292  *
   293  * <tr><td valign="top" headers="construct logical"><i>XY</i></td>
   294  *     <td headers="matches"><i>X</i> followed by <i>Y</i></td></tr>
   295  * <tr><td valign="top" headers="construct logical"><i>X</i><tt>|</tt><i>Y</i></td>
   296  *     <td headers="matches">Either <i>X</i> or <i>Y</i></td></tr>
   297  * <tr><td valign="top" headers="construct logical"><tt>(</tt><i>X</i><tt>)</tt></td>
   298  *     <td headers="matches">X, as a <a href="#cg">capturing group</a></td></tr>
   299  *
   300  * <tr><th>&nbsp;</th></tr>
   301  * <tr align="left"><th colspan="2" id="backref">Back references</th></tr>
   302  *
   303  * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>n</i></td>
   304  *     <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup>
   305  *     <a href="#cg">capturing group</a> matched</td></tr>
   306  *
   307  * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>k</i>&lt;<i>name</i>&gt;</td>
   308  *     <td valign="bottom" headers="matches">Whatever the
   309  *     <a href="#groupname">named-capturing group</a> "name" matched</td></tr>
   310  *
   311  * <tr><th>&nbsp;</th></tr>
   312  * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
   313  *
   314  * <tr><td valign="top" headers="construct quot"><tt>\</tt></td>
   315  *     <td headers="matches">Nothing, but quotes the following character</td></tr>
   316  * <tr><td valign="top" headers="construct quot"><tt>\Q</tt></td>
   317  *     <td headers="matches">Nothing, but quotes all characters until <tt>\E</tt></td></tr>
   318  * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
   319  *     <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
   320  *     <!-- Metachars: !$()*+.<>?[\]^{|} -->
   321  *
   322  * <tr><th>&nbsp;</th></tr>
   323  * <tr align="left"><th colspan="2" id="special">Special constructs (named-capturing and non-capturing)</th></tr>
   324  *
   325  * <tr><td valign="top" headers="construct special"><tt>(?&lt;<a href="#groupname">name</a>&gt;</tt><i>X</i><tt>)</tt></td>
   326  *     <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
   327  * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
   328  *     <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
   329  * <tr><td valign="top" headers="construct special"><tt>(?idmsuxU-idmsuxU)&nbsp;</tt></td>
   330  *     <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
   331  * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
   332  * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a>
   333  * on - off</td></tr>
   334  * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt>&nbsp;&nbsp;</td>
   335  *     <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
   336  *         given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
   337  * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a >
   338  * <a href="#COMMENTS">x</a> on - off</td></tr>
   339  * <tr><td valign="top" headers="construct special"><tt>(?=</tt><i>X</i><tt>)</tt></td>
   340  *     <td headers="matches"><i>X</i>, via zero-width positive lookahead</td></tr>
   341  * <tr><td valign="top" headers="construct special"><tt>(?!</tt><i>X</i><tt>)</tt></td>
   342  *     <td headers="matches"><i>X</i>, via zero-width negative lookahead</td></tr>
   343  * <tr><td valign="top" headers="construct special"><tt>(?&lt;=</tt><i>X</i><tt>)</tt></td>
   344  *     <td headers="matches"><i>X</i>, via zero-width positive lookbehind</td></tr>
   345  * <tr><td valign="top" headers="construct special"><tt>(?&lt;!</tt><i>X</i><tt>)</tt></td>
   346  *     <td headers="matches"><i>X</i>, via zero-width negative lookbehind</td></tr>
   347  * <tr><td valign="top" headers="construct special"><tt>(?&gt;</tt><i>X</i><tt>)</tt></td>
   348  *     <td headers="matches"><i>X</i>, as an independent, non-capturing group</td></tr>
   349  *
   350  * </table>
   351  *
   352  * <hr>
   353  *
   354  *
   355  * <a name="bs">
   356  * <h4> Backslashes, escapes, and quoting </h4>
   357  *
   358  * <p> The backslash character (<tt>'\'</tt>) serves to introduce escaped
   359  * constructs, as defined in the table above, as well as to quote characters
   360  * that otherwise would be interpreted as unescaped constructs.  Thus the
   361  * expression <tt>\\</tt> matches a single backslash and <tt>\{</tt> matches a
   362  * left brace.
   363  *
   364  * <p> It is an error to use a backslash prior to any alphabetic character that
   365  * does not denote an escaped construct; these are reserved for future
   366  * extensions to the regular-expression language.  A backslash may be used
   367  * prior to a non-alphabetic character regardless of whether that character is
   368  * part of an unescaped construct.
   369  *
   370  * <p> Backslashes within string literals in Java source code are interpreted
   371  * as required by
   372  * <cite>The Java&trade; Language Specification</cite>
   373  * as either Unicode escapes (section 3.3) or other character escapes (section 3.10.6)
   374  * It is therefore necessary to double backslashes in string
   375  * literals that represent regular expressions to protect them from
   376  * interpretation by the Java bytecode compiler.  The string literal
   377  * <tt>"&#92;b"</tt>, for example, matches a single backspace character when
   378  * interpreted as a regular expression, while <tt>"&#92;&#92;b"</tt> matches a
   379  * word boundary.  The string literal <tt>"&#92;(hello&#92;)"</tt> is illegal
   380  * and leads to a compile-time error; in order to match the string
   381  * <tt>(hello)</tt> the string literal <tt>"&#92;&#92;(hello&#92;&#92;)"</tt>
   382  * must be used.
   383  *
   384  * <a name="cc">
   385  * <h4> Character Classes </h4>
   386  *
   387  *    <p> Character classes may appear within other character classes, and
   388  *    may be composed by the union operator (implicit) and the intersection
   389  *    operator (<tt>&amp;&amp;</tt>).
   390  *    The union operator denotes a class that contains every character that is
   391  *    in at least one of its operand classes.  The intersection operator
   392  *    denotes a class that contains every character that is in both of its
   393  *    operand classes.
   394  *
   395  *    <p> The precedence of character-class operators is as follows, from
   396  *    highest to lowest:
   397  *
   398  *    <blockquote><table border="0" cellpadding="1" cellspacing="0"
   399  *                 summary="Precedence of character class operators.">
   400  *      <tr><th>1&nbsp;&nbsp;&nbsp;&nbsp;</th>
   401  *        <td>Literal escape&nbsp;&nbsp;&nbsp;&nbsp;</td>
   402  *        <td><tt>\x</tt></td></tr>
   403  *     <tr><th>2&nbsp;&nbsp;&nbsp;&nbsp;</th>
   404  *        <td>Grouping</td>
   405  *        <td><tt>[...]</tt></td></tr>
   406  *     <tr><th>3&nbsp;&nbsp;&nbsp;&nbsp;</th>
   407  *        <td>Range</td>
   408  *        <td><tt>a-z</tt></td></tr>
   409  *      <tr><th>4&nbsp;&nbsp;&nbsp;&nbsp;</th>
   410  *        <td>Union</td>
   411  *        <td><tt>[a-e][i-u]</tt></td></tr>
   412  *      <tr><th>5&nbsp;&nbsp;&nbsp;&nbsp;</th>
   413  *        <td>Intersection</td>
   414  *        <td><tt>[a-z&&[aeiou]]</tt></td></tr>
   415  *    </table></blockquote>
   416  *
   417  *    <p> Note that a different set of metacharacters are in effect inside
   418  *    a character class than outside a character class. For instance, the
   419  *    regular expression <tt>.</tt> loses its special meaning inside a
   420  *    character class, while the expression <tt>-</tt> becomes a range
   421  *    forming metacharacter.
   422  *
   423  * <a name="lt">
   424  * <h4> Line terminators </h4>
   425  *
   426  * <p> A <i>line terminator</i> is a one- or two-character sequence that marks
   427  * the end of a line of the input character sequence.  The following are
   428  * recognized as line terminators:
   429  *
   430  * <ul>
   431  *
   432  *   <li> A newline (line feed) character&nbsp;(<tt>'\n'</tt>),
   433  *
   434  *   <li> A carriage-return character followed immediately by a newline
   435  *   character&nbsp;(<tt>"\r\n"</tt>),
   436  *
   437  *   <li> A standalone carriage-return character&nbsp;(<tt>'\r'</tt>),
   438  *
   439  *   <li> A next-line character&nbsp;(<tt>'&#92;u0085'</tt>),
   440  *
   441  *   <li> A line-separator character&nbsp;(<tt>'&#92;u2028'</tt>), or
   442  *
   443  *   <li> A paragraph-separator character&nbsp;(<tt>'&#92;u2029</tt>).
   444  *
   445  * </ul>
   446  * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators
   447  * recognized are newline characters.
   448  *
   449  * <p> The regular expression <tt>.</tt> matches any character except a line
   450  * terminator unless the {@link #DOTALL} flag is specified.
   451  *
   452  * <p> By default, the regular expressions <tt>^</tt> and <tt>$</tt> ignore
   453  * line terminators and only match at the beginning and the end, respectively,
   454  * of the entire input sequence. If {@link #MULTILINE} mode is activated then
   455  * <tt>^</tt> matches at the beginning of input and after any line terminator
   456  * except at the end of input. When in {@link #MULTILINE} mode <tt>$</tt>
   457  * matches just before a line terminator or the end of the input sequence.
   458  *
   459  * <a name="cg">
   460  * <h4> Groups and capturing </h4>
   461  *
   462  * <a name="gnumber">
   463  * <h5> Group number </h5>
   464  * <p> Capturing groups are numbered by counting their opening parentheses from
   465  * left to right.  In the expression <tt>((A)(B(C)))</tt>, for example, there
   466  * are four such groups: </p>
   467  *
   468  * <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
   469  * <tr><th>1&nbsp;&nbsp;&nbsp;&nbsp;</th>
   470  *     <td><tt>((A)(B(C)))</tt></td></tr>
   471  * <tr><th>2&nbsp;&nbsp;&nbsp;&nbsp;</th>
   472  *     <td><tt>(A)</tt></td></tr>
   473  * <tr><th>3&nbsp;&nbsp;&nbsp;&nbsp;</th>
   474  *     <td><tt>(B(C))</tt></td></tr>
   475  * <tr><th>4&nbsp;&nbsp;&nbsp;&nbsp;</th>
   476  *     <td><tt>(C)</tt></td></tr>
   477  * </table></blockquote>
   478  *
   479  * <p> Group zero always stands for the entire expression.
   480  *
   481  * <p> Capturing groups are so named because, during a match, each subsequence
   482  * of the input sequence that matches such a group is saved.  The captured
   483  * subsequence may be used later in the expression, via a back reference, and
   484  * may also be retrieved from the matcher once the match operation is complete.
   485  *
   486  * <a name="groupname">
   487  * <h5> Group name </h5>
   488  * <p>A capturing group can also be assigned a "name", a <tt>named-capturing group</tt>,
   489  * and then be back-referenced later by the "name". Group names are composed of
   490  * the following characters. The first character must be a <tt>letter</tt>.
   491  *
   492  * <ul>
   493  *   <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt>
   494  *        (<tt>'&#92;u0041'</tt>&nbsp;through&nbsp;<tt>'&#92;u005a'</tt>),
   495  *   <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt>
   496  *        (<tt>'&#92;u0061'</tt>&nbsp;through&nbsp;<tt>'&#92;u007a'</tt>),
   497  *   <li> The digits <tt>'0'</tt> through <tt>'9'</tt>
   498  *        (<tt>'&#92;u0030'</tt>&nbsp;through&nbsp;<tt>'&#92;u0039'</tt>),
   499  * </ul>
   500  *
   501  * <p> A <tt>named-capturing group</tt> is still numbered as described in
   502  * <a href="#gnumber">Group number</a>.
   503  *
   504  * <p> The captured input associated with a group is always the subsequence
   505  * that the group most recently matched.  If a group is evaluated a second time
   506  * because of quantification then its previously-captured value, if any, will
   507  * be retained if the second evaluation fails.  Matching the string
   508  * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
   509  * group two set to <tt>"b"</tt>.  All captured input is discarded at the
   510  * beginning of each match.
   511  *
   512  * <p> Groups beginning with <tt>(?</tt> are either pure, <i>non-capturing</i> groups
   513  * that do not capture text and do not count towards the group total, or
   514  * <i>named-capturing</i> group.
   515  *
   516  * <h4> Unicode support </h4>
   517  *
   518  * <p> This class is in conformance with Level 1 of <a
   519  * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
   520  * Standard #18: Unicode Regular Expression</i></a>, plus RL2.1
   521  * Canonical Equivalents.
   522  * <p>
   523  * <b>Unicode escape sequences</b> such as <tt>&#92;u2014</tt> in Java source code
   524  * are processed as described in section 3.3 of
   525  * <cite>The Java&trade; Language Specification</cite>.
   526  * Such escape sequences are also implemented directly by the regular-expression
   527  * parser so that Unicode escapes can be used in expressions that are read from
   528  * files or from the keyboard.  Thus the strings <tt>"&#92;u2014"</tt> and
   529  * <tt>"\\u2014"</tt>, while not equal, compile into the same pattern, which
   530  * matches the character with hexadecimal value <tt>0x2014</tt>.
   531  * <p>
   532  * A Unicode character can also be represented in a regular-expression by
   533  * using its <b>Hex notation</b>(hexadecimal code point value) directly as described in construct
   534  * <tt>&#92;x{...}</tt>, for example a supplementary character U+2011F
   535  * can be specified as <tt>&#92;x{2011F}</tt>, instead of two consecutive
   536  * Unicode escape sequences of the surrogate pair
   537  * <tt>&#92;uD840</tt><tt>&#92;uDD1F</tt>.
   538  * <p>
   539  * Unicode scripts, blocks, categories and binary properties are written with
   540  * the <tt>\p</tt> and <tt>\P</tt> constructs as in Perl.
   541  * <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
   542  * the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
   543  * does not match if the input has that property.
   544  * <p>
   545  * Scripts, blocks, categories and binary properties can be used both inside
   546  * and outside of a character class.
   547  * <a name="usc">
   548  * <p>
   549  * <b>Scripts</b> are specified either with the prefix {@code Is}, as in
   550  * {@code IsHiragana}, or by using  the {@code script} keyword (or its short
   551  * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}.
   552  * <p>
   553  * The script names supported by <code>Pattern</code> are the valid script names
   554  * accepted and defined by
   555  * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
   556  * <a name="ubc">
   557  * <p>
   558  * <b>Blocks</b> are specified with the prefix {@code In}, as in
   559  * {@code InMongolian}, or by using the keyword {@code block} (or its short
   560  * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
   561  * <p>
   562  * The block names supported by <code>Pattern</code> are the valid block names
   563  * accepted and defined by
   564  * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
   565  * <p>
   566  * <a name="ucc">
   567  * <b>Categories</b> may be specified with the optional prefix {@code Is}:
   568  * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
   569  * letters. Same as scripts and blocks, categories can also be specified
   570  * by using the keyword {@code general_category} (or its short form
   571  * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
   572  * <p>
   573  * The supported categories are those of
   574  * <a href="http://www.unicode.org/unicode/standard/standard.html">
   575  * <i>The Unicode Standard</i></a> in the version specified by the
   576  * {@link java.lang.Character Character} class. The category names are those
   577  * defined in the Standard, both normative and informative.
   578  * <p>
   579  * <a name="ubpc">
   580  * <b>Binary properties</b> are specified with the prefix {@code Is}, as in
   581  * {@code IsAlphabetic}. The supported binary properties by <code>Pattern</code>
   582  * are
   583  * <ul>
   584  *   <li> Alphabetic
   585  *   <li> Ideographic
   586  *   <li> Letter
   587  *   <li> Lowercase
   588  *   <li> Uppercase
   589  *   <li> Titlecase
   590  *   <li> Punctuation
   591  *   <Li> Control
   592  *   <li> White_Space
   593  *   <li> Digit
   594  *   <li> Hex_Digit
   595  *   <li> Noncharacter_Code_Point
   596  *   <li> Assigned
   597  * </ul>
   598 
   599 
   600  * <p>
   601  * <b>Predefined Character classes</b> and <b>POSIX character classes</b> are in
   602  * conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
   603  * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expression
   604  * </i></a>, when {@link #UNICODE_CHARACTER_CLASS} flag is specified.
   605  * <p>
   606  * <table border="0" cellpadding="1" cellspacing="0"
   607  *  summary="predefined and posix character classes in Unicode mode">
   608  * <tr align="left">
   609  * <th bgcolor="#CCCCFF" align="left" id="classes">Classes</th>
   610  * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>
   611  *</tr>
   612  * <tr><td><tt>\p{Lower}</tt></td>
   613  *     <td>A lowercase character:<tt>\p{IsLowercase}</tt></td></tr>
   614  * <tr><td><tt>\p{Upper}</tt></td>
   615  *     <td>An uppercase character:<tt>\p{IsUppercase}</tt></td></tr>
   616  * <tr><td><tt>\p{ASCII}</tt></td>
   617  *     <td>All ASCII:<tt>[\x00-\x7F]</tt></td></tr>
   618  * <tr><td><tt>\p{Alpha}</tt></td>
   619  *     <td>An alphabetic character:<tt>\p{IsAlphabetic}</tt></td></tr>
   620  * <tr><td><tt>\p{Digit}</tt></td>
   621  *     <td>A decimal digit character:<tt>p{IsDigit}</tt></td></tr>
   622  * <tr><td><tt>\p{Alnum}</tt></td>
   623  *     <td>An alphanumeric character:<tt>[\p{IsAlphabetic}\p{IsDigit}]</tt></td></tr>
   624  * <tr><td><tt>\p{Punct}</tt></td>
   625  *     <td>A punctuation character:<tt>p{IsPunctuation}</tt></td></tr>
   626  * <tr><td><tt>\p{Graph}</tt></td>
   627  *     <td>A visible character: <tt>[^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]</tt></td></tr>
   628  * <tr><td><tt>\p{Print}</tt></td>
   629  *     <td>A printable character: <tt>[\p{Graph}\p{Blank}&&[^\p{Cntrl}]]</tt></td></tr>
   630  * <tr><td><tt>\p{Blank}</tt></td>
   631  *     <td>A space or a tab: <tt>[\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]</tt></td></tr>
   632  * <tr><td><tt>\p{Cntrl}</tt></td>
   633  *     <td>A control character: <tt>\p{gc=Cc}</tt></td></tr>
   634  * <tr><td><tt>\p{XDigit}</tt></td>
   635  *     <td>A hexadecimal digit: <tt>[\p{gc=Nd}\p{IsHex_Digit}]</tt></td></tr>
   636  * <tr><td><tt>\p{Space}</tt></td>
   637  *     <td>A whitespace character:<tt>\p{IsWhite_Space}</tt></td></tr>
   638  * <tr><td><tt>\d</tt></td>
   639  *     <td>A digit: <tt>\p{IsDigit}</tt></td></tr>
   640  * <tr><td><tt>\D</tt></td>
   641  *     <td>A non-digit: <tt>[^\d]</tt></td></tr>
   642  * <tr><td><tt>\s</tt></td>
   643  *     <td>A whitespace character: <tt>\p{IsWhite_Space}</tt></td></tr>
   644  * <tr><td><tt>\S</tt></td>
   645  *     <td>A non-whitespace character: <tt>[^\s]</tt></td></tr>
   646  * <tr><td><tt>\w</tt></td>
   647  *     <td>A word character: <tt>[\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]</tt></td></tr>
   648  * <tr><td><tt>\W</tt></td>
   649  *     <td>A non-word character: <tt>[^\w]</tt></td></tr>
   650  * </table>
   651  * <p>
   652  * <a name="jcc">
   653  * Categories that behave like the java.lang.Character
   654  * boolean is<i>methodname</i> methods (except for the deprecated ones) are
   655  * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
   656  * the specified property has the name <tt>java<i>methodname</i></tt>.
   657  *
   658  * <h4> Comparison to Perl 5 </h4>
   659  *
   660  * <p>The <code>Pattern</code> engine performs traditional NFA-based matching
   661  * with ordered alternation as occurs in Perl 5.
   662  *
   663  * <p> Perl constructs not supported by this class: </p>
   664  *
   665  * <ul>
   666  *    <li><p> Predefined character classes (Unicode character)
   667  *    <p><tt>\h&nbsp;&nbsp;&nbsp;&nbsp;</tt>A horizontal whitespace
   668  *    <p><tt>\H&nbsp;&nbsp;&nbsp;&nbsp;</tt>A non horizontal whitespace
   669  *    <p><tt>\v&nbsp;&nbsp;&nbsp;&nbsp;</tt>A vertical whitespace
   670  *    <p><tt>\V&nbsp;&nbsp;&nbsp;&nbsp;</tt>A non vertical whitespace
   671  *    <p><tt>\R&nbsp;&nbsp;&nbsp;&nbsp;</tt>Any Unicode linebreak sequence
   672  *    <tt>\u005cu000D\u005cu000A|[\u005cu000A\u005cu000B\u005cu000C\u005cu000D\u005cu0085\u005cu2028\u005cu2029]</tt>
   673  *    <p><tt>\X&nbsp;&nbsp;&nbsp;&nbsp;</tt>Match Unicode
   674  *    <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">
   675  *    <i>extended grapheme cluster</i></a>
   676  *    </p></li>
   677  *
   678  *    <li><p> The backreference constructs, <tt>\g{</tt><i>n</i><tt>}</tt> for
   679  *    the <i>n</i><sup>th</sup><a href="#cg">capturing group</a> and
   680  *    <tt>\g{</tt><i>name</i><tt>}</tt> for
   681  *    <a href="#groupname">named-capturing group</a>.
   682  *    </p></li>
   683  *
   684  *    <li><p> The named character construct, <tt>\N{</tt><i>name</i><tt>}</tt>
   685  *    for a Unicode character by its name.
   686  *    </p></li>
   687  *
   688  *    <li><p> The conditional constructs
   689  *    <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>)</tt> and
   690  *    <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>|</tt><i>Y</i><tt>)</tt>,
   691  *    </p></li>
   692  *
   693  *    <li><p> The embedded code constructs <tt>(?{</tt><i>code</i><tt>})</tt>
   694  *    and <tt>(??{</tt><i>code</i><tt>})</tt>,</p></li>
   695  *
   696  *    <li><p> The embedded comment syntax <tt>(?#comment)</tt>, and </p></li>
   697  *
   698  *    <li><p> The preprocessing operations <tt>\l</tt> <tt>&#92;u</tt>,
   699  *    <tt>\L</tt>, and <tt>\U</tt>.  </p></li>
   700  *
   701  * </ul>
   702  *
   703  * <p> Constructs supported by this class but not by Perl: </p>
   704  *
   705  * <ul>
   706  *
   707  *    <li><p> Character-class union and intersection as described
   708  *    <a href="#cc">above</a>.</p></li>
   709  *
   710  * </ul>
   711  *
   712  * <p> Notable differences from Perl: </p>
   713  *
   714  * <ul>
   715  *
   716  *    <li><p> In Perl, <tt>\1</tt> through <tt>\9</tt> are always interpreted
   717  *    as back references; a backslash-escaped number greater than <tt>9</tt> is
   718  *    treated as a back reference if at least that many subexpressions exist,
   719  *    otherwise it is interpreted, if possible, as an octal escape.  In this
   720  *    class octal escapes must always begin with a zero. In this class,
   721  *    <tt>\1</tt> through <tt>\9</tt> are always interpreted as back
   722  *    references, and a larger number is accepted as a back reference if at
   723  *    least that many subexpressions exist at that point in the regular
   724  *    expression, otherwise the parser will drop digits until the number is
   725  *    smaller or equal to the existing number of groups or it is one digit.
   726  *    </p></li>
   727  *
   728  *    <li><p> Perl uses the <tt>g</tt> flag to request a match that resumes
   729  *    where the last match left off.  This functionality is provided implicitly
   730  *    by the {@link Matcher} class: Repeated invocations of the {@link
   731  *    Matcher#find find} method will resume where the last match left off,
   732  *    unless the matcher is reset.  </p></li>
   733  *
   734  *    <li><p> In Perl, embedded flags at the top level of an expression affect
   735  *    the whole expression.  In this class, embedded flags always take effect
   736  *    at the point at which they appear, whether they are at the top level or
   737  *    within a group; in the latter case, flags are restored at the end of the
   738  *    group just as in Perl.  </p></li>
   739  *
   740  * </ul>
   741  *
   742  *
   743  * <p> For a more precise description of the behavior of regular expression
   744  * constructs, please see <a href="http://www.oreilly.com/catalog/regex3/">
   745  * <i>Mastering Regular Expressions, 3nd Edition</i>, Jeffrey E. F. Friedl,
   746  * O'Reilly and Associates, 2006.</a>
   747  * </p>
   748  *
   749  * @see java.lang.String#split(String, int)
   750  * @see java.lang.String#split(String)
   751  *
   752  * @author      Mike McCloskey
   753  * @author      Mark Reinhold
   754  * @author      JSR-51 Expert Group
   755  * @since       1.4
   756  * @spec        JSR-51
   757  */
   758 
   759 public final class Pattern
   760     implements java.io.Serializable
   761 {
   762 
   763     /**
   764      * Regular expression modifier values.  Instead of being passed as
   765      * arguments, they can also be passed as inline modifiers.
   766      * For example, the following statements have the same effect.
   767      * <pre>
   768      * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M);
   769      * RegExp r2 = RegExp.compile("(?im)abc", 0);
   770      * </pre>
   771      *
   772      * The flags are duplicated so that the familiar Perl match flag
   773      * names are available.
   774      */
   775 
   776     /**
   777      * Enables Unix lines mode.
   778      *
   779      * <p> In this mode, only the <tt>'\n'</tt> line terminator is recognized
   780      * in the behavior of <tt>.</tt>, <tt>^</tt>, and <tt>$</tt>.
   781      *
   782      * <p> Unix lines mode can also be enabled via the embedded flag
   783      * expression&nbsp;<tt>(?d)</tt>.
   784      */
   785     public static final int UNIX_LINES = 0x01;
   786 
   787     /**
   788      * Enables case-insensitive matching.
   789      *
   790      * <p> By default, case-insensitive matching assumes that only characters
   791      * in the US-ASCII charset are being matched.  Unicode-aware
   792      * case-insensitive matching can be enabled by specifying the {@link
   793      * #UNICODE_CASE} flag in conjunction with this flag.
   794      *
   795      * <p> Case-insensitive matching can also be enabled via the embedded flag
   796      * expression&nbsp;<tt>(?i)</tt>.
   797      *
   798      * <p> Specifying this flag may impose a slight performance penalty.  </p>
   799      */
   800     public static final int CASE_INSENSITIVE = 0x02;
   801 
   802     /**
   803      * Permits whitespace and comments in pattern.
   804      *
   805      * <p> In this mode, whitespace is ignored, and embedded comments starting
   806      * with <tt>#</tt> are ignored until the end of a line.
   807      *
   808      * <p> Comments mode can also be enabled via the embedded flag
   809      * expression&nbsp;<tt>(?x)</tt>.
   810      */
   811     public static final int COMMENTS = 0x04;
   812 
   813     /**
   814      * Enables multiline mode.
   815      *
   816      * <p> In multiline mode the expressions <tt>^</tt> and <tt>$</tt> match
   817      * just after or just before, respectively, a line terminator or the end of
   818      * the input sequence.  By default these expressions only match at the
   819      * beginning and the end of the entire input sequence.
   820      *
   821      * <p> Multiline mode can also be enabled via the embedded flag
   822      * expression&nbsp;<tt>(?m)</tt>.  </p>
   823      */
   824     public static final int MULTILINE = 0x08;
   825 
   826     /**
   827      * Enables literal parsing of the pattern.
   828      *
   829      * <p> When this flag is specified then the input string that specifies
   830      * the pattern is treated as a sequence of literal characters.
   831      * Metacharacters or escape sequences in the input sequence will be
   832      * given no special meaning.
   833      *
   834      * <p>The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on
   835      * matching when used in conjunction with this flag. The other flags
   836      * become superfluous.
   837      *
   838      * <p> There is no embedded flag character for enabling literal parsing.
   839      * @since 1.5
   840      */
   841     public static final int LITERAL = 0x10;
   842 
   843     /**
   844      * Enables dotall mode.
   845      *
   846      * <p> In dotall mode, the expression <tt>.</tt> matches any character,
   847      * including a line terminator.  By default this expression does not match
   848      * line terminators.
   849      *
   850      * <p> Dotall mode can also be enabled via the embedded flag
   851      * expression&nbsp;<tt>(?s)</tt>.  (The <tt>s</tt> is a mnemonic for
   852      * "single-line" mode, which is what this is called in Perl.)  </p>
   853      */
   854     public static final int DOTALL = 0x20;
   855 
   856     /**
   857      * Enables Unicode-aware case folding.
   858      *
   859      * <p> When this flag is specified then case-insensitive matching, when
   860      * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner
   861      * consistent with the Unicode Standard.  By default, case-insensitive
   862      * matching assumes that only characters in the US-ASCII charset are being
   863      * matched.
   864      *
   865      * <p> Unicode-aware case folding can also be enabled via the embedded flag
   866      * expression&nbsp;<tt>(?u)</tt>.
   867      *
   868      * <p> Specifying this flag may impose a performance penalty.  </p>
   869      */
   870     public static final int UNICODE_CASE = 0x40;
   871 
   872     /**
   873      * Enables canonical equivalence.
   874      *
   875      * <p> When this flag is specified then two characters will be considered
   876      * to match if, and only if, their full canonical decompositions match.
   877      * The expression <tt>"a&#92;u030A"</tt>, for example, will match the
   878      * string <tt>"&#92;u00E5"</tt> when this flag is specified.  By default,
   879      * matching does not take canonical equivalence into account.
   880      *
   881      * <p> There is no embedded flag character for enabling canonical
   882      * equivalence.
   883      *
   884      * <p> Specifying this flag may impose a performance penalty.  </p>
   885      */
   886     public static final int CANON_EQ = 0x80;
   887 
   888     /**
   889      * Enables the Unicode version of <i>Predefined character classes</i> and
   890      * <i>POSIX character classes</i>.
   891      *
   892      * <p> When this flag is specified then the (US-ASCII only)
   893      * <i>Predefined character classes</i> and <i>POSIX character classes</i>
   894      * are in conformance with
   895      * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
   896      * Standard #18: Unicode Regular Expression</i></a>
   897      * <i>Annex C: Compatibility Properties</i>.
   898      * <p>
   899      * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded
   900      * flag expression&nbsp;<tt>(?U)</tt>.
   901      * <p>
   902      * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case
   903      * folding.
   904      * <p>
   905      * Specifying this flag may impose a performance penalty.  </p>
   906      * @since 1.7
   907      */
   908     public static final int UNICODE_CHARACTER_CLASS = 0x100;
   909 
   910     /* Pattern has only two serialized components: The pattern string
   911      * and the flags, which are all that is needed to recompile the pattern
   912      * when it is deserialized.
   913      */
   914 
   915     /** use serialVersionUID from Merlin b59 for interoperability */
   916     private static final long serialVersionUID = 5073258162644648461L;
   917 
   918     /**
   919      * The original regular-expression pattern string.
   920      *
   921      * @serial
   922      */
   923     private String pattern;
   924 
   925     /**
   926      * The original pattern flags.
   927      *
   928      * @serial
   929      */
   930     private int flags;
   931 
   932     /**
   933      * Boolean indicating this Pattern is compiled; this is necessary in order
   934      * to lazily compile deserialized Patterns.
   935      */
   936     private transient volatile boolean compiled = false;
   937 
   938     /**
   939      * The normalized pattern string.
   940      */
   941     private transient String normalizedPattern;
   942 
   943     /**
   944      * The starting point of state machine for the find operation.  This allows
   945      * a match to start anywhere in the input.
   946      */
   947     transient Node root;
   948 
   949     /**
   950      * The root of object tree for a match operation.  The pattern is matched
   951      * at the beginning.  This may include a find that uses BnM or a First
   952      * node.
   953      */
   954     transient Node matchRoot;
   955 
   956     /**
   957      * Temporary storage used by parsing pattern slice.
   958      */
   959     transient int[] buffer;
   960 
   961     /**
   962      * Map the "name" of the "named capturing group" to its group id
   963      * node.
   964      */
   965     transient volatile Map<String, Integer> namedGroups;
   966 
   967     /**
   968      * Temporary storage used while parsing group references.
   969      */
   970     transient GroupHead[] groupNodes;
   971 
   972     /**
   973      * Temporary null terminated code point array used by pattern compiling.
   974      */
   975     private transient int[] temp;
   976 
   977     /**
   978      * The number of capturing groups in this Pattern. Used by matchers to
   979      * allocate storage needed to perform a match.
   980      */
   981     transient int capturingGroupCount;
   982 
   983     /**
   984      * The local variable count used by parsing tree. Used by matchers to
   985      * allocate storage needed to perform a match.
   986      */
   987     transient int localCount;
   988 
   989     /**
   990      * Index into the pattern string that keeps track of how much has been
   991      * parsed.
   992      */
   993     private transient int cursor;
   994 
   995     /**
   996      * Holds the length of the pattern string.
   997      */
   998     private transient int patternLength;
   999 
  1000     /**
  1001      * If the Start node might possibly match supplementary characters.
  1002      * It is set to true during compiling if
  1003      * (1) There is supplementary char in pattern, or
  1004      * (2) There is complement node of Category or Block
  1005      */
  1006     private transient boolean hasSupplementary;
  1007 
  1008     /**
  1009      * Compiles the given regular expression into a pattern.  </p>
  1010      *
  1011      * @param  regex
  1012      *         The expression to be compiled
  1013      *
  1014      * @throws  PatternSyntaxException
  1015      *          If the expression's syntax is invalid
  1016      */
  1017     public static Pattern compile(String regex) {
  1018         return new Pattern(regex, 0);
  1019     }
  1020 
  1021     /**
  1022      * Compiles the given regular expression into a pattern with the given
  1023      * flags.  </p>
  1024      *
  1025      * @param  regex
  1026      *         The expression to be compiled
  1027      *
  1028      * @param  flags
  1029      *         Match flags, a bit mask that may include
  1030      *         {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
  1031      *         {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
  1032      *         {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}
  1033      *         and {@link #COMMENTS}
  1034      *
  1035      * @throws  IllegalArgumentException
  1036      *          If bit values other than those corresponding to the defined
  1037      *          match flags are set in <tt>flags</tt>
  1038      *
  1039      * @throws  PatternSyntaxException
  1040      *          If the expression's syntax is invalid
  1041      */
  1042     public static Pattern compile(String regex, int flags) {
  1043         return new Pattern(regex, flags);
  1044     }
  1045 
  1046     /**
  1047      * Returns the regular expression from which this pattern was compiled.
  1048      * </p>
  1049      *
  1050      * @return  The source of this pattern
  1051      */
  1052     public String pattern() {
  1053         return pattern;
  1054     }
  1055 
  1056     /**
  1057      * <p>Returns the string representation of this pattern. This
  1058      * is the regular expression from which this pattern was
  1059      * compiled.</p>
  1060      *
  1061      * @return  The string representation of this pattern
  1062      * @since 1.5
  1063      */
  1064     public String toString() {
  1065         return pattern;
  1066     }
  1067 
  1068     /**
  1069      * Creates a matcher that will match the given input against this pattern.
  1070      * </p>
  1071      *
  1072      * @param  input
  1073      *         The character sequence to be matched
  1074      *
  1075      * @return  A new matcher for this pattern
  1076      */
  1077     public Matcher matcher(CharSequence input) {
  1078         if (!compiled) {
  1079             synchronized(this) {
  1080                 if (!compiled)
  1081                     compile();
  1082             }
  1083         }
  1084         Matcher m = new Matcher(this, input);
  1085         return m;
  1086     }
  1087 
  1088     /**
  1089      * Returns this pattern's match flags.  </p>
  1090      *
  1091      * @return  The match flags specified when this pattern was compiled
  1092      */
  1093     public int flags() {
  1094         return flags;
  1095     }
  1096 
  1097     /**
  1098      * Compiles the given regular expression and attempts to match the given
  1099      * input against it.
  1100      *
  1101      * <p> An invocation of this convenience method of the form
  1102      *
  1103      * <blockquote><pre>
  1104      * Pattern.matches(regex, input);</pre></blockquote>
  1105      *
  1106      * behaves in exactly the same way as the expression
  1107      *
  1108      * <blockquote><pre>
  1109      * Pattern.compile(regex).matcher(input).matches()</pre></blockquote>
  1110      *
  1111      * <p> If a pattern is to be used multiple times, compiling it once and reusing
  1112      * it will be more efficient than invoking this method each time.  </p>
  1113      *
  1114      * @param  regex
  1115      *         The expression to be compiled
  1116      *
  1117      * @param  input
  1118      *         The character sequence to be matched
  1119      *
  1120      * @throws  PatternSyntaxException
  1121      *          If the expression's syntax is invalid
  1122      */
  1123     public static boolean matches(String regex, CharSequence input) {
  1124         Pattern p = Pattern.compile(regex);
  1125         Matcher m = p.matcher(input);
  1126         return m.matches();
  1127     }
  1128 
  1129     /**
  1130      * Splits the given input sequence around matches of this pattern.
  1131      *
  1132      * <p> The array returned by this method contains each substring of the
  1133      * input sequence that is terminated by another subsequence that matches
  1134      * this pattern or is terminated by the end of the input sequence.  The
  1135      * substrings in the array are in the order in which they occur in the
  1136      * input.  If this pattern does not match any subsequence of the input then
  1137      * the resulting array has just one element, namely the input sequence in
  1138      * string form.
  1139      *
  1140      * <p> The <tt>limit</tt> parameter controls the number of times the
  1141      * pattern is applied and therefore affects the length of the resulting
  1142      * array.  If the limit <i>n</i> is greater than zero then the pattern
  1143      * will be applied at most <i>n</i>&nbsp;-&nbsp;1 times, the array's
  1144      * length will be no greater than <i>n</i>, and the array's last entry
  1145      * will contain all input beyond the last matched delimiter.  If <i>n</i>
  1146      * is non-positive then the pattern will be applied as many times as
  1147      * possible and the array can have any length.  If <i>n</i> is zero then
  1148      * the pattern will be applied as many times as possible, the array can
  1149      * have any length, and trailing empty strings will be discarded.
  1150      *
  1151      * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following
  1152      * results with these parameters:
  1153      *
  1154      * <blockquote><table cellpadding=1 cellspacing=0
  1155      *              summary="Split examples showing regex, limit, and result">
  1156      * <tr><th><P align="left"><i>Regex&nbsp;&nbsp;&nbsp;&nbsp;</i></th>
  1157      *     <th><P align="left"><i>Limit&nbsp;&nbsp;&nbsp;&nbsp;</i></th>
  1158      *     <th><P align="left"><i>Result&nbsp;&nbsp;&nbsp;&nbsp;</i></th></tr>
  1159      * <tr><td align=center>:</td>
  1160      *     <td align=center>2</td>
  1161      *     <td><tt>{ "boo", "and:foo" }</tt></td></tr>
  1162      * <tr><td align=center>:</td>
  1163      *     <td align=center>5</td>
  1164      *     <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
  1165      * <tr><td align=center>:</td>
  1166      *     <td align=center>-2</td>
  1167      *     <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
  1168      * <tr><td align=center>o</td>
  1169      *     <td align=center>5</td>
  1170      *     <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr>
  1171      * <tr><td align=center>o</td>
  1172      *     <td align=center>-2</td>
  1173      *     <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr>
  1174      * <tr><td align=center>o</td>
  1175      *     <td align=center>0</td>
  1176      *     <td><tt>{ "b", "", ":and:f" }</tt></td></tr>
  1177      * </table></blockquote>
  1178      *
  1179      *
  1180      * @param  input
  1181      *         The character sequence to be split
  1182      *
  1183      * @param  limit
  1184      *         The result threshold, as described above
  1185      *
  1186      * @return  The array of strings computed by splitting the input
  1187      *          around matches of this pattern
  1188      */
  1189     public String[] split(CharSequence input, int limit) {
  1190         int index = 0;
  1191         boolean matchLimited = limit > 0;
  1192         ArrayList<String> matchList = new ArrayList<>();
  1193         Matcher m = matcher(input);
  1194 
  1195         // Add segments before each match found
  1196         while(m.find()) {
  1197             if (!matchLimited || matchList.size() < limit - 1) {
  1198                 String match = input.subSequence(index, m.start()).toString();
  1199                 matchList.add(match);
  1200                 index = m.end();
  1201             } else if (matchList.size() == limit - 1) { // last one
  1202                 String match = input.subSequence(index,
  1203                                                  input.length()).toString();
  1204                 matchList.add(match);
  1205                 index = m.end();
  1206             }
  1207         }
  1208 
  1209         // If no match was found, return this
  1210         if (index == 0)
  1211             return new String[] {input.toString()};
  1212 
  1213         // Add remaining segment
  1214         if (!matchLimited || matchList.size() < limit)
  1215             matchList.add(input.subSequence(index, input.length()).toString());
  1216 
  1217         // Construct result
  1218         int resultSize = matchList.size();
  1219         if (limit == 0)
  1220             while (resultSize > 0 && matchList.get(resultSize-1).equals(""))
  1221                 resultSize--;
  1222         String[] result = new String[resultSize];
  1223         return matchList.subList(0, resultSize).toArray(result);
  1224     }
  1225 
  1226     /**
  1227      * Splits the given input sequence around matches of this pattern.
  1228      *
  1229      * <p> This method works as if by invoking the two-argument {@link
  1230      * #split(java.lang.CharSequence, int) split} method with the given input
  1231      * sequence and a limit argument of zero.  Trailing empty strings are
  1232      * therefore not included in the resulting array. </p>
  1233      *
  1234      * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following
  1235      * results with these expressions:
  1236      *
  1237      * <blockquote><table cellpadding=1 cellspacing=0
  1238      *              summary="Split examples showing regex and result">
  1239      * <tr><th><P align="left"><i>Regex&nbsp;&nbsp;&nbsp;&nbsp;</i></th>
  1240      *     <th><P align="left"><i>Result</i></th></tr>
  1241      * <tr><td align=center>:</td>
  1242      *     <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
  1243      * <tr><td align=center>o</td>
  1244      *     <td><tt>{ "b", "", ":and:f" }</tt></td></tr>
  1245      * </table></blockquote>
  1246      *
  1247      *
  1248      * @param  input
  1249      *         The character sequence to be split
  1250      *
  1251      * @return  The array of strings computed by splitting the input
  1252      *          around matches of this pattern
  1253      */
  1254     public String[] split(CharSequence input) {
  1255         return split(input, 0);
  1256     }
  1257 
  1258     /**
  1259      * Returns a literal pattern <code>String</code> for the specified
  1260      * <code>String</code>.
  1261      *
  1262      * <p>This method produces a <code>String</code> that can be used to
  1263      * create a <code>Pattern</code> that would match the string
  1264      * <code>s</code> as if it were a literal pattern.</p> Metacharacters
  1265      * or escape sequences in the input sequence will be given no special
  1266      * meaning.
  1267      *
  1268      * @param  s The string to be literalized
  1269      * @return  A literal string replacement
  1270      * @since 1.5
  1271      */
  1272     public static String quote(String s) {
  1273         int slashEIndex = s.indexOf("\\E");
  1274         if (slashEIndex == -1)
  1275             return "\\Q" + s + "\\E";
  1276 
  1277         StringBuilder sb = new StringBuilder(s.length() * 2);
  1278         sb.append("\\Q");
  1279         slashEIndex = 0;
  1280         int current = 0;
  1281         while ((slashEIndex = s.indexOf("\\E", current)) != -1) {
  1282             sb.append(s.substring(current, slashEIndex));
  1283             current = slashEIndex + 2;
  1284             sb.append("\\E\\\\E\\Q");
  1285         }
  1286         sb.append(s.substring(current, s.length()));
  1287         sb.append("\\E");
  1288         return sb.toString();
  1289     }
  1290 
  1291     /**
  1292      * Recompile the Pattern instance from a stream.  The original pattern
  1293      * string is read in and the object tree is recompiled from it.
  1294      */
  1295     private void readObject(java.io.ObjectInputStream s)
  1296         throws java.io.IOException, ClassNotFoundException {
  1297 
  1298         // Read in all fields
  1299         s.defaultReadObject();
  1300 
  1301         // Initialize counts
  1302         capturingGroupCount = 1;
  1303         localCount = 0;
  1304 
  1305         // if length > 0, the Pattern is lazily compiled
  1306         compiled = false;
  1307         if (pattern.length() == 0) {
  1308             root = new Start(lastAccept);
  1309             matchRoot = lastAccept;
  1310             compiled = true;
  1311         }
  1312     }
  1313 
  1314     /**
  1315      * This private constructor is used to create all Patterns. The pattern
  1316      * string and match flags are all that is needed to completely describe
  1317      * a Pattern. An empty pattern string results in an object tree with
  1318      * only a Start node and a LastNode node.
  1319      */
  1320     private Pattern(String p, int f) {
  1321         pattern = p;
  1322         flags = f;
  1323 
  1324         // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
  1325         if ((flags & UNICODE_CHARACTER_CLASS) != 0)
  1326             flags |= UNICODE_CASE;
  1327 
  1328         // Reset group index count
  1329         capturingGroupCount = 1;
  1330         localCount = 0;
  1331 
  1332         if (pattern.length() > 0) {
  1333             compile();
  1334         } else {
  1335             root = new Start(lastAccept);
  1336             matchRoot = lastAccept;
  1337         }
  1338     }
  1339 
  1340     /**
  1341      * The pattern is converted to normalizedD form and then a pure group
  1342      * is constructed to match canonical equivalences of the characters.
  1343      */
  1344     private void normalize() {
  1345         boolean inCharClass = false;
  1346         int lastCodePoint = -1;
  1347 
  1348         // Convert pattern into normalizedD form
  1349         normalizedPattern = Normalizer.normalize(pattern, Normalizer.NFD);
  1350         patternLength = normalizedPattern.length();
  1351 
  1352         // Modify pattern to match canonical equivalences
  1353         StringBuilder newPattern = new StringBuilder(patternLength);
  1354         for(int i=0; i<patternLength; ) {
  1355             int c = normalizedPattern.codePointAt(i);
  1356             StringBuilder sequenceBuffer;
  1357             if ((Character.getType(c) == Character.NON_SPACING_MARK)
  1358                 && (lastCodePoint != -1)) {
  1359                 sequenceBuffer = new StringBuilder();
  1360                 sequenceBuffer.appendCodePoint(lastCodePoint);
  1361                 sequenceBuffer.appendCodePoint(c);
  1362                 while(Character.getType(c) == Character.NON_SPACING_MARK) {
  1363                     i += Character.charCount(c);
  1364                     if (i >= patternLength)
  1365                         break;
  1366                     c = normalizedPattern.codePointAt(i);
  1367                     sequenceBuffer.appendCodePoint(c);
  1368                 }
  1369                 String ea = produceEquivalentAlternation(
  1370                                                sequenceBuffer.toString());
  1371                 newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
  1372                 newPattern.append("(?:").append(ea).append(")");
  1373             } else if (c == '[' && lastCodePoint != '\\') {
  1374                 i = normalizeCharClass(newPattern, i);
  1375             } else {
  1376                 newPattern.appendCodePoint(c);
  1377             }
  1378             lastCodePoint = c;
  1379             i += Character.charCount(c);
  1380         }
  1381         normalizedPattern = newPattern.toString();
  1382     }
  1383 
  1384     /**
  1385      * Complete the character class being parsed and add a set
  1386      * of alternations to it that will match the canonical equivalences
  1387      * of the characters within the class.
  1388      */
  1389     private int normalizeCharClass(StringBuilder newPattern, int i) {
  1390         StringBuilder charClass = new StringBuilder();
  1391         StringBuilder eq = null;
  1392         int lastCodePoint = -1;
  1393         String result;
  1394 
  1395         i++;
  1396         charClass.append("[");
  1397         while(true) {
  1398             int c = normalizedPattern.codePointAt(i);
  1399             StringBuilder sequenceBuffer;
  1400 
  1401             if (c == ']' && lastCodePoint != '\\') {
  1402                 charClass.append((char)c);
  1403                 break;
  1404             } else if (Character.getType(c) == Character.NON_SPACING_MARK) {
  1405                 sequenceBuffer = new StringBuilder();
  1406                 sequenceBuffer.appendCodePoint(lastCodePoint);
  1407                 while(Character.getType(c) == Character.NON_SPACING_MARK) {
  1408                     sequenceBuffer.appendCodePoint(c);
  1409                     i += Character.charCount(c);
  1410                     if (i >= normalizedPattern.length())
  1411                         break;
  1412                     c = normalizedPattern.codePointAt(i);
  1413                 }
  1414                 String ea = produceEquivalentAlternation(
  1415                                                   sequenceBuffer.toString());
  1416 
  1417                 charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
  1418                 if (eq == null)
  1419                     eq = new StringBuilder();
  1420                 eq.append('|');
  1421                 eq.append(ea);
  1422             } else {
  1423                 charClass.appendCodePoint(c);
  1424                 i++;
  1425             }
  1426             if (i == normalizedPattern.length())
  1427                 throw error("Unclosed character class");
  1428             lastCodePoint = c;
  1429         }
  1430 
  1431         if (eq != null) {
  1432             result = "(?:"+charClass.toString()+eq.toString()+")";
  1433         } else {
  1434             result = charClass.toString();
  1435         }
  1436 
  1437         newPattern.append(result);
  1438         return i;
  1439     }
  1440 
  1441     /**
  1442      * Given a specific sequence composed of a regular character and
  1443      * combining marks that follow it, produce the alternation that will
  1444      * match all canonical equivalences of that sequence.
  1445      */
  1446     private String produceEquivalentAlternation(String source) {
  1447         int len = countChars(source, 0, 1);
  1448         if (source.length() == len)
  1449             // source has one character.
  1450             return source;
  1451 
  1452         String base = source.substring(0,len);
  1453         String combiningMarks = source.substring(len);
  1454 
  1455         String[] perms = producePermutations(combiningMarks);
  1456         StringBuilder result = new StringBuilder(source);
  1457 
  1458         // Add combined permutations
  1459         for(int x=0; x<perms.length; x++) {
  1460             String next = base + perms[x];
  1461             if (x>0)
  1462                 result.append("|"+next);
  1463             next = composeOneStep(next);
  1464             if (next != null)
  1465                 result.append("|"+produceEquivalentAlternation(next));
  1466         }
  1467         return result.toString();
  1468     }
  1469 
  1470     /**
  1471      * Returns an array of strings that have all the possible
  1472      * permutations of the characters in the input string.
  1473      * This is used to get a list of all possible orderings
  1474      * of a set of combining marks. Note that some of the permutations
  1475      * are invalid because of combining class collisions, and these
  1476      * possibilities must be removed because they are not canonically
  1477      * equivalent.
  1478      */
  1479     private String[] producePermutations(String input) {
  1480         if (input.length() == countChars(input, 0, 1))
  1481             return new String[] {input};
  1482 
  1483         if (input.length() == countChars(input, 0, 2)) {
  1484             int c0 = Character.codePointAt(input, 0);
  1485             int c1 = Character.codePointAt(input, Character.charCount(c0));
  1486             if (getClass(c1) == getClass(c0)) {
  1487                 return new String[] {input};
  1488             }
  1489             String[] result = new String[2];
  1490             result[0] = input;
  1491             StringBuilder sb = new StringBuilder(2);
  1492             sb.appendCodePoint(c1);
  1493             sb.appendCodePoint(c0);
  1494             result[1] = sb.toString();
  1495             return result;
  1496         }
  1497 
  1498         int length = 1;
  1499         int nCodePoints = countCodePoints(input);
  1500         for(int x=1; x<nCodePoints; x++)
  1501             length = length * (x+1);
  1502 
  1503         String[] temp = new String[length];
  1504 
  1505         int combClass[] = new int[nCodePoints];
  1506         for(int x=0, i=0; x<nCodePoints; x++) {
  1507             int c = Character.codePointAt(input, i);
  1508             combClass[x] = getClass(c);
  1509             i +=  Character.charCount(c);
  1510         }
  1511 
  1512         // For each char, take it out and add the permutations
  1513         // of the remaining chars
  1514         int index = 0;
  1515         int len;
  1516         // offset maintains the index in code units.
  1517 loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
  1518             len = countChars(input, offset, 1);
  1519             boolean skip = false;
  1520             for(int y=x-1; y>=0; y--) {
  1521                 if (combClass[y] == combClass[x]) {
  1522                     continue loop;
  1523                 }
  1524             }
  1525             StringBuilder sb = new StringBuilder(input);
  1526             String otherChars = sb.delete(offset, offset+len).toString();
  1527             String[] subResult = producePermutations(otherChars);
  1528 
  1529             String prefix = input.substring(offset, offset+len);
  1530             for(int y=0; y<subResult.length; y++)
  1531                 temp[index++] =  prefix + subResult[y];
  1532         }
  1533         String[] result = new String[index];
  1534         for (int x=0; x<index; x++)
  1535             result[x] = temp[x];
  1536         return result;
  1537     }
  1538 
  1539     private int getClass(int c) {
  1540         return Normalizer.getCombiningClass(c);
  1541     }
  1542 
  1543     /**
  1544      * Attempts to compose input by combining the first character
  1545      * with the first combining mark following it. Returns a String
  1546      * that is the composition of the leading character with its first
  1547      * combining mark followed by the remaining combining marks. Returns
  1548      * null if the first two characters cannot be further composed.
  1549      */
  1550     private String composeOneStep(String input) {
  1551         int len = countChars(input, 0, 2);
  1552         String firstTwoCharacters = input.substring(0, len);
  1553         String result = Normalizer.normalize(firstTwoCharacters, Normalizer.NFC);
  1554 
  1555         if (result.equals(firstTwoCharacters))
  1556             return null;
  1557         else {
  1558             String remainder = input.substring(len);
  1559             return result + remainder;
  1560         }
  1561     }
  1562 
  1563     /**
  1564      * Preprocess any \Q...\E sequences in `temp', meta-quoting them.
  1565      * See the description of `quotemeta' in perlfunc(1).
  1566      */
  1567     private void RemoveQEQuoting() {
  1568         final int pLen = patternLength;
  1569         int i = 0;
  1570         while (i < pLen-1) {
  1571             if (temp[i] != '\\')
  1572                 i += 1;
  1573             else if (temp[i + 1] != 'Q')
  1574                 i += 2;
  1575             else
  1576                 break;
  1577         }
  1578         if (i >= pLen - 1)    // No \Q sequence found
  1579             return;
  1580         int j = i;
  1581         i += 2;
  1582         int[] newtemp = new int[j + 2*(pLen-i) + 2];
  1583         System.arraycopy(temp, 0, newtemp, 0, j);
  1584 
  1585         boolean inQuote = true;
  1586         while (i < pLen) {
  1587             int c = temp[i++];
  1588             if (! ASCII.isAscii(c) || ASCII.isAlnum(c)) {
  1589                 newtemp[j++] = c;
  1590             } else if (c != '\\') {
  1591                 if (inQuote) newtemp[j++] = '\\';
  1592                 newtemp[j++] = c;
  1593             } else if (inQuote) {
  1594                 if (temp[i] == 'E') {
  1595                     i++;
  1596                     inQuote = false;
  1597                 } else {
  1598                     newtemp[j++] = '\\';
  1599                     newtemp[j++] = '\\';
  1600                 }
  1601             } else {
  1602                 if (temp[i] == 'Q') {
  1603                     i++;
  1604                     inQuote = true;
  1605                 } else {
  1606                     newtemp[j++] = c;
  1607                     if (i != pLen)
  1608                         newtemp[j++] = temp[i++];
  1609                 }
  1610             }
  1611         }
  1612 
  1613         patternLength = j;
  1614         temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
  1615     }
  1616 
  1617     /**
  1618      * Copies regular expression to an int array and invokes the parsing
  1619      * of the expression which will create the object tree.
  1620      */
  1621     private void compile() {
  1622         // Handle canonical equivalences
  1623         if (has(CANON_EQ) && !has(LITERAL)) {
  1624             normalize();
  1625         } else {
  1626             normalizedPattern = pattern;
  1627         }
  1628         patternLength = normalizedPattern.length();
  1629 
  1630         // Copy pattern to int array for convenience
  1631         // Use double zero to terminate pattern
  1632         temp = new int[patternLength + 2];
  1633 
  1634         hasSupplementary = false;
  1635         int c, count = 0;
  1636         // Convert all chars into code points
  1637         for (int x = 0; x < patternLength; x += Character.charCount(c)) {
  1638             c = normalizedPattern.codePointAt(x);
  1639             if (isSupplementary(c)) {
  1640                 hasSupplementary = true;
  1641             }
  1642             temp[count++] = c;
  1643         }
  1644 
  1645         patternLength = count;   // patternLength now in code points
  1646 
  1647         if (! has(LITERAL))
  1648             RemoveQEQuoting();
  1649 
  1650         // Allocate all temporary objects here.
  1651         buffer = new int[32];
  1652         groupNodes = new GroupHead[10];
  1653         namedGroups = null;
  1654 
  1655         if (has(LITERAL)) {
  1656             // Literal pattern handling
  1657             matchRoot = newSlice(temp, patternLength, hasSupplementary);
  1658             matchRoot.next = lastAccept;
  1659         } else {
  1660             // Start recursive descent parsing
  1661             matchRoot = expr(lastAccept);
  1662             // Check extra pattern characters
  1663             if (patternLength != cursor) {
  1664                 if (peek() == ')') {
  1665                     throw error("Unmatched closing ')'");
  1666                 } else {
  1667                     throw error("Unexpected internal error");
  1668                 }
  1669             }
  1670         }
  1671 
  1672         // Peephole optimization
  1673         if (matchRoot instanceof Slice) {
  1674             root = BnM.optimize(matchRoot);
  1675             if (root == matchRoot) {
  1676                 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
  1677             }
  1678         } else if (matchRoot instanceof Begin || matchRoot instanceof First) {
  1679             root = matchRoot;
  1680         } else {
  1681             root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
  1682         }
  1683 
  1684         // Release temporary storage
  1685         temp = null;
  1686         buffer = null;
  1687         groupNodes = null;
  1688         patternLength = 0;
  1689         compiled = true;
  1690     }
  1691 
  1692     Map<String, Integer> namedGroups() {
  1693         if (namedGroups == null)
  1694             namedGroups = new HashMap<>(2);
  1695         return namedGroups;
  1696     }
  1697 
  1698     /**
  1699      * Used to print out a subtree of the Pattern to help with debugging.
  1700      */
  1701     private static void printObjectTree(Node node) {
  1702         while(node != null) {
  1703             if (node instanceof Prolog) {
  1704                 System.out.println(node);
  1705                 printObjectTree(((Prolog)node).loop);
  1706                 System.out.println("**** end contents prolog loop");
  1707             } else if (node instanceof Loop) {
  1708                 System.out.println(node);
  1709                 printObjectTree(((Loop)node).body);
  1710                 System.out.println("**** end contents Loop body");
  1711             } else if (node instanceof Curly) {
  1712                 System.out.println(node);
  1713                 printObjectTree(((Curly)node).atom);
  1714                 System.out.println("**** end contents Curly body");
  1715             } else if (node instanceof GroupCurly) {
  1716                 System.out.println(node);
  1717                 printObjectTree(((GroupCurly)node).atom);
  1718                 System.out.println("**** end contents GroupCurly body");
  1719             } else if (node instanceof GroupTail) {
  1720                 System.out.println(node);
  1721                 System.out.println("Tail next is "+node.next);
  1722                 return;
  1723             } else {
  1724                 System.out.println(node);
  1725             }
  1726             node = node.next;
  1727             if (node != null)
  1728                 System.out.println("->next:");
  1729             if (node == Pattern.accept) {
  1730                 System.out.println("Accept Node");
  1731                 node = null;
  1732             }
  1733        }
  1734     }
  1735 
  1736     /**
  1737      * Used to accumulate information about a subtree of the object graph
  1738      * so that optimizations can be applied to the subtree.
  1739      */
  1740     static final class TreeInfo {
  1741         int minLength;
  1742         int maxLength;
  1743         boolean maxValid;
  1744         boolean deterministic;
  1745 
  1746         TreeInfo() {
  1747             reset();
  1748         }
  1749         void reset() {
  1750             minLength = 0;
  1751             maxLength = 0;
  1752             maxValid = true;
  1753             deterministic = true;
  1754         }
  1755     }
  1756 
  1757     /*
  1758      * The following private methods are mainly used to improve the
  1759      * readability of the code. In order to let the Java compiler easily
  1760      * inline them, we should not put many assertions or error checks in them.
  1761      */
  1762 
  1763     /**
  1764      * Indicates whether a particular flag is set or not.
  1765      */
  1766     private boolean has(int f) {
  1767         return (flags & f) != 0;
  1768     }
  1769 
  1770     /**
  1771      * Match next character, signal error if failed.
  1772      */
  1773     private void accept(int ch, String s) {
  1774         int testChar = temp[cursor++];
  1775         if (has(COMMENTS))
  1776             testChar = parsePastWhitespace(testChar);
  1777         if (ch != testChar) {
  1778             throw error(s);
  1779         }
  1780     }
  1781 
  1782     /**
  1783      * Mark the end of pattern with a specific character.
  1784      */
  1785     private void mark(int c) {
  1786         temp[patternLength] = c;
  1787     }
  1788 
  1789     /**
  1790      * Peek the next character, and do not advance the cursor.
  1791      */
  1792     private int peek() {
  1793         int ch = temp[cursor];
  1794         if (has(COMMENTS))
  1795             ch = peekPastWhitespace(ch);
  1796         return ch;
  1797     }
  1798 
  1799     /**
  1800      * Read the next character, and advance the cursor by one.
  1801      */
  1802     private int read() {
  1803         int ch = temp[cursor++];
  1804         if (has(COMMENTS))
  1805             ch = parsePastWhitespace(ch);
  1806         return ch;
  1807     }
  1808 
  1809     /**
  1810      * Read the next character, and advance the cursor by one,
  1811      * ignoring the COMMENTS setting
  1812      */
  1813     private int readEscaped() {
  1814         int ch = temp[cursor++];
  1815         return ch;
  1816     }
  1817 
  1818     /**
  1819      * Advance the cursor by one, and peek the next character.
  1820      */
  1821     private int next() {
  1822         int ch = temp[++cursor];
  1823         if (has(COMMENTS))
  1824             ch = peekPastWhitespace(ch);
  1825         return ch;
  1826     }
  1827 
  1828     /**
  1829      * Advance the cursor by one, and peek the next character,
  1830      * ignoring the COMMENTS setting
  1831      */
  1832     private int nextEscaped() {
  1833         int ch = temp[++cursor];
  1834         return ch;
  1835     }
  1836 
  1837     /**
  1838      * If in xmode peek past whitespace and comments.
  1839      */
  1840     private int peekPastWhitespace(int ch) {
  1841         while (ASCII.isSpace(ch) || ch == '#') {
  1842             while (ASCII.isSpace(ch))
  1843                 ch = temp[++cursor];
  1844             if (ch == '#') {
  1845                 ch = peekPastLine();
  1846             }
  1847         }
  1848         return ch;
  1849     }
  1850 
  1851     /**
  1852      * If in xmode parse past whitespace and comments.
  1853      */
  1854     private int parsePastWhitespace(int ch) {
  1855         while (ASCII.isSpace(ch) || ch == '#') {
  1856             while (ASCII.isSpace(ch))
  1857                 ch = temp[cursor++];
  1858             if (ch == '#')
  1859                 ch = parsePastLine();
  1860         }
  1861         return ch;
  1862     }
  1863 
  1864     /**
  1865      * xmode parse past comment to end of line.
  1866      */
  1867     private int parsePastLine() {
  1868         int ch = temp[cursor++];
  1869         while (ch != 0 && !isLineSeparator(ch))
  1870             ch = temp[cursor++];
  1871         return ch;
  1872     }
  1873 
  1874     /**
  1875      * xmode peek past comment to end of line.
  1876      */
  1877     private int peekPastLine() {
  1878         int ch = temp[++cursor];
  1879         while (ch != 0 && !isLineSeparator(ch))
  1880             ch = temp[++cursor];
  1881         return ch;
  1882     }
  1883 
  1884     /**
  1885      * Determines if character is a line separator in the current mode
  1886      */
  1887     private boolean isLineSeparator(int ch) {
  1888         if (has(UNIX_LINES)) {
  1889             return ch == '\n';
  1890         } else {
  1891             return (ch == '\n' ||
  1892                     ch == '\r' ||
  1893                     (ch|1) == '\u2029' ||
  1894                     ch == '\u0085');
  1895         }
  1896     }
  1897 
  1898     /**
  1899      * Read the character after the next one, and advance the cursor by two.
  1900      */
  1901     private int skip() {
  1902         int i = cursor;
  1903         int ch = temp[i+1];
  1904         cursor = i + 2;
  1905         return ch;
  1906     }
  1907 
  1908     /**
  1909      * Unread one next character, and retreat cursor by one.
  1910      */
  1911     private void unread() {
  1912         cursor--;
  1913     }
  1914 
  1915     /**
  1916      * Internal method used for handling all syntax errors. The pattern is
  1917      * displayed with a pointer to aid in locating the syntax error.
  1918      */
  1919     private PatternSyntaxException error(String s) {
  1920         return new PatternSyntaxException(s, normalizedPattern,  cursor - 1);
  1921     }
  1922 
  1923     /**
  1924      * Determines if there is any supplementary character or unpaired
  1925      * surrogate in the specified range.
  1926      */
  1927     private boolean findSupplementary(int start, int end) {
  1928         for (int i = start; i < end; i++) {
  1929             if (isSupplementary(temp[i]))
  1930                 return true;
  1931         }
  1932         return false;
  1933     }
  1934 
  1935     /**
  1936      * Determines if the specified code point is a supplementary
  1937      * character or unpaired surrogate.
  1938      */
  1939     private static final boolean isSupplementary(int ch) {
  1940         return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||
  1941                Character.isSurrogate((char)ch);
  1942     }
  1943 
  1944     /**
  1945      *  The following methods handle the main parsing. They are sorted
  1946      *  according to their precedence order, the lowest one first.
  1947      */
  1948 
  1949     /**
  1950      * The expression is parsed with branch nodes added for alternations.
  1951      * This may be called recursively to parse sub expressions that may
  1952      * contain alternations.
  1953      */
  1954     private Node expr(Node end) {
  1955         Node prev = null;
  1956         Node firstTail = null;
  1957         Node branchConn = null;
  1958 
  1959         for (;;) {
  1960             Node node = sequence(end);
  1961             Node nodeTail = root;      //double return
  1962             if (prev == null) {
  1963                 prev = node;
  1964                 firstTail = nodeTail;
  1965             } else {
  1966                 // Branch
  1967                 if (branchConn == null) {
  1968                     branchConn = new BranchConn();
  1969                     branchConn.next = end;
  1970                 }
  1971                 if (node == end) {
  1972                     // if the node returned from sequence() is "end"
  1973                     // we have an empty expr, set a null atom into
  1974                     // the branch to indicate to go "next" directly.
  1975                     node = null;
  1976                 } else {
  1977                     // the "tail.next" of each atom goes to branchConn
  1978                     nodeTail.next = branchConn;
  1979                 }
  1980                 if (prev instanceof Branch) {
  1981                     ((Branch)prev).add(node);
  1982                 } else {
  1983                     if (prev == end) {
  1984                         prev = null;
  1985                     } else {
  1986                         // replace the "end" with "branchConn" at its tail.next
  1987                         // when put the "prev" into the branch as the first atom.
  1988                         firstTail.next = branchConn;
  1989                     }
  1990                     prev = new Branch(prev, node, branchConn);
  1991                 }
  1992             }
  1993             if (peek() != '|') {
  1994                 return prev;
  1995             }
  1996             next();
  1997         }
  1998     }
  1999 
  2000     /**
  2001      * Parsing of sequences between alternations.
  2002      */
  2003     private Node sequence(Node end) {
  2004         Node head = null;
  2005         Node tail = null;
  2006         Node node = null;
  2007     LOOP:
  2008         for (;;) {
  2009             int ch = peek();
  2010             switch (ch) {
  2011             case '(':
  2012                 // Because group handles its own closure,
  2013                 // we need to treat it differently
  2014                 node = group0();
  2015                 // Check for comment or flag group
  2016                 if (node == null)
  2017                     continue;
  2018                 if (head == null)
  2019                     head = node;
  2020                 else
  2021                     tail.next = node;
  2022                 // Double return: Tail was returned in root
  2023                 tail = root;
  2024                 continue;
  2025             case '[':
  2026                 node = clazz(true);
  2027                 break;
  2028             case '\\':
  2029                 ch = nextEscaped();
  2030                 if (ch == 'p' || ch == 'P') {
  2031                     boolean oneLetter = true;
  2032                     boolean comp = (ch == 'P');
  2033                     ch = next(); // Consume { if present
  2034                     if (ch != '{') {
  2035                         unread();
  2036                     } else {
  2037                         oneLetter = false;
  2038                     }
  2039                     node = family(oneLetter, comp);
  2040                 } else {
  2041                     unread();
  2042                     node = atom();
  2043                 }
  2044                 break;
  2045             case '^':
  2046                 next();
  2047                 if (has(MULTILINE)) {
  2048                     if (has(UNIX_LINES))
  2049                         node = new UnixCaret();
  2050                     else
  2051                         node = new Caret();
  2052                 } else {
  2053                     node = new Begin();
  2054                 }
  2055                 break;
  2056             case '$':
  2057                 next();
  2058                 if (has(UNIX_LINES))
  2059                     node = new UnixDollar(has(MULTILINE));
  2060                 else
  2061                     node = new Dollar(has(MULTILINE));
  2062                 break;
  2063             case '.':
  2064                 next();
  2065                 if (has(DOTALL)) {
  2066                     node = new All();
  2067                 } else {
  2068                     if (has(UNIX_LINES))
  2069                         node = new UnixDot();
  2070                     else {
  2071                         node = new Dot();
  2072                     }
  2073                 }
  2074                 break;
  2075             case '|':
  2076             case ')':
  2077                 break LOOP;
  2078             case ']': // Now interpreting dangling ] and } as literals
  2079             case '}':
  2080                 node = atom();
  2081                 break;
  2082             case '?':
  2083             case '*':
  2084             case '+':
  2085                 next();
  2086                 throw error("Dangling meta character '" + ((char)ch) + "'");
  2087             case 0:
  2088                 if (cursor >= patternLength) {
  2089                     break LOOP;
  2090                 }
  2091                 // Fall through
  2092             default:
  2093                 node = atom();
  2094                 break;
  2095             }
  2096 
  2097             node = closure(node);
  2098 
  2099             if (head == null) {
  2100                 head = tail = node;
  2101             } else {
  2102                 tail.next = node;
  2103                 tail = node;
  2104             }
  2105         }
  2106         if (head == null) {
  2107             return end;
  2108         }
  2109         tail.next = end;
  2110         root = tail;      //double return
  2111         return head;
  2112     }
  2113 
  2114     /**
  2115      * Parse and add a new Single or Slice.
  2116      */
  2117     private Node atom() {
  2118         int first = 0;
  2119         int prev = -1;
  2120         boolean hasSupplementary = false;
  2121         int ch = peek();
  2122         for (;;) {
  2123             switch (ch) {
  2124             case '*':
  2125             case '+':
  2126             case '?':
  2127             case '{':
  2128                 if (first > 1) {
  2129                     cursor = prev;    // Unwind one character
  2130                     first--;
  2131                 }
  2132                 break;
  2133             case '$':
  2134             case '.':
  2135             case '^':
  2136             case '(':
  2137             case '[':
  2138             case '|':
  2139             case ')':
  2140                 break;
  2141             case '\\':
  2142                 ch = nextEscaped();
  2143                 if (ch == 'p' || ch == 'P') { // Property
  2144                     if (first > 0) { // Slice is waiting; handle it first
  2145                         unread();
  2146                         break;
  2147                     } else { // No slice; just return the family node
  2148                         boolean comp = (ch == 'P');
  2149                         boolean oneLetter = true;
  2150                         ch = next(); // Consume { if present
  2151                         if (ch != '{')
  2152                             unread();
  2153                         else
  2154                             oneLetter = false;
  2155                         return family(oneLetter, comp);
  2156                     }
  2157                 }
  2158                 unread();
  2159                 prev = cursor;
  2160                 ch = escape(false, first == 0);
  2161                 if (ch >= 0) {
  2162                     append(ch, first);
  2163                     first++;
  2164                     if (isSupplementary(ch)) {
  2165                         hasSupplementary = true;
  2166                     }
  2167                     ch = peek();
  2168                     continue;
  2169                 } else if (first == 0) {
  2170                     return root;
  2171                 }
  2172                 // Unwind meta escape sequence
  2173                 cursor = prev;
  2174                 break;
  2175             case 0:
  2176                 if (cursor >= patternLength) {
  2177                     break;
  2178                 }
  2179                 // Fall through
  2180             default:
  2181                 prev = cursor;
  2182                 append(ch, first);
  2183                 first++;
  2184                 if (isSupplementary(ch)) {
  2185                     hasSupplementary = true;
  2186                 }
  2187                 ch = next();
  2188                 continue;
  2189             }
  2190             break;
  2191         }
  2192         if (first == 1) {
  2193             return newSingle(buffer[0]);
  2194         } else {
  2195             return newSlice(buffer, first, hasSupplementary);
  2196         }
  2197     }
  2198 
  2199     private void append(int ch, int len) {
  2200         if (len >= buffer.length) {
  2201             int[] tmp = new int[len+len];
  2202             System.arraycopy(buffer, 0, tmp, 0, len);
  2203             buffer = tmp;
  2204         }
  2205         buffer[len] = ch;
  2206     }
  2207 
  2208     /**
  2209      * Parses a backref greedily, taking as many numbers as it
  2210      * can. The first digit is always treated as a backref, but
  2211      * multi digit numbers are only treated as a backref if at
  2212      * least that many backrefs exist at this point in the regex.
  2213      */
  2214     private Node ref(int refNum) {
  2215         boolean done = false;
  2216         while(!done) {
  2217             int ch = peek();
  2218             switch(ch) {
  2219             case '0':
  2220             case '1':
  2221             case '2':
  2222             case '3':
  2223             case '4':
  2224             case '5':
  2225             case '6':
  2226             case '7':
  2227             case '8':
  2228             case '9':
  2229                 int newRefNum = (refNum * 10) + (ch - '0');
  2230                 // Add another number if it doesn't make a group
  2231                 // that doesn't exist
  2232                 if (capturingGroupCount - 1 < newRefNum) {
  2233                     done = true;
  2234                     break;
  2235                 }
  2236                 refNum = newRefNum;
  2237                 read();
  2238                 break;
  2239             default:
  2240                 done = true;
  2241                 break;
  2242             }
  2243         }
  2244         if (has(CASE_INSENSITIVE))
  2245             return new CIBackRef(refNum, has(UNICODE_CASE));
  2246         else
  2247             return new BackRef(refNum);
  2248     }
  2249 
  2250     /**
  2251      * Parses an escape sequence to determine the actual value that needs
  2252      * to be matched.
  2253      * If -1 is returned and create was true a new object was added to the tree
  2254      * to handle the escape sequence.
  2255      * If the returned value is greater than zero, it is the value that
  2256      * matches the escape sequence.
  2257      */
  2258     private int escape(boolean inclass, boolean create) {
  2259         int ch = skip();
  2260         switch (ch) {
  2261         case '0':
  2262             return o();
  2263         case '1':
  2264         case '2':
  2265         case '3':
  2266         case '4':
  2267         case '5':
  2268         case '6':
  2269         case '7':
  2270         case '8':
  2271         case '9':
  2272             if (inclass) break;
  2273             if (create) {
  2274                 root = ref((ch - '0'));
  2275             }
  2276             return -1;
  2277         case 'A':
  2278             if (inclass) break;
  2279             if (create) root = new Begin();
  2280             return -1;
  2281         case 'B':
  2282             if (inclass) break;
  2283             if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
  2284             return -1;
  2285         case 'C':
  2286             break;
  2287         case 'D':
  2288             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2289                                ? new Utype(UnicodeProp.DIGIT).complement()
  2290                                : new Ctype(ASCII.DIGIT).complement();
  2291             return -1;
  2292         case 'E':
  2293         case 'F':
  2294             break;
  2295         case 'G':
  2296             if (inclass) break;
  2297             if (create) root = new LastMatch();
  2298             return -1;
  2299         case 'H':
  2300         case 'I':
  2301         case 'J':
  2302         case 'K':
  2303         case 'L':
  2304         case 'M':
  2305         case 'N':
  2306         case 'O':
  2307         case 'P':
  2308         case 'Q':
  2309         case 'R':
  2310             break;
  2311         case 'S':
  2312             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2313                                ? new Utype(UnicodeProp.WHITE_SPACE).complement()
  2314                                : new Ctype(ASCII.SPACE).complement();
  2315             return -1;
  2316         case 'T':
  2317         case 'U':
  2318         case 'V':
  2319             break;
  2320         case 'W':
  2321             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2322                                ? new Utype(UnicodeProp.WORD).complement()
  2323                                : new Ctype(ASCII.WORD).complement();
  2324             return -1;
  2325         case 'X':
  2326         case 'Y':
  2327             break;
  2328         case 'Z':
  2329             if (inclass) break;
  2330             if (create) {
  2331                 if (has(UNIX_LINES))
  2332                     root = new UnixDollar(false);
  2333                 else
  2334                     root = new Dollar(false);
  2335             }
  2336             return -1;
  2337         case 'a':
  2338             return '\007';
  2339         case 'b':
  2340             if (inclass) break;
  2341             if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
  2342             return -1;
  2343         case 'c':
  2344             return c();
  2345         case 'd':
  2346             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2347                                ? new Utype(UnicodeProp.DIGIT)
  2348                                : new Ctype(ASCII.DIGIT);
  2349             return -1;
  2350         case 'e':
  2351             return '\033';
  2352         case 'f':
  2353             return '\f';
  2354         case 'g':
  2355         case 'h':
  2356         case 'i':
  2357         case 'j':
  2358             break;
  2359         case 'k':
  2360             if (inclass)
  2361                 break;
  2362             if (read() != '<')
  2363                 throw error("\\k is not followed by '<' for named capturing group");
  2364             String name = groupname(read());
  2365             if (!namedGroups().containsKey(name))
  2366                 throw error("(named capturing group <"+ name+"> does not exit");
  2367             if (create) {
  2368                 if (has(CASE_INSENSITIVE))
  2369                     root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
  2370                 else
  2371                     root = new BackRef(namedGroups().get(name));
  2372             }
  2373             return -1;
  2374         case 'l':
  2375         case 'm':
  2376             break;
  2377         case 'n':
  2378             return '\n';
  2379         case 'o':
  2380         case 'p':
  2381         case 'q':
  2382             break;
  2383         case 'r':
  2384             return '\r';
  2385         case 's':
  2386             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2387                                ? new Utype(UnicodeProp.WHITE_SPACE)
  2388                                : new Ctype(ASCII.SPACE);
  2389             return -1;
  2390         case 't':
  2391             return '\t';
  2392         case 'u':
  2393             return u();
  2394         case 'v':
  2395             return '\013';
  2396         case 'w':
  2397             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2398                                ? new Utype(UnicodeProp.WORD)
  2399                                : new Ctype(ASCII.WORD);
  2400             return -1;
  2401         case 'x':
  2402             return x();
  2403         case 'y':
  2404             break;
  2405         case 'z':
  2406             if (inclass) break;
  2407             if (create) root = new End();
  2408             return -1;
  2409         default:
  2410             return ch;
  2411         }
  2412         throw error("Illegal/unsupported escape sequence");
  2413     }
  2414 
  2415     /**
  2416      * Parse a character class, and return the node that matches it.
  2417      *
  2418      * Consumes a ] on the way out if consume is true. Usually consume
  2419      * is true except for the case of [abc&&def] where def is a separate
  2420      * right hand node with "understood" brackets.
  2421      */
  2422     private CharProperty clazz(boolean consume) {
  2423         CharProperty prev = null;
  2424         CharProperty node = null;
  2425         BitClass bits = new BitClass();
  2426         boolean include = true;
  2427         boolean firstInClass = true;
  2428         int ch = next();
  2429         for (;;) {
  2430             switch (ch) {
  2431                 case '^':
  2432                     // Negates if first char in a class, otherwise literal
  2433                     if (firstInClass) {
  2434                         if (temp[cursor-1] != '[')
  2435                             break;
  2436                         ch = next();
  2437                         include = !include;
  2438                         continue;
  2439                     } else {
  2440                         // ^ not first in class, treat as literal
  2441                         break;
  2442                     }
  2443                 case '[':
  2444                     firstInClass = false;
  2445                     node = clazz(true);
  2446                     if (prev == null)
  2447                         prev = node;
  2448                     else
  2449                         prev = union(prev, node);
  2450                     ch = peek();
  2451                     continue;
  2452                 case '&':
  2453                     firstInClass = false;
  2454                     ch = next();
  2455                     if (ch == '&') {
  2456                         ch = next();
  2457                         CharProperty rightNode = null;
  2458                         while (ch != ']' && ch != '&') {
  2459                             if (ch == '[') {
  2460                                 if (rightNode == null)
  2461                                     rightNode = clazz(true);
  2462                                 else
  2463                                     rightNode = union(rightNode, clazz(true));
  2464                             } else { // abc&&def
  2465                                 unread();
  2466                                 rightNode = clazz(false);
  2467                             }
  2468                             ch = peek();
  2469                         }
  2470                         if (rightNode != null)
  2471                             node = rightNode;
  2472                         if (prev == null) {
  2473                             if (rightNode == null)
  2474                                 throw error("Bad class syntax");
  2475                             else
  2476                                 prev = rightNode;
  2477                         } else {
  2478                             prev = intersection(prev, node);
  2479                         }
  2480                     } else {
  2481                         // treat as a literal &
  2482                         unread();
  2483                         break;
  2484                     }
  2485                     continue;
  2486                 case 0:
  2487                     firstInClass = false;
  2488                     if (cursor >= patternLength)
  2489                         throw error("Unclosed character class");
  2490                     break;
  2491                 case ']':
  2492                     firstInClass = false;
  2493                     if (prev != null) {
  2494                         if (consume)
  2495                             next();
  2496                         return prev;
  2497                     }
  2498                     break;
  2499                 default:
  2500                     firstInClass = false;
  2501                     break;
  2502             }
  2503             node = range(bits);
  2504             if (include) {
  2505                 if (prev == null) {
  2506                     prev = node;
  2507                 } else {
  2508                     if (prev != node)
  2509                         prev = union(prev, node);
  2510                 }
  2511             } else {
  2512                 if (prev == null) {
  2513                     prev = node.complement();
  2514                 } else {
  2515                     if (prev != node)
  2516                         prev = setDifference(prev, node);
  2517                 }
  2518             }
  2519             ch = peek();
  2520         }
  2521     }
  2522 
  2523     private CharProperty bitsOrSingle(BitClass bits, int ch) {
  2524         /* Bits can only handle codepoints in [u+0000-u+00ff] range.
  2525            Use "single" node instead of bits when dealing with unicode
  2526            case folding for codepoints listed below.
  2527            (1)Uppercase out of range: u+00ff, u+00b5
  2528               toUpperCase(u+00ff) -> u+0178
  2529               toUpperCase(u+00b5) -> u+039c
  2530            (2)LatinSmallLetterLongS u+17f
  2531               toUpperCase(u+017f) -> u+0053
  2532            (3)LatinSmallLetterDotlessI u+131
  2533               toUpperCase(u+0131) -> u+0049
  2534            (4)LatinCapitalLetterIWithDotAbove u+0130
  2535               toLowerCase(u+0130) -> u+0069
  2536            (5)KelvinSign u+212a
  2537               toLowerCase(u+212a) ==> u+006B
  2538            (6)AngstromSign u+212b
  2539               toLowerCase(u+212b) ==> u+00e5
  2540         */
  2541         int d;
  2542         if (ch < 256 &&
  2543             !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
  2544               (ch == 0xff || ch == 0xb5 ||
  2545                ch == 0x49 || ch == 0x69 ||  //I and i
  2546                ch == 0x53 || ch == 0x73 ||  //S and s
  2547                ch == 0x4b || ch == 0x6b ||  //K and k
  2548                ch == 0xc5 || ch == 0xe5)))  //A+ring
  2549             return bits.add(ch, flags());
  2550         return newSingle(ch);
  2551     }
  2552 
  2553     /**
  2554      * Parse a single character or a character range in a character class
  2555      * and return its representative node.
  2556      */
  2557     private CharProperty range(BitClass bits) {
  2558         int ch = peek();
  2559         if (ch == '\\') {
  2560             ch = nextEscaped();
  2561             if (ch == 'p' || ch == 'P') { // A property
  2562                 boolean comp = (ch == 'P');
  2563                 boolean oneLetter = true;
  2564                 // Consume { if present
  2565                 ch = next();
  2566                 if (ch != '{')
  2567                     unread();
  2568                 else
  2569                     oneLetter = false;
  2570                 return family(oneLetter, comp);
  2571             } else { // ordinary escape
  2572                 unread();
  2573                 ch = escape(true, true);
  2574                 if (ch == -1)
  2575                     return (CharProperty) root;
  2576             }
  2577         } else {
  2578             ch = single();
  2579         }
  2580         if (ch >= 0) {
  2581             if (peek() == '-') {
  2582                 int endRange = temp[cursor+1];
  2583                 if (endRange == '[') {
  2584                     return bitsOrSingle(bits, ch);
  2585                 }
  2586                 if (endRange != ']') {
  2587                     next();
  2588                     int m = single();
  2589                     if (m < ch)
  2590                         throw error("Illegal character range");
  2591                     if (has(CASE_INSENSITIVE))
  2592                         return caseInsensitiveRangeFor(ch, m);
  2593                     else
  2594                         return rangeFor(ch, m);
  2595                 }
  2596             }
  2597             return bitsOrSingle(bits, ch);
  2598         }
  2599         throw error("Unexpected character '"+((char)ch)+"'");
  2600     }
  2601 
  2602     private int single() {
  2603         int ch = peek();
  2604         switch (ch) {
  2605         case '\\':
  2606             return escape(true, false);
  2607         default:
  2608             next();
  2609             return ch;
  2610         }
  2611     }
  2612 
  2613     /**
  2614      * Parses a Unicode character family and returns its representative node.
  2615      */
  2616     private CharProperty family(boolean singleLetter,
  2617                                 boolean maybeComplement)
  2618     {
  2619         next();
  2620         String name;
  2621         CharProperty node = null;
  2622 
  2623         if (singleLetter) {
  2624             int c = temp[cursor];
  2625             if (!Character.isSupplementaryCodePoint(c)) {
  2626                 name = String.valueOf((char)c);
  2627             } else {
  2628                 name = new String(temp, cursor, 1);
  2629             }
  2630             read();
  2631         } else {
  2632             int i = cursor;
  2633             mark('}');
  2634             while(read() != '}') {
  2635             }
  2636             mark('\000');
  2637             int j = cursor;
  2638             if (j > patternLength)
  2639                 throw error("Unclosed character family");
  2640             if (i + 1 >= j)
  2641                 throw error("Empty character family");
  2642             name = new String(temp, i, j-i-1);
  2643         }
  2644 
  2645         int i = name.indexOf('=');
  2646         if (i != -1) {
  2647             // property construct \p{name=value}
  2648             String value = name.substring(i + 1);
  2649             name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
  2650     /*        if ("sc".equals(name) || "script".equals(name)) {
  2651                 node = unicodeScriptPropertyFor(value);
  2652             } else if ("blk".equals(name) || "block".equals(name)) {
  2653                 node = unicodeBlockPropertyFor(value);
  2654             } else*/ if ("gc".equals(name) || "general_category".equals(name)) {
  2655                 node = charPropertyNodeFor(value);
  2656             } else {
  2657                 throw error("Unknown Unicode property {name=<" + name + ">, "
  2658                              + "value=<" + value + ">}");
  2659             }
  2660         } else {
  2661             /*if (name.startsWith("In")) {
  2662                 // \p{inBlockName}
  2663                 node = unicodeBlockPropertyFor(name.substring(2));
  2664             } else if (name.startsWith("Is")) {
  2665                 // \p{isGeneralCategory} and \p{isScriptName}
  2666                 name = name.substring(2);
  2667                 UnicodeProp uprop = UnicodeProp.forName(name);
  2668                 if (uprop != null)
  2669                     node = new Utype(uprop);
  2670                 if (node == null)
  2671                     node = CharPropertyNames.charPropertyFor(name);
  2672                 if (node == null)
  2673                     node = unicodeScriptPropertyFor(name);
  2674             } else*/ {
  2675                 if (has(UNICODE_CHARACTER_CLASS)) {
  2676                     UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
  2677                     if (uprop != null)
  2678                         node = new Utype(uprop);
  2679                 }
  2680                 if (node == null)
  2681                     node = charPropertyNodeFor(name);
  2682             }
  2683         }
  2684         if (maybeComplement) {
  2685             if (node instanceof Category /*|| node instanceof Block*/)
  2686                 hasSupplementary = true;
  2687             node = node.complement();
  2688         }
  2689         return node;
  2690     }
  2691 
  2692 
  2693     /**
  2694      * Returns a CharProperty matching all characters belong to
  2695      * a UnicodeScript.
  2696      *
  2697     private CharProperty unicodeScriptPropertyFor(String name) {
  2698         final Character.UnicodeScript script;
  2699         try {
  2700             script = Character.UnicodeScript.forName(name);
  2701         } catch (IllegalArgumentException iae) {
  2702             throw error("Unknown character script name {" + name + "}");
  2703         }
  2704         return new Script(script);
  2705     }
  2706 
  2707     /**
  2708      * Returns a CharProperty matching all characters in a UnicodeBlock.
  2709      *
  2710     private CharProperty unicodeBlockPropertyFor(String name) {
  2711         final Character.UnicodeBlock block;
  2712         try {
  2713             block = Character.UnicodeBlock.forName(name);
  2714         } catch (IllegalArgumentException iae) {
  2715             throw error("Unknown character block name {" + name + "}");
  2716         }
  2717         return new Block(block);
  2718     }
  2719 
  2720     /**
  2721      * Returns a CharProperty matching all characters in a named property.
  2722      */
  2723     private CharProperty charPropertyNodeFor(String name) {
  2724         CharProperty p = CharPropertyNames.charPropertyFor(name);
  2725         if (p == null)
  2726             throw error("Unknown character property name {" + name + "}");
  2727         return p;
  2728     }
  2729 
  2730     /**
  2731      * Parses and returns the name of a "named capturing group", the trailing
  2732      * ">" is consumed after parsing.
  2733      */
  2734     private String groupname(int ch) {
  2735         StringBuilder sb = new StringBuilder();
  2736         sb.append(Character.toChars(ch));
  2737         while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
  2738                ASCII.isDigit(ch)) {
  2739             sb.append(Character.toChars(ch));
  2740         }
  2741         if (sb.length() == 0)
  2742             throw error("named capturing group has 0 length name");
  2743         if (ch != '>')
  2744             throw error("named capturing group is missing trailing '>'");
  2745         return sb.toString();
  2746     }
  2747 
  2748     /**
  2749      * Parses a group and returns the head node of a set of nodes that process
  2750      * the group. Sometimes a double return system is used where the tail is
  2751      * returned in root.
  2752      */
  2753     private Node group0() {
  2754         boolean capturingGroup = false;
  2755         Node head = null;
  2756         Node tail = null;
  2757         int save = flags;
  2758         root = null;
  2759         int ch = next();
  2760         if (ch == '?') {
  2761             ch = skip();
  2762             switch (ch) {
  2763             case ':':   //  (?:xxx) pure group
  2764                 head = createGroup(true);
  2765                 tail = root;
  2766                 head.next = expr(tail);
  2767                 break;
  2768             case '=':   // (?=xxx) and (?!xxx) lookahead
  2769             case '!':
  2770                 head = createGroup(true);
  2771                 tail = root;
  2772                 head.next = expr(tail);
  2773                 if (ch == '=') {
  2774                     head = tail = new Pos(head);
  2775                 } else {
  2776                     head = tail = new Neg(head);
  2777                 }
  2778                 break;
  2779             case '>':   // (?>xxx)  independent group
  2780                 head = createGroup(true);
  2781                 tail = root;
  2782                 head.next = expr(tail);
  2783                 head = tail = new Ques(head, INDEPENDENT);
  2784                 break;
  2785             case '<':   // (?<xxx)  look behind
  2786                 ch = read();
  2787                 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
  2788                     // named captured group
  2789                     String name = groupname(ch);
  2790                     if (namedGroups().containsKey(name))
  2791                         throw error("Named capturing group <" + name
  2792                                     + "> is already defined");
  2793                     capturingGroup = true;
  2794                     head = createGroup(false);
  2795                     tail = root;
  2796                     namedGroups().put(name, capturingGroupCount-1);
  2797                     head.next = expr(tail);
  2798                     break;
  2799                 }
  2800                 int start = cursor;
  2801                 head = createGroup(true);
  2802                 tail = root;
  2803                 head.next = expr(tail);
  2804                 tail.next = lookbehindEnd;
  2805                 TreeInfo info = new TreeInfo();
  2806                 head.study(info);
  2807                 if (info.maxValid == false) {
  2808                     throw error("Look-behind group does not have "
  2809                                 + "an obvious maximum length");
  2810                 }
  2811                 boolean hasSupplementary = findSupplementary(start, patternLength);
  2812                 if (ch == '=') {
  2813                     head = tail = (hasSupplementary ?
  2814                                    new BehindS(head, info.maxLength,
  2815                                                info.minLength) :
  2816                                    new Behind(head, info.maxLength,
  2817                                               info.minLength));
  2818                 } else if (ch == '!') {
  2819                     head = tail = (hasSupplementary ?
  2820                                    new NotBehindS(head, info.maxLength,
  2821                                                   info.minLength) :
  2822                                    new NotBehind(head, info.maxLength,
  2823                                                  info.minLength));
  2824                 } else {
  2825                     throw error("Unknown look-behind group");
  2826                 }
  2827                 break;
  2828             case '$':
  2829             case '@':
  2830                 throw error("Unknown group type");
  2831             default:    // (?xxx:) inlined match flags
  2832                 unread();
  2833                 addFlag();
  2834                 ch = read();
  2835                 if (ch == ')') {
  2836                     return null;    // Inline modifier only
  2837                 }
  2838                 if (ch != ':') {
  2839                     throw error("Unknown inline modifier");
  2840                 }
  2841                 head = createGroup(true);
  2842                 tail = root;
  2843                 head.next = expr(tail);
  2844                 break;
  2845             }
  2846         } else { // (xxx) a regular group
  2847             capturingGroup = true;
  2848             head = createGroup(false);
  2849             tail = root;
  2850             head.next = expr(tail);
  2851         }
  2852 
  2853         accept(')', "Unclosed group");
  2854         flags = save;
  2855 
  2856         // Check for quantifiers
  2857         Node node = closure(head);
  2858         if (node == head) { // No closure
  2859             root = tail;
  2860             return node;    // Dual return
  2861         }
  2862         if (head == tail) { // Zero length assertion
  2863             root = node;
  2864             return node;    // Dual return
  2865         }
  2866 
  2867         if (node instanceof Ques) {
  2868             Ques ques = (Ques) node;
  2869             if (ques.type == POSSESSIVE) {
  2870                 root = node;
  2871                 return node;
  2872             }
  2873             tail.next = new BranchConn();
  2874             tail = tail.next;
  2875             if (ques.type == GREEDY) {
  2876                 head = new Branch(head, null, tail);
  2877             } else { // Reluctant quantifier
  2878                 head = new Branch(null, head, tail);
  2879             }
  2880             root = tail;
  2881             return head;
  2882         } else if (node instanceof Curly) {
  2883             Curly curly = (Curly) node;
  2884             if (curly.type == POSSESSIVE) {
  2885                 root = node;
  2886                 return node;
  2887             }
  2888             // Discover if the group is deterministic
  2889             TreeInfo info = new TreeInfo();
  2890             if (head.study(info)) { // Deterministic
  2891                 GroupTail temp = (GroupTail) tail;
  2892                 head = root = new GroupCurly(head.next, curly.cmin,
  2893                                    curly.cmax, curly.type,
  2894                                    ((GroupTail)tail).localIndex,
  2895                                    ((GroupTail)tail).groupIndex,
  2896                                              capturingGroup);
  2897                 return head;
  2898             } else { // Non-deterministic
  2899                 int temp = ((GroupHead) head).localIndex;
  2900                 Loop loop;
  2901                 if (curly.type == GREEDY)
  2902                     loop = new Loop(this.localCount, temp);
  2903                 else  // Reluctant Curly
  2904                     loop = new LazyLoop(this.localCount, temp);
  2905                 Prolog prolog = new Prolog(loop);
  2906                 this.localCount += 1;
  2907                 loop.cmin = curly.cmin;
  2908                 loop.cmax = curly.cmax;
  2909                 loop.body = head;
  2910                 tail.next = loop;
  2911                 root = loop;
  2912                 return prolog; // Dual return
  2913             }
  2914         }
  2915         throw error("Internal logic error");
  2916     }
  2917 
  2918     /**
  2919      * Create group head and tail nodes using double return. If the group is
  2920      * created with anonymous true then it is a pure group and should not
  2921      * affect group counting.
  2922      */
  2923     private Node createGroup(boolean anonymous) {
  2924         int localIndex = localCount++;
  2925         int groupIndex = 0;
  2926         if (!anonymous)
  2927             groupIndex = capturingGroupCount++;
  2928         GroupHead head = new GroupHead(localIndex);
  2929         root = new GroupTail(localIndex, groupIndex);
  2930         if (!anonymous && groupIndex < 10)
  2931             groupNodes[groupIndex] = head;
  2932         return head;
  2933     }
  2934 
  2935     /**
  2936      * Parses inlined match flags and set them appropriately.
  2937      */
  2938     private void addFlag() {
  2939         int ch = peek();
  2940         for (;;) {
  2941             switch (ch) {
  2942             case 'i':
  2943                 flags |= CASE_INSENSITIVE;
  2944                 break;
  2945             case 'm':
  2946                 flags |= MULTILINE;
  2947                 break;
  2948             case 's':
  2949                 flags |= DOTALL;
  2950                 break;
  2951             case 'd':
  2952                 flags |= UNIX_LINES;
  2953                 break;
  2954             case 'u':
  2955                 flags |= UNICODE_CASE;
  2956                 break;
  2957             case 'c':
  2958                 flags |= CANON_EQ;
  2959                 break;
  2960             case 'x':
  2961                 flags |= COMMENTS;
  2962                 break;
  2963             case 'U':
  2964                 flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE);
  2965                 break;
  2966             case '-': // subFlag then fall through
  2967                 ch = next();
  2968                 subFlag();
  2969             default:
  2970                 return;
  2971             }
  2972             ch = next();
  2973         }
  2974     }
  2975 
  2976     /**
  2977      * Parses the second part of inlined match flags and turns off
  2978      * flags appropriately.
  2979      */
  2980     private void subFlag() {
  2981         int ch = peek();
  2982         for (;;) {
  2983             switch (ch) {
  2984             case 'i':
  2985                 flags &= ~CASE_INSENSITIVE;
  2986                 break;
  2987             case 'm':
  2988                 flags &= ~MULTILINE;
  2989                 break;
  2990             case 's':
  2991                 flags &= ~DOTALL;
  2992                 break;
  2993             case 'd':
  2994                 flags &= ~UNIX_LINES;
  2995                 break;
  2996             case 'u':
  2997                 flags &= ~UNICODE_CASE;
  2998                 break;
  2999             case 'c':
  3000                 flags &= ~CANON_EQ;
  3001                 break;
  3002             case 'x':
  3003                 flags &= ~COMMENTS;
  3004                 break;
  3005             case 'U':
  3006                 flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
  3007             default:
  3008                 return;
  3009             }
  3010             ch = next();
  3011         }
  3012     }
  3013 
  3014     static final int MAX_REPS   = 0x7FFFFFFF;
  3015 
  3016     static final int GREEDY     = 0;
  3017 
  3018     static final int LAZY       = 1;
  3019 
  3020     static final int POSSESSIVE = 2;
  3021 
  3022     static final int INDEPENDENT = 3;
  3023 
  3024     /**
  3025      * Processes repetition. If the next character peeked is a quantifier
  3026      * then new nodes must be appended to handle the repetition.
  3027      * Prev could be a single or a group, so it could be a chain of nodes.
  3028      */
  3029     private Node closure(Node prev) {
  3030         Node atom;
  3031         int ch = peek();
  3032         switch (ch) {
  3033         case '?':
  3034             ch = next();
  3035             if (ch == '?') {
  3036                 next();
  3037                 return new Ques(prev, LAZY);
  3038             } else if (ch == '+') {
  3039                 next();
  3040                 return new Ques(prev, POSSESSIVE);
  3041             }
  3042             return new Ques(prev, GREEDY);
  3043         case '*':
  3044             ch = next();
  3045             if (ch == '?') {
  3046                 next();
  3047                 return new Curly(prev, 0, MAX_REPS, LAZY);
  3048             } else if (ch == '+') {
  3049                 next();
  3050                 return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
  3051             }
  3052             return new Curly(prev, 0, MAX_REPS, GREEDY);
  3053         case '+':
  3054             ch = next();
  3055             if (ch == '?') {
  3056                 next();
  3057                 return new Curly(prev, 1, MAX_REPS, LAZY);
  3058             } else if (ch == '+') {
  3059                 next();
  3060                 return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
  3061             }
  3062             return new Curly(prev, 1, MAX_REPS, GREEDY);
  3063         case '{':
  3064             ch = temp[cursor+1];
  3065             if (ASCII.isDigit(ch)) {
  3066                 skip();
  3067                 int cmin = 0;
  3068                 do {
  3069                     cmin = cmin * 10 + (ch - '0');
  3070                 } while (ASCII.isDigit(ch = read()));
  3071                 int cmax = cmin;
  3072                 if (ch == ',') {
  3073                     ch = read();
  3074                     cmax = MAX_REPS;
  3075                     if (ch != '}') {
  3076                         cmax = 0;
  3077                         while (ASCII.isDigit(ch)) {
  3078                             cmax = cmax * 10 + (ch - '0');
  3079                             ch = read();
  3080                         }
  3081                     }
  3082                 }
  3083                 if (ch != '}')
  3084                     throw error("Unclosed counted closure");
  3085                 if (((cmin) | (cmax) | (cmax - cmin)) < 0)
  3086                     throw error("Illegal repetition range");
  3087                 Curly curly;
  3088                 ch = peek();
  3089                 if (ch == '?') {
  3090                     next();
  3091                     curly = new Curly(prev, cmin, cmax, LAZY);
  3092                 } else if (ch == '+') {
  3093                     next();
  3094                     curly = new Curly(prev, cmin, cmax, POSSESSIVE);
  3095                 } else {
  3096                     curly = new Curly(prev, cmin, cmax, GREEDY);
  3097                 }
  3098                 return curly;
  3099             } else {
  3100                 throw error("Illegal repetition");
  3101             }
  3102         default:
  3103             return prev;
  3104         }
  3105     }
  3106 
  3107     /**
  3108      *  Utility method for parsing control escape sequences.
  3109      */
  3110     private int c() {
  3111         if (cursor < patternLength) {
  3112             return read() ^ 64;
  3113         }
  3114         throw error("Illegal control escape sequence");
  3115     }
  3116 
  3117     /**
  3118      *  Utility method for parsing octal escape sequences.
  3119      */
  3120     private int o() {
  3121         int n = read();
  3122         if (((n-'0')|('7'-n)) >= 0) {
  3123             int m = read();
  3124             if (((m-'0')|('7'-m)) >= 0) {
  3125                 int o = read();
  3126                 if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) {
  3127                     return (n - '0') * 64 + (m - '0') * 8 + (o - '0');
  3128                 }
  3129                 unread();
  3130                 return (n - '0') * 8 + (m - '0');
  3131             }
  3132             unread();
  3133             return (n - '0');
  3134         }
  3135         throw error("Illegal octal escape sequence");
  3136     }
  3137 
  3138     /**
  3139      *  Utility method for parsing hexadecimal escape sequences.
  3140      */
  3141     private int x() {
  3142         int n = read();
  3143         if (ASCII.isHexDigit(n)) {
  3144             int m = read();
  3145             if (ASCII.isHexDigit(m)) {
  3146                 return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
  3147             }
  3148         } else if (n == '{' && ASCII.isHexDigit(peek())) {
  3149             int ch = 0;
  3150             while (ASCII.isHexDigit(n = read())) {
  3151                 ch = (ch << 4) + ASCII.toDigit(n);
  3152                 if (ch > Character.MAX_CODE_POINT)
  3153                     throw error("Hexadecimal codepoint is too big");
  3154             }
  3155             if (n != '}')
  3156                 throw error("Unclosed hexadecimal escape sequence");
  3157             return ch;
  3158         }
  3159         throw error("Illegal hexadecimal escape sequence");
  3160     }
  3161 
  3162     /**
  3163      *  Utility method for parsing unicode escape sequences.
  3164      */
  3165     private int cursor() {
  3166         return cursor;
  3167     }
  3168 
  3169     private void setcursor(int pos) {
  3170         cursor = pos;
  3171     }
  3172 
  3173     private int uxxxx() {
  3174         int n = 0;
  3175         for (int i = 0; i < 4; i++) {
  3176             int ch = read();
  3177             if (!ASCII.isHexDigit(ch)) {
  3178                 throw error("Illegal Unicode escape sequence");
  3179             }
  3180             n = n * 16 + ASCII.toDigit(ch);
  3181         }
  3182         return n;
  3183     }
  3184 
  3185     private int u() {
  3186         int n = uxxxx();
  3187         if (Character.isHighSurrogate((char)n)) {
  3188             int cur = cursor();
  3189             if (read() == '\\' && read() == 'u') {
  3190                 int n2 = uxxxx();
  3191                 if (Character.isLowSurrogate((char)n2))
  3192                     return Character.toCodePoint((char)n, (char)n2);
  3193             }
  3194             setcursor(cur);
  3195         }
  3196         return n;
  3197     }
  3198 
  3199     //
  3200     // Utility methods for code point support
  3201     //
  3202 
  3203     private static final int countChars(CharSequence seq, int index,
  3204                                         int lengthInCodePoints) {
  3205         // optimization
  3206         if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
  3207             assert (index >= 0 && index < seq.length());
  3208             return 1;
  3209         }
  3210         int length = seq.length();
  3211         int x = index;
  3212         if (lengthInCodePoints >= 0) {
  3213             assert (index >= 0 && index < length);
  3214             for (int i = 0; x < length && i < lengthInCodePoints; i++) {
  3215                 if (Character.isHighSurrogate(seq.charAt(x++))) {
  3216                     if (x < length && Character.isLowSurrogate(seq.charAt(x))) {
  3217                         x++;
  3218                     }
  3219                 }
  3220             }
  3221             return x - index;
  3222         }
  3223 
  3224         assert (index >= 0 && index <= length);
  3225         if (index == 0) {
  3226             return 0;
  3227         }
  3228         int len = -lengthInCodePoints;
  3229         for (int i = 0; x > 0 && i < len; i++) {
  3230             if (Character.isLowSurrogate(seq.charAt(--x))) {
  3231                 if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) {
  3232                     x--;
  3233                 }
  3234             }
  3235         }
  3236         return index - x;
  3237     }
  3238 
  3239     private static final int countCodePoints(CharSequence seq) {
  3240         int length = seq.length();
  3241         int n = 0;
  3242         for (int i = 0; i < length; ) {
  3243             n++;
  3244             if (Character.isHighSurrogate(seq.charAt(i++))) {
  3245                 if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
  3246                     i++;
  3247                 }
  3248             }
  3249         }
  3250         return n;
  3251     }
  3252 
  3253     /**
  3254      *  Creates a bit vector for matching Latin-1 values. A normal BitClass
  3255      *  never matches values above Latin-1, and a complemented BitClass always
  3256      *  matches values above Latin-1.
  3257      */
  3258     private static final class BitClass extends BmpCharProperty {
  3259         final boolean[] bits;
  3260         BitClass() { bits = new boolean[256]; }
  3261         private BitClass(boolean[] bits) { this.bits = bits; }
  3262         BitClass add(int c, int flags) {
  3263             assert c >= 0 && c <= 255;
  3264             if ((flags & CASE_INSENSITIVE) != 0) {
  3265                 if (ASCII.isAscii(c)) {
  3266                     bits[ASCII.toUpper(c)] = true;
  3267                     bits[ASCII.toLower(c)] = true;
  3268                 } else if ((flags & UNICODE_CASE) != 0) {
  3269                     bits[Character.toLowerCase(c)] = true;
  3270                     bits[Character.toUpperCase(c)] = true;
  3271                 }
  3272             }
  3273             bits[c] = true;
  3274             return this;
  3275         }
  3276         boolean isSatisfiedBy(int ch) {
  3277             return ch < 256 && bits[ch];
  3278         }
  3279     }
  3280 
  3281     /**
  3282      *  Returns a suitably optimized, single character matcher.
  3283      */
  3284     private CharProperty newSingle(final int ch) {
  3285         if (has(CASE_INSENSITIVE)) {
  3286             int lower, upper;
  3287             if (has(UNICODE_CASE)) {
  3288                 upper = Character.toUpperCase(ch);
  3289                 lower = Character.toLowerCase(upper);
  3290                 if (upper != lower)
  3291                     return new SingleU(lower);
  3292             } else if (ASCII.isAscii(ch)) {
  3293                 lower = ASCII.toLower(ch);
  3294                 upper = ASCII.toUpper(ch);
  3295                 if (lower != upper)
  3296                     return new SingleI(lower, upper);
  3297             }
  3298         }
  3299         if (isSupplementary(ch))
  3300             return new SingleS(ch);    // Match a given Unicode character
  3301         return new Single(ch);         // Match a given BMP character
  3302     }
  3303 
  3304     /**
  3305      *  Utility method for creating a string slice matcher.
  3306      */
  3307     private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
  3308         int[] tmp = new int[count];
  3309         if (has(CASE_INSENSITIVE)) {
  3310             if (has(UNICODE_CASE)) {
  3311                 for (int i = 0; i < count; i++) {
  3312                     tmp[i] = Character.toLowerCase(
  3313                                  Character.toUpperCase(buf[i]));
  3314                 }
  3315                 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
  3316             }
  3317             for (int i = 0; i < count; i++) {
  3318                 tmp[i] = ASCII.toLower(buf[i]);
  3319             }
  3320             return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
  3321         }
  3322         for (int i = 0; i < count; i++) {
  3323             tmp[i] = buf[i];
  3324         }
  3325         return hasSupplementary ? new SliceS(tmp) : new Slice(tmp);
  3326     }
  3327 
  3328     /**
  3329      * The following classes are the building components of the object
  3330      * tree that represents a compiled regular expression. The object tree
  3331      * is made of individual elements that handle constructs in the Pattern.
  3332      * Each type of object knows how to match its equivalent construct with
  3333      * the match() method.
  3334      */
  3335 
  3336     /**
  3337      * Base class for all node classes. Subclasses should override the match()
  3338      * method as appropriate. This class is an accepting node, so its match()
  3339      * always returns true.
  3340      */
  3341     static class Node extends Object {
  3342         Node next;
  3343         Node() {
  3344             next = Pattern.accept;
  3345         }
  3346         /**
  3347          * This method implements the classic accept node.
  3348          */
  3349         boolean match(Matcher matcher, int i, CharSequence seq) {
  3350             matcher.last = i;
  3351             matcher.groups[0] = matcher.first;
  3352             matcher.groups[1] = matcher.last;
  3353             return true;
  3354         }
  3355         /**
  3356          * This method is good for all zero length assertions.
  3357          */
  3358         boolean study(TreeInfo info) {
  3359             if (next != null) {
  3360                 return next.study(info);
  3361             } else {
  3362                 return info.deterministic;
  3363             }
  3364         }
  3365     }
  3366 
  3367     static class LastNode extends Node {
  3368         /**
  3369          * This method implements the classic accept node with
  3370          * the addition of a check to see if the match occurred
  3371          * using all of the input.
  3372          */
  3373         boolean match(Matcher matcher, int i, CharSequence seq) {
  3374             if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to)
  3375                 return false;
  3376             matcher.last = i;
  3377             matcher.groups[0] = matcher.first;
  3378             matcher.groups[1] = matcher.last;
  3379             return true;
  3380         }
  3381     }
  3382 
  3383     /**
  3384      * Used for REs that can start anywhere within the input string.
  3385      * This basically tries to match repeatedly at each spot in the
  3386      * input string, moving forward after each try. An anchored search
  3387      * or a BnM will bypass this node completely.
  3388      */
  3389     static class Start extends Node {
  3390         int minLength;
  3391         Start(Node node) {
  3392             this.next = node;
  3393             TreeInfo info = new TreeInfo();
  3394             next.study(info);
  3395             minLength = info.minLength;
  3396         }
  3397         boolean match(Matcher matcher, int i, CharSequence seq) {
  3398             if (i > matcher.to - minLength) {
  3399                 matcher.hitEnd = true;
  3400                 return false;
  3401             }
  3402             int guard = matcher.to - minLength;
  3403             for (; i <= guard; i++) {
  3404                 if (next.match(matcher, i, seq)) {
  3405                     matcher.first = i;
  3406                     matcher.groups[0] = matcher.first;
  3407                     matcher.groups[1] = matcher.last;
  3408                     return true;
  3409                 }
  3410             }
  3411             matcher.hitEnd = true;
  3412             return false;
  3413         }
  3414         boolean study(TreeInfo info) {
  3415             next.study(info);
  3416             info.maxValid = false;
  3417             info.deterministic = false;
  3418             return false;
  3419         }
  3420     }
  3421 
  3422     /*
  3423      * StartS supports supplementary characters, including unpaired surrogates.
  3424      */
  3425     static final class StartS extends Start {
  3426         StartS(Node node) {
  3427             super(node);
  3428         }
  3429         boolean match(Matcher matcher, int i, CharSequence seq) {
  3430             if (i > matcher.to - minLength) {
  3431                 matcher.hitEnd = true;
  3432                 return false;
  3433             }
  3434             int guard = matcher.to - minLength;
  3435             while (i <= guard) {
  3436                 //if ((ret = next.match(matcher, i, seq)) || i == guard)
  3437                 if (next.match(matcher, i, seq)) {
  3438                     matcher.first = i;
  3439                     matcher.groups[0] = matcher.first;
  3440                     matcher.groups[1] = matcher.last;
  3441                     return true;
  3442                 }
  3443                 if (i == guard)
  3444                     break;
  3445                 // Optimization to move to the next character. This is
  3446                 // faster than countChars(seq, i, 1).
  3447                 if (Character.isHighSurrogate(seq.charAt(i++))) {
  3448                     if (i < seq.length() &&
  3449                         Character.isLowSurrogate(seq.charAt(i))) {
  3450                         i++;
  3451                     }
  3452                 }
  3453             }
  3454             matcher.hitEnd = true;
  3455             return false;
  3456         }
  3457     }
  3458 
  3459     /**
  3460      * Node to anchor at the beginning of input. This object implements the
  3461      * match for a \A sequence, and the caret anchor will use this if not in
  3462      * multiline mode.
  3463      */
  3464     static final class Begin extends Node {
  3465         boolean match(Matcher matcher, int i, CharSequence seq) {
  3466             int fromIndex = (matcher.anchoringBounds) ?
  3467                 matcher.from : 0;
  3468             if (i == fromIndex && next.match(matcher, i, seq)) {
  3469                 matcher.first = i;
  3470                 matcher.groups[0] = i;
  3471                 matcher.groups[1] = matcher.last;
  3472                 return true;
  3473             } else {
  3474                 return false;
  3475             }
  3476         }
  3477     }
  3478 
  3479     /**
  3480      * Node to anchor at the end of input. This is the absolute end, so this
  3481      * should not match at the last newline before the end as $ will.
  3482      */
  3483     static final class End extends Node {
  3484         boolean match(Matcher matcher, int i, CharSequence seq) {
  3485             int endIndex = (matcher.anchoringBounds) ?
  3486                 matcher.to : matcher.getTextLength();
  3487             if (i == endIndex) {
  3488                 matcher.hitEnd = true;
  3489                 return next.match(matcher, i, seq);
  3490             }
  3491             return false;
  3492         }
  3493     }
  3494 
  3495     /**
  3496      * Node to anchor at the beginning of a line. This is essentially the
  3497      * object to match for the multiline ^.
  3498      */
  3499     static final class Caret extends Node {
  3500         boolean match(Matcher matcher, int i, CharSequence seq) {
  3501             int startIndex = matcher.from;
  3502             int endIndex = matcher.to;
  3503             if (!matcher.anchoringBounds) {
  3504                 startIndex = 0;
  3505                 endIndex = matcher.getTextLength();
  3506             }
  3507             // Perl does not match ^ at end of input even after newline
  3508             if (i == endIndex) {
  3509                 matcher.hitEnd = true;
  3510                 return false;
  3511             }
  3512             if (i > startIndex) {
  3513                 char ch = seq.charAt(i-1);
  3514                 if (ch != '\n' && ch != '\r'
  3515                     && (ch|1) != '\u2029'
  3516                     && ch != '\u0085' ) {
  3517                     return false;
  3518                 }
  3519                 // Should treat /r/n as one newline
  3520                 if (ch == '\r' && seq.charAt(i) == '\n')
  3521                     return false;
  3522             }
  3523             return next.match(matcher, i, seq);
  3524         }
  3525     }
  3526 
  3527     /**
  3528      * Node to anchor at the beginning of a line when in unixdot mode.
  3529      */
  3530     static final class UnixCaret extends Node {
  3531         boolean match(Matcher matcher, int i, CharSequence seq) {
  3532             int startIndex = matcher.from;
  3533             int endIndex = matcher.to;
  3534             if (!matcher.anchoringBounds) {
  3535                 startIndex = 0;
  3536                 endIndex = matcher.getTextLength();
  3537             }
  3538             // Perl does not match ^ at end of input even after newline
  3539             if (i == endIndex) {
  3540                 matcher.hitEnd = true;
  3541                 return false;
  3542             }
  3543             if (i > startIndex) {
  3544                 char ch = seq.charAt(i-1);
  3545                 if (ch != '\n') {
  3546                     return false;
  3547                 }
  3548             }
  3549             return next.match(matcher, i, seq);
  3550         }
  3551     }
  3552 
  3553     /**
  3554      * Node to match the location where the last match ended.
  3555      * This is used for the \G construct.
  3556      */
  3557     static final class LastMatch extends Node {
  3558         boolean match(Matcher matcher, int i, CharSequence seq) {
  3559             if (i != matcher.oldLast)
  3560                 return false;
  3561             return next.match(matcher, i, seq);
  3562         }
  3563     }
  3564 
  3565     /**
  3566      * Node to anchor at the end of a line or the end of input based on the
  3567      * multiline mode.
  3568      *
  3569      * When not in multiline mode, the $ can only match at the very end
  3570      * of the input, unless the input ends in a line terminator in which
  3571      * it matches right before the last line terminator.
  3572      *
  3573      * Note that \r\n is considered an atomic line terminator.
  3574      *
  3575      * Like ^ the $ operator matches at a position, it does not match the
  3576      * line terminators themselves.
  3577      */
  3578     static final class Dollar extends Node {
  3579         boolean multiline;
  3580         Dollar(boolean mul) {
  3581             multiline = mul;
  3582         }
  3583         boolean match(Matcher matcher, int i, CharSequence seq) {
  3584             int endIndex = (matcher.anchoringBounds) ?
  3585                 matcher.to : matcher.getTextLength();
  3586             if (!multiline) {
  3587                 if (i < endIndex - 2)
  3588                     return false;
  3589                 if (i == endIndex - 2) {
  3590                     char ch = seq.charAt(i);
  3591                     if (ch != '\r')
  3592                         return false;
  3593                     ch = seq.charAt(i + 1);
  3594                     if (ch != '\n')
  3595                         return false;
  3596                 }
  3597             }
  3598             // Matches before any line terminator; also matches at the
  3599             // end of input
  3600             // Before line terminator:
  3601             // If multiline, we match here no matter what
  3602             // If not multiline, fall through so that the end
  3603             // is marked as hit; this must be a /r/n or a /n
  3604             // at the very end so the end was hit; more input
  3605             // could make this not match here
  3606             if (i < endIndex) {
  3607                 char ch = seq.charAt(i);
  3608                  if (ch == '\n') {
  3609                      // No match between \r\n
  3610                      if (i > 0 && seq.charAt(i-1) == '\r')
  3611                          return false;
  3612                      if (multiline)
  3613                          return next.match(matcher, i, seq);
  3614                  } else if (ch == '\r' || ch == '\u0085' ||
  3615                             (ch|1) == '\u2029') {
  3616                      if (multiline)
  3617                          return next.match(matcher, i, seq);
  3618                  } else { // No line terminator, no match
  3619                      return false;
  3620                  }
  3621             }
  3622             // Matched at current end so hit end
  3623             matcher.hitEnd = true;
  3624             // If a $ matches because of end of input, then more input
  3625             // could cause it to fail!
  3626             matcher.requireEnd = true;
  3627             return next.match(matcher, i, seq);
  3628         }
  3629         boolean study(TreeInfo info) {
  3630             next.study(info);
  3631             return info.deterministic;
  3632         }
  3633     }
  3634 
  3635     /**
  3636      * Node to anchor at the end of a line or the end of input based on the
  3637      * multiline mode when in unix lines mode.
  3638      */
  3639     static final class UnixDollar extends Node {
  3640         boolean multiline;
  3641         UnixDollar(boolean mul) {
  3642             multiline = mul;
  3643         }
  3644         boolean match(Matcher matcher, int i, CharSequence seq) {
  3645             int endIndex = (matcher.anchoringBounds) ?
  3646                 matcher.to : matcher.getTextLength();
  3647             if (i < endIndex) {
  3648                 char ch = seq.charAt(i);
  3649                 if (ch == '\n') {
  3650                     // If not multiline, then only possible to
  3651                     // match at very end or one before end
  3652                     if (multiline == false && i != endIndex - 1)
  3653                         return false;
  3654                     // If multiline return next.match without setting
  3655                     // matcher.hitEnd
  3656                     if (multiline)
  3657                         return next.match(matcher, i, seq);
  3658                 } else {
  3659                     return false;
  3660                 }
  3661             }
  3662             // Matching because at the end or 1 before the end;
  3663             // more input could change this so set hitEnd
  3664             matcher.hitEnd = true;
  3665             // If a $ matches because of end of input, then more input
  3666             // could cause it to fail!
  3667             matcher.requireEnd = true;
  3668             return next.match(matcher, i, seq);
  3669         }
  3670         boolean study(TreeInfo info) {
  3671             next.study(info);
  3672             return info.deterministic;
  3673         }
  3674     }
  3675 
  3676     /**
  3677      * Abstract node class to match one character satisfying some
  3678      * boolean property.
  3679      */
  3680     private static abstract class CharProperty extends Node {
  3681         abstract boolean isSatisfiedBy(int ch);
  3682         CharProperty complement() {
  3683             return new CharProperty() {
  3684                     boolean isSatisfiedBy(int ch) {
  3685                         return ! CharProperty.this.isSatisfiedBy(ch);}};
  3686         }
  3687         boolean match(Matcher matcher, int i, CharSequence seq) {
  3688             if (i < matcher.to) {
  3689                 int ch = Character.codePointAt(seq, i);
  3690                 return isSatisfiedBy(ch)
  3691                     && next.match(matcher, i+Character.charCount(ch), seq);
  3692             } else {
  3693                 matcher.hitEnd = true;
  3694                 return false;
  3695             }
  3696         }
  3697         boolean study(TreeInfo info) {
  3698             info.minLength++;
  3699             info.maxLength++;
  3700             return next.study(info);
  3701         }
  3702     }
  3703 
  3704     /**
  3705      * Optimized version of CharProperty that works only for
  3706      * properties never satisfied by Supplementary characters.
  3707      */
  3708     private static abstract class BmpCharProperty extends CharProperty {
  3709         boolean match(Matcher matcher, int i, CharSequence seq) {
  3710             if (i < matcher.to) {
  3711                 return isSatisfiedBy(seq.charAt(i))
  3712                     && next.match(matcher, i+1, seq);
  3713             } else {
  3714                 matcher.hitEnd = true;
  3715                 return false;
  3716             }
  3717         }
  3718     }
  3719 
  3720     /**
  3721      * Node class that matches a Supplementary Unicode character
  3722      */
  3723     static final class SingleS extends CharProperty {
  3724         final int c;
  3725         SingleS(int c) { this.c = c; }
  3726         boolean isSatisfiedBy(int ch) {
  3727             return ch == c;
  3728         }
  3729     }
  3730 
  3731     /**
  3732      * Optimization -- matches a given BMP character
  3733      */
  3734     static final class Single extends BmpCharProperty {
  3735         final int c;
  3736         Single(int c) { this.c = c; }
  3737         boolean isSatisfiedBy(int ch) {
  3738             return ch == c;
  3739         }
  3740     }
  3741 
  3742     /**
  3743      * Case insensitive matches a given BMP character
  3744      */
  3745     static final class SingleI extends BmpCharProperty {
  3746         final int lower;
  3747         final int upper;
  3748         SingleI(int lower, int upper) {
  3749             this.lower = lower;
  3750             this.upper = upper;
  3751         }
  3752         boolean isSatisfiedBy(int ch) {
  3753             return ch == lower || ch == upper;
  3754         }
  3755     }
  3756 
  3757     /**
  3758      * Unicode case insensitive matches a given Unicode character
  3759      */
  3760     static final class SingleU extends CharProperty {
  3761         final int lower;
  3762         SingleU(int lower) {
  3763             this.lower = lower;
  3764         }
  3765         boolean isSatisfiedBy(int ch) {
  3766             return lower == ch ||
  3767                 lower == Character.toLowerCase(Character.toUpperCase(ch));
  3768         }
  3769     }
  3770 
  3771 
  3772     /**
  3773      * Node class that matches a Unicode block.
  3774      *
  3775     static final class Block extends CharProperty {
  3776         final Character.UnicodeBlock block;
  3777         Block(Character.UnicodeBlock block) {
  3778             this.block = block;
  3779         }
  3780         boolean isSatisfiedBy(int ch) {
  3781             return block == Character.UnicodeBlock.of(ch);
  3782         }
  3783     }
  3784 
  3785     /**
  3786      * Node class that matches a Unicode script
  3787      *
  3788     static final class Script extends CharProperty {
  3789         final Character.UnicodeScript script;
  3790         Script(Character.UnicodeScript script) {
  3791             this.script = script;
  3792         }
  3793         boolean isSatisfiedBy(int ch) {
  3794             return script == Character.UnicodeScript.of(ch);
  3795         }
  3796     }
  3797 
  3798     /**
  3799      * Node class that matches a Unicode category.
  3800      */
  3801     static final class Category extends CharProperty {
  3802         final int typeMask;
  3803         Category(int typeMask) { this.typeMask = typeMask; }
  3804         boolean isSatisfiedBy(int ch) {
  3805             return (typeMask & (1 << Character.getType(ch))) != 0;
  3806         }
  3807     }
  3808 
  3809     /**
  3810      * Node class that matches a Unicode "type"
  3811      */
  3812     static final class Utype extends CharProperty {
  3813         final UnicodeProp uprop;
  3814         Utype(UnicodeProp uprop) { this.uprop = uprop; }
  3815         boolean isSatisfiedBy(int ch) {
  3816             return uprop.is(ch);
  3817         }
  3818     }
  3819 
  3820 
  3821     /**
  3822      * Node class that matches a POSIX type.
  3823      */
  3824     static final class Ctype extends BmpCharProperty {
  3825         final int ctype;
  3826         Ctype(int ctype) { this.ctype = ctype; }
  3827         boolean isSatisfiedBy(int ch) {
  3828             return ch < 128 && ASCII.isType(ch, ctype);
  3829         }
  3830     }
  3831 
  3832     /**
  3833      * Base class for all Slice nodes
  3834      */
  3835     static class SliceNode extends Node {
  3836         int[] buffer;
  3837         SliceNode(int[] buf) {
  3838             buffer = buf;
  3839         }
  3840         boolean study(TreeInfo info) {
  3841             info.minLength += buffer.length;
  3842             info.maxLength += buffer.length;
  3843             return next.study(info);
  3844         }
  3845     }
  3846 
  3847     /**
  3848      * Node class for a case sensitive/BMP-only sequence of literal
  3849      * characters.
  3850      */
  3851     static final class Slice extends SliceNode {
  3852         Slice(int[] buf) {
  3853             super(buf);
  3854         }
  3855         boolean match(Matcher matcher, int i, CharSequence seq) {
  3856             int[] buf = buffer;
  3857             int len = buf.length;
  3858             for (int j=0; j<len; j++) {
  3859                 if ((i+j) >= matcher.to) {
  3860                     matcher.hitEnd = true;
  3861                     return false;
  3862                 }
  3863                 if (buf[j] != seq.charAt(i+j))
  3864                     return false;
  3865             }
  3866             return next.match(matcher, i+len, seq);
  3867         }
  3868     }
  3869 
  3870     /**
  3871      * Node class for a case_insensitive/BMP-only sequence of literal
  3872      * characters.
  3873      */
  3874     static class SliceI extends SliceNode {
  3875         SliceI(int[] buf) {
  3876             super(buf);
  3877         }
  3878         boolean match(Matcher matcher, int i, CharSequence seq) {
  3879             int[] buf = buffer;
  3880             int len = buf.length;
  3881             for (int j=0; j<len; j++) {
  3882                 if ((i+j) >= matcher.to) {
  3883                     matcher.hitEnd = true;
  3884                     return false;
  3885                 }
  3886                 int c = seq.charAt(i+j);
  3887                 if (buf[j] != c &&
  3888                     buf[j] != ASCII.toLower(c))
  3889                     return false;
  3890             }
  3891             return next.match(matcher, i+len, seq);
  3892         }
  3893     }
  3894 
  3895     /**
  3896      * Node class for a unicode_case_insensitive/BMP-only sequence of
  3897      * literal characters. Uses unicode case folding.
  3898      */
  3899     static final class SliceU extends SliceNode {
  3900         SliceU(int[] buf) {
  3901             super(buf);
  3902         }
  3903         boolean match(Matcher matcher, int i, CharSequence seq) {
  3904             int[] buf = buffer;
  3905             int len = buf.length;
  3906             for (int j=0; j<len; j++) {
  3907                 if ((i+j) >= matcher.to) {
  3908                     matcher.hitEnd = true;
  3909                     return false;
  3910                 }
  3911                 int c = seq.charAt(i+j);
  3912                 if (buf[j] != c &&
  3913                     buf[j] != Character.toLowerCase(Character.toUpperCase(c)))
  3914                     return false;
  3915             }
  3916             return next.match(matcher, i+len, seq);
  3917         }
  3918     }
  3919 
  3920     /**
  3921      * Node class for a case sensitive sequence of literal characters
  3922      * including supplementary characters.
  3923      */
  3924     static final class SliceS extends SliceNode {
  3925         SliceS(int[] buf) {
  3926             super(buf);
  3927         }
  3928         boolean match(Matcher matcher, int i, CharSequence seq) {
  3929             int[] buf = buffer;
  3930             int x = i;
  3931             for (int j = 0; j < buf.length; j++) {
  3932                 if (x >= matcher.to) {
  3933                     matcher.hitEnd = true;
  3934                     return false;
  3935                 }
  3936                 int c = Character.codePointAt(seq, x);
  3937                 if (buf[j] != c)
  3938                     return false;
  3939                 x += Character.charCount(c);
  3940                 if (x > matcher.to) {
  3941                     matcher.hitEnd = true;
  3942                     return false;
  3943                 }
  3944             }
  3945             return next.match(matcher, x, seq);
  3946         }
  3947     }
  3948 
  3949     /**
  3950      * Node class for a case insensitive sequence of literal characters
  3951      * including supplementary characters.
  3952      */
  3953     static class SliceIS extends SliceNode {
  3954         SliceIS(int[] buf) {
  3955             super(buf);
  3956         }
  3957         int toLower(int c) {
  3958             return ASCII.toLower(c);
  3959         }
  3960         boolean match(Matcher matcher, int i, CharSequence seq) {
  3961             int[] buf = buffer;
  3962             int x = i;
  3963             for (int j = 0; j < buf.length; j++) {
  3964                 if (x >= matcher.to) {
  3965                     matcher.hitEnd = true;
  3966                     return false;
  3967                 }
  3968                 int c = Character.codePointAt(seq, x);
  3969                 if (buf[j] != c && buf[j] != toLower(c))
  3970                     return false;
  3971                 x += Character.charCount(c);
  3972                 if (x > matcher.to) {
  3973                     matcher.hitEnd = true;
  3974                     return false;
  3975                 }
  3976             }
  3977             return next.match(matcher, x, seq);
  3978         }
  3979     }
  3980 
  3981     /**
  3982      * Node class for a case insensitive sequence of literal characters.
  3983      * Uses unicode case folding.
  3984      */
  3985     static final class SliceUS extends SliceIS {
  3986         SliceUS(int[] buf) {
  3987             super(buf);
  3988         }
  3989         int toLower(int c) {
  3990             return Character.toLowerCase(Character.toUpperCase(c));
  3991         }
  3992     }
  3993 
  3994     private static boolean inRange(int lower, int ch, int upper) {
  3995         return lower <= ch && ch <= upper;
  3996     }
  3997 
  3998     /**
  3999      * Returns node for matching characters within an explicit value range.
  4000      */
  4001     private static CharProperty rangeFor(final int lower,
  4002                                          final int upper) {
  4003         return new CharProperty() {
  4004                 boolean isSatisfiedBy(int ch) {
  4005                     return inRange(lower, ch, upper);}};
  4006     }
  4007 
  4008     /**
  4009      * Returns node for matching characters within an explicit value
  4010      * range in a case insensitive manner.
  4011      */
  4012     private CharProperty caseInsensitiveRangeFor(final int lower,
  4013                                                  final int upper) {
  4014         if (has(UNICODE_CASE))
  4015             return new CharProperty() {
  4016                 boolean isSatisfiedBy(int ch) {
  4017                     if (inRange(lower, ch, upper))
  4018                         return true;
  4019                     int up = Character.toUpperCase(ch);
  4020                     return inRange(lower, up, upper) ||
  4021                            inRange(lower, Character.toLowerCase(up), upper);}};
  4022         return new CharProperty() {
  4023             boolean isSatisfiedBy(int ch) {
  4024                 return inRange(lower, ch, upper) ||
  4025                     ASCII.isAscii(ch) &&
  4026                         (inRange(lower, ASCII.toUpper(ch), upper) ||
  4027                          inRange(lower, ASCII.toLower(ch), upper));
  4028             }};
  4029     }
  4030 
  4031     /**
  4032      * Implements the Unicode category ALL and the dot metacharacter when
  4033      * in dotall mode.
  4034      */
  4035     static final class All extends CharProperty {
  4036         boolean isSatisfiedBy(int ch) {
  4037             return true;
  4038         }
  4039     }
  4040 
  4041     /**
  4042      * Node class for the dot metacharacter when dotall is not enabled.
  4043      */
  4044     static final class Dot extends CharProperty {
  4045         boolean isSatisfiedBy(int ch) {
  4046             return (ch != '\n' && ch != '\r'
  4047                     && (ch|1) != '\u2029'
  4048                     && ch != '\u0085');
  4049         }
  4050     }
  4051 
  4052     /**
  4053      * Node class for the dot metacharacter when dotall is not enabled
  4054      * but UNIX_LINES is enabled.
  4055      */
  4056     static final class UnixDot extends CharProperty {
  4057         boolean isSatisfiedBy(int ch) {
  4058             return ch != '\n';
  4059         }
  4060     }
  4061 
  4062     /**
  4063      * The 0 or 1 quantifier. This one class implements all three types.
  4064      */
  4065     static final class Ques extends Node {
  4066         Node atom;
  4067         int type;
  4068         Ques(Node node, int type) {
  4069             this.atom = node;
  4070             this.type = type;
  4071         }
  4072         boolean match(Matcher matcher, int i, CharSequence seq) {
  4073             switch (type) {
  4074             case GREEDY:
  4075                 return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))
  4076                     || next.match(matcher, i, seq);
  4077             case LAZY:
  4078                 return next.match(matcher, i, seq)
  4079                     || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));
  4080             case POSSESSIVE:
  4081                 if (atom.match(matcher, i, seq)) i = matcher.last;
  4082                 return next.match(matcher, i, seq);
  4083             default:
  4084                 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
  4085             }
  4086         }
  4087         boolean study(TreeInfo info) {
  4088             if (type != INDEPENDENT) {
  4089                 int minL = info.minLength;
  4090                 atom.study(info);
  4091                 info.minLength = minL;
  4092                 info.deterministic = false;
  4093                 return next.study(info);
  4094             } else {
  4095                 atom.study(info);
  4096                 return next.study(info);
  4097             }
  4098         }
  4099     }
  4100 
  4101     /**
  4102      * Handles the curly-brace style repetition with a specified minimum and
  4103      * maximum occurrences. The * quantifier is handled as a special case.
  4104      * This class handles the three types.
  4105      */
  4106     static final class Curly extends Node {
  4107         Node atom;
  4108         int type;
  4109         int cmin;
  4110         int cmax;
  4111 
  4112         Curly(Node node, int cmin, int cmax, int type) {
  4113             this.atom = node;
  4114             this.type = type;
  4115             this.cmin = cmin;
  4116             this.cmax = cmax;
  4117         }
  4118         boolean match(Matcher matcher, int i, CharSequence seq) {
  4119             int j;
  4120             for (j = 0; j < cmin; j++) {
  4121                 if (atom.match(matcher, i, seq)) {
  4122                     i = matcher.last;
  4123                     continue;
  4124                 }
  4125                 return false;
  4126             }
  4127             if (type == GREEDY)
  4128                 return match0(matcher, i, j, seq);
  4129             else if (type == LAZY)
  4130                 return match1(matcher, i, j, seq);
  4131             else
  4132                 return match2(matcher, i, j, seq);
  4133         }
  4134         // Greedy match.
  4135         // i is the index to start matching at
  4136         // j is the number of atoms that have matched
  4137         boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
  4138             if (j >= cmax) {
  4139                 // We have matched the maximum... continue with the rest of
  4140                 // the regular expression
  4141                 return next.match(matcher, i, seq);
  4142             }
  4143             int backLimit = j;
  4144             while (atom.match(matcher, i, seq)) {
  4145                 // k is the length of this match
  4146                 int k = matcher.last - i;
  4147                 if (k == 0) // Zero length match
  4148                     break;
  4149                 // Move up index and number matched
  4150                 i = matcher.last;
  4151                 j++;
  4152                 // We are greedy so match as many as we can
  4153                 while (j < cmax) {
  4154                     if (!atom.match(matcher, i, seq))
  4155                         break;
  4156                     if (i + k != matcher.last) {
  4157                         if (match0(matcher, matcher.last, j+1, seq))
  4158                             return true;
  4159                         break;
  4160                     }
  4161                     i += k;
  4162                     j++;
  4163                 }
  4164                 // Handle backing off if match fails
  4165                 while (j >= backLimit) {
  4166                    if (next.match(matcher, i, seq))
  4167                         return true;
  4168                     i -= k;
  4169                     j--;
  4170                 }
  4171                 return false;
  4172             }
  4173             return next.match(matcher, i, seq);
  4174         }
  4175         // Reluctant match. At this point, the minimum has been satisfied.
  4176         // i is the index to start matching at
  4177         // j is the number of atoms that have matched
  4178         boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
  4179             for (;;) {
  4180                 // Try finishing match without consuming any more
  4181                 if (next.match(matcher, i, seq))
  4182                     return true;
  4183                 // At the maximum, no match found
  4184                 if (j >= cmax)
  4185                     return false;
  4186                 // Okay, must try one more atom
  4187                 if (!atom.match(matcher, i, seq))
  4188                     return false;
  4189                 // If we haven't moved forward then must break out
  4190                 if (i == matcher.last)
  4191                     return false;
  4192                 // Move up index and number matched
  4193                 i = matcher.last;
  4194                 j++;
  4195             }
  4196         }
  4197         boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
  4198             for (; j < cmax; j++) {
  4199                 if (!atom.match(matcher, i, seq))
  4200                     break;
  4201                 if (i == matcher.last)
  4202                     break;
  4203                 i = matcher.last;
  4204             }
  4205             return next.match(matcher, i, seq);
  4206         }
  4207         boolean study(TreeInfo info) {
  4208             // Save original info
  4209             int minL = info.minLength;
  4210             int maxL = info.maxLength;
  4211             boolean maxV = info.maxValid;
  4212             boolean detm = info.deterministic;
  4213             info.reset();
  4214 
  4215             atom.study(info);
  4216 
  4217             int temp = info.minLength * cmin + minL;
  4218             if (temp < minL) {
  4219                 temp = 0xFFFFFFF; // arbitrary large number
  4220             }
  4221             info.minLength = temp;
  4222 
  4223             if (maxV & info.maxValid) {
  4224                 temp = info.maxLength * cmax + maxL;
  4225                 info.maxLength = temp;
  4226                 if (temp < maxL) {
  4227                     info.maxValid = false;
  4228                 }
  4229             } else {
  4230                 info.maxValid = false;
  4231             }
  4232 
  4233             if (info.deterministic && cmin == cmax)
  4234                 info.deterministic = detm;
  4235             else
  4236                 info.deterministic = false;
  4237 
  4238             return next.study(info);
  4239         }
  4240     }
  4241 
  4242     /**
  4243      * Handles the curly-brace style repetition with a specified minimum and
  4244      * maximum occurrences in deterministic cases. This is an iterative
  4245      * optimization over the Prolog and Loop system which would handle this
  4246      * in a recursive way. The * quantifier is handled as a special case.
  4247      * If capture is true then this class saves group settings and ensures
  4248      * that groups are unset when backing off of a group match.
  4249      */
  4250     static final class GroupCurly extends Node {
  4251         Node atom;
  4252         int type;
  4253         int cmin;
  4254         int cmax;
  4255         int localIndex;
  4256         int groupIndex;
  4257         boolean capture;
  4258 
  4259         GroupCurly(Node node, int cmin, int cmax, int type, int local,
  4260                    int group, boolean capture) {
  4261             this.atom = node;
  4262             this.type = type;
  4263             this.cmin = cmin;
  4264             this.cmax = cmax;
  4265             this.localIndex = local;
  4266             this.groupIndex = group;
  4267             this.capture = capture;
  4268         }
  4269         boolean match(Matcher matcher, int i, CharSequence seq) {
  4270             int[] groups = matcher.groups;
  4271             int[] locals = matcher.locals;
  4272             int save0 = locals[localIndex];
  4273             int save1 = 0;
  4274             int save2 = 0;
  4275 
  4276             if (capture) {
  4277                 save1 = groups[groupIndex];
  4278                 save2 = groups[groupIndex+1];
  4279             }
  4280 
  4281             // Notify GroupTail there is no need to setup group info
  4282             // because it will be set here
  4283             locals[localIndex] = -1;
  4284 
  4285             boolean ret = true;
  4286             for (int j = 0; j < cmin; j++) {
  4287                 if (atom.match(matcher, i, seq)) {
  4288                     if (capture) {
  4289                         groups[groupIndex] = i;
  4290                         groups[groupIndex+1] = matcher.last;
  4291                     }
  4292                     i = matcher.last;
  4293                 } else {
  4294                     ret = false;
  4295                     break;
  4296                 }
  4297             }
  4298             if (ret) {
  4299                 if (type == GREEDY) {
  4300                     ret = match0(matcher, i, cmin, seq);
  4301                 } else if (type == LAZY) {
  4302                     ret = match1(matcher, i, cmin, seq);
  4303                 } else {
  4304                     ret = match2(matcher, i, cmin, seq);
  4305                 }
  4306             }
  4307             if (!ret) {
  4308                 locals[localIndex] = save0;
  4309                 if (capture) {
  4310                     groups[groupIndex] = save1;
  4311                     groups[groupIndex+1] = save2;
  4312                 }
  4313             }
  4314             return ret;
  4315         }
  4316         // Aggressive group match
  4317         boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
  4318             int[] groups = matcher.groups;
  4319             int save0 = 0;
  4320             int save1 = 0;
  4321             if (capture) {
  4322                 save0 = groups[groupIndex];
  4323                 save1 = groups[groupIndex+1];
  4324             }
  4325             for (;;) {
  4326                 if (j >= cmax)
  4327                     break;
  4328                 if (!atom.match(matcher, i, seq))
  4329                     break;
  4330                 int k = matcher.last - i;
  4331                 if (k <= 0) {
  4332                     if (capture) {
  4333                         groups[groupIndex] = i;
  4334                         groups[groupIndex+1] = i + k;
  4335                     }
  4336                     i = i + k;
  4337                     break;
  4338                 }
  4339                 for (;;) {
  4340                     if (capture) {
  4341                         groups[groupIndex] = i;
  4342                         groups[groupIndex+1] = i + k;
  4343                     }
  4344                     i = i + k;
  4345                     if (++j >= cmax)
  4346                         break;
  4347                     if (!atom.match(matcher, i, seq))
  4348                         break;
  4349                     if (i + k != matcher.last) {
  4350                         if (match0(matcher, i, j, seq))
  4351                             return true;
  4352                         break;
  4353                     }
  4354                 }
  4355                 while (j > cmin) {
  4356                     if (next.match(matcher, i, seq)) {
  4357                         if (capture) {
  4358                             groups[groupIndex+1] = i;
  4359                             groups[groupIndex] = i - k;
  4360                         }
  4361                         i = i - k;
  4362                         return true;
  4363                     }
  4364                     // backing off
  4365                     if (capture) {
  4366                         groups[groupIndex+1] = i;
  4367                         groups[groupIndex] = i - k;
  4368                     }
  4369                     i = i - k;
  4370                     j--;
  4371                 }
  4372                 break;
  4373             }
  4374             if (capture) {
  4375                 groups[groupIndex] = save0;
  4376                 groups[groupIndex+1] = save1;
  4377             }
  4378             return next.match(matcher, i, seq);
  4379         }
  4380         // Reluctant matching
  4381         boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
  4382             for (;;) {
  4383                 if (next.match(matcher, i, seq))
  4384                     return true;
  4385                 if (j >= cmax)
  4386                     return false;
  4387                 if (!atom.match(matcher, i, seq))
  4388                     return false;
  4389                 if (i == matcher.last)
  4390                     return false;
  4391                 if (capture) {
  4392                     matcher.groups[groupIndex] = i;
  4393                     matcher.groups[groupIndex+1] = matcher.last;
  4394                 }
  4395                 i = matcher.last;
  4396                 j++;
  4397             }
  4398         }
  4399         // Possessive matching
  4400         boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
  4401             for (; j < cmax; j++) {
  4402                 if (!atom.match(matcher, i, seq)) {
  4403                     break;
  4404                 }
  4405                 if (capture) {
  4406                     matcher.groups[groupIndex] = i;
  4407                     matcher.groups[groupIndex+1] = matcher.last;
  4408                 }
  4409                 if (i == matcher.last) {
  4410                     break;
  4411                 }
  4412                 i = matcher.last;
  4413             }
  4414             return next.match(matcher, i, seq);
  4415         }
  4416         boolean study(TreeInfo info) {
  4417             // Save original info
  4418             int minL = info.minLength;
  4419             int maxL = info.maxLength;
  4420             boolean maxV = info.maxValid;
  4421             boolean detm = info.deterministic;
  4422             info.reset();
  4423 
  4424             atom.study(info);
  4425 
  4426             int temp = info.minLength * cmin + minL;
  4427             if (temp < minL) {
  4428                 temp = 0xFFFFFFF; // Arbitrary large number
  4429             }
  4430             info.minLength = temp;
  4431 
  4432             if (maxV & info.maxValid) {
  4433                 temp = info.maxLength * cmax + maxL;
  4434                 info.maxLength = temp;
  4435                 if (temp < maxL) {
  4436                     info.maxValid = false;
  4437                 }
  4438             } else {
  4439                 info.maxValid = false;
  4440             }
  4441 
  4442             if (info.deterministic && cmin == cmax) {
  4443                 info.deterministic = detm;
  4444             } else {
  4445                 info.deterministic = false;
  4446             }
  4447 
  4448             return next.study(info);
  4449         }
  4450     }
  4451 
  4452     /**
  4453      * A Guard node at the end of each atom node in a Branch. It
  4454      * serves the purpose of chaining the "match" operation to
  4455      * "next" but not the "study", so we can collect the TreeInfo
  4456      * of each atom node without including the TreeInfo of the
  4457      * "next".
  4458      */
  4459     static final class BranchConn extends Node {
  4460         BranchConn() {};
  4461         boolean match(Matcher matcher, int i, CharSequence seq) {
  4462             return next.match(matcher, i, seq);
  4463         }
  4464         boolean study(TreeInfo info) {
  4465             return info.deterministic;
  4466         }
  4467     }
  4468 
  4469     /**
  4470      * Handles the branching of alternations. Note this is also used for
  4471      * the ? quantifier to branch between the case where it matches once
  4472      * and where it does not occur.
  4473      */
  4474     static final class Branch extends Node {
  4475         Node[] atoms = new Node[2];
  4476         int size = 2;
  4477         Node conn;
  4478         Branch(Node first, Node second, Node branchConn) {
  4479             conn = branchConn;
  4480             atoms[0] = first;
  4481             atoms[1] = second;
  4482         }
  4483 
  4484         void add(Node node) {
  4485             if (size >= atoms.length) {
  4486                 Node[] tmp = new Node[atoms.length*2];
  4487                 System.arraycopy(atoms, 0, tmp, 0, atoms.length);
  4488                 atoms = tmp;
  4489             }
  4490             atoms[size++] = node;
  4491         }
  4492 
  4493         boolean match(Matcher matcher, int i, CharSequence seq) {
  4494             for (int n = 0; n < size; n++) {
  4495                 if (atoms[n] == null) {
  4496                     if (conn.next.match(matcher, i, seq))
  4497                         return true;
  4498                 } else if (atoms[n].match(matcher, i, seq)) {
  4499                     return true;
  4500                 }
  4501             }
  4502             return false;
  4503         }
  4504 
  4505         boolean study(TreeInfo info) {
  4506             int minL = info.minLength;
  4507             int maxL = info.maxLength;
  4508             boolean maxV = info.maxValid;
  4509 
  4510             int minL2 = Integer.MAX_VALUE; //arbitrary large enough num
  4511             int maxL2 = -1;
  4512             for (int n = 0; n < size; n++) {
  4513                 info.reset();
  4514                 if (atoms[n] != null)
  4515                     atoms[n].study(info);
  4516                 minL2 = Math.min(minL2, info.minLength);
  4517                 maxL2 = Math.max(maxL2, info.maxLength);
  4518                 maxV = (maxV & info.maxValid);
  4519             }
  4520 
  4521             minL += minL2;
  4522             maxL += maxL2;
  4523 
  4524             info.reset();
  4525             conn.next.study(info);
  4526 
  4527             info.minLength += minL;
  4528             info.maxLength += maxL;
  4529             info.maxValid &= maxV;
  4530             info.deterministic = false;
  4531             return false;
  4532         }
  4533     }
  4534 
  4535     /**
  4536      * The GroupHead saves the location where the group begins in the locals
  4537      * and restores them when the match is done.
  4538      *
  4539      * The matchRef is used when a reference to this group is accessed later
  4540      * in the expression. The locals will have a negative value in them to
  4541      * indicate that we do not want to unset the group if the reference
  4542      * doesn't match.
  4543      */
  4544     static final class GroupHead extends Node {
  4545         int localIndex;
  4546         GroupHead(int localCount) {
  4547             localIndex = localCount;
  4548         }
  4549         boolean match(Matcher matcher, int i, CharSequence seq) {
  4550             int save = matcher.locals[localIndex];
  4551             matcher.locals[localIndex] = i;
  4552             boolean ret = next.match(matcher, i, seq);
  4553             matcher.locals[localIndex] = save;
  4554             return ret;
  4555         }
  4556         boolean matchRef(Matcher matcher, int i, CharSequence seq) {
  4557             int save = matcher.locals[localIndex];
  4558             matcher.locals[localIndex] = ~i; // HACK
  4559             boolean ret = next.match(matcher, i, seq);
  4560             matcher.locals[localIndex] = save;
  4561             return ret;
  4562         }
  4563     }
  4564 
  4565     /**
  4566      * Recursive reference to a group in the regular expression. It calls
  4567      * matchRef because if the reference fails to match we would not unset
  4568      * the group.
  4569      */
  4570     static final class GroupRef extends Node {
  4571         GroupHead head;
  4572         GroupRef(GroupHead head) {
  4573             this.head = head;
  4574         }
  4575         boolean match(Matcher matcher, int i, CharSequence seq) {
  4576             return head.matchRef(matcher, i, seq)
  4577                 && next.match(matcher, matcher.last, seq);
  4578         }
  4579         boolean study(TreeInfo info) {
  4580             info.maxValid = false;
  4581             info.deterministic = false;
  4582             return next.study(info);
  4583         }
  4584     }
  4585 
  4586     /**
  4587      * The GroupTail handles the setting of group beginning and ending
  4588      * locations when groups are successfully matched. It must also be able to
  4589      * unset groups that have to be backed off of.
  4590      *
  4591      * The GroupTail node is also used when a previous group is referenced,
  4592      * and in that case no group information needs to be set.
  4593      */
  4594     static final class GroupTail extends Node {
  4595         int localIndex;
  4596         int groupIndex;
  4597         GroupTail(int localCount, int groupCount) {
  4598             localIndex = localCount;
  4599             groupIndex = groupCount + groupCount;
  4600         }
  4601         boolean match(Matcher matcher, int i, CharSequence seq) {
  4602             int tmp = matcher.locals[localIndex];
  4603             if (tmp >= 0) { // This is the normal group case.
  4604                 // Save the group so we can unset it if it
  4605                 // backs off of a match.
  4606                 int groupStart = matcher.groups[groupIndex];
  4607                 int groupEnd = matcher.groups[groupIndex+1];
  4608 
  4609                 matcher.groups[groupIndex] = tmp;
  4610                 matcher.groups[groupIndex+1] = i;
  4611                 if (next.match(matcher, i, seq)) {
  4612                     return true;
  4613                 }
  4614                 matcher.groups[groupIndex] = groupStart;
  4615                 matcher.groups[groupIndex+1] = groupEnd;
  4616                 return false;
  4617             } else {
  4618                 // This is a group reference case. We don't need to save any
  4619                 // group info because it isn't really a group.
  4620                 matcher.last = i;
  4621                 return true;
  4622             }
  4623         }
  4624     }
  4625 
  4626     /**
  4627      * This sets up a loop to handle a recursive quantifier structure.
  4628      */
  4629     static final class Prolog extends Node {
  4630         Loop loop;
  4631         Prolog(Loop loop) {
  4632             this.loop = loop;
  4633         }
  4634         boolean match(Matcher matcher, int i, CharSequence seq) {
  4635             return loop.matchInit(matcher, i, seq);
  4636         }
  4637         boolean study(TreeInfo info) {
  4638             return loop.study(info);
  4639         }
  4640     }
  4641 
  4642     /**
  4643      * Handles the repetition count for a greedy Curly. The matchInit
  4644      * is called from the Prolog to save the index of where the group
  4645      * beginning is stored. A zero length group check occurs in the
  4646      * normal match but is skipped in the matchInit.
  4647      */
  4648     static class Loop extends Node {
  4649         Node body;
  4650         int countIndex; // local count index in matcher locals
  4651         int beginIndex; // group beginning index
  4652         int cmin, cmax;
  4653         Loop(int countIndex, int beginIndex) {
  4654             this.countIndex = countIndex;
  4655             this.beginIndex = beginIndex;
  4656         }
  4657         boolean match(Matcher matcher, int i, CharSequence seq) {
  4658             // Avoid infinite loop in zero-length case.
  4659             if (i > matcher.locals[beginIndex]) {
  4660                 int count = matcher.locals[countIndex];
  4661 
  4662                 // This block is for before we reach the minimum
  4663                 // iterations required for the loop to match
  4664                 if (count < cmin) {
  4665                     matcher.locals[countIndex] = count + 1;
  4666                     boolean b = body.match(matcher, i, seq);
  4667                     // If match failed we must backtrack, so
  4668                     // the loop count should NOT be incremented
  4669                     if (!b)
  4670                         matcher.locals[countIndex] = count;
  4671                     // Return success or failure since we are under
  4672                     // minimum
  4673                     return b;
  4674                 }
  4675                 // This block is for after we have the minimum
  4676                 // iterations required for the loop to match
  4677                 if (count < cmax) {
  4678                     matcher.locals[countIndex] = count + 1;
  4679                     boolean b = body.match(matcher, i, seq);
  4680                     // If match failed we must backtrack, so
  4681                     // the loop count should NOT be incremented
  4682                     if (!b)
  4683                         matcher.locals[countIndex] = count;
  4684                     else
  4685                         return true;
  4686                 }
  4687             }
  4688             return next.match(matcher, i, seq);
  4689         }
  4690         boolean matchInit(Matcher matcher, int i, CharSequence seq) {
  4691             int save = matcher.locals[countIndex];
  4692             boolean ret = false;
  4693             if (0 < cmin) {
  4694                 matcher.locals[countIndex] = 1;
  4695                 ret = body.match(matcher, i, seq);
  4696             } else if (0 < cmax) {
  4697                 matcher.locals[countIndex] = 1;
  4698                 ret = body.match(matcher, i, seq);
  4699                 if (ret == false)
  4700                     ret = next.match(matcher, i, seq);
  4701             } else {
  4702                 ret = next.match(matcher, i, seq);
  4703             }
  4704             matcher.locals[countIndex] = save;
  4705             return ret;
  4706         }
  4707         boolean study(TreeInfo info) {
  4708             info.maxValid = false;
  4709             info.deterministic = false;
  4710             return false;
  4711         }
  4712     }
  4713 
  4714     /**
  4715      * Handles the repetition count for a reluctant Curly. The matchInit
  4716      * is called from the Prolog to save the index of where the group
  4717      * beginning is stored. A zero length group check occurs in the
  4718      * normal match but is skipped in the matchInit.
  4719      */
  4720     static final class LazyLoop extends Loop {
  4721         LazyLoop(int countIndex, int beginIndex) {
  4722             super(countIndex, beginIndex);
  4723         }
  4724         boolean match(Matcher matcher, int i, CharSequence seq) {
  4725             // Check for zero length group
  4726             if (i > matcher.locals[beginIndex]) {
  4727                 int count = matcher.locals[countIndex];
  4728                 if (count < cmin) {
  4729                     matcher.locals[countIndex] = count + 1;
  4730                     boolean result = body.match(matcher, i, seq);
  4731                     // If match failed we must backtrack, so
  4732                     // the loop count should NOT be incremented
  4733                     if (!result)
  4734                         matcher.locals[countIndex] = count;
  4735                     return result;
  4736                 }
  4737                 if (next.match(matcher, i, seq))
  4738                     return true;
  4739                 if (count < cmax) {
  4740                     matcher.locals[countIndex] = count + 1;
  4741                     boolean result = body.match(matcher, i, seq);
  4742                     // If match failed we must backtrack, so
  4743                     // the loop count should NOT be incremented
  4744                     if (!result)
  4745                         matcher.locals[countIndex] = count;
  4746                     return result;
  4747                 }
  4748                 return false;
  4749             }
  4750             return next.match(matcher, i, seq);
  4751         }
  4752         boolean matchInit(Matcher matcher, int i, CharSequence seq) {
  4753             int save = matcher.locals[countIndex];
  4754             boolean ret = false;
  4755             if (0 < cmin) {
  4756                 matcher.locals[countIndex] = 1;
  4757                 ret = body.match(matcher, i, seq);
  4758             } else if (next.match(matcher, i, seq)) {
  4759                 ret = true;
  4760             } else if (0 < cmax) {
  4761                 matcher.locals[countIndex] = 1;
  4762                 ret = body.match(matcher, i, seq);
  4763             }
  4764             matcher.locals[countIndex] = save;
  4765             return ret;
  4766         }
  4767         boolean study(TreeInfo info) {
  4768             info.maxValid = false;
  4769             info.deterministic = false;
  4770             return false;
  4771         }
  4772     }
  4773 
  4774     /**
  4775      * Refers to a group in the regular expression. Attempts to match
  4776      * whatever the group referred to last matched.
  4777      */
  4778     static class BackRef extends Node {
  4779         int groupIndex;
  4780         BackRef(int groupCount) {
  4781             super();
  4782             groupIndex = groupCount + groupCount;
  4783         }
  4784         boolean match(Matcher matcher, int i, CharSequence seq) {
  4785             int j = matcher.groups[groupIndex];
  4786             int k = matcher.groups[groupIndex+1];
  4787 
  4788             int groupSize = k - j;
  4789 
  4790             // If the referenced group didn't match, neither can this
  4791             if (j < 0)
  4792                 return false;
  4793 
  4794             // If there isn't enough input left no match
  4795             if (i + groupSize > matcher.to) {
  4796                 matcher.hitEnd = true;
  4797                 return false;
  4798             }
  4799 
  4800             // Check each new char to make sure it matches what the group
  4801             // referenced matched last time around
  4802             for (int index=0; index<groupSize; index++)
  4803                 if (seq.charAt(i+index) != seq.charAt(j+index))
  4804                     return false;
  4805 
  4806             return next.match(matcher, i+groupSize, seq);
  4807         }
  4808         boolean study(TreeInfo info) {
  4809             info.maxValid = false;
  4810             return next.study(info);
  4811         }
  4812     }
  4813 
  4814     static class CIBackRef extends Node {
  4815         int groupIndex;
  4816         boolean doUnicodeCase;
  4817         CIBackRef(int groupCount, boolean doUnicodeCase) {
  4818             super();
  4819             groupIndex = groupCount + groupCount;
  4820             this.doUnicodeCase = doUnicodeCase;
  4821         }
  4822         boolean match(Matcher matcher, int i, CharSequence seq) {
  4823             int j = matcher.groups[groupIndex];
  4824             int k = matcher.groups[groupIndex+1];
  4825 
  4826             int groupSize = k - j;
  4827 
  4828             // If the referenced group didn't match, neither can this
  4829             if (j < 0)
  4830                 return false;
  4831 
  4832             // If there isn't enough input left no match
  4833             if (i + groupSize > matcher.to) {
  4834                 matcher.hitEnd = true;
  4835                 return false;
  4836             }
  4837 
  4838             // Check each new char to make sure it matches what the group
  4839             // referenced matched last time around
  4840             int x = i;
  4841             for (int index=0; index<groupSize; index++) {
  4842                 int c1 = Character.codePointAt(seq, x);
  4843                 int c2 = Character.codePointAt(seq, j);
  4844                 if (c1 != c2) {
  4845                     if (doUnicodeCase) {
  4846                         int cc1 = Character.toUpperCase(c1);
  4847                         int cc2 = Character.toUpperCase(c2);
  4848                         if (cc1 != cc2 &&
  4849                             Character.toLowerCase(cc1) !=
  4850                             Character.toLowerCase(cc2))
  4851                             return false;
  4852                     } else {
  4853                         if (ASCII.toLower(c1) != ASCII.toLower(c2))
  4854                             return false;
  4855                     }
  4856                 }
  4857                 x += Character.charCount(c1);
  4858                 j += Character.charCount(c2);
  4859             }
  4860 
  4861             return next.match(matcher, i+groupSize, seq);
  4862         }
  4863         boolean study(TreeInfo info) {
  4864             info.maxValid = false;
  4865             return next.study(info);
  4866         }
  4867     }
  4868 
  4869     /**
  4870      * Searches until the next instance of its atom. This is useful for
  4871      * finding the atom efficiently without passing an instance of it
  4872      * (greedy problem) and without a lot of wasted search time (reluctant
  4873      * problem).
  4874      */
  4875     static final class First extends Node {
  4876         Node atom;
  4877         First(Node node) {
  4878             this.atom = BnM.optimize(node);
  4879         }
  4880         boolean match(Matcher matcher, int i, CharSequence seq) {
  4881             if (atom instanceof BnM) {
  4882                 return atom.match(matcher, i, seq)
  4883                     && next.match(matcher, matcher.last, seq);
  4884             }
  4885             for (;;) {
  4886                 if (i > matcher.to) {
  4887                     matcher.hitEnd = true;
  4888                     return false;
  4889                 }
  4890                 if (atom.match(matcher, i, seq)) {
  4891                     return next.match(matcher, matcher.last, seq);
  4892                 }
  4893                 i += countChars(seq, i, 1);
  4894                 matcher.first++;
  4895             }
  4896         }
  4897         boolean study(TreeInfo info) {
  4898             atom.study(info);
  4899             info.maxValid = false;
  4900             info.deterministic = false;
  4901             return next.study(info);
  4902         }
  4903     }
  4904 
  4905     static final class Conditional extends Node {
  4906         Node cond, yes, not;
  4907         Conditional(Node cond, Node yes, Node not) {
  4908             this.cond = cond;
  4909             this.yes = yes;
  4910             this.not = not;
  4911         }
  4912         boolean match(Matcher matcher, int i, CharSequence seq) {
  4913             if (cond.match(matcher, i, seq)) {
  4914                 return yes.match(matcher, i, seq);
  4915             } else {
  4916                 return not.match(matcher, i, seq);
  4917             }
  4918         }
  4919         boolean study(TreeInfo info) {
  4920             int minL = info.minLength;
  4921             int maxL = info.maxLength;
  4922             boolean maxV = info.maxValid;
  4923             info.reset();
  4924             yes.study(info);
  4925 
  4926             int minL2 = info.minLength;
  4927             int maxL2 = info.maxLength;
  4928             boolean maxV2 = info.maxValid;
  4929             info.reset();
  4930             not.study(info);
  4931 
  4932             info.minLength = minL + Math.min(minL2, info.minLength);
  4933             info.maxLength = maxL + Math.max(maxL2, info.maxLength);
  4934             info.maxValid = (maxV & maxV2 & info.maxValid);
  4935             info.deterministic = false;
  4936             return next.study(info);
  4937         }
  4938     }
  4939 
  4940     /**
  4941      * Zero width positive lookahead.
  4942      */
  4943     static final class Pos extends Node {
  4944         Node cond;
  4945         Pos(Node cond) {
  4946             this.cond = cond;
  4947         }
  4948         boolean match(Matcher matcher, int i, CharSequence seq) {
  4949             int savedTo = matcher.to;
  4950             boolean conditionMatched = false;
  4951 
  4952             // Relax transparent region boundaries for lookahead
  4953             if (matcher.transparentBounds)
  4954                 matcher.to = matcher.getTextLength();
  4955             try {
  4956                 conditionMatched = cond.match(matcher, i, seq);
  4957             } finally {
  4958                 // Reinstate region boundaries
  4959                 matcher.to = savedTo;
  4960             }
  4961             return conditionMatched && next.match(matcher, i, seq);
  4962         }
  4963     }
  4964 
  4965     /**
  4966      * Zero width negative lookahead.
  4967      */
  4968     static final class Neg extends Node {
  4969         Node cond;
  4970         Neg(Node cond) {
  4971             this.cond = cond;
  4972         }
  4973         boolean match(Matcher matcher, int i, CharSequence seq) {
  4974             int savedTo = matcher.to;
  4975             boolean conditionMatched = false;
  4976 
  4977             // Relax transparent region boundaries for lookahead
  4978             if (matcher.transparentBounds)
  4979                 matcher.to = matcher.getTextLength();
  4980             try {
  4981                 if (i < matcher.to) {
  4982                     conditionMatched = !cond.match(matcher, i, seq);
  4983                 } else {
  4984                     // If a negative lookahead succeeds then more input
  4985                     // could cause it to fail!
  4986                     matcher.requireEnd = true;
  4987                     conditionMatched = !cond.match(matcher, i, seq);
  4988                 }
  4989             } finally {
  4990                 // Reinstate region boundaries
  4991                 matcher.to = savedTo;
  4992             }
  4993             return conditionMatched && next.match(matcher, i, seq);
  4994         }
  4995     }
  4996 
  4997     /**
  4998      * For use with lookbehinds; matches the position where the lookbehind
  4999      * was encountered.
  5000      */
  5001     static Node lookbehindEnd = new Node() {
  5002         boolean match(Matcher matcher, int i, CharSequence seq) {
  5003             return i == matcher.lookbehindTo;
  5004         }
  5005     };
  5006 
  5007     /**
  5008      * Zero width positive lookbehind.
  5009      */
  5010     static class Behind extends Node {
  5011         Node cond;
  5012         int rmax, rmin;
  5013         Behind(Node cond, int rmax, int rmin) {
  5014             this.cond = cond;
  5015             this.rmax = rmax;
  5016             this.rmin = rmin;
  5017         }
  5018 
  5019         boolean match(Matcher matcher, int i, CharSequence seq) {
  5020             int savedFrom = matcher.from;
  5021             boolean conditionMatched = false;
  5022             int startIndex = (!matcher.transparentBounds) ?
  5023                              matcher.from : 0;
  5024             int from = Math.max(i - rmax, startIndex);
  5025             // Set end boundary
  5026             int savedLBT = matcher.lookbehindTo;
  5027             matcher.lookbehindTo = i;
  5028             // Relax transparent region boundaries for lookbehind
  5029             if (matcher.transparentBounds)
  5030                 matcher.from = 0;
  5031             for (int j = i - rmin; !conditionMatched && j >= from; j--) {
  5032                 conditionMatched = cond.match(matcher, j, seq);
  5033             }
  5034             matcher.from = savedFrom;
  5035             matcher.lookbehindTo = savedLBT;
  5036             return conditionMatched && next.match(matcher, i, seq);
  5037         }
  5038     }
  5039 
  5040     /**
  5041      * Zero width positive lookbehind, including supplementary
  5042      * characters or unpaired surrogates.
  5043      */
  5044     static final class BehindS extends Behind {
  5045         BehindS(Node cond, int rmax, int rmin) {
  5046             super(cond, rmax, rmin);
  5047         }
  5048         boolean match(Matcher matcher, int i, CharSequence seq) {
  5049             int rmaxChars = countChars(seq, i, -rmax);
  5050             int rminChars = countChars(seq, i, -rmin);
  5051             int savedFrom = matcher.from;
  5052             int startIndex = (!matcher.transparentBounds) ?
  5053                              matcher.from : 0;
  5054             boolean conditionMatched = false;
  5055             int from = Math.max(i - rmaxChars, startIndex);
  5056             // Set end boundary
  5057             int savedLBT = matcher.lookbehindTo;
  5058             matcher.lookbehindTo = i;
  5059             // Relax transparent region boundaries for lookbehind
  5060             if (matcher.transparentBounds)
  5061                 matcher.from = 0;
  5062 
  5063             for (int j = i - rminChars;
  5064                  !conditionMatched && j >= from;
  5065                  j -= j>from ? countChars(seq, j, -1) : 1) {
  5066                 conditionMatched = cond.match(matcher, j, seq);
  5067             }
  5068             matcher.from = savedFrom;
  5069             matcher.lookbehindTo = savedLBT;
  5070             return conditionMatched && next.match(matcher, i, seq);
  5071         }
  5072     }
  5073 
  5074     /**
  5075      * Zero width negative lookbehind.
  5076      */
  5077     static class NotBehind extends Node {
  5078         Node cond;
  5079         int rmax, rmin;
  5080         NotBehind(Node cond, int rmax, int rmin) {
  5081             this.cond = cond;
  5082             this.rmax = rmax;
  5083             this.rmin = rmin;
  5084         }
  5085 
  5086         boolean match(Matcher matcher, int i, CharSequence seq) {
  5087             int savedLBT = matcher.lookbehindTo;
  5088             int savedFrom = matcher.from;
  5089             boolean conditionMatched = false;
  5090             int startIndex = (!matcher.transparentBounds) ?
  5091                              matcher.from : 0;
  5092             int from = Math.max(i - rmax, startIndex);
  5093             matcher.lookbehindTo = i;
  5094             // Relax transparent region boundaries for lookbehind
  5095             if (matcher.transparentBounds)
  5096                 matcher.from = 0;
  5097             for (int j = i - rmin; !conditionMatched && j >= from; j--) {
  5098                 conditionMatched = cond.match(matcher, j, seq);
  5099             }
  5100             // Reinstate region boundaries
  5101             matcher.from = savedFrom;
  5102             matcher.lookbehindTo = savedLBT;
  5103             return !conditionMatched && next.match(matcher, i, seq);
  5104         }
  5105     }
  5106 
  5107     /**
  5108      * Zero width negative lookbehind, including supplementary
  5109      * characters or unpaired surrogates.
  5110      */
  5111     static final class NotBehindS extends NotBehind {
  5112         NotBehindS(Node cond, int rmax, int rmin) {
  5113             super(cond, rmax, rmin);
  5114         }
  5115         boolean match(Matcher matcher, int i, CharSequence seq) {
  5116             int rmaxChars = countChars(seq, i, -rmax);
  5117             int rminChars = countChars(seq, i, -rmin);
  5118             int savedFrom = matcher.from;
  5119             int savedLBT = matcher.lookbehindTo;
  5120             boolean conditionMatched = false;
  5121             int startIndex = (!matcher.transparentBounds) ?
  5122                              matcher.from : 0;
  5123             int from = Math.max(i - rmaxChars, startIndex);
  5124             matcher.lookbehindTo = i;
  5125             // Relax transparent region boundaries for lookbehind
  5126             if (matcher.transparentBounds)
  5127                 matcher.from = 0;
  5128             for (int j = i - rminChars;
  5129                  !conditionMatched && j >= from;
  5130                  j -= j>from ? countChars(seq, j, -1) : 1) {
  5131                 conditionMatched = cond.match(matcher, j, seq);
  5132             }
  5133             //Reinstate region boundaries
  5134             matcher.from = savedFrom;
  5135             matcher.lookbehindTo = savedLBT;
  5136             return !conditionMatched && next.match(matcher, i, seq);
  5137         }
  5138     }
  5139 
  5140     /**
  5141      * Returns the set union of two CharProperty nodes.
  5142      */
  5143     private static CharProperty union(final CharProperty lhs,
  5144                                       final CharProperty rhs) {
  5145         return new CharProperty() {
  5146                 boolean isSatisfiedBy(int ch) {
  5147                     return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
  5148     }
  5149 
  5150     /**
  5151      * Returns the set intersection of two CharProperty nodes.
  5152      */
  5153     private static CharProperty intersection(final CharProperty lhs,
  5154                                              final CharProperty rhs) {
  5155         return new CharProperty() {
  5156                 boolean isSatisfiedBy(int ch) {
  5157                     return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
  5158     }
  5159 
  5160     /**
  5161      * Returns the set difference of two CharProperty nodes.
  5162      */
  5163     private static CharProperty setDifference(final CharProperty lhs,
  5164                                               final CharProperty rhs) {
  5165         return new CharProperty() {
  5166                 boolean isSatisfiedBy(int ch) {
  5167                     return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
  5168     }
  5169 
  5170     /**
  5171      * Handles word boundaries. Includes a field to allow this one class to
  5172      * deal with the different types of word boundaries we can match. The word
  5173      * characters include underscores, letters, and digits. Non spacing marks
  5174      * can are also part of a word if they have a base character, otherwise
  5175      * they are ignored for purposes of finding word boundaries.
  5176      */
  5177     static final class Bound extends Node {
  5178         static int LEFT = 0x1;
  5179         static int RIGHT= 0x2;
  5180         static int BOTH = 0x3;
  5181         static int NONE = 0x4;
  5182         int type;
  5183         boolean useUWORD;
  5184         Bound(int n, boolean useUWORD) {
  5185             type = n;
  5186             this.useUWORD = useUWORD;
  5187         }
  5188 
  5189         boolean isWord(int ch) {
  5190             return useUWORD ? UnicodeProp.WORD.is(ch)
  5191                             : (ch == '_' || Character.isLetterOrDigit(ch));
  5192         }
  5193 
  5194         int check(Matcher matcher, int i, CharSequence seq) {
  5195             int ch;
  5196             boolean left = false;
  5197             int startIndex = matcher.from;
  5198             int endIndex = matcher.to;
  5199             if (matcher.transparentBounds) {
  5200                 startIndex = 0;
  5201                 endIndex = matcher.getTextLength();
  5202             }
  5203             if (i > startIndex) {
  5204                 ch = Character.codePointBefore(seq, i);
  5205                 left = (isWord(ch) ||
  5206                     ((Character.getType(ch) == Character.NON_SPACING_MARK)
  5207                      && hasBaseCharacter(matcher, i-1, seq)));
  5208             }
  5209             boolean right = false;
  5210             if (i < endIndex) {
  5211                 ch = Character.codePointAt(seq, i);
  5212                 right = (isWord(ch) ||
  5213                     ((Character.getType(ch) == Character.NON_SPACING_MARK)
  5214                      && hasBaseCharacter(matcher, i, seq)));
  5215             } else {
  5216                 // Tried to access char past the end
  5217                 matcher.hitEnd = true;
  5218                 // The addition of another char could wreck a boundary
  5219                 matcher.requireEnd = true;
  5220             }
  5221             return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE);
  5222         }
  5223         boolean match(Matcher matcher, int i, CharSequence seq) {
  5224             return (check(matcher, i, seq) & type) > 0
  5225                 && next.match(matcher, i, seq);
  5226         }
  5227     }
  5228 
  5229     /**
  5230      * Non spacing marks only count as word characters in bounds calculations
  5231      * if they have a base character.
  5232      */
  5233     private static boolean hasBaseCharacter(Matcher matcher, int i,
  5234                                             CharSequence seq)
  5235     {
  5236         int start = (!matcher.transparentBounds) ?
  5237             matcher.from : 0;
  5238         for (int x=i; x >= start; x--) {
  5239             int ch = Character.codePointAt(seq, x);
  5240             if (Character.isLetterOrDigit(ch))
  5241                 return true;
  5242             if (Character.getType(ch) == Character.NON_SPACING_MARK)
  5243                 continue;
  5244             return false;
  5245         }
  5246         return false;
  5247     }
  5248 
  5249     /**
  5250      * Attempts to match a slice in the input using the Boyer-Moore string
  5251      * matching algorithm. The algorithm is based on the idea that the
  5252      * pattern can be shifted farther ahead in the search text if it is
  5253      * matched right to left.
  5254      * <p>
  5255      * The pattern is compared to the input one character at a time, from
  5256      * the rightmost character in the pattern to the left. If the characters
  5257      * all match the pattern has been found. If a character does not match,
  5258      * the pattern is shifted right a distance that is the maximum of two
  5259      * functions, the bad character shift and the good suffix shift. This
  5260      * shift moves the attempted match position through the input more
  5261      * quickly than a naive one position at a time check.
  5262      * <p>
  5263      * The bad character shift is based on the character from the text that
  5264      * did not match. If the character does not appear in the pattern, the
  5265      * pattern can be shifted completely beyond the bad character. If the
  5266      * character does occur in the pattern, the pattern can be shifted to
  5267      * line the pattern up with the next occurrence of that character.
  5268      * <p>
  5269      * The good suffix shift is based on the idea that some subset on the right
  5270      * side of the pattern has matched. When a bad character is found, the
  5271      * pattern can be shifted right by the pattern length if the subset does
  5272      * not occur again in pattern, or by the amount of distance to the
  5273      * next occurrence of the subset in the pattern.
  5274      *
  5275      * Boyer-Moore search methods adapted from code by Amy Yu.
  5276      */
  5277     static class BnM extends Node {
  5278         int[] buffer;
  5279         int[] lastOcc;
  5280         int[] optoSft;
  5281 
  5282         /**
  5283          * Pre calculates arrays needed to generate the bad character
  5284          * shift and the good suffix shift. Only the last seven bits
  5285          * are used to see if chars match; This keeps the tables small
  5286          * and covers the heavily used ASCII range, but occasionally
  5287          * results in an aliased match for the bad character shift.
  5288          */
  5289         static Node optimize(Node node) {
  5290             if (!(node instanceof Slice)) {
  5291                 return node;
  5292             }
  5293 
  5294             int[] src = ((Slice) node).buffer;
  5295             int patternLength = src.length;
  5296             // The BM algorithm requires a bit of overhead;
  5297             // If the pattern is short don't use it, since
  5298             // a shift larger than the pattern length cannot
  5299             // be used anyway.
  5300             if (patternLength < 4) {
  5301                 return node;
  5302             }
  5303             int i, j, k;
  5304             int[] lastOcc = new int[128];
  5305             int[] optoSft = new int[patternLength];
  5306             // Precalculate part of the bad character shift
  5307             // It is a table for where in the pattern each
  5308             // lower 7-bit value occurs
  5309             for (i = 0; i < patternLength; i++) {
  5310                 lastOcc[src[i]&0x7F] = i + 1;
  5311             }
  5312             // Precalculate the good suffix shift
  5313             // i is the shift amount being considered
  5314 NEXT:       for (i = patternLength; i > 0; i--) {
  5315                 // j is the beginning index of suffix being considered
  5316                 for (j = patternLength - 1; j >= i; j--) {
  5317                     // Testing for good suffix
  5318                     if (src[j] == src[j-i]) {
  5319                         // src[j..len] is a good suffix
  5320                         optoSft[j-1] = i;
  5321                     } else {
  5322                         // No match. The array has already been
  5323                         // filled up with correct values before.
  5324                         continue NEXT;
  5325                     }
  5326                 }
  5327                 // This fills up the remaining of optoSft
  5328                 // any suffix can not have larger shift amount
  5329                 // then its sub-suffix. Why???
  5330                 while (j > 0) {
  5331                     optoSft[--j] = i;
  5332                 }
  5333             }
  5334             // Set the guard value because of unicode compression
  5335             optoSft[patternLength-1] = 1;
  5336             if (node instanceof SliceS)
  5337                 return new BnMS(src, lastOcc, optoSft, node.next);
  5338             return new BnM(src, lastOcc, optoSft, node.next);
  5339         }
  5340         BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) {
  5341             this.buffer = src;
  5342             this.lastOcc = lastOcc;
  5343             this.optoSft = optoSft;
  5344             this.next = next;
  5345         }
  5346         boolean match(Matcher matcher, int i, CharSequence seq) {
  5347             int[] src = buffer;
  5348             int patternLength = src.length;
  5349             int last = matcher.to - patternLength;
  5350 
  5351             // Loop over all possible match positions in text
  5352 NEXT:       while (i <= last) {
  5353                 // Loop over pattern from right to left
  5354                 for (int j = patternLength - 1; j >= 0; j--) {
  5355                     int ch = seq.charAt(i+j);
  5356                     if (ch != src[j]) {
  5357                         // Shift search to the right by the maximum of the
  5358                         // bad character shift and the good suffix shift
  5359                         i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]);
  5360                         continue NEXT;
  5361                     }
  5362                 }
  5363                 // Entire pattern matched starting at i
  5364                 matcher.first = i;
  5365                 boolean ret = next.match(matcher, i + patternLength, seq);
  5366                 if (ret) {
  5367                     matcher.first = i;
  5368                     matcher.groups[0] = matcher.first;
  5369                     matcher.groups[1] = matcher.last;
  5370                     return true;
  5371                 }
  5372                 i++;
  5373             }
  5374             // BnM is only used as the leading node in the unanchored case,
  5375             // and it replaced its Start() which always searches to the end
  5376             // if it doesn't find what it's looking for, so hitEnd is true.
  5377             matcher.hitEnd = true;
  5378             return false;
  5379         }
  5380         boolean study(TreeInfo info) {
  5381             info.minLength += buffer.length;
  5382             info.maxValid = false;
  5383             return next.study(info);
  5384         }
  5385     }
  5386 
  5387     /**
  5388      * Supplementary support version of BnM(). Unpaired surrogates are
  5389      * also handled by this class.
  5390      */
  5391     static final class BnMS extends BnM {
  5392         int lengthInChars;
  5393 
  5394         BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) {
  5395             super(src, lastOcc, optoSft, next);
  5396             for (int x = 0; x < buffer.length; x++) {
  5397                 lengthInChars += Character.charCount(buffer[x]);
  5398             }
  5399         }
  5400         boolean match(Matcher matcher, int i, CharSequence seq) {
  5401             int[] src = buffer;
  5402             int patternLength = src.length;
  5403             int last = matcher.to - lengthInChars;
  5404 
  5405             // Loop over all possible match positions in text
  5406 NEXT:       while (i <= last) {
  5407                 // Loop over pattern from right to left
  5408                 int ch;
  5409                 for (int j = countChars(seq, i, patternLength), x = patternLength - 1;
  5410                      j > 0; j -= Character.charCount(ch), x--) {
  5411                     ch = Character.codePointBefore(seq, i+j);
  5412                     if (ch != src[x]) {
  5413                         // Shift search to the right by the maximum of the
  5414                         // bad character shift and the good suffix shift
  5415                         int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]);
  5416                         i += countChars(seq, i, n);
  5417                         continue NEXT;
  5418                     }
  5419                 }
  5420                 // Entire pattern matched starting at i
  5421                 matcher.first = i;
  5422                 boolean ret = next.match(matcher, i + lengthInChars, seq);
  5423                 if (ret) {
  5424                     matcher.first = i;
  5425                     matcher.groups[0] = matcher.first;
  5426                     matcher.groups[1] = matcher.last;
  5427                     return true;
  5428                 }
  5429                 i += countChars(seq, i, 1);
  5430             }
  5431             matcher.hitEnd = true;
  5432             return false;
  5433         }
  5434     }
  5435 
  5436 ///////////////////////////////////////////////////////////////////////////////
  5437 ///////////////////////////////////////////////////////////////////////////////
  5438 
  5439     /**
  5440      *  This must be the very first initializer.
  5441      */
  5442     static Node accept = new Node();
  5443 
  5444     static Node lastAccept = new LastNode();
  5445 
  5446     private static class CharPropertyNames {
  5447 
  5448         static CharProperty charPropertyFor(String name) {
  5449             CharPropertyFactory m = map.get(name);
  5450             return m == null ? null : m.make();
  5451         }
  5452 
  5453         private static abstract class CharPropertyFactory {
  5454             abstract CharProperty make();
  5455         }
  5456 
  5457         private static void defCategory(String name,
  5458                                         final int typeMask) {
  5459             map.put(name, new CharPropertyFactory() {
  5460                     CharProperty make() { return new Category(typeMask);}});
  5461         }
  5462 
  5463         private static void defRange(String name,
  5464                                      final int lower, final int upper) {
  5465             map.put(name, new CharPropertyFactory() {
  5466                     CharProperty make() { return rangeFor(lower, upper);}});
  5467         }
  5468 
  5469         private static void defCtype(String name,
  5470                                      final int ctype) {
  5471             map.put(name, new CharPropertyFactory() {
  5472                     CharProperty make() { return new Ctype(ctype);}});
  5473         }
  5474 
  5475         private static abstract class CloneableProperty
  5476             extends CharProperty implements Cloneable
  5477         {
  5478             public CloneableProperty clone() {
  5479                 try {
  5480                     return (CloneableProperty) super.clone();
  5481                 } catch (CloneNotSupportedException e) {
  5482                     throw new AssertionError(e);
  5483                 }
  5484             }
  5485         }
  5486 
  5487         private static void defClone(String name,
  5488                                      final CloneableProperty p) {
  5489             map.put(name, new CharPropertyFactory() {
  5490                     CharProperty make() { return p.clone();}});
  5491         }
  5492 
  5493         private static final HashMap<String, CharPropertyFactory> map
  5494             = new HashMap<>();
  5495 
  5496         static {
  5497             // Unicode character property aliases, defined in
  5498             // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
  5499             defCategory("Cn", 1<<Character.UNASSIGNED);
  5500             defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
  5501             defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
  5502             defCategory("Lt", 1<<Character.TITLECASE_LETTER);
  5503             defCategory("Lm", 1<<Character.MODIFIER_LETTER);
  5504             defCategory("Lo", 1<<Character.OTHER_LETTER);
  5505             defCategory("Mn", 1<<Character.NON_SPACING_MARK);
  5506             defCategory("Me", 1<<Character.ENCLOSING_MARK);
  5507             defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
  5508             defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
  5509             defCategory("Nl", 1<<Character.LETTER_NUMBER);
  5510             defCategory("No", 1<<Character.OTHER_NUMBER);
  5511             defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
  5512             defCategory("Zl", 1<<Character.LINE_SEPARATOR);
  5513             defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
  5514             defCategory("Cc", 1<<Character.CONTROL);
  5515             defCategory("Cf", 1<<Character.FORMAT);
  5516             defCategory("Co", 1<<Character.PRIVATE_USE);
  5517             defCategory("Cs", 1<<Character.SURROGATE);
  5518             defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
  5519             defCategory("Ps", 1<<Character.START_PUNCTUATION);
  5520             defCategory("Pe", 1<<Character.END_PUNCTUATION);
  5521             defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
  5522             defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
  5523             defCategory("Sm", 1<<Character.MATH_SYMBOL);
  5524             defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
  5525             defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
  5526             defCategory("So", 1<<Character.OTHER_SYMBOL);
  5527             defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
  5528             defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
  5529             defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
  5530                               (1<<Character.LOWERCASE_LETTER) |
  5531                               (1<<Character.TITLECASE_LETTER) |
  5532                               (1<<Character.MODIFIER_LETTER)  |
  5533                               (1<<Character.OTHER_LETTER)));
  5534             defCategory("M", ((1<<Character.NON_SPACING_MARK) |
  5535                               (1<<Character.ENCLOSING_MARK)   |
  5536                               (1<<Character.COMBINING_SPACING_MARK)));
  5537             defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
  5538                               (1<<Character.LETTER_NUMBER)        |
  5539                               (1<<Character.OTHER_NUMBER)));
  5540             defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
  5541                               (1<<Character.LINE_SEPARATOR)  |
  5542                               (1<<Character.PARAGRAPH_SEPARATOR)));
  5543             defCategory("C", ((1<<Character.CONTROL)     |
  5544                               (1<<Character.FORMAT)      |
  5545                               (1<<Character.PRIVATE_USE) |
  5546                               (1<<Character.SURROGATE))); // Other
  5547             defCategory("P", ((1<<Character.DASH_PUNCTUATION)      |
  5548                               (1<<Character.START_PUNCTUATION)     |
  5549                               (1<<Character.END_PUNCTUATION)       |
  5550                               (1<<Character.CONNECTOR_PUNCTUATION) |
  5551                               (1<<Character.OTHER_PUNCTUATION)     |
  5552                               (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
  5553                               (1<<Character.FINAL_QUOTE_PUNCTUATION)));
  5554             defCategory("S", ((1<<Character.MATH_SYMBOL)     |
  5555                               (1<<Character.CURRENCY_SYMBOL) |
  5556                               (1<<Character.MODIFIER_SYMBOL) |
  5557                               (1<<Character.OTHER_SYMBOL)));
  5558             defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
  5559                                (1<<Character.LOWERCASE_LETTER) |
  5560                                (1<<Character.TITLECASE_LETTER)));
  5561             defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
  5562                                (1<<Character.LOWERCASE_LETTER) |
  5563                                (1<<Character.TITLECASE_LETTER) |
  5564                                (1<<Character.MODIFIER_LETTER)  |
  5565                                (1<<Character.OTHER_LETTER)     |
  5566                                (1<<Character.DECIMAL_DIGIT_NUMBER)));
  5567             defRange("L1", 0x00, 0xFF); // Latin-1
  5568             map.put("all", new CharPropertyFactory() {
  5569                     CharProperty make() { return new All(); }});
  5570 
  5571             // Posix regular expression character classes, defined in
  5572             // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
  5573             defRange("ASCII", 0x00, 0x7F);   // ASCII
  5574             defCtype("Alnum", ASCII.ALNUM);  // Alphanumeric characters
  5575             defCtype("Alpha", ASCII.ALPHA);  // Alphabetic characters
  5576             defCtype("Blank", ASCII.BLANK);  // Space and tab characters
  5577             defCtype("Cntrl", ASCII.CNTRL);  // Control characters
  5578             defRange("Digit", '0', '9');     // Numeric characters
  5579             defCtype("Graph", ASCII.GRAPH);  // printable and visible
  5580             defRange("Lower", 'a', 'z');     // Lower-case alphabetic
  5581             defRange("Print", 0x20, 0x7E);   // Printable characters
  5582             defCtype("Punct", ASCII.PUNCT);  // Punctuation characters
  5583             defCtype("Space", ASCII.SPACE);  // Space characters
  5584             defRange("Upper", 'A', 'Z');     // Upper-case alphabetic
  5585             defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
  5586 
  5587             // Java character properties, defined by methods in Character.java
  5588             defClone("javaLowerCase", new CloneableProperty() {
  5589                 boolean isSatisfiedBy(int ch) {
  5590                     return Character.isLowerCase(ch);}});
  5591             defClone("javaUpperCase", new CloneableProperty() {
  5592                 boolean isSatisfiedBy(int ch) {
  5593                     return Character.isUpperCase(ch);}});
  5594             defClone("javaAlphabetic", new CloneableProperty() {
  5595                 boolean isSatisfiedBy(int ch) {
  5596                     return Character.isAlphabetic(ch);}});
  5597             defClone("javaIdeographic", new CloneableProperty() {
  5598                 boolean isSatisfiedBy(int ch) {
  5599                     return Character.isIdeographic(ch);}});
  5600             defClone("javaTitleCase", new CloneableProperty() {
  5601                 boolean isSatisfiedBy(int ch) {
  5602                     return Character.isTitleCase(ch);}});
  5603             defClone("javaDigit", new CloneableProperty() {
  5604                 boolean isSatisfiedBy(int ch) {
  5605                     return Character.isDigit(ch);}});
  5606             defClone("javaDefined", new CloneableProperty() {
  5607                 boolean isSatisfiedBy(int ch) {
  5608                     return Character.isDefined(ch);}});
  5609             defClone("javaLetter", new CloneableProperty() {
  5610                 boolean isSatisfiedBy(int ch) {
  5611                     return Character.isLetter(ch);}});
  5612             defClone("javaLetterOrDigit", new CloneableProperty() {
  5613                 boolean isSatisfiedBy(int ch) {
  5614                     return Character.isLetterOrDigit(ch);}});
  5615             defClone("javaJavaIdentifierStart", new CloneableProperty() {
  5616                 boolean isSatisfiedBy(int ch) {
  5617                     return Character.isJavaIdentifierStart(ch);}});
  5618             defClone("javaJavaIdentifierPart", new CloneableProperty() {
  5619                 boolean isSatisfiedBy(int ch) {
  5620                     return Character.isJavaIdentifierPart(ch);}});
  5621             defClone("javaUnicodeIdentifierStart", new CloneableProperty() {
  5622                 boolean isSatisfiedBy(int ch) {
  5623                     return Character.isUnicodeIdentifierStart(ch);}});
  5624             defClone("javaUnicodeIdentifierPart", new CloneableProperty() {
  5625                 boolean isSatisfiedBy(int ch) {
  5626                     return Character.isUnicodeIdentifierPart(ch);}});
  5627             defClone("javaIdentifierIgnorable", new CloneableProperty() {
  5628                 boolean isSatisfiedBy(int ch) {
  5629                     return Character.isIdentifierIgnorable(ch);}});
  5630             defClone("javaSpaceChar", new CloneableProperty() {
  5631                 boolean isSatisfiedBy(int ch) {
  5632                     return Character.isSpaceChar(ch);}});
  5633             defClone("javaWhitespace", new CloneableProperty() {
  5634                 boolean isSatisfiedBy(int ch) {
  5635                     return Character.isWhitespace(ch);}});
  5636             defClone("javaISOControl", new CloneableProperty() {
  5637                 boolean isSatisfiedBy(int ch) {
  5638                     return Character.isISOControl(ch);}});
  5639             defClone("javaMirrored", new CloneableProperty() {
  5640                 boolean isSatisfiedBy(int ch) {
  5641                     return Character.isMirrored(ch);}});
  5642         }
  5643     }
  5644     
  5645     private static final class Normalizer {
  5646         public static final int NFD = 1;
  5647         public static final int NFC = 2;
  5648 
  5649         static String normalize(String pattern, int NFD) {
  5650             return pattern;
  5651         }
  5652 
  5653         private static int getCombiningClass(int c) {
  5654             return 1;
  5655         }
  5656     }
  5657 }