hg/bck2brwsr: rt/emul/compact/src/main/java/java/util/regex/Pattern.java@f14e9730d4e9

     1 /*

     2  * Copyright (c) 1999, 2011, Oracle and/or its affiliates. All rights reserved.

     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.

     4  *

     5  * This code is free software; you can redistribute it and/or modify it

     6  * under the terms of the GNU General Public License version 2 only, as

     7  * published by the Free Software Foundation.  Oracle designates this

     8  * particular file as subject to the "Classpath" exception as provided

     9  * by Oracle in the LICENSE file that accompanied this code.

    10  *

    11  * This code is distributed in the hope that it will be useful, but WITHOUT

    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or

    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License

    14  * version 2 for more details (a copy is included in the LICENSE file that

    15  * accompanied this code).

    16  *

    17  * You should have received a copy of the GNU General Public License version

    18  * 2 along with this work; if not, write to the Free Software Foundation,

    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.

    20  *

    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA

    22  * or visit www.oracle.com if you need additional information or have any

    23  * questions.

    24  */

    26 package java.util.regex;

    28 import java.util.Locale;

    29 import java.util.Map;

    30 import java.util.ArrayList;

    31 import java.util.HashMap;

    32 import java.util.Arrays;

    35 /**

    36  * A compiled representation of a regular expression.

    37  *

    38  * <p> A regular expression, specified as a string, must first be compiled into

    39  * an instance of this class.  The resulting pattern can then be used to create

    40  * a {@link Matcher} object that can match arbitrary {@link

    41  * java.lang.CharSequence </code>character sequences<code>} against the regular

    42  * expression.  All of the state involved in performing a match resides in the

    43  * matcher, so many matchers can share the same pattern.

    44  *

    45  * <p> A typical invocation sequence is thus

    46  *

    47  * <blockquote><pre>

    48  * Pattern p = Pattern.{@link #compile compile}("a*b");

    49  * Matcher m = p.{@link #matcher matcher}("aaaaab");

    50  * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote>

    51  *

    52  * <p> A {@link #matches matches} method is defined by this class as a

    53  * convenience for when a regular expression is used just once.  This method

    54  * compiles an expression and matches an input sequence against it in a single

    55  * invocation.  The statement

    56  *

    57  * <blockquote><pre>

    58  * boolean b = Pattern.matches("a*b", "aaaaab");</pre></blockquote>

    59  *

    60  * is equivalent to the three statements above, though for repeated matches it

    61  * is less efficient since it does not allow the compiled pattern to be reused.

    62  *

    63  * <p> Instances of this class are immutable and are safe for use by multiple

    64  * concurrent threads.  Instances of the {@link Matcher} class are not safe for

    65  * such use.

    66  *

    67  *

    68  * <a name="sum">

    69  * <h4> Summary of regular-expression constructs </h4>

    70  *

    71  * <table border="0" cellpadding="1" cellspacing="0"

    72  *  summary="Regular expression constructs, and what they match">

    73  *

    74  * <tr align="left">

    75  * <th bgcolor="#CCCCFF" align="left" id="construct">Construct</th>

    76  * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>

    77  * </tr>

    78  *

    79  * <tr><th>&nbsp;</th></tr>

    80  * <tr align="left"><th colspan="2" id="characters">Characters</th></tr>

    81  *

    82  * <tr><td valign="top" headers="construct characters"><i>x</i></td>

    83  *     <td headers="matches">The character <i>x</i></td></tr>

    84  * <tr><td valign="top" headers="construct characters"><tt>\\</tt></td>

    85  *     <td headers="matches">The backslash character</td></tr>

    86  * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>n</i></td>

    87  *     <td headers="matches">The character with octal value <tt>0</tt><i>n</i>

    88  *         (0&nbsp;<tt>&lt;=</tt>&nbsp;<i>n</i>&nbsp;<tt>&lt;=</tt>&nbsp;7)</td></tr>

    89  * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>nn</i></td>

    90  *     <td headers="matches">The character with octal value <tt>0</tt><i>nn</i>

    91  *         (0&nbsp;<tt>&lt;=</tt>&nbsp;<i>n</i>&nbsp;<tt>&lt;=</tt>&nbsp;7)</td></tr>

    92  * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>mnn</i></td>

    93  *     <td headers="matches">The character with octal value <tt>0</tt><i>mnn</i>

    94  *         (0&nbsp;<tt>&lt;=</tt>&nbsp;<i>m</i>&nbsp;<tt>&lt;=</tt>&nbsp;3,

    95  *         0&nbsp;<tt>&lt;=</tt>&nbsp;<i>n</i>&nbsp;<tt>&lt;=</tt>&nbsp;7)</td></tr>

    96  * <tr><td valign="top" headers="construct characters"><tt>\x</tt><i>hh</i></td>

    97  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>hh</i></td></tr>

    98  * <tr><td valign="top" headers="construct characters"><tt>&#92;u</tt><i>hhhh</i></td>

    99  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>hhhh</i></td></tr>

   100  * <tr><td valign="top" headers="construct characters"><tt>&#92;x</tt><i>{h...h}</i></td>

   101  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>h...h</i>

   102  *         ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}

   103  *         &nbsp;&lt;=&nbsp;<tt>0x</tt><i>h...h</i>&nbsp;&lt;=&nbsp

   104  *          {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>

   105  * <tr><td valign="top" headers="matches"><tt>\t</tt></td>

   106  *     <td headers="matches">The tab character (<tt>'&#92;u0009'</tt>)</td></tr>

   107  * <tr><td valign="top" headers="construct characters"><tt>\n</tt></td>

   108  *     <td headers="matches">The newline (line feed) character (<tt>'&#92;u000A'</tt>)</td></tr>

   109  * <tr><td valign="top" headers="construct characters"><tt>\r</tt></td>

   110  *     <td headers="matches">The carriage-return character (<tt>'&#92;u000D'</tt>)</td></tr>

   111  * <tr><td valign="top" headers="construct characters"><tt>\f</tt></td>

   112  *     <td headers="matches">The form-feed character (<tt>'&#92;u000C'</tt>)</td></tr>

   113  * <tr><td valign="top" headers="construct characters"><tt>\a</tt></td>

   114  *     <td headers="matches">The alert (bell) character (<tt>'&#92;u0007'</tt>)</td></tr>

   115  * <tr><td valign="top" headers="construct characters"><tt>\e</tt></td>

   116  *     <td headers="matches">The escape character (<tt>'&#92;u001B'</tt>)</td></tr>

   117  * <tr><td valign="top" headers="construct characters"><tt>\c</tt><i>x</i></td>

   118  *     <td headers="matches">The control character corresponding to <i>x</i></td></tr>

   119  *

   120  * <tr><th>&nbsp;</th></tr>

   121  * <tr align="left"><th colspan="2" id="classes">Character classes</th></tr>

   122  *

   123  * <tr><td valign="top" headers="construct classes"><tt>[abc]</tt></td>

   124  *     <td headers="matches"><tt>a</tt>, <tt>b</tt>, or <tt>c</tt> (simple class)</td></tr>

   125  * <tr><td valign="top" headers="construct classes"><tt>[^abc]</tt></td>

   126  *     <td headers="matches">Any character except <tt>a</tt>, <tt>b</tt>, or <tt>c</tt> (negation)</td></tr>

   127  * <tr><td valign="top" headers="construct classes"><tt>[a-zA-Z]</tt></td>

   128  *     <td headers="matches"><tt>a</tt> through <tt>z</tt>

   129  *         or <tt>A</tt> through <tt>Z</tt>, inclusive (range)</td></tr>

   130  * <tr><td valign="top" headers="construct classes"><tt>[a-d[m-p]]</tt></td>

   131  *     <td headers="matches"><tt>a</tt> through <tt>d</tt>,

   132  *      or <tt>m</tt> through <tt>p</tt>: <tt>[a-dm-p]</tt> (union)</td></tr>

   133  * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[def]]</tt></td>

   134  *     <td headers="matches"><tt>d</tt>, <tt>e</tt>, or <tt>f</tt> (intersection)</tr>

   135  * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[^bc]]</tt></td>

   136  *     <td headers="matches"><tt>a</tt> through <tt>z</tt>,

   137  *         except for <tt>b</tt> and <tt>c</tt>: <tt>[ad-z]</tt> (subtraction)</td></tr>

   138  * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[^m-p]]</tt></td>

   139  *     <td headers="matches"><tt>a</tt> through <tt>z</tt>,

   140  *          and not <tt>m</tt> through <tt>p</tt>: <tt>[a-lq-z]</tt>(subtraction)</td></tr>

   141  * <tr><th>&nbsp;</th></tr>

   142  *

   143  * <tr align="left"><th colspan="2" id="predef">Predefined character classes</th></tr>

   144  *

   145  * <tr><td valign="top" headers="construct predef"><tt>.</tt></td>

   146  *     <td headers="matches">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>

   147  * <tr><td valign="top" headers="construct predef"><tt>\d</tt></td>

   148  *     <td headers="matches">A digit: <tt>[0-9]</tt></td></tr>

   149  * <tr><td valign="top" headers="construct predef"><tt>\D</tt></td>

   150  *     <td headers="matches">A non-digit: <tt>[^0-9]</tt></td></tr>

   151  * <tr><td valign="top" headers="construct predef"><tt>\s</tt></td>

   152  *     <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>

   153  * <tr><td valign="top" headers="construct predef"><tt>\S</tt></td>

   154  *     <td headers="matches">A non-whitespace character: <tt>[^\s]</tt></td></tr>

   155  * <tr><td valign="top" headers="construct predef"><tt>\w</tt></td>

   156  *     <td headers="matches">A word character: <tt>[a-zA-Z_0-9]</tt></td></tr>

   157  * <tr><td valign="top" headers="construct predef"><tt>\W</tt></td>

   158  *     <td headers="matches">A non-word character: <tt>[^\w]</tt></td></tr>

   159  *

   160  * <tr><th>&nbsp;</th></tr>

   161  * <tr align="left"><th colspan="2" id="posix">POSIX character classes</b> (US-ASCII only)<b></th></tr>

   162  *

   163  * <tr><td valign="top" headers="construct posix"><tt>\p{Lower}</tt></td>

   164  *     <td headers="matches">A lower-case alphabetic character: <tt>[a-z]</tt></td></tr>

   165  * <tr><td valign="top" headers="construct posix"><tt>\p{Upper}</tt></td>

   166  *     <td headers="matches">An upper-case alphabetic character:<tt>[A-Z]</tt></td></tr>

   167  * <tr><td valign="top" headers="construct posix"><tt>\p{ASCII}</tt></td>

   168  *     <td headers="matches">All ASCII:<tt>[\x00-\x7F]</tt></td></tr>

   169  * <tr><td valign="top" headers="construct posix"><tt>\p{Alpha}</tt></td>

   170  *     <td headers="matches">An alphabetic character:<tt>[\p{Lower}\p{Upper}]</tt></td></tr>

   171  * <tr><td valign="top" headers="construct posix"><tt>\p{Digit}</tt></td>

   172  *     <td headers="matches">A decimal digit: <tt>[0-9]</tt></td></tr>

   173  * <tr><td valign="top" headers="construct posix"><tt>\p{Alnum}</tt></td>

   174  *     <td headers="matches">An alphanumeric character:<tt>[\p{Alpha}\p{Digit}]</tt></td></tr>

   175  * <tr><td valign="top" headers="construct posix"><tt>\p{Punct}</tt></td>

   176  *     <td headers="matches">Punctuation: One of <tt>!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~</tt></td></tr>

   177  *     <!-- <tt>[\!"#\$%&'\(\)\*\+,\-\./:;\<=\>\?@\[\\\]\^_`\{\|\}~]</tt>

   178  *          <tt>[\X21-\X2F\X31-\X40\X5B-\X60\X7B-\X7E]</tt> -->

   179  * <tr><td valign="top" headers="construct posix"><tt>\p{Graph}</tt></td>

   180  *     <td headers="matches">A visible character: <tt>[\p{Alnum}\p{Punct}]</tt></td></tr>

   181  * <tr><td valign="top" headers="construct posix"><tt>\p{Print}</tt></td>

   182  *     <td headers="matches">A printable character: <tt>[\p{Graph}\x20]</tt></td></tr>

   183  * <tr><td valign="top" headers="construct posix"><tt>\p{Blank}</tt></td>

   184  *     <td headers="matches">A space or a tab: <tt>[ \t]</tt></td></tr>

   185  * <tr><td valign="top" headers="construct posix"><tt>\p{Cntrl}</tt></td>

   186  *     <td headers="matches">A control character: <tt>[\x00-\x1F\x7F]</tt></td></tr>

   187  * <tr><td valign="top" headers="construct posix"><tt>\p{XDigit}</tt></td>

   188  *     <td headers="matches">A hexadecimal digit: <tt>[0-9a-fA-F]</tt></td></tr>

   189  * <tr><td valign="top" headers="construct posix"><tt>\p{Space}</tt></td>

   190  *     <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>

   191  *

   192  * <tr><th>&nbsp;</th></tr>

   193  * <tr align="left"><th colspan="2">java.lang.Character classes (simple <a href="#jcc">java character type</a>)</th></tr>

   194  *

   195  * <tr><td valign="top"><tt>\p{javaLowerCase}</tt></td>

   196  *     <td>Equivalent to java.lang.Character.isLowerCase()</td></tr>

   197  * <tr><td valign="top"><tt>\p{javaUpperCase}</tt></td>

   198  *     <td>Equivalent to java.lang.Character.isUpperCase()</td></tr>

   199  * <tr><td valign="top"><tt>\p{javaWhitespace}</tt></td>

   200  *     <td>Equivalent to java.lang.Character.isWhitespace()</td></tr>

   201  * <tr><td valign="top"><tt>\p{javaMirrored}</tt></td>

   202  *     <td>Equivalent to java.lang.Character.isMirrored()</td></tr>

   203  *

   204  * <tr><th>&nbsp;</th></tr>

   205  * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr>

   206  * * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td>

   207  *     <td headers="matches">A Latin&nbsp;script character (<a href="#usc">script</a>)</td></tr>

   208  * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>

   209  *     <td headers="matches">A character in the Greek&nbsp;block (<a href="#ubc">block</a>)</td></tr>

   210  * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>

   211  *     <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr>

   212  * <tr><td valign="top" headers="construct unicode"><tt>\p{IsAlphabetic}</tt></td>

   213  *     <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>

   214  * <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>

   215  *     <td headers="matches">A currency symbol</td></tr>

   216  * <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>

   217  *     <td headers="matches">Any character except one in the Greek block (negation)</td></tr>

   218  * <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&&[^\p{Lu}]]&nbsp;</tt></td>

   219  *     <td headers="matches">Any letter except an uppercase letter (subtraction)</td></tr>

   220  *

   221  * <tr><th>&nbsp;</th></tr>

   222  * <tr align="left"><th colspan="2" id="bounds">Boundary matchers</th></tr>

   223  *

   224  * <tr><td valign="top" headers="construct bounds"><tt>^</tt></td>

   225  *     <td headers="matches">The beginning of a line</td></tr>

   226  * <tr><td valign="top" headers="construct bounds"><tt>$</tt></td>

   227  *     <td headers="matches">The end of a line</td></tr>

   228  * <tr><td valign="top" headers="construct bounds"><tt>\b</tt></td>

   229  *     <td headers="matches">A word boundary</td></tr>

   230  * <tr><td valign="top" headers="construct bounds"><tt>\B</tt></td>

   231  *     <td headers="matches">A non-word boundary</td></tr>

   232  * <tr><td valign="top" headers="construct bounds"><tt>\A</tt></td>

   233  *     <td headers="matches">The beginning of the input</td></tr>

   234  * <tr><td valign="top" headers="construct bounds"><tt>\G</tt></td>

   235  *     <td headers="matches">The end of the previous match</td></tr>

   236  * <tr><td valign="top" headers="construct bounds"><tt>\Z</tt></td>

   237  *     <td headers="matches">The end of the input but for the final

   238  *         <a href="#lt">terminator</a>, if&nbsp;any</td></tr>

   239  * <tr><td valign="top" headers="construct bounds"><tt>\z</tt></td>

   240  *     <td headers="matches">The end of the input</td></tr>

   241  *

   242  * <tr><th>&nbsp;</th></tr>

   243  * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>

   244  *

   245  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>?</tt></td>

   246  *     <td headers="matches"><i>X</i>, once or not at all</td></tr>

   247  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>*</tt></td>

   248  *     <td headers="matches"><i>X</i>, zero or more times</td></tr>

   249  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>+</tt></td>

   250  *     <td headers="matches"><i>X</i>, one or more times</td></tr>

   251  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>}</tt></td>

   252  *     <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>

   253  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>,}</tt></td>

   254  *     <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>

   255  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}</tt></td>

   256  *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>

   257  *

   258  * <tr><th>&nbsp;</th></tr>

   259  * <tr align="left"><th colspan="2" id="reluc">Reluctant quantifiers</th></tr>

   260  *

   261  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>??</tt></td>

   262  *     <td headers="matches"><i>X</i>, once or not at all</td></tr>

   263  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>*?</tt></td>

   264  *     <td headers="matches"><i>X</i>, zero or more times</td></tr>

   265  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>+?</tt></td>

   266  *     <td headers="matches"><i>X</i>, one or more times</td></tr>

   267  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>}?</tt></td>

   268  *     <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>

   269  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>,}?</tt></td>

   270  *     <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>

   271  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}?</tt></td>

   272  *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>

   273  *

   274  * <tr><th>&nbsp;</th></tr>

   275  * <tr align="left"><th colspan="2" id="poss">Possessive quantifiers</th></tr>

   276  *

   277  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>?+</tt></td>

   278  *     <td headers="matches"><i>X</i>, once or not at all</td></tr>

   279  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>*+</tt></td>

   280  *     <td headers="matches"><i>X</i>, zero or more times</td></tr>

   281  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>++</tt></td>

   282  *     <td headers="matches"><i>X</i>, one or more times</td></tr>

   283  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>}+</tt></td>

   284  *     <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>

   285  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,}+</tt></td>

   286  *     <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>

   287  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}+</tt></td>

   288  *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>

   289  *

   290  * <tr><th>&nbsp;</th></tr>

   291  * <tr align="left"><th colspan="2" id="logical">Logical operators</th></tr>

   292  *

   293  * <tr><td valign="top" headers="construct logical"><i>XY</i></td>

   294  *     <td headers="matches"><i>X</i> followed by <i>Y</i></td></tr>

   295  * <tr><td valign="top" headers="construct logical"><i>X</i><tt>|</tt><i>Y</i></td>

   296  *     <td headers="matches">Either <i>X</i> or <i>Y</i></td></tr>

   297  * <tr><td valign="top" headers="construct logical"><tt>(</tt><i>X</i><tt>)</tt></td>

   298  *     <td headers="matches">X, as a <a href="#cg">capturing group</a></td></tr>

   299  *

   300  * <tr><th>&nbsp;</th></tr>

   301  * <tr align="left"><th colspan="2" id="backref">Back references</th></tr>

   302  *

   303  * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>n</i></td>

   304  *     <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup>

   305  *     <a href="#cg">capturing group</a> matched</td></tr>

   306  *

   307  * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>k</i>&lt;<i>name</i>&gt;</td>

   308  *     <td valign="bottom" headers="matches">Whatever the

   309  *     <a href="#groupname">named-capturing group</a> "name" matched</td></tr>

   310  *

   311  * <tr><th>&nbsp;</th></tr>

   312  * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>

   313  *

   314  * <tr><td valign="top" headers="construct quot"><tt>\</tt></td>

   315  *     <td headers="matches">Nothing, but quotes the following character</td></tr>

   316  * <tr><td valign="top" headers="construct quot"><tt>\Q</tt></td>

   317  *     <td headers="matches">Nothing, but quotes all characters until <tt>\E</tt></td></tr>

   318  * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>

   319  *     <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>

   320  *     <!-- Metachars: !$()*+.<>?[\]^{|} -->

   321  *

   322  * <tr><th>&nbsp;</th></tr>

   323  * <tr align="left"><th colspan="2" id="special">Special constructs (named-capturing and non-capturing)</th></tr>

   324  *

   325  * <tr><td valign="top" headers="construct special"><tt>(?&lt;<a href="#groupname">name</a>&gt;</tt><i>X</i><tt>)</tt></td>

   326  *     <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>

   327  * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>

   328  *     <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>

   329  * <tr><td valign="top" headers="construct special"><tt>(?idmsuxU-idmsuxU)&nbsp;</tt></td>

   330  *     <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>

   331  * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>

   332  * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a>

   333  * on - off</td></tr>

   334  * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt>&nbsp;&nbsp;</td>

   335  *     <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the

   336  *         given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>

   337  * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a >

   338  * <a href="#COMMENTS">x</a> on - off</td></tr>

   339  * <tr><td valign="top" headers="construct special"><tt>(?=</tt><i>X</i><tt>)</tt></td>

   340  *     <td headers="matches"><i>X</i>, via zero-width positive lookahead</td></tr>

   341  * <tr><td valign="top" headers="construct special"><tt>(?!</tt><i>X</i><tt>)</tt></td>

   342  *     <td headers="matches"><i>X</i>, via zero-width negative lookahead</td></tr>

   343  * <tr><td valign="top" headers="construct special"><tt>(?&lt;=</tt><i>X</i><tt>)</tt></td>

   344  *     <td headers="matches"><i>X</i>, via zero-width positive lookbehind</td></tr>

   345  * <tr><td valign="top" headers="construct special"><tt>(?&lt;!</tt><i>X</i><tt>)</tt></td>

   346  *     <td headers="matches"><i>X</i>, via zero-width negative lookbehind</td></tr>

   347  * <tr><td valign="top" headers="construct special"><tt>(?&gt;</tt><i>X</i><tt>)</tt></td>

   348  *     <td headers="matches"><i>X</i>, as an independent, non-capturing group</td></tr>

   349  *

   350  * </table>

   351  *

   352  * <hr>

   353  *

   354  *

   355  * <a name="bs">

   356  * <h4> Backslashes, escapes, and quoting </h4>

   357  *

   358  * <p> The backslash character (<tt>'\'</tt>) serves to introduce escaped

   359  * constructs, as defined in the table above, as well as to quote characters

   360  * that otherwise would be interpreted as unescaped constructs.  Thus the

   361  * expression <tt>\\</tt> matches a single backslash and <tt>\{</tt> matches a

   362  * left brace.

   363  *

   364  * <p> It is an error to use a backslash prior to any alphabetic character that

   365  * does not denote an escaped construct; these are reserved for future

   366  * extensions to the regular-expression language.  A backslash may be used

   367  * prior to a non-alphabetic character regardless of whether that character is

   368  * part of an unescaped construct.

   369  *

   370  * <p> Backslashes within string literals in Java source code are interpreted

   371  * as required by

   372  * <cite>The Java&trade; Language Specification</cite>

   373  * as either Unicode escapes (section 3.3) or other character escapes (section 3.10.6)

   374  * It is therefore necessary to double backslashes in string

   375  * literals that represent regular expressions to protect them from

   376  * interpretation by the Java bytecode compiler.  The string literal

   377  * <tt>"&#92;b"</tt>, for example, matches a single backspace character when

   378  * interpreted as a regular expression, while <tt>"&#92;&#92;b"</tt> matches a

   379  * word boundary.  The string literal <tt>"&#92;(hello&#92;)"</tt> is illegal

   380  * and leads to a compile-time error; in order to match the string

   381  * <tt>(hello)</tt> the string literal <tt>"&#92;&#92;(hello&#92;&#92;)"</tt>

   382  * must be used.

   383  *

   384  * <a name="cc">

   385  * <h4> Character Classes </h4>

   386  *

   387  *    <p> Character classes may appear within other character classes, and

   388  *    may be composed by the union operator (implicit) and the intersection

   389  *    operator (<tt>&amp;&amp;</tt>).

   390  *    The union operator denotes a class that contains every character that is

   391  *    in at least one of its operand classes.  The intersection operator

   392  *    denotes a class that contains every character that is in both of its

   393  *    operand classes.

   394  *

   395  *    <p> The precedence of character-class operators is as follows, from

   396  *    highest to lowest:

   397  *

   398  *    <blockquote><table border="0" cellpadding="1" cellspacing="0"

   399  *                 summary="Precedence of character class operators.">

   400  *      <tr><th>1&nbsp;&nbsp;&nbsp;&nbsp;</th>

   401  *        <td>Literal escape&nbsp;&nbsp;&nbsp;&nbsp;</td>

   402  *        <td><tt>\x</tt></td></tr>

   403  *     <tr><th>2&nbsp;&nbsp;&nbsp;&nbsp;</th>

   404  *        <td>Grouping</td>

   405  *        <td><tt>[...]</tt></td></tr>

   406  *     <tr><th>3&nbsp;&nbsp;&nbsp;&nbsp;</th>

   407  *        <td>Range</td>

   408  *        <td><tt>a-z</tt></td></tr>

   409  *      <tr><th>4&nbsp;&nbsp;&nbsp;&nbsp;</th>

   410  *        <td>Union</td>

   411  *        <td><tt>[a-e][i-u]</tt></td></tr>

   412  *      <tr><th>5&nbsp;&nbsp;&nbsp;&nbsp;</th>

   413  *        <td>Intersection</td>

   414  *        <td><tt>[a-z&&[aeiou]]</tt></td></tr>

   415  *    </table></blockquote>

   416  *

   417  *    <p> Note that a different set of metacharacters are in effect inside

   418  *    a character class than outside a character class. For instance, the

   419  *    regular expression <tt>.</tt> loses its special meaning inside a

   420  *    character class, while the expression <tt>-</tt> becomes a range

   421  *    forming metacharacter.

   422  *

   423  * <a name="lt">

   424  * <h4> Line terminators </h4>

   425  *

   426  * <p> A <i>line terminator</i> is a one- or two-character sequence that marks

   427  * the end of a line of the input character sequence.  The following are

   428  * recognized as line terminators:

   429  *

   430  * <ul>

   431  *

   432  *   <li> A newline (line feed) character&nbsp;(<tt>'\n'</tt>),

   433  *

   434  *   <li> A carriage-return character followed immediately by a newline

   435  *   character&nbsp;(<tt>"\r\n"</tt>),

   436  *

   437  *   <li> A standalone carriage-return character&nbsp;(<tt>'\r'</tt>),

   438  *

   439  *   <li> A next-line character&nbsp;(<tt>'&#92;u0085'</tt>),

   440  *

   441  *   <li> A line-separator character&nbsp;(<tt>'&#92;u2028'</tt>), or

   442  *

   443  *   <li> A paragraph-separator character&nbsp;(<tt>'&#92;u2029</tt>).

   444  *

   445  * </ul>

   446  * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators

   447  * recognized are newline characters.

   448  *

   449  * <p> The regular expression <tt>.</tt> matches any character except a line

   450  * terminator unless the {@link #DOTALL} flag is specified.

   451  *

   452  * <p> By default, the regular expressions <tt>^</tt> and <tt>$</tt> ignore

   453  * line terminators and only match at the beginning and the end, respectively,

   454  * of the entire input sequence. If {@link #MULTILINE} mode is activated then

   455  * <tt>^</tt> matches at the beginning of input and after any line terminator

   456  * except at the end of input. When in {@link #MULTILINE} mode <tt>$</tt>

   457  * matches just before a line terminator or the end of the input sequence.

   458  *

   459  * <a name="cg">

   460  * <h4> Groups and capturing </h4>

   461  *

   462  * <a name="gnumber">

   463  * <h5> Group number </h5>

   464  * <p> Capturing groups are numbered by counting their opening parentheses from

   465  * left to right.  In the expression <tt>((A)(B(C)))</tt>, for example, there

   466  * are four such groups: </p>

   467  *

   468  * <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">

   469  * <tr><th>1&nbsp;&nbsp;&nbsp;&nbsp;</th>

   470  *     <td><tt>((A)(B(C)))</tt></td></tr>

   471  * <tr><th>2&nbsp;&nbsp;&nbsp;&nbsp;</th>

   472  *     <td><tt>(A)</tt></td></tr>

   473  * <tr><th>3&nbsp;&nbsp;&nbsp;&nbsp;</th>

   474  *     <td><tt>(B(C))</tt></td></tr>

   475  * <tr><th>4&nbsp;&nbsp;&nbsp;&nbsp;</th>

   476  *     <td><tt>(C)</tt></td></tr>

   477  * </table></blockquote>

   478  *

   479  * <p> Group zero always stands for the entire expression.

   480  *

   481  * <p> Capturing groups are so named because, during a match, each subsequence

   482  * of the input sequence that matches such a group is saved.  The captured

   483  * subsequence may be used later in the expression, via a back reference, and

   484  * may also be retrieved from the matcher once the match operation is complete.

   485  *

   486  * <a name="groupname">

   487  * <h5> Group name </h5>

   488  * <p>A capturing group can also be assigned a "name", a <tt>named-capturing group</tt>,

   489  * and then be back-referenced later by the "name". Group names are composed of

   490  * the following characters. The first character must be a <tt>letter</tt>.

   491  *

   492  * <ul>

   493  *   <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt>

   494  *        (<tt>'&#92;u0041'</tt>&nbsp;through&nbsp;<tt>'&#92;u005a'</tt>),

   495  *   <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt>

   496  *        (<tt>'&#92;u0061'</tt>&nbsp;through&nbsp;<tt>'&#92;u007a'</tt>),

   497  *   <li> The digits <tt>'0'</tt> through <tt>'9'</tt>

   498  *        (<tt>'&#92;u0030'</tt>&nbsp;through&nbsp;<tt>'&#92;u0039'</tt>),

   499  * </ul>

   500  *

   501  * <p> A <tt>named-capturing group</tt> is still numbered as described in

   502  * <a href="#gnumber">Group number</a>.

   503  *

   504  * <p> The captured input associated with a group is always the subsequence

   505  * that the group most recently matched.  If a group is evaluated a second time

   506  * because of quantification then its previously-captured value, if any, will

   507  * be retained if the second evaluation fails.  Matching the string

   508  * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves

   509  * group two set to <tt>"b"</tt>.  All captured input is discarded at the

   510  * beginning of each match.

   511  *

   512  * <p> Groups beginning with <tt>(?</tt> are either pure, <i>non-capturing</i> groups

   513  * that do not capture text and do not count towards the group total, or

   514  * <i>named-capturing</i> group.

   515  *

   516  * <h4> Unicode support </h4>

   517  *

   518  * <p> This class is in conformance with Level 1 of <a

   519  * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical

   520  * Standard #18: Unicode Regular Expression</i></a>, plus RL2.1

   521  * Canonical Equivalents.

   522  * <p>

   523  * <b>Unicode escape sequences</b> such as <tt>&#92;u2014</tt> in Java source code

   524  * are processed as described in section 3.3 of

   525  * <cite>The Java&trade; Language Specification</cite>.

   526  * Such escape sequences are also implemented directly by the regular-expression

   527  * parser so that Unicode escapes can be used in expressions that are read from

   528  * files or from the keyboard.  Thus the strings <tt>"&#92;u2014"</tt> and

   529  * <tt>"\\u2014"</tt>, while not equal, compile into the same pattern, which

   530  * matches the character with hexadecimal value <tt>0x2014</tt>.

   531  * <p>

   532  * A Unicode character can also be represented in a regular-expression by

   533  * using its <b>Hex notation</b>(hexadecimal code point value) directly as described in construct

   534  * <tt>&#92;x{...}</tt>, for example a supplementary character U+2011F

   535  * can be specified as <tt>&#92;x{2011F}</tt>, instead of two consecutive

   536  * Unicode escape sequences of the surrogate pair

   537  * <tt>&#92;uD840</tt><tt>&#92;uDD1F</tt>.

   538  * <p>

   539  * Unicode scripts, blocks, categories and binary properties are written with

   540  * the <tt>\p</tt> and <tt>\P</tt> constructs as in Perl.

   541  * <tt>\p{</tt><i>prop</i><tt>}</tt> matches if

   542  * the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>

   543  * does not match if the input has that property.

   544  * <p>

   545  * Scripts, blocks, categories and binary properties can be used both inside

   546  * and outside of a character class.

   547  * <a name="usc">

   548  * <p>

   549  * <b>Scripts</b> are specified either with the prefix {@code Is}, as in

   550  * {@code IsHiragana}, or by using  the {@code script} keyword (or its short

   551  * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}.

   552  * <p>

   553  * The script names supported by <code>Pattern</code> are the valid script names

   554  * accepted and defined by

   555  * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.

   556  * <a name="ubc">

   557  * <p>

   558  * <b>Blocks</b> are specified with the prefix {@code In}, as in

   559  * {@code InMongolian}, or by using the keyword {@code block} (or its short

   560  * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.

   561  * <p>

   562  * The block names supported by <code>Pattern</code> are the valid block names

   563  * accepted and defined by

   564  * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.

   565  * <p>

   566  * <a name="ucc">

   567  * <b>Categories</b> may be specified with the optional prefix {@code Is}:

   568  * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode

   569  * letters. Same as scripts and blocks, categories can also be specified

   570  * by using the keyword {@code general_category} (or its short form

   571  * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.

   572  * <p>

   573  * The supported categories are those of

   574  * <a href="http://www.unicode.org/unicode/standard/standard.html">

   575  * <i>The Unicode Standard</i></a> in the version specified by the

   576  * {@link java.lang.Character Character} class. The category names are those

   577  * defined in the Standard, both normative and informative.

   578  * <p>

   579  * <a name="ubpc">

   580  * <b>Binary properties</b> are specified with the prefix {@code Is}, as in

   581  * {@code IsAlphabetic}. The supported binary properties by <code>Pattern</code>

   582  * are

   583  * <ul>

   584  *   <li> Alphabetic

   585  *   <li> Ideographic

   586  *   <li> Letter

   587  *   <li> Lowercase

   588  *   <li> Uppercase

   589  *   <li> Titlecase

   590  *   <li> Punctuation

   591  *   <Li> Control

   592  *   <li> White_Space

   593  *   <li> Digit

   594  *   <li> Hex_Digit

   595  *   <li> Noncharacter_Code_Point

   596  *   <li> Assigned

   597  * </ul>

   600  * <p>

   601  * <b>Predefined Character classes</b> and <b>POSIX character classes</b> are in

   602  * conformance with the recommendation of <i>Annex C: Compatibility Properties</i>

   603  * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expression

   604  * </i></a>, when {@link #UNICODE_CHARACTER_CLASS} flag is specified.

   605  * <p>

   606  * <table border="0" cellpadding="1" cellspacing="0"

   607  *  summary="predefined and posix character classes in Unicode mode">

   608  * <tr align="left">

   609  * <th bgcolor="#CCCCFF" align="left" id="classes">Classes</th>

   610  * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>

   611  *</tr>

   612  * <tr><td><tt>\p{Lower}</tt></td>

   613  *     <td>A lowercase character:<tt>\p{IsLowercase}</tt></td></tr>

   614  * <tr><td><tt>\p{Upper}</tt></td>

   615  *     <td>An uppercase character:<tt>\p{IsUppercase}</tt></td></tr>

   616  * <tr><td><tt>\p{ASCII}</tt></td>

   617  *     <td>All ASCII:<tt>[\x00-\x7F]</tt></td></tr>

   618  * <tr><td><tt>\p{Alpha}</tt></td>

   619  *     <td>An alphabetic character:<tt>\p{IsAlphabetic}</tt></td></tr>

   620  * <tr><td><tt>\p{Digit}</tt></td>

   621  *     <td>A decimal digit character:<tt>p{IsDigit}</tt></td></tr>

   622  * <tr><td><tt>\p{Alnum}</tt></td>

   623  *     <td>An alphanumeric character:<tt>[\p{IsAlphabetic}\p{IsDigit}]</tt></td></tr>

   624  * <tr><td><tt>\p{Punct}</tt></td>

   625  *     <td>A punctuation character:<tt>p{IsPunctuation}</tt></td></tr>

   626  * <tr><td><tt>\p{Graph}</tt></td>

   627  *     <td>A visible character: <tt>[^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]</tt></td></tr>

   628  * <tr><td><tt>\p{Print}</tt></td>

   629  *     <td>A printable character: <tt>[\p{Graph}\p{Blank}&&[^\p{Cntrl}]]</tt></td></tr>

   630  * <tr><td><tt>\p{Blank}</tt></td>

   631  *     <td>A space or a tab: <tt>[\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]</tt></td></tr>

   632  * <tr><td><tt>\p{Cntrl}</tt></td>

   633  *     <td>A control character: <tt>\p{gc=Cc}</tt></td></tr>

   634  * <tr><td><tt>\p{XDigit}</tt></td>

   635  *     <td>A hexadecimal digit: <tt>[\p{gc=Nd}\p{IsHex_Digit}]</tt></td></tr>

   636  * <tr><td><tt>\p{Space}</tt></td>

   637  *     <td>A whitespace character:<tt>\p{IsWhite_Space}</tt></td></tr>

   638  * <tr><td><tt>\d</tt></td>

   639  *     <td>A digit: <tt>\p{IsDigit}</tt></td></tr>

   640  * <tr><td><tt>\D</tt></td>

   641  *     <td>A non-digit: <tt>[^\d]</tt></td></tr>

   642  * <tr><td><tt>\s</tt></td>

   643  *     <td>A whitespace character: <tt>\p{IsWhite_Space}</tt></td></tr>

   644  * <tr><td><tt>\S</tt></td>

   645  *     <td>A non-whitespace character: <tt>[^\s]</tt></td></tr>

   646  * <tr><td><tt>\w</tt></td>

   647  *     <td>A word character: <tt>[\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]</tt></td></tr>

   648  * <tr><td><tt>\W</tt></td>

   649  *     <td>A non-word character: <tt>[^\w]</tt></td></tr>

   650  * </table>

   651  * <p>

   652  * <a name="jcc">

   653  * Categories that behave like the java.lang.Character

   654  * boolean is<i>methodname</i> methods (except for the deprecated ones) are

   655  * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where

   656  * the specified property has the name <tt>java<i>methodname</i></tt>.

   657  *

   658  * <h4> Comparison to Perl 5 </h4>

   659  *

   660  * <p>The <code>Pattern</code> engine performs traditional NFA-based matching

   661  * with ordered alternation as occurs in Perl 5.

   662  *

   663  * <p> Perl constructs not supported by this class: </p>

   664  *

   665  * <ul>

   666  *    <li><p> Predefined character classes (Unicode character)

   667  *    <p><tt>\h&nbsp;&nbsp;&nbsp;&nbsp;</tt>A horizontal whitespace

   668  *    <p><tt>\H&nbsp;&nbsp;&nbsp;&nbsp;</tt>A non horizontal whitespace

   669  *    <p><tt>\v&nbsp;&nbsp;&nbsp;&nbsp;</tt>A vertical whitespace

   670  *    <p><tt>\V&nbsp;&nbsp;&nbsp;&nbsp;</tt>A non vertical whitespace

   671  *    <p><tt>\R&nbsp;&nbsp;&nbsp;&nbsp;</tt>Any Unicode linebreak sequence

   672  *    <tt>\u005cu000D\u005cu000A|[\u005cu000A\u005cu000B\u005cu000C\u005cu000D\u005cu0085\u005cu2028\u005cu2029]</tt>

   673  *    <p><tt>\X&nbsp;&nbsp;&nbsp;&nbsp;</tt>Match Unicode

   674  *    <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">

   675  *    <i>extended grapheme cluster</i></a>

   676  *    </p></li>

   677  *

   678  *    <li><p> The backreference constructs, <tt>\g{</tt><i>n</i><tt>}</tt> for

   679  *    the <i>n</i><sup>th</sup><a href="#cg">capturing group</a> and

   680  *    <tt>\g{</tt><i>name</i><tt>}</tt> for

   681  *    <a href="#groupname">named-capturing group</a>.

   682  *    </p></li>

   683  *

   684  *    <li><p> The named character construct, <tt>\N{</tt><i>name</i><tt>}</tt>

   685  *    for a Unicode character by its name.

   686  *    </p></li>

   687  *

   688  *    <li><p> The conditional constructs

   689  *    <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>)</tt> and

   690  *    <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>|</tt><i>Y</i><tt>)</tt>,

   691  *    </p></li>

   692  *

   693  *    <li><p> The embedded code constructs <tt>(?{</tt><i>code</i><tt>})</tt>

   694  *    and <tt>(??{</tt><i>code</i><tt>})</tt>,</p></li>

   695  *

   696  *    <li><p> The embedded comment syntax <tt>(?#comment)</tt>, and </p></li>

   697  *

   698  *    <li><p> The preprocessing operations <tt>\l</tt> <tt>&#92;u</tt>,

   699  *    <tt>\L</tt>, and <tt>\U</tt>.  </p></li>

   700  *

   701  * </ul>

   702  *

   703  * <p> Constructs supported by this class but not by Perl: </p>

   704  *

   705  * <ul>

   706  *

   707  *    <li><p> Character-class union and intersection as described

   708  *    <a href="#cc">above</a>.</p></li>

   709  *

   710  * </ul>

   711  *

   712  * <p> Notable differences from Perl: </p>

   713  *

   714  * <ul>

   715  *

   716  *    <li><p> In Perl, <tt>\1</tt> through <tt>\9</tt> are always interpreted

   717  *    as back references; a backslash-escaped number greater than <tt>9</tt> is

   718  *    treated as a back reference if at least that many subexpressions exist,

   719  *    otherwise it is interpreted, if possible, as an octal escape.  In this

   720  *    class octal escapes must always begin with a zero. In this class,

   721  *    <tt>\1</tt> through <tt>\9</tt> are always interpreted as back

   722  *    references, and a larger number is accepted as a back reference if at

   723  *    least that many subexpressions exist at that point in the regular

   724  *    expression, otherwise the parser will drop digits until the number is

   725  *    smaller or equal to the existing number of groups or it is one digit.

   726  *    </p></li>

   727  *

   728  *    <li><p> Perl uses the <tt>g</tt> flag to request a match that resumes

   729  *    where the last match left off.  This functionality is provided implicitly

   730  *    by the {@link Matcher} class: Repeated invocations of the {@link

   731  *    Matcher#find find} method will resume where the last match left off,

   732  *    unless the matcher is reset.  </p></li>

   733  *

   734  *    <li><p> In Perl, embedded flags at the top level of an expression affect

   735  *    the whole expression.  In this class, embedded flags always take effect

   736  *    at the point at which they appear, whether they are at the top level or

   737  *    within a group; in the latter case, flags are restored at the end of the

   738  *    group just as in Perl.  </p></li>

   739  *

   740  * </ul>

   741  *

   742  *

   743  * <p> For a more precise description of the behavior of regular expression

   744  * constructs, please see <a href="http://www.oreilly.com/catalog/regex3/">

   745  * <i>Mastering Regular Expressions, 3nd Edition</i>, Jeffrey E. F. Friedl,

   746  * O'Reilly and Associates, 2006.</a>

   747  * </p>

   748  *

   749  * @see java.lang.String#split(String, int)

   750  * @see java.lang.String#split(String)

   751  *

   752  * @author      Mike McCloskey

   753  * @author      Mark Reinhold

   754  * @author      JSR-51 Expert Group

   755  * @since       1.4

   756  * @spec        JSR-51

   757  */

   759 public final class Pattern

   760     implements java.io.Serializable

   761 {

   763     /**

   764      * Regular expression modifier values.  Instead of being passed as

   765      * arguments, they can also be passed as inline modifiers.

   766      * For example, the following statements have the same effect.

   767      * <pre>

   768      * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M);

   769      * RegExp r2 = RegExp.compile("(?im)abc", 0);

   770      * </pre>

   771      *

   772      * The flags are duplicated so that the familiar Perl match flag

   773      * names are available.

   774      */

   776     /**

   777      * Enables Unix lines mode.

   778      *

   779      * <p> In this mode, only the <tt>'\n'</tt> line terminator is recognized

   780      * in the behavior of <tt>.</tt>, <tt>^</tt>, and <tt>$</tt>.

   781      *

   782      * <p> Unix lines mode can also be enabled via the embedded flag

   783      * expression&nbsp;<tt>(?d)</tt>.

   784      */

   785     public static final int UNIX_LINES = 0x01;

   787     /**

   788      * Enables case-insensitive matching.

   789      *

   790      * <p> By default, case-insensitive matching assumes that only characters

   791      * in the US-ASCII charset are being matched.  Unicode-aware

   792      * case-insensitive matching can be enabled by specifying the {@link

   793      * #UNICODE_CASE} flag in conjunction with this flag.

   794      *

   795      * <p> Case-insensitive matching can also be enabled via the embedded flag

   796      * expression&nbsp;<tt>(?i)</tt>.

   797      *

   798      * <p> Specifying this flag may impose a slight performance penalty.  </p>

   799      */

   800     public static final int CASE_INSENSITIVE = 0x02;

   802     /**

   803      * Permits whitespace and comments in pattern.

   804      *

   805      * <p> In this mode, whitespace is ignored, and embedded comments starting

   806      * with <tt>#</tt> are ignored until the end of a line.

   807      *

   808      * <p> Comments mode can also be enabled via the embedded flag

   809      * expression&nbsp;<tt>(?x)</tt>.

   810      */

   811     public static final int COMMENTS = 0x04;

   813     /**

   814      * Enables multiline mode.

   815      *

   816      * <p> In multiline mode the expressions <tt>^</tt> and <tt>$</tt> match

   817      * just after or just before, respectively, a line terminator or the end of

   818      * the input sequence.  By default these expressions only match at the

   819      * beginning and the end of the entire input sequence.

   820      *

   821      * <p> Multiline mode can also be enabled via the embedded flag

   822      * expression&nbsp;<tt>(?m)</tt>.  </p>

   823      */

   824     public static final int MULTILINE = 0x08;

   826     /**

   827      * Enables literal parsing of the pattern.

   828      *

   829      * <p> When this flag is specified then the input string that specifies

   830      * the pattern is treated as a sequence of literal characters.

   831      * Metacharacters or escape sequences in the input sequence will be

   832      * given no special meaning.

   833      *

   834      * <p>The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on

   835      * matching when used in conjunction with this flag. The other flags

   836      * become superfluous.

   837      *

   838      * <p> There is no embedded flag character for enabling literal parsing.

   839      * @since 1.5

   840      */

   841     public static final int LITERAL = 0x10;

   843     /**

   844      * Enables dotall mode.

   845      *

   846      * <p> In dotall mode, the expression <tt>.</tt> matches any character,

   847      * including a line terminator.  By default this expression does not match

   848      * line terminators.

   849      *

   850      * <p> Dotall mode can also be enabled via the embedded flag

   851      * expression&nbsp;<tt>(?s)</tt>.  (The <tt>s</tt> is a mnemonic for

   852      * "single-line" mode, which is what this is called in Perl.)  </p>

   853      */

   854     public static final int DOTALL = 0x20;

   856     /**

   857      * Enables Unicode-aware case folding.

   858      *

   859      * <p> When this flag is specified then case-insensitive matching, when

   860      * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner

   861      * consistent with the Unicode Standard.  By default, case-insensitive

   862      * matching assumes that only characters in the US-ASCII charset are being

   863      * matched.

   864      *

   865      * <p> Unicode-aware case folding can also be enabled via the embedded flag

   866      * expression&nbsp;<tt>(?u)</tt>.

   867      *

   868      * <p> Specifying this flag may impose a performance penalty.  </p>

   869      */

   870     public static final int UNICODE_CASE = 0x40;

   872     /**

   873      * Enables canonical equivalence.

   874      *

   875      * <p> When this flag is specified then two characters will be considered

   876      * to match if, and only if, their full canonical decompositions match.

   877      * The expression <tt>"a&#92;u030A"</tt>, for example, will match the

   878      * string <tt>"&#92;u00E5"</tt> when this flag is specified.  By default,

   879      * matching does not take canonical equivalence into account.

   880      *

   881      * <p> There is no embedded flag character for enabling canonical

   882      * equivalence.

   883      *

   884      * <p> Specifying this flag may impose a performance penalty.  </p>

   885      */

   886     public static final int CANON_EQ = 0x80;

   888     /**

   889      * Enables the Unicode version of <i>Predefined character classes</i> and

   890      * <i>POSIX character classes</i>.

   891      *

   892      * <p> When this flag is specified then the (US-ASCII only)

   893      * <i>Predefined character classes</i> and <i>POSIX character classes</i>

   894      * are in conformance with

   895      * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical

   896      * Standard #18: Unicode Regular Expression</i></a>

   897      * <i>Annex C: Compatibility Properties</i>.

   898      * <p>

   899      * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded

   900      * flag expression&nbsp;<tt>(?U)</tt>.

   901      * <p>

   902      * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case

   903      * folding.

   904      * <p>

   905      * Specifying this flag may impose a performance penalty.  </p>

   906      * @since 1.7

   907      */

   908     public static final int UNICODE_CHARACTER_CLASS = 0x100;

   910     /* Pattern has only two serialized components: The pattern string

   911      * and the flags, which are all that is needed to recompile the pattern

   912      * when it is deserialized.

   913      */

   915     /** use serialVersionUID from Merlin b59 for interoperability */

   916     private static final long serialVersionUID = 5073258162644648461L;

   918     /**

   919      * The original regular-expression pattern string.

   920      *

   921      * @serial

   922      */

   923     private String pattern;

   925     /**

   926      * The original pattern flags.

   927      *

   928      * @serial

   929      */

   930     private int flags;

   932     /**

   933      * Boolean indicating this Pattern is compiled; this is necessary in order

   934      * to lazily compile deserialized Patterns.

   935      */

   936     private transient volatile boolean compiled = false;

   938     /**

   939      * The normalized pattern string.

   940      */

   941     private transient String normalizedPattern;

   943     /**

   944      * The starting point of state machine for the find operation.  This allows

   945      * a match to start anywhere in the input.

   946      */

   947     transient Node root;

   949     /**

   950      * The root of object tree for a match operation.  The pattern is matched

   951      * at the beginning.  This may include a find that uses BnM or a First

   952      * node.

   953      */

   954     transient Node matchRoot;

   956     /**

   957      * Temporary storage used by parsing pattern slice.

   958      */

   959     transient int[] buffer;

   961     /**

   962      * Map the "name" of the "named capturing group" to its group id

   963      * node.

   964      */

   965     transient volatile Map<String, Integer> namedGroups;

   967     /**

   968      * Temporary storage used while parsing group references.

   969      */

   970     transient GroupHead[] groupNodes;

   972     /**

   973      * Temporary null terminated code point array used by pattern compiling.

   974      */

   975     private transient int[] temp;

   977     /**

   978      * The number of capturing groups in this Pattern. Used by matchers to

   979      * allocate storage needed to perform a match.

   980      */

   981     transient int capturingGroupCount;

   983     /**

   984      * The local variable count used by parsing tree. Used by matchers to

   985      * allocate storage needed to perform a match.

   986      */

   987     transient int localCount;

   989     /**

   990      * Index into the pattern string that keeps track of how much has been

   991      * parsed.

   992      */

   993     private transient int cursor;

   995     /**

   996      * Holds the length of the pattern string.

   997      */

   998     private transient int patternLength;

  1000     /**

  1001      * If the Start node might possibly match supplementary characters.

  1002      * It is set to true during compiling if

  1003      * (1) There is supplementary char in pattern, or

  1004      * (2) There is complement node of Category or Block

  1005      */

  1006     private transient boolean hasSupplementary;

  1008     /**

  1009      * Compiles the given regular expression into a pattern.  </p>

  1010      *

  1011      * @param  regex

  1012      *         The expression to be compiled

  1013      *

  1014      * @throws  PatternSyntaxException

  1015      *          If the expression's syntax is invalid

  1016      */

  1017     public static Pattern compile(String regex) {

  1018         return new Pattern(regex, 0);

  1019     }

  1021     /**

  1022      * Compiles the given regular expression into a pattern with the given

  1023      * flags.  </p>

  1024      *

  1025      * @param  regex

  1026      *         The expression to be compiled

  1027      *

  1028      * @param  flags

  1029      *         Match flags, a bit mask that may include

  1030      *         {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},

  1031      *         {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},

  1032      *         {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}

  1033      *         and {@link #COMMENTS}

  1034      *

  1035      * @throws  IllegalArgumentException

  1036      *          If bit values other than those corresponding to the defined

  1037      *          match flags are set in <tt>flags</tt>

  1038      *

  1039      * @throws  PatternSyntaxException

  1040      *          If the expression's syntax is invalid

  1041      */

  1042     public static Pattern compile(String regex, int flags) {

  1043         return new Pattern(regex, flags);

  1044     }

  1046     /**

  1047      * Returns the regular expression from which this pattern was compiled.

  1048      * </p>

  1049      *

  1050      * @return  The source of this pattern

  1051      */

  1052     public String pattern() {

  1053         return pattern;

  1054     }

  1056     /**

  1057      * <p>Returns the string representation of this pattern. This

  1058      * is the regular expression from which this pattern was

  1059      * compiled.</p>

  1060      *

  1061      * @return  The string representation of this pattern

  1062      * @since 1.5

  1063      */

  1064     public String toString() {

  1065         return pattern;

  1066     }

  1068     /**

  1069      * Creates a matcher that will match the given input against this pattern.

  1070      * </p>

  1071      *

  1072      * @param  input

  1073      *         The character sequence to be matched

  1074      *

  1075      * @return  A new matcher for this pattern

  1076      */

  1077     public Matcher matcher(CharSequence input) {

  1078         if (!compiled) {

  1079             synchronized(this) {

  1080                 if (!compiled)

  1081                     compile();

  1082             }

  1083         }

  1084         Matcher m = new Matcher(this, input);

  1085         return m;

  1086     }

  1088     /**

  1089      * Returns this pattern's match flags.  </p>

  1090      *

  1091      * @return  The match flags specified when this pattern was compiled

  1092      */

  1093     public int flags() {

  1094         return flags;

  1095     }

  1097     /**

  1098      * Compiles the given regular expression and attempts to match the given

  1099      * input against it.

  1100      *

  1101      * <p> An invocation of this convenience method of the form

  1102      *

  1103      * <blockquote><pre>

  1104      * Pattern.matches(regex, input);</pre></blockquote>

  1105      *

  1106      * behaves in exactly the same way as the expression

  1107      *

  1108      * <blockquote><pre>

  1109      * Pattern.compile(regex).matcher(input).matches()</pre></blockquote>

  1110      *

  1111      * <p> If a pattern is to be used multiple times, compiling it once and reusing

  1112      * it will be more efficient than invoking this method each time.  </p>

  1113      *

  1114      * @param  regex

  1115      *         The expression to be compiled

  1116      *

  1117      * @param  input

  1118      *         The character sequence to be matched

  1119      *

  1120      * @throws  PatternSyntaxException

  1121      *          If the expression's syntax is invalid

  1122      */

  1123     public static boolean matches(String regex, CharSequence input) {

  1124         Pattern p = Pattern.compile(regex);

  1125         Matcher m = p.matcher(input);

  1126         return m.matches();

  1127     }

  1129     /**

  1130      * Splits the given input sequence around matches of this pattern.

  1131      *

  1132      * <p> The array returned by this method contains each substring of the

  1133      * input sequence that is terminated by another subsequence that matches

  1134      * this pattern or is terminated by the end of the input sequence.  The

  1135      * substrings in the array are in the order in which they occur in the

  1136      * input.  If this pattern does not match any subsequence of the input then

  1137      * the resulting array has just one element, namely the input sequence in

  1138      * string form.

  1139      *

  1140      * <p> The <tt>limit</tt> parameter controls the number of times the

  1141      * pattern is applied and therefore affects the length of the resulting

  1142      * array.  If the limit <i>n</i> is greater than zero then the pattern

  1143      * will be applied at most <i>n</i>&nbsp;-&nbsp;1 times, the array's

  1144      * length will be no greater than <i>n</i>, and the array's last entry

  1145      * will contain all input beyond the last matched delimiter.  If <i>n</i>

  1146      * is non-positive then the pattern will be applied as many times as

  1147      * possible and the array can have any length.  If <i>n</i> is zero then

  1148      * the pattern will be applied as many times as possible, the array can

  1149      * have any length, and trailing empty strings will be discarded.

  1150      *

  1151      * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following

  1152      * results with these parameters:

  1153      *

  1154      * <blockquote><table cellpadding=1 cellspacing=0

  1155      *              summary="Split examples showing regex, limit, and result">

  1156      * <tr><th><P align="left"><i>Regex&nbsp;&nbsp;&nbsp;&nbsp;</i></th>

  1157      *     <th><P align="left"><i>Limit&nbsp;&nbsp;&nbsp;&nbsp;</i></th>

  1158      *     <th><P align="left"><i>Result&nbsp;&nbsp;&nbsp;&nbsp;</i></th></tr>

  1159      * <tr><td align=center>:</td>

  1160      *     <td align=center>2</td>

  1161      *     <td><tt>{ "boo", "and:foo" }</tt></td></tr>

  1162      * <tr><td align=center>:</td>

  1163      *     <td align=center>5</td>

  1164      *     <td><tt>{ "boo", "and", "foo" }</tt></td></tr>

  1165      * <tr><td align=center>:</td>

  1166      *     <td align=center>-2</td>

  1167      *     <td><tt>{ "boo", "and", "foo" }</tt></td></tr>

  1168      * <tr><td align=center>o</td>

  1169      *     <td align=center>5</td>

  1170      *     <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr>

  1171      * <tr><td align=center>o</td>

  1172      *     <td align=center>-2</td>

  1173      *     <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr>

  1174      * <tr><td align=center>o</td>

  1175      *     <td align=center>0</td>

  1176      *     <td><tt>{ "b", "", ":and:f" }</tt></td></tr>

  1177      * </table></blockquote>

  1178      *

  1179      *

  1180      * @param  input

  1181      *         The character sequence to be split

  1182      *

  1183      * @param  limit

  1184      *         The result threshold, as described above

  1185      *

  1186      * @return  The array of strings computed by splitting the input

  1187      *          around matches of this pattern

  1188      */

  1189     public String[] split(CharSequence input, int limit) {

  1190         int index = 0;

  1191         boolean matchLimited = limit > 0;

  1192         ArrayList<String> matchList = new ArrayList<>();

  1193         Matcher m = matcher(input);

  1195         // Add segments before each match found

  1196         while(m.find()) {

  1197             if (!matchLimited || matchList.size() < limit - 1) {

  1198                 String match = input.subSequence(index, m.start()).toString();

  1199                 matchList.add(match);

  1200                 index = m.end();

  1201             } else if (matchList.size() == limit - 1) { // last one

  1202                 String match = input.subSequence(index,

  1203                                                  input.length()).toString();

  1204                 matchList.add(match);

  1205                 index = m.end();

  1206             }

  1207         }

  1209         // If no match was found, return this

  1210         if (index == 0)

  1211             return new String[] {input.toString()};

  1213         // Add remaining segment

  1214         if (!matchLimited || matchList.size() < limit)

  1215             matchList.add(input.subSequence(index, input.length()).toString());

  1217         // Construct result

  1218         int resultSize = matchList.size();

  1219         if (limit == 0)

  1220             while (resultSize > 0 && matchList.get(resultSize-1).equals(""))

  1221                 resultSize--;

  1222         String[] result = new String[resultSize];

  1223         return matchList.subList(0, resultSize).toArray(result);

  1224     }

  1226     /**

  1227      * Splits the given input sequence around matches of this pattern.

  1228      *

  1229      * <p> This method works as if by invoking the two-argument {@link

  1230      * #split(java.lang.CharSequence, int) split} method with the given input

  1231      * sequence and a limit argument of zero.  Trailing empty strings are

  1232      * therefore not included in the resulting array. </p>

  1233      *

  1234      * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following

  1235      * results with these expressions:

  1236      *

  1237      * <blockquote><table cellpadding=1 cellspacing=0

  1238      *              summary="Split examples showing regex and result">

  1239      * <tr><th><P align="left"><i>Regex&nbsp;&nbsp;&nbsp;&nbsp;</i></th>

  1240      *     <th><P align="left"><i>Result</i></th></tr>

  1241      * <tr><td align=center>:</td>

  1242      *     <td><tt>{ "boo", "and", "foo" }</tt></td></tr>

  1243      * <tr><td align=center>o</td>

  1244      *     <td><tt>{ "b", "", ":and:f" }</tt></td></tr>

  1245      * </table></blockquote>

  1246      *

  1247      *

  1248      * @param  input

  1249      *         The character sequence to be split

  1250      *

  1251      * @return  The array of strings computed by splitting the input

  1252      *          around matches of this pattern

  1253      */

  1254     public String[] split(CharSequence input) {

  1255         return split(input, 0);

  1256     }

  1258     /**

  1259      * Returns a literal pattern <code>String</code> for the specified

  1260      * <code>String</code>.

  1261      *

  1262      * <p>This method produces a <code>String</code> that can be used to

  1263      * create a <code>Pattern</code> that would match the string

  1264      * <code>s</code> as if it were a literal pattern.</p> Metacharacters

  1265      * or escape sequences in the input sequence will be given no special

  1266      * meaning.

  1267      *

  1268      * @param  s The string to be literalized

  1269      * @return  A literal string replacement

  1270      * @since 1.5

  1271      */

  1272     public static String quote(String s) {

  1273         int slashEIndex = s.indexOf("\\E");

  1274         if (slashEIndex == -1)

  1275             return "\\Q" + s + "\\E";

  1277         StringBuilder sb = new StringBuilder(s.length() * 2);

  1278         sb.append("\\Q");

  1279         slashEIndex = 0;

  1280         int current = 0;

  1281         while ((slashEIndex = s.indexOf("\\E", current)) != -1) {

  1282             sb.append(s.substring(current, slashEIndex));

  1283             current = slashEIndex + 2;

  1284             sb.append("\\E\\\\E\\Q");

  1285         }

  1286         sb.append(s.substring(current, s.length()));

  1287         sb.append("\\E");

  1288         return sb.toString();

  1289     }

  1291     /**

  1292      * Recompile the Pattern instance from a stream.  The original pattern

  1293      * string is read in and the object tree is recompiled from it.

  1294      */

  1295     private void readObject(java.io.ObjectInputStream s)

  1296         throws java.io.IOException, ClassNotFoundException {

  1298         // Read in all fields

  1299         s.defaultReadObject();

  1301         // Initialize counts

  1302         capturingGroupCount = 1;

  1303         localCount = 0;

  1305         // if length > 0, the Pattern is lazily compiled

  1306         compiled = false;

  1307         if (pattern.length() == 0) {

  1308             root = new Start(lastAccept);

  1309             matchRoot = lastAccept;

  1310             compiled = true;

  1311         }

  1312     }

  1314     /**

  1315      * This private constructor is used to create all Patterns. The pattern

  1316      * string and match flags are all that is needed to completely describe

  1317      * a Pattern. An empty pattern string results in an object tree with

  1318      * only a Start node and a LastNode node.

  1319      */

  1320     private Pattern(String p, int f) {

  1321         pattern = p;

  1322         flags = f;

  1324         // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present

  1325         if ((flags & UNICODE_CHARACTER_CLASS) != 0)

  1326             flags |= UNICODE_CASE;

  1328         // Reset group index count

  1329         capturingGroupCount = 1;

  1330         localCount = 0;

  1332         if (pattern.length() > 0) {

  1333             compile();

  1334         } else {

  1335             root = new Start(lastAccept);

  1336             matchRoot = lastAccept;

  1337         }

  1338     }

  1340     /**

  1341      * The pattern is converted to normalizedD form and then a pure group

  1342      * is constructed to match canonical equivalences of the characters.

  1343      */

  1344     private void normalize() {

  1345         boolean inCharClass = false;

  1346         int lastCodePoint = -1;

  1348         // Convert pattern into normalizedD form

  1349         normalizedPattern = Normalizer.normalize(pattern, Normalizer.NFD);

  1350         patternLength = normalizedPattern.length();

  1352         // Modify pattern to match canonical equivalences

  1353         StringBuilder newPattern = new StringBuilder(patternLength);

  1354         for(int i=0; i<patternLength; ) {

  1355             int c = normalizedPattern.codePointAt(i);

  1356             StringBuilder sequenceBuffer;

  1357             if ((Character.getType(c) == Character.NON_SPACING_MARK)

  1358                 && (lastCodePoint != -1)) {

  1359                 sequenceBuffer = new StringBuilder();

  1360                 sequenceBuffer.appendCodePoint(lastCodePoint);

  1361                 sequenceBuffer.appendCodePoint(c);

  1362                 while(Character.getType(c) == Character.NON_SPACING_MARK) {

  1363                     i += Character.charCount(c);

  1364                     if (i >= patternLength)

  1365                         break;

  1366                     c = normalizedPattern.codePointAt(i);

  1367                     sequenceBuffer.appendCodePoint(c);

  1368                 }

  1369                 String ea = produceEquivalentAlternation(

  1370                                                sequenceBuffer.toString());

  1371                 newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));

  1372                 newPattern.append("(?:").append(ea).append(")");

  1373             } else if (c == '[' && lastCodePoint != '\\') {

  1374                 i = normalizeCharClass(newPattern, i);

  1375             } else {

  1376                 newPattern.appendCodePoint(c);

  1377             }

  1378             lastCodePoint = c;

  1379             i += Character.charCount(c);

  1380         }

  1381         normalizedPattern = newPattern.toString();

  1382     }

  1384     /**

  1385      * Complete the character class being parsed and add a set

  1386      * of alternations to it that will match the canonical equivalences

  1387      * of the characters within the class.

  1388      */

  1389     private int normalizeCharClass(StringBuilder newPattern, int i) {

  1390         StringBuilder charClass = new StringBuilder();

  1391         StringBuilder eq = null;

  1392         int lastCodePoint = -1;

  1393         String result;

  1395         i++;

  1396         charClass.append("[");

  1397         while(true) {

  1398             int c = normalizedPattern.codePointAt(i);

  1399             StringBuilder sequenceBuffer;

  1401             if (c == ']' && lastCodePoint != '\\') {

  1402                 charClass.append((char)c);

  1403                 break;

  1404             } else if (Character.getType(c) == Character.NON_SPACING_MARK) {

  1405                 sequenceBuffer = new StringBuilder();

  1406                 sequenceBuffer.appendCodePoint(lastCodePoint);

  1407                 while(Character.getType(c) == Character.NON_SPACING_MARK) {

  1408                     sequenceBuffer.appendCodePoint(c);

  1409                     i += Character.charCount(c);

  1410                     if (i >= normalizedPattern.length())

  1411                         break;

  1412                     c = normalizedPattern.codePointAt(i);

  1413                 }

  1414                 String ea = produceEquivalentAlternation(

  1415                                                   sequenceBuffer.toString());

  1417                 charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));

  1418                 if (eq == null)

  1419                     eq = new StringBuilder();

  1420                 eq.append('|');

  1421                 eq.append(ea);

  1422             } else {

  1423                 charClass.appendCodePoint(c);

  1424                 i++;

  1425             }

  1426             if (i == normalizedPattern.length())

  1427                 throw error("Unclosed character class");

  1428             lastCodePoint = c;

  1429         }

  1431         if (eq != null) {

  1432             result = "(?:"+charClass.toString()+eq.toString()+")";

  1433         } else {

  1434             result = charClass.toString();

  1435         }

  1437         newPattern.append(result);

  1438         return i;

  1439     }

  1441     /**

  1442      * Given a specific sequence composed of a regular character and

  1443      * combining marks that follow it, produce the alternation that will

  1444      * match all canonical equivalences of that sequence.

  1445      */

  1446     private String produceEquivalentAlternation(String source) {

  1447         int len = countChars(source, 0, 1);

  1448         if (source.length() == len)

  1449             // source has one character.

  1450             return source;

  1452         String base = source.substring(0,len);

  1453         String combiningMarks = source.substring(len);

  1455         String[] perms = producePermutations(combiningMarks);

  1456         StringBuilder result = new StringBuilder(source);

  1458         // Add combined permutations

  1459         for(int x=0; x<perms.length; x++) {

  1460             String next = base + perms[x];

  1461             if (x>0)

  1462                 result.append("|"+next);

  1463             next = composeOneStep(next);

  1464             if (next != null)

  1465                 result.append("|"+produceEquivalentAlternation(next));

  1466         }

  1467         return result.toString();

  1468     }

  1470     /**

  1471      * Returns an array of strings that have all the possible

  1472      * permutations of the characters in the input string.

  1473      * This is used to get a list of all possible orderings

  1474      * of a set of combining marks. Note that some of the permutations

  1475      * are invalid because of combining class collisions, and these

  1476      * possibilities must be removed because they are not canonically

  1477      * equivalent.

  1478      */

  1479     private String[] producePermutations(String input) {

  1480         if (input.length() == countChars(input, 0, 1))

  1481             return new String[] {input};

  1483         if (input.length() == countChars(input, 0, 2)) {

  1484             int c0 = Character.codePointAt(input, 0);

  1485             int c1 = Character.codePointAt(input, Character.charCount(c0));

  1486             if (getClass(c1) == getClass(c0)) {

  1487                 return new String[] {input};

  1488             }

  1489             String[] result = new String[2];

  1490             result[0] = input;

  1491             StringBuilder sb = new StringBuilder(2);

  1492             sb.appendCodePoint(c1);

  1493             sb.appendCodePoint(c0);

  1494             result[1] = sb.toString();

  1495             return result;

  1496         }

  1498         int length = 1;

  1499         int nCodePoints = countCodePoints(input);

  1500         for(int x=1; x<nCodePoints; x++)

  1501             length = length * (x+1);

  1503         String[] temp = new String[length];

  1505         int combClass[] = new int[nCodePoints];

  1506         for(int x=0, i=0; x<nCodePoints; x++) {

  1507             int c = Character.codePointAt(input, i);

  1508             combClass[x] = getClass(c);

  1509             i +=  Character.charCount(c);

  1510         }

  1512         // For each char, take it out and add the permutations

  1513         // of the remaining chars

  1514         int index = 0;

  1515         int len;

  1516         // offset maintains the index in code units.

  1517 loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {

  1518             len = countChars(input, offset, 1);

  1519             boolean skip = false;

  1520             for(int y=x-1; y>=0; y--) {

  1521                 if (combClass[y] == combClass[x]) {

  1522                     continue loop;

  1523                 }

  1524             }

  1525             StringBuilder sb = new StringBuilder(input);

  1526             String otherChars = sb.delete(offset, offset+len).toString();

  1527             String[] subResult = producePermutations(otherChars);

  1529             String prefix = input.substring(offset, offset+len);

  1530             for(int y=0; y<subResult.length; y++)

  1531                 temp[index++] =  prefix + subResult[y];

  1532         }

  1533         String[] result = new String[index];

  1534         for (int x=0; x<index; x++)

  1535             result[x] = temp[x];

  1536         return result;

  1537     }

  1539     private int getClass(int c) {

  1540         return Normalizer.getCombiningClass(c);

  1541     }

  1543     /**

  1544      * Attempts to compose input by combining the first character

  1545      * with the first combining mark following it. Returns a String

  1546      * that is the composition of the leading character with its first

  1547      * combining mark followed by the remaining combining marks. Returns

  1548      * null if the first two characters cannot be further composed.

  1549      */

  1550     private String composeOneStep(String input) {

  1551         int len = countChars(input, 0, 2);

  1552         String firstTwoCharacters = input.substring(0, len);

  1553         String result = Normalizer.normalize(firstTwoCharacters, Normalizer.NFC);

  1555         if (result.equals(firstTwoCharacters))

  1556             return null;

  1557         else {

  1558             String remainder = input.substring(len);

  1559             return result + remainder;

  1560         }

  1561     }

  1563     /**

  1564      * Preprocess any \Q...\E sequences in `temp', meta-quoting them.

  1565      * See the description of `quotemeta' in perlfunc(1).

  1566      */

  1567     private void RemoveQEQuoting() {

  1568         final int pLen = patternLength;

  1569         int i = 0;

  1570         while (i < pLen-1) {

  1571             if (temp[i] != '\\')

  1572                 i += 1;

  1573             else if (temp[i + 1] != 'Q')

  1574                 i += 2;

  1575             else

  1576                 break;

  1577         }

  1578         if (i >= pLen - 1)    // No \Q sequence found

  1579             return;

  1580         int j = i;

  1581         i += 2;

  1582         int[] newtemp = new int[j + 2*(pLen-i) + 2];

  1583         System.arraycopy(temp, 0, newtemp, 0, j);

  1585         boolean inQuote = true;

  1586         while (i < pLen) {

  1587             int c = temp[i++];

  1588             if (! ASCII.isAscii(c) || ASCII.isAlnum(c)) {

  1589                 newtemp[j++] = c;

  1590             } else if (c != '\\') {

  1591                 if (inQuote) newtemp[j++] = '\\';

  1592                 newtemp[j++] = c;

  1593             } else if (inQuote) {

  1594                 if (temp[i] == 'E') {

  1595                     i++;

  1596                     inQuote = false;

  1597                 } else {

  1598                     newtemp[j++] = '\\';

  1599                     newtemp[j++] = '\\';

  1600                 }

  1601             } else {

  1602                 if (temp[i] == 'Q') {

  1603                     i++;

  1604                     inQuote = true;

  1605                 } else {

  1606                     newtemp[j++] = c;

  1607                     if (i != pLen)

  1608                         newtemp[j++] = temp[i++];

  1609                 }

  1610             }

  1611         }

  1613         patternLength = j;

  1614         temp = Arrays.copyOf(newtemp, j + 2); // double zero termination

  1615     }

  1617     /**

  1618      * Copies regular expression to an int array and invokes the parsing

  1619      * of the expression which will create the object tree.

  1620      */

  1621     private void compile() {

  1622         // Handle canonical equivalences

  1623         if (has(CANON_EQ) && !has(LITERAL)) {

  1624             normalize();

  1625         } else {

  1626             normalizedPattern = pattern;

  1627         }

  1628         patternLength = normalizedPattern.length();

  1630         // Copy pattern to int array for convenience

  1631         // Use double zero to terminate pattern

  1632         temp = new int[patternLength + 2];

  1634         hasSupplementary = false;

  1635         int c, count = 0;

  1636         // Convert all chars into code points

  1637         for (int x = 0; x < patternLength; x += Character.charCount(c)) {

  1638             c = normalizedPattern.codePointAt(x);

  1639             if (isSupplementary(c)) {

  1640                 hasSupplementary = true;

  1641             }

  1642             temp[count++] = c;

  1643         }

  1645         patternLength = count;   // patternLength now in code points

  1647         if (! has(LITERAL))

  1648             RemoveQEQuoting();

  1650         // Allocate all temporary objects here.

  1651         buffer = new int[32];

  1652         groupNodes = new GroupHead[10];

  1653         namedGroups = null;

  1655         if (has(LITERAL)) {

  1656             // Literal pattern handling

  1657             matchRoot = newSlice(temp, patternLength, hasSupplementary);

  1658             matchRoot.next = lastAccept;

  1659         } else {

  1660             // Start recursive descent parsing

  1661             matchRoot = expr(lastAccept);

  1662             // Check extra pattern characters

  1663             if (patternLength != cursor) {

  1664                 if (peek() == ')') {

  1665                     throw error("Unmatched closing ')'");

  1666                 } else {

  1667                     throw error("Unexpected internal error");

  1668                 }

  1669             }

  1670         }

  1672         // Peephole optimization

  1673         if (matchRoot instanceof Slice) {

  1674             root = BnM.optimize(matchRoot);

  1675             if (root == matchRoot) {

  1676                 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);

  1677             }

  1678         } else if (matchRoot instanceof Begin || matchRoot instanceof First) {

  1679             root = matchRoot;

  1680         } else {

  1681             root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);

  1682         }

  1684         // Release temporary storage

  1685         temp = null;

  1686         buffer = null;

  1687         groupNodes = null;

  1688         patternLength = 0;

  1689         compiled = true;

  1690     }

  1692     Map<String, Integer> namedGroups() {

  1693         if (namedGroups == null)

  1694             namedGroups = new HashMap<>(2);

  1695         return namedGroups;

  1696     }

  1698     /**

  1699      * Used to print out a subtree of the Pattern to help with debugging.

  1700      */

  1701     private static void printObjectTree(Node node) {

  1702         while(node != null) {

  1703             if (node instanceof Prolog) {

  1704                 System.out.println(node);

  1705                 printObjectTree(((Prolog)node).loop);

  1706                 System.out.println("**** end contents prolog loop");

  1707             } else if (node instanceof Loop) {

  1708                 System.out.println(node);

  1709                 printObjectTree(((Loop)node).body);

  1710                 System.out.println("**** end contents Loop body");

  1711             } else if (node instanceof Curly) {

  1712                 System.out.println(node);

  1713                 printObjectTree(((Curly)node).atom);

  1714                 System.out.println("**** end contents Curly body");

  1715             } else if (node instanceof GroupCurly) {

  1716                 System.out.println(node);

  1717                 printObjectTree(((GroupCurly)node).atom);

  1718                 System.out.println("**** end contents GroupCurly body");

  1719             } else if (node instanceof GroupTail) {

  1720                 System.out.println(node);

  1721                 System.out.println("Tail next is "+node.next);

  1722                 return;

  1723             } else {

  1724                 System.out.println(node);

  1725             }

  1726             node = node.next;

  1727             if (node != null)

  1728                 System.out.println("->next:");

  1729             if (node == Pattern.accept) {

  1730                 System.out.println("Accept Node");

  1731                 node = null;

  1732             }

  1733        }

  1734     }

  1736     /**

  1737      * Used to accumulate information about a subtree of the object graph

  1738      * so that optimizations can be applied to the subtree.

  1739      */

  1740     static final class TreeInfo {

  1741         int minLength;

  1742         int maxLength;

  1743         boolean maxValid;

  1744         boolean deterministic;

  1746         TreeInfo() {

  1747             reset();

  1748         }

  1749         void reset() {

  1750             minLength = 0;

  1751             maxLength = 0;

  1752             maxValid = true;

  1753             deterministic = true;

  1754         }

  1755     }

  1757     /*

  1758      * The following private methods are mainly used to improve the

  1759      * readability of the code. In order to let the Java compiler easily

  1760      * inline them, we should not put many assertions or error checks in them.

  1761      */

  1763     /**

  1764      * Indicates whether a particular flag is set or not.

  1765      */

  1766     private boolean has(int f) {

  1767         return (flags & f) != 0;

  1768     }

  1770     /**

  1771      * Match next character, signal error if failed.

  1772      */

  1773     private void accept(int ch, String s) {

  1774         int testChar = temp[cursor++];

  1775         if (has(COMMENTS))

  1776             testChar = parsePastWhitespace(testChar);

  1777         if (ch != testChar) {

  1778             throw error(s);

  1779         }

  1780     }

  1782     /**

  1783      * Mark the end of pattern with a specific character.

  1784      */

  1785     private void mark(int c) {

  1786         temp[patternLength] = c;

  1787     }

  1789     /**

  1790      * Peek the next character, and do not advance the cursor.

  1791      */

  1792     private int peek() {

  1793         int ch = temp[cursor];

  1794         if (has(COMMENTS))

  1795             ch = peekPastWhitespace(ch);

  1796         return ch;

  1797     }

  1799     /**

  1800      * Read the next character, and advance the cursor by one.

  1801      */

  1802     private int read() {

  1803         int ch = temp[cursor++];

  1804         if (has(COMMENTS))

  1805             ch = parsePastWhitespace(ch);

  1806         return ch;

  1807     }

  1809     /**

  1810      * Read the next character, and advance the cursor by one,

  1811      * ignoring the COMMENTS setting

  1812      */

  1813     private int readEscaped() {

  1814         int ch = temp[cursor++];

  1815         return ch;

  1816     }

  1818     /**

  1819      * Advance the cursor by one, and peek the next character.

  1820      */

  1821     private int next() {

  1822         int ch = temp[++cursor];

  1823         if (has(COMMENTS))

  1824             ch = peekPastWhitespace(ch);

  1825         return ch;

  1826     }

  1828     /**

  1829      * Advance the cursor by one, and peek the next character,

  1830      * ignoring the COMMENTS setting

  1831      */

  1832     private int nextEscaped() {

  1833         int ch = temp[++cursor];

  1834         return ch;

  1835     }

  1837     /**

  1838      * If in xmode peek past whitespace and comments.

  1839      */

  1840     private int peekPastWhitespace(int ch) {

  1841         while (ASCII.isSpace(ch) || ch == '#') {

  1842             while (ASCII.isSpace(ch))

  1843                 ch = temp[++cursor];

  1844             if (ch == '#') {

  1845                 ch = peekPastLine();

  1846             }

  1847         }

  1848         return ch;

  1849     }

  1851     /**

  1852      * If in xmode parse past whitespace and comments.

  1853      */

  1854     private int parsePastWhitespace(int ch) {

  1855         while (ASCII.isSpace(ch) || ch == '#') {

  1856             while (ASCII.isSpace(ch))

  1857                 ch = temp[cursor++];

  1858             if (ch == '#')

  1859                 ch = parsePastLine();

  1860         }

  1861         return ch;

  1862     }

  1864     /**

  1865      * xmode parse past comment to end of line.

  1866      */

  1867     private int parsePastLine() {

  1868         int ch = temp[cursor++];

  1869         while (ch != 0 && !isLineSeparator(ch))

  1870             ch = temp[cursor++];

  1871         return ch;

  1872     }

  1874     /**

  1875      * xmode peek past comment to end of line.

  1876      */

  1877     private int peekPastLine() {

  1878         int ch = temp[++cursor];

  1879         while (ch != 0 && !isLineSeparator(ch))

  1880             ch = temp[++cursor];

  1881         return ch;

  1882     }

  1884     /**

  1885      * Determines if character is a line separator in the current mode

  1886      */

  1887     private boolean isLineSeparator(int ch) {

  1888         if (has(UNIX_LINES)) {

  1889             return ch == '\n';

  1890         } else {

  1891             return (ch == '\n' ||

  1892                     ch == '\r' ||

  1893                     (ch|1) == '\u2029' ||

  1894                     ch == '\u0085');

  1895         }

  1896     }

  1898     /**

  1899      * Read the character after the next one, and advance the cursor by two.

  1900      */

  1901     private int skip() {

  1902         int i = cursor;

  1903         int ch = temp[i+1];

  1904         cursor = i + 2;

  1905         return ch;

  1906     }

  1908     /**

  1909      * Unread one next character, and retreat cursor by one.

  1910      */

  1911     private void unread() {

  1912         cursor--;

  1913     }

  1915     /**

  1916      * Internal method used for handling all syntax errors. The pattern is

  1917      * displayed with a pointer to aid in locating the syntax error.

  1918      */

  1919     private PatternSyntaxException error(String s) {

  1920         return new PatternSyntaxException(s, normalizedPattern,  cursor - 1);

  1921     }

  1923     /**

  1924      * Determines if there is any supplementary character or unpaired

  1925      * surrogate in the specified range.

  1926      */

  1927     private boolean findSupplementary(int start, int end) {

  1928         for (int i = start; i < end; i++) {

  1929             if (isSupplementary(temp[i]))

  1930                 return true;

  1931         }

  1932         return false;

  1933     }

  1935     /**

  1936      * Determines if the specified code point is a supplementary

  1937      * character or unpaired surrogate.

  1938      */

  1939     private static final boolean isSupplementary(int ch) {

  1940         return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||

  1941                Character.isSurrogate((char)ch);

  1942     }

  1944     /**

  1945      *  The following methods handle the main parsing. They are sorted

  1946      *  according to their precedence order, the lowest one first.

  1947      */

  1949     /**

  1950      * The expression is parsed with branch nodes added for alternations.

  1951      * This may be called recursively to parse sub expressions that may

  1952      * contain alternations.

  1953      */

  1954     private Node expr(Node end) {

  1955         Node prev = null;

  1956         Node firstTail = null;

  1957         Node branchConn = null;

  1959         for (;;) {

  1960             Node node = sequence(end);

  1961             Node nodeTail = root;      //double return

  1962             if (prev == null) {

  1963                 prev = node;

  1964                 firstTail = nodeTail;

  1965             } else {

  1966                 // Branch

  1967                 if (branchConn == null) {

  1968                     branchConn = new BranchConn();

  1969                     branchConn.next = end;

  1970                 }

  1971                 if (node == end) {

  1972                     // if the node returned from sequence() is "end"

  1973                     // we have an empty expr, set a null atom into

  1974                     // the branch to indicate to go "next" directly.

  1975                     node = null;

  1976                 } else {

  1977                     // the "tail.next" of each atom goes to branchConn

  1978                     nodeTail.next = branchConn;

  1979                 }

  1980                 if (prev instanceof Branch) {

  1981                     ((Branch)prev).add(node);

  1982                 } else {

  1983                     if (prev == end) {

  1984                         prev = null;

  1985                     } else {

  1986                         // replace the "end" with "branchConn" at its tail.next

  1987                         // when put the "prev" into the branch as the first atom.

  1988                         firstTail.next = branchConn;

  1989                     }

  1990                     prev = new Branch(prev, node, branchConn);

  1991                 }

  1992             }

  1993             if (peek() != '|') {

  1994                 return prev;

  1995             }

  1996             next();

  1997         }

  1998     }

  2000     /**

  2001      * Parsing of sequences between alternations.

  2002      */

  2003     private Node sequence(Node end) {

  2004         Node head = null;

  2005         Node tail = null;

  2006         Node node = null;

  2007     LOOP:

  2008         for (;;) {

  2009             int ch = peek();

  2010             switch (ch) {

  2011             case '(':

  2012                 // Because group handles its own closure,

  2013                 // we need to treat it differently

  2014                 node = group0();

  2015                 // Check for comment or flag group

  2016                 if (node == null)

  2017                     continue;

  2018                 if (head == null)

  2019                     head = node;

  2020                 else

  2021                     tail.next = node;

  2022                 // Double return: Tail was returned in root

  2023                 tail = root;

  2024                 continue;

  2025             case '[':

  2026                 node = clazz(true);

  2027                 break;

  2028             case '\\':

  2029                 ch = nextEscaped();

  2030                 if (ch == 'p' || ch == 'P') {

  2031                     boolean oneLetter = true;

  2032                     boolean comp = (ch == 'P');

  2033                     ch = next(); // Consume { if present

  2034                     if (ch != '{') {

  2035                         unread();

  2036                     } else {

  2037                         oneLetter = false;

  2038                     }

  2039                     node = family(oneLetter, comp);

  2040                 } else {

  2041                     unread();

  2042                     node = atom();

  2043                 }

  2044                 break;

  2045             case '^':

  2046                 next();

  2047                 if (has(MULTILINE)) {

  2048                     if (has(UNIX_LINES))

  2049                         node = new UnixCaret();

  2050                     else

  2051                         node = new Caret();

  2052                 } else {

  2053                     node = new Begin();

  2054                 }

  2055                 break;

  2056             case '$':

  2057                 next();

  2058                 if (has(UNIX_LINES))

  2059                     node = new UnixDollar(has(MULTILINE));

  2060                 else

  2061                     node = new Dollar(has(MULTILINE));

  2062                 break;

  2063             case '.':

  2064                 next();

  2065                 if (has(DOTALL)) {

  2066                     node = new All();

  2067                 } else {

  2068                     if (has(UNIX_LINES))

  2069                         node = new UnixDot();

  2070                     else {

  2071                         node = new Dot();

  2072                     }

  2073                 }

  2074                 break;

  2075             case '|':

  2076             case ')':

  2077                 break LOOP;

  2078             case ']': // Now interpreting dangling ] and } as literals

  2079             case '}':

  2080                 node = atom();

  2081                 break;

  2082             case '?':

  2083             case '*':

  2084             case '+':

  2085                 next();

  2086                 throw error("Dangling meta character '" + ((char)ch) + "'");

  2087             case 0:

  2088                 if (cursor >= patternLength) {

  2089                     break LOOP;

  2090                 }

  2091                 // Fall through

  2092             default:

  2093                 node = atom();

  2094                 break;

  2095             }

  2097             node = closure(node);

  2099             if (head == null) {

  2100                 head = tail = node;

  2101             } else {

  2102                 tail.next = node;

  2103                 tail = node;

  2104             }

  2105         }

  2106         if (head == null) {

  2107             return end;

  2108         }

  2109         tail.next = end;

  2110         root = tail;      //double return

  2111         return head;

  2112     }

  2114     /**

  2115      * Parse and add a new Single or Slice.

  2116      */

  2117     private Node atom() {

  2118         int first = 0;

  2119         int prev = -1;

  2120         boolean hasSupplementary = false;

  2121         int ch = peek();

  2122         for (;;) {

  2123             switch (ch) {

  2124             case '*':

  2125             case '+':

  2126             case '?':

  2127             case '{':

  2128                 if (first > 1) {

  2129                     cursor = prev;    // Unwind one character

  2130                     first--;

  2131                 }

  2132                 break;

  2133             case '$':

  2134             case '.':

  2135             case '^':

  2136             case '(':

  2137             case '[':

  2138             case '|':

  2139             case ')':

  2140                 break;

  2141             case '\\':

  2142                 ch = nextEscaped();

  2143                 if (ch == 'p' || ch == 'P') { // Property

  2144                     if (first > 0) { // Slice is waiting; handle it first

  2145                         unread();

  2146                         break;

  2147                     } else { // No slice; just return the family node

  2148                         boolean comp = (ch == 'P');

  2149                         boolean oneLetter = true;

  2150                         ch = next(); // Consume { if present

  2151                         if (ch != '{')

  2152                             unread();

  2153                         else

  2154                             oneLetter = false;

  2155                         return family(oneLetter, comp);

  2156                     }

  2157                 }

  2158                 unread();

  2159                 prev = cursor;

  2160                 ch = escape(false, first == 0);

  2161                 if (ch >= 0) {

  2162                     append(ch, first);

  2163                     first++;

  2164                     if (isSupplementary(ch)) {

  2165                         hasSupplementary = true;

  2166                     }

  2167                     ch = peek();

  2168                     continue;

  2169                 } else if (first == 0) {

  2170                     return root;

  2171                 }

  2172                 // Unwind meta escape sequence

  2173                 cursor = prev;

  2174                 break;

  2175             case 0:

  2176                 if (cursor >= patternLength) {

  2177                     break;

  2178                 }

  2179                 // Fall through

  2180             default:

  2181                 prev = cursor;

  2182                 append(ch, first);

  2183                 first++;

  2184                 if (isSupplementary(ch)) {

  2185                     hasSupplementary = true;

  2186                 }

  2187                 ch = next();

  2188                 continue;

  2189             }

  2190             break;

  2191         }

  2192         if (first == 1) {

  2193             return newSingle(buffer[0]);

  2194         } else {

  2195             return newSlice(buffer, first, hasSupplementary);

  2196         }

  2197     }

  2199     private void append(int ch, int len) {

  2200         if (len >= buffer.length) {

  2201             int[] tmp = new int[len+len];

  2202             System.arraycopy(buffer, 0, tmp, 0, len);

  2203             buffer = tmp;

  2204         }

  2205         buffer[len] = ch;

  2206     }

  2208     /**

  2209      * Parses a backref greedily, taking as many numbers as it

  2210      * can. The first digit is always treated as a backref, but

  2211      * multi digit numbers are only treated as a backref if at

  2212      * least that many backrefs exist at this point in the regex.

  2213      */

  2214     private Node ref(int refNum) {

  2215         boolean done = false;

  2216         while(!done) {

  2217             int ch = peek();

  2218             switch(ch) {

  2219             case '0':

  2220             case '1':

  2221             case '2':

  2222             case '3':

  2223             case '4':

  2224             case '5':

  2225             case '6':

  2226             case '7':

  2227             case '8':

  2228             case '9':

  2229                 int newRefNum = (refNum * 10) + (ch - '0');

  2230                 // Add another number if it doesn't make a group

  2231                 // that doesn't exist

  2232                 if (capturingGroupCount - 1 < newRefNum) {

  2233                     done = true;

  2234                     break;

  2235                 }

  2236                 refNum = newRefNum;

  2237                 read();

  2238                 break;

  2239             default:

  2240                 done = true;

  2241                 break;

  2242             }

  2243         }

  2244         if (has(CASE_INSENSITIVE))

  2245             return new CIBackRef(refNum, has(UNICODE_CASE));

  2246         else

  2247             return new BackRef(refNum);

  2248     }

  2250     /**

  2251      * Parses an escape sequence to determine the actual value that needs

  2252      * to be matched.

  2253      * If -1 is returned and create was true a new object was added to the tree

  2254      * to handle the escape sequence.

  2255      * If the returned value is greater than zero, it is the value that

  2256      * matches the escape sequence.

  2257      */

  2258     private int escape(boolean inclass, boolean create) {

  2259         int ch = skip();

  2260         switch (ch) {

  2261         case '0':

  2262             return o();

  2263         case '1':

  2264         case '2':

  2265         case '3':

  2266         case '4':

  2267         case '5':

  2268         case '6':

  2269         case '7':

  2270         case '8':

  2271         case '9':

  2272             if (inclass) break;

  2273             if (create) {

  2274                 root = ref((ch - '0'));

  2275             }

  2276             return -1;

  2277         case 'A':

  2278             if (inclass) break;

  2279             if (create) root = new Begin();

  2280             return -1;

  2281         case 'B':

  2282             if (inclass) break;

  2283             if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));

  2284             return -1;

  2285         case 'C':

  2286             break;

  2287         case 'D':

  2288             if (create) root = has(UNICODE_CHARACTER_CLASS)

  2289                                ? new Utype(UnicodeProp.DIGIT).complement()

  2290                                : new Ctype(ASCII.DIGIT).complement();

  2291             return -1;

  2292         case 'E':

  2293         case 'F':

  2294             break;

  2295         case 'G':

  2296             if (inclass) break;

  2297             if (create) root = new LastMatch();

  2298             return -1;

  2299         case 'H':

  2300         case 'I':

  2301         case 'J':

  2302         case 'K':

  2303         case 'L':

  2304         case 'M':

  2305         case 'N':

  2306         case 'O':

  2307         case 'P':

  2308         case 'Q':

  2309         case 'R':

  2310             break;

  2311         case 'S':

  2312             if (create) root = has(UNICODE_CHARACTER_CLASS)

  2313                                ? new Utype(UnicodeProp.WHITE_SPACE).complement()

  2314                                : new Ctype(ASCII.SPACE).complement();

  2315             return -1;

  2316         case 'T':

  2317         case 'U':

  2318         case 'V':

  2319             break;

  2320         case 'W':

  2321             if (create) root = has(UNICODE_CHARACTER_CLASS)

  2322                                ? new Utype(UnicodeProp.WORD).complement()

  2323                                : new Ctype(ASCII.WORD).complement();

  2324             return -1;

  2325         case 'X':

  2326         case 'Y':

  2327             break;

  2328         case 'Z':

  2329             if (inclass) break;

  2330             if (create) {

  2331                 if (has(UNIX_LINES))

  2332                     root = new UnixDollar(false);

  2333                 else

  2334                     root = new Dollar(false);

  2335             }

  2336             return -1;

  2337         case 'a':

  2338             return '\007';

  2339         case 'b':

  2340             if (inclass) break;

  2341             if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));

  2342             return -1;

  2343         case 'c':

  2344             return c();

  2345         case 'd':

  2346             if (create) root = has(UNICODE_CHARACTER_CLASS)

  2347                                ? new Utype(UnicodeProp.DIGIT)

  2348                                : new Ctype(ASCII.DIGIT);

  2349             return -1;

  2350         case 'e':

  2351             return '\033';

  2352         case 'f':

  2353             return '\f';

  2354         case 'g':

  2355         case 'h':

  2356         case 'i':

  2357         case 'j':

  2358             break;

  2359         case 'k':

  2360             if (inclass)

  2361                 break;

  2362             if (read() != '<')

  2363                 throw error("\\k is not followed by '<' for named capturing group");

  2364             String name = groupname(read());

  2365             if (!namedGroups().containsKey(name))

  2366                 throw error("(named capturing group <"+ name+"> does not exit");

  2367             if (create) {

  2368                 if (has(CASE_INSENSITIVE))

  2369                     root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));

  2370                 else

  2371                     root = new BackRef(namedGroups().get(name));

  2372             }

  2373             return -1;

  2374         case 'l':

  2375         case 'm':

  2376             break;

  2377         case 'n':

  2378             return '\n';

  2379         case 'o':

  2380         case 'p':

  2381         case 'q':

  2382             break;

  2383         case 'r':

  2384             return '\r';

  2385         case 's':

  2386             if (create) root = has(UNICODE_CHARACTER_CLASS)

  2387                                ? new Utype(UnicodeProp.WHITE_SPACE)

  2388                                : new Ctype(ASCII.SPACE);

  2389             return -1;

  2390         case 't':

  2391             return '\t';

  2392         case 'u':

  2393             return u();

  2394         case 'v':

  2395             return '\013';

  2396         case 'w':

  2397             if (create) root = has(UNICODE_CHARACTER_CLASS)

  2398                                ? new Utype(UnicodeProp.WORD)

  2399                                : new Ctype(ASCII.WORD);

  2400             return -1;

  2401         case 'x':

  2402             return x();

  2403         case 'y':

  2404             break;

  2405         case 'z':

  2406             if (inclass) break;

  2407             if (create) root = new End();

  2408             return -1;

  2409         default:

  2410             return ch;

  2411         }

  2412         throw error("Illegal/unsupported escape sequence");

  2413     }

  2415     /**

  2416      * Parse a character class, and return the node that matches it.

  2417      *

  2418      * Consumes a ] on the way out if consume is true. Usually consume

  2419      * is true except for the case of [abc&&def] where def is a separate

  2420      * right hand node with "understood" brackets.

  2421      */

  2422     private CharProperty clazz(boolean consume) {

  2423         CharProperty prev = null;

  2424         CharProperty node = null;

  2425         BitClass bits = new BitClass();

  2426         boolean include = true;

  2427         boolean firstInClass = true;

  2428         int ch = next();

  2429         for (;;) {

  2430             switch (ch) {

  2431                 case '^':

  2432                     // Negates if first char in a class, otherwise literal

  2433                     if (firstInClass) {

  2434                         if (temp[cursor-1] != '[')

  2435                             break;

  2436                         ch = next();

  2437                         include = !include;

  2438                         continue;

  2439                     } else {

  2440                         // ^ not first in class, treat as literal

  2441                         break;

  2442                     }

  2443                 case '[':

  2444                     firstInClass = false;

  2445                     node = clazz(true);

  2446                     if (prev == null)

  2447                         prev = node;

  2448                     else

  2449                         prev = union(prev, node);

  2450                     ch = peek();

  2451                     continue;

  2452                 case '&':

  2453                     firstInClass = false;

  2454                     ch = next();

  2455                     if (ch == '&') {

  2456                         ch = next();

  2457                         CharProperty rightNode = null;

  2458                         while (ch != ']' && ch != '&') {

  2459                             if (ch == '[') {

  2460                                 if (rightNode == null)

  2461                                     rightNode = clazz(true);

  2462                                 else

  2463                                     rightNode = union(rightNode, clazz(true));

  2464                             } else { // abc&&def

  2465                                 unread();

  2466                                 rightNode = clazz(false);

  2467                             }

  2468                             ch = peek();

  2469                         }

  2470                         if (rightNode != null)

  2471                             node = rightNode;

  2472                         if (prev == null) {

  2473                             if (rightNode == null)

  2474                                 throw error("Bad class syntax");

  2475                             else

  2476                                 prev = rightNode;

  2477                         } else {

  2478                             prev = intersection(prev, node);

  2479                         }

  2480                     } else {

  2481                         // treat as a literal &

  2482                         unread();

  2483                         break;

  2484                     }

  2485                     continue;

  2486                 case 0:

  2487                     firstInClass = false;

  2488                     if (cursor >= patternLength)

  2489                         throw error("Unclosed character class");

  2490                     break;

  2491                 case ']':

  2492                     firstInClass = false;

  2493                     if (prev != null) {

  2494                         if (consume)

  2495                             next();

  2496                         return prev;

  2497                     }

  2498                     break;

  2499                 default:

  2500                     firstInClass = false;

  2501                     break;

  2502             }

  2503             node = range(bits);

  2504             if (include) {

  2505                 if (prev == null) {

  2506                     prev = node;

  2507                 } else {

  2508                     if (prev != node)

  2509                         prev = union(prev, node);

  2510                 }

  2511             } else {

  2512                 if (prev == null) {

  2513                     prev = node.complement();

  2514                 } else {

  2515                     if (prev != node)

  2516                         prev = setDifference(prev, node);

  2517                 }

  2518             }

  2519             ch = peek();

  2520         }

  2521     }

  2523     private CharProperty bitsOrSingle(BitClass bits, int ch) {

  2524         /* Bits can only handle codepoints in [u+0000-u+00ff] range.

  2525            Use "single" node instead of bits when dealing with unicode

  2526            case folding for codepoints listed below.

  2527            (1)Uppercase out of range: u+00ff, u+00b5

  2528               toUpperCase(u+00ff) -> u+0178

  2529               toUpperCase(u+00b5) -> u+039c

  2530            (2)LatinSmallLetterLongS u+17f

  2531               toUpperCase(u+017f) -> u+0053

  2532            (3)LatinSmallLetterDotlessI u+131

  2533               toUpperCase(u+0131) -> u+0049

  2534            (4)LatinCapitalLetterIWithDotAbove u+0130

  2535               toLowerCase(u+0130) -> u+0069

  2536            (5)KelvinSign u+212a

  2537               toLowerCase(u+212a) ==> u+006B

  2538            (6)AngstromSign u+212b

  2539               toLowerCase(u+212b) ==> u+00e5

  2540         */

  2541         int d;

  2542         if (ch < 256 &&

  2543             !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&

  2544               (ch == 0xff || ch == 0xb5 ||

  2545                ch == 0x49 || ch == 0x69 ||  //I and i

  2546                ch == 0x53 || ch == 0x73 ||  //S and s

  2547                ch == 0x4b || ch == 0x6b ||  //K and k

  2548                ch == 0xc5 || ch == 0xe5)))  //A+ring

  2549             return bits.add(ch, flags());

  2550         return newSingle(ch);

  2551     }

  2553     /**

  2554      * Parse a single character or a character range in a character class

  2555      * and return its representative node.

  2556      */

  2557     private CharProperty range(BitClass bits) {

  2558         int ch = peek();

  2559         if (ch == '\\') {

  2560             ch = nextEscaped();

  2561             if (ch == 'p' || ch == 'P') { // A property

  2562                 boolean comp = (ch == 'P');

  2563                 boolean oneLetter = true;

  2564                 // Consume { if present

  2565                 ch = next();

  2566                 if (ch != '{')

  2567                     unread();

  2568                 else

  2569                     oneLetter = false;

  2570                 return family(oneLetter, comp);

  2571             } else { // ordinary escape

  2572                 unread();

  2573                 ch = escape(true, true);

  2574                 if (ch == -1)

  2575                     return (CharProperty) root;

  2576             }

  2577         } else {

  2578             ch = single();

  2579         }

  2580         if (ch >= 0) {

  2581             if (peek() == '-') {

  2582                 int endRange = temp[cursor+1];

  2583                 if (endRange == '[') {

  2584                     return bitsOrSingle(bits, ch);

  2585                 }

  2586                 if (endRange != ']') {

  2587                     next();

  2588                     int m = single();

  2589                     if (m < ch)

  2590                         throw error("Illegal character range");

  2591                     if (has(CASE_INSENSITIVE))

  2592                         return caseInsensitiveRangeFor(ch, m);

  2593                     else

  2594                         return rangeFor(ch, m);

  2595                 }

  2596             }

  2597             return bitsOrSingle(bits, ch);

  2598         }

  2599         throw error("Unexpected character '"+((char)ch)+"'");

  2600     }

  2602     private int single() {

  2603         int ch = peek();

  2604         switch (ch) {

  2605         case '\\':

  2606             return escape(true, false);

  2607         default:

  2608             next();

  2609             return ch;

  2610         }

  2611     }

  2613     /**

  2614      * Parses a Unicode character family and returns its representative node.

  2615      */

  2616     private CharProperty family(boolean singleLetter,

  2617                                 boolean maybeComplement)

  2618     {

  2619         next();

  2620         String name;

  2621         CharProperty node = null;

  2623         if (singleLetter) {

  2624             int c = temp[cursor];

  2625             if (!Character.isSupplementaryCodePoint(c)) {

  2626                 name = String.valueOf((char)c);

  2627             } else {

  2628                 name = new String(temp, cursor, 1);

  2629             }

  2630             read();

  2631         } else {

  2632             int i = cursor;

  2633             mark('}');

  2634             while(read() != '}') {

  2635             }

  2636             mark('\000');

  2637             int j = cursor;

  2638             if (j > patternLength)

  2639                 throw error("Unclosed character family");

  2640             if (i + 1 >= j)

  2641                 throw error("Empty character family");

  2642             name = new String(temp, i, j-i-1);

  2643         }

  2645         int i = name.indexOf('=');

  2646         if (i != -1) {

  2647             // property construct \p{name=value}

  2648             String value = name.substring(i + 1);

  2649             name = name.substring(0, i).toLowerCase(Locale.ENGLISH);

  2650     /*        if ("sc".equals(name) || "script".equals(name)) {

  2651                 node = unicodeScriptPropertyFor(value);

  2652             } else if ("blk".equals(name) || "block".equals(name)) {

  2653                 node = unicodeBlockPropertyFor(value);

  2654             } else*/ if ("gc".equals(name) || "general_category".equals(name)) {

  2655                 node = charPropertyNodeFor(value);

  2656             } else {

  2657                 throw error("Unknown Unicode property {name=<" + name + ">, "

  2658                              + "value=<" + value + ">}");

  2659             }

  2660         } else {

  2661             /*if (name.startsWith("In")) {

  2662                 // \p{inBlockName}

  2663                 node = unicodeBlockPropertyFor(name.substring(2));

  2664             } else if (name.startsWith("Is")) {

  2665                 // \p{isGeneralCategory} and \p{isScriptName}

  2666                 name = name.substring(2);

  2667                 UnicodeProp uprop = UnicodeProp.forName(name);

  2668                 if (uprop != null)

  2669                     node = new Utype(uprop);

  2670                 if (node == null)

  2671                     node = CharPropertyNames.charPropertyFor(name);

  2672                 if (node == null)

  2673                     node = unicodeScriptPropertyFor(name);

  2674             } else*/ {

  2675                 if (has(UNICODE_CHARACTER_CLASS)) {

  2676                     UnicodeProp uprop = UnicodeProp.forPOSIXName(name);

  2677                     if (uprop != null)

  2678                         node = new Utype(uprop);

  2679                 }

  2680                 if (node == null)

  2681                     node = charPropertyNodeFor(name);

  2682             }

  2683         }

  2684         if (maybeComplement) {

  2685             if (node instanceof Category /*|| node instanceof Block*/)

  2686                 hasSupplementary = true;

  2687             node = node.complement();

  2688         }

  2689         return node;

  2690     }

  2693     /**

  2694      * Returns a CharProperty matching all characters belong to

  2695      * a UnicodeScript.

  2696      *

  2697     private CharProperty unicodeScriptPropertyFor(String name) {

  2698         final Character.UnicodeScript script;

  2699         try {

  2700             script = Character.UnicodeScript.forName(name);

  2701         } catch (IllegalArgumentException iae) {

  2702             throw error("Unknown character script name {" + name + "}");

  2703         }

  2704         return new Script(script);

  2705     }

  2707     /**

  2708      * Returns a CharProperty matching all characters in a UnicodeBlock.

  2709      *

  2710     private CharProperty unicodeBlockPropertyFor(String name) {

  2711         final Character.UnicodeBlock block;

  2712         try {

  2713             block = Character.UnicodeBlock.forName(name);

  2714         } catch (IllegalArgumentException iae) {

  2715             throw error("Unknown character block name {" + name + "}");

  2716         }

  2717         return new Block(block);

  2718     }

  2720     /**

  2721      * Returns a CharProperty matching all characters in a named property.

  2722      */

  2723     private CharProperty charPropertyNodeFor(String name) {

  2724         CharProperty p = CharPropertyNames.charPropertyFor(name);

  2725         if (p == null)

  2726             throw error("Unknown character property name {" + name + "}");

  2727         return p;

  2728     }

  2730     /**

  2731      * Parses and returns the name of a "named capturing group", the trailing

  2732      * ">" is consumed after parsing.

  2733      */

  2734     private String groupname(int ch) {

  2735         StringBuilder sb = new StringBuilder();

  2736         sb.append(Character.toChars(ch));

  2737         while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||

  2738                ASCII.isDigit(ch)) {

  2739             sb.append(Character.toChars(ch));

  2740         }

  2741         if (sb.length() == 0)

  2742             throw error("named capturing group has 0 length name");

  2743         if (ch != '>')

  2744             throw error("named capturing group is missing trailing '>'");

  2745         return sb.toString();

  2746     }

  2748     /**

  2749      * Parses a group and returns the head node of a set of nodes that process

  2750      * the group. Sometimes a double return system is used where the tail is

  2751      * returned in root.

  2752      */

  2753     private Node group0() {

  2754         boolean capturingGroup = false;

  2755         Node head = null;

  2756         Node tail = null;

  2757         int save = flags;

  2758         root = null;

  2759         int ch = next();

  2760         if (ch == '?') {

  2761             ch = skip();

  2762             switch (ch) {

  2763             case ':':   //  (?:xxx) pure group

  2764                 head = createGroup(true);

  2765                 tail = root;

  2766                 head.next = expr(tail);

  2767                 break;

  2768             case '=':   // (?=xxx) and (?!xxx) lookahead

  2769             case '!':

  2770                 head = createGroup(true);

  2771                 tail = root;

  2772                 head.next = expr(tail);

  2773                 if (ch == '=') {

  2774                     head = tail = new Pos(head);

  2775                 } else {

  2776                     head = tail = new Neg(head);

  2777                 }

  2778                 break;

  2779             case '>':   // (?>xxx)  independent group

  2780                 head = createGroup(true);

  2781                 tail = root;

  2782                 head.next = expr(tail);

  2783                 head = tail = new Ques(head, INDEPENDENT);

  2784                 break;

  2785             case '<':   // (?<xxx)  look behind

  2786                 ch = read();

  2787                 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {

  2788                     // named captured group

  2789                     String name = groupname(ch);

  2790                     if (namedGroups().containsKey(name))

  2791                         throw error("Named capturing group <" + name

  2792                                     + "> is already defined");

  2793                     capturingGroup = true;

  2794                     head = createGroup(false);

  2795                     tail = root;

  2796                     namedGroups().put(name, capturingGroupCount-1);

  2797                     head.next = expr(tail);

  2798                     break;

  2799                 }

  2800                 int start = cursor;

  2801                 head = createGroup(true);

  2802                 tail = root;

  2803                 head.next = expr(tail);

  2804                 tail.next = lookbehindEnd;

  2805                 TreeInfo info = new TreeInfo();

  2806                 head.study(info);

  2807                 if (info.maxValid == false) {

  2808                     throw error("Look-behind group does not have "

  2809                                 + "an obvious maximum length");

  2810                 }

  2811                 boolean hasSupplementary = findSupplementary(start, patternLength);

  2812                 if (ch == '=') {

  2813                     head = tail = (hasSupplementary ?

  2814                                    new BehindS(head, info.maxLength,

  2815                                                info.minLength) :

  2816                                    new Behind(head, info.maxLength,

  2817                                               info.minLength));

  2818                 } else if (ch == '!') {

  2819                     head = tail = (hasSupplementary ?

  2820                                    new NotBehindS(head, info.maxLength,

  2821                                                   info.minLength) :

  2822                                    new NotBehind(head, info.maxLength,

  2823                                                  info.minLength));

  2824                 } else {

  2825                     throw error("Unknown look-behind group");

  2826                 }

  2827                 break;

  2828             case '$':

  2829             case '@':

  2830                 throw error("Unknown group type");

  2831             default:    // (?xxx:) inlined match flags

  2832                 unread();

  2833                 addFlag();

  2834                 ch = read();

  2835                 if (ch == ')') {

  2836                     return null;    // Inline modifier only

  2837                 }

  2838                 if (ch != ':') {

  2839                     throw error("Unknown inline modifier");

  2840                 }

  2841                 head = createGroup(true);

  2842                 tail = root;

  2843                 head.next = expr(tail);

  2844                 break;

  2845             }

  2846         } else { // (xxx) a regular group

  2847             capturingGroup = true;

  2848             head = createGroup(false);

  2849             tail = root;

  2850             head.next = expr(tail);

  2851         }

  2853         accept(')', "Unclosed group");

  2854         flags = save;

  2856         // Check for quantifiers

  2857         Node node = closure(head);

  2858         if (node == head) { // No closure

  2859             root = tail;

  2860             return node;    // Dual return

  2861         }

  2862         if (head == tail) { // Zero length assertion

  2863             root = node;

  2864             return node;    // Dual return

  2865         }

  2867         if (node instanceof Ques) {

  2868             Ques ques = (Ques) node;

  2869             if (ques.type == POSSESSIVE) {

  2870                 root = node;

  2871                 return node;

  2872             }

  2873             tail.next = new BranchConn();

  2874             tail = tail.next;

  2875             if (ques.type == GREEDY) {

  2876                 head = new Branch(head, null, tail);

  2877             } else { // Reluctant quantifier

  2878                 head = new Branch(null, head, tail);

  2879             }

  2880             root = tail;

  2881             return head;

  2882         } else if (node instanceof Curly) {

  2883             Curly curly = (Curly) node;

  2884             if (curly.type == POSSESSIVE) {

  2885                 root = node;

  2886                 return node;

  2887             }

  2888             // Discover if the group is deterministic

  2889             TreeInfo info = new TreeInfo();

  2890             if (head.study(info)) { // Deterministic

  2891                 GroupTail temp = (GroupTail) tail;

  2892                 head = root = new GroupCurly(head.next, curly.cmin,

  2893                                    curly.cmax, curly.type,

  2894                                    ((GroupTail)tail).localIndex,

  2895                                    ((GroupTail)tail).groupIndex,

  2896                                              capturingGroup);

  2897                 return head;

  2898             } else { // Non-deterministic

  2899                 int temp = ((GroupHead) head).localIndex;

  2900                 Loop loop;

  2901                 if (curly.type == GREEDY)

  2902                     loop = new Loop(this.localCount, temp);

  2903                 else  // Reluctant Curly

  2904                     loop = new LazyLoop(this.localCount, temp);

  2905                 Prolog prolog = new Prolog(loop);

  2906                 this.localCount += 1;

  2907                 loop.cmin = curly.cmin;

  2908                 loop.cmax = curly.cmax;

  2909                 loop.body = head;

  2910                 tail.next = loop;

  2911                 root = loop;

  2912                 return prolog; // Dual return

  2913             }

  2914         }

  2915         throw error("Internal logic error");

  2916     }

  2918     /**

  2919      * Create group head and tail nodes using double return. If the group is

  2920      * created with anonymous true then it is a pure group and should not

  2921      * affect group counting.

  2922      */

  2923     private Node createGroup(boolean anonymous) {

  2924         int localIndex = localCount++;

  2925         int groupIndex = 0;

  2926         if (!anonymous)

  2927             groupIndex = capturingGroupCount++;

  2928         GroupHead head = new GroupHead(localIndex);

  2929         root = new GroupTail(localIndex, groupIndex);

  2930         if (!anonymous && groupIndex < 10)

  2931             groupNodes[groupIndex] = head;

  2932         return head;

  2933     }

  2935     /**

  2936      * Parses inlined match flags and set them appropriately.

  2937      */

  2938     private void addFlag() {

  2939         int ch = peek();

  2940         for (;;) {

  2941             switch (ch) {

  2942             case 'i':

  2943                 flags |= CASE_INSENSITIVE;

  2944                 break;

  2945             case 'm':

  2946                 flags |= MULTILINE;

  2947                 break;

  2948             case 's':

  2949                 flags |= DOTALL;

  2950                 break;

  2951             case 'd':

  2952                 flags |= UNIX_LINES;

  2953                 break;

  2954             case 'u':

  2955                 flags |= UNICODE_CASE;

  2956                 break;

  2957             case 'c':

  2958                 flags |= CANON_EQ;

  2959                 break;

  2960             case 'x':

  2961                 flags |= COMMENTS;

  2962                 break;

  2963             case 'U':

  2964                 flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE);

  2965                 break;

  2966             case '-': // subFlag then fall through

  2967                 ch = next();

  2968                 subFlag();

  2969             default:

  2970                 return;

  2971             }

  2972             ch = next();

  2973         }

  2974     }

  2976     /**

  2977      * Parses the second part of inlined match flags and turns off

  2978      * flags appropriately.

  2979      */

  2980     private void subFlag() {

  2981         int ch = peek();

  2982         for (;;) {

  2983             switch (ch) {

  2984             case 'i':

  2985                 flags &= ~CASE_INSENSITIVE;

  2986                 break;

  2987             case 'm':

  2988                 flags &= ~MULTILINE;

  2989                 break;

  2990             case 's':

  2991                 flags &= ~DOTALL;

  2992                 break;

  2993             case 'd':

  2994                 flags &= ~UNIX_LINES;

  2995                 break;

  2996             case 'u':

  2997                 flags &= ~UNICODE_CASE;

  2998                 break;

  2999             case 'c':

  3000                 flags &= ~CANON_EQ;

  3001                 break;

  3002             case 'x':

  3003                 flags &= ~COMMENTS;

  3004                 break;

  3005             case 'U':

  3006                 flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);

  3007             default:

  3008                 return;

  3009             }

  3010             ch = next();

  3011         }

  3012     }

  3014     static final int MAX_REPS   = 0x7FFFFFFF;

  3016     static final int GREEDY     = 0;

  3018     static final int LAZY       = 1;

  3020     static final int POSSESSIVE = 2;

  3022     static final int INDEPENDENT = 3;

  3024     /**

  3025      * Processes repetition. If the next character peeked is a quantifier

  3026      * then new nodes must be appended to handle the repetition.

  3027      * Prev could be a single or a group, so it could be a chain of nodes.

  3028      */

  3029     private Node closure(Node prev) {

  3030         Node atom;

  3031         int ch = peek();

  3032         switch (ch) {

  3033         case '?':

  3034             ch = next();

  3035             if (ch == '?') {

  3036                 next();

  3037                 return new Ques(prev, LAZY);

  3038             } else if (ch == '+') {

  3039                 next();

  3040                 return new Ques(prev, POSSESSIVE);

  3041             }

  3042             return new Ques(prev, GREEDY);

  3043         case '*':

  3044             ch = next();

  3045             if (ch == '?') {

  3046                 next();

  3047                 return new Curly(prev, 0, MAX_REPS, LAZY);

  3048             } else if (ch == '+') {

  3049                 next();

  3050                 return new Curly(prev, 0, MAX_REPS, POSSESSIVE);

  3051             }

  3052             return new Curly(prev, 0, MAX_REPS, GREEDY);

  3053         case '+':

  3054             ch = next();

  3055             if (ch == '?') {

  3056                 next();

  3057                 return new Curly(prev, 1, MAX_REPS, LAZY);

  3058             } else if (ch == '+') {

  3059                 next();

  3060                 return new Curly(prev, 1, MAX_REPS, POSSESSIVE);

  3061             }

  3062             return new Curly(prev, 1, MAX_REPS, GREEDY);

  3063         case '{':

  3064             ch = temp[cursor+1];

  3065             if (ASCII.isDigit(ch)) {

  3066                 skip();

  3067                 int cmin = 0;

  3068                 do {

  3069                     cmin = cmin * 10 + (ch - '0');

  3070                 } while (ASCII.isDigit(ch = read()));

  3071                 int cmax = cmin;

  3072                 if (ch == ',') {

  3073                     ch = read();

  3074                     cmax = MAX_REPS;

  3075                     if (ch != '}') {

  3076                         cmax = 0;

  3077                         while (ASCII.isDigit(ch)) {

  3078                             cmax = cmax * 10 + (ch - '0');

  3079                             ch = read();

  3080                         }

  3081                     }

  3082                 }

  3083                 if (ch != '}')

  3084                     throw error("Unclosed counted closure");

  3085                 if (((cmin) | (cmax) | (cmax - cmin)) < 0)

  3086                     throw error("Illegal repetition range");

  3087                 Curly curly;

  3088                 ch = peek();

  3089                 if (ch == '?') {

  3090                     next();

  3091                     curly = new Curly(prev, cmin, cmax, LAZY);

  3092                 } else if (ch == '+') {

  3093                     next();

  3094                     curly = new Curly(prev, cmin, cmax, POSSESSIVE);

  3095                 } else {

  3096                     curly = new Curly(prev, cmin, cmax, GREEDY);

  3097                 }

  3098                 return curly;

  3099             } else {

  3100                 throw error("Illegal repetition");

  3101             }

  3102         default:

  3103             return prev;

  3104         }

  3105     }

  3107     /**

  3108      *  Utility method for parsing control escape sequences.

  3109      */

  3110     private int c() {

  3111         if (cursor < patternLength) {

  3112             return read() ^ 64;

  3113         }

  3114         throw error("Illegal control escape sequence");

  3115     }

  3117     /**

  3118      *  Utility method for parsing octal escape sequences.

  3119      */

  3120     private int o() {

  3121         int n = read();

  3122         if (((n-'0')|('7'-n)) >= 0) {

  3123             int m = read();

  3124             if (((m-'0')|('7'-m)) >= 0) {

  3125                 int o = read();

  3126                 if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) {

  3127                     return (n - '0') * 64 + (m - '0') * 8 + (o - '0');

  3128                 }

  3129                 unread();

  3130                 return (n - '0') * 8 + (m - '0');

  3131             }

  3132             unread();

  3133             return (n - '0');

  3134         }

  3135         throw error("Illegal octal escape sequence");

  3136     }

  3138     /**

  3139      *  Utility method for parsing hexadecimal escape sequences.

  3140      */

  3141     private int x() {

  3142         int n = read();

  3143         if (ASCII.isHexDigit(n)) {

  3144             int m = read();

  3145             if (ASCII.isHexDigit(m)) {

  3146                 return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);

  3147             }

  3148         } else if (n == '{' && ASCII.isHexDigit(peek())) {

  3149             int ch = 0;

  3150             while (ASCII.isHexDigit(n = read())) {

  3151                 ch = (ch << 4) + ASCII.toDigit(n);

  3152                 if (ch > Character.MAX_CODE_POINT)

  3153                     throw error("Hexadecimal codepoint is too big");

  3154             }

  3155             if (n != '}')

  3156                 throw error("Unclosed hexadecimal escape sequence");

  3157             return ch;

  3158         }

  3159         throw error("Illegal hexadecimal escape sequence");

  3160     }

  3162     /**

  3163      *  Utility method for parsing unicode escape sequences.

  3164      */

  3165     private int cursor() {

  3166         return cursor;

  3167     }

  3169     private void setcursor(int pos) {

  3170         cursor = pos;

  3171     }

  3173     private int uxxxx() {

  3174         int n = 0;

  3175         for (int i = 0; i < 4; i++) {

  3176             int ch = read();

  3177             if (!ASCII.isHexDigit(ch)) {

  3178                 throw error("Illegal Unicode escape sequence");

  3179             }

  3180             n = n * 16 + ASCII.toDigit(ch);

  3181         }

  3182         return n;

  3183     }

  3185     private int u() {

  3186         int n = uxxxx();

  3187         if (Character.isHighSurrogate((char)n)) {

  3188             int cur = cursor();

  3189             if (read() == '\\' && read() == 'u') {

  3190                 int n2 = uxxxx();

  3191                 if (Character.isLowSurrogate((char)n2))

  3192                     return Character.toCodePoint((char)n, (char)n2);

  3193             }

  3194             setcursor(cur);

  3195         }

  3196         return n;

  3197     }

  3199     //

  3200     // Utility methods for code point support

  3201     //

  3203     private static final int countChars(CharSequence seq, int index,

  3204                                         int lengthInCodePoints) {

  3205         // optimization

  3206         if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {

  3207             assert (index >= 0 && index < seq.length());

  3208             return 1;

  3209         }

  3210         int length = seq.length();

  3211         int x = index;

  3212         if (lengthInCodePoints >= 0) {

  3213             assert (index >= 0 && index < length);

  3214             for (int i = 0; x < length && i < lengthInCodePoints; i++) {

  3215                 if (Character.isHighSurrogate(seq.charAt(x++))) {

  3216                     if (x < length && Character.isLowSurrogate(seq.charAt(x))) {

  3217                         x++;

  3218                     }

  3219                 }

  3220             }

  3221             return x - index;

  3222         }

  3224         assert (index >= 0 && index <= length);

  3225         if (index == 0) {

  3226             return 0;

  3227         }

  3228         int len = -lengthInCodePoints;

  3229         for (int i = 0; x > 0 && i < len; i++) {

  3230             if (Character.isLowSurrogate(seq.charAt(--x))) {

  3231                 if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) {

  3232                     x--;

  3233                 }

  3234             }

  3235         }

  3236         return index - x;

  3237     }

  3239     private static final int countCodePoints(CharSequence seq) {

  3240         int length = seq.length();

  3241         int n = 0;

  3242         for (int i = 0; i < length; ) {

  3243             n++;

  3244             if (Character.isHighSurrogate(seq.charAt(i++))) {

  3245                 if (i < length && Character.isLowSurrogate(seq.charAt(i))) {

  3246                     i++;

  3247                 }

  3248             }

  3249         }

  3250         return n;

  3251     }

  3253     /**

  3254      *  Creates a bit vector for matching Latin-1 values. A normal BitClass

  3255      *  never matches values above Latin-1, and a complemented BitClass always

  3256      *  matches values above Latin-1.

  3257      */

  3258     private static final class BitClass extends BmpCharProperty {

  3259         final boolean[] bits;

  3260         BitClass() { bits = new boolean[256]; }

  3261         private BitClass(boolean[] bits) { this.bits = bits; }

  3262         BitClass add(int c, int flags) {

  3263             assert c >= 0 && c <= 255;

  3264             if ((flags & CASE_INSENSITIVE) != 0) {

  3265                 if (ASCII.isAscii(c)) {

  3266                     bits[ASCII.toUpper(c)] = true;

  3267                     bits[ASCII.toLower(c)] = true;

  3268                 } else if ((flags & UNICODE_CASE) != 0) {

  3269                     bits[Character.toLowerCase(c)] = true;

  3270                     bits[Character.toUpperCase(c)] = true;

  3271                 }

  3272             }

  3273             bits[c] = true;

  3274             return this;

  3275         }

  3276         boolean isSatisfiedBy(int ch) {

  3277             return ch < 256 && bits[ch];

  3278         }

  3279     }

  3281     /**

  3282      *  Returns a suitably optimized, single character matcher.

  3283      */

  3284     private CharProperty newSingle(final int ch) {

  3285         if (has(CASE_INSENSITIVE)) {

  3286             int lower, upper;

  3287             if (has(UNICODE_CASE)) {

  3288                 upper = Character.toUpperCase(ch);

  3289                 lower = Character.toLowerCase(upper);

  3290                 if (upper != lower)

  3291                     return new SingleU(lower);

  3292             } else if (ASCII.isAscii(ch)) {

  3293                 lower = ASCII.toLower(ch);

  3294                 upper = ASCII.toUpper(ch);

  3295                 if (lower != upper)

  3296                     return new SingleI(lower, upper);

  3297             }

  3298         }

  3299         if (isSupplementary(ch))

  3300             return new SingleS(ch);    // Match a given Unicode character

  3301         return new Single(ch);         // Match a given BMP character

  3302     }

  3304     /**

  3305      *  Utility method for creating a string slice matcher.

  3306      */

  3307     private Node newSlice(int[] buf, int count, boolean hasSupplementary) {

  3308         int[] tmp = new int[count];

  3309         if (has(CASE_INSENSITIVE)) {

  3310             if (has(UNICODE_CASE)) {

  3311                 for (int i = 0; i < count; i++) {

  3312                     tmp[i] = Character.toLowerCase(

  3313                                  Character.toUpperCase(buf[i]));

  3314                 }

  3315                 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);

  3316             }

  3317             for (int i = 0; i < count; i++) {

  3318                 tmp[i] = ASCII.toLower(buf[i]);

  3319             }

  3320             return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);

  3321         }

  3322         for (int i = 0; i < count; i++) {

  3323             tmp[i] = buf[i];

  3324         }

  3325         return hasSupplementary ? new SliceS(tmp) : new Slice(tmp);

  3326     }

  3328     /**

  3329      * The following classes are the building components of the object

  3330      * tree that represents a compiled regular expression. The object tree

  3331      * is made of individual elements that handle constructs in the Pattern.

  3332      * Each type of object knows how to match its equivalent construct with

  3333      * the match() method.

  3334      */

  3336     /**

  3337      * Base class for all node classes. Subclasses should override the match()

  3338      * method as appropriate. This class is an accepting node, so its match()

  3339      * always returns true.

  3340      */

  3341     static class Node extends Object {

  3342         Node next;

  3343         Node() {

  3344             next = Pattern.accept;

  3345         }

  3346         /**

  3347          * This method implements the classic accept node.

  3348          */

  3349         boolean match(Matcher matcher, int i, CharSequence seq) {

  3350             matcher.last = i;

  3351             matcher.groups[0] = matcher.first;

  3352             matcher.groups[1] = matcher.last;

  3353             return true;

  3354         }

  3355         /**

  3356          * This method is good for all zero length assertions.

  3357          */

  3358         boolean study(TreeInfo info) {

  3359             if (next != null) {

  3360                 return next.study(info);

  3361             } else {

  3362                 return info.deterministic;

  3363             }

  3364         }

  3365     }

  3367     static class LastNode extends Node {

  3368         /**

  3369          * This method implements the classic accept node with

  3370          * the addition of a check to see if the match occurred

  3371          * using all of the input.

  3372          */

  3373         boolean match(Matcher matcher, int i, CharSequence seq) {

  3374             if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to)

  3375                 return false;

  3376             matcher.last = i;

  3377             matcher.groups[0] = matcher.first;

  3378             matcher.groups[1] = matcher.last;

  3379             return true;

  3380         }

  3381     }

  3383     /**

  3384      * Used for REs that can start anywhere within the input string.

  3385      * This basically tries to match repeatedly at each spot in the

  3386      * input string, moving forward after each try. An anchored search

  3387      * or a BnM will bypass this node completely.

  3388      */

  3389     static class Start extends Node {

  3390         int minLength;

  3391         Start(Node node) {

  3392             this.next = node;

  3393             TreeInfo info = new TreeInfo();

  3394             next.study(info);

  3395             minLength = info.minLength;

  3396         }

  3397         boolean match(Matcher matcher, int i, CharSequence seq) {

  3398             if (i > matcher.to - minLength) {

  3399                 matcher.hitEnd = true;

  3400                 return false;

  3401             }

  3402             int guard = matcher.to - minLength;

  3403             for (; i <= guard; i++) {

  3404                 if (next.match(matcher, i, seq)) {

  3405                     matcher.first = i;

  3406                     matcher.groups[0] = matcher.first;

  3407                     matcher.groups[1] = matcher.last;

  3408                     return true;

  3409                 }

  3410             }

  3411             matcher.hitEnd = true;

  3412             return false;

  3413         }

  3414         boolean study(TreeInfo info) {

  3415             next.study(info);

  3416             info.maxValid = false;

  3417             info.deterministic = false;

  3418             return false;

  3419         }

  3420     }

  3422     /*

  3423      * StartS supports supplementary characters, including unpaired surrogates.

  3424      */

  3425     static final class StartS extends Start {

  3426         StartS(Node node) {

  3427             super(node);

  3428         }

  3429         boolean match(Matcher matcher, int i, CharSequence seq) {

  3430             if (i > matcher.to - minLength) {

  3431                 matcher.hitEnd = true;

  3432                 return false;

  3433             }

  3434             int guard = matcher.to - minLength;

  3435             while (i <= guard) {

  3436                 //if ((ret = next.match(matcher, i, seq)) || i == guard)

  3437                 if (next.match(matcher, i, seq)) {

  3438                     matcher.first = i;

  3439                     matcher.groups[0] = matcher.first;

  3440                     matcher.groups[1] = matcher.last;

  3441                     return true;

  3442                 }

  3443                 if (i == guard)

  3444                     break;

  3445                 // Optimization to move to the next character. This is

  3446                 // faster than countChars(seq, i, 1).

  3447                 if (Character.isHighSurrogate(seq.charAt(i++))) {

  3448                     if (i < seq.length() &&

  3449                         Character.isLowSurrogate(seq.charAt(i))) {

  3450                         i++;

  3451                     }

  3452                 }

  3453             }

  3454             matcher.hitEnd = true;

  3455             return false;

  3456         }

  3457     }

  3459     /**

  3460      * Node to anchor at the beginning of input. This object implements the

  3461      * match for a \A sequence, and the caret anchor will use this if not in

  3462      * multiline mode.

  3463      */

  3464     static final class Begin extends Node {

  3465         boolean match(Matcher matcher, int i, CharSequence seq) {

  3466             int fromIndex = (matcher.anchoringBounds) ?

  3467                 matcher.from : 0;

  3468             if (i == fromIndex && next.match(matcher, i, seq)) {

  3469                 matcher.first = i;

  3470                 matcher.groups[0] = i;

  3471                 matcher.groups[1] = matcher.last;

  3472                 return true;

  3473             } else {

  3474                 return false;

  3475             }

  3476         }

  3477     }

  3479     /**

  3480      * Node to anchor at the end of input. This is the absolute end, so this

  3481      * should not match at the last newline before the end as $ will.

  3482      */

  3483     static final class End extends Node {

  3484         boolean match(Matcher matcher, int i, CharSequence seq) {

  3485             int endIndex = (matcher.anchoringBounds) ?

  3486                 matcher.to : matcher.getTextLength();

  3487             if (i == endIndex) {

  3488                 matcher.hitEnd = true;

  3489                 return next.match(matcher, i, seq);

  3490             }

  3491             return false;

  3492         }

  3493     }

  3495     /**

  3496      * Node to anchor at the beginning of a line. This is essentially the

  3497      * object to match for the multiline ^.

  3498      */

  3499     static final class Caret extends Node {

  3500         boolean match(Matcher matcher, int i, CharSequence seq) {

  3501             int startIndex = matcher.from;

  3502             int endIndex = matcher.to;

  3503             if (!matcher.anchoringBounds) {

  3504                 startIndex = 0;

  3505                 endIndex = matcher.getTextLength();

  3506             }

  3507             // Perl does not match ^ at end of input even after newline

  3508             if (i == endIndex) {

  3509                 matcher.hitEnd = true;

  3510                 return false;

  3511             }

  3512             if (i > startIndex) {

  3513                 char ch = seq.charAt(i-1);

  3514                 if (ch != '\n' && ch != '\r'

  3515                     && (ch|1) != '\u2029'

  3516                     && ch != '\u0085' ) {

  3517                     return false;

  3518                 }

  3519                 // Should treat /r/n as one newline

  3520                 if (ch == '\r' && seq.charAt(i) == '\n')

  3521                     return false;

  3522             }

  3523             return next.match(matcher, i, seq);

  3524         }

  3525     }

  3527     /**

  3528      * Node to anchor at the beginning of a line when in unixdot mode.

  3529      */

  3530     static final class UnixCaret extends Node {

  3531         boolean match(Matcher matcher, int i, CharSequence seq) {

  3532             int startIndex = matcher.from;

  3533             int endIndex = matcher.to;

  3534             if (!matcher.anchoringBounds) {

  3535                 startIndex = 0;

  3536                 endIndex = matcher.getTextLength();

  3537             }

  3538             // Perl does not match ^ at end of input even after newline

  3539             if (i == endIndex) {

  3540                 matcher.hitEnd = true;

  3541                 return false;

  3542             }

  3543             if (i > startIndex) {

  3544                 char ch = seq.charAt(i-1);

  3545                 if (ch != '\n') {

  3546                     return false;

  3547                 }

  3548             }

  3549             return next.match(matcher, i, seq);

  3550         }

  3551     }

  3553     /**

  3554      * Node to match the location where the last match ended.

  3555      * This is used for the \G construct.

  3556      */

  3557     static final class LastMatch extends Node {

  3558         boolean match(Matcher matcher, int i, CharSequence seq) {

  3559             if (i != matcher.oldLast)

  3560                 return false;

  3561             return next.match(matcher, i, seq);

  3562         }

  3563     }

  3565     /**

  3566      * Node to anchor at the end of a line or the end of input based on the

  3567      * multiline mode.

  3568      *

  3569      * When not in multiline mode, the $ can only match at the very end

  3570      * of the input, unless the input ends in a line terminator in which

  3571      * it matches right before the last line terminator.

  3572      *

  3573      * Note that \r\n is considered an atomic line terminator.

  3574      *

  3575      * Like ^ the $ operator matches at a position, it does not match the

  3576      * line terminators themselves.

  3577      */

  3578     static final class Dollar extends Node {

  3579         boolean multiline;

  3580         Dollar(boolean mul) {

  3581             multiline = mul;

  3582         }

  3583         boolean match(Matcher matcher, int i, CharSequence seq) {

  3584             int endIndex = (matcher.anchoringBounds) ?

  3585                 matcher.to : matcher.getTextLength();

  3586             if (!multiline) {

  3587                 if (i < endIndex - 2)

  3588                     return false;

  3589                 if (i == endIndex - 2) {

  3590                     char ch = seq.charAt(i);

  3591                     if (ch != '\r')

  3592                         return false;

  3593                     ch = seq.charAt(i + 1);

  3594                     if (ch != '\n')

  3595                         return false;

  3596                 }

  3597             }

  3598             // Matches before any line terminator; also matches at the

  3599             // end of input

  3600             // Before line terminator:

  3601             // If multiline, we match here no matter what

  3602             // If not multiline, fall through so that the end

  3603             // is marked as hit; this must be a /r/n or a /n

  3604             // at the very end so the end was hit; more input

  3605             // could make this not match here

  3606             if (i < endIndex) {

  3607                 char ch = seq.charAt(i);

  3608                  if (ch == '\n') {

  3609                      // No match between \r\n

  3610                      if (i > 0 && seq.charAt(i-1) == '\r')

  3611                          return false;

  3612                      if (multiline)

  3613                          return next.match(matcher, i, seq);

  3614                  } else if (ch == '\r' || ch == '\u0085' ||

  3615                             (ch|1) == '\u2029') {

  3616                      if (multiline)

  3617                          return next.match(matcher, i, seq);

  3618                  } else { // No line terminator, no match

  3619                      return false;

  3620                  }

  3621             }

  3622             // Matched at current end so hit end

  3623             matcher.hitEnd = true;

  3624             // If a $ matches because of end of input, then more input

  3625             // could cause it to fail!

  3626             matcher.requireEnd = true;

  3627             return next.match(matcher, i, seq);

  3628         }

  3629         boolean study(TreeInfo info) {

  3630             next.study(info);

  3631             return info.deterministic;

  3632         }

  3633     }

  3635     /**

  3636      * Node to anchor at the end of a line or the end of input based on the

  3637      * multiline mode when in unix lines mode.

  3638      */

  3639     static final class UnixDollar extends Node {

  3640         boolean multiline;

  3641         UnixDollar(boolean mul) {

  3642             multiline = mul;

  3643         }

  3644         boolean match(Matcher matcher, int i, CharSequence seq) {

  3645             int endIndex = (matcher.anchoringBounds) ?

  3646                 matcher.to : matcher.getTextLength();

  3647             if (i < endIndex) {

  3648                 char ch = seq.charAt(i);

  3649                 if (ch == '\n') {

  3650                     // If not multiline, then only possible to

  3651                     // match at very end or one before end

  3652                     if (multiline == false && i != endIndex - 1)

  3653                         return false;

  3654                     // If multiline return next.match without setting

  3655                     // matcher.hitEnd

  3656                     if (multiline)

  3657                         return next.match(matcher, i, seq);

  3658                 } else {

  3659                     return false;

  3660                 }

  3661             }

  3662             // Matching because at the end or 1 before the end;

  3663             // more input could change this so set hitEnd

  3664             matcher.hitEnd = true;

  3665             // If a $ matches because of end of input, then more input

  3666             // could cause it to fail!

  3667             matcher.requireEnd = true;

  3668             return next.match(matcher, i, seq);

  3669         }

  3670         boolean study(TreeInfo info) {

  3671             next.study(info);

  3672             return info.deterministic;

  3673         }

  3674     }

  3676     /**

  3677      * Abstract node class to match one character satisfying some

  3678      * boolean property.

  3679      */

  3680     private static abstract class CharProperty extends Node {

  3681         abstract boolean isSatisfiedBy(int ch);

  3682         CharProperty complement() {

  3683             return new CharProperty() {

  3684                     boolean isSatisfiedBy(int ch) {

  3685                         return ! CharProperty.this.isSatisfiedBy(ch);}};

  3686         }

  3687         boolean match(Matcher matcher, int i, CharSequence seq) {

  3688             if (i < matcher.to) {

  3689                 int ch = Character.codePointAt(seq, i);

  3690                 return isSatisfiedBy(ch)

  3691                     && next.match(matcher, i+Character.charCount(ch), seq);

  3692             } else {

  3693                 matcher.hitEnd = true;

  3694                 return false;

  3695             }

  3696         }

  3697         boolean study(TreeInfo info) {

  3698             info.minLength++;

  3699             info.maxLength++;

  3700             return next.study(info);

  3701         }

  3702     }

  3704     /**

  3705      * Optimized version of CharProperty that works only for

  3706      * properties never satisfied by Supplementary characters.

  3707      */

  3708     private static abstract class BmpCharProperty extends CharProperty {

  3709         boolean match(Matcher matcher, int i, CharSequence seq) {

  3710             if (i < matcher.to) {

  3711                 return isSatisfiedBy(seq.charAt(i))

  3712                     && next.match(matcher, i+1, seq);

  3713             } else {

  3714                 matcher.hitEnd = true;

  3715                 return false;

  3716             }

  3717         }

  3718     }

  3720     /**

  3721      * Node class that matches a Supplementary Unicode character

  3722      */

  3723     static final class SingleS extends CharProperty {

  3724         final int c;

  3725         SingleS(int c) { this.c = c; }

  3726         boolean isSatisfiedBy(int ch) {

  3727             return ch == c;

  3728         }

  3729     }

  3731     /**

  3732      * Optimization -- matches a given BMP character

  3733      */

  3734     static final class Single extends BmpCharProperty {

  3735         final int c;

  3736         Single(int c) { this.c = c; }

  3737         boolean isSatisfiedBy(int ch) {

  3738             return ch == c;

  3739         }

  3740     }

  3742     /**

  3743      * Case insensitive matches a given BMP character

  3744      */

  3745     static final class SingleI extends BmpCharProperty {

  3746         final int lower;

  3747         final int upper;

  3748         SingleI(int lower, int upper) {

  3749             this.lower = lower;

  3750             this.upper = upper;

  3751         }

  3752         boolean isSatisfiedBy(int ch) {

  3753             return ch == lower || ch == upper;

  3754         }

  3755     }

  3757     /**

  3758      * Unicode case insensitive matches a given Unicode character

  3759      */

  3760     static final class SingleU extends CharProperty {

  3761         final int lower;

  3762         SingleU(int lower) {

  3763             this.lower = lower;

  3764         }

  3765         boolean isSatisfiedBy(int ch) {

  3766             return lower == ch ||

  3767                 lower == Character.toLowerCase(Character.toUpperCase(ch));

  3768         }

  3769     }

  3772     /**

  3773      * Node class that matches a Unicode block.

  3774      *

  3775     static final class Block extends CharProperty {

  3776         final Character.UnicodeBlock block;

  3777         Block(Character.UnicodeBlock block) {

  3778             this.block = block;

  3779         }

  3780         boolean isSatisfiedBy(int ch) {

  3781             return block == Character.UnicodeBlock.of(ch);

  3782         }

  3783     }

  3785     /**

  3786      * Node class that matches a Unicode script

  3787      *

  3788     static final class Script extends CharProperty {

  3789         final Character.UnicodeScript script;

  3790         Script(Character.UnicodeScript script) {

  3791             this.script = script;

  3792         }

  3793         boolean isSatisfiedBy(int ch) {

  3794             return script == Character.UnicodeScript.of(ch);

  3795         }

  3796     }

  3798     /**

  3799      * Node class that matches a Unicode category.

  3800      */

  3801     static final class Category extends CharProperty {

  3802         final int typeMask;

  3803         Category(int typeMask) { this.typeMask = typeMask; }

  3804         boolean isSatisfiedBy(int ch) {

  3805             return (typeMask & (1 << Character.getType(ch))) != 0;

  3806         }

  3807     }

  3809     /**

  3810      * Node class that matches a Unicode "type"

  3811      */

  3812     static final class Utype extends CharProperty {

  3813         final UnicodeProp uprop;

  3814         Utype(UnicodeProp uprop) { this.uprop = uprop; }

  3815         boolean isSatisfiedBy(int ch) {

  3816             return uprop.is(ch);

  3817         }

  3818     }

  3821     /**

  3822      * Node class that matches a POSIX type.

  3823      */

  3824     static final class Ctype extends BmpCharProperty {

  3825         final int ctype;

  3826         Ctype(int ctype) { this.ctype = ctype; }

  3827         boolean isSatisfiedBy(int ch) {

  3828             return ch < 128 && ASCII.isType(ch, ctype);

  3829         }

  3830     }

  3832     /**

  3833      * Base class for all Slice nodes

  3834      */

  3835     static class SliceNode extends Node {

  3836         int[] buffer;

  3837         SliceNode(int[] buf) {

  3838             buffer = buf;

  3839         }

  3840         boolean study(TreeInfo info) {

  3841             info.minLength += buffer.length;

  3842             info.maxLength += buffer.length;

  3843             return next.study(info);

  3844         }

  3845     }

  3847     /**

  3848      * Node class for a case sensitive/BMP-only sequence of literal

  3849      * characters.

  3850      */

  3851     static final class Slice extends SliceNode {

  3852         Slice(int[] buf) {

  3853             super(buf);

  3854         }

  3855         boolean match(Matcher matcher, int i, CharSequence seq) {

  3856             int[] buf = buffer;

  3857             int len = buf.length;

  3858             for (int j=0; j<len; j++) {

  3859                 if ((i+j) >= matcher.to) {

  3860                     matcher.hitEnd = true;

  3861                     return false;

  3862                 }

  3863                 if (buf[j] != seq.charAt(i+j))

  3864                     return false;

  3865             }

  3866             return next.match(matcher, i+len, seq);

  3867         }

  3868     }

  3870     /**

  3871      * Node class for a case_insensitive/BMP-only sequence of literal

  3872      * characters.

  3873      */

  3874     static class SliceI extends SliceNode {

  3875         SliceI(int[] buf) {

  3876             super(buf);

  3877         }

  3878         boolean match(Matcher matcher, int i, CharSequence seq) {

  3879             int[] buf = buffer;

  3880             int len = buf.length;

  3881             for (int j=0; j<len; j++) {

  3882                 if ((i+j) >= matcher.to) {

  3883                     matcher.hitEnd = true;

  3884                     return false;

  3885                 }

  3886                 int c = seq.charAt(i+j);

  3887                 if (buf[j] != c &&

  3888                     buf[j] != ASCII.toLower(c))

  3889                     return false;

  3890             }

  3891             return next.match(matcher, i+len, seq);

  3892         }

  3893     }

  3895     /**

  3896      * Node class for a unicode_case_insensitive/BMP-only sequence of

  3897      * literal characters. Uses unicode case folding.

  3898      */

  3899     static final class SliceU extends SliceNode {

  3900         SliceU(int[] buf) {

  3901             super(buf);

  3902         }

  3903         boolean match(Matcher matcher, int i, CharSequence seq) {

  3904             int[] buf = buffer;

  3905             int len = buf.length;

  3906             for (int j=0; j<len; j++) {

  3907                 if ((i+j) >= matcher.to) {

  3908                     matcher.hitEnd = true;

  3909                     return false;

  3910                 }

  3911                 int c = seq.charAt(i+j);

  3912                 if (buf[j] != c &&

  3913                     buf[j] != Character.toLowerCase(Character.toUpperCase(c)))

  3914                     return false;

  3915             }

  3916             return next.match(matcher, i+len, seq);

  3917         }

  3918     }

  3920     /**

  3921      * Node class for a case sensitive sequence of literal characters

  3922      * including supplementary characters.

  3923      */

  3924     static final class SliceS extends SliceNode {

  3925         SliceS(int[] buf) {

  3926             super(buf);

  3927         }

  3928         boolean match(Matcher matcher, int i, CharSequence seq) {

  3929             int[] buf = buffer;

  3930             int x = i;

  3931             for (int j = 0; j < buf.length; j++) {

  3932                 if (x >= matcher.to) {

  3933                     matcher.hitEnd = true;

  3934                     return false;

  3935                 }

  3936                 int c = Character.codePointAt(seq, x);

  3937                 if (buf[j] != c)

  3938                     return false;

  3939                 x += Character.charCount(c);

  3940                 if (x > matcher.to) {

  3941                     matcher.hitEnd = true;

  3942                     return false;

  3943                 }

  3944             }

  3945             return next.match(matcher, x, seq);

  3946         }

  3947     }

  3949     /**

  3950      * Node class for a case insensitive sequence of literal characters

  3951      * including supplementary characters.

  3952      */

  3953     static class SliceIS extends SliceNode {

  3954         SliceIS(int[] buf) {

  3955             super(buf);

  3956         }

  3957         int toLower(int c) {

  3958             return ASCII.toLower(c);

  3959         }

  3960         boolean match(Matcher matcher, int i, CharSequence seq) {

  3961             int[] buf = buffer;

  3962             int x = i;

  3963             for (int j = 0; j < buf.length; j++) {

  3964                 if (x >= matcher.to) {

  3965                     matcher.hitEnd = true;

  3966                     return false;

  3967                 }

  3968                 int c = Character.codePointAt(seq, x);

  3969                 if (buf[j] != c && buf[j] != toLower(c))

  3970                     return false;

  3971                 x += Character.charCount(c);

  3972                 if (x > matcher.to) {

  3973                     matcher.hitEnd = true;

  3974                     return false;

  3975                 }

  3976             }

  3977             return next.match(matcher, x, seq);

  3978         }

  3979     }

  3981     /**

  3982      * Node class for a case insensitive sequence of literal characters.

  3983      * Uses unicode case folding.

  3984      */

  3985     static final class SliceUS extends SliceIS {

  3986         SliceUS(int[] buf) {

  3987             super(buf);

  3988         }

  3989         int toLower(int c) {

  3990             return Character.toLowerCase(Character.toUpperCase(c));

  3991         }

  3992     }

  3994     private static boolean inRange(int lower, int ch, int upper) {

  3995         return lower <= ch && ch <= upper;

  3996     }

  3998     /**

  3999      * Returns node for matching characters within an explicit value range.

  4000      */

  4001     private static CharProperty rangeFor(final int lower,

  4002                                          final int upper) {

  4003         return new CharProperty() {

  4004                 boolean isSatisfiedBy(int ch) {

  4005                     return inRange(lower, ch, upper);}};

  4006     }

  4008     /**

  4009      * Returns node for matching characters within an explicit value

  4010      * range in a case insensitive manner.

  4011      */

  4012     private CharProperty caseInsensitiveRangeFor(final int lower,

  4013                                                  final int upper) {

  4014         if (has(UNICODE_CASE))

  4015             return new CharProperty() {

  4016                 boolean isSatisfiedBy(int ch) {

  4017                     if (inRange(lower, ch, upper))

  4018                         return true;

  4019                     int up = Character.toUpperCase(ch);

  4020                     return inRange(lower, up, upper) ||

  4021                            inRange(lower, Character.toLowerCase(up), upper);}};

  4022         return new CharProperty() {

  4023             boolean isSatisfiedBy(int ch) {

  4024                 return inRange(lower, ch, upper) ||

  4025                     ASCII.isAscii(ch) &&

  4026                         (inRange(lower, ASCII.toUpper(ch), upper) ||

  4027                          inRange(lower, ASCII.toLower(ch), upper));

  4028             }};

  4029     }

  4031     /**

  4032      * Implements the Unicode category ALL and the dot metacharacter when

  4033      * in dotall mode.

  4034      */

  4035     static final class All extends CharProperty {

  4036         boolean isSatisfiedBy(int ch) {

  4037             return true;

  4038         }

  4039     }

  4041     /**

  4042      * Node class for the dot metacharacter when dotall is not enabled.

  4043      */

  4044     static final class Dot extends CharProperty {

  4045         boolean isSatisfiedBy(int ch) {

  4046             return (ch != '\n' && ch != '\r'

  4047                     && (ch|1) != '\u2029'

  4048                     && ch != '\u0085');

  4049         }

  4050     }

  4052     /**

  4053      * Node class for the dot metacharacter when dotall is not enabled

  4054      * but UNIX_LINES is enabled.

  4055      */

  4056     static final class UnixDot extends CharProperty {

  4057         boolean isSatisfiedBy(int ch) {

  4058             return ch != '\n';

  4059         }

  4060     }

  4062     /**

  4063      * The 0 or 1 quantifier. This one class implements all three types.

  4064      */

  4065     static final class Ques extends Node {

  4066         Node atom;

  4067         int type;

  4068         Ques(Node node, int type) {

  4069             this.atom = node;

  4070             this.type = type;

  4071         }

  4072         boolean match(Matcher matcher, int i, CharSequence seq) {

  4073             switch (type) {

  4074             case GREEDY:

  4075                 return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))

  4076                     || next.match(matcher, i, seq);

  4077             case LAZY:

  4078                 return next.match(matcher, i, seq)

  4079                     || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));

  4080             case POSSESSIVE:

  4081                 if (atom.match(matcher, i, seq)) i = matcher.last;

  4082                 return next.match(matcher, i, seq);

  4083             default:

  4084                 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);

  4085             }

  4086         }

  4087         boolean study(TreeInfo info) {

  4088             if (type != INDEPENDENT) {

  4089                 int minL = info.minLength;

  4090                 atom.study(info);

  4091                 info.minLength = minL;

  4092                 info.deterministic = false;

  4093                 return next.study(info);

  4094             } else {

  4095                 atom.study(info);

  4096                 return next.study(info);

  4097             }

  4098         }

  4099     }

  4101     /**

  4102      * Handles the curly-brace style repetition with a specified minimum and

  4103      * maximum occurrences. The * quantifier is handled as a special case.

  4104      * This class handles the three types.

  4105      */

  4106     static final class Curly extends Node {

  4107         Node atom;

  4108         int type;

  4109         int cmin;

  4110         int cmax;

  4112         Curly(Node node, int cmin, int cmax, int type) {

  4113             this.atom = node;

  4114             this.type = type;

  4115             this.cmin = cmin;

  4116             this.cmax = cmax;

  4117         }

  4118         boolean match(Matcher matcher, int i, CharSequence seq) {

  4119             int j;

  4120             for (j = 0; j < cmin; j++) {

  4121                 if (atom.match(matcher, i, seq)) {

  4122                     i = matcher.last;

  4123                     continue;

  4124                 }

  4125                 return false;

  4126             }

  4127             if (type == GREEDY)

  4128                 return match0(matcher, i, j, seq);

  4129             else if (type == LAZY)

  4130                 return match1(matcher, i, j, seq);

  4131             else

  4132                 return match2(matcher, i, j, seq);

  4133         }

  4134         // Greedy match.

  4135         // i is the index to start matching at

  4136         // j is the number of atoms that have matched

  4137         boolean match0(Matcher matcher, int i, int j, CharSequence seq) {

  4138             if (j >= cmax) {

  4139                 // We have matched the maximum... continue with the rest of

  4140                 // the regular expression

  4141                 return next.match(matcher, i, seq);

  4142             }

  4143             int backLimit = j;

  4144             while (atom.match(matcher, i, seq)) {

  4145                 // k is the length of this match

  4146                 int k = matcher.last - i;

  4147                 if (k == 0) // Zero length match

  4148                     break;

  4149                 // Move up index and number matched

  4150                 i = matcher.last;

  4151                 j++;

  4152                 // We are greedy so match as many as we can

  4153                 while (j < cmax) {

  4154                     if (!atom.match(matcher, i, seq))

  4155                         break;

  4156                     if (i + k != matcher.last) {

  4157                         if (match0(matcher, matcher.last, j+1, seq))

  4158                             return true;

  4159                         break;

  4160                     }

  4161                     i += k;

  4162                     j++;

  4163                 }

  4164                 // Handle backing off if match fails

  4165                 while (j >= backLimit) {

  4166                    if (next.match(matcher, i, seq))

  4167                         return true;

  4168                     i -= k;

  4169                     j--;

  4170                 }

  4171                 return false;

  4172             }

  4173             return next.match(matcher, i, seq);

  4174         }

  4175         // Reluctant match. At this point, the minimum has been satisfied.

  4176         // i is the index to start matching at

  4177         // j is the number of atoms that have matched

  4178         boolean match1(Matcher matcher, int i, int j, CharSequence seq) {

  4179             for (;;) {

  4180                 // Try finishing match without consuming any more

  4181                 if (next.match(matcher, i, seq))

  4182                     return true;

  4183                 // At the maximum, no match found

  4184                 if (j >= cmax)

  4185                     return false;

  4186                 // Okay, must try one more atom

  4187                 if (!atom.match(matcher, i, seq))

  4188                     return false;

  4189                 // If we haven't moved forward then must break out

  4190                 if (i == matcher.last)

  4191                     return false;

  4192                 // Move up index and number matched

  4193                 i = matcher.last;

  4194                 j++;

  4195             }

  4196         }

  4197         boolean match2(Matcher matcher, int i, int j, CharSequence seq) {

  4198             for (; j < cmax; j++) {

  4199                 if (!atom.match(matcher, i, seq))

  4200                     break;

  4201                 if (i == matcher.last)

  4202                     break;

  4203                 i = matcher.last;

  4204             }

  4205             return next.match(matcher, i, seq);

  4206         }

  4207         boolean study(TreeInfo info) {

  4208             // Save original info

  4209             int minL = info.minLength;

  4210             int maxL = info.maxLength;

  4211             boolean maxV = info.maxValid;

  4212             boolean detm = info.deterministic;

  4213             info.reset();

  4215             atom.study(info);

  4217             int temp = info.minLength * cmin + minL;

  4218             if (temp < minL) {

  4219                 temp = 0xFFFFFFF; // arbitrary large number

  4220             }

  4221             info.minLength = temp;

  4223             if (maxV & info.maxValid) {

  4224                 temp = info.maxLength * cmax + maxL;

  4225                 info.maxLength = temp;

  4226                 if (temp < maxL) {

  4227                     info.maxValid = false;

  4228                 }

  4229             } else {

  4230                 info.maxValid = false;

  4231             }

  4233             if (info.deterministic && cmin == cmax)

  4234                 info.deterministic = detm;

  4235             else

  4236                 info.deterministic = false;

  4238             return next.study(info);

  4239         }

  4240     }

  4242     /**

  4243      * Handles the curly-brace style repetition with a specified minimum and

  4244      * maximum occurrences in deterministic cases. This is an iterative

  4245      * optimization over the Prolog and Loop system which would handle this

  4246      * in a recursive way. The * quantifier is handled as a special case.

  4247      * If capture is true then this class saves group settings and ensures

  4248      * that groups are unset when backing off of a group match.

  4249      */

  4250     static final class GroupCurly extends Node {

  4251         Node atom;

  4252         int type;

  4253         int cmin;

  4254         int cmax;

  4255         int localIndex;

  4256         int groupIndex;

  4257         boolean capture;

  4259         GroupCurly(Node node, int cmin, int cmax, int type, int local,

  4260                    int group, boolean capture) {

  4261             this.atom = node;

  4262             this.type = type;

  4263             this.cmin = cmin;

  4264             this.cmax = cmax;

  4265             this.localIndex = local;

  4266             this.groupIndex = group;

  4267             this.capture = capture;

  4268         }

  4269         boolean match(Matcher matcher, int i, CharSequence seq) {

  4270             int[] groups = matcher.groups;

  4271             int[] locals = matcher.locals;

  4272             int save0 = locals[localIndex];

  4273             int save1 = 0;

  4274             int save2 = 0;

  4276             if (capture) {

  4277                 save1 = groups[groupIndex];

  4278                 save2 = groups[groupIndex+1];

  4279             }

  4281             // Notify GroupTail there is no need to setup group info

  4282             // because it will be set here

  4283             locals[localIndex] = -1;

  4285             boolean ret = true;

  4286             for (int j = 0; j < cmin; j++) {

  4287                 if (atom.match(matcher, i, seq)) {

  4288                     if (capture) {

  4289                         groups[groupIndex] = i;

  4290                         groups[groupIndex+1] = matcher.last;

  4291                     }

  4292                     i = matcher.last;

  4293                 } else {

  4294                     ret = false;

  4295                     break;

  4296                 }

  4297             }

  4298             if (ret) {

  4299                 if (type == GREEDY) {

  4300                     ret = match0(matcher, i, cmin, seq);

  4301                 } else if (type == LAZY) {

  4302                     ret = match1(matcher, i, cmin, seq);

  4303                 } else {

  4304                     ret = match2(matcher, i, cmin, seq);

  4305                 }

  4306             }

  4307             if (!ret) {

  4308                 locals[localIndex] = save0;

  4309                 if (capture) {

  4310                     groups[groupIndex] = save1;

  4311                     groups[groupIndex+1] = save2;

  4312                 }

  4313             }

  4314             return ret;

  4315         }

  4316         // Aggressive group match

  4317         boolean match0(Matcher matcher, int i, int j, CharSequence seq) {

  4318             int[] groups = matcher.groups;

  4319             int save0 = 0;

  4320             int save1 = 0;

  4321             if (capture) {

  4322                 save0 = groups[groupIndex];

  4323                 save1 = groups[groupIndex+1];

  4324             }

  4325             for (;;) {

  4326                 if (j >= cmax)

  4327                     break;

  4328                 if (!atom.match(matcher, i, seq))

  4329                     break;

  4330                 int k = matcher.last - i;

  4331                 if (k <= 0) {

  4332                     if (capture) {

  4333                         groups[groupIndex] = i;

  4334                         groups[groupIndex+1] = i + k;

  4335                     }

  4336                     i = i + k;

  4337                     break;

  4338                 }

  4339                 for (;;) {

  4340                     if (capture) {

  4341                         groups[groupIndex] = i;

  4342                         groups[groupIndex+1] = i + k;

  4343                     }

  4344                     i = i + k;

  4345                     if (++j >= cmax)

  4346                         break;

  4347                     if (!atom.match(matcher, i, seq))

  4348                         break;

  4349                     if (i + k != matcher.last) {

  4350                         if (match0(matcher, i, j, seq))

  4351                             return true;

  4352                         break;

  4353                     }

  4354                 }

  4355                 while (j > cmin) {

  4356                     if (next.match(matcher, i, seq)) {

  4357                         if (capture) {

  4358                             groups[groupIndex+1] = i;

  4359                             groups[groupIndex] = i - k;

  4360                         }

  4361                         i = i - k;

  4362                         return true;

  4363                     }

  4364                     // backing off

  4365                     if (capture) {

  4366                         groups[groupIndex+1] = i;

  4367                         groups[groupIndex] = i - k;

  4368                     }

  4369                     i = i - k;

  4370                     j--;

  4371                 }

  4372                 break;

  4373             }

  4374             if (capture) {

  4375                 groups[groupIndex] = save0;

  4376                 groups[groupIndex+1] = save1;

  4377             }

  4378             return next.match(matcher, i, seq);

  4379         }

  4380         // Reluctant matching

  4381         boolean match1(Matcher matcher, int i, int j, CharSequence seq) {

  4382             for (;;) {

  4383                 if (next.match(matcher, i, seq))

  4384                     return true;

  4385                 if (j >= cmax)

  4386                     return false;

  4387                 if (!atom.match(matcher, i, seq))

  4388                     return false;

  4389                 if (i == matcher.last)

  4390                     return false;

  4391                 if (capture) {

  4392                     matcher.groups[groupIndex] = i;

  4393                     matcher.groups[groupIndex+1] = matcher.last;

  4394                 }

  4395                 i = matcher.last;

  4396                 j++;

  4397             }

  4398         }

  4399         // Possessive matching

  4400         boolean match2(Matcher matcher, int i, int j, CharSequence seq) {

  4401             for (; j < cmax; j++) {

  4402                 if (!atom.match(matcher, i, seq)) {

  4403                     break;

  4404                 }

  4405                 if (capture) {

  4406                     matcher.groups[groupIndex] = i;

  4407                     matcher.groups[groupIndex+1] = matcher.last;

  4408                 }

  4409                 if (i == matcher.last) {

  4410                     break;

  4411                 }

  4412                 i = matcher.last;

  4413             }

  4414             return next.match(matcher, i, seq);

  4415         }

  4416         boolean study(TreeInfo info) {

  4417             // Save original info

  4418             int minL = info.minLength;

  4419             int maxL = info.maxLength;

  4420             boolean maxV = info.maxValid;

  4421             boolean detm = info.deterministic;

  4422             info.reset();

  4424             atom.study(info);

  4426             int temp = info.minLength * cmin + minL;

  4427             if (temp < minL) {

  4428                 temp = 0xFFFFFFF; // Arbitrary large number

  4429             }

  4430             info.minLength = temp;

  4432             if (maxV & info.maxValid) {

  4433                 temp = info.maxLength * cmax + maxL;

  4434                 info.maxLength = temp;

  4435                 if (temp < maxL) {

  4436                     info.maxValid = false;

  4437                 }

  4438             } else {

  4439                 info.maxValid = false;

  4440             }

  4442             if (info.deterministic && cmin == cmax) {

  4443                 info.deterministic = detm;

  4444             } else {

  4445                 info.deterministic = false;

  4446             }

  4448             return next.study(info);

  4449         }

  4450     }

  4452     /**

  4453      * A Guard node at the end of each atom node in a Branch. It

  4454      * serves the purpose of chaining the "match" operation to

  4455      * "next" but not the "study", so we can collect the TreeInfo

  4456      * of each atom node without including the TreeInfo of the

  4457      * "next".

  4458      */

  4459     static final class BranchConn extends Node {

  4460         BranchConn() {};

  4461         boolean match(Matcher matcher, int i, CharSequence seq) {

  4462             return next.match(matcher, i, seq);

  4463         }

  4464         boolean study(TreeInfo info) {

  4465             return info.deterministic;

  4466         }

  4467     }

  4469     /**

  4470      * Handles the branching of alternations. Note this is also used for

  4471      * the ? quantifier to branch between the case where it matches once

  4472      * and where it does not occur.

  4473      */

  4474     static final class Branch extends Node {

  4475         Node[] atoms = new Node[2];

  4476         int size = 2;

  4477         Node conn;

  4478         Branch(Node first, Node second, Node branchConn) {

  4479             conn = branchConn;

  4480             atoms[0] = first;

  4481             atoms[1] = second;

  4482         }

  4484         void add(Node node) {

  4485             if (size >= atoms.length) {

  4486                 Node[] tmp = new Node[atoms.length*2];

  4487                 System.arraycopy(atoms, 0, tmp, 0, atoms.length);

  4488                 atoms = tmp;

  4489             }

  4490             atoms[size++] = node;

  4491         }

  4493         boolean match(Matcher matcher, int i, CharSequence seq) {

  4494             for (int n = 0; n < size; n++) {

  4495                 if (atoms[n] == null) {

  4496                     if (conn.next.match(matcher, i, seq))

  4497                         return true;

  4498                 } else if (atoms[n].match(matcher, i, seq)) {

  4499                     return true;

  4500                 }

  4501             }

  4502             return false;

  4503         }

  4505         boolean study(TreeInfo info) {

  4506             int minL = info.minLength;

  4507             int maxL = info.maxLength;

  4508             boolean maxV = info.maxValid;

  4510             int minL2 = Integer.MAX_VALUE; //arbitrary large enough num

  4511             int maxL2 = -1;

  4512             for (int n = 0; n < size; n++) {

  4513                 info.reset();

  4514                 if (atoms[n] != null)

  4515                     atoms[n].study(info);

  4516                 minL2 = Math.min(minL2, info.minLength);

  4517                 maxL2 = Math.max(maxL2, info.maxLength);

  4518                 maxV = (maxV & info.maxValid);

  4519             }

  4521             minL += minL2;

  4522             maxL += maxL2;

  4524             info.reset();

  4525             conn.next.study(info);

  4527             info.minLength += minL;

  4528             info.maxLength += maxL;

  4529             info.maxValid &= maxV;

  4530             info.deterministic = false;

  4531             return false;

  4532         }

  4533     }

  4535     /**

  4536      * The GroupHead saves the location where the group begins in the locals

  4537      * and restores them when the match is done.

  4538      *

  4539      * The matchRef is used when a reference to this group is accessed later

  4540      * in the expression. The locals will have a negative value in them to

  4541      * indicate that we do not want to unset the group if the reference

  4542      * doesn't match.

  4543      */

  4544     static final class GroupHead extends Node {

  4545         int localIndex;

  4546         GroupHead(int localCount) {

  4547             localIndex = localCount;

  4548         }

  4549         boolean match(Matcher matcher, int i, CharSequence seq) {

  4550             int save = matcher.locals[localIndex];

  4551             matcher.locals[localIndex] = i;

  4552             boolean ret = next.match(matcher, i, seq);

  4553             matcher.locals[localIndex] = save;

  4554             return ret;

  4555         }

  4556         boolean matchRef(Matcher matcher, int i, CharSequence seq) {

  4557             int save = matcher.locals[localIndex];

  4558             matcher.locals[localIndex] = ~i; // HACK

  4559             boolean ret = next.match(matcher, i, seq);

  4560             matcher.locals[localIndex] = save;

  4561             return ret;

  4562         }

  4563     }

  4565     /**

  4566      * Recursive reference to a group in the regular expression. It calls

  4567      * matchRef because if the reference fails to match we would not unset

  4568      * the group.

  4569      */

  4570     static final class GroupRef extends Node {

  4571         GroupHead head;

  4572         GroupRef(GroupHead head) {

  4573             this.head = head;

  4574         }

  4575         boolean match(Matcher matcher, int i, CharSequence seq) {

  4576             return head.matchRef(matcher, i, seq)

  4577                 && next.match(matcher, matcher.last, seq);

  4578         }

  4579         boolean study(TreeInfo info) {

  4580             info.maxValid = false;

  4581             info.deterministic = false;

  4582             return next.study(info);

  4583         }

  4584     }

  4586     /**

  4587      * The GroupTail handles the setting of group beginning and ending

  4588      * locations when groups are successfully matched. It must also be able to

  4589      * unset groups that have to be backed off of.

  4590      *

  4591      * The GroupTail node is also used when a previous group is referenced,

  4592      * and in that case no group information needs to be set.

  4593      */

  4594     static final class GroupTail extends Node {

  4595         int localIndex;

  4596         int groupIndex;

  4597         GroupTail(int localCount, int groupCount) {

  4598             localIndex = localCount;

  4599             groupIndex = groupCount + groupCount;

  4600         }

  4601         boolean match(Matcher matcher, int i, CharSequence seq) {

  4602             int tmp = matcher.locals[localIndex];

  4603             if (tmp >= 0) { // This is the normal group case.

  4604                 // Save the group so we can unset it if it

  4605                 // backs off of a match.

  4606                 int groupStart = matcher.groups[groupIndex];

  4607                 int groupEnd = matcher.groups[groupIndex+1];

  4609                 matcher.groups[groupIndex] = tmp;

  4610                 matcher.groups[groupIndex+1] = i;

  4611                 if (next.match(matcher, i, seq)) {

  4612                     return true;

  4613                 }

  4614                 matcher.groups[groupIndex] = groupStart;

  4615                 matcher.groups[groupIndex+1] = groupEnd;

  4616                 return false;

  4617             } else {

  4618                 // This is a group reference case. We don't need to save any

  4619                 // group info because it isn't really a group.

  4620                 matcher.last = i;

  4621                 return true;

  4622             }

  4623         }

  4624     }

  4626     /**

  4627      * This sets up a loop to handle a recursive quantifier structure.

  4628      */

  4629     static final class Prolog extends Node {

  4630         Loop loop;

  4631         Prolog(Loop loop) {

  4632             this.loop = loop;

  4633         }

  4634         boolean match(Matcher matcher, int i, CharSequence seq) {

  4635             return loop.matchInit(matcher, i, seq);

  4636         }

  4637         boolean study(TreeInfo info) {

  4638             return loop.study(info);

  4639         }

  4640     }

  4642     /**

  4643      * Handles the repetition count for a greedy Curly. The matchInit

  4644      * is called from the Prolog to save the index of where the group

  4645      * beginning is stored. A zero length group check occurs in the

  4646      * normal match but is skipped in the matchInit.

  4647      */

  4648     static class Loop extends Node {

  4649         Node body;

  4650         int countIndex; // local count index in matcher locals

  4651         int beginIndex; // group beginning index

  4652         int cmin, cmax;

  4653         Loop(int countIndex, int beginIndex) {

  4654             this.countIndex = countIndex;

  4655             this.beginIndex = beginIndex;

  4656         }

  4657         boolean match(Matcher matcher, int i, CharSequence seq) {

  4658             // Avoid infinite loop in zero-length case.

  4659             if (i > matcher.locals[beginIndex]) {

  4660                 int count = matcher.locals[countIndex];

  4662                 // This block is for before we reach the minimum

  4663                 // iterations required for the loop to match

  4664                 if (count < cmin) {

  4665                     matcher.locals[countIndex] = count + 1;

  4666                     boolean b = body.match(matcher, i, seq);

  4667                     // If match failed we must backtrack, so

  4668                     // the loop count should NOT be incremented

  4669                     if (!b)

  4670                         matcher.locals[countIndex] = count;

  4671                     // Return success or failure since we are under

  4672                     // minimum

  4673                     return b;

  4674                 }

  4675                 // This block is for after we have the minimum

  4676                 // iterations required for the loop to match

  4677                 if (count < cmax) {

  4678                     matcher.locals[countIndex] = count + 1;

  4679                     boolean b = body.match(matcher, i, seq);

  4680                     // If match failed we must backtrack, so

  4681                     // the loop count should NOT be incremented

  4682                     if (!b)

  4683                         matcher.locals[countIndex] = count;

  4684                     else

  4685                         return true;

  4686                 }

  4687             }

  4688             return next.match(matcher, i, seq);

  4689         }

  4690         boolean matchInit(Matcher matcher, int i, CharSequence seq) {

  4691             int save = matcher.locals[countIndex];

  4692             boolean ret = false;

  4693             if (0 < cmin) {

  4694                 matcher.locals[countIndex] = 1;

  4695                 ret = body.match(matcher, i, seq);

  4696             } else if (0 < cmax) {

  4697                 matcher.locals[countIndex] = 1;

  4698                 ret = body.match(matcher, i, seq);

  4699                 if (ret == false)

  4700                     ret = next.match(matcher, i, seq);

  4701             } else {

  4702                 ret = next.match(matcher, i, seq);

  4703             }

  4704             matcher.locals[countIndex] = save;

  4705             return ret;

  4706         }

  4707         boolean study(TreeInfo info) {

  4708             info.maxValid = false;

  4709             info.deterministic = false;

  4710             return false;

  4711         }

  4712     }

  4714     /**

  4715      * Handles the repetition count for a reluctant Curly. The matchInit

  4716      * is called from the Prolog to save the index of where the group

  4717      * beginning is stored. A zero length group check occurs in the

  4718      * normal match but is skipped in the matchInit.

  4719      */

  4720     static final class LazyLoop extends Loop {

  4721         LazyLoop(int countIndex, int beginIndex) {

  4722             super(countIndex, beginIndex);

  4723         }

  4724         boolean match(Matcher matcher, int i, CharSequence seq) {

  4725             // Check for zero length group

  4726             if (i > matcher.locals[beginIndex]) {

  4727                 int count = matcher.locals[countIndex];

  4728                 if (count < cmin) {

  4729                     matcher.locals[countIndex] = count + 1;

  4730                     boolean result = body.match(matcher, i, seq);

  4731                     // If match failed we must backtrack, so

  4732                     // the loop count should NOT be incremented

  4733                     if (!result)

  4734                         matcher.locals[countIndex] = count;

  4735                     return result;

  4736                 }

  4737                 if (next.match(matcher, i, seq))

  4738                     return true;

  4739                 if (count < cmax) {

  4740                     matcher.locals[countIndex] = count + 1;

  4741                     boolean result = body.match(matcher, i, seq);

  4742                     // If match failed we must backtrack, so

  4743                     // the loop count should NOT be incremented

  4744                     if (!result)

  4745                         matcher.locals[countIndex] = count;

  4746                     return result;

  4747                 }

  4748                 return false;

  4749             }

  4750             return next.match(matcher, i, seq);

  4751         }

  4752         boolean matchInit(Matcher matcher, int i, CharSequence seq) {

  4753             int save = matcher.locals[countIndex];

  4754             boolean ret = false;

  4755             if (0 < cmin) {

  4756                 matcher.locals[countIndex] = 1;

  4757                 ret = body.match(matcher, i, seq);

  4758             } else if (next.match(matcher, i, seq)) {

  4759                 ret = true;

  4760             } else if (0 < cmax) {

  4761                 matcher.locals[countIndex] = 1;

  4762                 ret = body.match(matcher, i, seq);

  4763             }

  4764             matcher.locals[countIndex] = save;

  4765             return ret;

  4766         }

  4767         boolean study(TreeInfo info) {

  4768             info.maxValid = false;

  4769             info.deterministic = false;

  4770             return false;

  4771         }

  4772     }

  4774     /**

  4775      * Refers to a group in the regular expression. Attempts to match

  4776      * whatever the group referred to last matched.

  4777      */

  4778     static class BackRef extends Node {

  4779         int groupIndex;

  4780         BackRef(int groupCount) {

  4781             super();

  4782             groupIndex = groupCount + groupCount;

  4783         }

  4784         boolean match(Matcher matcher, int i, CharSequence seq) {

  4785             int j = matcher.groups[groupIndex];

  4786             int k = matcher.groups[groupIndex+1];

  4788             int groupSize = k - j;

  4790             // If the referenced group didn't match, neither can this

  4791             if (j < 0)

  4792                 return false;

  4794             // If there isn't enough input left no match

  4795             if (i + groupSize > matcher.to) {

  4796                 matcher.hitEnd = true;

  4797                 return false;

  4798             }

  4800             // Check each new char to make sure it matches what the group

  4801             // referenced matched last time around

  4802             for (int index=0; index<groupSize; index++)

  4803                 if (seq.charAt(i+index) != seq.charAt(j+index))

  4804                     return false;

  4806             return next.match(matcher, i+groupSize, seq);

  4807         }

  4808         boolean study(TreeInfo info) {

  4809             info.maxValid = false;

  4810             return next.study(info);

  4811         }

  4812     }

  4814     static class CIBackRef extends Node {

  4815         int groupIndex;

  4816         boolean doUnicodeCase;

  4817         CIBackRef(int groupCount, boolean doUnicodeCase) {

  4818             super();

  4819             groupIndex = groupCount + groupCount;

  4820             this.doUnicodeCase = doUnicodeCase;

  4821         }

  4822         boolean match(Matcher matcher, int i, CharSequence seq) {

  4823             int j = matcher.groups[groupIndex];

  4824             int k = matcher.groups[groupIndex+1];

  4826             int groupSize = k - j;

  4828             // If the referenced group didn't match, neither can this

  4829             if (j < 0)

  4830                 return false;

  4832             // If there isn't enough input left no match

  4833             if (i + groupSize > matcher.to) {

  4834                 matcher.hitEnd = true;

  4835                 return false;

  4836             }

  4838             // Check each new char to make sure it matches what the group

  4839             // referenced matched last time around

  4840             int x = i;

  4841             for (int index=0; index<groupSize; index++) {

  4842                 int c1 = Character.codePointAt(seq, x);

  4843                 int c2 = Character.codePointAt(seq, j);

  4844                 if (c1 != c2) {

  4845                     if (doUnicodeCase) {

  4846                         int cc1 = Character.toUpperCase(c1);

  4847                         int cc2 = Character.toUpperCase(c2);

  4848                         if (cc1 != cc2 &&

  4849                             Character.toLowerCase(cc1) !=

  4850                             Character.toLowerCase(cc2))

  4851                             return false;

  4852                     } else {

  4853                         if (ASCII.toLower(c1) != ASCII.toLower(c2))

  4854                             return false;

  4855                     }

  4856                 }

  4857                 x += Character.charCount(c1);

  4858                 j += Character.charCount(c2);

  4859             }

  4861             return next.match(matcher, i+groupSize, seq);

  4862         }

  4863         boolean study(TreeInfo info) {

  4864             info.maxValid = false;

  4865             return next.study(info);

  4866         }

  4867     }

  4869     /**

  4870      * Searches until the next instance of its atom. This is useful for

  4871      * finding the atom efficiently without passing an instance of it

  4872      * (greedy problem) and without a lot of wasted search time (reluctant

  4873      * problem).

  4874      */

  4875     static final class First extends Node {

  4876         Node atom;

  4877         First(Node node) {

  4878             this.atom = BnM.optimize(node);

  4879         }

  4880         boolean match(Matcher matcher, int i, CharSequence seq) {

  4881             if (atom instanceof BnM) {

  4882                 return atom.match(matcher, i, seq)

  4883                     && next.match(matcher, matcher.last, seq);

  4884             }

  4885             for (;;) {

  4886                 if (i > matcher.to) {

  4887                     matcher.hitEnd = true;

  4888                     return false;

  4889                 }

  4890                 if (atom.match(matcher, i, seq)) {

  4891                     return next.match(matcher, matcher.last, seq);

  4892                 }

  4893                 i += countChars(seq, i, 1);

  4894                 matcher.first++;

  4895             }

  4896         }

  4897         boolean study(TreeInfo info) {

  4898             atom.study(info);

  4899             info.maxValid = false;

  4900             info.deterministic = false;

  4901             return next.study(info);

  4902         }

  4903     }

  4905     static final class Conditional extends Node {

  4906         Node cond, yes, not;

  4907         Conditional(Node cond, Node yes, Node not) {

  4908             this.cond = cond;

  4909             this.yes = yes;

  4910             this.not = not;

  4911         }

  4912         boolean match(Matcher matcher, int i, CharSequence seq) {

  4913             if (cond.match(matcher, i, seq)) {

  4914                 return yes.match(matcher, i, seq);

  4915             } else {

  4916                 return not.match(matcher, i, seq);

  4917             }

  4918         }

  4919         boolean study(TreeInfo info) {

  4920             int minL = info.minLength;

  4921             int maxL = info.maxLength;

  4922             boolean maxV = info.maxValid;

  4923             info.reset();

  4924             yes.study(info);

  4926             int minL2 = info.minLength;

  4927             int maxL2 = info.maxLength;

  4928             boolean maxV2 = info.maxValid;

  4929             info.reset();

  4930             not.study(info);

  4932             info.minLength = minL + Math.min(minL2, info.minLength);

  4933             info.maxLength = maxL + Math.max(maxL2, info.maxLength);

  4934             info.maxValid = (maxV & maxV2 & info.maxValid);

  4935             info.deterministic = false;

  4936             return next.study(info);

  4937         }

  4938     }

  4940     /**

  4941      * Zero width positive lookahead.

  4942      */

  4943     static final class Pos extends Node {

  4944         Node cond;

  4945         Pos(Node cond) {

  4946             this.cond = cond;

  4947         }

  4948         boolean match(Matcher matcher, int i, CharSequence seq) {

  4949             int savedTo = matcher.to;

  4950             boolean conditionMatched = false;

  4952             // Relax transparent region boundaries for lookahead

  4953             if (matcher.transparentBounds)

  4954                 matcher.to = matcher.getTextLength();

  4955             try {

  4956                 conditionMatched = cond.match(matcher, i, seq);

  4957             } finally {

  4958                 // Reinstate region boundaries

  4959                 matcher.to = savedTo;

  4960             }

  4961             return conditionMatched && next.match(matcher, i, seq);

  4962         }

  4963     }

  4965     /**

  4966      * Zero width negative lookahead.

  4967      */

  4968     static final class Neg extends Node {

  4969         Node cond;

  4970         Neg(Node cond) {

  4971             this.cond = cond;

  4972         }

  4973         boolean match(Matcher matcher, int i, CharSequence seq) {

  4974             int savedTo = matcher.to;

  4975             boolean conditionMatched = false;

  4977             // Relax transparent region boundaries for lookahead

  4978             if (matcher.transparentBounds)

  4979                 matcher.to = matcher.getTextLength();

  4980             try {

  4981                 if (i < matcher.to) {

  4982                     conditionMatched = !cond.match(matcher, i, seq);

  4983                 } else {

  4984                     // If a negative lookahead succeeds then more input

  4985                     // could cause it to fail!

  4986                     matcher.requireEnd = true;

  4987                     conditionMatched = !cond.match(matcher, i, seq);

  4988                 }

  4989             } finally {

  4990                 // Reinstate region boundaries

  4991                 matcher.to = savedTo;

  4992             }

  4993             return conditionMatched && next.match(matcher, i, seq);

  4994         }

  4995     }

  4997     /**

  4998      * For use with lookbehinds; matches the position where the lookbehind

  4999      * was encountered.

  5000      */

  5001     static Node lookbehindEnd = new Node() {

  5002         boolean match(Matcher matcher, int i, CharSequence seq) {

  5003             return i == matcher.lookbehindTo;

  5004         }

  5005     };

  5007     /**

  5008      * Zero width positive lookbehind.

  5009      */

  5010     static class Behind extends Node {

  5011         Node cond;

  5012         int rmax, rmin;

  5013         Behind(Node cond, int rmax, int rmin) {

  5014             this.cond = cond;

  5015             this.rmax = rmax;

  5016             this.rmin = rmin;

  5017         }

  5019         boolean match(Matcher matcher, int i, CharSequence seq) {

  5020             int savedFrom = matcher.from;

  5021             boolean conditionMatched = false;

  5022             int startIndex = (!matcher.transparentBounds) ?

  5023                              matcher.from : 0;

  5024             int from = Math.max(i - rmax, startIndex);

  5025             // Set end boundary

  5026             int savedLBT = matcher.lookbehindTo;

  5027             matcher.lookbehindTo = i;

  5028             // Relax transparent region boundaries for lookbehind

  5029             if (matcher.transparentBounds)

  5030                 matcher.from = 0;

  5031             for (int j = i - rmin; !conditionMatched && j >= from; j--) {

  5032                 conditionMatched = cond.match(matcher, j, seq);

  5033             }

  5034             matcher.from = savedFrom;

  5035             matcher.lookbehindTo = savedLBT;

  5036             return conditionMatched && next.match(matcher, i, seq);

  5037         }

  5038     }

  5040     /**

  5041      * Zero width positive lookbehind, including supplementary

  5042      * characters or unpaired surrogates.

  5043      */

  5044     static final class BehindS extends Behind {

  5045         BehindS(Node cond, int rmax, int rmin) {

  5046             super(cond, rmax, rmin);

  5047         }

  5048         boolean match(Matcher matcher, int i, CharSequence seq) {

  5049             int rmaxChars = countChars(seq, i, -rmax);

  5050             int rminChars = countChars(seq, i, -rmin);

  5051             int savedFrom = matcher.from;

  5052             int startIndex = (!matcher.transparentBounds) ?

  5053                              matcher.from : 0;

  5054             boolean conditionMatched = false;

  5055             int from = Math.max(i - rmaxChars, startIndex);

  5056             // Set end boundary

  5057             int savedLBT = matcher.lookbehindTo;

  5058             matcher.lookbehindTo = i;

  5059             // Relax transparent region boundaries for lookbehind

  5060             if (matcher.transparentBounds)

  5061                 matcher.from = 0;

  5063             for (int j = i - rminChars;

  5064                  !conditionMatched && j >= from;

  5065                  j -= j>from ? countChars(seq, j, -1) : 1) {

  5066                 conditionMatched = cond.match(matcher, j, seq);

  5067             }

  5068             matcher.from = savedFrom;

  5069             matcher.lookbehindTo = savedLBT;

  5070             return conditionMatched && next.match(matcher, i, seq);

  5071         }

  5072     }

  5074     /**

  5075      * Zero width negative lookbehind.

  5076      */

  5077     static class NotBehind extends Node {

  5078         Node cond;

  5079         int rmax, rmin;

  5080         NotBehind(Node cond, int rmax, int rmin) {

  5081             this.cond = cond;

  5082             this.rmax = rmax;

  5083             this.rmin = rmin;

  5084         }

  5086         boolean match(Matcher matcher, int i, CharSequence seq) {

  5087             int savedLBT = matcher.lookbehindTo;

  5088             int savedFrom = matcher.from;

  5089             boolean conditionMatched = false;

  5090             int startIndex = (!matcher.transparentBounds) ?

  5091                              matcher.from : 0;

  5092             int from = Math.max(i - rmax, startIndex);

  5093             matcher.lookbehindTo = i;

  5094             // Relax transparent region boundaries for lookbehind

  5095             if (matcher.transparentBounds)

  5096                 matcher.from = 0;

  5097             for (int j = i - rmin; !conditionMatched && j >= from; j--) {

  5098                 conditionMatched = cond.match(matcher, j, seq);

  5099             }

  5100             // Reinstate region boundaries

  5101             matcher.from = savedFrom;

  5102             matcher.lookbehindTo = savedLBT;

  5103             return !conditionMatched && next.match(matcher, i, seq);

  5104         }

  5105     }

  5107     /**

  5108      * Zero width negative lookbehind, including supplementary

  5109      * characters or unpaired surrogates.

  5110      */

  5111     static final class NotBehindS extends NotBehind {

  5112         NotBehindS(Node cond, int rmax, int rmin) {

  5113             super(cond, rmax, rmin);

  5114         }

  5115         boolean match(Matcher matcher, int i, CharSequence seq) {

  5116             int rmaxChars = countChars(seq, i, -rmax);

  5117             int rminChars = countChars(seq, i, -rmin);

  5118             int savedFrom = matcher.from;

  5119             int savedLBT = matcher.lookbehindTo;

  5120             boolean conditionMatched = false;

  5121             int startIndex = (!matcher.transparentBounds) ?

  5122                              matcher.from : 0;

  5123             int from = Math.max(i - rmaxChars, startIndex);

  5124             matcher.lookbehindTo = i;

  5125             // Relax transparent region boundaries for lookbehind

  5126             if (matcher.transparentBounds)

  5127                 matcher.from = 0;

  5128             for (int j = i - rminChars;

  5129                  !conditionMatched && j >= from;

  5130                  j -= j>from ? countChars(seq, j, -1) : 1) {

  5131                 conditionMatched = cond.match(matcher, j, seq);

  5132             }

  5133             //Reinstate region boundaries

  5134             matcher.from = savedFrom;

  5135             matcher.lookbehindTo = savedLBT;

  5136             return !conditionMatched && next.match(matcher, i, seq);

  5137         }

  5138     }

  5140     /**

  5141      * Returns the set union of two CharProperty nodes.

  5142      */

  5143     private static CharProperty union(final CharProperty lhs,

  5144                                       final CharProperty rhs) {

  5145         return new CharProperty() {

  5146                 boolean isSatisfiedBy(int ch) {

  5147                     return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};

  5148     }

  5150     /**

  5151      * Returns the set intersection of two CharProperty nodes.

  5152      */

  5153     private static CharProperty intersection(final CharProperty lhs,

  5154                                              final CharProperty rhs) {

  5155         return new CharProperty() {

  5156                 boolean isSatisfiedBy(int ch) {

  5157                     return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};

  5158     }

  5160     /**

  5161      * Returns the set difference of two CharProperty nodes.

  5162      */

  5163     private static CharProperty setDifference(final CharProperty lhs,

  5164                                               final CharProperty rhs) {

  5165         return new CharProperty() {

  5166                 boolean isSatisfiedBy(int ch) {

  5167                     return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};

  5168     }

  5170     /**

  5171      * Handles word boundaries. Includes a field to allow this one class to

  5172      * deal with the different types of word boundaries we can match. The word

  5173      * characters include underscores, letters, and digits. Non spacing marks

  5174      * can are also part of a word if they have a base character, otherwise

  5175      * they are ignored for purposes of finding word boundaries.

  5176      */

  5177     static final class Bound extends Node {

  5178         static int LEFT = 0x1;

  5179         static int RIGHT= 0x2;

  5180         static int BOTH = 0x3;

  5181         static int NONE = 0x4;

  5182         int type;

  5183         boolean useUWORD;

  5184         Bound(int n, boolean useUWORD) {

  5185             type = n;

  5186             this.useUWORD = useUWORD;

  5187         }

  5189         boolean isWord(int ch) {

  5190             return useUWORD ? UnicodeProp.WORD.is(ch)

  5191                             : (ch == '_' || Character.isLetterOrDigit(ch));

  5192         }

  5194         int check(Matcher matcher, int i, CharSequence seq) {

  5195             int ch;

  5196             boolean left = false;

  5197             int startIndex = matcher.from;

  5198             int endIndex = matcher.to;

  5199             if (matcher.transparentBounds) {

  5200                 startIndex = 0;

  5201                 endIndex = matcher.getTextLength();

  5202             }

  5203             if (i > startIndex) {

  5204                 ch = Character.codePointBefore(seq, i);

  5205                 left = (isWord(ch) ||

  5206                     ((Character.getType(ch) == Character.NON_SPACING_MARK)

  5207                      && hasBaseCharacter(matcher, i-1, seq)));

  5208             }

  5209             boolean right = false;

  5210             if (i < endIndex) {

  5211                 ch = Character.codePointAt(seq, i);

  5212                 right = (isWord(ch) ||

  5213                     ((Character.getType(ch) == Character.NON_SPACING_MARK)

  5214                      && hasBaseCharacter(matcher, i, seq)));

  5215             } else {

  5216                 // Tried to access char past the end

  5217                 matcher.hitEnd = true;

  5218                 // The addition of another char could wreck a boundary

  5219                 matcher.requireEnd = true;

  5220             }

  5221             return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE);

  5222         }

  5223         boolean match(Matcher matcher, int i, CharSequence seq) {

  5224             return (check(matcher, i, seq) & type) > 0

  5225                 && next.match(matcher, i, seq);

  5226         }

  5227     }

  5229     /**

  5230      * Non spacing marks only count as word characters in bounds calculations

  5231      * if they have a base character.

  5232      */

  5233     private static boolean hasBaseCharacter(Matcher matcher, int i,

  5234                                             CharSequence seq)

  5235     {

  5236         int start = (!matcher.transparentBounds) ?

  5237             matcher.from : 0;

  5238         for (int x=i; x >= start; x--) {

  5239             int ch = Character.codePointAt(seq, x);

  5240             if (Character.isLetterOrDigit(ch))

  5241                 return true;

  5242             if (Character.getType(ch) == Character.NON_SPACING_MARK)

  5243                 continue;

  5244             return false;

  5245         }

  5246         return false;

  5247     }

  5249     /**

  5250      * Attempts to match a slice in the input using the Boyer-Moore string

  5251      * matching algorithm. The algorithm is based on the idea that the

  5252      * pattern can be shifted farther ahead in the search text if it is

  5253      * matched right to left.

  5254      * <p>

  5255      * The pattern is compared to the input one character at a time, from

  5256      * the rightmost character in the pattern to the left. If the characters

  5257      * all match the pattern has been found. If a character does not match,

  5258      * the pattern is shifted right a distance that is the maximum of two

  5259      * functions, the bad character shift and the good suffix shift. This

  5260      * shift moves the attempted match position through the input more

  5261      * quickly than a naive one position at a time check.

  5262      * <p>

  5263      * The bad character shift is based on the character from the text that

  5264      * did not match. If the character does not appear in the pattern, the

  5265      * pattern can be shifted completely beyond the bad character. If the

  5266      * character does occur in the pattern, the pattern can be shifted to

  5267      * line the pattern up with the next occurrence of that character.

  5268      * <p>

  5269      * The good suffix shift is based on the idea that some subset on the right

  5270      * side of the pattern has matched. When a bad character is found, the

  5271      * pattern can be shifted right by the pattern length if the subset does

  5272      * not occur again in pattern, or by the amount of distance to the

  5273      * next occurrence of the subset in the pattern.

  5274      *

  5275      * Boyer-Moore search methods adapted from code by Amy Yu.

  5276      */

  5277     static class BnM extends Node {

  5278         int[] buffer;

  5279         int[] lastOcc;

  5280         int[] optoSft;

  5282         /**

  5283          * Pre calculates arrays needed to generate the bad character

  5284          * shift and the good suffix shift. Only the last seven bits

  5285          * are used to see if chars match; This keeps the tables small

  5286          * and covers the heavily used ASCII range, but occasionally

  5287          * results in an aliased match for the bad character shift.

  5288          */

  5289         static Node optimize(Node node) {

  5290             if (!(node instanceof Slice)) {

  5291                 return node;

  5292             }

  5294             int[] src = ((Slice) node).buffer;

  5295             int patternLength = src.length;

  5296             // The BM algorithm requires a bit of overhead;

  5297             // If the pattern is short don't use it, since

  5298             // a shift larger than the pattern length cannot

  5299             // be used anyway.

  5300             if (patternLength < 4) {

  5301                 return node;

  5302             }

  5303             int i, j, k;

  5304             int[] lastOcc = new int[128];

  5305             int[] optoSft = new int[patternLength];

  5306             // Precalculate part of the bad character shift

  5307             // It is a table for where in the pattern each

  5308             // lower 7-bit value occurs

  5309             for (i = 0; i < patternLength; i++) {

  5310                 lastOcc[src[i]&0x7F] = i + 1;

  5311             }

  5312             // Precalculate the good suffix shift

  5313             // i is the shift amount being considered

  5314 NEXT:       for (i = patternLength; i > 0; i--) {

  5315                 // j is the beginning index of suffix being considered

  5316                 for (j = patternLength - 1; j >= i; j--) {

  5317                     // Testing for good suffix

  5318                     if (src[j] == src[j-i]) {

  5319                         // src[j..len] is a good suffix

  5320                         optoSft[j-1] = i;

  5321                     } else {

  5322                         // No match. The array has already been

  5323                         // filled up with correct values before.

  5324                         continue NEXT;

  5325                     }

  5326                 }

  5327                 // This fills up the remaining of optoSft

  5328                 // any suffix can not have larger shift amount

  5329                 // then its sub-suffix. Why???

  5330                 while (j > 0) {

  5331                     optoSft[--j] = i;

  5332                 }

  5333             }

  5334             // Set the guard value because of unicode compression

  5335             optoSft[patternLength-1] = 1;

  5336             if (node instanceof SliceS)

  5337                 return new BnMS(src, lastOcc, optoSft, node.next);

  5338             return new BnM(src, lastOcc, optoSft, node.next);

  5339         }

  5340         BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) {

  5341             this.buffer = src;

  5342             this.lastOcc = lastOcc;

  5343             this.optoSft = optoSft;

  5344             this.next = next;

  5345         }

  5346         boolean match(Matcher matcher, int i, CharSequence seq) {

  5347             int[] src = buffer;

  5348             int patternLength = src.length;

  5349             int last = matcher.to - patternLength;

  5351             // Loop over all possible match positions in text

  5352 NEXT:       while (i <= last) {

  5353                 // Loop over pattern from right to left

  5354                 for (int j = patternLength - 1; j >= 0; j--) {

  5355                     int ch = seq.charAt(i+j);

  5356                     if (ch != src[j]) {

  5357                         // Shift search to the right by the maximum of the

  5358                         // bad character shift and the good suffix shift

  5359                         i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]);

  5360                         continue NEXT;

  5361                     }

  5362                 }

  5363                 // Entire pattern matched starting at i

  5364                 matcher.first = i;

  5365                 boolean ret = next.match(matcher, i + patternLength, seq);

  5366                 if (ret) {

  5367                     matcher.first = i;

  5368                     matcher.groups[0] = matcher.first;

  5369                     matcher.groups[1] = matcher.last;

  5370                     return true;

  5371                 }

  5372                 i++;

  5373             }

  5374             // BnM is only used as the leading node in the unanchored case,

  5375             // and it replaced its Start() which always searches to the end

  5376             // if it doesn't find what it's looking for, so hitEnd is true.

  5377             matcher.hitEnd = true;

  5378             return false;

  5379         }

  5380         boolean study(TreeInfo info) {

  5381             info.minLength += buffer.length;

  5382             info.maxValid = false;

  5383             return next.study(info);

  5384         }

  5385     }

  5387     /**

  5388      * Supplementary support version of BnM(). Unpaired surrogates are

  5389      * also handled by this class.

  5390      */

  5391     static final class BnMS extends BnM {

  5392         int lengthInChars;

  5394         BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) {

  5395             super(src, lastOcc, optoSft, next);

  5396             for (int x = 0; x < buffer.length; x++) {

  5397                 lengthInChars += Character.charCount(buffer[x]);

  5398             }

  5399         }

  5400         boolean match(Matcher matcher, int i, CharSequence seq) {

  5401             int[] src = buffer;

  5402             int patternLength = src.length;

  5403             int last = matcher.to - lengthInChars;

  5405             // Loop over all possible match positions in text

  5406 NEXT:       while (i <= last) {

  5407                 // Loop over pattern from right to left

  5408                 int ch;

  5409                 for (int j = countChars(seq, i, patternLength), x = patternLength - 1;

  5410                      j > 0; j -= Character.charCount(ch), x--) {

  5411                     ch = Character.codePointBefore(seq, i+j);

  5412                     if (ch != src[x]) {

  5413                         // Shift search to the right by the maximum of the

  5414                         // bad character shift and the good suffix shift

  5415                         int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]);

  5416                         i += countChars(seq, i, n);

  5417                         continue NEXT;

  5418                     }

  5419                 }

  5420                 // Entire pattern matched starting at i

  5421                 matcher.first = i;

  5422                 boolean ret = next.match(matcher, i + lengthInChars, seq);

  5423                 if (ret) {

  5424                     matcher.first = i;

  5425                     matcher.groups[0] = matcher.first;

  5426                     matcher.groups[1] = matcher.last;

  5427                     return true;

  5428                 }

  5429                 i += countChars(seq, i, 1);

  5430             }

  5431             matcher.hitEnd = true;

  5432             return false;

  5433         }

  5434     }

  5436 ///////////////////////////////////////////////////////////////////////////////

  5437 ///////////////////////////////////////////////////////////////////////////////

  5439     /**

  5440      *  This must be the very first initializer.

  5441      */

  5442     static Node accept = new Node();

  5444     static Node lastAccept = new LastNode();

  5446     private static class CharPropertyNames {

  5448         static CharProperty charPropertyFor(String name) {

  5449             CharPropertyFactory m = map.get(name);

  5450             return m == null ? null : m.make();

  5451         }

  5453         private static abstract class CharPropertyFactory {

  5454             abstract CharProperty make();

  5455         }

  5457         private static void defCategory(String name,

  5458                                         final int typeMask) {

  5459             map.put(name, new CharPropertyFactory() {

  5460                     CharProperty make() { return new Category(typeMask);}});

  5461         }

  5463         private static void defRange(String name,

  5464                                      final int lower, final int upper) {

  5465             map.put(name, new CharPropertyFactory() {

  5466                     CharProperty make() { return rangeFor(lower, upper);}});

  5467         }

  5469         private static void defCtype(String name,

  5470                                      final int ctype) {

  5471             map.put(name, new CharPropertyFactory() {

  5472                     CharProperty make() { return new Ctype(ctype);}});

  5473         }

  5475         private static abstract class CloneableProperty

  5476             extends CharProperty implements Cloneable

  5477         {

  5478             public CloneableProperty clone() {

  5479                 try {

  5480                     return (CloneableProperty) super.clone();

  5481                 } catch (CloneNotSupportedException e) {

  5482                     throw new AssertionError(e);

  5483                 }

  5484             }

  5485         }

  5487         private static void defClone(String name,

  5488                                      final CloneableProperty p) {

  5489             map.put(name, new CharPropertyFactory() {

  5490                     CharProperty make() { return p.clone();}});

  5491         }

  5493         private static final HashMap<String, CharPropertyFactory> map

  5494             = new HashMap<>();

  5496         static {

  5497             // Unicode character property aliases, defined in

  5498             // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt

  5499             defCategory("Cn", 1<<Character.UNASSIGNED);

  5500             defCategory("Lu", 1<<Character.UPPERCASE_LETTER);

  5501             defCategory("Ll", 1<<Character.LOWERCASE_LETTER);

  5502             defCategory("Lt", 1<<Character.TITLECASE_LETTER);

  5503             defCategory("Lm", 1<<Character.MODIFIER_LETTER);

  5504             defCategory("Lo", 1<<Character.OTHER_LETTER);

  5505             defCategory("Mn", 1<<Character.NON_SPACING_MARK);

  5506             defCategory("Me", 1<<Character.ENCLOSING_MARK);

  5507             defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);

  5508             defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);

  5509             defCategory("Nl", 1<<Character.LETTER_NUMBER);

  5510             defCategory("No", 1<<Character.OTHER_NUMBER);

  5511             defCategory("Zs", 1<<Character.SPACE_SEPARATOR);

  5512             defCategory("Zl", 1<<Character.LINE_SEPARATOR);

  5513             defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);

  5514             defCategory("Cc", 1<<Character.CONTROL);

  5515             defCategory("Cf", 1<<Character.FORMAT);

  5516             defCategory("Co", 1<<Character.PRIVATE_USE);

  5517             defCategory("Cs", 1<<Character.SURROGATE);

  5518             defCategory("Pd", 1<<Character.DASH_PUNCTUATION);

  5519             defCategory("Ps", 1<<Character.START_PUNCTUATION);

  5520             defCategory("Pe", 1<<Character.END_PUNCTUATION);

  5521             defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);

  5522             defCategory("Po", 1<<Character.OTHER_PUNCTUATION);

  5523             defCategory("Sm", 1<<Character.MATH_SYMBOL);

  5524             defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);

  5525             defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);

  5526             defCategory("So", 1<<Character.OTHER_SYMBOL);

  5527             defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);

  5528             defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);

  5529             defCategory("L", ((1<<Character.UPPERCASE_LETTER) |

  5530                               (1<<Character.LOWERCASE_LETTER) |

  5531                               (1<<Character.TITLECASE_LETTER) |

  5532                               (1<<Character.MODIFIER_LETTER)  |

  5533                               (1<<Character.OTHER_LETTER)));

  5534             defCategory("M", ((1<<Character.NON_SPACING_MARK) |

  5535                               (1<<Character.ENCLOSING_MARK)   |

  5536                               (1<<Character.COMBINING_SPACING_MARK)));

  5537             defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |

  5538                               (1<<Character.LETTER_NUMBER)        |

  5539                               (1<<Character.OTHER_NUMBER)));

  5540             defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |

  5541                               (1<<Character.LINE_SEPARATOR)  |

  5542                               (1<<Character.PARAGRAPH_SEPARATOR)));

  5543             defCategory("C", ((1<<Character.CONTROL)     |

  5544                               (1<<Character.FORMAT)      |

  5545                               (1<<Character.PRIVATE_USE) |

  5546                               (1<<Character.SURROGATE))); // Other

  5547             defCategory("P", ((1<<Character.DASH_PUNCTUATION)      |

  5548                               (1<<Character.START_PUNCTUATION)     |

  5549                               (1<<Character.END_PUNCTUATION)       |

  5550                               (1<<Character.CONNECTOR_PUNCTUATION) |

  5551                               (1<<Character.OTHER_PUNCTUATION)     |

  5552                               (1<<Character.INITIAL_QUOTE_PUNCTUATION) |

  5553                               (1<<Character.FINAL_QUOTE_PUNCTUATION)));

  5554             defCategory("S", ((1<<Character.MATH_SYMBOL)     |

  5555                               (1<<Character.CURRENCY_SYMBOL) |

  5556                               (1<<Character.MODIFIER_SYMBOL) |

  5557                               (1<<Character.OTHER_SYMBOL)));

  5558             defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |

  5559                                (1<<Character.LOWERCASE_LETTER) |

  5560                                (1<<Character.TITLECASE_LETTER)));

  5561             defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |

  5562                                (1<<Character.LOWERCASE_LETTER) |

  5563                                (1<<Character.TITLECASE_LETTER) |

  5564                                (1<<Character.MODIFIER_LETTER)  |

  5565                                (1<<Character.OTHER_LETTER)     |

  5566                                (1<<Character.DECIMAL_DIGIT_NUMBER)));

  5567             defRange("L1", 0x00, 0xFF); // Latin-1

  5568             map.put("all", new CharPropertyFactory() {

  5569                     CharProperty make() { return new All(); }});

  5571             // Posix regular expression character classes, defined in

  5572             // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html

  5573             defRange("ASCII", 0x00, 0x7F);   // ASCII

  5574             defCtype("Alnum", ASCII.ALNUM);  // Alphanumeric characters

  5575             defCtype("Alpha", ASCII.ALPHA);  // Alphabetic characters

  5576             defCtype("Blank", ASCII.BLANK);  // Space and tab characters

  5577             defCtype("Cntrl", ASCII.CNTRL);  // Control characters

  5578             defRange("Digit", '0', '9');     // Numeric characters

  5579             defCtype("Graph", ASCII.GRAPH);  // printable and visible

  5580             defRange("Lower", 'a', 'z');     // Lower-case alphabetic

  5581             defRange("Print", 0x20, 0x7E);   // Printable characters

  5582             defCtype("Punct", ASCII.PUNCT);  // Punctuation characters

  5583             defCtype("Space", ASCII.SPACE);  // Space characters

  5584             defRange("Upper", 'A', 'Z');     // Upper-case alphabetic

  5585             defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits

  5587             // Java character properties, defined by methods in Character.java

  5588             defClone("javaLowerCase", new CloneableProperty() {

  5589                 boolean isSatisfiedBy(int ch) {

  5590                     return Character.isLowerCase(ch);}});

  5591             defClone("javaUpperCase", new CloneableProperty() {

  5592                 boolean isSatisfiedBy(int ch) {

  5593                     return Character.isUpperCase(ch);}});

  5594             defClone("javaAlphabetic", new CloneableProperty() {

  5595                 boolean isSatisfiedBy(int ch) {

  5596                     return Character.isAlphabetic(ch);}});

  5597             defClone("javaIdeographic", new CloneableProperty() {

  5598                 boolean isSatisfiedBy(int ch) {

  5599                     return Character.isIdeographic(ch);}});

  5600             defClone("javaTitleCase", new CloneableProperty() {

  5601                 boolean isSatisfiedBy(int ch) {

  5602                     return Character.isTitleCase(ch);}});

  5603             defClone("javaDigit", new CloneableProperty() {

  5604                 boolean isSatisfiedBy(int ch) {

  5605                     return Character.isDigit(ch);}});

  5606             defClone("javaDefined", new CloneableProperty() {

  5607                 boolean isSatisfiedBy(int ch) {

  5608                     return Character.isDefined(ch);}});

  5609             defClone("javaLetter", new CloneableProperty() {

  5610                 boolean isSatisfiedBy(int ch) {

  5611                     return Character.isLetter(ch);}});

  5612             defClone("javaLetterOrDigit", new CloneableProperty() {

  5613                 boolean isSatisfiedBy(int ch) {

  5614                     return Character.isLetterOrDigit(ch);}});

  5615             defClone("javaJavaIdentifierStart", new CloneableProperty() {

  5616                 boolean isSatisfiedBy(int ch) {

  5617                     return Character.isJavaIdentifierStart(ch);}});

  5618             defClone("javaJavaIdentifierPart", new CloneableProperty() {

  5619                 boolean isSatisfiedBy(int ch) {

  5620                     return Character.isJavaIdentifierPart(ch);}});

  5621             defClone("javaUnicodeIdentifierStart", new CloneableProperty() {

  5622                 boolean isSatisfiedBy(int ch) {

  5623                     return Character.isUnicodeIdentifierStart(ch);}});

  5624             defClone("javaUnicodeIdentifierPart", new CloneableProperty() {

  5625                 boolean isSatisfiedBy(int ch) {

  5626                     return Character.isUnicodeIdentifierPart(ch);}});

  5627             defClone("javaIdentifierIgnorable", new CloneableProperty() {

  5628                 boolean isSatisfiedBy(int ch) {

  5629                     return Character.isIdentifierIgnorable(ch);}});

  5630             defClone("javaSpaceChar", new CloneableProperty() {

  5631                 boolean isSatisfiedBy(int ch) {

  5632                     return Character.isSpaceChar(ch);}});

  5633             defClone("javaWhitespace", new CloneableProperty() {

  5634                 boolean isSatisfiedBy(int ch) {

  5635                     return Character.isWhitespace(ch);}});

  5636             defClone("javaISOControl", new CloneableProperty() {

  5637                 boolean isSatisfiedBy(int ch) {

  5638                     return Character.isISOControl(ch);}});

  5639             defClone("javaMirrored", new CloneableProperty() {

  5640                 boolean isSatisfiedBy(int ch) {

  5641                     return Character.isMirrored(ch);}});

  5642         }

  5643     }

  5645     private static final class Normalizer {

  5646         public static final int NFD = 1;

  5647         public static final int NFC = 2;

  5649         static String normalize(String pattern, int NFD) {

  5650             return pattern;

  5651         }

  5653         private static int getCombiningClass(int c) {

  5654             return 1;

  5655         }

  5656     }

  5657 }

author	Jaroslav Tulach <jtulach@netbeans.org>
	Mon, 07 Oct 2013 16:17:21 +0200
changeset 1350	f14e9730d4e9
parent 1348	bca65655b36b
permissions	-rw-r--r--