# HG changeset patch
# User Jaroslav Tulach This interface contains query methods used to determine the
+ * results of a match against a regular expression. The match boundaries,
+ * groups and group boundaries can be seen but not modified through
+ * a Capturing groups are indexed from left
+ * to right, starting at one. Group zero denotes the entire pattern, so
+ * the expression m.start(0) is equivalent to
+ * m.start(). MatchResult
.
+ *
+ * @author Michael McCloskey
+ * @see Matcher
+ * @since 1.5
+ */
+public interface MatchResult {
+
+ /**
+ * Returns the start index of the match.
+ *
+ * @return The index of the first character matched
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ */
+ public int start();
+
+ /**
+ * Returns the start index of the subsequence captured by the given group
+ * during this match.
+ *
+ *
Capturing groups are indexed from left + * to right, starting at one. Group zero denotes the entire pattern, so + * the expression m.end(0) is equivalent to + * m.end().
+ * + * @param group + * The index of a capturing group in this matcher's pattern + * + * @return The offset after the last character captured by the group, + * or -1 if the match was successful + * but the group itself did not match anything + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + * + * @throws IndexOutOfBoundsException + * If there is no capturing group in the pattern + * with the given index + */ + public int end(int group); + + /** + * Returns the input subsequence matched by the previous match. + * + *For a matcher m with input sequence s, + * the expressions m.group() and + * s.substring(m.start(), m.end()) + * are equivalent.
+ * + *Note that some patterns, for example a*, match the empty + * string. This method will return the empty string when the pattern + * successfully matches the empty string in the input.
+ * + * @return The (possibly empty) subsequence matched by the previous match, + * in string form + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + */ + public String group(); + + /** + * Returns the input subsequence captured by the given group during the + * previous match operation. + * + *For a matcher m, input sequence s, and group index + * g, the expressions m.group(g) and + * s.substring(m.start(g), m.end(g)) + * are equivalent.
+ * + *Capturing groups are indexed from left + * to right, starting at one. Group zero denotes the entire pattern, so + * the expression m.group(0) is equivalent to m.group(). + *
+ * + *If the match was successful but the group specified failed to match + * any part of the input sequence, then null is returned. Note + * that some groups, for example (a*), match the empty string. + * This method will return the empty string when such a group successfully + * matches the empty string in the input.
+ * + * @param group + * The index of a capturing group in this matcher's pattern + * + * @return The (possibly empty) subsequence captured by the group + * during the previous match, or null if the group + * failed to match part of the input + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + * + * @throws IndexOutOfBoundsException + * If there is no capturing group in the pattern + * with the given index + */ + public String group(int group); + + /** + * Returns the number of capturing groups in this match result's pattern. + * + *Group zero denotes the entire pattern by convention. It is not + * included in this count. + * + *
Any non-negative integer smaller than or equal to the value + * returned by this method is guaranteed to be a valid group index for + * this matcher.
+ * + * @return The number of capturing groups in this matcher's pattern + */ + public int groupCount(); + +} diff -r 9cc0e1034f92 -r 7df5f83fff8d rt/emul/compact/src/main/java/java/util/regex/Matcher.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rt/emul/compact/src/main/java/java/util/regex/Matcher.java Mon Oct 07 16:14:56 2013 +0200 @@ -0,0 +1,1256 @@ +/* + * Copyright (c) 1999, 2009, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package java.util.regex; + + +/** + * An engine that performs match operations on a {@link java.lang.CharSequence + * character sequence} by interpreting a {@link Pattern}.
+ *
+ * A matcher is created from a pattern by invoking the pattern's {@link
+ * Pattern#matcher matcher} method. Once created, a matcher can be used to
+ * perform three different kinds of match operations:
+ *
+ *
+ *
+ * The {@link #matches matches} method attempts to match the entire
+ * input sequence against the pattern.
+ *
+ * The {@link #lookingAt lookingAt} method attempts to match the
+ * input sequence, starting at the beginning, against the pattern.
+ *
+ * The {@link #find find} method scans the input sequence looking for
+ * the next subsequence that matches the pattern.
+ *
+ *
+ *
+ * Each of these methods returns a boolean indicating success or failure.
+ * More information about a successful match can be obtained by querying the
+ * state of the matcher.
+ *
+ *
A matcher finds matches in a subset of its input called the
+ * region. By default, the region contains all of the matcher's input.
+ * The region can be modified via the{@link #region region} method and queried
+ * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd}
+ * methods. The way that the region boundaries interact with some pattern
+ * constructs can be changed. See {@link #useAnchoringBounds
+ * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
+ * for more details.
+ *
+ *
This class also defines methods for replacing matched subsequences with
+ * new strings whose contents can, if desired, be computed from the match
+ * result. The {@link #appendReplacement appendReplacement} and {@link
+ * #appendTail appendTail} methods can be used in tandem in order to collect
+ * the result into an existing string buffer, or the more convenient {@link
+ * #replaceAll replaceAll} method can be used to create a string in which every
+ * matching subsequence in the input sequence is replaced.
+ *
+ *
The explicit state of a matcher includes the start and end indices of
+ * the most recent successful match. It also includes the start and end
+ * indices of the input subsequence captured by each capturing group in the pattern as well as a total
+ * count of such subsequences. As a convenience, methods are also provided for
+ * returning these captured subsequences in string form.
+ *
+ *
The explicit state of a matcher is initially undefined; attempting to
+ * query any part of it before a successful match will cause an {@link
+ * IllegalStateException} to be thrown. The explicit state of a matcher is
+ * recomputed by every match operation.
+ *
+ *
The implicit state of a matcher includes the input character sequence as
+ * well as the append position, which is initially zero and is updated
+ * by the {@link #appendReplacement appendReplacement} method.
+ *
+ *
A matcher may be reset explicitly by invoking its {@link #reset()}
+ * method or, if a new input sequence is desired, its {@link
+ * #reset(java.lang.CharSequence) reset(CharSequence)} method. Resetting a
+ * matcher discards its explicit state information and sets the append position
+ * to zero.
+ *
+ *
Instances of this class are not safe for use by multiple concurrent
+ * threads.
+ *
+ *
+ * @author Mike McCloskey
+ * @author Mark Reinhold
+ * @author JSR-51 Expert Group
+ * @since 1.4
+ * @spec JSR-51
+ */
+
+public final class Matcher implements MatchResult {
+
+ /**
+ * The Pattern object that created this Matcher.
+ */
+ Pattern parentPattern;
+
+ /**
+ * The storage used by groups. They may contain invalid values if
+ * a group was skipped during the matching.
+ */
+ int[] groups;
+
+ /**
+ * The range within the sequence that is to be matched. Anchors
+ * will match at these "hard" boundaries. Changing the region
+ * changes these values.
+ */
+ int from, to;
+
+ /**
+ * Lookbehind uses this value to ensure that the subexpression
+ * match ends at the point where the lookbehind was encountered.
+ */
+ int lookbehindTo;
+
+ /**
+ * The original string being matched.
+ */
+ CharSequence text;
+
+ /**
+ * Matcher state used by the last node. NOANCHOR is used when a
+ * match does not have to consume all of the input. ENDANCHOR is
+ * the mode used for matching all the input.
+ */
+ static final int ENDANCHOR = 1;
+ static final int NOANCHOR = 0;
+ int acceptMode = NOANCHOR;
+
+ /**
+ * The range of string that last matched the pattern. If the last
+ * match failed then first is -1; last initially holds 0 then it
+ * holds the index of the end of the last match (which is where the
+ * next search starts).
+ */
+ int first = -1, last = 0;
+
+ /**
+ * The end index of what matched in the last match operation.
+ */
+ int oldLast = -1;
+
+ /**
+ * The index of the last position appended in a substitution.
+ */
+ int lastAppendPosition = 0;
+
+ /**
+ * Storage used by nodes to tell what repetition they are on in
+ * a pattern, and where groups begin. The nodes themselves are stateless,
+ * so they rely on this field to hold state during a match.
+ */
+ int[] locals;
+
+ /**
+ * Boolean indicating whether or not more input could change
+ * the results of the last match.
+ *
+ * If hitEnd is true, and a match was found, then more input
+ * might cause a different match to be found.
+ * If hitEnd is true and a match was not found, then more
+ * input could cause a match to be found.
+ * If hitEnd is false and a match was found, then more input
+ * will not change the match.
+ * If hitEnd is false and a match was not found, then more
+ * input will not cause a match to be found.
+ */
+ boolean hitEnd;
+
+ /**
+ * Boolean indicating whether or not more input could change
+ * a positive match into a negative one.
+ *
+ * If requireEnd is true, and a match was found, then more
+ * input could cause the match to be lost.
+ * If requireEnd is false and a match was found, then more
+ * input might change the match but the match won't be lost.
+ * If a match was not found, then requireEnd has no meaning.
+ */
+ boolean requireEnd;
+
+ /**
+ * If transparentBounds is true then the boundaries of this
+ * matcher's region are transparent to lookahead, lookbehind,
+ * and boundary matching constructs that try to see beyond them.
+ */
+ boolean transparentBounds = false;
+
+ /**
+ * If anchoringBounds is true then the boundaries of this
+ * matcher's region match anchors such as ^ and $.
+ */
+ boolean anchoringBounds = true;
+
+ /**
+ * No default constructor.
+ */
+ Matcher() {
+ }
+
+ /**
+ * All matchers have the state used by Pattern during a match.
+ */
+ Matcher(Pattern parent, CharSequence text) {
+ this.parentPattern = parent;
+ this.text = text;
+
+ // Allocate state storage
+ int parentGroupCount = Math.max(parent.capturingGroupCount, 10);
+ groups = new int[parentGroupCount * 2];
+ locals = new int[parent.localCount];
+
+ // Put fields into initial states
+ reset();
+ }
+
+ /**
+ * Returns the pattern that is interpreted by this matcher.
+ *
+ * @return The pattern for which this matcher was created
+ */
+ public Pattern pattern() {
+ return parentPattern;
+ }
+
+ /**
+ * Returns the match state of this matcher as a {@link MatchResult}.
+ * The result is unaffected by subsequent operations performed upon this
+ * matcher.
+ *
+ * @return a MatchResult
with the state of this matcher
+ * @since 1.5
+ */
+ public MatchResult toMatchResult() {
+ Matcher result = new Matcher(this.parentPattern, text.toString());
+ result.first = this.first;
+ result.last = this.last;
+ result.groups = this.groups.clone();
+ return result;
+ }
+
+ /**
+ * Changes the Pattern that this Matcher uses to
+ * find matches with.
+ *
+ * This method causes this matcher to lose information
+ * about the groups of the last match that occurred. The
+ * matcher's position in the input is maintained and its
+ * last append position is unaffected.
+ *
+ * @param newPattern
+ * The new pattern used by this matcher
+ * @return This matcher
+ * @throws IllegalArgumentException
+ * If newPattern is null
+ * @since 1.5
+ */
+ public Matcher usePattern(Pattern newPattern) {
+ if (newPattern == null)
+ throw new IllegalArgumentException("Pattern cannot be null");
+ parentPattern = newPattern;
+
+ // Reallocate state storage
+ int parentGroupCount = Math.max(newPattern.capturingGroupCount, 10);
+ groups = new int[parentGroupCount * 2];
+ locals = new int[newPattern.localCount];
+ for (int i = 0; i < groups.length; i++)
+ groups[i] = -1;
+ for (int i = 0; i < locals.length; i++)
+ locals[i] = -1;
+ return this;
+ }
+
+ /**
+ * Resets this matcher.
+ *
+ * Resetting a matcher discards all of its explicit state information
+ * and sets its append position to zero. The matcher's region is set to the
+ * default region, which is its entire character sequence. The anchoring
+ * and transparency of this matcher's region boundaries are unaffected.
+ *
+ * @return This matcher
+ */
+ public Matcher reset() {
+ first = -1;
+ last = 0;
+ oldLast = -1;
+ for(int i=0; i Resetting a matcher discards all of its explicit state information
+ * and sets its append position to zero. The matcher's region is set to
+ * the default region, which is its entire character sequence. The
+ * anchoring and transparency of this matcher's region boundaries are
+ * unaffected.
+ *
+ * @param input
+ * The new input character sequence
+ *
+ * @return This matcher
+ */
+ public Matcher reset(CharSequence input) {
+ text = input;
+ return reset();
+ }
+
+ /**
+ * Returns the start index of the previous match.
+ *
+ * @return The index of the first character matched
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ */
+ public int start() {
+ if (first < 0)
+ throw new IllegalStateException("No match available");
+ return first;
+ }
+
+ /**
+ * Returns the start index of the subsequence captured by the given group
+ * during the previous match operation.
+ *
+ * Capturing groups are indexed from left
+ * to right, starting at one. Group zero denotes the entire pattern, so
+ * the expression m.start(0) is equivalent to
+ * m.start().
+ *
+ * @param group
+ * The index of a capturing group in this matcher's pattern
+ *
+ * @return The index of the first character captured by the group,
+ * or -1 if the match was successful but the group
+ * itself did not match anything
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IndexOutOfBoundsException
+ * If there is no capturing group in the pattern
+ * with the given index
+ */
+ public int start(int group) {
+ if (first < 0)
+ throw new IllegalStateException("No match available");
+ if (group > groupCount())
+ throw new IndexOutOfBoundsException("No group " + group);
+ return groups[group * 2];
+ }
+
+ /**
+ * Returns the offset after the last character matched.
+ *
+ * @return The offset after the last character matched
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ */
+ public int end() {
+ if (first < 0)
+ throw new IllegalStateException("No match available");
+ return last;
+ }
+
+ /**
+ * Returns the offset after the last character of the subsequence
+ * captured by the given group during the previous match operation.
+ *
+ * Capturing groups are indexed from left
+ * to right, starting at one. Group zero denotes the entire pattern, so
+ * the expression m.end(0) is equivalent to
+ * m.end().
+ *
+ * @param group
+ * The index of a capturing group in this matcher's pattern
+ *
+ * @return The offset after the last character captured by the group,
+ * or -1 if the match was successful
+ * but the group itself did not match anything
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IndexOutOfBoundsException
+ * If there is no capturing group in the pattern
+ * with the given index
+ */
+ public int end(int group) {
+ if (first < 0)
+ throw new IllegalStateException("No match available");
+ if (group > groupCount())
+ throw new IndexOutOfBoundsException("No group " + group);
+ return groups[group * 2 + 1];
+ }
+
+ /**
+ * Returns the input subsequence matched by the previous match.
+ *
+ * For a matcher m with input sequence s,
+ * the expressions m.group() and
+ * s.substring(m.start(), m.end())
+ * are equivalent.
+ *
+ * Note that some patterns, for example a*, match the empty
+ * string. This method will return the empty string when the pattern
+ * successfully matches the empty string in the input.
+ *
+ * @return The (possibly empty) subsequence matched by the previous match,
+ * in string form
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ */
+ public String group() {
+ return group(0);
+ }
+
+ /**
+ * Returns the input subsequence captured by the given group during the
+ * previous match operation.
+ *
+ * For a matcher m, input sequence s, and group index
+ * g, the expressions m.group(g) and
+ * s.substring(m.start(g), m.end(g))
+ * are equivalent.
+ *
+ * Capturing groups are indexed from left
+ * to right, starting at one. Group zero denotes the entire pattern, so
+ * the expression m.group(0) is equivalent to m.group().
+ *
+ *
+ * If the match was successful but the group specified failed to match
+ * any part of the input sequence, then null is returned. Note
+ * that some groups, for example (a*), match the empty string.
+ * This method will return the empty string when such a group successfully
+ * matches the empty string in the input.
+ *
+ * @param group
+ * The index of a capturing group in this matcher's pattern
+ *
+ * @return The (possibly empty) subsequence captured by the group
+ * during the previous match, or null if the group
+ * failed to match part of the input
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IndexOutOfBoundsException
+ * If there is no capturing group in the pattern
+ * with the given index
+ */
+ public String group(int group) {
+ if (first < 0)
+ throw new IllegalStateException("No match found");
+ if (group < 0 || group > groupCount())
+ throw new IndexOutOfBoundsException("No group " + group);
+ if ((groups[group*2] == -1) || (groups[group*2+1] == -1))
+ return null;
+ return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString();
+ }
+
+ /**
+ * Returns the input subsequence captured by the given
+ * named-capturing group during the previous
+ * match operation.
+ *
+ * If the match was successful but the group specified failed to match
+ * any part of the input sequence, then null is returned. Note
+ * that some groups, for example (a*), match the empty string.
+ * This method will return the empty string when such a group successfully
+ * matches the empty string in the input.
+ *
+ * @param name
+ * The name of a named-capturing group in this matcher's pattern
+ *
+ * @return The (possibly empty) subsequence captured by the named group
+ * during the previous match, or null if the group
+ * failed to match part of the input
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IllegalArgumentException
+ * If there is no capturing group in the pattern
+ * with the given name
+ */
+ public String group(String name) {
+ if (name == null)
+ throw new NullPointerException("Null group name");
+ if (first < 0)
+ throw new IllegalStateException("No match found");
+ if (!parentPattern.namedGroups().containsKey(name))
+ throw new IllegalArgumentException("No group with name <" + name + ">");
+ int group = parentPattern.namedGroups().get(name);
+ if ((groups[group*2] == -1) || (groups[group*2+1] == -1))
+ return null;
+ return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString();
+ }
+
+ /**
+ * Returns the number of capturing groups in this matcher's pattern.
+ *
+ * Group zero denotes the entire pattern by convention. It is not
+ * included in this count.
+ *
+ *
Any non-negative integer smaller than or equal to the value
+ * returned by this method is guaranteed to be a valid group index for
+ * this matcher.
+ *
+ * @return The number of capturing groups in this matcher's pattern
+ */
+ public int groupCount() {
+ return parentPattern.capturingGroupCount - 1;
+ }
+
+ /**
+ * Attempts to match the entire region against the pattern.
+ *
+ * If the match succeeds then more information can be obtained via the
+ * start, end, and group methods.
+ *
+ * @return true if, and only if, the entire region sequence
+ * matches this matcher's pattern
+ */
+ public boolean matches() {
+ return match(from, ENDANCHOR);
+ }
+
+ /**
+ * Attempts to find the next subsequence of the input sequence that matches
+ * the pattern.
+ *
+ * This method starts at the beginning of this matcher's region, or, if
+ * a previous invocation of the method was successful and the matcher has
+ * not since been reset, at the first character not matched by the previous
+ * match.
+ *
+ *
If the match succeeds then more information can be obtained via the
+ * start, end, and group methods.
+ *
+ * @return true if, and only if, a subsequence of the input
+ * sequence matches this matcher's pattern
+ */
+ public boolean find() {
+ int nextSearchIndex = last;
+ if (nextSearchIndex == first)
+ nextSearchIndex++;
+
+ // If next search starts before region, start it at region
+ if (nextSearchIndex < from)
+ nextSearchIndex = from;
+
+ // If next search starts beyond region then it fails
+ if (nextSearchIndex > to) {
+ for (int i = 0; i < groups.length; i++)
+ groups[i] = -1;
+ return false;
+ }
+ return search(nextSearchIndex);
+ }
+
+ /**
+ * Resets this matcher and then attempts to find the next subsequence of
+ * the input sequence that matches the pattern, starting at the specified
+ * index.
+ *
+ * If the match succeeds then more information can be obtained via the
+ * start, end, and group methods, and subsequent
+ * invocations of the {@link #find()} method will start at the first
+ * character not matched by this match.
+ *
+ * @throws IndexOutOfBoundsException
+ * If start is less than zero or if start is greater than the
+ * length of the input sequence.
+ *
+ * @return true if, and only if, a subsequence of the input
+ * sequence starting at the given index matches this matcher's
+ * pattern
+ */
+ public boolean find(int start) {
+ int limit = getTextLength();
+ if ((start < 0) || (start > limit))
+ throw new IndexOutOfBoundsException("Illegal start index");
+ reset();
+ return search(start);
+ }
+
+ /**
+ * Attempts to match the input sequence, starting at the beginning of the
+ * region, against the pattern.
+ *
+ * Like the {@link #matches matches} method, this method always starts
+ * at the beginning of the region; unlike that method, it does not
+ * require that the entire region be matched.
+ *
+ *
If the match succeeds then more information can be obtained via the
+ * start, end, and group methods.
+ *
+ * @return true if, and only if, a prefix of the input
+ * sequence matches this matcher's pattern
+ */
+ public boolean lookingAt() {
+ return match(from, NOANCHOR);
+ }
+
+ /**
+ * Returns a literal replacement String
for the specified
+ * String
.
+ *
+ * This method produces a String
that will work
+ * as a literal replacement s
in the
+ * appendReplacement
method of the {@link Matcher} class.
+ * The String
produced will match the sequence of characters
+ * in s
treated as a literal sequence. Slashes ('\') and
+ * dollar signs ('$') will be given no special meaning.
+ *
+ * @param s The string to be literalized
+ * @return A literal string replacement
+ * @since 1.5
+ */
+ public static String quoteReplacement(String s) {
+ if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1))
+ return s;
+ StringBuilder sb = new StringBuilder();
+ for (int i=0; i This method performs the following actions:
+ *
+ *
+ *
+ * It reads characters from the input sequence, starting at the
+ * append position, and appends them to the given string buffer. It
+ * stops after reading the last character preceding the previous match,
+ * that is, the character at index {@link
+ * #start()} - 1.
+ *
+ * It appends the given replacement string to the string buffer.
+ *
+ *
+ * It sets the append position of this matcher to the index of
+ * the last character matched, plus one, that is, to {@link #end()}.
+ *
+ *
+ *
+ *
+ * The replacement string may contain references to subsequences
+ * captured during the previous match: Each occurrence of
+ * ${name} or $g
+ * will be replaced by the result of evaluating the corresponding
+ * {@link #group(String) group(name)} or {@link #group(int) group(g)}
+ * respectively. For $g,
+ * the first number after the $ is always treated as part of
+ * the group reference. Subsequent numbers are incorporated into g if
+ * they would form a legal group reference. Only the numerals '0'
+ * through '9' are considered as potential components of the group
+ * reference. If the second group matched the string "foo", for
+ * example, then passing the replacement string "$2bar" would
+ * cause "foobar" to be appended to the string buffer. A dollar
+ * sign ($) may be included as a literal in the replacement
+ * string by preceding it with a backslash (\$).
+ *
+ *
Note that backslashes (\) and dollar signs ($) in
+ * the replacement string may cause the results to be different than if it
+ * were being treated as a literal replacement string. Dollar signs may be
+ * treated as references to captured subsequences as described above, and
+ * backslashes are used to escape literal characters in the replacement
+ * string.
+ *
+ *
This method is intended to be used in a loop together with the
+ * {@link #appendTail appendTail} and {@link #find find} methods. The
+ * following code, for example, writes one dog two dogs in the
+ * yard to the standard-output stream:
+ *
+ *
+ * Pattern p = Pattern.compile("cat");
+ * Matcher m = p.matcher("one cat two cats in the yard");
+ * StringBuffer sb = new StringBuffer();
+ * while (m.find()) {
+ * m.appendReplacement(sb, "dog");
+ * }
+ * m.appendTail(sb);
+ * System.out.println(sb.toString());
+ *
+ * @param sb
+ * The target string buffer
+ *
+ * @param replacement
+ * The replacement string
+ *
+ * @return This matcher
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IllegalArgumentException
+ * If the replacement string refers to a named-capturing
+ * group that does not exist in the pattern
+ *
+ * @throws IndexOutOfBoundsException
+ * If the replacement string refers to a capturing group
+ * that does not exist in the pattern
+ */
+ public Matcher appendReplacement(StringBuffer sb, String replacement) {
+
+ // If no match, return error
+ if (first < 0)
+ throw new IllegalStateException("No match available");
+
+ // Process substitution string to replace group references with groups
+ int cursor = 0;
+ StringBuilder result = new StringBuilder();
+
+ while (cursor < replacement.length()) {
+ char nextChar = replacement.charAt(cursor);
+ if (nextChar == '\\') {
+ cursor++;
+ nextChar = replacement.charAt(cursor);
+ result.append(nextChar);
+ cursor++;
+ } else if (nextChar == '$') {
+ // Skip past $
+ cursor++;
+ // A StringIndexOutOfBoundsException is thrown if
+ // this "$" is the last character in replacement
+ // string in current implementation, a IAE might be
+ // more appropriate.
+ nextChar = replacement.charAt(cursor);
+ int refNum = -1;
+ if (nextChar == '{') {
+ cursor++;
+ StringBuilder gsb = new StringBuilder();
+ while (cursor < replacement.length()) {
+ nextChar = replacement.charAt(cursor);
+ if (ASCII.isLower(nextChar) ||
+ ASCII.isUpper(nextChar) ||
+ ASCII.isDigit(nextChar)) {
+ gsb.append(nextChar);
+ cursor++;
+ } else {
+ break;
+ }
+ }
+ if (gsb.length() == 0)
+ throw new IllegalArgumentException(
+ "named capturing group has 0 length name");
+ if (nextChar != '}')
+ throw new IllegalArgumentException(
+ "named capturing group is missing trailing '}'");
+ String gname = gsb.toString();
+ if (ASCII.isDigit(gname.charAt(0)))
+ throw new IllegalArgumentException(
+ "capturing group name {" + gname +
+ "} starts with digit character");
+ if (!parentPattern.namedGroups().containsKey(gname))
+ throw new IllegalArgumentException(
+ "No group with name {" + gname + "}");
+ refNum = parentPattern.namedGroups().get(gname);
+ cursor++;
+ } else {
+ // The first number is always a group
+ refNum = (int)nextChar - '0';
+ if ((refNum < 0)||(refNum > 9))
+ throw new IllegalArgumentException(
+ "Illegal group reference");
+ cursor++;
+ // Capture the largest legal group string
+ boolean done = false;
+ while (!done) {
+ if (cursor >= replacement.length()) {
+ break;
+ }
+ int nextDigit = replacement.charAt(cursor) - '0';
+ if ((nextDigit < 0)||(nextDigit > 9)) { // not a number
+ break;
+ }
+ int newRefNum = (refNum * 10) + nextDigit;
+ if (groupCount() < newRefNum) {
+ done = true;
+ } else {
+ refNum = newRefNum;
+ cursor++;
+ }
+ }
+ }
+ // Append group
+ if (start(refNum) != -1 && end(refNum) != -1)
+ result.append(text, start(refNum), end(refNum));
+ } else {
+ result.append(nextChar);
+ cursor++;
+ }
+ }
+ // Append the intervening text
+ sb.append(text, lastAppendPosition, first);
+ // Append the match substitution
+ sb.append(result);
+
+ lastAppendPosition = last;
+ return this;
+ }
+
+ /**
+ * Implements a terminal append-and-replace step.
+ *
+ * This method reads characters from the input sequence, starting at
+ * the append position, and appends them to the given string buffer. It is
+ * intended to be invoked after one or more invocations of the {@link
+ * #appendReplacement appendReplacement} method in order to copy the
+ * remainder of the input sequence.
+ *
+ * @param sb
+ * The target string buffer
+ *
+ * @return The target string buffer
+ */
+ public StringBuffer appendTail(StringBuffer sb) {
+ sb.append(text, lastAppendPosition, getTextLength());
+ return sb;
+ }
+
+ /**
+ * Replaces every subsequence of the input sequence that matches the
+ * pattern with the given replacement string.
+ *
+ * This method first resets this matcher. It then scans the input
+ * sequence looking for matches of the pattern. Characters that are not
+ * part of any match are appended directly to the result string; each match
+ * is replaced in the result by the replacement string. The replacement
+ * string may contain references to captured subsequences as in the {@link
+ * #appendReplacement appendReplacement} method.
+ *
+ *
Note that backslashes (\) and dollar signs ($) in
+ * the replacement string may cause the results to be different than if it
+ * were being treated as a literal replacement string. Dollar signs may be
+ * treated as references to captured subsequences as described above, and
+ * backslashes are used to escape literal characters in the replacement
+ * string.
+ *
+ *
Given the regular expression a*b, the input
+ * "aabfooaabfooabfoob", and the replacement string
+ * "-", an invocation of this method on a matcher for that
+ * expression would yield the string "-foo-foo-foo-".
+ *
+ *
Invoking this method changes this matcher's state. If the matcher
+ * is to be used in further matching operations then it should first be
+ * reset.
+ *
+ * @param replacement
+ * The replacement string
+ *
+ * @return The string constructed by replacing each matching subsequence
+ * by the replacement string, substituting captured subsequences
+ * as needed
+ */
+ public String replaceAll(String replacement) {
+ reset();
+ boolean result = find();
+ if (result) {
+ StringBuffer sb = new StringBuffer();
+ do {
+ appendReplacement(sb, replacement);
+ result = find();
+ } while (result);
+ appendTail(sb);
+ return sb.toString();
+ }
+ return text.toString();
+ }
+
+ /**
+ * Replaces the first subsequence of the input sequence that matches the
+ * pattern with the given replacement string.
+ *
+ * This method first resets this matcher. It then scans the input
+ * sequence looking for a match of the pattern. Characters that are not
+ * part of the match are appended directly to the result string; the match
+ * is replaced in the result by the replacement string. The replacement
+ * string may contain references to captured subsequences as in the {@link
+ * #appendReplacement appendReplacement} method.
+ *
+ *
Note that backslashes (\) and dollar signs ($) in
+ * the replacement string may cause the results to be different than if it
+ * were being treated as a literal replacement string. Dollar signs may be
+ * treated as references to captured subsequences as described above, and
+ * backslashes are used to escape literal characters in the replacement
+ * string.
+ *
+ *
Given the regular expression dog, the input
+ * "zzzdogzzzdogzzz", and the replacement string
+ * "cat", an invocation of this method on a matcher for that
+ * expression would yield the string "zzzcatzzzdogzzz".
+ *
+ * Invoking this method changes this matcher's state. If the matcher
+ * is to be used in further matching operations then it should first be
+ * reset.
+ *
+ * @param replacement
+ * The replacement string
+ * @return The string constructed by replacing the first matching
+ * subsequence by the replacement string, substituting captured
+ * subsequences as needed
+ */
+ public String replaceFirst(String replacement) {
+ if (replacement == null)
+ throw new NullPointerException("replacement");
+ reset();
+ if (!find())
+ return text.toString();
+ StringBuffer sb = new StringBuffer();
+ appendReplacement(sb, replacement);
+ appendTail(sb);
+ return sb.toString();
+ }
+
+ /**
+ * Sets the limits of this matcher's region. The region is the part of the
+ * input sequence that will be searched to find a match. Invoking this
+ * method resets the matcher, and then sets the region to start at the
+ * index specified by the start
parameter and end at the
+ * index specified by the end
parameter.
+ *
+ * Depending on the transparency and anchoring being used (see
+ * {@link #useTransparentBounds useTransparentBounds} and
+ * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such
+ * as anchors may behave differently at or around the boundaries of the
+ * region.
+ *
+ * @param start
+ * The index to start searching at (inclusive)
+ * @param end
+ * The index to end searching at (exclusive)
+ * @throws IndexOutOfBoundsException
+ * If start or end is less than zero, if
+ * start is greater than the length of the input sequence, if
+ * end is greater than the length of the input sequence, or if
+ * start is greater than end.
+ * @return this matcher
+ * @since 1.5
+ */
+ public Matcher region(int start, int end) {
+ if ((start < 0) || (start > getTextLength()))
+ throw new IndexOutOfBoundsException("start");
+ if ((end < 0) || (end > getTextLength()))
+ throw new IndexOutOfBoundsException("end");
+ if (start > end)
+ throw new IndexOutOfBoundsException("start > end");
+ reset();
+ from = start;
+ to = end;
+ return this;
+ }
+
+ /**
+ * Reports the start index of this matcher's region. The
+ * searches this matcher conducts are limited to finding matches
+ * within {@link #regionStart regionStart} (inclusive) and
+ * {@link #regionEnd regionEnd} (exclusive).
+ *
+ * @return The starting point of this matcher's region
+ * @since 1.5
+ */
+ public int regionStart() {
+ return from;
+ }
+
+ /**
+ * Reports the end index (exclusive) of this matcher's region.
+ * The searches this matcher conducts are limited to finding matches
+ * within {@link #regionStart regionStart} (inclusive) and
+ * {@link #regionEnd regionEnd} (exclusive).
+ *
+ * @return the ending point of this matcher's region
+ * @since 1.5
+ */
+ public int regionEnd() {
+ return to;
+ }
+
+ /**
+ * Queries the transparency of region bounds for this matcher.
+ *
+ *
This method returns true if this matcher uses
+ * transparent bounds, false if it uses opaque
+ * bounds.
+ *
+ *
See {@link #useTransparentBounds useTransparentBounds} for a
+ * description of transparent and opaque bounds.
+ *
+ *
By default, a matcher uses opaque region boundaries.
+ *
+ * @return true iff this matcher is using transparent bounds,
+ * false otherwise.
+ * @see java.util.regex.Matcher#useTransparentBounds(boolean)
+ * @since 1.5
+ */
+ public boolean hasTransparentBounds() {
+ return transparentBounds;
+ }
+
+ /**
+ * Sets the transparency of region bounds for this matcher.
+ *
+ *
Invoking this method with an argument of true will set this
+ * matcher to use transparent bounds. If the boolean
+ * argument is false, then opaque bounds will be used.
+ *
+ *
Using transparent bounds, the boundaries of this
+ * matcher's region are transparent to lookahead, lookbehind,
+ * and boundary matching constructs. Those constructs can see beyond the
+ * boundaries of the region to see if a match is appropriate.
+ *
+ *
Using opaque bounds, the boundaries of this matcher's
+ * region are opaque to lookahead, lookbehind, and boundary matching
+ * constructs that may try to see beyond them. Those constructs cannot
+ * look past the boundaries so they will fail to match anything outside
+ * of the region.
+ *
+ *
By default, a matcher uses opaque bounds.
+ *
+ * @param b a boolean indicating whether to use opaque or transparent
+ * regions
+ * @return this matcher
+ * @see java.util.regex.Matcher#hasTransparentBounds
+ * @since 1.5
+ */
+ public Matcher useTransparentBounds(boolean b) {
+ transparentBounds = b;
+ return this;
+ }
+
+ /**
+ * Queries the anchoring of region bounds for this matcher.
+ *
+ *
This method returns true if this matcher uses
+ * anchoring bounds, false otherwise.
+ *
+ *
See {@link #useAnchoringBounds useAnchoringBounds} for a
+ * description of anchoring bounds.
+ *
+ *
By default, a matcher uses anchoring region boundaries.
+ *
+ * @return true iff this matcher is using anchoring bounds,
+ * false otherwise.
+ * @see java.util.regex.Matcher#useAnchoringBounds(boolean)
+ * @since 1.5
+ */
+ public boolean hasAnchoringBounds() {
+ return anchoringBounds;
+ }
+
+ /**
+ * Sets the anchoring of region bounds for this matcher.
+ *
+ *
Invoking this method with an argument of true will set this
+ * matcher to use anchoring bounds. If the boolean
+ * argument is false, then non-anchoring bounds will be
+ * used.
+ *
+ *
Using anchoring bounds, the boundaries of this
+ * matcher's region match anchors such as ^ and $.
+ *
+ *
Without anchoring bounds, the boundaries of this
+ * matcher's region will not match anchors such as ^ and $.
+ *
+ *
By default, a matcher uses anchoring region boundaries.
+ *
+ * @param b a boolean indicating whether or not to use anchoring bounds.
+ * @return this matcher
+ * @see java.util.regex.Matcher#hasAnchoringBounds
+ * @since 1.5
+ */
+ public Matcher useAnchoringBounds(boolean b) {
+ anchoringBounds = b;
+ return this;
+ }
+
+ /**
+ *
Returns the string representation of this matcher. The
+ * string representation of a Matcher
contains information
+ * that may be useful for debugging. The exact format is unspecified.
+ *
+ * @return The string representation of this matcher
+ * @since 1.5
+ */
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("java.util.regex.Matcher");
+ sb.append("[pattern=" + pattern());
+ sb.append(" region=");
+ sb.append(regionStart() + "," + regionEnd());
+ sb.append(" lastmatch=");
+ if ((first >= 0) && (group() != null)) {
+ sb.append(group());
+ }
+ sb.append("]");
+ return sb.toString();
+ }
+
+ /**
+ *
Returns true if the end of input was hit by the search engine in
+ * the last match operation performed by this matcher.
+ *
+ *
When this method returns true, then it is possible that more input
+ * would have changed the result of the last search.
+ *
+ * @return true iff the end of input was hit in the last match; false
+ * otherwise
+ * @since 1.5
+ */
+ public boolean hitEnd() {
+ return hitEnd;
+ }
+
+ /**
+ *
Returns true if more input could change a positive match into a
+ * negative one.
+ *
+ *
If this method returns true, and a match was found, then more
+ * input could cause the match to be lost. If this method returns false
+ * and a match was found, then more input might change the match but the
+ * match won't be lost. If a match was not found, then requireEnd has no
+ * meaning.
+ *
+ * @return true iff more input could change a positive match into a
+ * negative one.
+ * @since 1.5
+ */
+ public boolean requireEnd() {
+ return requireEnd;
+ }
+
+ /**
+ * Initiates a search to find a Pattern within the given bounds.
+ * The groups are filled with default values and the match of the root
+ * of the state machine is called. The state machine will hold the state
+ * of the match as it proceeds in this matcher.
+ *
+ * Matcher.from is not set here, because it is the "hard" boundary
+ * of the start of the search which anchors will set to. The from param
+ * is the "soft" boundary of the start of the search, meaning that the
+ * regex tries to match at that index but ^ won't match there. Subsequent
+ * calls to the search methods start at a new "soft" boundary which is
+ * the end of the previous match.
+ */
+ boolean search(int from) {
+ this.hitEnd = false;
+ this.requireEnd = false;
+ from = from < 0 ? 0 : from;
+ this.first = from;
+ this.oldLast = oldLast < 0 ? from : oldLast;
+ for (int i = 0; i < groups.length; i++)
+ groups[i] = -1;
+ acceptMode = NOANCHOR;
+ boolean result = parentPattern.root.match(this, from, text);
+ if (!result)
+ this.first = -1;
+ this.oldLast = this.last;
+ return result;
+ }
+
+ /**
+ * Initiates a search for an anchored match to a Pattern within the given
+ * bounds. The groups are filled with default values and the match of the
+ * root of the state machine is called. The state machine will hold the
+ * state of the match as it proceeds in this matcher.
+ */
+ boolean match(int from, int anchor) {
+ this.hitEnd = false;
+ this.requireEnd = false;
+ from = from < 0 ? 0 : from;
+ this.first = from;
+ this.oldLast = oldLast < 0 ? from : oldLast;
+ for (int i = 0; i < groups.length; i++)
+ groups[i] = -1;
+ acceptMode = anchor;
+ boolean result = parentPattern.matchRoot.match(this, from, text);
+ if (!result)
+ this.first = -1;
+ this.oldLast = this.last;
+ return result;
+ }
+
+ /**
+ * Returns the end index of the text.
+ *
+ * @return the index after the last character in the text
+ */
+ int getTextLength() {
+ return text.length();
+ }
+
+ /**
+ * Generates a String from this Matcher's input in the specified range.
+ *
+ * @param beginIndex the beginning index, inclusive
+ * @param endIndex the ending index, exclusive
+ * @return A String generated from this Matcher's input
+ */
+ CharSequence getSubSequence(int beginIndex, int endIndex) {
+ return text.subSequence(beginIndex, endIndex);
+ }
+
+ /**
+ * Returns this Matcher's input character at index i.
+ *
+ * @return A char from the specified index
+ */
+ char charAt(int i) {
+ return text.charAt(i);
+ }
+
+}
diff -r 9cc0e1034f92 -r 7df5f83fff8d rt/emul/compact/src/main/java/java/util/regex/Pattern.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rt/emul/compact/src/main/java/java/util/regex/Pattern.java Mon Oct 07 16:14:56 2013 +0200
@@ -0,0 +1,5648 @@
+/*
+ * Copyright (c) 1999, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package java.util.regex;
+
+import java.security.AccessController;
+import java.security.PrivilegedAction;
+import java.text.CharacterIterator;
+import java.text.Normalizer;
+import java.util.Locale;
+import java.util.Map;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Arrays;
+
+
+/**
+ * A compiled representation of a regular expression.
+ *
+ *
A regular expression, specified as a string, must first be compiled into
+ * an instance of this class. The resulting pattern can then be used to create
+ * a {@link Matcher} object that can match arbitrary {@link
+ * java.lang.CharSequence
character sequences} against the regular
+ * expression. All of the state involved in performing a match resides in the
+ * matcher, so many matchers can share the same pattern.
+ *
+ * A typical invocation sequence is thus
+ *
+ *
+ * Pattern p = Pattern.{@link #compile compile}("a*b");
+ * Matcher m = p.{@link #matcher matcher}("aaaaab");
+ * boolean b = m.{@link Matcher#matches matches}();
+ *
+ * A {@link #matches matches} method is defined by this class as a
+ * convenience for when a regular expression is used just once. This method
+ * compiles an expression and matches an input sequence against it in a single
+ * invocation. The statement
+ *
+ *
+ * boolean b = Pattern.matches("a*b", "aaaaab");
+ *
+ * is equivalent to the three statements above, though for repeated matches it
+ * is less efficient since it does not allow the compiled pattern to be reused.
+ *
+ * Instances of this class are immutable and are safe for use by multiple
+ * concurrent threads. Instances of the {@link Matcher} class are not safe for
+ * such use.
+ *
+ *
+ *
+ * Summary of regular-expression constructs
+ *
+ *
+ *
+ *
+ * Construct
+ * Matches
+ *
+ *
+ *
+ * Characters
+ *
+ * x
+ * The character x
+ * \\
+ * The backslash character
+ * \0n
+ * The character with octal value 0n
+ * (0 <= n <= 7)
+ * \0nn
+ * The character with octal value 0nn
+ * (0 <= n <= 7)
+ * \0mnn
+ * The character with octal value 0mnn
+ * (0 <= m <= 3,
+ * 0 <= n <= 7)
+ * \xhh
+ * The character with hexadecimal value 0xhh
+ * \uhhhh
+ * The character with hexadecimal value 0xhhhh
+ * \x{h...h}
+ * The character with hexadecimal value 0xh...h
+ * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
+ * <= 0xh...h <= 
+ * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})
+ * \t
+ * The tab character ('\u0009')
+ * \n
+ * The newline (line feed) character ('\u000A')
+ * \r
+ * The carriage-return character ('\u000D')
+ * \f
+ * The form-feed character ('\u000C')
+ * \a
+ * The alert (bell) character ('\u0007')
+ * \e
+ * The escape character ('\u001B')
+ * \cx
+ * The control character corresponding to x
+ *
+ *
+ * Character classes
+ *
+ * [abc]
+ * a, b, or c (simple class)
+ * [^abc]
+ * Any character except a, b, or c (negation)
+ * [a-zA-Z]
+ * a through z
+ * or A through Z, inclusive (range)
+ * [a-d[m-p]]
+ * a through d,
+ * or m through p: [a-dm-p] (union)
+ * [a-z&&[def]]
+ * d, e, or f (intersection)
+ * [a-z&&[^bc]]
+ * a through z,
+ * except for b and c: [ad-z] (subtraction)
+ * [a-z&&[^m-p]]
+ * a through z,
+ * and not m through p: [a-lq-z](subtraction)
+ *
+ *
+ * Predefined character classes
+ *
+ * .
+ * Any character (may or may not match line terminators)
+ * \d
+ * A digit: [0-9]
+ * \D
+ * A non-digit: [^0-9]
+ * \s
+ * A whitespace character: [ \t\n\x0B\f\r]
+ * \S
+ * A non-whitespace character: [^\s]
+ * \w
+ * A word character: [a-zA-Z_0-9]
+ * \W
+ * A non-word character: [^\w]
+ *
+ *
+ * POSIX character classes (US-ASCII only)
+ *
+ * \p{Lower}
+ * A lower-case alphabetic character: [a-z]
+ * \p{Upper}
+ * An upper-case alphabetic character:[A-Z]
+ * \p{ASCII}
+ * All ASCII:[\x00-\x7F]
+ * \p{Alpha}
+ * An alphabetic character:[\p{Lower}\p{Upper}]
+ * \p{Digit}
+ * A decimal digit: [0-9]
+ * \p{Alnum}
+ * An alphanumeric character:[\p{Alpha}\p{Digit}]
+ * \p{Punct}
+ * Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
+ *
+ * \p{Graph}
+ * A visible character: [\p{Alnum}\p{Punct}]
+ * \p{Print}
+ * A printable character: [\p{Graph}\x20]
+ * \p{Blank}
+ * A space or a tab: [ \t]
+ * \p{Cntrl}
+ * A control character: [\x00-\x1F\x7F]
+ * \p{XDigit}
+ * A hexadecimal digit: [0-9a-fA-F]
+ * \p{Space}
+ * A whitespace character: [ \t\n\x0B\f\r]
+ *
+ *
+ * java.lang.Character classes (simple java character type)
+ *
+ * \p{javaLowerCase}
+ * Equivalent to java.lang.Character.isLowerCase()
+ * \p{javaUpperCase}
+ * Equivalent to java.lang.Character.isUpperCase()
+ * \p{javaWhitespace}
+ * Equivalent to java.lang.Character.isWhitespace()
+ * \p{javaMirrored}
+ * Equivalent to java.lang.Character.isMirrored()
+ *
+ *
+ * Classes for Unicode scripts, blocks, categories and binary properties
+ * * \p{IsLatin}
+ * A Latin script character (script)
+ * \p{InGreek}
+ * A character in the Greek block (block)
+ * \p{Lu}
+ * An uppercase letter (category)
+ * \p{IsAlphabetic}
+ * An alphabetic character (binary property)
+ * \p{Sc}
+ * A currency symbol
+ * \P{InGreek}
+ * Any character except one in the Greek block (negation)
+ * [\p{L}&&[^\p{Lu}]]
+ * Any letter except an uppercase letter (subtraction)
+ *
+ *
+ * Boundary matchers
+ *
+ * ^
+ * The beginning of a line
+ * $
+ * The end of a line
+ * \b
+ * A word boundary
+ * \B
+ * A non-word boundary
+ * \A
+ * The beginning of the input
+ * \G
+ * The end of the previous match
+ * \Z
+ * The end of the input but for the final
+ * terminator, if any
+ * \z
+ * The end of the input
+ *
+ *
+ * Greedy quantifiers
+ *
+ * X?
+ * X, once or not at all
+ * X*
+ * X, zero or more times
+ * X+
+ * X, one or more times
+ * X{n}
+ * X, exactly n times
+ * X{n,}
+ * X, at least n times
+ * X{n,m}
+ * X, at least n but not more than m times
+ *
+ *
+ * Reluctant quantifiers
+ *
+ * X??
+ * X, once or not at all
+ * X*?
+ * X, zero or more times
+ * X+?
+ * X, one or more times
+ * X{n}?
+ * X, exactly n times
+ * X{n,}?
+ * X, at least n times
+ * X{n,m}?
+ * X, at least n but not more than m times
+ *
+ *
+ * Possessive quantifiers
+ *
+ * X?+
+ * X, once or not at all
+ * X*+
+ * X, zero or more times
+ * X++
+ * X, one or more times
+ * X{n}+
+ * X, exactly n times
+ * X{n,}+
+ * X, at least n times
+ * X{n,m}+
+ * X, at least n but not more than m times
+ *
+ *
+ * Logical operators
+ *
+ * XY
+ * X followed by Y
+ * X|Y
+ * Either X or Y
+ * (X)
+ * X, as a capturing group
+ *
+ *
+ * Back references
+ *
+ * \n
+ * Whatever the nth
+ * capturing group matched
+ *
+ * \k<name>
+ * Whatever the
+ * named-capturing group "name" matched
+ *
+ *
+ * Quotation
+ *
+ * \
+ * Nothing, but quotes the following character
+ * \Q
+ * Nothing, but quotes all characters until \E
+ * \E
+ * Nothing, but ends quoting started by \Q
+ *
+ *
+ *
+ * Special constructs (named-capturing and non-capturing)
+ *
+ * (?<name>X)
+ * X, as a named-capturing group
+ * (?:X)
+ * X, as a non-capturing group
+ * (?idmsuxU-idmsuxU)
+ * Nothing, but turns match flags i
+ * d m s
+ * u x U
+ * on - off
+ * (?idmsux-idmsux:X)
+ * X, as a non-capturing group with the
+ * given flags i d
+ * m s u
+ * x on - off
+ * (?=X)
+ * X, via zero-width positive lookahead
+ * (?!X)
+ * X, via zero-width negative lookahead
+ * (?<=X)
+ * X, via zero-width positive lookbehind
+ * (?<!X)
+ * X, via zero-width negative lookbehind
+ * (?>X)
+ * X, as an independent, non-capturing group
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ * Backslashes, escapes, and quoting
+ *
+ * The backslash character ('\') serves to introduce escaped
+ * constructs, as defined in the table above, as well as to quote characters
+ * that otherwise would be interpreted as unescaped constructs. Thus the
+ * expression \\ matches a single backslash and \{ matches a
+ * left brace.
+ *
+ *
It is an error to use a backslash prior to any alphabetic character that
+ * does not denote an escaped construct; these are reserved for future
+ * extensions to the regular-expression language. A backslash may be used
+ * prior to a non-alphabetic character regardless of whether that character is
+ * part of an unescaped construct.
+ *
+ *
Backslashes within string literals in Java source code are interpreted
+ * as required by
+ * The Java™ Language Specification
+ * as either Unicode escapes (section 3.3) or other character escapes (section 3.10.6)
+ * It is therefore necessary to double backslashes in string
+ * literals that represent regular expressions to protect them from
+ * interpretation by the Java bytecode compiler. The string literal
+ * "\b", for example, matches a single backspace character when
+ * interpreted as a regular expression, while "\\b" matches a
+ * word boundary. The string literal "\(hello\)" is illegal
+ * and leads to a compile-time error; in order to match the string
+ * (hello) the string literal "\\(hello\\)"
+ * must be used.
+ *
+ *
+ * Character Classes
+ *
+ * Character classes may appear within other character classes, and
+ * may be composed by the union operator (implicit) and the intersection
+ * operator (&&).
+ * The union operator denotes a class that contains every character that is
+ * in at least one of its operand classes. The intersection operator
+ * denotes a class that contains every character that is in both of its
+ * operand classes.
+ *
+ *
The precedence of character-class operators is as follows, from
+ * highest to lowest:
+ *
+ *
+ * 1
+ * Literal escape
+ * \x
+ * 2
+ * Grouping
+ * [...]
+ * 3
+ * Range
+ * a-z
+ * 4
+ * Union
+ * [a-e][i-u]
+ * 5
+ * Intersection
+ * [a-z&&[aeiou]]
+ *
+ *
+ * Note that a different set of metacharacters are in effect inside
+ * a character class than outside a character class. For instance, the
+ * regular expression . loses its special meaning inside a
+ * character class, while the expression - becomes a range
+ * forming metacharacter.
+ *
+ *
+ * Line terminators
+ *
+ * A line terminator is a one- or two-character sequence that marks
+ * the end of a line of the input character sequence. The following are
+ * recognized as line terminators:
+ *
+ *
+ *
+ * - A newline (line feed) character ('\n'),
+ *
+ *
- A carriage-return character followed immediately by a newline
+ * character ("\r\n"),
+ *
+ *
- A standalone carriage-return character ('\r'),
+ *
+ *
- A next-line character ('\u0085'),
+ *
+ *
- A line-separator character ('\u2028'), or
+ *
+ *
- A paragraph-separator character ('\u2029).
+ *
+ *
+ * If {@link #UNIX_LINES} mode is activated, then the only line terminators
+ * recognized are newline characters.
+ *
+ *
The regular expression . matches any character except a line
+ * terminator unless the {@link #DOTALL} flag is specified.
+ *
+ *
By default, the regular expressions ^ and $ ignore
+ * line terminators and only match at the beginning and the end, respectively,
+ * of the entire input sequence. If {@link #MULTILINE} mode is activated then
+ * ^ matches at the beginning of input and after any line terminator
+ * except at the end of input. When in {@link #MULTILINE} mode $
+ * matches just before a line terminator or the end of the input sequence.
+ *
+ *
+ * Groups and capturing
+ *
+ *
+ * Group number
+ * Capturing groups are numbered by counting their opening parentheses from
+ * left to right. In the expression ((A)(B(C))), for example, there
+ * are four such groups:
+ *
+ *
+ * 1
+ * ((A)(B(C)))
+ * 2
+ * (A)
+ * 3
+ * (B(C))
+ * 4
+ * (C)
+ *
+ *
+ * Group zero always stands for the entire expression.
+ *
+ *
Capturing groups are so named because, during a match, each subsequence
+ * of the input sequence that matches such a group is saved. The captured
+ * subsequence may be used later in the expression, via a back reference, and
+ * may also be retrieved from the matcher once the match operation is complete.
+ *
+ *
+ * Group name
+ * A capturing group can also be assigned a "name", a named-capturing group,
+ * and then be back-referenced later by the "name". Group names are composed of
+ * the following characters. The first character must be a letter.
+ *
+ *
+ * - The uppercase letters 'A' through 'Z'
+ * ('\u0041' through '\u005a'),
+ *
- The lowercase letters 'a' through 'z'
+ * ('\u0061' through '\u007a'),
+ *
- The digits '0' through '9'
+ * ('\u0030' through '\u0039'),
+ *
+ *
+ * A named-capturing group is still numbered as described in
+ * Group number.
+ *
+ *
The captured input associated with a group is always the subsequence
+ * that the group most recently matched. If a group is evaluated a second time
+ * because of quantification then its previously-captured value, if any, will
+ * be retained if the second evaluation fails. Matching the string
+ * "aba" against the expression (a(b)?)+, for example, leaves
+ * group two set to "b". All captured input is discarded at the
+ * beginning of each match.
+ *
+ *
Groups beginning with (? are either pure, non-capturing groups
+ * that do not capture text and do not count towards the group total, or
+ * named-capturing group.
+ *
+ *
Unicode support
+ *
+ * This class is in conformance with Level 1 of Unicode Technical
+ * Standard #18: Unicode Regular Expression, plus RL2.1
+ * Canonical Equivalents.
+ *
+ * Unicode escape sequences such as \u2014 in Java source code
+ * are processed as described in section 3.3 of
+ * The Java™ Language Specification.
+ * Such escape sequences are also implemented directly by the regular-expression
+ * parser so that Unicode escapes can be used in expressions that are read from
+ * files or from the keyboard. Thus the strings "\u2014" and
+ * "\\u2014", while not equal, compile into the same pattern, which
+ * matches the character with hexadecimal value 0x2014.
+ *
+ * A Unicode character can also be represented in a regular-expression by
+ * using its Hex notation(hexadecimal code point value) directly as described in construct
+ * \x{...}, for example a supplementary character U+2011F
+ * can be specified as \x{2011F}, instead of two consecutive
+ * Unicode escape sequences of the surrogate pair
+ * \uD840\uDD1F.
+ *
+ * Unicode scripts, blocks, categories and binary properties are written with
+ * the \p and \P constructs as in Perl.
+ * \p{prop} matches if
+ * the input has the property prop, while \P{prop}
+ * does not match if the input has that property.
+ *
+ * Scripts, blocks, categories and binary properties can be used both inside
+ * and outside of a character class.
+ *
+ *
+ * Scripts are specified either with the prefix {@code Is}, as in
+ * {@code IsHiragana}, or by using the {@code script} keyword (or its short
+ * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}.
+ *
+ * The script names supported by Pattern
are the valid script names
+ * accepted and defined by
+ * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
+ *
+ *
+ * Blocks are specified with the prefix {@code In}, as in
+ * {@code InMongolian}, or by using the keyword {@code block} (or its short
+ * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
+ *
+ * The block names supported by Pattern
are the valid block names
+ * accepted and defined by
+ * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
+ *
+ *
+ * Categories may be specified with the optional prefix {@code Is}:
+ * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
+ * letters. Same as scripts and blocks, categories can also be specified
+ * by using the keyword {@code general_category} (or its short form
+ * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
+ *
+ * The supported categories are those of
+ *
+ * The Unicode Standard in the version specified by the
+ * {@link java.lang.Character Character} class. The category names are those
+ * defined in the Standard, both normative and informative.
+ *
+ *
+ * Binary properties are specified with the prefix {@code Is}, as in
+ * {@code IsAlphabetic}. The supported binary properties by Pattern
+ * are
+ *
+ * - Alphabetic
+ *
- Ideographic
+ *
- Letter
+ *
- Lowercase
+ *
- Uppercase
+ *
- Titlecase
+ *
- Punctuation
+ *
- Control
+ *
- White_Space
+ *
- Digit
+ *
- Hex_Digit
+ *
- Noncharacter_Code_Point
+ *
- Assigned
+ *
+
+
+ *
+ * Predefined Character classes and POSIX character classes are in
+ * conformance with the recommendation of Annex C: Compatibility Properties
+ * of Unicode Regular Expression
+ * , when {@link #UNICODE_CHARACTER_CLASS} flag is specified.
+ *
+ *
+ *
+ * Classes
+ * Matches
+ *
+ * \p{Lower}
+ * A lowercase character:\p{IsLowercase}
+ * \p{Upper}
+ * An uppercase character:\p{IsUppercase}
+ * \p{ASCII}
+ * All ASCII:[\x00-\x7F]
+ * \p{Alpha}
+ * An alphabetic character:\p{IsAlphabetic}
+ * \p{Digit}
+ * A decimal digit character:p{IsDigit}
+ * \p{Alnum}
+ * An alphanumeric character:[\p{IsAlphabetic}\p{IsDigit}]
+ * \p{Punct}
+ * A punctuation character:p{IsPunctuation}
+ * \p{Graph}
+ * A visible character: [^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]
+ * \p{Print}
+ * A printable character: [\p{Graph}\p{Blank}&&[^\p{Cntrl}]]
+ * \p{Blank}
+ * A space or a tab: [\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]
+ * \p{Cntrl}
+ * A control character: \p{gc=Cc}
+ * \p{XDigit}
+ * A hexadecimal digit: [\p{gc=Nd}\p{IsHex_Digit}]
+ * \p{Space}
+ * A whitespace character:\p{IsWhite_Space}
+ * \d
+ * A digit: \p{IsDigit}
+ * \D
+ * A non-digit: [^\d]
+ * \s
+ * A whitespace character: \p{IsWhite_Space}
+ * \S
+ * A non-whitespace character: [^\s]
+ * \w
+ * A word character: [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]
+ * \W
+ * A non-word character: [^\w]
+ *
+ *
+ *
+ * Categories that behave like the java.lang.Character
+ * boolean ismethodname methods (except for the deprecated ones) are
+ * available through the same \p{prop} syntax where
+ * the specified property has the name javamethodname.
+ *
+ * Comparison to Perl 5
+ *
+ * The Pattern
engine performs traditional NFA-based matching
+ * with ordered alternation as occurs in Perl 5.
+ *
+ *
Perl constructs not supported by this class:
+ *
+ *
+ * Predefined character classes (Unicode character)
+ *
\h A horizontal whitespace
+ *
\H A non horizontal whitespace
+ *
\v A vertical whitespace
+ *
\V A non vertical whitespace
+ *
\R Any Unicode linebreak sequence
+ * \u005cu000D\u005cu000A|[\u005cu000A\u005cu000B\u005cu000C\u005cu000D\u005cu0085\u005cu2028\u005cu2029]
+ *
\X Match Unicode
+ *
+ * extended grapheme cluster
+ *
+ *
+ * The backreference constructs, \g{n} for
+ * the nthcapturing group and
+ * \g{name} for
+ * named-capturing group.
+ *
+ *
+ * The named character construct, \N{name}
+ * for a Unicode character by its name.
+ *
+ *
+ * The conditional constructs
+ * (?(condition)X) and
+ * (?(condition)X|Y),
+ *
+ *
+ * The embedded code constructs (?{code})
+ * and (??{code}),
+ *
+ * The embedded comment syntax (?#comment), and
+ *
+ * The preprocessing operations \l \u,
+ * \L, and \U.
+ *
+ *
+ *
+ * Constructs supported by this class but not by Perl:
+ *
+ *
+ *
+ * Character-class union and intersection as described
+ * above.
+ *
+ *
+ *
+ * Notable differences from Perl:
+ *
+ *
+ *
+ * In Perl, \1 through \9 are always interpreted
+ * as back references; a backslash-escaped number greater than 9 is
+ * treated as a back reference if at least that many subexpressions exist,
+ * otherwise it is interpreted, if possible, as an octal escape. In this
+ * class octal escapes must always begin with a zero. In this class,
+ * \1 through \9 are always interpreted as back
+ * references, and a larger number is accepted as a back reference if at
+ * least that many subexpressions exist at that point in the regular
+ * expression, otherwise the parser will drop digits until the number is
+ * smaller or equal to the existing number of groups or it is one digit.
+ *
+ *
+ * Perl uses the g flag to request a match that resumes
+ * where the last match left off. This functionality is provided implicitly
+ * by the {@link Matcher} class: Repeated invocations of the {@link
+ * Matcher#find find} method will resume where the last match left off,
+ * unless the matcher is reset.
+ *
+ * In Perl, embedded flags at the top level of an expression affect
+ * the whole expression. In this class, embedded flags always take effect
+ * at the point at which they appear, whether they are at the top level or
+ * within a group; in the latter case, flags are restored at the end of the
+ * group just as in Perl.
+ *
+ *
+ *
+ *
+ * For a more precise description of the behavior of regular expression
+ * constructs, please see
+ * Mastering Regular Expressions, 3nd Edition, Jeffrey E. F. Friedl,
+ * O'Reilly and Associates, 2006.
+ *
+ *
+ * @see java.lang.String#split(String, int)
+ * @see java.lang.String#split(String)
+ *
+ * @author Mike McCloskey
+ * @author Mark Reinhold
+ * @author JSR-51 Expert Group
+ * @since 1.4
+ * @spec JSR-51
+ */
+
+public final class Pattern
+ implements java.io.Serializable
+{
+
+ /**
+ * Regular expression modifier values. Instead of being passed as
+ * arguments, they can also be passed as inline modifiers.
+ * For example, the following statements have the same effect.
+ *
+ * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M);
+ * RegExp r2 = RegExp.compile("(?im)abc", 0);
+ *
+ *
+ * The flags are duplicated so that the familiar Perl match flag
+ * names are available.
+ */
+
+ /**
+ * Enables Unix lines mode.
+ *
+ * In this mode, only the '\n' line terminator is recognized
+ * in the behavior of ., ^, and $.
+ *
+ *
Unix lines mode can also be enabled via the embedded flag
+ * expression (?d).
+ */
+ public static final int UNIX_LINES = 0x01;
+
+ /**
+ * Enables case-insensitive matching.
+ *
+ *
By default, case-insensitive matching assumes that only characters
+ * in the US-ASCII charset are being matched. Unicode-aware
+ * case-insensitive matching can be enabled by specifying the {@link
+ * #UNICODE_CASE} flag in conjunction with this flag.
+ *
+ *
Case-insensitive matching can also be enabled via the embedded flag
+ * expression (?i).
+ *
+ *
Specifying this flag may impose a slight performance penalty.
+ */
+ public static final int CASE_INSENSITIVE = 0x02;
+
+ /**
+ * Permits whitespace and comments in pattern.
+ *
+ * In this mode, whitespace is ignored, and embedded comments starting
+ * with # are ignored until the end of a line.
+ *
+ *
Comments mode can also be enabled via the embedded flag
+ * expression (?x).
+ */
+ public static final int COMMENTS = 0x04;
+
+ /**
+ * Enables multiline mode.
+ *
+ *
In multiline mode the expressions ^ and $ match
+ * just after or just before, respectively, a line terminator or the end of
+ * the input sequence. By default these expressions only match at the
+ * beginning and the end of the entire input sequence.
+ *
+ *
Multiline mode can also be enabled via the embedded flag
+ * expression (?m).
+ */
+ public static final int MULTILINE = 0x08;
+
+ /**
+ * Enables literal parsing of the pattern.
+ *
+ * When this flag is specified then the input string that specifies
+ * the pattern is treated as a sequence of literal characters.
+ * Metacharacters or escape sequences in the input sequence will be
+ * given no special meaning.
+ *
+ *
The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on
+ * matching when used in conjunction with this flag. The other flags
+ * become superfluous.
+ *
+ *
There is no embedded flag character for enabling literal parsing.
+ * @since 1.5
+ */
+ public static final int LITERAL = 0x10;
+
+ /**
+ * Enables dotall mode.
+ *
+ *
In dotall mode, the expression . matches any character,
+ * including a line terminator. By default this expression does not match
+ * line terminators.
+ *
+ *
Dotall mode can also be enabled via the embedded flag
+ * expression (?s). (The s is a mnemonic for
+ * "single-line" mode, which is what this is called in Perl.)
+ */
+ public static final int DOTALL = 0x20;
+
+ /**
+ * Enables Unicode-aware case folding.
+ *
+ * When this flag is specified then case-insensitive matching, when
+ * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner
+ * consistent with the Unicode Standard. By default, case-insensitive
+ * matching assumes that only characters in the US-ASCII charset are being
+ * matched.
+ *
+ *
Unicode-aware case folding can also be enabled via the embedded flag
+ * expression (?u).
+ *
+ *
Specifying this flag may impose a performance penalty.
+ */
+ public static final int UNICODE_CASE = 0x40;
+
+ /**
+ * Enables canonical equivalence.
+ *
+ * When this flag is specified then two characters will be considered
+ * to match if, and only if, their full canonical decompositions match.
+ * The expression "a\u030A", for example, will match the
+ * string "\u00E5" when this flag is specified. By default,
+ * matching does not take canonical equivalence into account.
+ *
+ *
There is no embedded flag character for enabling canonical
+ * equivalence.
+ *
+ *
Specifying this flag may impose a performance penalty.
+ */
+ public static final int CANON_EQ = 0x80;
+
+ /**
+ * Enables the Unicode version of Predefined character classes and
+ * POSIX character classes.
+ *
+ * When this flag is specified then the (US-ASCII only)
+ * Predefined character classes and POSIX character classes
+ * are in conformance with
+ * Unicode Technical
+ * Standard #18: Unicode Regular Expression
+ * Annex C: Compatibility Properties.
+ *
+ * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded
+ * flag expression (?U).
+ *
+ * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case
+ * folding.
+ *
+ * Specifying this flag may impose a performance penalty.
+ * @since 1.7
+ */
+ public static final int UNICODE_CHARACTER_CLASS = 0x100;
+
+ /* Pattern has only two serialized components: The pattern string
+ * and the flags, which are all that is needed to recompile the pattern
+ * when it is deserialized.
+ */
+
+ /** use serialVersionUID from Merlin b59 for interoperability */
+ private static final long serialVersionUID = 5073258162644648461L;
+
+ /**
+ * The original regular-expression pattern string.
+ *
+ * @serial
+ */
+ private String pattern;
+
+ /**
+ * The original pattern flags.
+ *
+ * @serial
+ */
+ private int flags;
+
+ /**
+ * Boolean indicating this Pattern is compiled; this is necessary in order
+ * to lazily compile deserialized Patterns.
+ */
+ private transient volatile boolean compiled = false;
+
+ /**
+ * The normalized pattern string.
+ */
+ private transient String normalizedPattern;
+
+ /**
+ * The starting point of state machine for the find operation. This allows
+ * a match to start anywhere in the input.
+ */
+ transient Node root;
+
+ /**
+ * The root of object tree for a match operation. The pattern is matched
+ * at the beginning. This may include a find that uses BnM or a First
+ * node.
+ */
+ transient Node matchRoot;
+
+ /**
+ * Temporary storage used by parsing pattern slice.
+ */
+ transient int[] buffer;
+
+ /**
+ * Map the "name" of the "named capturing group" to its group id
+ * node.
+ */
+ transient volatile Map namedGroups;
+
+ /**
+ * Temporary storage used while parsing group references.
+ */
+ transient GroupHead[] groupNodes;
+
+ /**
+ * Temporary null terminated code point array used by pattern compiling.
+ */
+ private transient int[] temp;
+
+ /**
+ * The number of capturing groups in this Pattern. Used by matchers to
+ * allocate storage needed to perform a match.
+ */
+ transient int capturingGroupCount;
+
+ /**
+ * The local variable count used by parsing tree. Used by matchers to
+ * allocate storage needed to perform a match.
+ */
+ transient int localCount;
+
+ /**
+ * Index into the pattern string that keeps track of how much has been
+ * parsed.
+ */
+ private transient int cursor;
+
+ /**
+ * Holds the length of the pattern string.
+ */
+ private transient int patternLength;
+
+ /**
+ * If the Start node might possibly match supplementary characters.
+ * It is set to true during compiling if
+ * (1) There is supplementary char in pattern, or
+ * (2) There is complement node of Category or Block
+ */
+ private transient boolean hasSupplementary;
+
+ /**
+ * Compiles the given regular expression into a pattern.
+ *
+ * @param regex
+ * The expression to be compiled
+ *
+ * @throws PatternSyntaxException
+ * If the expression's syntax is invalid
+ */
+ public static Pattern compile(String regex) {
+ return new Pattern(regex, 0);
+ }
+
+ /**
+ * Compiles the given regular expression into a pattern with the given
+ * flags.
+ *
+ * @param regex
+ * The expression to be compiled
+ *
+ * @param flags
+ * Match flags, a bit mask that may include
+ * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
+ * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
+ * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}
+ * and {@link #COMMENTS}
+ *
+ * @throws IllegalArgumentException
+ * If bit values other than those corresponding to the defined
+ * match flags are set in flags
+ *
+ * @throws PatternSyntaxException
+ * If the expression's syntax is invalid
+ */
+ public static Pattern compile(String regex, int flags) {
+ return new Pattern(regex, flags);
+ }
+
+ /**
+ * Returns the regular expression from which this pattern was compiled.
+ *
+ *
+ * @return The source of this pattern
+ */
+ public String pattern() {
+ return pattern;
+ }
+
+ /**
+ * Returns the string representation of this pattern. This
+ * is the regular expression from which this pattern was
+ * compiled.
+ *
+ * @return The string representation of this pattern
+ * @since 1.5
+ */
+ public String toString() {
+ return pattern;
+ }
+
+ /**
+ * Creates a matcher that will match the given input against this pattern.
+ *
+ *
+ * @param input
+ * The character sequence to be matched
+ *
+ * @return A new matcher for this pattern
+ */
+ public Matcher matcher(CharSequence input) {
+ if (!compiled) {
+ synchronized(this) {
+ if (!compiled)
+ compile();
+ }
+ }
+ Matcher m = new Matcher(this, input);
+ return m;
+ }
+
+ /**
+ * Returns this pattern's match flags.
+ *
+ * @return The match flags specified when this pattern was compiled
+ */
+ public int flags() {
+ return flags;
+ }
+
+ /**
+ * Compiles the given regular expression and attempts to match the given
+ * input against it.
+ *
+ * An invocation of this convenience method of the form
+ *
+ *
+ * Pattern.matches(regex, input);
+ *
+ * behaves in exactly the same way as the expression
+ *
+ *
+ * Pattern.compile(regex).matcher(input).matches()
+ *
+ * If a pattern is to be used multiple times, compiling it once and reusing
+ * it will be more efficient than invoking this method each time.
+ *
+ * @param regex
+ * The expression to be compiled
+ *
+ * @param input
+ * The character sequence to be matched
+ *
+ * @throws PatternSyntaxException
+ * If the expression's syntax is invalid
+ */
+ public static boolean matches(String regex, CharSequence input) {
+ Pattern p = Pattern.compile(regex);
+ Matcher m = p.matcher(input);
+ return m.matches();
+ }
+
+ /**
+ * Splits the given input sequence around matches of this pattern.
+ *
+ * The array returned by this method contains each substring of the
+ * input sequence that is terminated by another subsequence that matches
+ * this pattern or is terminated by the end of the input sequence. The
+ * substrings in the array are in the order in which they occur in the
+ * input. If this pattern does not match any subsequence of the input then
+ * the resulting array has just one element, namely the input sequence in
+ * string form.
+ *
+ *
The limit parameter controls the number of times the
+ * pattern is applied and therefore affects the length of the resulting
+ * array. If the limit n is greater than zero then the pattern
+ * will be applied at most n - 1 times, the array's
+ * length will be no greater than n, and the array's last entry
+ * will contain all input beyond the last matched delimiter. If n
+ * is non-positive then the pattern will be applied as many times as
+ * possible and the array can have any length. If n is zero then
+ * the pattern will be applied as many times as possible, the array can
+ * have any length, and trailing empty strings will be discarded.
+ *
+ *
The input "boo:and:foo", for example, yields the following
+ * results with these parameters:
+ *
+ *
+ * Regex
+ * Limit
+ * Result
+ * :
+ * 2
+ * { "boo", "and:foo" }
+ * :
+ * 5
+ * { "boo", "and", "foo" }
+ * :
+ * -2
+ * { "boo", "and", "foo" }
+ * o
+ * 5
+ * { "b", "", ":and:f", "", "" }
+ * o
+ * -2
+ * { "b", "", ":and:f", "", "" }
+ * o
+ * 0
+ * { "b", "", ":and:f" }
+ *
+ *
+ *
+ * @param input
+ * The character sequence to be split
+ *
+ * @param limit
+ * The result threshold, as described above
+ *
+ * @return The array of strings computed by splitting the input
+ * around matches of this pattern
+ */
+ public String[] split(CharSequence input, int limit) {
+ int index = 0;
+ boolean matchLimited = limit > 0;
+ ArrayList matchList = new ArrayList<>();
+ Matcher m = matcher(input);
+
+ // Add segments before each match found
+ while(m.find()) {
+ if (!matchLimited || matchList.size() < limit - 1) {
+ String match = input.subSequence(index, m.start()).toString();
+ matchList.add(match);
+ index = m.end();
+ } else if (matchList.size() == limit - 1) { // last one
+ String match = input.subSequence(index,
+ input.length()).toString();
+ matchList.add(match);
+ index = m.end();
+ }
+ }
+
+ // If no match was found, return this
+ if (index == 0)
+ return new String[] {input.toString()};
+
+ // Add remaining segment
+ if (!matchLimited || matchList.size() < limit)
+ matchList.add(input.subSequence(index, input.length()).toString());
+
+ // Construct result
+ int resultSize = matchList.size();
+ if (limit == 0)
+ while (resultSize > 0 && matchList.get(resultSize-1).equals(""))
+ resultSize--;
+ String[] result = new String[resultSize];
+ return matchList.subList(0, resultSize).toArray(result);
+ }
+
+ /**
+ * Splits the given input sequence around matches of this pattern.
+ *
+ * This method works as if by invoking the two-argument {@link
+ * #split(java.lang.CharSequence, int) split} method with the given input
+ * sequence and a limit argument of zero. Trailing empty strings are
+ * therefore not included in the resulting array.
+ *
+ * The input "boo:and:foo", for example, yields the following
+ * results with these expressions:
+ *
+ *
+ * Regex
+ * Result
+ * :
+ * { "boo", "and", "foo" }
+ * o
+ * { "b", "", ":and:f" }
+ *
+ *
+ *
+ * @param input
+ * The character sequence to be split
+ *
+ * @return The array of strings computed by splitting the input
+ * around matches of this pattern
+ */
+ public String[] split(CharSequence input) {
+ return split(input, 0);
+ }
+
+ /**
+ * Returns a literal pattern String
for the specified
+ * String
.
+ *
+ * This method produces a String
that can be used to
+ * create a Pattern
that would match the string
+ * s
as if it were a literal pattern.
Metacharacters
+ * or escape sequences in the input sequence will be given no special
+ * meaning.
+ *
+ * @param s The string to be literalized
+ * @return A literal string replacement
+ * @since 1.5
+ */
+ public static String quote(String s) {
+ int slashEIndex = s.indexOf("\\E");
+ if (slashEIndex == -1)
+ return "\\Q" + s + "\\E";
+
+ StringBuilder sb = new StringBuilder(s.length() * 2);
+ sb.append("\\Q");
+ slashEIndex = 0;
+ int current = 0;
+ while ((slashEIndex = s.indexOf("\\E", current)) != -1) {
+ sb.append(s.substring(current, slashEIndex));
+ current = slashEIndex + 2;
+ sb.append("\\E\\\\E\\Q");
+ }
+ sb.append(s.substring(current, s.length()));
+ sb.append("\\E");
+ return sb.toString();
+ }
+
+ /**
+ * Recompile the Pattern instance from a stream. The original pattern
+ * string is read in and the object tree is recompiled from it.
+ */
+ private void readObject(java.io.ObjectInputStream s)
+ throws java.io.IOException, ClassNotFoundException {
+
+ // Read in all fields
+ s.defaultReadObject();
+
+ // Initialize counts
+ capturingGroupCount = 1;
+ localCount = 0;
+
+ // if length > 0, the Pattern is lazily compiled
+ compiled = false;
+ if (pattern.length() == 0) {
+ root = new Start(lastAccept);
+ matchRoot = lastAccept;
+ compiled = true;
+ }
+ }
+
+ /**
+ * This private constructor is used to create all Patterns. The pattern
+ * string and match flags are all that is needed to completely describe
+ * a Pattern. An empty pattern string results in an object tree with
+ * only a Start node and a LastNode node.
+ */
+ private Pattern(String p, int f) {
+ pattern = p;
+ flags = f;
+
+ // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
+ if ((flags & UNICODE_CHARACTER_CLASS) != 0)
+ flags |= UNICODE_CASE;
+
+ // Reset group index count
+ capturingGroupCount = 1;
+ localCount = 0;
+
+ if (pattern.length() > 0) {
+ compile();
+ } else {
+ root = new Start(lastAccept);
+ matchRoot = lastAccept;
+ }
+ }
+
+ /**
+ * The pattern is converted to normalizedD form and then a pure group
+ * is constructed to match canonical equivalences of the characters.
+ */
+ private void normalize() {
+ boolean inCharClass = false;
+ int lastCodePoint = -1;
+
+ // Convert pattern into normalizedD form
+ normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
+ patternLength = normalizedPattern.length();
+
+ // Modify pattern to match canonical equivalences
+ StringBuilder newPattern = new StringBuilder(patternLength);
+ for(int i=0; i= patternLength)
+ break;
+ c = normalizedPattern.codePointAt(i);
+ sequenceBuffer.appendCodePoint(c);
+ }
+ String ea = produceEquivalentAlternation(
+ sequenceBuffer.toString());
+ newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
+ newPattern.append("(?:").append(ea).append(")");
+ } else if (c == '[' && lastCodePoint != '\\') {
+ i = normalizeCharClass(newPattern, i);
+ } else {
+ newPattern.appendCodePoint(c);
+ }
+ lastCodePoint = c;
+ i += Character.charCount(c);
+ }
+ normalizedPattern = newPattern.toString();
+ }
+
+ /**
+ * Complete the character class being parsed and add a set
+ * of alternations to it that will match the canonical equivalences
+ * of the characters within the class.
+ */
+ private int normalizeCharClass(StringBuilder newPattern, int i) {
+ StringBuilder charClass = new StringBuilder();
+ StringBuilder eq = null;
+ int lastCodePoint = -1;
+ String result;
+
+ i++;
+ charClass.append("[");
+ while(true) {
+ int c = normalizedPattern.codePointAt(i);
+ StringBuilder sequenceBuffer;
+
+ if (c == ']' && lastCodePoint != '\\') {
+ charClass.append((char)c);
+ break;
+ } else if (Character.getType(c) == Character.NON_SPACING_MARK) {
+ sequenceBuffer = new StringBuilder();
+ sequenceBuffer.appendCodePoint(lastCodePoint);
+ while(Character.getType(c) == Character.NON_SPACING_MARK) {
+ sequenceBuffer.appendCodePoint(c);
+ i += Character.charCount(c);
+ if (i >= normalizedPattern.length())
+ break;
+ c = normalizedPattern.codePointAt(i);
+ }
+ String ea = produceEquivalentAlternation(
+ sequenceBuffer.toString());
+
+ charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
+ if (eq == null)
+ eq = new StringBuilder();
+ eq.append('|');
+ eq.append(ea);
+ } else {
+ charClass.appendCodePoint(c);
+ i++;
+ }
+ if (i == normalizedPattern.length())
+ throw error("Unclosed character class");
+ lastCodePoint = c;
+ }
+
+ if (eq != null) {
+ result = "(?:"+charClass.toString()+eq.toString()+")";
+ } else {
+ result = charClass.toString();
+ }
+
+ newPattern.append(result);
+ return i;
+ }
+
+ /**
+ * Given a specific sequence composed of a regular character and
+ * combining marks that follow it, produce the alternation that will
+ * match all canonical equivalences of that sequence.
+ */
+ private String produceEquivalentAlternation(String source) {
+ int len = countChars(source, 0, 1);
+ if (source.length() == len)
+ // source has one character.
+ return source;
+
+ String base = source.substring(0,len);
+ String combiningMarks = source.substring(len);
+
+ String[] perms = producePermutations(combiningMarks);
+ StringBuilder result = new StringBuilder(source);
+
+ // Add combined permutations
+ for(int x=0; x0)
+ result.append("|"+next);
+ next = composeOneStep(next);
+ if (next != null)
+ result.append("|"+produceEquivalentAlternation(next));
+ }
+ return result.toString();
+ }
+
+ /**
+ * Returns an array of strings that have all the possible
+ * permutations of the characters in the input string.
+ * This is used to get a list of all possible orderings
+ * of a set of combining marks. Note that some of the permutations
+ * are invalid because of combining class collisions, and these
+ * possibilities must be removed because they are not canonically
+ * equivalent.
+ */
+ private String[] producePermutations(String input) {
+ if (input.length() == countChars(input, 0, 1))
+ return new String[] {input};
+
+ if (input.length() == countChars(input, 0, 2)) {
+ int c0 = Character.codePointAt(input, 0);
+ int c1 = Character.codePointAt(input, Character.charCount(c0));
+ if (getClass(c1) == getClass(c0)) {
+ return new String[] {input};
+ }
+ String[] result = new String[2];
+ result[0] = input;
+ StringBuilder sb = new StringBuilder(2);
+ sb.appendCodePoint(c1);
+ sb.appendCodePoint(c0);
+ result[1] = sb.toString();
+ return result;
+ }
+
+ int length = 1;
+ int nCodePoints = countCodePoints(input);
+ for(int x=1; x=0; y--) {
+ if (combClass[y] == combClass[x]) {
+ continue loop;
+ }
+ }
+ StringBuilder sb = new StringBuilder(input);
+ String otherChars = sb.delete(offset, offset+len).toString();
+ String[] subResult = producePermutations(otherChars);
+
+ String prefix = input.substring(offset, offset+len);
+ for(int y=0; y= pLen - 1) // No \Q sequence found
+ return;
+ int j = i;
+ i += 2;
+ int[] newtemp = new int[j + 2*(pLen-i) + 2];
+ System.arraycopy(temp, 0, newtemp, 0, j);
+
+ boolean inQuote = true;
+ while (i < pLen) {
+ int c = temp[i++];
+ if (! ASCII.isAscii(c) || ASCII.isAlnum(c)) {
+ newtemp[j++] = c;
+ } else if (c != '\\') {
+ if (inQuote) newtemp[j++] = '\\';
+ newtemp[j++] = c;
+ } else if (inQuote) {
+ if (temp[i] == 'E') {
+ i++;
+ inQuote = false;
+ } else {
+ newtemp[j++] = '\\';
+ newtemp[j++] = '\\';
+ }
+ } else {
+ if (temp[i] == 'Q') {
+ i++;
+ inQuote = true;
+ } else {
+ newtemp[j++] = c;
+ if (i != pLen)
+ newtemp[j++] = temp[i++];
+ }
+ }
+ }
+
+ patternLength = j;
+ temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
+ }
+
+ /**
+ * Copies regular expression to an int array and invokes the parsing
+ * of the expression which will create the object tree.
+ */
+ private void compile() {
+ // Handle canonical equivalences
+ if (has(CANON_EQ) && !has(LITERAL)) {
+ normalize();
+ } else {
+ normalizedPattern = pattern;
+ }
+ patternLength = normalizedPattern.length();
+
+ // Copy pattern to int array for convenience
+ // Use double zero to terminate pattern
+ temp = new int[patternLength + 2];
+
+ hasSupplementary = false;
+ int c, count = 0;
+ // Convert all chars into code points
+ for (int x = 0; x < patternLength; x += Character.charCount(c)) {
+ c = normalizedPattern.codePointAt(x);
+ if (isSupplementary(c)) {
+ hasSupplementary = true;
+ }
+ temp[count++] = c;
+ }
+
+ patternLength = count; // patternLength now in code points
+
+ if (! has(LITERAL))
+ RemoveQEQuoting();
+
+ // Allocate all temporary objects here.
+ buffer = new int[32];
+ groupNodes = new GroupHead[10];
+ namedGroups = null;
+
+ if (has(LITERAL)) {
+ // Literal pattern handling
+ matchRoot = newSlice(temp, patternLength, hasSupplementary);
+ matchRoot.next = lastAccept;
+ } else {
+ // Start recursive descent parsing
+ matchRoot = expr(lastAccept);
+ // Check extra pattern characters
+ if (patternLength != cursor) {
+ if (peek() == ')') {
+ throw error("Unmatched closing ')'");
+ } else {
+ throw error("Unexpected internal error");
+ }
+ }
+ }
+
+ // Peephole optimization
+ if (matchRoot instanceof Slice) {
+ root = BnM.optimize(matchRoot);
+ if (root == matchRoot) {
+ root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
+ }
+ } else if (matchRoot instanceof Begin || matchRoot instanceof First) {
+ root = matchRoot;
+ } else {
+ root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
+ }
+
+ // Release temporary storage
+ temp = null;
+ buffer = null;
+ groupNodes = null;
+ patternLength = 0;
+ compiled = true;
+ }
+
+ Map namedGroups() {
+ if (namedGroups == null)
+ namedGroups = new HashMap<>(2);
+ return namedGroups;
+ }
+
+ /**
+ * Used to print out a subtree of the Pattern to help with debugging.
+ */
+ private static void printObjectTree(Node node) {
+ while(node != null) {
+ if (node instanceof Prolog) {
+ System.out.println(node);
+ printObjectTree(((Prolog)node).loop);
+ System.out.println("**** end contents prolog loop");
+ } else if (node instanceof Loop) {
+ System.out.println(node);
+ printObjectTree(((Loop)node).body);
+ System.out.println("**** end contents Loop body");
+ } else if (node instanceof Curly) {
+ System.out.println(node);
+ printObjectTree(((Curly)node).atom);
+ System.out.println("**** end contents Curly body");
+ } else if (node instanceof GroupCurly) {
+ System.out.println(node);
+ printObjectTree(((GroupCurly)node).atom);
+ System.out.println("**** end contents GroupCurly body");
+ } else if (node instanceof GroupTail) {
+ System.out.println(node);
+ System.out.println("Tail next is "+node.next);
+ return;
+ } else {
+ System.out.println(node);
+ }
+ node = node.next;
+ if (node != null)
+ System.out.println("->next:");
+ if (node == Pattern.accept) {
+ System.out.println("Accept Node");
+ node = null;
+ }
+ }
+ }
+
+ /**
+ * Used to accumulate information about a subtree of the object graph
+ * so that optimizations can be applied to the subtree.
+ */
+ static final class TreeInfo {
+ int minLength;
+ int maxLength;
+ boolean maxValid;
+ boolean deterministic;
+
+ TreeInfo() {
+ reset();
+ }
+ void reset() {
+ minLength = 0;
+ maxLength = 0;
+ maxValid = true;
+ deterministic = true;
+ }
+ }
+
+ /*
+ * The following private methods are mainly used to improve the
+ * readability of the code. In order to let the Java compiler easily
+ * inline them, we should not put many assertions or error checks in them.
+ */
+
+ /**
+ * Indicates whether a particular flag is set or not.
+ */
+ private boolean has(int f) {
+ return (flags & f) != 0;
+ }
+
+ /**
+ * Match next character, signal error if failed.
+ */
+ private void accept(int ch, String s) {
+ int testChar = temp[cursor++];
+ if (has(COMMENTS))
+ testChar = parsePastWhitespace(testChar);
+ if (ch != testChar) {
+ throw error(s);
+ }
+ }
+
+ /**
+ * Mark the end of pattern with a specific character.
+ */
+ private void mark(int c) {
+ temp[patternLength] = c;
+ }
+
+ /**
+ * Peek the next character, and do not advance the cursor.
+ */
+ private int peek() {
+ int ch = temp[cursor];
+ if (has(COMMENTS))
+ ch = peekPastWhitespace(ch);
+ return ch;
+ }
+
+ /**
+ * Read the next character, and advance the cursor by one.
+ */
+ private int read() {
+ int ch = temp[cursor++];
+ if (has(COMMENTS))
+ ch = parsePastWhitespace(ch);
+ return ch;
+ }
+
+ /**
+ * Read the next character, and advance the cursor by one,
+ * ignoring the COMMENTS setting
+ */
+ private int readEscaped() {
+ int ch = temp[cursor++];
+ return ch;
+ }
+
+ /**
+ * Advance the cursor by one, and peek the next character.
+ */
+ private int next() {
+ int ch = temp[++cursor];
+ if (has(COMMENTS))
+ ch = peekPastWhitespace(ch);
+ return ch;
+ }
+
+ /**
+ * Advance the cursor by one, and peek the next character,
+ * ignoring the COMMENTS setting
+ */
+ private int nextEscaped() {
+ int ch = temp[++cursor];
+ return ch;
+ }
+
+ /**
+ * If in xmode peek past whitespace and comments.
+ */
+ private int peekPastWhitespace(int ch) {
+ while (ASCII.isSpace(ch) || ch == '#') {
+ while (ASCII.isSpace(ch))
+ ch = temp[++cursor];
+ if (ch == '#') {
+ ch = peekPastLine();
+ }
+ }
+ return ch;
+ }
+
+ /**
+ * If in xmode parse past whitespace and comments.
+ */
+ private int parsePastWhitespace(int ch) {
+ while (ASCII.isSpace(ch) || ch == '#') {
+ while (ASCII.isSpace(ch))
+ ch = temp[cursor++];
+ if (ch == '#')
+ ch = parsePastLine();
+ }
+ return ch;
+ }
+
+ /**
+ * xmode parse past comment to end of line.
+ */
+ private int parsePastLine() {
+ int ch = temp[cursor++];
+ while (ch != 0 && !isLineSeparator(ch))
+ ch = temp[cursor++];
+ return ch;
+ }
+
+ /**
+ * xmode peek past comment to end of line.
+ */
+ private int peekPastLine() {
+ int ch = temp[++cursor];
+ while (ch != 0 && !isLineSeparator(ch))
+ ch = temp[++cursor];
+ return ch;
+ }
+
+ /**
+ * Determines if character is a line separator in the current mode
+ */
+ private boolean isLineSeparator(int ch) {
+ if (has(UNIX_LINES)) {
+ return ch == '\n';
+ } else {
+ return (ch == '\n' ||
+ ch == '\r' ||
+ (ch|1) == '\u2029' ||
+ ch == '\u0085');
+ }
+ }
+
+ /**
+ * Read the character after the next one, and advance the cursor by two.
+ */
+ private int skip() {
+ int i = cursor;
+ int ch = temp[i+1];
+ cursor = i + 2;
+ return ch;
+ }
+
+ /**
+ * Unread one next character, and retreat cursor by one.
+ */
+ private void unread() {
+ cursor--;
+ }
+
+ /**
+ * Internal method used for handling all syntax errors. The pattern is
+ * displayed with a pointer to aid in locating the syntax error.
+ */
+ private PatternSyntaxException error(String s) {
+ return new PatternSyntaxException(s, normalizedPattern, cursor - 1);
+ }
+
+ /**
+ * Determines if there is any supplementary character or unpaired
+ * surrogate in the specified range.
+ */
+ private boolean findSupplementary(int start, int end) {
+ for (int i = start; i < end; i++) {
+ if (isSupplementary(temp[i]))
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Determines if the specified code point is a supplementary
+ * character or unpaired surrogate.
+ */
+ private static final boolean isSupplementary(int ch) {
+ return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||
+ Character.isSurrogate((char)ch);
+ }
+
+ /**
+ * The following methods handle the main parsing. They are sorted
+ * according to their precedence order, the lowest one first.
+ */
+
+ /**
+ * The expression is parsed with branch nodes added for alternations.
+ * This may be called recursively to parse sub expressions that may
+ * contain alternations.
+ */
+ private Node expr(Node end) {
+ Node prev = null;
+ Node firstTail = null;
+ Node branchConn = null;
+
+ for (;;) {
+ Node node = sequence(end);
+ Node nodeTail = root; //double return
+ if (prev == null) {
+ prev = node;
+ firstTail = nodeTail;
+ } else {
+ // Branch
+ if (branchConn == null) {
+ branchConn = new BranchConn();
+ branchConn.next = end;
+ }
+ if (node == end) {
+ // if the node returned from sequence() is "end"
+ // we have an empty expr, set a null atom into
+ // the branch to indicate to go "next" directly.
+ node = null;
+ } else {
+ // the "tail.next" of each atom goes to branchConn
+ nodeTail.next = branchConn;
+ }
+ if (prev instanceof Branch) {
+ ((Branch)prev).add(node);
+ } else {
+ if (prev == end) {
+ prev = null;
+ } else {
+ // replace the "end" with "branchConn" at its tail.next
+ // when put the "prev" into the branch as the first atom.
+ firstTail.next = branchConn;
+ }
+ prev = new Branch(prev, node, branchConn);
+ }
+ }
+ if (peek() != '|') {
+ return prev;
+ }
+ next();
+ }
+ }
+
+ /**
+ * Parsing of sequences between alternations.
+ */
+ private Node sequence(Node end) {
+ Node head = null;
+ Node tail = null;
+ Node node = null;
+ LOOP:
+ for (;;) {
+ int ch = peek();
+ switch (ch) {
+ case '(':
+ // Because group handles its own closure,
+ // we need to treat it differently
+ node = group0();
+ // Check for comment or flag group
+ if (node == null)
+ continue;
+ if (head == null)
+ head = node;
+ else
+ tail.next = node;
+ // Double return: Tail was returned in root
+ tail = root;
+ continue;
+ case '[':
+ node = clazz(true);
+ break;
+ case '\\':
+ ch = nextEscaped();
+ if (ch == 'p' || ch == 'P') {
+ boolean oneLetter = true;
+ boolean comp = (ch == 'P');
+ ch = next(); // Consume { if present
+ if (ch != '{') {
+ unread();
+ } else {
+ oneLetter = false;
+ }
+ node = family(oneLetter, comp);
+ } else {
+ unread();
+ node = atom();
+ }
+ break;
+ case '^':
+ next();
+ if (has(MULTILINE)) {
+ if (has(UNIX_LINES))
+ node = new UnixCaret();
+ else
+ node = new Caret();
+ } else {
+ node = new Begin();
+ }
+ break;
+ case '$':
+ next();
+ if (has(UNIX_LINES))
+ node = new UnixDollar(has(MULTILINE));
+ else
+ node = new Dollar(has(MULTILINE));
+ break;
+ case '.':
+ next();
+ if (has(DOTALL)) {
+ node = new All();
+ } else {
+ if (has(UNIX_LINES))
+ node = new UnixDot();
+ else {
+ node = new Dot();
+ }
+ }
+ break;
+ case '|':
+ case ')':
+ break LOOP;
+ case ']': // Now interpreting dangling ] and } as literals
+ case '}':
+ node = atom();
+ break;
+ case '?':
+ case '*':
+ case '+':
+ next();
+ throw error("Dangling meta character '" + ((char)ch) + "'");
+ case 0:
+ if (cursor >= patternLength) {
+ break LOOP;
+ }
+ // Fall through
+ default:
+ node = atom();
+ break;
+ }
+
+ node = closure(node);
+
+ if (head == null) {
+ head = tail = node;
+ } else {
+ tail.next = node;
+ tail = node;
+ }
+ }
+ if (head == null) {
+ return end;
+ }
+ tail.next = end;
+ root = tail; //double return
+ return head;
+ }
+
+ /**
+ * Parse and add a new Single or Slice.
+ */
+ private Node atom() {
+ int first = 0;
+ int prev = -1;
+ boolean hasSupplementary = false;
+ int ch = peek();
+ for (;;) {
+ switch (ch) {
+ case '*':
+ case '+':
+ case '?':
+ case '{':
+ if (first > 1) {
+ cursor = prev; // Unwind one character
+ first--;
+ }
+ break;
+ case '$':
+ case '.':
+ case '^':
+ case '(':
+ case '[':
+ case '|':
+ case ')':
+ break;
+ case '\\':
+ ch = nextEscaped();
+ if (ch == 'p' || ch == 'P') { // Property
+ if (first > 0) { // Slice is waiting; handle it first
+ unread();
+ break;
+ } else { // No slice; just return the family node
+ boolean comp = (ch == 'P');
+ boolean oneLetter = true;
+ ch = next(); // Consume { if present
+ if (ch != '{')
+ unread();
+ else
+ oneLetter = false;
+ return family(oneLetter, comp);
+ }
+ }
+ unread();
+ prev = cursor;
+ ch = escape(false, first == 0);
+ if (ch >= 0) {
+ append(ch, first);
+ first++;
+ if (isSupplementary(ch)) {
+ hasSupplementary = true;
+ }
+ ch = peek();
+ continue;
+ } else if (first == 0) {
+ return root;
+ }
+ // Unwind meta escape sequence
+ cursor = prev;
+ break;
+ case 0:
+ if (cursor >= patternLength) {
+ break;
+ }
+ // Fall through
+ default:
+ prev = cursor;
+ append(ch, first);
+ first++;
+ if (isSupplementary(ch)) {
+ hasSupplementary = true;
+ }
+ ch = next();
+ continue;
+ }
+ break;
+ }
+ if (first == 1) {
+ return newSingle(buffer[0]);
+ } else {
+ return newSlice(buffer, first, hasSupplementary);
+ }
+ }
+
+ private void append(int ch, int len) {
+ if (len >= buffer.length) {
+ int[] tmp = new int[len+len];
+ System.arraycopy(buffer, 0, tmp, 0, len);
+ buffer = tmp;
+ }
+ buffer[len] = ch;
+ }
+
+ /**
+ * Parses a backref greedily, taking as many numbers as it
+ * can. The first digit is always treated as a backref, but
+ * multi digit numbers are only treated as a backref if at
+ * least that many backrefs exist at this point in the regex.
+ */
+ private Node ref(int refNum) {
+ boolean done = false;
+ while(!done) {
+ int ch = peek();
+ switch(ch) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ int newRefNum = (refNum * 10) + (ch - '0');
+ // Add another number if it doesn't make a group
+ // that doesn't exist
+ if (capturingGroupCount - 1 < newRefNum) {
+ done = true;
+ break;
+ }
+ refNum = newRefNum;
+ read();
+ break;
+ default:
+ done = true;
+ break;
+ }
+ }
+ if (has(CASE_INSENSITIVE))
+ return new CIBackRef(refNum, has(UNICODE_CASE));
+ else
+ return new BackRef(refNum);
+ }
+
+ /**
+ * Parses an escape sequence to determine the actual value that needs
+ * to be matched.
+ * If -1 is returned and create was true a new object was added to the tree
+ * to handle the escape sequence.
+ * If the returned value is greater than zero, it is the value that
+ * matches the escape sequence.
+ */
+ private int escape(boolean inclass, boolean create) {
+ int ch = skip();
+ switch (ch) {
+ case '0':
+ return o();
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ if (inclass) break;
+ if (create) {
+ root = ref((ch - '0'));
+ }
+ return -1;
+ case 'A':
+ if (inclass) break;
+ if (create) root = new Begin();
+ return -1;
+ case 'B':
+ if (inclass) break;
+ if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
+ return -1;
+ case 'C':
+ break;
+ case 'D':
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.DIGIT).complement()
+ : new Ctype(ASCII.DIGIT).complement();
+ return -1;
+ case 'E':
+ case 'F':
+ break;
+ case 'G':
+ if (inclass) break;
+ if (create) root = new LastMatch();
+ return -1;
+ case 'H':
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'O':
+ case 'P':
+ case 'Q':
+ case 'R':
+ break;
+ case 'S':
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.WHITE_SPACE).complement()
+ : new Ctype(ASCII.SPACE).complement();
+ return -1;
+ case 'T':
+ case 'U':
+ case 'V':
+ break;
+ case 'W':
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.WORD).complement()
+ : new Ctype(ASCII.WORD).complement();
+ return -1;
+ case 'X':
+ case 'Y':
+ break;
+ case 'Z':
+ if (inclass) break;
+ if (create) {
+ if (has(UNIX_LINES))
+ root = new UnixDollar(false);
+ else
+ root = new Dollar(false);
+ }
+ return -1;
+ case 'a':
+ return '\007';
+ case 'b':
+ if (inclass) break;
+ if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
+ return -1;
+ case 'c':
+ return c();
+ case 'd':
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.DIGIT)
+ : new Ctype(ASCII.DIGIT);
+ return -1;
+ case 'e':
+ return '\033';
+ case 'f':
+ return '\f';
+ case 'g':
+ case 'h':
+ case 'i':
+ case 'j':
+ break;
+ case 'k':
+ if (inclass)
+ break;
+ if (read() != '<')
+ throw error("\\k is not followed by '<' for named capturing group");
+ String name = groupname(read());
+ if (!namedGroups().containsKey(name))
+ throw error("(named capturing group <"+ name+"> does not exit");
+ if (create) {
+ if (has(CASE_INSENSITIVE))
+ root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
+ else
+ root = new BackRef(namedGroups().get(name));
+ }
+ return -1;
+ case 'l':
+ case 'm':
+ break;
+ case 'n':
+ return '\n';
+ case 'o':
+ case 'p':
+ case 'q':
+ break;
+ case 'r':
+ return '\r';
+ case 's':
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.WHITE_SPACE)
+ : new Ctype(ASCII.SPACE);
+ return -1;
+ case 't':
+ return '\t';
+ case 'u':
+ return u();
+ case 'v':
+ return '\013';
+ case 'w':
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.WORD)
+ : new Ctype(ASCII.WORD);
+ return -1;
+ case 'x':
+ return x();
+ case 'y':
+ break;
+ case 'z':
+ if (inclass) break;
+ if (create) root = new End();
+ return -1;
+ default:
+ return ch;
+ }
+ throw error("Illegal/unsupported escape sequence");
+ }
+
+ /**
+ * Parse a character class, and return the node that matches it.
+ *
+ * Consumes a ] on the way out if consume is true. Usually consume
+ * is true except for the case of [abc&&def] where def is a separate
+ * right hand node with "understood" brackets.
+ */
+ private CharProperty clazz(boolean consume) {
+ CharProperty prev = null;
+ CharProperty node = null;
+ BitClass bits = new BitClass();
+ boolean include = true;
+ boolean firstInClass = true;
+ int ch = next();
+ for (;;) {
+ switch (ch) {
+ case '^':
+ // Negates if first char in a class, otherwise literal
+ if (firstInClass) {
+ if (temp[cursor-1] != '[')
+ break;
+ ch = next();
+ include = !include;
+ continue;
+ } else {
+ // ^ not first in class, treat as literal
+ break;
+ }
+ case '[':
+ firstInClass = false;
+ node = clazz(true);
+ if (prev == null)
+ prev = node;
+ else
+ prev = union(prev, node);
+ ch = peek();
+ continue;
+ case '&':
+ firstInClass = false;
+ ch = next();
+ if (ch == '&') {
+ ch = next();
+ CharProperty rightNode = null;
+ while (ch != ']' && ch != '&') {
+ if (ch == '[') {
+ if (rightNode == null)
+ rightNode = clazz(true);
+ else
+ rightNode = union(rightNode, clazz(true));
+ } else { // abc&&def
+ unread();
+ rightNode = clazz(false);
+ }
+ ch = peek();
+ }
+ if (rightNode != null)
+ node = rightNode;
+ if (prev == null) {
+ if (rightNode == null)
+ throw error("Bad class syntax");
+ else
+ prev = rightNode;
+ } else {
+ prev = intersection(prev, node);
+ }
+ } else {
+ // treat as a literal &
+ unread();
+ break;
+ }
+ continue;
+ case 0:
+ firstInClass = false;
+ if (cursor >= patternLength)
+ throw error("Unclosed character class");
+ break;
+ case ']':
+ firstInClass = false;
+ if (prev != null) {
+ if (consume)
+ next();
+ return prev;
+ }
+ break;
+ default:
+ firstInClass = false;
+ break;
+ }
+ node = range(bits);
+ if (include) {
+ if (prev == null) {
+ prev = node;
+ } else {
+ if (prev != node)
+ prev = union(prev, node);
+ }
+ } else {
+ if (prev == null) {
+ prev = node.complement();
+ } else {
+ if (prev != node)
+ prev = setDifference(prev, node);
+ }
+ }
+ ch = peek();
+ }
+ }
+
+ private CharProperty bitsOrSingle(BitClass bits, int ch) {
+ /* Bits can only handle codepoints in [u+0000-u+00ff] range.
+ Use "single" node instead of bits when dealing with unicode
+ case folding for codepoints listed below.
+ (1)Uppercase out of range: u+00ff, u+00b5
+ toUpperCase(u+00ff) -> u+0178
+ toUpperCase(u+00b5) -> u+039c
+ (2)LatinSmallLetterLongS u+17f
+ toUpperCase(u+017f) -> u+0053
+ (3)LatinSmallLetterDotlessI u+131
+ toUpperCase(u+0131) -> u+0049
+ (4)LatinCapitalLetterIWithDotAbove u+0130
+ toLowerCase(u+0130) -> u+0069
+ (5)KelvinSign u+212a
+ toLowerCase(u+212a) ==> u+006B
+ (6)AngstromSign u+212b
+ toLowerCase(u+212b) ==> u+00e5
+ */
+ int d;
+ if (ch < 256 &&
+ !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
+ (ch == 0xff || ch == 0xb5 ||
+ ch == 0x49 || ch == 0x69 || //I and i
+ ch == 0x53 || ch == 0x73 || //S and s
+ ch == 0x4b || ch == 0x6b || //K and k
+ ch == 0xc5 || ch == 0xe5))) //A+ring
+ return bits.add(ch, flags());
+ return newSingle(ch);
+ }
+
+ /**
+ * Parse a single character or a character range in a character class
+ * and return its representative node.
+ */
+ private CharProperty range(BitClass bits) {
+ int ch = peek();
+ if (ch == '\\') {
+ ch = nextEscaped();
+ if (ch == 'p' || ch == 'P') { // A property
+ boolean comp = (ch == 'P');
+ boolean oneLetter = true;
+ // Consume { if present
+ ch = next();
+ if (ch != '{')
+ unread();
+ else
+ oneLetter = false;
+ return family(oneLetter, comp);
+ } else { // ordinary escape
+ unread();
+ ch = escape(true, true);
+ if (ch == -1)
+ return (CharProperty) root;
+ }
+ } else {
+ ch = single();
+ }
+ if (ch >= 0) {
+ if (peek() == '-') {
+ int endRange = temp[cursor+1];
+ if (endRange == '[') {
+ return bitsOrSingle(bits, ch);
+ }
+ if (endRange != ']') {
+ next();
+ int m = single();
+ if (m < ch)
+ throw error("Illegal character range");
+ if (has(CASE_INSENSITIVE))
+ return caseInsensitiveRangeFor(ch, m);
+ else
+ return rangeFor(ch, m);
+ }
+ }
+ return bitsOrSingle(bits, ch);
+ }
+ throw error("Unexpected character '"+((char)ch)+"'");
+ }
+
+ private int single() {
+ int ch = peek();
+ switch (ch) {
+ case '\\':
+ return escape(true, false);
+ default:
+ next();
+ return ch;
+ }
+ }
+
+ /**
+ * Parses a Unicode character family and returns its representative node.
+ */
+ private CharProperty family(boolean singleLetter,
+ boolean maybeComplement)
+ {
+ next();
+ String name;
+ CharProperty node = null;
+
+ if (singleLetter) {
+ int c = temp[cursor];
+ if (!Character.isSupplementaryCodePoint(c)) {
+ name = String.valueOf((char)c);
+ } else {
+ name = new String(temp, cursor, 1);
+ }
+ read();
+ } else {
+ int i = cursor;
+ mark('}');
+ while(read() != '}') {
+ }
+ mark('\000');
+ int j = cursor;
+ if (j > patternLength)
+ throw error("Unclosed character family");
+ if (i + 1 >= j)
+ throw error("Empty character family");
+ name = new String(temp, i, j-i-1);
+ }
+
+ int i = name.indexOf('=');
+ if (i != -1) {
+ // property construct \p{name=value}
+ String value = name.substring(i + 1);
+ name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
+ if ("sc".equals(name) || "script".equals(name)) {
+ node = unicodeScriptPropertyFor(value);
+ } else if ("blk".equals(name) || "block".equals(name)) {
+ node = unicodeBlockPropertyFor(value);
+ } else if ("gc".equals(name) || "general_category".equals(name)) {
+ node = charPropertyNodeFor(value);
+ } else {
+ throw error("Unknown Unicode property {name=<" + name + ">, "
+ + "value=<" + value + ">}");
+ }
+ } else {
+ if (name.startsWith("In")) {
+ // \p{inBlockName}
+ node = unicodeBlockPropertyFor(name.substring(2));
+ } else if (name.startsWith("Is")) {
+ // \p{isGeneralCategory} and \p{isScriptName}
+ name = name.substring(2);
+ UnicodeProp uprop = UnicodeProp.forName(name);
+ if (uprop != null)
+ node = new Utype(uprop);
+ if (node == null)
+ node = CharPropertyNames.charPropertyFor(name);
+ if (node == null)
+ node = unicodeScriptPropertyFor(name);
+ } else {
+ if (has(UNICODE_CHARACTER_CLASS)) {
+ UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
+ if (uprop != null)
+ node = new Utype(uprop);
+ }
+ if (node == null)
+ node = charPropertyNodeFor(name);
+ }
+ }
+ if (maybeComplement) {
+ if (node instanceof Category || node instanceof Block)
+ hasSupplementary = true;
+ node = node.complement();
+ }
+ return node;
+ }
+
+
+ /**
+ * Returns a CharProperty matching all characters belong to
+ * a UnicodeScript.
+ */
+ private CharProperty unicodeScriptPropertyFor(String name) {
+ final Character.UnicodeScript script;
+ try {
+ script = Character.UnicodeScript.forName(name);
+ } catch (IllegalArgumentException iae) {
+ throw error("Unknown character script name {" + name + "}");
+ }
+ return new Script(script);
+ }
+
+ /**
+ * Returns a CharProperty matching all characters in a UnicodeBlock.
+ */
+ private CharProperty unicodeBlockPropertyFor(String name) {
+ final Character.UnicodeBlock block;
+ try {
+ block = Character.UnicodeBlock.forName(name);
+ } catch (IllegalArgumentException iae) {
+ throw error("Unknown character block name {" + name + "}");
+ }
+ return new Block(block);
+ }
+
+ /**
+ * Returns a CharProperty matching all characters in a named property.
+ */
+ private CharProperty charPropertyNodeFor(String name) {
+ CharProperty p = CharPropertyNames.charPropertyFor(name);
+ if (p == null)
+ throw error("Unknown character property name {" + name + "}");
+ return p;
+ }
+
+ /**
+ * Parses and returns the name of a "named capturing group", the trailing
+ * ">" is consumed after parsing.
+ */
+ private String groupname(int ch) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(Character.toChars(ch));
+ while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
+ ASCII.isDigit(ch)) {
+ sb.append(Character.toChars(ch));
+ }
+ if (sb.length() == 0)
+ throw error("named capturing group has 0 length name");
+ if (ch != '>')
+ throw error("named capturing group is missing trailing '>'");
+ return sb.toString();
+ }
+
+ /**
+ * Parses a group and returns the head node of a set of nodes that process
+ * the group. Sometimes a double return system is used where the tail is
+ * returned in root.
+ */
+ private Node group0() {
+ boolean capturingGroup = false;
+ Node head = null;
+ Node tail = null;
+ int save = flags;
+ root = null;
+ int ch = next();
+ if (ch == '?') {
+ ch = skip();
+ switch (ch) {
+ case ':': // (?:xxx) pure group
+ head = createGroup(true);
+ tail = root;
+ head.next = expr(tail);
+ break;
+ case '=': // (?=xxx) and (?!xxx) lookahead
+ case '!':
+ head = createGroup(true);
+ tail = root;
+ head.next = expr(tail);
+ if (ch == '=') {
+ head = tail = new Pos(head);
+ } else {
+ head = tail = new Neg(head);
+ }
+ break;
+ case '>': // (?>xxx) independent group
+ head = createGroup(true);
+ tail = root;
+ head.next = expr(tail);
+ head = tail = new Ques(head, INDEPENDENT);
+ break;
+ case '<': // (? is already defined");
+ capturingGroup = true;
+ head = createGroup(false);
+ tail = root;
+ namedGroups().put(name, capturingGroupCount-1);
+ head.next = expr(tail);
+ break;
+ }
+ int start = cursor;
+ head = createGroup(true);
+ tail = root;
+ head.next = expr(tail);
+ tail.next = lookbehindEnd;
+ TreeInfo info = new TreeInfo();
+ head.study(info);
+ if (info.maxValid == false) {
+ throw error("Look-behind group does not have "
+ + "an obvious maximum length");
+ }
+ boolean hasSupplementary = findSupplementary(start, patternLength);
+ if (ch == '=') {
+ head = tail = (hasSupplementary ?
+ new BehindS(head, info.maxLength,
+ info.minLength) :
+ new Behind(head, info.maxLength,
+ info.minLength));
+ } else if (ch == '!') {
+ head = tail = (hasSupplementary ?
+ new NotBehindS(head, info.maxLength,
+ info.minLength) :
+ new NotBehind(head, info.maxLength,
+ info.minLength));
+ } else {
+ throw error("Unknown look-behind group");
+ }
+ break;
+ case '$':
+ case '@':
+ throw error("Unknown group type");
+ default: // (?xxx:) inlined match flags
+ unread();
+ addFlag();
+ ch = read();
+ if (ch == ')') {
+ return null; // Inline modifier only
+ }
+ if (ch != ':') {
+ throw error("Unknown inline modifier");
+ }
+ head = createGroup(true);
+ tail = root;
+ head.next = expr(tail);
+ break;
+ }
+ } else { // (xxx) a regular group
+ capturingGroup = true;
+ head = createGroup(false);
+ tail = root;
+ head.next = expr(tail);
+ }
+
+ accept(')', "Unclosed group");
+ flags = save;
+
+ // Check for quantifiers
+ Node node = closure(head);
+ if (node == head) { // No closure
+ root = tail;
+ return node; // Dual return
+ }
+ if (head == tail) { // Zero length assertion
+ root = node;
+ return node; // Dual return
+ }
+
+ if (node instanceof Ques) {
+ Ques ques = (Ques) node;
+ if (ques.type == POSSESSIVE) {
+ root = node;
+ return node;
+ }
+ tail.next = new BranchConn();
+ tail = tail.next;
+ if (ques.type == GREEDY) {
+ head = new Branch(head, null, tail);
+ } else { // Reluctant quantifier
+ head = new Branch(null, head, tail);
+ }
+ root = tail;
+ return head;
+ } else if (node instanceof Curly) {
+ Curly curly = (Curly) node;
+ if (curly.type == POSSESSIVE) {
+ root = node;
+ return node;
+ }
+ // Discover if the group is deterministic
+ TreeInfo info = new TreeInfo();
+ if (head.study(info)) { // Deterministic
+ GroupTail temp = (GroupTail) tail;
+ head = root = new GroupCurly(head.next, curly.cmin,
+ curly.cmax, curly.type,
+ ((GroupTail)tail).localIndex,
+ ((GroupTail)tail).groupIndex,
+ capturingGroup);
+ return head;
+ } else { // Non-deterministic
+ int temp = ((GroupHead) head).localIndex;
+ Loop loop;
+ if (curly.type == GREEDY)
+ loop = new Loop(this.localCount, temp);
+ else // Reluctant Curly
+ loop = new LazyLoop(this.localCount, temp);
+ Prolog prolog = new Prolog(loop);
+ this.localCount += 1;
+ loop.cmin = curly.cmin;
+ loop.cmax = curly.cmax;
+ loop.body = head;
+ tail.next = loop;
+ root = loop;
+ return prolog; // Dual return
+ }
+ }
+ throw error("Internal logic error");
+ }
+
+ /**
+ * Create group head and tail nodes using double return. If the group is
+ * created with anonymous true then it is a pure group and should not
+ * affect group counting.
+ */
+ private Node createGroup(boolean anonymous) {
+ int localIndex = localCount++;
+ int groupIndex = 0;
+ if (!anonymous)
+ groupIndex = capturingGroupCount++;
+ GroupHead head = new GroupHead(localIndex);
+ root = new GroupTail(localIndex, groupIndex);
+ if (!anonymous && groupIndex < 10)
+ groupNodes[groupIndex] = head;
+ return head;
+ }
+
+ /**
+ * Parses inlined match flags and set them appropriately.
+ */
+ private void addFlag() {
+ int ch = peek();
+ for (;;) {
+ switch (ch) {
+ case 'i':
+ flags |= CASE_INSENSITIVE;
+ break;
+ case 'm':
+ flags |= MULTILINE;
+ break;
+ case 's':
+ flags |= DOTALL;
+ break;
+ case 'd':
+ flags |= UNIX_LINES;
+ break;
+ case 'u':
+ flags |= UNICODE_CASE;
+ break;
+ case 'c':
+ flags |= CANON_EQ;
+ break;
+ case 'x':
+ flags |= COMMENTS;
+ break;
+ case 'U':
+ flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE);
+ break;
+ case '-': // subFlag then fall through
+ ch = next();
+ subFlag();
+ default:
+ return;
+ }
+ ch = next();
+ }
+ }
+
+ /**
+ * Parses the second part of inlined match flags and turns off
+ * flags appropriately.
+ */
+ private void subFlag() {
+ int ch = peek();
+ for (;;) {
+ switch (ch) {
+ case 'i':
+ flags &= ~CASE_INSENSITIVE;
+ break;
+ case 'm':
+ flags &= ~MULTILINE;
+ break;
+ case 's':
+ flags &= ~DOTALL;
+ break;
+ case 'd':
+ flags &= ~UNIX_LINES;
+ break;
+ case 'u':
+ flags &= ~UNICODE_CASE;
+ break;
+ case 'c':
+ flags &= ~CANON_EQ;
+ break;
+ case 'x':
+ flags &= ~COMMENTS;
+ break;
+ case 'U':
+ flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
+ default:
+ return;
+ }
+ ch = next();
+ }
+ }
+
+ static final int MAX_REPS = 0x7FFFFFFF;
+
+ static final int GREEDY = 0;
+
+ static final int LAZY = 1;
+
+ static final int POSSESSIVE = 2;
+
+ static final int INDEPENDENT = 3;
+
+ /**
+ * Processes repetition. If the next character peeked is a quantifier
+ * then new nodes must be appended to handle the repetition.
+ * Prev could be a single or a group, so it could be a chain of nodes.
+ */
+ private Node closure(Node prev) {
+ Node atom;
+ int ch = peek();
+ switch (ch) {
+ case '?':
+ ch = next();
+ if (ch == '?') {
+ next();
+ return new Ques(prev, LAZY);
+ } else if (ch == '+') {
+ next();
+ return new Ques(prev, POSSESSIVE);
+ }
+ return new Ques(prev, GREEDY);
+ case '*':
+ ch = next();
+ if (ch == '?') {
+ next();
+ return new Curly(prev, 0, MAX_REPS, LAZY);
+ } else if (ch == '+') {
+ next();
+ return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
+ }
+ return new Curly(prev, 0, MAX_REPS, GREEDY);
+ case '+':
+ ch = next();
+ if (ch == '?') {
+ next();
+ return new Curly(prev, 1, MAX_REPS, LAZY);
+ } else if (ch == '+') {
+ next();
+ return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
+ }
+ return new Curly(prev, 1, MAX_REPS, GREEDY);
+ case '{':
+ ch = temp[cursor+1];
+ if (ASCII.isDigit(ch)) {
+ skip();
+ int cmin = 0;
+ do {
+ cmin = cmin * 10 + (ch - '0');
+ } while (ASCII.isDigit(ch = read()));
+ int cmax = cmin;
+ if (ch == ',') {
+ ch = read();
+ cmax = MAX_REPS;
+ if (ch != '}') {
+ cmax = 0;
+ while (ASCII.isDigit(ch)) {
+ cmax = cmax * 10 + (ch - '0');
+ ch = read();
+ }
+ }
+ }
+ if (ch != '}')
+ throw error("Unclosed counted closure");
+ if (((cmin) | (cmax) | (cmax - cmin)) < 0)
+ throw error("Illegal repetition range");
+ Curly curly;
+ ch = peek();
+ if (ch == '?') {
+ next();
+ curly = new Curly(prev, cmin, cmax, LAZY);
+ } else if (ch == '+') {
+ next();
+ curly = new Curly(prev, cmin, cmax, POSSESSIVE);
+ } else {
+ curly = new Curly(prev, cmin, cmax, GREEDY);
+ }
+ return curly;
+ } else {
+ throw error("Illegal repetition");
+ }
+ default:
+ return prev;
+ }
+ }
+
+ /**
+ * Utility method for parsing control escape sequences.
+ */
+ private int c() {
+ if (cursor < patternLength) {
+ return read() ^ 64;
+ }
+ throw error("Illegal control escape sequence");
+ }
+
+ /**
+ * Utility method for parsing octal escape sequences.
+ */
+ private int o() {
+ int n = read();
+ if (((n-'0')|('7'-n)) >= 0) {
+ int m = read();
+ if (((m-'0')|('7'-m)) >= 0) {
+ int o = read();
+ if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) {
+ return (n - '0') * 64 + (m - '0') * 8 + (o - '0');
+ }
+ unread();
+ return (n - '0') * 8 + (m - '0');
+ }
+ unread();
+ return (n - '0');
+ }
+ throw error("Illegal octal escape sequence");
+ }
+
+ /**
+ * Utility method for parsing hexadecimal escape sequences.
+ */
+ private int x() {
+ int n = read();
+ if (ASCII.isHexDigit(n)) {
+ int m = read();
+ if (ASCII.isHexDigit(m)) {
+ return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
+ }
+ } else if (n == '{' && ASCII.isHexDigit(peek())) {
+ int ch = 0;
+ while (ASCII.isHexDigit(n = read())) {
+ ch = (ch << 4) + ASCII.toDigit(n);
+ if (ch > Character.MAX_CODE_POINT)
+ throw error("Hexadecimal codepoint is too big");
+ }
+ if (n != '}')
+ throw error("Unclosed hexadecimal escape sequence");
+ return ch;
+ }
+ throw error("Illegal hexadecimal escape sequence");
+ }
+
+ /**
+ * Utility method for parsing unicode escape sequences.
+ */
+ private int cursor() {
+ return cursor;
+ }
+
+ private void setcursor(int pos) {
+ cursor = pos;
+ }
+
+ private int uxxxx() {
+ int n = 0;
+ for (int i = 0; i < 4; i++) {
+ int ch = read();
+ if (!ASCII.isHexDigit(ch)) {
+ throw error("Illegal Unicode escape sequence");
+ }
+ n = n * 16 + ASCII.toDigit(ch);
+ }
+ return n;
+ }
+
+ private int u() {
+ int n = uxxxx();
+ if (Character.isHighSurrogate((char)n)) {
+ int cur = cursor();
+ if (read() == '\\' && read() == 'u') {
+ int n2 = uxxxx();
+ if (Character.isLowSurrogate((char)n2))
+ return Character.toCodePoint((char)n, (char)n2);
+ }
+ setcursor(cur);
+ }
+ return n;
+ }
+
+ //
+ // Utility methods for code point support
+ //
+
+ private static final int countChars(CharSequence seq, int index,
+ int lengthInCodePoints) {
+ // optimization
+ if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
+ assert (index >= 0 && index < seq.length());
+ return 1;
+ }
+ int length = seq.length();
+ int x = index;
+ if (lengthInCodePoints >= 0) {
+ assert (index >= 0 && index < length);
+ for (int i = 0; x < length && i < lengthInCodePoints; i++) {
+ if (Character.isHighSurrogate(seq.charAt(x++))) {
+ if (x < length && Character.isLowSurrogate(seq.charAt(x))) {
+ x++;
+ }
+ }
+ }
+ return x - index;
+ }
+
+ assert (index >= 0 && index <= length);
+ if (index == 0) {
+ return 0;
+ }
+ int len = -lengthInCodePoints;
+ for (int i = 0; x > 0 && i < len; i++) {
+ if (Character.isLowSurrogate(seq.charAt(--x))) {
+ if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) {
+ x--;
+ }
+ }
+ }
+ return index - x;
+ }
+
+ private static final int countCodePoints(CharSequence seq) {
+ int length = seq.length();
+ int n = 0;
+ for (int i = 0; i < length; ) {
+ n++;
+ if (Character.isHighSurrogate(seq.charAt(i++))) {
+ if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
+ i++;
+ }
+ }
+ }
+ return n;
+ }
+
+ /**
+ * Creates a bit vector for matching Latin-1 values. A normal BitClass
+ * never matches values above Latin-1, and a complemented BitClass always
+ * matches values above Latin-1.
+ */
+ private static final class BitClass extends BmpCharProperty {
+ final boolean[] bits;
+ BitClass() { bits = new boolean[256]; }
+ private BitClass(boolean[] bits) { this.bits = bits; }
+ BitClass add(int c, int flags) {
+ assert c >= 0 && c <= 255;
+ if ((flags & CASE_INSENSITIVE) != 0) {
+ if (ASCII.isAscii(c)) {
+ bits[ASCII.toUpper(c)] = true;
+ bits[ASCII.toLower(c)] = true;
+ } else if ((flags & UNICODE_CASE) != 0) {
+ bits[Character.toLowerCase(c)] = true;
+ bits[Character.toUpperCase(c)] = true;
+ }
+ }
+ bits[c] = true;
+ return this;
+ }
+ boolean isSatisfiedBy(int ch) {
+ return ch < 256 && bits[ch];
+ }
+ }
+
+ /**
+ * Returns a suitably optimized, single character matcher.
+ */
+ private CharProperty newSingle(final int ch) {
+ if (has(CASE_INSENSITIVE)) {
+ int lower, upper;
+ if (has(UNICODE_CASE)) {
+ upper = Character.toUpperCase(ch);
+ lower = Character.toLowerCase(upper);
+ if (upper != lower)
+ return new SingleU(lower);
+ } else if (ASCII.isAscii(ch)) {
+ lower = ASCII.toLower(ch);
+ upper = ASCII.toUpper(ch);
+ if (lower != upper)
+ return new SingleI(lower, upper);
+ }
+ }
+ if (isSupplementary(ch))
+ return new SingleS(ch); // Match a given Unicode character
+ return new Single(ch); // Match a given BMP character
+ }
+
+ /**
+ * Utility method for creating a string slice matcher.
+ */
+ private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
+ int[] tmp = new int[count];
+ if (has(CASE_INSENSITIVE)) {
+ if (has(UNICODE_CASE)) {
+ for (int i = 0; i < count; i++) {
+ tmp[i] = Character.toLowerCase(
+ Character.toUpperCase(buf[i]));
+ }
+ return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
+ }
+ for (int i = 0; i < count; i++) {
+ tmp[i] = ASCII.toLower(buf[i]);
+ }
+ return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
+ }
+ for (int i = 0; i < count; i++) {
+ tmp[i] = buf[i];
+ }
+ return hasSupplementary ? new SliceS(tmp) : new Slice(tmp);
+ }
+
+ /**
+ * The following classes are the building components of the object
+ * tree that represents a compiled regular expression. The object tree
+ * is made of individual elements that handle constructs in the Pattern.
+ * Each type of object knows how to match its equivalent construct with
+ * the match() method.
+ */
+
+ /**
+ * Base class for all node classes. Subclasses should override the match()
+ * method as appropriate. This class is an accepting node, so its match()
+ * always returns true.
+ */
+ static class Node extends Object {
+ Node next;
+ Node() {
+ next = Pattern.accept;
+ }
+ /**
+ * This method implements the classic accept node.
+ */
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ matcher.last = i;
+ matcher.groups[0] = matcher.first;
+ matcher.groups[1] = matcher.last;
+ return true;
+ }
+ /**
+ * This method is good for all zero length assertions.
+ */
+ boolean study(TreeInfo info) {
+ if (next != null) {
+ return next.study(info);
+ } else {
+ return info.deterministic;
+ }
+ }
+ }
+
+ static class LastNode extends Node {
+ /**
+ * This method implements the classic accept node with
+ * the addition of a check to see if the match occurred
+ * using all of the input.
+ */
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to)
+ return false;
+ matcher.last = i;
+ matcher.groups[0] = matcher.first;
+ matcher.groups[1] = matcher.last;
+ return true;
+ }
+ }
+
+ /**
+ * Used for REs that can start anywhere within the input string.
+ * This basically tries to match repeatedly at each spot in the
+ * input string, moving forward after each try. An anchored search
+ * or a BnM will bypass this node completely.
+ */
+ static class Start extends Node {
+ int minLength;
+ Start(Node node) {
+ this.next = node;
+ TreeInfo info = new TreeInfo();
+ next.study(info);
+ minLength = info.minLength;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (i > matcher.to - minLength) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ int guard = matcher.to - minLength;
+ for (; i <= guard; i++) {
+ if (next.match(matcher, i, seq)) {
+ matcher.first = i;
+ matcher.groups[0] = matcher.first;
+ matcher.groups[1] = matcher.last;
+ return true;
+ }
+ }
+ matcher.hitEnd = true;
+ return false;
+ }
+ boolean study(TreeInfo info) {
+ next.study(info);
+ info.maxValid = false;
+ info.deterministic = false;
+ return false;
+ }
+ }
+
+ /*
+ * StartS supports supplementary characters, including unpaired surrogates.
+ */
+ static final class StartS extends Start {
+ StartS(Node node) {
+ super(node);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (i > matcher.to - minLength) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ int guard = matcher.to - minLength;
+ while (i <= guard) {
+ //if ((ret = next.match(matcher, i, seq)) || i == guard)
+ if (next.match(matcher, i, seq)) {
+ matcher.first = i;
+ matcher.groups[0] = matcher.first;
+ matcher.groups[1] = matcher.last;
+ return true;
+ }
+ if (i == guard)
+ break;
+ // Optimization to move to the next character. This is
+ // faster than countChars(seq, i, 1).
+ if (Character.isHighSurrogate(seq.charAt(i++))) {
+ if (i < seq.length() &&
+ Character.isLowSurrogate(seq.charAt(i))) {
+ i++;
+ }
+ }
+ }
+ matcher.hitEnd = true;
+ return false;
+ }
+ }
+
+ /**
+ * Node to anchor at the beginning of input. This object implements the
+ * match for a \A sequence, and the caret anchor will use this if not in
+ * multiline mode.
+ */
+ static final class Begin extends Node {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int fromIndex = (matcher.anchoringBounds) ?
+ matcher.from : 0;
+ if (i == fromIndex && next.match(matcher, i, seq)) {
+ matcher.first = i;
+ matcher.groups[0] = i;
+ matcher.groups[1] = matcher.last;
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+
+ /**
+ * Node to anchor at the end of input. This is the absolute end, so this
+ * should not match at the last newline before the end as $ will.
+ */
+ static final class End extends Node {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int endIndex = (matcher.anchoringBounds) ?
+ matcher.to : matcher.getTextLength();
+ if (i == endIndex) {
+ matcher.hitEnd = true;
+ return next.match(matcher, i, seq);
+ }
+ return false;
+ }
+ }
+
+ /**
+ * Node to anchor at the beginning of a line. This is essentially the
+ * object to match for the multiline ^.
+ */
+ static final class Caret extends Node {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int startIndex = matcher.from;
+ int endIndex = matcher.to;
+ if (!matcher.anchoringBounds) {
+ startIndex = 0;
+ endIndex = matcher.getTextLength();
+ }
+ // Perl does not match ^ at end of input even after newline
+ if (i == endIndex) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ if (i > startIndex) {
+ char ch = seq.charAt(i-1);
+ if (ch != '\n' && ch != '\r'
+ && (ch|1) != '\u2029'
+ && ch != '\u0085' ) {
+ return false;
+ }
+ // Should treat /r/n as one newline
+ if (ch == '\r' && seq.charAt(i) == '\n')
+ return false;
+ }
+ return next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Node to anchor at the beginning of a line when in unixdot mode.
+ */
+ static final class UnixCaret extends Node {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int startIndex = matcher.from;
+ int endIndex = matcher.to;
+ if (!matcher.anchoringBounds) {
+ startIndex = 0;
+ endIndex = matcher.getTextLength();
+ }
+ // Perl does not match ^ at end of input even after newline
+ if (i == endIndex) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ if (i > startIndex) {
+ char ch = seq.charAt(i-1);
+ if (ch != '\n') {
+ return false;
+ }
+ }
+ return next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Node to match the location where the last match ended.
+ * This is used for the \G construct.
+ */
+ static final class LastMatch extends Node {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (i != matcher.oldLast)
+ return false;
+ return next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Node to anchor at the end of a line or the end of input based on the
+ * multiline mode.
+ *
+ * When not in multiline mode, the $ can only match at the very end
+ * of the input, unless the input ends in a line terminator in which
+ * it matches right before the last line terminator.
+ *
+ * Note that \r\n is considered an atomic line terminator.
+ *
+ * Like ^ the $ operator matches at a position, it does not match the
+ * line terminators themselves.
+ */
+ static final class Dollar extends Node {
+ boolean multiline;
+ Dollar(boolean mul) {
+ multiline = mul;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int endIndex = (matcher.anchoringBounds) ?
+ matcher.to : matcher.getTextLength();
+ if (!multiline) {
+ if (i < endIndex - 2)
+ return false;
+ if (i == endIndex - 2) {
+ char ch = seq.charAt(i);
+ if (ch != '\r')
+ return false;
+ ch = seq.charAt(i + 1);
+ if (ch != '\n')
+ return false;
+ }
+ }
+ // Matches before any line terminator; also matches at the
+ // end of input
+ // Before line terminator:
+ // If multiline, we match here no matter what
+ // If not multiline, fall through so that the end
+ // is marked as hit; this must be a /r/n or a /n
+ // at the very end so the end was hit; more input
+ // could make this not match here
+ if (i < endIndex) {
+ char ch = seq.charAt(i);
+ if (ch == '\n') {
+ // No match between \r\n
+ if (i > 0 && seq.charAt(i-1) == '\r')
+ return false;
+ if (multiline)
+ return next.match(matcher, i, seq);
+ } else if (ch == '\r' || ch == '\u0085' ||
+ (ch|1) == '\u2029') {
+ if (multiline)
+ return next.match(matcher, i, seq);
+ } else { // No line terminator, no match
+ return false;
+ }
+ }
+ // Matched at current end so hit end
+ matcher.hitEnd = true;
+ // If a $ matches because of end of input, then more input
+ // could cause it to fail!
+ matcher.requireEnd = true;
+ return next.match(matcher, i, seq);
+ }
+ boolean study(TreeInfo info) {
+ next.study(info);
+ return info.deterministic;
+ }
+ }
+
+ /**
+ * Node to anchor at the end of a line or the end of input based on the
+ * multiline mode when in unix lines mode.
+ */
+ static final class UnixDollar extends Node {
+ boolean multiline;
+ UnixDollar(boolean mul) {
+ multiline = mul;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int endIndex = (matcher.anchoringBounds) ?
+ matcher.to : matcher.getTextLength();
+ if (i < endIndex) {
+ char ch = seq.charAt(i);
+ if (ch == '\n') {
+ // If not multiline, then only possible to
+ // match at very end or one before end
+ if (multiline == false && i != endIndex - 1)
+ return false;
+ // If multiline return next.match without setting
+ // matcher.hitEnd
+ if (multiline)
+ return next.match(matcher, i, seq);
+ } else {
+ return false;
+ }
+ }
+ // Matching because at the end or 1 before the end;
+ // more input could change this so set hitEnd
+ matcher.hitEnd = true;
+ // If a $ matches because of end of input, then more input
+ // could cause it to fail!
+ matcher.requireEnd = true;
+ return next.match(matcher, i, seq);
+ }
+ boolean study(TreeInfo info) {
+ next.study(info);
+ return info.deterministic;
+ }
+ }
+
+ /**
+ * Abstract node class to match one character satisfying some
+ * boolean property.
+ */
+ private static abstract class CharProperty extends Node {
+ abstract boolean isSatisfiedBy(int ch);
+ CharProperty complement() {
+ return new CharProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return ! CharProperty.this.isSatisfiedBy(ch);}};
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (i < matcher.to) {
+ int ch = Character.codePointAt(seq, i);
+ return isSatisfiedBy(ch)
+ && next.match(matcher, i+Character.charCount(ch), seq);
+ } else {
+ matcher.hitEnd = true;
+ return false;
+ }
+ }
+ boolean study(TreeInfo info) {
+ info.minLength++;
+ info.maxLength++;
+ return next.study(info);
+ }
+ }
+
+ /**
+ * Optimized version of CharProperty that works only for
+ * properties never satisfied by Supplementary characters.
+ */
+ private static abstract class BmpCharProperty extends CharProperty {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (i < matcher.to) {
+ return isSatisfiedBy(seq.charAt(i))
+ && next.match(matcher, i+1, seq);
+ } else {
+ matcher.hitEnd = true;
+ return false;
+ }
+ }
+ }
+
+ /**
+ * Node class that matches a Supplementary Unicode character
+ */
+ static final class SingleS extends CharProperty {
+ final int c;
+ SingleS(int c) { this.c = c; }
+ boolean isSatisfiedBy(int ch) {
+ return ch == c;
+ }
+ }
+
+ /**
+ * Optimization -- matches a given BMP character
+ */
+ static final class Single extends BmpCharProperty {
+ final int c;
+ Single(int c) { this.c = c; }
+ boolean isSatisfiedBy(int ch) {
+ return ch == c;
+ }
+ }
+
+ /**
+ * Case insensitive matches a given BMP character
+ */
+ static final class SingleI extends BmpCharProperty {
+ final int lower;
+ final int upper;
+ SingleI(int lower, int upper) {
+ this.lower = lower;
+ this.upper = upper;
+ }
+ boolean isSatisfiedBy(int ch) {
+ return ch == lower || ch == upper;
+ }
+ }
+
+ /**
+ * Unicode case insensitive matches a given Unicode character
+ */
+ static final class SingleU extends CharProperty {
+ final int lower;
+ SingleU(int lower) {
+ this.lower = lower;
+ }
+ boolean isSatisfiedBy(int ch) {
+ return lower == ch ||
+ lower == Character.toLowerCase(Character.toUpperCase(ch));
+ }
+ }
+
+
+ /**
+ * Node class that matches a Unicode block.
+ */
+ static final class Block extends CharProperty {
+ final Character.UnicodeBlock block;
+ Block(Character.UnicodeBlock block) {
+ this.block = block;
+ }
+ boolean isSatisfiedBy(int ch) {
+ return block == Character.UnicodeBlock.of(ch);
+ }
+ }
+
+ /**
+ * Node class that matches a Unicode script
+ */
+ static final class Script extends CharProperty {
+ final Character.UnicodeScript script;
+ Script(Character.UnicodeScript script) {
+ this.script = script;
+ }
+ boolean isSatisfiedBy(int ch) {
+ return script == Character.UnicodeScript.of(ch);
+ }
+ }
+
+ /**
+ * Node class that matches a Unicode category.
+ */
+ static final class Category extends CharProperty {
+ final int typeMask;
+ Category(int typeMask) { this.typeMask = typeMask; }
+ boolean isSatisfiedBy(int ch) {
+ return (typeMask & (1 << Character.getType(ch))) != 0;
+ }
+ }
+
+ /**
+ * Node class that matches a Unicode "type"
+ */
+ static final class Utype extends CharProperty {
+ final UnicodeProp uprop;
+ Utype(UnicodeProp uprop) { this.uprop = uprop; }
+ boolean isSatisfiedBy(int ch) {
+ return uprop.is(ch);
+ }
+ }
+
+
+ /**
+ * Node class that matches a POSIX type.
+ */
+ static final class Ctype extends BmpCharProperty {
+ final int ctype;
+ Ctype(int ctype) { this.ctype = ctype; }
+ boolean isSatisfiedBy(int ch) {
+ return ch < 128 && ASCII.isType(ch, ctype);
+ }
+ }
+
+ /**
+ * Base class for all Slice nodes
+ */
+ static class SliceNode extends Node {
+ int[] buffer;
+ SliceNode(int[] buf) {
+ buffer = buf;
+ }
+ boolean study(TreeInfo info) {
+ info.minLength += buffer.length;
+ info.maxLength += buffer.length;
+ return next.study(info);
+ }
+ }
+
+ /**
+ * Node class for a case sensitive/BMP-only sequence of literal
+ * characters.
+ */
+ static final class Slice extends SliceNode {
+ Slice(int[] buf) {
+ super(buf);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] buf = buffer;
+ int len = buf.length;
+ for (int j=0; j= matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ if (buf[j] != seq.charAt(i+j))
+ return false;
+ }
+ return next.match(matcher, i+len, seq);
+ }
+ }
+
+ /**
+ * Node class for a case_insensitive/BMP-only sequence of literal
+ * characters.
+ */
+ static class SliceI extends SliceNode {
+ SliceI(int[] buf) {
+ super(buf);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] buf = buffer;
+ int len = buf.length;
+ for (int j=0; j= matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ int c = seq.charAt(i+j);
+ if (buf[j] != c &&
+ buf[j] != ASCII.toLower(c))
+ return false;
+ }
+ return next.match(matcher, i+len, seq);
+ }
+ }
+
+ /**
+ * Node class for a unicode_case_insensitive/BMP-only sequence of
+ * literal characters. Uses unicode case folding.
+ */
+ static final class SliceU extends SliceNode {
+ SliceU(int[] buf) {
+ super(buf);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] buf = buffer;
+ int len = buf.length;
+ for (int j=0; j= matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ int c = seq.charAt(i+j);
+ if (buf[j] != c &&
+ buf[j] != Character.toLowerCase(Character.toUpperCase(c)))
+ return false;
+ }
+ return next.match(matcher, i+len, seq);
+ }
+ }
+
+ /**
+ * Node class for a case sensitive sequence of literal characters
+ * including supplementary characters.
+ */
+ static final class SliceS extends SliceNode {
+ SliceS(int[] buf) {
+ super(buf);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] buf = buffer;
+ int x = i;
+ for (int j = 0; j < buf.length; j++) {
+ if (x >= matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ int c = Character.codePointAt(seq, x);
+ if (buf[j] != c)
+ return false;
+ x += Character.charCount(c);
+ if (x > matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ }
+ return next.match(matcher, x, seq);
+ }
+ }
+
+ /**
+ * Node class for a case insensitive sequence of literal characters
+ * including supplementary characters.
+ */
+ static class SliceIS extends SliceNode {
+ SliceIS(int[] buf) {
+ super(buf);
+ }
+ int toLower(int c) {
+ return ASCII.toLower(c);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] buf = buffer;
+ int x = i;
+ for (int j = 0; j < buf.length; j++) {
+ if (x >= matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ int c = Character.codePointAt(seq, x);
+ if (buf[j] != c && buf[j] != toLower(c))
+ return false;
+ x += Character.charCount(c);
+ if (x > matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ }
+ return next.match(matcher, x, seq);
+ }
+ }
+
+ /**
+ * Node class for a case insensitive sequence of literal characters.
+ * Uses unicode case folding.
+ */
+ static final class SliceUS extends SliceIS {
+ SliceUS(int[] buf) {
+ super(buf);
+ }
+ int toLower(int c) {
+ return Character.toLowerCase(Character.toUpperCase(c));
+ }
+ }
+
+ private static boolean inRange(int lower, int ch, int upper) {
+ return lower <= ch && ch <= upper;
+ }
+
+ /**
+ * Returns node for matching characters within an explicit value range.
+ */
+ private static CharProperty rangeFor(final int lower,
+ final int upper) {
+ return new CharProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return inRange(lower, ch, upper);}};
+ }
+
+ /**
+ * Returns node for matching characters within an explicit value
+ * range in a case insensitive manner.
+ */
+ private CharProperty caseInsensitiveRangeFor(final int lower,
+ final int upper) {
+ if (has(UNICODE_CASE))
+ return new CharProperty() {
+ boolean isSatisfiedBy(int ch) {
+ if (inRange(lower, ch, upper))
+ return true;
+ int up = Character.toUpperCase(ch);
+ return inRange(lower, up, upper) ||
+ inRange(lower, Character.toLowerCase(up), upper);}};
+ return new CharProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return inRange(lower, ch, upper) ||
+ ASCII.isAscii(ch) &&
+ (inRange(lower, ASCII.toUpper(ch), upper) ||
+ inRange(lower, ASCII.toLower(ch), upper));
+ }};
+ }
+
+ /**
+ * Implements the Unicode category ALL and the dot metacharacter when
+ * in dotall mode.
+ */
+ static final class All extends CharProperty {
+ boolean isSatisfiedBy(int ch) {
+ return true;
+ }
+ }
+
+ /**
+ * Node class for the dot metacharacter when dotall is not enabled.
+ */
+ static final class Dot extends CharProperty {
+ boolean isSatisfiedBy(int ch) {
+ return (ch != '\n' && ch != '\r'
+ && (ch|1) != '\u2029'
+ && ch != '\u0085');
+ }
+ }
+
+ /**
+ * Node class for the dot metacharacter when dotall is not enabled
+ * but UNIX_LINES is enabled.
+ */
+ static final class UnixDot extends CharProperty {
+ boolean isSatisfiedBy(int ch) {
+ return ch != '\n';
+ }
+ }
+
+ /**
+ * The 0 or 1 quantifier. This one class implements all three types.
+ */
+ static final class Ques extends Node {
+ Node atom;
+ int type;
+ Ques(Node node, int type) {
+ this.atom = node;
+ this.type = type;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ switch (type) {
+ case GREEDY:
+ return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))
+ || next.match(matcher, i, seq);
+ case LAZY:
+ return next.match(matcher, i, seq)
+ || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));
+ case POSSESSIVE:
+ if (atom.match(matcher, i, seq)) i = matcher.last;
+ return next.match(matcher, i, seq);
+ default:
+ return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
+ }
+ }
+ boolean study(TreeInfo info) {
+ if (type != INDEPENDENT) {
+ int minL = info.minLength;
+ atom.study(info);
+ info.minLength = minL;
+ info.deterministic = false;
+ return next.study(info);
+ } else {
+ atom.study(info);
+ return next.study(info);
+ }
+ }
+ }
+
+ /**
+ * Handles the curly-brace style repetition with a specified minimum and
+ * maximum occurrences. The * quantifier is handled as a special case.
+ * This class handles the three types.
+ */
+ static final class Curly extends Node {
+ Node atom;
+ int type;
+ int cmin;
+ int cmax;
+
+ Curly(Node node, int cmin, int cmax, int type) {
+ this.atom = node;
+ this.type = type;
+ this.cmin = cmin;
+ this.cmax = cmax;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int j;
+ for (j = 0; j < cmin; j++) {
+ if (atom.match(matcher, i, seq)) {
+ i = matcher.last;
+ continue;
+ }
+ return false;
+ }
+ if (type == GREEDY)
+ return match0(matcher, i, j, seq);
+ else if (type == LAZY)
+ return match1(matcher, i, j, seq);
+ else
+ return match2(matcher, i, j, seq);
+ }
+ // Greedy match.
+ // i is the index to start matching at
+ // j is the number of atoms that have matched
+ boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
+ if (j >= cmax) {
+ // We have matched the maximum... continue with the rest of
+ // the regular expression
+ return next.match(matcher, i, seq);
+ }
+ int backLimit = j;
+ while (atom.match(matcher, i, seq)) {
+ // k is the length of this match
+ int k = matcher.last - i;
+ if (k == 0) // Zero length match
+ break;
+ // Move up index and number matched
+ i = matcher.last;
+ j++;
+ // We are greedy so match as many as we can
+ while (j < cmax) {
+ if (!atom.match(matcher, i, seq))
+ break;
+ if (i + k != matcher.last) {
+ if (match0(matcher, matcher.last, j+1, seq))
+ return true;
+ break;
+ }
+ i += k;
+ j++;
+ }
+ // Handle backing off if match fails
+ while (j >= backLimit) {
+ if (next.match(matcher, i, seq))
+ return true;
+ i -= k;
+ j--;
+ }
+ return false;
+ }
+ return next.match(matcher, i, seq);
+ }
+ // Reluctant match. At this point, the minimum has been satisfied.
+ // i is the index to start matching at
+ // j is the number of atoms that have matched
+ boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
+ for (;;) {
+ // Try finishing match without consuming any more
+ if (next.match(matcher, i, seq))
+ return true;
+ // At the maximum, no match found
+ if (j >= cmax)
+ return false;
+ // Okay, must try one more atom
+ if (!atom.match(matcher, i, seq))
+ return false;
+ // If we haven't moved forward then must break out
+ if (i == matcher.last)
+ return false;
+ // Move up index and number matched
+ i = matcher.last;
+ j++;
+ }
+ }
+ boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
+ for (; j < cmax; j++) {
+ if (!atom.match(matcher, i, seq))
+ break;
+ if (i == matcher.last)
+ break;
+ i = matcher.last;
+ }
+ return next.match(matcher, i, seq);
+ }
+ boolean study(TreeInfo info) {
+ // Save original info
+ int minL = info.minLength;
+ int maxL = info.maxLength;
+ boolean maxV = info.maxValid;
+ boolean detm = info.deterministic;
+ info.reset();
+
+ atom.study(info);
+
+ int temp = info.minLength * cmin + minL;
+ if (temp < minL) {
+ temp = 0xFFFFFFF; // arbitrary large number
+ }
+ info.minLength = temp;
+
+ if (maxV & info.maxValid) {
+ temp = info.maxLength * cmax + maxL;
+ info.maxLength = temp;
+ if (temp < maxL) {
+ info.maxValid = false;
+ }
+ } else {
+ info.maxValid = false;
+ }
+
+ if (info.deterministic && cmin == cmax)
+ info.deterministic = detm;
+ else
+ info.deterministic = false;
+
+ return next.study(info);
+ }
+ }
+
+ /**
+ * Handles the curly-brace style repetition with a specified minimum and
+ * maximum occurrences in deterministic cases. This is an iterative
+ * optimization over the Prolog and Loop system which would handle this
+ * in a recursive way. The * quantifier is handled as a special case.
+ * If capture is true then this class saves group settings and ensures
+ * that groups are unset when backing off of a group match.
+ */
+ static final class GroupCurly extends Node {
+ Node atom;
+ int type;
+ int cmin;
+ int cmax;
+ int localIndex;
+ int groupIndex;
+ boolean capture;
+
+ GroupCurly(Node node, int cmin, int cmax, int type, int local,
+ int group, boolean capture) {
+ this.atom = node;
+ this.type = type;
+ this.cmin = cmin;
+ this.cmax = cmax;
+ this.localIndex = local;
+ this.groupIndex = group;
+ this.capture = capture;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] groups = matcher.groups;
+ int[] locals = matcher.locals;
+ int save0 = locals[localIndex];
+ int save1 = 0;
+ int save2 = 0;
+
+ if (capture) {
+ save1 = groups[groupIndex];
+ save2 = groups[groupIndex+1];
+ }
+
+ // Notify GroupTail there is no need to setup group info
+ // because it will be set here
+ locals[localIndex] = -1;
+
+ boolean ret = true;
+ for (int j = 0; j < cmin; j++) {
+ if (atom.match(matcher, i, seq)) {
+ if (capture) {
+ groups[groupIndex] = i;
+ groups[groupIndex+1] = matcher.last;
+ }
+ i = matcher.last;
+ } else {
+ ret = false;
+ break;
+ }
+ }
+ if (ret) {
+ if (type == GREEDY) {
+ ret = match0(matcher, i, cmin, seq);
+ } else if (type == LAZY) {
+ ret = match1(matcher, i, cmin, seq);
+ } else {
+ ret = match2(matcher, i, cmin, seq);
+ }
+ }
+ if (!ret) {
+ locals[localIndex] = save0;
+ if (capture) {
+ groups[groupIndex] = save1;
+ groups[groupIndex+1] = save2;
+ }
+ }
+ return ret;
+ }
+ // Aggressive group match
+ boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
+ int[] groups = matcher.groups;
+ int save0 = 0;
+ int save1 = 0;
+ if (capture) {
+ save0 = groups[groupIndex];
+ save1 = groups[groupIndex+1];
+ }
+ for (;;) {
+ if (j >= cmax)
+ break;
+ if (!atom.match(matcher, i, seq))
+ break;
+ int k = matcher.last - i;
+ if (k <= 0) {
+ if (capture) {
+ groups[groupIndex] = i;
+ groups[groupIndex+1] = i + k;
+ }
+ i = i + k;
+ break;
+ }
+ for (;;) {
+ if (capture) {
+ groups[groupIndex] = i;
+ groups[groupIndex+1] = i + k;
+ }
+ i = i + k;
+ if (++j >= cmax)
+ break;
+ if (!atom.match(matcher, i, seq))
+ break;
+ if (i + k != matcher.last) {
+ if (match0(matcher, i, j, seq))
+ return true;
+ break;
+ }
+ }
+ while (j > cmin) {
+ if (next.match(matcher, i, seq)) {
+ if (capture) {
+ groups[groupIndex+1] = i;
+ groups[groupIndex] = i - k;
+ }
+ i = i - k;
+ return true;
+ }
+ // backing off
+ if (capture) {
+ groups[groupIndex+1] = i;
+ groups[groupIndex] = i - k;
+ }
+ i = i - k;
+ j--;
+ }
+ break;
+ }
+ if (capture) {
+ groups[groupIndex] = save0;
+ groups[groupIndex+1] = save1;
+ }
+ return next.match(matcher, i, seq);
+ }
+ // Reluctant matching
+ boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
+ for (;;) {
+ if (next.match(matcher, i, seq))
+ return true;
+ if (j >= cmax)
+ return false;
+ if (!atom.match(matcher, i, seq))
+ return false;
+ if (i == matcher.last)
+ return false;
+ if (capture) {
+ matcher.groups[groupIndex] = i;
+ matcher.groups[groupIndex+1] = matcher.last;
+ }
+ i = matcher.last;
+ j++;
+ }
+ }
+ // Possessive matching
+ boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
+ for (; j < cmax; j++) {
+ if (!atom.match(matcher, i, seq)) {
+ break;
+ }
+ if (capture) {
+ matcher.groups[groupIndex] = i;
+ matcher.groups[groupIndex+1] = matcher.last;
+ }
+ if (i == matcher.last) {
+ break;
+ }
+ i = matcher.last;
+ }
+ return next.match(matcher, i, seq);
+ }
+ boolean study(TreeInfo info) {
+ // Save original info
+ int minL = info.minLength;
+ int maxL = info.maxLength;
+ boolean maxV = info.maxValid;
+ boolean detm = info.deterministic;
+ info.reset();
+
+ atom.study(info);
+
+ int temp = info.minLength * cmin + minL;
+ if (temp < minL) {
+ temp = 0xFFFFFFF; // Arbitrary large number
+ }
+ info.minLength = temp;
+
+ if (maxV & info.maxValid) {
+ temp = info.maxLength * cmax + maxL;
+ info.maxLength = temp;
+ if (temp < maxL) {
+ info.maxValid = false;
+ }
+ } else {
+ info.maxValid = false;
+ }
+
+ if (info.deterministic && cmin == cmax) {
+ info.deterministic = detm;
+ } else {
+ info.deterministic = false;
+ }
+
+ return next.study(info);
+ }
+ }
+
+ /**
+ * A Guard node at the end of each atom node in a Branch. It
+ * serves the purpose of chaining the "match" operation to
+ * "next" but not the "study", so we can collect the TreeInfo
+ * of each atom node without including the TreeInfo of the
+ * "next".
+ */
+ static final class BranchConn extends Node {
+ BranchConn() {};
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ return next.match(matcher, i, seq);
+ }
+ boolean study(TreeInfo info) {
+ return info.deterministic;
+ }
+ }
+
+ /**
+ * Handles the branching of alternations. Note this is also used for
+ * the ? quantifier to branch between the case where it matches once
+ * and where it does not occur.
+ */
+ static final class Branch extends Node {
+ Node[] atoms = new Node[2];
+ int size = 2;
+ Node conn;
+ Branch(Node first, Node second, Node branchConn) {
+ conn = branchConn;
+ atoms[0] = first;
+ atoms[1] = second;
+ }
+
+ void add(Node node) {
+ if (size >= atoms.length) {
+ Node[] tmp = new Node[atoms.length*2];
+ System.arraycopy(atoms, 0, tmp, 0, atoms.length);
+ atoms = tmp;
+ }
+ atoms[size++] = node;
+ }
+
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ for (int n = 0; n < size; n++) {
+ if (atoms[n] == null) {
+ if (conn.next.match(matcher, i, seq))
+ return true;
+ } else if (atoms[n].match(matcher, i, seq)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ boolean study(TreeInfo info) {
+ int minL = info.minLength;
+ int maxL = info.maxLength;
+ boolean maxV = info.maxValid;
+
+ int minL2 = Integer.MAX_VALUE; //arbitrary large enough num
+ int maxL2 = -1;
+ for (int n = 0; n < size; n++) {
+ info.reset();
+ if (atoms[n] != null)
+ atoms[n].study(info);
+ minL2 = Math.min(minL2, info.minLength);
+ maxL2 = Math.max(maxL2, info.maxLength);
+ maxV = (maxV & info.maxValid);
+ }
+
+ minL += minL2;
+ maxL += maxL2;
+
+ info.reset();
+ conn.next.study(info);
+
+ info.minLength += minL;
+ info.maxLength += maxL;
+ info.maxValid &= maxV;
+ info.deterministic = false;
+ return false;
+ }
+ }
+
+ /**
+ * The GroupHead saves the location where the group begins in the locals
+ * and restores them when the match is done.
+ *
+ * The matchRef is used when a reference to this group is accessed later
+ * in the expression. The locals will have a negative value in them to
+ * indicate that we do not want to unset the group if the reference
+ * doesn't match.
+ */
+ static final class GroupHead extends Node {
+ int localIndex;
+ GroupHead(int localCount) {
+ localIndex = localCount;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int save = matcher.locals[localIndex];
+ matcher.locals[localIndex] = i;
+ boolean ret = next.match(matcher, i, seq);
+ matcher.locals[localIndex] = save;
+ return ret;
+ }
+ boolean matchRef(Matcher matcher, int i, CharSequence seq) {
+ int save = matcher.locals[localIndex];
+ matcher.locals[localIndex] = ~i; // HACK
+ boolean ret = next.match(matcher, i, seq);
+ matcher.locals[localIndex] = save;
+ return ret;
+ }
+ }
+
+ /**
+ * Recursive reference to a group in the regular expression. It calls
+ * matchRef because if the reference fails to match we would not unset
+ * the group.
+ */
+ static final class GroupRef extends Node {
+ GroupHead head;
+ GroupRef(GroupHead head) {
+ this.head = head;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ return head.matchRef(matcher, i, seq)
+ && next.match(matcher, matcher.last, seq);
+ }
+ boolean study(TreeInfo info) {
+ info.maxValid = false;
+ info.deterministic = false;
+ return next.study(info);
+ }
+ }
+
+ /**
+ * The GroupTail handles the setting of group beginning and ending
+ * locations when groups are successfully matched. It must also be able to
+ * unset groups that have to be backed off of.
+ *
+ * The GroupTail node is also used when a previous group is referenced,
+ * and in that case no group information needs to be set.
+ */
+ static final class GroupTail extends Node {
+ int localIndex;
+ int groupIndex;
+ GroupTail(int localCount, int groupCount) {
+ localIndex = localCount;
+ groupIndex = groupCount + groupCount;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int tmp = matcher.locals[localIndex];
+ if (tmp >= 0) { // This is the normal group case.
+ // Save the group so we can unset it if it
+ // backs off of a match.
+ int groupStart = matcher.groups[groupIndex];
+ int groupEnd = matcher.groups[groupIndex+1];
+
+ matcher.groups[groupIndex] = tmp;
+ matcher.groups[groupIndex+1] = i;
+ if (next.match(matcher, i, seq)) {
+ return true;
+ }
+ matcher.groups[groupIndex] = groupStart;
+ matcher.groups[groupIndex+1] = groupEnd;
+ return false;
+ } else {
+ // This is a group reference case. We don't need to save any
+ // group info because it isn't really a group.
+ matcher.last = i;
+ return true;
+ }
+ }
+ }
+
+ /**
+ * This sets up a loop to handle a recursive quantifier structure.
+ */
+ static final class Prolog extends Node {
+ Loop loop;
+ Prolog(Loop loop) {
+ this.loop = loop;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ return loop.matchInit(matcher, i, seq);
+ }
+ boolean study(TreeInfo info) {
+ return loop.study(info);
+ }
+ }
+
+ /**
+ * Handles the repetition count for a greedy Curly. The matchInit
+ * is called from the Prolog to save the index of where the group
+ * beginning is stored. A zero length group check occurs in the
+ * normal match but is skipped in the matchInit.
+ */
+ static class Loop extends Node {
+ Node body;
+ int countIndex; // local count index in matcher locals
+ int beginIndex; // group beginning index
+ int cmin, cmax;
+ Loop(int countIndex, int beginIndex) {
+ this.countIndex = countIndex;
+ this.beginIndex = beginIndex;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ // Avoid infinite loop in zero-length case.
+ if (i > matcher.locals[beginIndex]) {
+ int count = matcher.locals[countIndex];
+
+ // This block is for before we reach the minimum
+ // iterations required for the loop to match
+ if (count < cmin) {
+ matcher.locals[countIndex] = count + 1;
+ boolean b = body.match(matcher, i, seq);
+ // If match failed we must backtrack, so
+ // the loop count should NOT be incremented
+ if (!b)
+ matcher.locals[countIndex] = count;
+ // Return success or failure since we are under
+ // minimum
+ return b;
+ }
+ // This block is for after we have the minimum
+ // iterations required for the loop to match
+ if (count < cmax) {
+ matcher.locals[countIndex] = count + 1;
+ boolean b = body.match(matcher, i, seq);
+ // If match failed we must backtrack, so
+ // the loop count should NOT be incremented
+ if (!b)
+ matcher.locals[countIndex] = count;
+ else
+ return true;
+ }
+ }
+ return next.match(matcher, i, seq);
+ }
+ boolean matchInit(Matcher matcher, int i, CharSequence seq) {
+ int save = matcher.locals[countIndex];
+ boolean ret = false;
+ if (0 < cmin) {
+ matcher.locals[countIndex] = 1;
+ ret = body.match(matcher, i, seq);
+ } else if (0 < cmax) {
+ matcher.locals[countIndex] = 1;
+ ret = body.match(matcher, i, seq);
+ if (ret == false)
+ ret = next.match(matcher, i, seq);
+ } else {
+ ret = next.match(matcher, i, seq);
+ }
+ matcher.locals[countIndex] = save;
+ return ret;
+ }
+ boolean study(TreeInfo info) {
+ info.maxValid = false;
+ info.deterministic = false;
+ return false;
+ }
+ }
+
+ /**
+ * Handles the repetition count for a reluctant Curly. The matchInit
+ * is called from the Prolog to save the index of where the group
+ * beginning is stored. A zero length group check occurs in the
+ * normal match but is skipped in the matchInit.
+ */
+ static final class LazyLoop extends Loop {
+ LazyLoop(int countIndex, int beginIndex) {
+ super(countIndex, beginIndex);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ // Check for zero length group
+ if (i > matcher.locals[beginIndex]) {
+ int count = matcher.locals[countIndex];
+ if (count < cmin) {
+ matcher.locals[countIndex] = count + 1;
+ boolean result = body.match(matcher, i, seq);
+ // If match failed we must backtrack, so
+ // the loop count should NOT be incremented
+ if (!result)
+ matcher.locals[countIndex] = count;
+ return result;
+ }
+ if (next.match(matcher, i, seq))
+ return true;
+ if (count < cmax) {
+ matcher.locals[countIndex] = count + 1;
+ boolean result = body.match(matcher, i, seq);
+ // If match failed we must backtrack, so
+ // the loop count should NOT be incremented
+ if (!result)
+ matcher.locals[countIndex] = count;
+ return result;
+ }
+ return false;
+ }
+ return next.match(matcher, i, seq);
+ }
+ boolean matchInit(Matcher matcher, int i, CharSequence seq) {
+ int save = matcher.locals[countIndex];
+ boolean ret = false;
+ if (0 < cmin) {
+ matcher.locals[countIndex] = 1;
+ ret = body.match(matcher, i, seq);
+ } else if (next.match(matcher, i, seq)) {
+ ret = true;
+ } else if (0 < cmax) {
+ matcher.locals[countIndex] = 1;
+ ret = body.match(matcher, i, seq);
+ }
+ matcher.locals[countIndex] = save;
+ return ret;
+ }
+ boolean study(TreeInfo info) {
+ info.maxValid = false;
+ info.deterministic = false;
+ return false;
+ }
+ }
+
+ /**
+ * Refers to a group in the regular expression. Attempts to match
+ * whatever the group referred to last matched.
+ */
+ static class BackRef extends Node {
+ int groupIndex;
+ BackRef(int groupCount) {
+ super();
+ groupIndex = groupCount + groupCount;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int j = matcher.groups[groupIndex];
+ int k = matcher.groups[groupIndex+1];
+
+ int groupSize = k - j;
+
+ // If the referenced group didn't match, neither can this
+ if (j < 0)
+ return false;
+
+ // If there isn't enough input left no match
+ if (i + groupSize > matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+
+ // Check each new char to make sure it matches what the group
+ // referenced matched last time around
+ for (int index=0; index matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+
+ // Check each new char to make sure it matches what the group
+ // referenced matched last time around
+ int x = i;
+ for (int index=0; index matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ if (atom.match(matcher, i, seq)) {
+ return next.match(matcher, matcher.last, seq);
+ }
+ i += countChars(seq, i, 1);
+ matcher.first++;
+ }
+ }
+ boolean study(TreeInfo info) {
+ atom.study(info);
+ info.maxValid = false;
+ info.deterministic = false;
+ return next.study(info);
+ }
+ }
+
+ static final class Conditional extends Node {
+ Node cond, yes, not;
+ Conditional(Node cond, Node yes, Node not) {
+ this.cond = cond;
+ this.yes = yes;
+ this.not = not;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (cond.match(matcher, i, seq)) {
+ return yes.match(matcher, i, seq);
+ } else {
+ return not.match(matcher, i, seq);
+ }
+ }
+ boolean study(TreeInfo info) {
+ int minL = info.minLength;
+ int maxL = info.maxLength;
+ boolean maxV = info.maxValid;
+ info.reset();
+ yes.study(info);
+
+ int minL2 = info.minLength;
+ int maxL2 = info.maxLength;
+ boolean maxV2 = info.maxValid;
+ info.reset();
+ not.study(info);
+
+ info.minLength = minL + Math.min(minL2, info.minLength);
+ info.maxLength = maxL + Math.max(maxL2, info.maxLength);
+ info.maxValid = (maxV & maxV2 & info.maxValid);
+ info.deterministic = false;
+ return next.study(info);
+ }
+ }
+
+ /**
+ * Zero width positive lookahead.
+ */
+ static final class Pos extends Node {
+ Node cond;
+ Pos(Node cond) {
+ this.cond = cond;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int savedTo = matcher.to;
+ boolean conditionMatched = false;
+
+ // Relax transparent region boundaries for lookahead
+ if (matcher.transparentBounds)
+ matcher.to = matcher.getTextLength();
+ try {
+ conditionMatched = cond.match(matcher, i, seq);
+ } finally {
+ // Reinstate region boundaries
+ matcher.to = savedTo;
+ }
+ return conditionMatched && next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Zero width negative lookahead.
+ */
+ static final class Neg extends Node {
+ Node cond;
+ Neg(Node cond) {
+ this.cond = cond;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int savedTo = matcher.to;
+ boolean conditionMatched = false;
+
+ // Relax transparent region boundaries for lookahead
+ if (matcher.transparentBounds)
+ matcher.to = matcher.getTextLength();
+ try {
+ if (i < matcher.to) {
+ conditionMatched = !cond.match(matcher, i, seq);
+ } else {
+ // If a negative lookahead succeeds then more input
+ // could cause it to fail!
+ matcher.requireEnd = true;
+ conditionMatched = !cond.match(matcher, i, seq);
+ }
+ } finally {
+ // Reinstate region boundaries
+ matcher.to = savedTo;
+ }
+ return conditionMatched && next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * For use with lookbehinds; matches the position where the lookbehind
+ * was encountered.
+ */
+ static Node lookbehindEnd = new Node() {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ return i == matcher.lookbehindTo;
+ }
+ };
+
+ /**
+ * Zero width positive lookbehind.
+ */
+ static class Behind extends Node {
+ Node cond;
+ int rmax, rmin;
+ Behind(Node cond, int rmax, int rmin) {
+ this.cond = cond;
+ this.rmax = rmax;
+ this.rmin = rmin;
+ }
+
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int savedFrom = matcher.from;
+ boolean conditionMatched = false;
+ int startIndex = (!matcher.transparentBounds) ?
+ matcher.from : 0;
+ int from = Math.max(i - rmax, startIndex);
+ // Set end boundary
+ int savedLBT = matcher.lookbehindTo;
+ matcher.lookbehindTo = i;
+ // Relax transparent region boundaries for lookbehind
+ if (matcher.transparentBounds)
+ matcher.from = 0;
+ for (int j = i - rmin; !conditionMatched && j >= from; j--) {
+ conditionMatched = cond.match(matcher, j, seq);
+ }
+ matcher.from = savedFrom;
+ matcher.lookbehindTo = savedLBT;
+ return conditionMatched && next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Zero width positive lookbehind, including supplementary
+ * characters or unpaired surrogates.
+ */
+ static final class BehindS extends Behind {
+ BehindS(Node cond, int rmax, int rmin) {
+ super(cond, rmax, rmin);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int rmaxChars = countChars(seq, i, -rmax);
+ int rminChars = countChars(seq, i, -rmin);
+ int savedFrom = matcher.from;
+ int startIndex = (!matcher.transparentBounds) ?
+ matcher.from : 0;
+ boolean conditionMatched = false;
+ int from = Math.max(i - rmaxChars, startIndex);
+ // Set end boundary
+ int savedLBT = matcher.lookbehindTo;
+ matcher.lookbehindTo = i;
+ // Relax transparent region boundaries for lookbehind
+ if (matcher.transparentBounds)
+ matcher.from = 0;
+
+ for (int j = i - rminChars;
+ !conditionMatched && j >= from;
+ j -= j>from ? countChars(seq, j, -1) : 1) {
+ conditionMatched = cond.match(matcher, j, seq);
+ }
+ matcher.from = savedFrom;
+ matcher.lookbehindTo = savedLBT;
+ return conditionMatched && next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Zero width negative lookbehind.
+ */
+ static class NotBehind extends Node {
+ Node cond;
+ int rmax, rmin;
+ NotBehind(Node cond, int rmax, int rmin) {
+ this.cond = cond;
+ this.rmax = rmax;
+ this.rmin = rmin;
+ }
+
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int savedLBT = matcher.lookbehindTo;
+ int savedFrom = matcher.from;
+ boolean conditionMatched = false;
+ int startIndex = (!matcher.transparentBounds) ?
+ matcher.from : 0;
+ int from = Math.max(i - rmax, startIndex);
+ matcher.lookbehindTo = i;
+ // Relax transparent region boundaries for lookbehind
+ if (matcher.transparentBounds)
+ matcher.from = 0;
+ for (int j = i - rmin; !conditionMatched && j >= from; j--) {
+ conditionMatched = cond.match(matcher, j, seq);
+ }
+ // Reinstate region boundaries
+ matcher.from = savedFrom;
+ matcher.lookbehindTo = savedLBT;
+ return !conditionMatched && next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Zero width negative lookbehind, including supplementary
+ * characters or unpaired surrogates.
+ */
+ static final class NotBehindS extends NotBehind {
+ NotBehindS(Node cond, int rmax, int rmin) {
+ super(cond, rmax, rmin);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int rmaxChars = countChars(seq, i, -rmax);
+ int rminChars = countChars(seq, i, -rmin);
+ int savedFrom = matcher.from;
+ int savedLBT = matcher.lookbehindTo;
+ boolean conditionMatched = false;
+ int startIndex = (!matcher.transparentBounds) ?
+ matcher.from : 0;
+ int from = Math.max(i - rmaxChars, startIndex);
+ matcher.lookbehindTo = i;
+ // Relax transparent region boundaries for lookbehind
+ if (matcher.transparentBounds)
+ matcher.from = 0;
+ for (int j = i - rminChars;
+ !conditionMatched && j >= from;
+ j -= j>from ? countChars(seq, j, -1) : 1) {
+ conditionMatched = cond.match(matcher, j, seq);
+ }
+ //Reinstate region boundaries
+ matcher.from = savedFrom;
+ matcher.lookbehindTo = savedLBT;
+ return !conditionMatched && next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Returns the set union of two CharProperty nodes.
+ */
+ private static CharProperty union(final CharProperty lhs,
+ final CharProperty rhs) {
+ return new CharProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
+ }
+
+ /**
+ * Returns the set intersection of two CharProperty nodes.
+ */
+ private static CharProperty intersection(final CharProperty lhs,
+ final CharProperty rhs) {
+ return new CharProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
+ }
+
+ /**
+ * Returns the set difference of two CharProperty nodes.
+ */
+ private static CharProperty setDifference(final CharProperty lhs,
+ final CharProperty rhs) {
+ return new CharProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
+ }
+
+ /**
+ * Handles word boundaries. Includes a field to allow this one class to
+ * deal with the different types of word boundaries we can match. The word
+ * characters include underscores, letters, and digits. Non spacing marks
+ * can are also part of a word if they have a base character, otherwise
+ * they are ignored for purposes of finding word boundaries.
+ */
+ static final class Bound extends Node {
+ static int LEFT = 0x1;
+ static int RIGHT= 0x2;
+ static int BOTH = 0x3;
+ static int NONE = 0x4;
+ int type;
+ boolean useUWORD;
+ Bound(int n, boolean useUWORD) {
+ type = n;
+ this.useUWORD = useUWORD;
+ }
+
+ boolean isWord(int ch) {
+ return useUWORD ? UnicodeProp.WORD.is(ch)
+ : (ch == '_' || Character.isLetterOrDigit(ch));
+ }
+
+ int check(Matcher matcher, int i, CharSequence seq) {
+ int ch;
+ boolean left = false;
+ int startIndex = matcher.from;
+ int endIndex = matcher.to;
+ if (matcher.transparentBounds) {
+ startIndex = 0;
+ endIndex = matcher.getTextLength();
+ }
+ if (i > startIndex) {
+ ch = Character.codePointBefore(seq, i);
+ left = (isWord(ch) ||
+ ((Character.getType(ch) == Character.NON_SPACING_MARK)
+ && hasBaseCharacter(matcher, i-1, seq)));
+ }
+ boolean right = false;
+ if (i < endIndex) {
+ ch = Character.codePointAt(seq, i);
+ right = (isWord(ch) ||
+ ((Character.getType(ch) == Character.NON_SPACING_MARK)
+ && hasBaseCharacter(matcher, i, seq)));
+ } else {
+ // Tried to access char past the end
+ matcher.hitEnd = true;
+ // The addition of another char could wreck a boundary
+ matcher.requireEnd = true;
+ }
+ return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ return (check(matcher, i, seq) & type) > 0
+ && next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Non spacing marks only count as word characters in bounds calculations
+ * if they have a base character.
+ */
+ private static boolean hasBaseCharacter(Matcher matcher, int i,
+ CharSequence seq)
+ {
+ int start = (!matcher.transparentBounds) ?
+ matcher.from : 0;
+ for (int x=i; x >= start; x--) {
+ int ch = Character.codePointAt(seq, x);
+ if (Character.isLetterOrDigit(ch))
+ return true;
+ if (Character.getType(ch) == Character.NON_SPACING_MARK)
+ continue;
+ return false;
+ }
+ return false;
+ }
+
+ /**
+ * Attempts to match a slice in the input using the Boyer-Moore string
+ * matching algorithm. The algorithm is based on the idea that the
+ * pattern can be shifted farther ahead in the search text if it is
+ * matched right to left.
+ *
+ * The pattern is compared to the input one character at a time, from
+ * the rightmost character in the pattern to the left. If the characters
+ * all match the pattern has been found. If a character does not match,
+ * the pattern is shifted right a distance that is the maximum of two
+ * functions, the bad character shift and the good suffix shift. This
+ * shift moves the attempted match position through the input more
+ * quickly than a naive one position at a time check.
+ *
+ * The bad character shift is based on the character from the text that
+ * did not match. If the character does not appear in the pattern, the
+ * pattern can be shifted completely beyond the bad character. If the
+ * character does occur in the pattern, the pattern can be shifted to
+ * line the pattern up with the next occurrence of that character.
+ *
+ * The good suffix shift is based on the idea that some subset on the right
+ * side of the pattern has matched. When a bad character is found, the
+ * pattern can be shifted right by the pattern length if the subset does
+ * not occur again in pattern, or by the amount of distance to the
+ * next occurrence of the subset in the pattern.
+ *
+ * Boyer-Moore search methods adapted from code by Amy Yu.
+ */
+ static class BnM extends Node {
+ int[] buffer;
+ int[] lastOcc;
+ int[] optoSft;
+
+ /**
+ * Pre calculates arrays needed to generate the bad character
+ * shift and the good suffix shift. Only the last seven bits
+ * are used to see if chars match; This keeps the tables small
+ * and covers the heavily used ASCII range, but occasionally
+ * results in an aliased match for the bad character shift.
+ */
+ static Node optimize(Node node) {
+ if (!(node instanceof Slice)) {
+ return node;
+ }
+
+ int[] src = ((Slice) node).buffer;
+ int patternLength = src.length;
+ // The BM algorithm requires a bit of overhead;
+ // If the pattern is short don't use it, since
+ // a shift larger than the pattern length cannot
+ // be used anyway.
+ if (patternLength < 4) {
+ return node;
+ }
+ int i, j, k;
+ int[] lastOcc = new int[128];
+ int[] optoSft = new int[patternLength];
+ // Precalculate part of the bad character shift
+ // It is a table for where in the pattern each
+ // lower 7-bit value occurs
+ for (i = 0; i < patternLength; i++) {
+ lastOcc[src[i]&0x7F] = i + 1;
+ }
+ // Precalculate the good suffix shift
+ // i is the shift amount being considered
+NEXT: for (i = patternLength; i > 0; i--) {
+ // j is the beginning index of suffix being considered
+ for (j = patternLength - 1; j >= i; j--) {
+ // Testing for good suffix
+ if (src[j] == src[j-i]) {
+ // src[j..len] is a good suffix
+ optoSft[j-1] = i;
+ } else {
+ // No match. The array has already been
+ // filled up with correct values before.
+ continue NEXT;
+ }
+ }
+ // This fills up the remaining of optoSft
+ // any suffix can not have larger shift amount
+ // then its sub-suffix. Why???
+ while (j > 0) {
+ optoSft[--j] = i;
+ }
+ }
+ // Set the guard value because of unicode compression
+ optoSft[patternLength-1] = 1;
+ if (node instanceof SliceS)
+ return new BnMS(src, lastOcc, optoSft, node.next);
+ return new BnM(src, lastOcc, optoSft, node.next);
+ }
+ BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) {
+ this.buffer = src;
+ this.lastOcc = lastOcc;
+ this.optoSft = optoSft;
+ this.next = next;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] src = buffer;
+ int patternLength = src.length;
+ int last = matcher.to - patternLength;
+
+ // Loop over all possible match positions in text
+NEXT: while (i <= last) {
+ // Loop over pattern from right to left
+ for (int j = patternLength - 1; j >= 0; j--) {
+ int ch = seq.charAt(i+j);
+ if (ch != src[j]) {
+ // Shift search to the right by the maximum of the
+ // bad character shift and the good suffix shift
+ i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]);
+ continue NEXT;
+ }
+ }
+ // Entire pattern matched starting at i
+ matcher.first = i;
+ boolean ret = next.match(matcher, i + patternLength, seq);
+ if (ret) {
+ matcher.first = i;
+ matcher.groups[0] = matcher.first;
+ matcher.groups[1] = matcher.last;
+ return true;
+ }
+ i++;
+ }
+ // BnM is only used as the leading node in the unanchored case,
+ // and it replaced its Start() which always searches to the end
+ // if it doesn't find what it's looking for, so hitEnd is true.
+ matcher.hitEnd = true;
+ return false;
+ }
+ boolean study(TreeInfo info) {
+ info.minLength += buffer.length;
+ info.maxValid = false;
+ return next.study(info);
+ }
+ }
+
+ /**
+ * Supplementary support version of BnM(). Unpaired surrogates are
+ * also handled by this class.
+ */
+ static final class BnMS extends BnM {
+ int lengthInChars;
+
+ BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) {
+ super(src, lastOcc, optoSft, next);
+ for (int x = 0; x < buffer.length; x++) {
+ lengthInChars += Character.charCount(buffer[x]);
+ }
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] src = buffer;
+ int patternLength = src.length;
+ int last = matcher.to - lengthInChars;
+
+ // Loop over all possible match positions in text
+NEXT: while (i <= last) {
+ // Loop over pattern from right to left
+ int ch;
+ for (int j = countChars(seq, i, patternLength), x = patternLength - 1;
+ j > 0; j -= Character.charCount(ch), x--) {
+ ch = Character.codePointBefore(seq, i+j);
+ if (ch != src[x]) {
+ // Shift search to the right by the maximum of the
+ // bad character shift and the good suffix shift
+ int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]);
+ i += countChars(seq, i, n);
+ continue NEXT;
+ }
+ }
+ // Entire pattern matched starting at i
+ matcher.first = i;
+ boolean ret = next.match(matcher, i + lengthInChars, seq);
+ if (ret) {
+ matcher.first = i;
+ matcher.groups[0] = matcher.first;
+ matcher.groups[1] = matcher.last;
+ return true;
+ }
+ i += countChars(seq, i, 1);
+ }
+ matcher.hitEnd = true;
+ return false;
+ }
+ }
+
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+ /**
+ * This must be the very first initializer.
+ */
+ static Node accept = new Node();
+
+ static Node lastAccept = new LastNode();
+
+ private static class CharPropertyNames {
+
+ static CharProperty charPropertyFor(String name) {
+ CharPropertyFactory m = map.get(name);
+ return m == null ? null : m.make();
+ }
+
+ private static abstract class CharPropertyFactory {
+ abstract CharProperty make();
+ }
+
+ private static void defCategory(String name,
+ final int typeMask) {
+ map.put(name, new CharPropertyFactory() {
+ CharProperty make() { return new Category(typeMask);}});
+ }
+
+ private static void defRange(String name,
+ final int lower, final int upper) {
+ map.put(name, new CharPropertyFactory() {
+ CharProperty make() { return rangeFor(lower, upper);}});
+ }
+
+ private static void defCtype(String name,
+ final int ctype) {
+ map.put(name, new CharPropertyFactory() {
+ CharProperty make() { return new Ctype(ctype);}});
+ }
+
+ private static abstract class CloneableProperty
+ extends CharProperty implements Cloneable
+ {
+ public CloneableProperty clone() {
+ try {
+ return (CloneableProperty) super.clone();
+ } catch (CloneNotSupportedException e) {
+ throw new AssertionError(e);
+ }
+ }
+ }
+
+ private static void defClone(String name,
+ final CloneableProperty p) {
+ map.put(name, new CharPropertyFactory() {
+ CharProperty make() { return p.clone();}});
+ }
+
+ private static final HashMap map
+ = new HashMap<>();
+
+ static {
+ // Unicode character property aliases, defined in
+ // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
+ defCategory("Cn", 1<-1 if the index is not known
+ */
+ public PatternSyntaxException(String desc, String regex, int index) {
+ this.desc = desc;
+ this.pattern = regex;
+ this.index = index;
+ }
+
+ /**
+ * Retrieves the error index.
+ *
+ * @return The approximate index in the pattern of the error,
+ * or -1 if the index is not known
+ */
+ public int getIndex() {
+ return index;
+ }
+
+ /**
+ * Retrieves the description of the error.
+ *
+ * @return The description of the error
+ */
+ public String getDescription() {
+ return desc;
+ }
+
+ /**
+ * Retrieves the erroneous regular-expression pattern.
+ *
+ * @return The erroneous pattern
+ */
+ public String getPattern() {
+ return pattern;
+ }
+
+ private static final String nl =
+ java.security.AccessController
+ .doPrivileged(new GetPropertyAction("line.separator"));
+
+ /**
+ * Returns a multi-line string containing the description of the syntax
+ * error and its index, the erroneous regular-expression pattern, and a
+ * visual indication of the error index within the pattern.
+ *
+ * @return The full detail message
+ */
+ public String getMessage() {
+ StringBuffer sb = new StringBuffer();
+ sb.append(desc);
+ if (index >= 0) {
+ sb.append(" near index ");
+ sb.append(index);
+ }
+ sb.append(nl);
+ sb.append(pattern);
+ if (index >= 0) {
+ sb.append(nl);
+ for (int i = 0; i < index; i++) sb.append(' ');
+ sb.append('^');
+ }
+ return sb.toString();
+ }
+
+}
diff -r 9cc0e1034f92 -r 7df5f83fff8d rt/emul/compact/src/main/java/java/util/regex/UnicodeProp.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rt/emul/compact/src/main/java/java/util/regex/UnicodeProp.java Mon Oct 07 16:14:56 2013 +0200
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package java.util.regex;
+
+import java.util.HashMap;
+import java.util.Locale;
+
+enum UnicodeProp {
+
+ ALPHABETIC {
+ public boolean is(int ch) {
+ return Character.isAlphabetic(ch);
+ }
+ },
+
+ LETTER {
+ public boolean is(int ch) {
+ return Character.isLetter(ch);
+ }
+ },
+
+ IDEOGRAPHIC {
+ public boolean is(int ch) {
+ return Character.isIdeographic(ch);
+ }
+ },
+
+ LOWERCASE {
+ public boolean is(int ch) {
+ return Character.isLowerCase(ch);
+ }
+ },
+
+ UPPERCASE {
+ public boolean is(int ch) {
+ return Character.isUpperCase(ch);
+ }
+ },
+
+ TITLECASE {
+ public boolean is(int ch) {
+ return Character.isTitleCase(ch);
+ }
+ },
+
+ WHITE_SPACE {
+ // \p{Whitespace}
+ public boolean is(int ch) {
+ return ((((1 << Character.SPACE_SEPARATOR) |
+ (1 << Character.LINE_SEPARATOR) |
+ (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
+ != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
+ }
+ },
+
+ CONTROL {
+ // \p{gc=Control}
+ public boolean is(int ch) {
+ return Character.getType(ch) == Character.CONTROL;
+ }
+ },
+
+ PUNCTUATION {
+ // \p{gc=Punctuation}
+ public boolean is(int ch) {
+ return ((((1 << Character.CONNECTOR_PUNCTUATION) |
+ (1 << Character.DASH_PUNCTUATION) |
+ (1 << Character.START_PUNCTUATION) |
+ (1 << Character.END_PUNCTUATION) |
+ (1 << Character.OTHER_PUNCTUATION) |
+ (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
+ (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
+ != 0;
+ }
+ },
+
+ HEX_DIGIT {
+ // \p{gc=Decimal_Number}
+ // \p{Hex_Digit} -> PropList.txt: Hex_Digit
+ public boolean is(int ch) {
+ return DIGIT.is(ch) ||
+ (ch >= 0x0030 && ch <= 0x0039) ||
+ (ch >= 0x0041 && ch <= 0x0046) ||
+ (ch >= 0x0061 && ch <= 0x0066) ||
+ (ch >= 0xFF10 && ch <= 0xFF19) ||
+ (ch >= 0xFF21 && ch <= 0xFF26) ||
+ (ch >= 0xFF41 && ch <= 0xFF46);
+ }
+ },
+
+ ASSIGNED {
+ public boolean is(int ch) {
+ return Character.getType(ch) != Character.UNASSIGNED;
+ }
+ },
+
+ NONCHARACTER_CODE_POINT {
+ // PropList.txt:Noncharacter_Code_Point
+ public boolean is(int ch) {
+ return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
+ }
+ },
+
+ DIGIT {
+ // \p{gc=Decimal_Number}
+ public boolean is(int ch) {
+ return Character.isDigit(ch);
+ }
+ },
+
+ ALNUM {
+ // \p{alpha}
+ // \p{digit}
+ public boolean is(int ch) {
+ return ALPHABETIC.is(ch) || DIGIT.is(ch);
+ }
+ },
+
+ BLANK {
+ // \p{Whitespace} --
+ // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85
+ // \p{gc=Line_Separator}
+ // \p{gc=Paragraph_Separator}]
+ public boolean is(int ch) {
+ return Character.getType(ch) == Character.SPACE_SEPARATOR ||
+ ch == 0x9; // \N{HT}
+ }
+ },
+
+ GRAPH {
+ // [^
+ // \p{space}
+ // \p{gc=Control}
+ // \p{gc=Surrogate}
+ // \p{gc=Unassigned}]
+ public boolean is(int ch) {
+ return ((((1 << Character.SPACE_SEPARATOR) |
+ (1 << Character.LINE_SEPARATOR) |
+ (1 << Character.PARAGRAPH_SEPARATOR) |
+ (1 << Character.CONTROL) |
+ (1 << Character.SURROGATE) |
+ (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
+ == 0;
+ }
+ },
+
+ PRINT {
+ // \p{graph}
+ // \p{blank}
+ // -- \p{cntrl}
+ public boolean is(int ch) {
+ return (GRAPH.is(ch) || BLANK.is(ch)) && !CONTROL.is(ch);
+ }
+ },
+
+ WORD {
+ // \p{alpha}
+ // \p{gc=Mark}
+ // \p{digit}
+ // \p{gc=Connector_Punctuation}
+
+ public boolean is(int ch) {
+ return ALPHABETIC.is(ch) ||
+ ((((1 << Character.NON_SPACING_MARK) |
+ (1 << Character.ENCLOSING_MARK) |
+ (1 << Character.COMBINING_SPACING_MARK) |
+ (1 << Character.DECIMAL_DIGIT_NUMBER) |
+ (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1)
+ != 0;
+ }
+ };
+
+ private final static HashMap posix = new HashMap<>();
+ private final static HashMap aliases = new HashMap<>();
+ static {
+ posix.put("ALPHA", "ALPHABETIC");
+ posix.put("LOWER", "LOWERCASE");
+ posix.put("UPPER", "UPPERCASE");
+ posix.put("SPACE", "WHITE_SPACE");
+ posix.put("PUNCT", "PUNCTUATION");
+ posix.put("XDIGIT","HEX_DIGIT");
+ posix.put("ALNUM", "ALNUM");
+ posix.put("CNTRL", "CONTROL");
+ posix.put("DIGIT", "DIGIT");
+ posix.put("BLANK", "BLANK");
+ posix.put("GRAPH", "GRAPH");
+ posix.put("PRINT", "PRINT");
+
+ aliases.put("WHITESPACE", "WHITE_SPACE");
+ aliases.put("HEXDIGIT","HEX_DIGIT");
+ aliases.put("NONCHARACTERCODEPOINT", "NONCHARACTER_CODE_POINT");
+ }
+
+ public static UnicodeProp forName(String propName) {
+ propName = propName.toUpperCase(Locale.ENGLISH);
+ String alias = aliases.get(propName);
+ if (alias != null)
+ propName = alias;
+ try {
+ return valueOf (propName);
+ } catch (IllegalArgumentException x) {}
+ return null;
+ }
+
+ public static UnicodeProp forPOSIXName(String propName) {
+ propName = posix.get(propName.toUpperCase(Locale.ENGLISH));
+ if (propName == null)
+ return null;
+ return valueOf (propName);
+ }
+
+ public abstract boolean is(int ch);
+}
diff -r 9cc0e1034f92 -r 7df5f83fff8d rt/emul/compact/src/main/java/java/util/regex/package.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rt/emul/compact/src/main/java/java/util/regex/package.html Mon Oct 07 16:14:56 2013 +0200
@@ -0,0 +1,66 @@
+
+
+
+
+
+
+
+
+Classes for matching character sequences against patterns specified by regular
+expressions.
+
+ An instance of the {@link java.util.regex.Pattern} class represents a
+regular expression that is specified in string form in a syntax similar to
+that used by Perl.
+
+
Instances of the {@link java.util.regex.Matcher} class are used to match
+character sequences against a given pattern. Input is provided to matchers via
+the {@link java.lang.CharSequence} interface in order to support matching
+against characters from a wide variety of input sources.
+
+ Unless otherwise noted, passing a null argument to a method
+in any class or interface in this package will cause a
+{@link java.lang.NullPointerException NullPointerException} to be thrown.
+
+
Related Documentation
+
+ An excellent tutorial and overview of regular expressions is Mastering Regular
+Expressions, Jeffrey E. F. Friedl, O'Reilly and Associates, 1997.
+
+
+
+@since 1.4
+@author Mike McCloskey
+@author Mark Reinhold
+
+
+