# HG changeset patch # User Jaroslav Tulach # Date 1381155207 -7200 # Node ID bca65655b36b342cad11418ccd4a5fa0b9a4f19b # Parent 588d5bf7a5601660c2124a269afe88758fbcb784 Adding RegEx implementation diff -r 588d5bf7a560 -r bca65655b36b rt/emul/compact/src/main/java/java/util/regex/ASCII.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rt/emul/compact/src/main/java/java/util/regex/ASCII.java Mon Oct 07 16:13:27 2013 +0200 @@ -0,0 +1,274 @@ +/* + * Copyright (c) 1999, 2000, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package java.util.regex; + + +/** + * Utility class that implements the standard C ctype functionality. + * + * @author Hong Zhang + */ + +final class ASCII { + + static final int UPPER = 0x00000100; + + static final int LOWER = 0x00000200; + + static final int DIGIT = 0x00000400; + + static final int SPACE = 0x00000800; + + static final int PUNCT = 0x00001000; + + static final int CNTRL = 0x00002000; + + static final int BLANK = 0x00004000; + + static final int HEX = 0x00008000; + + static final int UNDER = 0x00010000; + + static final int ASCII = 0x0000FF00; + + static final int ALPHA = (UPPER|LOWER); + + static final int ALNUM = (UPPER|LOWER|DIGIT); + + static final int GRAPH = (PUNCT|UPPER|LOWER|DIGIT); + + static final int WORD = (UPPER|LOWER|UNDER|DIGIT); + + static final int XDIGIT = (HEX); + + private static final int[] ctype = new int[] { + CNTRL, /* 00 (NUL) */ + CNTRL, /* 01 (SOH) */ + CNTRL, /* 02 (STX) */ + CNTRL, /* 03 (ETX) */ + CNTRL, /* 04 (EOT) */ + CNTRL, /* 05 (ENQ) */ + CNTRL, /* 06 (ACK) */ + CNTRL, /* 07 (BEL) */ + CNTRL, /* 08 (BS) */ + SPACE+CNTRL+BLANK, /* 09 (HT) */ + SPACE+CNTRL, /* 0A (LF) */ + SPACE+CNTRL, /* 0B (VT) */ + SPACE+CNTRL, /* 0C (FF) */ + SPACE+CNTRL, /* 0D (CR) */ + CNTRL, /* 0E (SI) */ + CNTRL, /* 0F (SO) */ + CNTRL, /* 10 (DLE) */ + CNTRL, /* 11 (DC1) */ + CNTRL, /* 12 (DC2) */ + CNTRL, /* 13 (DC3) */ + CNTRL, /* 14 (DC4) */ + CNTRL, /* 15 (NAK) */ + CNTRL, /* 16 (SYN) */ + CNTRL, /* 17 (ETB) */ + CNTRL, /* 18 (CAN) */ + CNTRL, /* 19 (EM) */ + CNTRL, /* 1A (SUB) */ + CNTRL, /* 1B (ESC) */ + CNTRL, /* 1C (FS) */ + CNTRL, /* 1D (GS) */ + CNTRL, /* 1E (RS) */ + CNTRL, /* 1F (US) */ + SPACE+BLANK, /* 20 SPACE */ + PUNCT, /* 21 ! */ + PUNCT, /* 22 " */ + PUNCT, /* 23 # */ + PUNCT, /* 24 $ */ + PUNCT, /* 25 % */ + PUNCT, /* 26 & */ + PUNCT, /* 27 ' */ + PUNCT, /* 28 ( */ + PUNCT, /* 29 ) */ + PUNCT, /* 2A * */ + PUNCT, /* 2B + */ + PUNCT, /* 2C , */ + PUNCT, /* 2D - */ + PUNCT, /* 2E . */ + PUNCT, /* 2F / */ + DIGIT+HEX+0, /* 30 0 */ + DIGIT+HEX+1, /* 31 1 */ + DIGIT+HEX+2, /* 32 2 */ + DIGIT+HEX+3, /* 33 3 */ + DIGIT+HEX+4, /* 34 4 */ + DIGIT+HEX+5, /* 35 5 */ + DIGIT+HEX+6, /* 36 6 */ + DIGIT+HEX+7, /* 37 7 */ + DIGIT+HEX+8, /* 38 8 */ + DIGIT+HEX+9, /* 39 9 */ + PUNCT, /* 3A : */ + PUNCT, /* 3B ; */ + PUNCT, /* 3C < */ + PUNCT, /* 3D = */ + PUNCT, /* 3E > */ + PUNCT, /* 3F ? */ + PUNCT, /* 40 @ */ + UPPER+HEX+10, /* 41 A */ + UPPER+HEX+11, /* 42 B */ + UPPER+HEX+12, /* 43 C */ + UPPER+HEX+13, /* 44 D */ + UPPER+HEX+14, /* 45 E */ + UPPER+HEX+15, /* 46 F */ + UPPER+16, /* 47 G */ + UPPER+17, /* 48 H */ + UPPER+18, /* 49 I */ + UPPER+19, /* 4A J */ + UPPER+20, /* 4B K */ + UPPER+21, /* 4C L */ + UPPER+22, /* 4D M */ + UPPER+23, /* 4E N */ + UPPER+24, /* 4F O */ + UPPER+25, /* 50 P */ + UPPER+26, /* 51 Q */ + UPPER+27, /* 52 R */ + UPPER+28, /* 53 S */ + UPPER+29, /* 54 T */ + UPPER+30, /* 55 U */ + UPPER+31, /* 56 V */ + UPPER+32, /* 57 W */ + UPPER+33, /* 58 X */ + UPPER+34, /* 59 Y */ + UPPER+35, /* 5A Z */ + PUNCT, /* 5B [ */ + PUNCT, /* 5C \ */ + PUNCT, /* 5D ] */ + PUNCT, /* 5E ^ */ + PUNCT|UNDER, /* 5F _ */ + PUNCT, /* 60 ` */ + LOWER+HEX+10, /* 61 a */ + LOWER+HEX+11, /* 62 b */ + LOWER+HEX+12, /* 63 c */ + LOWER+HEX+13, /* 64 d */ + LOWER+HEX+14, /* 65 e */ + LOWER+HEX+15, /* 66 f */ + LOWER+16, /* 67 g */ + LOWER+17, /* 68 h */ + LOWER+18, /* 69 i */ + LOWER+19, /* 6A j */ + LOWER+20, /* 6B k */ + LOWER+21, /* 6C l */ + LOWER+22, /* 6D m */ + LOWER+23, /* 6E n */ + LOWER+24, /* 6F o */ + LOWER+25, /* 70 p */ + LOWER+26, /* 71 q */ + LOWER+27, /* 72 r */ + LOWER+28, /* 73 s */ + LOWER+29, /* 74 t */ + LOWER+30, /* 75 u */ + LOWER+31, /* 76 v */ + LOWER+32, /* 77 w */ + LOWER+33, /* 78 x */ + LOWER+34, /* 79 y */ + LOWER+35, /* 7A z */ + PUNCT, /* 7B { */ + PUNCT, /* 7C | */ + PUNCT, /* 7D } */ + PUNCT, /* 7E ~ */ + CNTRL, /* 7F (DEL) */ + }; + + static int getType(int ch) { + return ((ch & 0xFFFFFF80) == 0 ? ctype[ch] : 0); + } + + static boolean isType(int ch, int type) { + return (getType(ch) & type) != 0; + } + + static boolean isAscii(int ch) { + return ((ch & 0xFFFFFF80) == 0); + } + + static boolean isAlpha(int ch) { + return isType(ch, ALPHA); + } + + static boolean isDigit(int ch) { + return ((ch-'0')|('9'-ch)) >= 0; + } + + static boolean isAlnum(int ch) { + return isType(ch, ALNUM); + } + + static boolean isGraph(int ch) { + return isType(ch, GRAPH); + } + + static boolean isPrint(int ch) { + return ((ch-0x20)|(0x7E-ch)) >= 0; + } + + static boolean isPunct(int ch) { + return isType(ch, PUNCT); + } + + static boolean isSpace(int ch) { + return isType(ch, SPACE); + } + + static boolean isHexDigit(int ch) { + return isType(ch, HEX); + } + + static boolean isOctDigit(int ch) { + return ((ch-'0')|('7'-ch)) >= 0; + } + + static boolean isCntrl(int ch) { + return isType(ch, CNTRL); + } + + static boolean isLower(int ch) { + return ((ch-'a')|('z'-ch)) >= 0; + } + + static boolean isUpper(int ch) { + return ((ch-'A')|('Z'-ch)) >= 0; + } + + static boolean isWord(int ch) { + return isType(ch, WORD); + } + + static int toDigit(int ch) { + return (ctype[ch & 0x7F] & 0x3F); + } + + static int toLower(int ch) { + return isUpper(ch) ? (ch + 0x20) : ch; + } + + static int toUpper(int ch) { + return isLower(ch) ? (ch - 0x20) : ch; + } + +} diff -r 588d5bf7a560 -r bca65655b36b rt/emul/compact/src/main/java/java/util/regex/MatchResult.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rt/emul/compact/src/main/java/java/util/regex/MatchResult.java Mon Oct 07 16:13:27 2013 +0200 @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2003, 2004, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package java.util.regex; + +/** + * The result of a match operation. + * + *

This interface contains query methods used to determine the + * results of a match against a regular expression. The match boundaries, + * groups and group boundaries can be seen but not modified through + * a MatchResult. + * + * @author Michael McCloskey + * @see Matcher + * @since 1.5 + */ +public interface MatchResult { + + /** + * Returns the start index of the match. + * + * @return The index of the first character matched + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + */ + public int start(); + + /** + * Returns the start index of the subsequence captured by the given group + * during this match. + * + *

Capturing groups are indexed from left + * to right, starting at one. Group zero denotes the entire pattern, so + * the expression m.start(0) is equivalent to + * m.start().

+ * + * @param group + * The index of a capturing group in this matcher's pattern + * + * @return The index of the first character captured by the group, + * or -1 if the match was successful but the group + * itself did not match anything + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + * + * @throws IndexOutOfBoundsException + * If there is no capturing group in the pattern + * with the given index + */ + public int start(int group); + + /** + * Returns the offset after the last character matched.

+ * + * @return @return The offset after the last character matched + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + */ + public int end(); + + /** + * Returns the offset after the last character of the subsequence + * captured by the given group during this match. + * + *

Capturing groups are indexed from left + * to right, starting at one. Group zero denotes the entire pattern, so + * the expression m.end(0) is equivalent to + * m.end().

+ * + * @param group + * The index of a capturing group in this matcher's pattern + * + * @return The offset after the last character captured by the group, + * or -1 if the match was successful + * but the group itself did not match anything + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + * + * @throws IndexOutOfBoundsException + * If there is no capturing group in the pattern + * with the given index + */ + public int end(int group); + + /** + * Returns the input subsequence matched by the previous match. + * + *

For a matcher m with input sequence s, + * the expressions m.group() and + * s.substring(m.start(), m.end()) + * are equivalent.

+ * + *

Note that some patterns, for example a*, match the empty + * string. This method will return the empty string when the pattern + * successfully matches the empty string in the input.

+ * + * @return The (possibly empty) subsequence matched by the previous match, + * in string form + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + */ + public String group(); + + /** + * Returns the input subsequence captured by the given group during the + * previous match operation. + * + *

For a matcher m, input sequence s, and group index + * g, the expressions m.group(g) and + * s.substring(m.start(g), m.end(g)) + * are equivalent.

+ * + *

Capturing groups are indexed from left + * to right, starting at one. Group zero denotes the entire pattern, so + * the expression m.group(0) is equivalent to m.group(). + *

+ * + *

If the match was successful but the group specified failed to match + * any part of the input sequence, then null is returned. Note + * that some groups, for example (a*), match the empty string. + * This method will return the empty string when such a group successfully + * matches the empty string in the input.

+ * + * @param group + * The index of a capturing group in this matcher's pattern + * + * @return The (possibly empty) subsequence captured by the group + * during the previous match, or null if the group + * failed to match part of the input + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + * + * @throws IndexOutOfBoundsException + * If there is no capturing group in the pattern + * with the given index + */ + public String group(int group); + + /** + * Returns the number of capturing groups in this match result's pattern. + * + *

Group zero denotes the entire pattern by convention. It is not + * included in this count. + * + *

Any non-negative integer smaller than or equal to the value + * returned by this method is guaranteed to be a valid group index for + * this matcher.

+ * + * @return The number of capturing groups in this matcher's pattern + */ + public int groupCount(); + +} diff -r 588d5bf7a560 -r bca65655b36b rt/emul/compact/src/main/java/java/util/regex/Matcher.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rt/emul/compact/src/main/java/java/util/regex/Matcher.java Mon Oct 07 16:13:27 2013 +0200 @@ -0,0 +1,1256 @@ +/* + * Copyright (c) 1999, 2009, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package java.util.regex; + + +/** + * An engine that performs match operations on a {@link java.lang.CharSequence + * character sequence} by interpreting a {@link Pattern}. + * + *

A matcher is created from a pattern by invoking the pattern's {@link + * Pattern#matcher matcher} method. Once created, a matcher can be used to + * perform three different kinds of match operations: + * + *

+ * + *

Each of these methods returns a boolean indicating success or failure. + * More information about a successful match can be obtained by querying the + * state of the matcher. + * + *

A matcher finds matches in a subset of its input called the + * region. By default, the region contains all of the matcher's input. + * The region can be modified via the{@link #region region} method and queried + * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd} + * methods. The way that the region boundaries interact with some pattern + * constructs can be changed. See {@link #useAnchoringBounds + * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds} + * for more details. + * + *

This class also defines methods for replacing matched subsequences with + * new strings whose contents can, if desired, be computed from the match + * result. The {@link #appendReplacement appendReplacement} and {@link + * #appendTail appendTail} methods can be used in tandem in order to collect + * the result into an existing string buffer, or the more convenient {@link + * #replaceAll replaceAll} method can be used to create a string in which every + * matching subsequence in the input sequence is replaced. + * + *

The explicit state of a matcher includes the start and end indices of + * the most recent successful match. It also includes the start and end + * indices of the input subsequence captured by each capturing group in the pattern as well as a total + * count of such subsequences. As a convenience, methods are also provided for + * returning these captured subsequences in string form. + * + *

The explicit state of a matcher is initially undefined; attempting to + * query any part of it before a successful match will cause an {@link + * IllegalStateException} to be thrown. The explicit state of a matcher is + * recomputed by every match operation. + * + *

The implicit state of a matcher includes the input character sequence as + * well as the append position, which is initially zero and is updated + * by the {@link #appendReplacement appendReplacement} method. + * + *

A matcher may be reset explicitly by invoking its {@link #reset()} + * method or, if a new input sequence is desired, its {@link + * #reset(java.lang.CharSequence) reset(CharSequence)} method. Resetting a + * matcher discards its explicit state information and sets the append position + * to zero. + * + *

Instances of this class are not safe for use by multiple concurrent + * threads.

+ * + * + * @author Mike McCloskey + * @author Mark Reinhold + * @author JSR-51 Expert Group + * @since 1.4 + * @spec JSR-51 + */ + +public final class Matcher implements MatchResult { + + /** + * The Pattern object that created this Matcher. + */ + Pattern parentPattern; + + /** + * The storage used by groups. They may contain invalid values if + * a group was skipped during the matching. + */ + int[] groups; + + /** + * The range within the sequence that is to be matched. Anchors + * will match at these "hard" boundaries. Changing the region + * changes these values. + */ + int from, to; + + /** + * Lookbehind uses this value to ensure that the subexpression + * match ends at the point where the lookbehind was encountered. + */ + int lookbehindTo; + + /** + * The original string being matched. + */ + CharSequence text; + + /** + * Matcher state used by the last node. NOANCHOR is used when a + * match does not have to consume all of the input. ENDANCHOR is + * the mode used for matching all the input. + */ + static final int ENDANCHOR = 1; + static final int NOANCHOR = 0; + int acceptMode = NOANCHOR; + + /** + * The range of string that last matched the pattern. If the last + * match failed then first is -1; last initially holds 0 then it + * holds the index of the end of the last match (which is where the + * next search starts). + */ + int first = -1, last = 0; + + /** + * The end index of what matched in the last match operation. + */ + int oldLast = -1; + + /** + * The index of the last position appended in a substitution. + */ + int lastAppendPosition = 0; + + /** + * Storage used by nodes to tell what repetition they are on in + * a pattern, and where groups begin. The nodes themselves are stateless, + * so they rely on this field to hold state during a match. + */ + int[] locals; + + /** + * Boolean indicating whether or not more input could change + * the results of the last match. + * + * If hitEnd is true, and a match was found, then more input + * might cause a different match to be found. + * If hitEnd is true and a match was not found, then more + * input could cause a match to be found. + * If hitEnd is false and a match was found, then more input + * will not change the match. + * If hitEnd is false and a match was not found, then more + * input will not cause a match to be found. + */ + boolean hitEnd; + + /** + * Boolean indicating whether or not more input could change + * a positive match into a negative one. + * + * If requireEnd is true, and a match was found, then more + * input could cause the match to be lost. + * If requireEnd is false and a match was found, then more + * input might change the match but the match won't be lost. + * If a match was not found, then requireEnd has no meaning. + */ + boolean requireEnd; + + /** + * If transparentBounds is true then the boundaries of this + * matcher's region are transparent to lookahead, lookbehind, + * and boundary matching constructs that try to see beyond them. + */ + boolean transparentBounds = false; + + /** + * If anchoringBounds is true then the boundaries of this + * matcher's region match anchors such as ^ and $. + */ + boolean anchoringBounds = true; + + /** + * No default constructor. + */ + Matcher() { + } + + /** + * All matchers have the state used by Pattern during a match. + */ + Matcher(Pattern parent, CharSequence text) { + this.parentPattern = parent; + this.text = text; + + // Allocate state storage + int parentGroupCount = Math.max(parent.capturingGroupCount, 10); + groups = new int[parentGroupCount * 2]; + locals = new int[parent.localCount]; + + // Put fields into initial states + reset(); + } + + /** + * Returns the pattern that is interpreted by this matcher. + * + * @return The pattern for which this matcher was created + */ + public Pattern pattern() { + return parentPattern; + } + + /** + * Returns the match state of this matcher as a {@link MatchResult}. + * The result is unaffected by subsequent operations performed upon this + * matcher. + * + * @return a MatchResult with the state of this matcher + * @since 1.5 + */ + public MatchResult toMatchResult() { + Matcher result = new Matcher(this.parentPattern, text.toString()); + result.first = this.first; + result.last = this.last; + result.groups = this.groups.clone(); + return result; + } + + /** + * Changes the Pattern that this Matcher uses to + * find matches with. + * + *

This method causes this matcher to lose information + * about the groups of the last match that occurred. The + * matcher's position in the input is maintained and its + * last append position is unaffected.

+ * + * @param newPattern + * The new pattern used by this matcher + * @return This matcher + * @throws IllegalArgumentException + * If newPattern is null + * @since 1.5 + */ + public Matcher usePattern(Pattern newPattern) { + if (newPattern == null) + throw new IllegalArgumentException("Pattern cannot be null"); + parentPattern = newPattern; + + // Reallocate state storage + int parentGroupCount = Math.max(newPattern.capturingGroupCount, 10); + groups = new int[parentGroupCount * 2]; + locals = new int[newPattern.localCount]; + for (int i = 0; i < groups.length; i++) + groups[i] = -1; + for (int i = 0; i < locals.length; i++) + locals[i] = -1; + return this; + } + + /** + * Resets this matcher. + * + *

Resetting a matcher discards all of its explicit state information + * and sets its append position to zero. The matcher's region is set to the + * default region, which is its entire character sequence. The anchoring + * and transparency of this matcher's region boundaries are unaffected. + * + * @return This matcher + */ + public Matcher reset() { + first = -1; + last = 0; + oldLast = -1; + for(int i=0; i Resetting a matcher discards all of its explicit state information + * and sets its append position to zero. The matcher's region is set to + * the default region, which is its entire character sequence. The + * anchoring and transparency of this matcher's region boundaries are + * unaffected. + * + * @param input + * The new input character sequence + * + * @return This matcher + */ + public Matcher reset(CharSequence input) { + text = input; + return reset(); + } + + /** + * Returns the start index of the previous match.

+ * + * @return The index of the first character matched + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + */ + public int start() { + if (first < 0) + throw new IllegalStateException("No match available"); + return first; + } + + /** + * Returns the start index of the subsequence captured by the given group + * during the previous match operation. + * + *

Capturing groups are indexed from left + * to right, starting at one. Group zero denotes the entire pattern, so + * the expression m.start(0) is equivalent to + * m.start().

+ * + * @param group + * The index of a capturing group in this matcher's pattern + * + * @return The index of the first character captured by the group, + * or -1 if the match was successful but the group + * itself did not match anything + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + * + * @throws IndexOutOfBoundsException + * If there is no capturing group in the pattern + * with the given index + */ + public int start(int group) { + if (first < 0) + throw new IllegalStateException("No match available"); + if (group > groupCount()) + throw new IndexOutOfBoundsException("No group " + group); + return groups[group * 2]; + } + + /** + * Returns the offset after the last character matched.

+ * + * @return The offset after the last character matched + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + */ + public int end() { + if (first < 0) + throw new IllegalStateException("No match available"); + return last; + } + + /** + * Returns the offset after the last character of the subsequence + * captured by the given group during the previous match operation. + * + *

Capturing groups are indexed from left + * to right, starting at one. Group zero denotes the entire pattern, so + * the expression m.end(0) is equivalent to + * m.end().

+ * + * @param group + * The index of a capturing group in this matcher's pattern + * + * @return The offset after the last character captured by the group, + * or -1 if the match was successful + * but the group itself did not match anything + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + * + * @throws IndexOutOfBoundsException + * If there is no capturing group in the pattern + * with the given index + */ + public int end(int group) { + if (first < 0) + throw new IllegalStateException("No match available"); + if (group > groupCount()) + throw new IndexOutOfBoundsException("No group " + group); + return groups[group * 2 + 1]; + } + + /** + * Returns the input subsequence matched by the previous match. + * + *

For a matcher m with input sequence s, + * the expressions m.group() and + * s.substring(m.start(), m.end()) + * are equivalent.

+ * + *

Note that some patterns, for example a*, match the empty + * string. This method will return the empty string when the pattern + * successfully matches the empty string in the input.

+ * + * @return The (possibly empty) subsequence matched by the previous match, + * in string form + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + */ + public String group() { + return group(0); + } + + /** + * Returns the input subsequence captured by the given group during the + * previous match operation. + * + *

For a matcher m, input sequence s, and group index + * g, the expressions m.group(g) and + * s.substring(m.start(g), m.end(g)) + * are equivalent.

+ * + *

Capturing groups are indexed from left + * to right, starting at one. Group zero denotes the entire pattern, so + * the expression m.group(0) is equivalent to m.group(). + *

+ * + *

If the match was successful but the group specified failed to match + * any part of the input sequence, then null is returned. Note + * that some groups, for example (a*), match the empty string. + * This method will return the empty string when such a group successfully + * matches the empty string in the input.

+ * + * @param group + * The index of a capturing group in this matcher's pattern + * + * @return The (possibly empty) subsequence captured by the group + * during the previous match, or null if the group + * failed to match part of the input + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + * + * @throws IndexOutOfBoundsException + * If there is no capturing group in the pattern + * with the given index + */ + public String group(int group) { + if (first < 0) + throw new IllegalStateException("No match found"); + if (group < 0 || group > groupCount()) + throw new IndexOutOfBoundsException("No group " + group); + if ((groups[group*2] == -1) || (groups[group*2+1] == -1)) + return null; + return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString(); + } + + /** + * Returns the input subsequence captured by the given + * named-capturing group during the previous + * match operation. + * + *

If the match was successful but the group specified failed to match + * any part of the input sequence, then null is returned. Note + * that some groups, for example (a*), match the empty string. + * This method will return the empty string when such a group successfully + * matches the empty string in the input.

+ * + * @param name + * The name of a named-capturing group in this matcher's pattern + * + * @return The (possibly empty) subsequence captured by the named group + * during the previous match, or null if the group + * failed to match part of the input + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + * + * @throws IllegalArgumentException + * If there is no capturing group in the pattern + * with the given name + */ + public String group(String name) { + if (name == null) + throw new NullPointerException("Null group name"); + if (first < 0) + throw new IllegalStateException("No match found"); + if (!parentPattern.namedGroups().containsKey(name)) + throw new IllegalArgumentException("No group with name <" + name + ">"); + int group = parentPattern.namedGroups().get(name); + if ((groups[group*2] == -1) || (groups[group*2+1] == -1)) + return null; + return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString(); + } + + /** + * Returns the number of capturing groups in this matcher's pattern. + * + *

Group zero denotes the entire pattern by convention. It is not + * included in this count. + * + *

Any non-negative integer smaller than or equal to the value + * returned by this method is guaranteed to be a valid group index for + * this matcher.

+ * + * @return The number of capturing groups in this matcher's pattern + */ + public int groupCount() { + return parentPattern.capturingGroupCount - 1; + } + + /** + * Attempts to match the entire region against the pattern. + * + *

If the match succeeds then more information can be obtained via the + * start, end, and group methods.

+ * + * @return true if, and only if, the entire region sequence + * matches this matcher's pattern + */ + public boolean matches() { + return match(from, ENDANCHOR); + } + + /** + * Attempts to find the next subsequence of the input sequence that matches + * the pattern. + * + *

This method starts at the beginning of this matcher's region, or, if + * a previous invocation of the method was successful and the matcher has + * not since been reset, at the first character not matched by the previous + * match. + * + *

If the match succeeds then more information can be obtained via the + * start, end, and group methods.

+ * + * @return true if, and only if, a subsequence of the input + * sequence matches this matcher's pattern + */ + public boolean find() { + int nextSearchIndex = last; + if (nextSearchIndex == first) + nextSearchIndex++; + + // If next search starts before region, start it at region + if (nextSearchIndex < from) + nextSearchIndex = from; + + // If next search starts beyond region then it fails + if (nextSearchIndex > to) { + for (int i = 0; i < groups.length; i++) + groups[i] = -1; + return false; + } + return search(nextSearchIndex); + } + + /** + * Resets this matcher and then attempts to find the next subsequence of + * the input sequence that matches the pattern, starting at the specified + * index. + * + *

If the match succeeds then more information can be obtained via the + * start, end, and group methods, and subsequent + * invocations of the {@link #find()} method will start at the first + * character not matched by this match.

+ * + * @throws IndexOutOfBoundsException + * If start is less than zero or if start is greater than the + * length of the input sequence. + * + * @return true if, and only if, a subsequence of the input + * sequence starting at the given index matches this matcher's + * pattern + */ + public boolean find(int start) { + int limit = getTextLength(); + if ((start < 0) || (start > limit)) + throw new IndexOutOfBoundsException("Illegal start index"); + reset(); + return search(start); + } + + /** + * Attempts to match the input sequence, starting at the beginning of the + * region, against the pattern. + * + *

Like the {@link #matches matches} method, this method always starts + * at the beginning of the region; unlike that method, it does not + * require that the entire region be matched. + * + *

If the match succeeds then more information can be obtained via the + * start, end, and group methods.

+ * + * @return true if, and only if, a prefix of the input + * sequence matches this matcher's pattern + */ + public boolean lookingAt() { + return match(from, NOANCHOR); + } + + /** + * Returns a literal replacement String for the specified + * String. + * + * This method produces a String that will work + * as a literal replacement s in the + * appendReplacement method of the {@link Matcher} class. + * The String produced will match the sequence of characters + * in s treated as a literal sequence. Slashes ('\') and + * dollar signs ('$') will be given no special meaning. + * + * @param s The string to be literalized + * @return A literal string replacement + * @since 1.5 + */ + public static String quoteReplacement(String s) { + if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1)) + return s; + StringBuilder sb = new StringBuilder(); + for (int i=0; i This method performs the following actions:

+ * + *
    + * + *
  1. It reads characters from the input sequence, starting at the + * append position, and appends them to the given string buffer. It + * stops after reading the last character preceding the previous match, + * that is, the character at index {@link + * #start()} - 1.

  2. + * + *
  3. It appends the given replacement string to the string buffer. + *

  4. + * + *
  5. It sets the append position of this matcher to the index of + * the last character matched, plus one, that is, to {@link #end()}. + *

  6. + * + *
+ * + *

The replacement string may contain references to subsequences + * captured during the previous match: Each occurrence of + * ${name} or $g + * will be replaced by the result of evaluating the corresponding + * {@link #group(String) group(name)} or {@link #group(int) group(g)} + * respectively. For $g, + * the first number after the $ is always treated as part of + * the group reference. Subsequent numbers are incorporated into g if + * they would form a legal group reference. Only the numerals '0' + * through '9' are considered as potential components of the group + * reference. If the second group matched the string "foo", for + * example, then passing the replacement string "$2bar" would + * cause "foobar" to be appended to the string buffer. A dollar + * sign ($) may be included as a literal in the replacement + * string by preceding it with a backslash (\$). + * + *

Note that backslashes (\) and dollar signs ($) in + * the replacement string may cause the results to be different than if it + * were being treated as a literal replacement string. Dollar signs may be + * treated as references to captured subsequences as described above, and + * backslashes are used to escape literal characters in the replacement + * string. + * + *

This method is intended to be used in a loop together with the + * {@link #appendTail appendTail} and {@link #find find} methods. The + * following code, for example, writes one dog two dogs in the + * yard to the standard-output stream:

+ * + *
+     * Pattern p = Pattern.compile("cat");
+     * Matcher m = p.matcher("one cat two cats in the yard");
+     * StringBuffer sb = new StringBuffer();
+     * while (m.find()) {
+     *     m.appendReplacement(sb, "dog");
+     * }
+     * m.appendTail(sb);
+     * System.out.println(sb.toString());
+ * + * @param sb + * The target string buffer + * + * @param replacement + * The replacement string + * + * @return This matcher + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + * + * @throws IllegalArgumentException + * If the replacement string refers to a named-capturing + * group that does not exist in the pattern + * + * @throws IndexOutOfBoundsException + * If the replacement string refers to a capturing group + * that does not exist in the pattern + */ + public Matcher appendReplacement(StringBuffer sb, String replacement) { + + // If no match, return error + if (first < 0) + throw new IllegalStateException("No match available"); + + // Process substitution string to replace group references with groups + int cursor = 0; + StringBuilder result = new StringBuilder(); + + while (cursor < replacement.length()) { + char nextChar = replacement.charAt(cursor); + if (nextChar == '\\') { + cursor++; + nextChar = replacement.charAt(cursor); + result.append(nextChar); + cursor++; + } else if (nextChar == '$') { + // Skip past $ + cursor++; + // A StringIndexOutOfBoundsException is thrown if + // this "$" is the last character in replacement + // string in current implementation, a IAE might be + // more appropriate. + nextChar = replacement.charAt(cursor); + int refNum = -1; + if (nextChar == '{') { + cursor++; + StringBuilder gsb = new StringBuilder(); + while (cursor < replacement.length()) { + nextChar = replacement.charAt(cursor); + if (ASCII.isLower(nextChar) || + ASCII.isUpper(nextChar) || + ASCII.isDigit(nextChar)) { + gsb.append(nextChar); + cursor++; + } else { + break; + } + } + if (gsb.length() == 0) + throw new IllegalArgumentException( + "named capturing group has 0 length name"); + if (nextChar != '}') + throw new IllegalArgumentException( + "named capturing group is missing trailing '}'"); + String gname = gsb.toString(); + if (ASCII.isDigit(gname.charAt(0))) + throw new IllegalArgumentException( + "capturing group name {" + gname + + "} starts with digit character"); + if (!parentPattern.namedGroups().containsKey(gname)) + throw new IllegalArgumentException( + "No group with name {" + gname + "}"); + refNum = parentPattern.namedGroups().get(gname); + cursor++; + } else { + // The first number is always a group + refNum = (int)nextChar - '0'; + if ((refNum < 0)||(refNum > 9)) + throw new IllegalArgumentException( + "Illegal group reference"); + cursor++; + // Capture the largest legal group string + boolean done = false; + while (!done) { + if (cursor >= replacement.length()) { + break; + } + int nextDigit = replacement.charAt(cursor) - '0'; + if ((nextDigit < 0)||(nextDigit > 9)) { // not a number + break; + } + int newRefNum = (refNum * 10) + nextDigit; + if (groupCount() < newRefNum) { + done = true; + } else { + refNum = newRefNum; + cursor++; + } + } + } + // Append group + if (start(refNum) != -1 && end(refNum) != -1) + result.append(text, start(refNum), end(refNum)); + } else { + result.append(nextChar); + cursor++; + } + } + // Append the intervening text + sb.append(text, lastAppendPosition, first); + // Append the match substitution + sb.append(result); + + lastAppendPosition = last; + return this; + } + + /** + * Implements a terminal append-and-replace step. + * + *

This method reads characters from the input sequence, starting at + * the append position, and appends them to the given string buffer. It is + * intended to be invoked after one or more invocations of the {@link + * #appendReplacement appendReplacement} method in order to copy the + * remainder of the input sequence.

+ * + * @param sb + * The target string buffer + * + * @return The target string buffer + */ + public StringBuffer appendTail(StringBuffer sb) { + sb.append(text, lastAppendPosition, getTextLength()); + return sb; + } + + /** + * Replaces every subsequence of the input sequence that matches the + * pattern with the given replacement string. + * + *

This method first resets this matcher. It then scans the input + * sequence looking for matches of the pattern. Characters that are not + * part of any match are appended directly to the result string; each match + * is replaced in the result by the replacement string. The replacement + * string may contain references to captured subsequences as in the {@link + * #appendReplacement appendReplacement} method. + * + *

Note that backslashes (\) and dollar signs ($) in + * the replacement string may cause the results to be different than if it + * were being treated as a literal replacement string. Dollar signs may be + * treated as references to captured subsequences as described above, and + * backslashes are used to escape literal characters in the replacement + * string. + * + *

Given the regular expression a*b, the input + * "aabfooaabfooabfoob", and the replacement string + * "-", an invocation of this method on a matcher for that + * expression would yield the string "-foo-foo-foo-". + * + *

Invoking this method changes this matcher's state. If the matcher + * is to be used in further matching operations then it should first be + * reset.

+ * + * @param replacement + * The replacement string + * + * @return The string constructed by replacing each matching subsequence + * by the replacement string, substituting captured subsequences + * as needed + */ + public String replaceAll(String replacement) { + reset(); + boolean result = find(); + if (result) { + StringBuffer sb = new StringBuffer(); + do { + appendReplacement(sb, replacement); + result = find(); + } while (result); + appendTail(sb); + return sb.toString(); + } + return text.toString(); + } + + /** + * Replaces the first subsequence of the input sequence that matches the + * pattern with the given replacement string. + * + *

This method first resets this matcher. It then scans the input + * sequence looking for a match of the pattern. Characters that are not + * part of the match are appended directly to the result string; the match + * is replaced in the result by the replacement string. The replacement + * string may contain references to captured subsequences as in the {@link + * #appendReplacement appendReplacement} method. + * + *

Note that backslashes (\) and dollar signs ($) in + * the replacement string may cause the results to be different than if it + * were being treated as a literal replacement string. Dollar signs may be + * treated as references to captured subsequences as described above, and + * backslashes are used to escape literal characters in the replacement + * string. + * + *

Given the regular expression dog, the input + * "zzzdogzzzdogzzz", and the replacement string + * "cat", an invocation of this method on a matcher for that + * expression would yield the string "zzzcatzzzdogzzz".

+ * + *

Invoking this method changes this matcher's state. If the matcher + * is to be used in further matching operations then it should first be + * reset.

+ * + * @param replacement + * The replacement string + * @return The string constructed by replacing the first matching + * subsequence by the replacement string, substituting captured + * subsequences as needed + */ + public String replaceFirst(String replacement) { + if (replacement == null) + throw new NullPointerException("replacement"); + reset(); + if (!find()) + return text.toString(); + StringBuffer sb = new StringBuffer(); + appendReplacement(sb, replacement); + appendTail(sb); + return sb.toString(); + } + + /** + * Sets the limits of this matcher's region. The region is the part of the + * input sequence that will be searched to find a match. Invoking this + * method resets the matcher, and then sets the region to start at the + * index specified by the start parameter and end at the + * index specified by the end parameter. + * + *

Depending on the transparency and anchoring being used (see + * {@link #useTransparentBounds useTransparentBounds} and + * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such + * as anchors may behave differently at or around the boundaries of the + * region. + * + * @param start + * The index to start searching at (inclusive) + * @param end + * The index to end searching at (exclusive) + * @throws IndexOutOfBoundsException + * If start or end is less than zero, if + * start is greater than the length of the input sequence, if + * end is greater than the length of the input sequence, or if + * start is greater than end. + * @return this matcher + * @since 1.5 + */ + public Matcher region(int start, int end) { + if ((start < 0) || (start > getTextLength())) + throw new IndexOutOfBoundsException("start"); + if ((end < 0) || (end > getTextLength())) + throw new IndexOutOfBoundsException("end"); + if (start > end) + throw new IndexOutOfBoundsException("start > end"); + reset(); + from = start; + to = end; + return this; + } + + /** + * Reports the start index of this matcher's region. The + * searches this matcher conducts are limited to finding matches + * within {@link #regionStart regionStart} (inclusive) and + * {@link #regionEnd regionEnd} (exclusive). + * + * @return The starting point of this matcher's region + * @since 1.5 + */ + public int regionStart() { + return from; + } + + /** + * Reports the end index (exclusive) of this matcher's region. + * The searches this matcher conducts are limited to finding matches + * within {@link #regionStart regionStart} (inclusive) and + * {@link #regionEnd regionEnd} (exclusive). + * + * @return the ending point of this matcher's region + * @since 1.5 + */ + public int regionEnd() { + return to; + } + + /** + * Queries the transparency of region bounds for this matcher. + * + *

This method returns true if this matcher uses + * transparent bounds, false if it uses opaque + * bounds. + * + *

See {@link #useTransparentBounds useTransparentBounds} for a + * description of transparent and opaque bounds. + * + *

By default, a matcher uses opaque region boundaries. + * + * @return true iff this matcher is using transparent bounds, + * false otherwise. + * @see java.util.regex.Matcher#useTransparentBounds(boolean) + * @since 1.5 + */ + public boolean hasTransparentBounds() { + return transparentBounds; + } + + /** + * Sets the transparency of region bounds for this matcher. + * + *

Invoking this method with an argument of true will set this + * matcher to use transparent bounds. If the boolean + * argument is false, then opaque bounds will be used. + * + *

Using transparent bounds, the boundaries of this + * matcher's region are transparent to lookahead, lookbehind, + * and boundary matching constructs. Those constructs can see beyond the + * boundaries of the region to see if a match is appropriate. + * + *

Using opaque bounds, the boundaries of this matcher's + * region are opaque to lookahead, lookbehind, and boundary matching + * constructs that may try to see beyond them. Those constructs cannot + * look past the boundaries so they will fail to match anything outside + * of the region. + * + *

By default, a matcher uses opaque bounds. + * + * @param b a boolean indicating whether to use opaque or transparent + * regions + * @return this matcher + * @see java.util.regex.Matcher#hasTransparentBounds + * @since 1.5 + */ + public Matcher useTransparentBounds(boolean b) { + transparentBounds = b; + return this; + } + + /** + * Queries the anchoring of region bounds for this matcher. + * + *

This method returns true if this matcher uses + * anchoring bounds, false otherwise. + * + *

See {@link #useAnchoringBounds useAnchoringBounds} for a + * description of anchoring bounds. + * + *

By default, a matcher uses anchoring region boundaries. + * + * @return true iff this matcher is using anchoring bounds, + * false otherwise. + * @see java.util.regex.Matcher#useAnchoringBounds(boolean) + * @since 1.5 + */ + public boolean hasAnchoringBounds() { + return anchoringBounds; + } + + /** + * Sets the anchoring of region bounds for this matcher. + * + *

Invoking this method with an argument of true will set this + * matcher to use anchoring bounds. If the boolean + * argument is false, then non-anchoring bounds will be + * used. + * + *

Using anchoring bounds, the boundaries of this + * matcher's region match anchors such as ^ and $. + * + *

Without anchoring bounds, the boundaries of this + * matcher's region will not match anchors such as ^ and $. + * + *

By default, a matcher uses anchoring region boundaries. + * + * @param b a boolean indicating whether or not to use anchoring bounds. + * @return this matcher + * @see java.util.regex.Matcher#hasAnchoringBounds + * @since 1.5 + */ + public Matcher useAnchoringBounds(boolean b) { + anchoringBounds = b; + return this; + } + + /** + *

Returns the string representation of this matcher. The + * string representation of a Matcher contains information + * that may be useful for debugging. The exact format is unspecified. + * + * @return The string representation of this matcher + * @since 1.5 + */ + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("java.util.regex.Matcher"); + sb.append("[pattern=" + pattern()); + sb.append(" region="); + sb.append(regionStart() + "," + regionEnd()); + sb.append(" lastmatch="); + if ((first >= 0) && (group() != null)) { + sb.append(group()); + } + sb.append("]"); + return sb.toString(); + } + + /** + *

Returns true if the end of input was hit by the search engine in + * the last match operation performed by this matcher. + * + *

When this method returns true, then it is possible that more input + * would have changed the result of the last search. + * + * @return true iff the end of input was hit in the last match; false + * otherwise + * @since 1.5 + */ + public boolean hitEnd() { + return hitEnd; + } + + /** + *

Returns true if more input could change a positive match into a + * negative one. + * + *

If this method returns true, and a match was found, then more + * input could cause the match to be lost. If this method returns false + * and a match was found, then more input might change the match but the + * match won't be lost. If a match was not found, then requireEnd has no + * meaning. + * + * @return true iff more input could change a positive match into a + * negative one. + * @since 1.5 + */ + public boolean requireEnd() { + return requireEnd; + } + + /** + * Initiates a search to find a Pattern within the given bounds. + * The groups are filled with default values and the match of the root + * of the state machine is called. The state machine will hold the state + * of the match as it proceeds in this matcher. + * + * Matcher.from is not set here, because it is the "hard" boundary + * of the start of the search which anchors will set to. The from param + * is the "soft" boundary of the start of the search, meaning that the + * regex tries to match at that index but ^ won't match there. Subsequent + * calls to the search methods start at a new "soft" boundary which is + * the end of the previous match. + */ + boolean search(int from) { + this.hitEnd = false; + this.requireEnd = false; + from = from < 0 ? 0 : from; + this.first = from; + this.oldLast = oldLast < 0 ? from : oldLast; + for (int i = 0; i < groups.length; i++) + groups[i] = -1; + acceptMode = NOANCHOR; + boolean result = parentPattern.root.match(this, from, text); + if (!result) + this.first = -1; + this.oldLast = this.last; + return result; + } + + /** + * Initiates a search for an anchored match to a Pattern within the given + * bounds. The groups are filled with default values and the match of the + * root of the state machine is called. The state machine will hold the + * state of the match as it proceeds in this matcher. + */ + boolean match(int from, int anchor) { + this.hitEnd = false; + this.requireEnd = false; + from = from < 0 ? 0 : from; + this.first = from; + this.oldLast = oldLast < 0 ? from : oldLast; + for (int i = 0; i < groups.length; i++) + groups[i] = -1; + acceptMode = anchor; + boolean result = parentPattern.matchRoot.match(this, from, text); + if (!result) + this.first = -1; + this.oldLast = this.last; + return result; + } + + /** + * Returns the end index of the text. + * + * @return the index after the last character in the text + */ + int getTextLength() { + return text.length(); + } + + /** + * Generates a String from this Matcher's input in the specified range. + * + * @param beginIndex the beginning index, inclusive + * @param endIndex the ending index, exclusive + * @return A String generated from this Matcher's input + */ + CharSequence getSubSequence(int beginIndex, int endIndex) { + return text.subSequence(beginIndex, endIndex); + } + + /** + * Returns this Matcher's input character at index i. + * + * @return A char from the specified index + */ + char charAt(int i) { + return text.charAt(i); + } + +} diff -r 588d5bf7a560 -r bca65655b36b rt/emul/compact/src/main/java/java/util/regex/Pattern.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rt/emul/compact/src/main/java/java/util/regex/Pattern.java Mon Oct 07 16:13:27 2013 +0200 @@ -0,0 +1,5648 @@ +/* + * Copyright (c) 1999, 2011, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package java.util.regex; + +import java.security.AccessController; +import java.security.PrivilegedAction; +import java.text.CharacterIterator; +import java.text.Normalizer; +import java.util.Locale; +import java.util.Map; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Arrays; + + +/** + * A compiled representation of a regular expression. + * + *

A regular expression, specified as a string, must first be compiled into + * an instance of this class. The resulting pattern can then be used to create + * a {@link Matcher} object that can match arbitrary {@link + * java.lang.CharSequence character sequences} against the regular + * expression. All of the state involved in performing a match resides in the + * matcher, so many matchers can share the same pattern. + * + *

A typical invocation sequence is thus + * + *

+ * Pattern p = Pattern.{@link #compile compile}("a*b");
+ * Matcher m = p.{@link #matcher matcher}("aaaaab");
+ * boolean b = m.{@link Matcher#matches matches}();
+ * + *

A {@link #matches matches} method is defined by this class as a + * convenience for when a regular expression is used just once. This method + * compiles an expression and matches an input sequence against it in a single + * invocation. The statement + * + *

+ * boolean b = Pattern.matches("a*b", "aaaaab");
+ * + * is equivalent to the three statements above, though for repeated matches it + * is less efficient since it does not allow the compiled pattern to be reused. + * + *

Instances of this class are immutable and are safe for use by multiple + * concurrent threads. Instances of the {@link Matcher} class are not safe for + * such use. + * + * + * + *

Summary of regular-expression constructs

+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
ConstructMatches
 
Characters
xThe character x
\\The backslash character
\0nThe character with octal value 0n + * (0 <= n <= 7)
\0nnThe character with octal value 0nn + * (0 <= n <= 7)
\0mnnThe character with octal value 0mnn + * (0 <= m <= 3, + * 0 <= n <= 7)
\xhhThe character with hexadecimal value 0xhh
\uhhhhThe character with hexadecimal value 0xhhhh
\x{h...h}The character with hexadecimal value 0xh...h + * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT} + *  <= 0xh...h <=  + * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})
\tThe tab character ('\u0009')
\nThe newline (line feed) character ('\u000A')
\rThe carriage-return character ('\u000D')
\fThe form-feed character ('\u000C')
\aThe alert (bell) character ('\u0007')
\eThe escape character ('\u001B')
\cxThe control character corresponding to x
 
Character classes
[abc]a, b, or c (simple class)
[^abc]Any character except a, b, or c (negation)
[a-zA-Z]a through z + * or A through Z, inclusive (range)
[a-d[m-p]]a through d, + * or m through p: [a-dm-p] (union)
[a-z&&[def]]d, e, or f (intersection)
[a-z&&[^bc]]a through z, + * except for b and c: [ad-z] (subtraction)
[a-z&&[^m-p]]a through z, + * and not m through p: [a-lq-z](subtraction)
 
Predefined character classes
.Any character (may or may not match line terminators)
\dA digit: [0-9]
\DA non-digit: [^0-9]
\sA whitespace character: [ \t\n\x0B\f\r]
\SA non-whitespace character: [^\s]
\wA word character: [a-zA-Z_0-9]
\WA non-word character: [^\w]
 
POSIX character classes (US-ASCII only)
\p{Lower}A lower-case alphabetic character: [a-z]
\p{Upper}An upper-case alphabetic character:[A-Z]
\p{ASCII}All ASCII:[\x00-\x7F]
\p{Alpha}An alphabetic character:[\p{Lower}\p{Upper}]
\p{Digit}A decimal digit: [0-9]
\p{Alnum}An alphanumeric character:[\p{Alpha}\p{Digit}]
\p{Punct}Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
\p{Graph}A visible character: [\p{Alnum}\p{Punct}]
\p{Print}A printable character: [\p{Graph}\x20]
\p{Blank}A space or a tab: [ \t]
\p{Cntrl}A control character: [\x00-\x1F\x7F]
\p{XDigit}A hexadecimal digit: [0-9a-fA-F]
\p{Space}A whitespace character: [ \t\n\x0B\f\r]
 
java.lang.Character classes (simple java character type)
\p{javaLowerCase}Equivalent to java.lang.Character.isLowerCase()
\p{javaUpperCase}Equivalent to java.lang.Character.isUpperCase()
\p{javaWhitespace}Equivalent to java.lang.Character.isWhitespace()
\p{javaMirrored}Equivalent to java.lang.Character.isMirrored()
 
Classes for Unicode scripts, blocks, categories and binary properties
\p{IsLatin}A Latin script character (script)
\p{InGreek}A character in the Greek block (block)
\p{Lu}An uppercase letter (category)
\p{IsAlphabetic}An alphabetic character (binary property)
\p{Sc}A currency symbol
\P{InGreek}Any character except one in the Greek block (negation)
[\p{L}&&[^\p{Lu}]] Any letter except an uppercase letter (subtraction)
 
Boundary matchers
^The beginning of a line
$The end of a line
\bA word boundary
\BA non-word boundary
\AThe beginning of the input
\GThe end of the previous match
\ZThe end of the input but for the final + * terminator, if any
\zThe end of the input
 
Greedy quantifiers
X?X, once or not at all
X*X, zero or more times
X+X, one or more times
X{n}X, exactly n times
X{n,}X, at least n times
X{n,m}X, at least n but not more than m times
 
Reluctant quantifiers
X??X, once or not at all
X*?X, zero or more times
X+?X, one or more times
X{n}?X, exactly n times
X{n,}?X, at least n times
X{n,m}?X, at least n but not more than m times
 
Possessive quantifiers
X?+X, once or not at all
X*+X, zero or more times
X++X, one or more times
X{n}+X, exactly n times
X{n,}+X, at least n times
X{n,m}+X, at least n but not more than m times
 
Logical operators
XYX followed by Y
X|YEither X or Y
(X)X, as a capturing group
 
Back references
\nWhatever the nth + * capturing group matched
\k<name>Whatever the + * named-capturing group "name" matched
 
Quotation
\Nothing, but quotes the following character
\QNothing, but quotes all characters until \E
\ENothing, but ends quoting started by \Q
 
Special constructs (named-capturing and non-capturing)
(?<name>X)X, as a named-capturing group
(?:X)X, as a non-capturing group
(?idmsuxU-idmsuxU) Nothing, but turns match flags i + * d m s + * u x U + * on - off
(?idmsux-idmsux:X)  X, as a non-capturing group with the + * given flags i d + * m s u + * x on - off
(?=X)X, via zero-width positive lookahead
(?!X)X, via zero-width negative lookahead
(?<=X)X, via zero-width positive lookbehind
(?<!X)X, via zero-width negative lookbehind
(?>X)X, as an independent, non-capturing group
+ * + *
+ * + * + *
+ *

Backslashes, escapes, and quoting

+ * + *

The backslash character ('\') serves to introduce escaped + * constructs, as defined in the table above, as well as to quote characters + * that otherwise would be interpreted as unescaped constructs. Thus the + * expression \\ matches a single backslash and \{ matches a + * left brace. + * + *

It is an error to use a backslash prior to any alphabetic character that + * does not denote an escaped construct; these are reserved for future + * extensions to the regular-expression language. A backslash may be used + * prior to a non-alphabetic character regardless of whether that character is + * part of an unescaped construct. + * + *

Backslashes within string literals in Java source code are interpreted + * as required by + * The Java™ Language Specification + * as either Unicode escapes (section 3.3) or other character escapes (section 3.10.6) + * It is therefore necessary to double backslashes in string + * literals that represent regular expressions to protect them from + * interpretation by the Java bytecode compiler. The string literal + * "\b", for example, matches a single backspace character when + * interpreted as a regular expression, while "\\b" matches a + * word boundary. The string literal "\(hello\)" is illegal + * and leads to a compile-time error; in order to match the string + * (hello) the string literal "\\(hello\\)" + * must be used. + * + * + *

Character Classes

+ * + *

Character classes may appear within other character classes, and + * may be composed by the union operator (implicit) and the intersection + * operator (&&). + * The union operator denotes a class that contains every character that is + * in at least one of its operand classes. The intersection operator + * denotes a class that contains every character that is in both of its + * operand classes. + * + *

The precedence of character-class operators is as follows, from + * highest to lowest: + * + *

+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
1    Literal escape    \x
2    Grouping[...]
3    Rangea-z
4    Union[a-e][i-u]
5    Intersection[a-z&&[aeiou]]
+ * + *

Note that a different set of metacharacters are in effect inside + * a character class than outside a character class. For instance, the + * regular expression . loses its special meaning inside a + * character class, while the expression - becomes a range + * forming metacharacter. + * + * + *

Line terminators

+ * + *

A line terminator is a one- or two-character sequence that marks + * the end of a line of the input character sequence. The following are + * recognized as line terminators: + * + *

    + * + *
  • A newline (line feed) character ('\n'), + * + *
  • A carriage-return character followed immediately by a newline + * character ("\r\n"), + * + *
  • A standalone carriage-return character ('\r'), + * + *
  • A next-line character ('\u0085'), + * + *
  • A line-separator character ('\u2028'), or + * + *
  • A paragraph-separator character ('\u2029). + * + *
+ *

If {@link #UNIX_LINES} mode is activated, then the only line terminators + * recognized are newline characters. + * + *

The regular expression . matches any character except a line + * terminator unless the {@link #DOTALL} flag is specified. + * + *

By default, the regular expressions ^ and $ ignore + * line terminators and only match at the beginning and the end, respectively, + * of the entire input sequence. If {@link #MULTILINE} mode is activated then + * ^ matches at the beginning of input and after any line terminator + * except at the end of input. When in {@link #MULTILINE} mode $ + * matches just before a line terminator or the end of the input sequence. + * + * + *

Groups and capturing

+ * + * + *
Group number
+ *

Capturing groups are numbered by counting their opening parentheses from + * left to right. In the expression ((A)(B(C))), for example, there + * are four such groups:

+ * + *
+ * + * + * + * + * + * + * + * + *
1    ((A)(B(C)))
2    (A)
3    (B(C))
4    (C)
+ * + *

Group zero always stands for the entire expression. + * + *

Capturing groups are so named because, during a match, each subsequence + * of the input sequence that matches such a group is saved. The captured + * subsequence may be used later in the expression, via a back reference, and + * may also be retrieved from the matcher once the match operation is complete. + * + * + *

Group name
+ *

A capturing group can also be assigned a "name", a named-capturing group, + * and then be back-referenced later by the "name". Group names are composed of + * the following characters. The first character must be a letter. + * + *

    + *
  • The uppercase letters 'A' through 'Z' + * ('\u0041' through '\u005a'), + *
  • The lowercase letters 'a' through 'z' + * ('\u0061' through '\u007a'), + *
  • The digits '0' through '9' + * ('\u0030' through '\u0039'), + *
+ * + *

A named-capturing group is still numbered as described in + * Group number. + * + *

The captured input associated with a group is always the subsequence + * that the group most recently matched. If a group is evaluated a second time + * because of quantification then its previously-captured value, if any, will + * be retained if the second evaluation fails. Matching the string + * "aba" against the expression (a(b)?)+, for example, leaves + * group two set to "b". All captured input is discarded at the + * beginning of each match. + * + *

Groups beginning with (? are either pure, non-capturing groups + * that do not capture text and do not count towards the group total, or + * named-capturing group. + * + *

Unicode support

+ * + *

This class is in conformance with Level 1 of Unicode Technical + * Standard #18: Unicode Regular Expression, plus RL2.1 + * Canonical Equivalents. + *

+ * Unicode escape sequences such as \u2014 in Java source code + * are processed as described in section 3.3 of + * The Java™ Language Specification. + * Such escape sequences are also implemented directly by the regular-expression + * parser so that Unicode escapes can be used in expressions that are read from + * files or from the keyboard. Thus the strings "\u2014" and + * "\\u2014", while not equal, compile into the same pattern, which + * matches the character with hexadecimal value 0x2014. + *

+ * A Unicode character can also be represented in a regular-expression by + * using its Hex notation(hexadecimal code point value) directly as described in construct + * \x{...}, for example a supplementary character U+2011F + * can be specified as \x{2011F}, instead of two consecutive + * Unicode escape sequences of the surrogate pair + * \uD840\uDD1F. + *

+ * Unicode scripts, blocks, categories and binary properties are written with + * the \p and \P constructs as in Perl. + * \p{prop} matches if + * the input has the property prop, while \P{prop} + * does not match if the input has that property. + *

+ * Scripts, blocks, categories and binary properties can be used both inside + * and outside of a character class. + * + *

+ * Scripts are specified either with the prefix {@code Is}, as in + * {@code IsHiragana}, or by using the {@code script} keyword (or its short + * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}. + *

+ * The script names supported by Pattern are the valid script names + * accepted and defined by + * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}. + * + *

+ * Blocks are specified with the prefix {@code In}, as in + * {@code InMongolian}, or by using the keyword {@code block} (or its short + * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}. + *

+ * The block names supported by Pattern are the valid block names + * accepted and defined by + * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. + *

+ * + * Categories may be specified with the optional prefix {@code Is}: + * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode + * letters. Same as scripts and blocks, categories can also be specified + * by using the keyword {@code general_category} (or its short form + * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}. + *

+ * The supported categories are those of + * + * The Unicode Standard in the version specified by the + * {@link java.lang.Character Character} class. The category names are those + * defined in the Standard, both normative and informative. + *

+ * + * Binary properties are specified with the prefix {@code Is}, as in + * {@code IsAlphabetic}. The supported binary properties by Pattern + * are + *

+ + + *

+ * Predefined Character classes and POSIX character classes are in + * conformance with the recommendation of Annex C: Compatibility Properties + * of Unicode Regular Expression + * , when {@link #UNICODE_CHARACTER_CLASS} flag is specified. + *

+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
ClassesMatches
\p{Lower}A lowercase character:\p{IsLowercase}
\p{Upper}An uppercase character:\p{IsUppercase}
\p{ASCII}All ASCII:[\x00-\x7F]
\p{Alpha}An alphabetic character:\p{IsAlphabetic}
\p{Digit}A decimal digit character:p{IsDigit}
\p{Alnum}An alphanumeric character:[\p{IsAlphabetic}\p{IsDigit}]
\p{Punct}A punctuation character:p{IsPunctuation}
\p{Graph}A visible character: [^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]
\p{Print}A printable character: [\p{Graph}\p{Blank}&&[^\p{Cntrl}]]
\p{Blank}A space or a tab: [\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]
\p{Cntrl}A control character: \p{gc=Cc}
\p{XDigit}A hexadecimal digit: [\p{gc=Nd}\p{IsHex_Digit}]
\p{Space}A whitespace character:\p{IsWhite_Space}
\dA digit: \p{IsDigit}
\DA non-digit: [^\d]
\sA whitespace character: \p{IsWhite_Space}
\SA non-whitespace character: [^\s]
\wA word character: [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]
\WA non-word character: [^\w]
+ *

+ * + * Categories that behave like the java.lang.Character + * boolean ismethodname methods (except for the deprecated ones) are + * available through the same \p{prop} syntax where + * the specified property has the name javamethodname. + * + *

Comparison to Perl 5

+ * + *

The Pattern engine performs traditional NFA-based matching + * with ordered alternation as occurs in Perl 5. + * + *

Perl constructs not supported by this class:

+ * + *
+ * + *

Constructs supported by this class but not by Perl:

+ * + * + * + *

Notable differences from Perl:

+ * + * + * + * + *

For a more precise description of the behavior of regular expression + * constructs, please see + * Mastering Regular Expressions, 3nd Edition, Jeffrey E. F. Friedl, + * O'Reilly and Associates, 2006. + *

+ * + * @see java.lang.String#split(String, int) + * @see java.lang.String#split(String) + * + * @author Mike McCloskey + * @author Mark Reinhold + * @author JSR-51 Expert Group + * @since 1.4 + * @spec JSR-51 + */ + +public final class Pattern + implements java.io.Serializable +{ + + /** + * Regular expression modifier values. Instead of being passed as + * arguments, they can also be passed as inline modifiers. + * For example, the following statements have the same effect. + *
+     * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M);
+     * RegExp r2 = RegExp.compile("(?im)abc", 0);
+     * 
+ * + * The flags are duplicated so that the familiar Perl match flag + * names are available. + */ + + /** + * Enables Unix lines mode. + * + *

In this mode, only the '\n' line terminator is recognized + * in the behavior of ., ^, and $. + * + *

Unix lines mode can also be enabled via the embedded flag + * expression (?d). + */ + public static final int UNIX_LINES = 0x01; + + /** + * Enables case-insensitive matching. + * + *

By default, case-insensitive matching assumes that only characters + * in the US-ASCII charset are being matched. Unicode-aware + * case-insensitive matching can be enabled by specifying the {@link + * #UNICODE_CASE} flag in conjunction with this flag. + * + *

Case-insensitive matching can also be enabled via the embedded flag + * expression (?i). + * + *

Specifying this flag may impose a slight performance penalty.

+ */ + public static final int CASE_INSENSITIVE = 0x02; + + /** + * Permits whitespace and comments in pattern. + * + *

In this mode, whitespace is ignored, and embedded comments starting + * with # are ignored until the end of a line. + * + *

Comments mode can also be enabled via the embedded flag + * expression (?x). + */ + public static final int COMMENTS = 0x04; + + /** + * Enables multiline mode. + * + *

In multiline mode the expressions ^ and $ match + * just after or just before, respectively, a line terminator or the end of + * the input sequence. By default these expressions only match at the + * beginning and the end of the entire input sequence. + * + *

Multiline mode can also be enabled via the embedded flag + * expression (?m).

+ */ + public static final int MULTILINE = 0x08; + + /** + * Enables literal parsing of the pattern. + * + *

When this flag is specified then the input string that specifies + * the pattern is treated as a sequence of literal characters. + * Metacharacters or escape sequences in the input sequence will be + * given no special meaning. + * + *

The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on + * matching when used in conjunction with this flag. The other flags + * become superfluous. + * + *

There is no embedded flag character for enabling literal parsing. + * @since 1.5 + */ + public static final int LITERAL = 0x10; + + /** + * Enables dotall mode. + * + *

In dotall mode, the expression . matches any character, + * including a line terminator. By default this expression does not match + * line terminators. + * + *

Dotall mode can also be enabled via the embedded flag + * expression (?s). (The s is a mnemonic for + * "single-line" mode, which is what this is called in Perl.)

+ */ + public static final int DOTALL = 0x20; + + /** + * Enables Unicode-aware case folding. + * + *

When this flag is specified then case-insensitive matching, when + * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner + * consistent with the Unicode Standard. By default, case-insensitive + * matching assumes that only characters in the US-ASCII charset are being + * matched. + * + *

Unicode-aware case folding can also be enabled via the embedded flag + * expression (?u). + * + *

Specifying this flag may impose a performance penalty.

+ */ + public static final int UNICODE_CASE = 0x40; + + /** + * Enables canonical equivalence. + * + *

When this flag is specified then two characters will be considered + * to match if, and only if, their full canonical decompositions match. + * The expression "a\u030A", for example, will match the + * string "\u00E5" when this flag is specified. By default, + * matching does not take canonical equivalence into account. + * + *

There is no embedded flag character for enabling canonical + * equivalence. + * + *

Specifying this flag may impose a performance penalty.

+ */ + public static final int CANON_EQ = 0x80; + + /** + * Enables the Unicode version of Predefined character classes and + * POSIX character classes. + * + *

When this flag is specified then the (US-ASCII only) + * Predefined character classes and POSIX character classes + * are in conformance with + * Unicode Technical + * Standard #18: Unicode Regular Expression + * Annex C: Compatibility Properties. + *

+ * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded + * flag expression (?U). + *

+ * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case + * folding. + *

+ * Specifying this flag may impose a performance penalty.

+ * @since 1.7 + */ + public static final int UNICODE_CHARACTER_CLASS = 0x100; + + /* Pattern has only two serialized components: The pattern string + * and the flags, which are all that is needed to recompile the pattern + * when it is deserialized. + */ + + /** use serialVersionUID from Merlin b59 for interoperability */ + private static final long serialVersionUID = 5073258162644648461L; + + /** + * The original regular-expression pattern string. + * + * @serial + */ + private String pattern; + + /** + * The original pattern flags. + * + * @serial + */ + private int flags; + + /** + * Boolean indicating this Pattern is compiled; this is necessary in order + * to lazily compile deserialized Patterns. + */ + private transient volatile boolean compiled = false; + + /** + * The normalized pattern string. + */ + private transient String normalizedPattern; + + /** + * The starting point of state machine for the find operation. This allows + * a match to start anywhere in the input. + */ + transient Node root; + + /** + * The root of object tree for a match operation. The pattern is matched + * at the beginning. This may include a find that uses BnM or a First + * node. + */ + transient Node matchRoot; + + /** + * Temporary storage used by parsing pattern slice. + */ + transient int[] buffer; + + /** + * Map the "name" of the "named capturing group" to its group id + * node. + */ + transient volatile Map namedGroups; + + /** + * Temporary storage used while parsing group references. + */ + transient GroupHead[] groupNodes; + + /** + * Temporary null terminated code point array used by pattern compiling. + */ + private transient int[] temp; + + /** + * The number of capturing groups in this Pattern. Used by matchers to + * allocate storage needed to perform a match. + */ + transient int capturingGroupCount; + + /** + * The local variable count used by parsing tree. Used by matchers to + * allocate storage needed to perform a match. + */ + transient int localCount; + + /** + * Index into the pattern string that keeps track of how much has been + * parsed. + */ + private transient int cursor; + + /** + * Holds the length of the pattern string. + */ + private transient int patternLength; + + /** + * If the Start node might possibly match supplementary characters. + * It is set to true during compiling if + * (1) There is supplementary char in pattern, or + * (2) There is complement node of Category or Block + */ + private transient boolean hasSupplementary; + + /** + * Compiles the given regular expression into a pattern.

+ * + * @param regex + * The expression to be compiled + * + * @throws PatternSyntaxException + * If the expression's syntax is invalid + */ + public static Pattern compile(String regex) { + return new Pattern(regex, 0); + } + + /** + * Compiles the given regular expression into a pattern with the given + * flags.

+ * + * @param regex + * The expression to be compiled + * + * @param flags + * Match flags, a bit mask that may include + * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL}, + * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES}, + * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS} + * and {@link #COMMENTS} + * + * @throws IllegalArgumentException + * If bit values other than those corresponding to the defined + * match flags are set in flags + * + * @throws PatternSyntaxException + * If the expression's syntax is invalid + */ + public static Pattern compile(String regex, int flags) { + return new Pattern(regex, flags); + } + + /** + * Returns the regular expression from which this pattern was compiled. + *

+ * + * @return The source of this pattern + */ + public String pattern() { + return pattern; + } + + /** + *

Returns the string representation of this pattern. This + * is the regular expression from which this pattern was + * compiled.

+ * + * @return The string representation of this pattern + * @since 1.5 + */ + public String toString() { + return pattern; + } + + /** + * Creates a matcher that will match the given input against this pattern. + *

+ * + * @param input + * The character sequence to be matched + * + * @return A new matcher for this pattern + */ + public Matcher matcher(CharSequence input) { + if (!compiled) { + synchronized(this) { + if (!compiled) + compile(); + } + } + Matcher m = new Matcher(this, input); + return m; + } + + /** + * Returns this pattern's match flags.

+ * + * @return The match flags specified when this pattern was compiled + */ + public int flags() { + return flags; + } + + /** + * Compiles the given regular expression and attempts to match the given + * input against it. + * + *

An invocation of this convenience method of the form + * + *

+     * Pattern.matches(regex, input);
+ * + * behaves in exactly the same way as the expression + * + *
+     * Pattern.compile(regex).matcher(input).matches()
+ * + *

If a pattern is to be used multiple times, compiling it once and reusing + * it will be more efficient than invoking this method each time.

+ * + * @param regex + * The expression to be compiled + * + * @param input + * The character sequence to be matched + * + * @throws PatternSyntaxException + * If the expression's syntax is invalid + */ + public static boolean matches(String regex, CharSequence input) { + Pattern p = Pattern.compile(regex); + Matcher m = p.matcher(input); + return m.matches(); + } + + /** + * Splits the given input sequence around matches of this pattern. + * + *

The array returned by this method contains each substring of the + * input sequence that is terminated by another subsequence that matches + * this pattern or is terminated by the end of the input sequence. The + * substrings in the array are in the order in which they occur in the + * input. If this pattern does not match any subsequence of the input then + * the resulting array has just one element, namely the input sequence in + * string form. + * + *

The limit parameter controls the number of times the + * pattern is applied and therefore affects the length of the resulting + * array. If the limit n is greater than zero then the pattern + * will be applied at most n - 1 times, the array's + * length will be no greater than n, and the array's last entry + * will contain all input beyond the last matched delimiter. If n + * is non-positive then the pattern will be applied as many times as + * possible and the array can have any length. If n is zero then + * the pattern will be applied as many times as possible, the array can + * have any length, and trailing empty strings will be discarded. + * + *

The input "boo:and:foo", for example, yields the following + * results with these parameters: + * + *

+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *

Regex    

Limit    

Result    

:2{ "boo", "and:foo" }
:5{ "boo", "and", "foo" }
:-2{ "boo", "and", "foo" }
o5{ "b", "", ":and:f", "", "" }
o-2{ "b", "", ":and:f", "", "" }
o0{ "b", "", ":and:f" }
+ * + * + * @param input + * The character sequence to be split + * + * @param limit + * The result threshold, as described above + * + * @return The array of strings computed by splitting the input + * around matches of this pattern + */ + public String[] split(CharSequence input, int limit) { + int index = 0; + boolean matchLimited = limit > 0; + ArrayList matchList = new ArrayList<>(); + Matcher m = matcher(input); + + // Add segments before each match found + while(m.find()) { + if (!matchLimited || matchList.size() < limit - 1) { + String match = input.subSequence(index, m.start()).toString(); + matchList.add(match); + index = m.end(); + } else if (matchList.size() == limit - 1) { // last one + String match = input.subSequence(index, + input.length()).toString(); + matchList.add(match); + index = m.end(); + } + } + + // If no match was found, return this + if (index == 0) + return new String[] {input.toString()}; + + // Add remaining segment + if (!matchLimited || matchList.size() < limit) + matchList.add(input.subSequence(index, input.length()).toString()); + + // Construct result + int resultSize = matchList.size(); + if (limit == 0) + while (resultSize > 0 && matchList.get(resultSize-1).equals("")) + resultSize--; + String[] result = new String[resultSize]; + return matchList.subList(0, resultSize).toArray(result); + } + + /** + * Splits the given input sequence around matches of this pattern. + * + *

This method works as if by invoking the two-argument {@link + * #split(java.lang.CharSequence, int) split} method with the given input + * sequence and a limit argument of zero. Trailing empty strings are + * therefore not included in the resulting array.

+ * + *

The input "boo:and:foo", for example, yields the following + * results with these expressions: + * + *

+ * + * + * + * + * + * + *

Regex    

Result

:{ "boo", "and", "foo" }
o{ "b", "", ":and:f" }
+ * + * + * @param input + * The character sequence to be split + * + * @return The array of strings computed by splitting the input + * around matches of this pattern + */ + public String[] split(CharSequence input) { + return split(input, 0); + } + + /** + * Returns a literal pattern String for the specified + * String. + * + *

This method produces a String that can be used to + * create a Pattern that would match the string + * s as if it were a literal pattern.

Metacharacters + * or escape sequences in the input sequence will be given no special + * meaning. + * + * @param s The string to be literalized + * @return A literal string replacement + * @since 1.5 + */ + public static String quote(String s) { + int slashEIndex = s.indexOf("\\E"); + if (slashEIndex == -1) + return "\\Q" + s + "\\E"; + + StringBuilder sb = new StringBuilder(s.length() * 2); + sb.append("\\Q"); + slashEIndex = 0; + int current = 0; + while ((slashEIndex = s.indexOf("\\E", current)) != -1) { + sb.append(s.substring(current, slashEIndex)); + current = slashEIndex + 2; + sb.append("\\E\\\\E\\Q"); + } + sb.append(s.substring(current, s.length())); + sb.append("\\E"); + return sb.toString(); + } + + /** + * Recompile the Pattern instance from a stream. The original pattern + * string is read in and the object tree is recompiled from it. + */ + private void readObject(java.io.ObjectInputStream s) + throws java.io.IOException, ClassNotFoundException { + + // Read in all fields + s.defaultReadObject(); + + // Initialize counts + capturingGroupCount = 1; + localCount = 0; + + // if length > 0, the Pattern is lazily compiled + compiled = false; + if (pattern.length() == 0) { + root = new Start(lastAccept); + matchRoot = lastAccept; + compiled = true; + } + } + + /** + * This private constructor is used to create all Patterns. The pattern + * string and match flags are all that is needed to completely describe + * a Pattern. An empty pattern string results in an object tree with + * only a Start node and a LastNode node. + */ + private Pattern(String p, int f) { + pattern = p; + flags = f; + + // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present + if ((flags & UNICODE_CHARACTER_CLASS) != 0) + flags |= UNICODE_CASE; + + // Reset group index count + capturingGroupCount = 1; + localCount = 0; + + if (pattern.length() > 0) { + compile(); + } else { + root = new Start(lastAccept); + matchRoot = lastAccept; + } + } + + /** + * The pattern is converted to normalizedD form and then a pure group + * is constructed to match canonical equivalences of the characters. + */ + private void normalize() { + boolean inCharClass = false; + int lastCodePoint = -1; + + // Convert pattern into normalizedD form + normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD); + patternLength = normalizedPattern.length(); + + // Modify pattern to match canonical equivalences + StringBuilder newPattern = new StringBuilder(patternLength); + for(int i=0; i= patternLength) + break; + c = normalizedPattern.codePointAt(i); + sequenceBuffer.appendCodePoint(c); + } + String ea = produceEquivalentAlternation( + sequenceBuffer.toString()); + newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint)); + newPattern.append("(?:").append(ea).append(")"); + } else if (c == '[' && lastCodePoint != '\\') { + i = normalizeCharClass(newPattern, i); + } else { + newPattern.appendCodePoint(c); + } + lastCodePoint = c; + i += Character.charCount(c); + } + normalizedPattern = newPattern.toString(); + } + + /** + * Complete the character class being parsed and add a set + * of alternations to it that will match the canonical equivalences + * of the characters within the class. + */ + private int normalizeCharClass(StringBuilder newPattern, int i) { + StringBuilder charClass = new StringBuilder(); + StringBuilder eq = null; + int lastCodePoint = -1; + String result; + + i++; + charClass.append("["); + while(true) { + int c = normalizedPattern.codePointAt(i); + StringBuilder sequenceBuffer; + + if (c == ']' && lastCodePoint != '\\') { + charClass.append((char)c); + break; + } else if (Character.getType(c) == Character.NON_SPACING_MARK) { + sequenceBuffer = new StringBuilder(); + sequenceBuffer.appendCodePoint(lastCodePoint); + while(Character.getType(c) == Character.NON_SPACING_MARK) { + sequenceBuffer.appendCodePoint(c); + i += Character.charCount(c); + if (i >= normalizedPattern.length()) + break; + c = normalizedPattern.codePointAt(i); + } + String ea = produceEquivalentAlternation( + sequenceBuffer.toString()); + + charClass.setLength(charClass.length()-Character.charCount(lastCodePoint)); + if (eq == null) + eq = new StringBuilder(); + eq.append('|'); + eq.append(ea); + } else { + charClass.appendCodePoint(c); + i++; + } + if (i == normalizedPattern.length()) + throw error("Unclosed character class"); + lastCodePoint = c; + } + + if (eq != null) { + result = "(?:"+charClass.toString()+eq.toString()+")"; + } else { + result = charClass.toString(); + } + + newPattern.append(result); + return i; + } + + /** + * Given a specific sequence composed of a regular character and + * combining marks that follow it, produce the alternation that will + * match all canonical equivalences of that sequence. + */ + private String produceEquivalentAlternation(String source) { + int len = countChars(source, 0, 1); + if (source.length() == len) + // source has one character. + return source; + + String base = source.substring(0,len); + String combiningMarks = source.substring(len); + + String[] perms = producePermutations(combiningMarks); + StringBuilder result = new StringBuilder(source); + + // Add combined permutations + for(int x=0; x0) + result.append("|"+next); + next = composeOneStep(next); + if (next != null) + result.append("|"+produceEquivalentAlternation(next)); + } + return result.toString(); + } + + /** + * Returns an array of strings that have all the possible + * permutations of the characters in the input string. + * This is used to get a list of all possible orderings + * of a set of combining marks. Note that some of the permutations + * are invalid because of combining class collisions, and these + * possibilities must be removed because they are not canonically + * equivalent. + */ + private String[] producePermutations(String input) { + if (input.length() == countChars(input, 0, 1)) + return new String[] {input}; + + if (input.length() == countChars(input, 0, 2)) { + int c0 = Character.codePointAt(input, 0); + int c1 = Character.codePointAt(input, Character.charCount(c0)); + if (getClass(c1) == getClass(c0)) { + return new String[] {input}; + } + String[] result = new String[2]; + result[0] = input; + StringBuilder sb = new StringBuilder(2); + sb.appendCodePoint(c1); + sb.appendCodePoint(c0); + result[1] = sb.toString(); + return result; + } + + int length = 1; + int nCodePoints = countCodePoints(input); + for(int x=1; x=0; y--) { + if (combClass[y] == combClass[x]) { + continue loop; + } + } + StringBuilder sb = new StringBuilder(input); + String otherChars = sb.delete(offset, offset+len).toString(); + String[] subResult = producePermutations(otherChars); + + String prefix = input.substring(offset, offset+len); + for(int y=0; y= pLen - 1) // No \Q sequence found + return; + int j = i; + i += 2; + int[] newtemp = new int[j + 2*(pLen-i) + 2]; + System.arraycopy(temp, 0, newtemp, 0, j); + + boolean inQuote = true; + while (i < pLen) { + int c = temp[i++]; + if (! ASCII.isAscii(c) || ASCII.isAlnum(c)) { + newtemp[j++] = c; + } else if (c != '\\') { + if (inQuote) newtemp[j++] = '\\'; + newtemp[j++] = c; + } else if (inQuote) { + if (temp[i] == 'E') { + i++; + inQuote = false; + } else { + newtemp[j++] = '\\'; + newtemp[j++] = '\\'; + } + } else { + if (temp[i] == 'Q') { + i++; + inQuote = true; + } else { + newtemp[j++] = c; + if (i != pLen) + newtemp[j++] = temp[i++]; + } + } + } + + patternLength = j; + temp = Arrays.copyOf(newtemp, j + 2); // double zero termination + } + + /** + * Copies regular expression to an int array and invokes the parsing + * of the expression which will create the object tree. + */ + private void compile() { + // Handle canonical equivalences + if (has(CANON_EQ) && !has(LITERAL)) { + normalize(); + } else { + normalizedPattern = pattern; + } + patternLength = normalizedPattern.length(); + + // Copy pattern to int array for convenience + // Use double zero to terminate pattern + temp = new int[patternLength + 2]; + + hasSupplementary = false; + int c, count = 0; + // Convert all chars into code points + for (int x = 0; x < patternLength; x += Character.charCount(c)) { + c = normalizedPattern.codePointAt(x); + if (isSupplementary(c)) { + hasSupplementary = true; + } + temp[count++] = c; + } + + patternLength = count; // patternLength now in code points + + if (! has(LITERAL)) + RemoveQEQuoting(); + + // Allocate all temporary objects here. + buffer = new int[32]; + groupNodes = new GroupHead[10]; + namedGroups = null; + + if (has(LITERAL)) { + // Literal pattern handling + matchRoot = newSlice(temp, patternLength, hasSupplementary); + matchRoot.next = lastAccept; + } else { + // Start recursive descent parsing + matchRoot = expr(lastAccept); + // Check extra pattern characters + if (patternLength != cursor) { + if (peek() == ')') { + throw error("Unmatched closing ')'"); + } else { + throw error("Unexpected internal error"); + } + } + } + + // Peephole optimization + if (matchRoot instanceof Slice) { + root = BnM.optimize(matchRoot); + if (root == matchRoot) { + root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); + } + } else if (matchRoot instanceof Begin || matchRoot instanceof First) { + root = matchRoot; + } else { + root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); + } + + // Release temporary storage + temp = null; + buffer = null; + groupNodes = null; + patternLength = 0; + compiled = true; + } + + Map namedGroups() { + if (namedGroups == null) + namedGroups = new HashMap<>(2); + return namedGroups; + } + + /** + * Used to print out a subtree of the Pattern to help with debugging. + */ + private static void printObjectTree(Node node) { + while(node != null) { + if (node instanceof Prolog) { + System.out.println(node); + printObjectTree(((Prolog)node).loop); + System.out.println("**** end contents prolog loop"); + } else if (node instanceof Loop) { + System.out.println(node); + printObjectTree(((Loop)node).body); + System.out.println("**** end contents Loop body"); + } else if (node instanceof Curly) { + System.out.println(node); + printObjectTree(((Curly)node).atom); + System.out.println("**** end contents Curly body"); + } else if (node instanceof GroupCurly) { + System.out.println(node); + printObjectTree(((GroupCurly)node).atom); + System.out.println("**** end contents GroupCurly body"); + } else if (node instanceof GroupTail) { + System.out.println(node); + System.out.println("Tail next is "+node.next); + return; + } else { + System.out.println(node); + } + node = node.next; + if (node != null) + System.out.println("->next:"); + if (node == Pattern.accept) { + System.out.println("Accept Node"); + node = null; + } + } + } + + /** + * Used to accumulate information about a subtree of the object graph + * so that optimizations can be applied to the subtree. + */ + static final class TreeInfo { + int minLength; + int maxLength; + boolean maxValid; + boolean deterministic; + + TreeInfo() { + reset(); + } + void reset() { + minLength = 0; + maxLength = 0; + maxValid = true; + deterministic = true; + } + } + + /* + * The following private methods are mainly used to improve the + * readability of the code. In order to let the Java compiler easily + * inline them, we should not put many assertions or error checks in them. + */ + + /** + * Indicates whether a particular flag is set or not. + */ + private boolean has(int f) { + return (flags & f) != 0; + } + + /** + * Match next character, signal error if failed. + */ + private void accept(int ch, String s) { + int testChar = temp[cursor++]; + if (has(COMMENTS)) + testChar = parsePastWhitespace(testChar); + if (ch != testChar) { + throw error(s); + } + } + + /** + * Mark the end of pattern with a specific character. + */ + private void mark(int c) { + temp[patternLength] = c; + } + + /** + * Peek the next character, and do not advance the cursor. + */ + private int peek() { + int ch = temp[cursor]; + if (has(COMMENTS)) + ch = peekPastWhitespace(ch); + return ch; + } + + /** + * Read the next character, and advance the cursor by one. + */ + private int read() { + int ch = temp[cursor++]; + if (has(COMMENTS)) + ch = parsePastWhitespace(ch); + return ch; + } + + /** + * Read the next character, and advance the cursor by one, + * ignoring the COMMENTS setting + */ + private int readEscaped() { + int ch = temp[cursor++]; + return ch; + } + + /** + * Advance the cursor by one, and peek the next character. + */ + private int next() { + int ch = temp[++cursor]; + if (has(COMMENTS)) + ch = peekPastWhitespace(ch); + return ch; + } + + /** + * Advance the cursor by one, and peek the next character, + * ignoring the COMMENTS setting + */ + private int nextEscaped() { + int ch = temp[++cursor]; + return ch; + } + + /** + * If in xmode peek past whitespace and comments. + */ + private int peekPastWhitespace(int ch) { + while (ASCII.isSpace(ch) || ch == '#') { + while (ASCII.isSpace(ch)) + ch = temp[++cursor]; + if (ch == '#') { + ch = peekPastLine(); + } + } + return ch; + } + + /** + * If in xmode parse past whitespace and comments. + */ + private int parsePastWhitespace(int ch) { + while (ASCII.isSpace(ch) || ch == '#') { + while (ASCII.isSpace(ch)) + ch = temp[cursor++]; + if (ch == '#') + ch = parsePastLine(); + } + return ch; + } + + /** + * xmode parse past comment to end of line. + */ + private int parsePastLine() { + int ch = temp[cursor++]; + while (ch != 0 && !isLineSeparator(ch)) + ch = temp[cursor++]; + return ch; + } + + /** + * xmode peek past comment to end of line. + */ + private int peekPastLine() { + int ch = temp[++cursor]; + while (ch != 0 && !isLineSeparator(ch)) + ch = temp[++cursor]; + return ch; + } + + /** + * Determines if character is a line separator in the current mode + */ + private boolean isLineSeparator(int ch) { + if (has(UNIX_LINES)) { + return ch == '\n'; + } else { + return (ch == '\n' || + ch == '\r' || + (ch|1) == '\u2029' || + ch == '\u0085'); + } + } + + /** + * Read the character after the next one, and advance the cursor by two. + */ + private int skip() { + int i = cursor; + int ch = temp[i+1]; + cursor = i + 2; + return ch; + } + + /** + * Unread one next character, and retreat cursor by one. + */ + private void unread() { + cursor--; + } + + /** + * Internal method used for handling all syntax errors. The pattern is + * displayed with a pointer to aid in locating the syntax error. + */ + private PatternSyntaxException error(String s) { + return new PatternSyntaxException(s, normalizedPattern, cursor - 1); + } + + /** + * Determines if there is any supplementary character or unpaired + * surrogate in the specified range. + */ + private boolean findSupplementary(int start, int end) { + for (int i = start; i < end; i++) { + if (isSupplementary(temp[i])) + return true; + } + return false; + } + + /** + * Determines if the specified code point is a supplementary + * character or unpaired surrogate. + */ + private static final boolean isSupplementary(int ch) { + return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || + Character.isSurrogate((char)ch); + } + + /** + * The following methods handle the main parsing. They are sorted + * according to their precedence order, the lowest one first. + */ + + /** + * The expression is parsed with branch nodes added for alternations. + * This may be called recursively to parse sub expressions that may + * contain alternations. + */ + private Node expr(Node end) { + Node prev = null; + Node firstTail = null; + Node branchConn = null; + + for (;;) { + Node node = sequence(end); + Node nodeTail = root; //double return + if (prev == null) { + prev = node; + firstTail = nodeTail; + } else { + // Branch + if (branchConn == null) { + branchConn = new BranchConn(); + branchConn.next = end; + } + if (node == end) { + // if the node returned from sequence() is "end" + // we have an empty expr, set a null atom into + // the branch to indicate to go "next" directly. + node = null; + } else { + // the "tail.next" of each atom goes to branchConn + nodeTail.next = branchConn; + } + if (prev instanceof Branch) { + ((Branch)prev).add(node); + } else { + if (prev == end) { + prev = null; + } else { + // replace the "end" with "branchConn" at its tail.next + // when put the "prev" into the branch as the first atom. + firstTail.next = branchConn; + } + prev = new Branch(prev, node, branchConn); + } + } + if (peek() != '|') { + return prev; + } + next(); + } + } + + /** + * Parsing of sequences between alternations. + */ + private Node sequence(Node end) { + Node head = null; + Node tail = null; + Node node = null; + LOOP: + for (;;) { + int ch = peek(); + switch (ch) { + case '(': + // Because group handles its own closure, + // we need to treat it differently + node = group0(); + // Check for comment or flag group + if (node == null) + continue; + if (head == null) + head = node; + else + tail.next = node; + // Double return: Tail was returned in root + tail = root; + continue; + case '[': + node = clazz(true); + break; + case '\\': + ch = nextEscaped(); + if (ch == 'p' || ch == 'P') { + boolean oneLetter = true; + boolean comp = (ch == 'P'); + ch = next(); // Consume { if present + if (ch != '{') { + unread(); + } else { + oneLetter = false; + } + node = family(oneLetter, comp); + } else { + unread(); + node = atom(); + } + break; + case '^': + next(); + if (has(MULTILINE)) { + if (has(UNIX_LINES)) + node = new UnixCaret(); + else + node = new Caret(); + } else { + node = new Begin(); + } + break; + case '$': + next(); + if (has(UNIX_LINES)) + node = new UnixDollar(has(MULTILINE)); + else + node = new Dollar(has(MULTILINE)); + break; + case '.': + next(); + if (has(DOTALL)) { + node = new All(); + } else { + if (has(UNIX_LINES)) + node = new UnixDot(); + else { + node = new Dot(); + } + } + break; + case '|': + case ')': + break LOOP; + case ']': // Now interpreting dangling ] and } as literals + case '}': + node = atom(); + break; + case '?': + case '*': + case '+': + next(); + throw error("Dangling meta character '" + ((char)ch) + "'"); + case 0: + if (cursor >= patternLength) { + break LOOP; + } + // Fall through + default: + node = atom(); + break; + } + + node = closure(node); + + if (head == null) { + head = tail = node; + } else { + tail.next = node; + tail = node; + } + } + if (head == null) { + return end; + } + tail.next = end; + root = tail; //double return + return head; + } + + /** + * Parse and add a new Single or Slice. + */ + private Node atom() { + int first = 0; + int prev = -1; + boolean hasSupplementary = false; + int ch = peek(); + for (;;) { + switch (ch) { + case '*': + case '+': + case '?': + case '{': + if (first > 1) { + cursor = prev; // Unwind one character + first--; + } + break; + case '$': + case '.': + case '^': + case '(': + case '[': + case '|': + case ')': + break; + case '\\': + ch = nextEscaped(); + if (ch == 'p' || ch == 'P') { // Property + if (first > 0) { // Slice is waiting; handle it first + unread(); + break; + } else { // No slice; just return the family node + boolean comp = (ch == 'P'); + boolean oneLetter = true; + ch = next(); // Consume { if present + if (ch != '{') + unread(); + else + oneLetter = false; + return family(oneLetter, comp); + } + } + unread(); + prev = cursor; + ch = escape(false, first == 0); + if (ch >= 0) { + append(ch, first); + first++; + if (isSupplementary(ch)) { + hasSupplementary = true; + } + ch = peek(); + continue; + } else if (first == 0) { + return root; + } + // Unwind meta escape sequence + cursor = prev; + break; + case 0: + if (cursor >= patternLength) { + break; + } + // Fall through + default: + prev = cursor; + append(ch, first); + first++; + if (isSupplementary(ch)) { + hasSupplementary = true; + } + ch = next(); + continue; + } + break; + } + if (first == 1) { + return newSingle(buffer[0]); + } else { + return newSlice(buffer, first, hasSupplementary); + } + } + + private void append(int ch, int len) { + if (len >= buffer.length) { + int[] tmp = new int[len+len]; + System.arraycopy(buffer, 0, tmp, 0, len); + buffer = tmp; + } + buffer[len] = ch; + } + + /** + * Parses a backref greedily, taking as many numbers as it + * can. The first digit is always treated as a backref, but + * multi digit numbers are only treated as a backref if at + * least that many backrefs exist at this point in the regex. + */ + private Node ref(int refNum) { + boolean done = false; + while(!done) { + int ch = peek(); + switch(ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + int newRefNum = (refNum * 10) + (ch - '0'); + // Add another number if it doesn't make a group + // that doesn't exist + if (capturingGroupCount - 1 < newRefNum) { + done = true; + break; + } + refNum = newRefNum; + read(); + break; + default: + done = true; + break; + } + } + if (has(CASE_INSENSITIVE)) + return new CIBackRef(refNum, has(UNICODE_CASE)); + else + return new BackRef(refNum); + } + + /** + * Parses an escape sequence to determine the actual value that needs + * to be matched. + * If -1 is returned and create was true a new object was added to the tree + * to handle the escape sequence. + * If the returned value is greater than zero, it is the value that + * matches the escape sequence. + */ + private int escape(boolean inclass, boolean create) { + int ch = skip(); + switch (ch) { + case '0': + return o(); + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (inclass) break; + if (create) { + root = ref((ch - '0')); + } + return -1; + case 'A': + if (inclass) break; + if (create) root = new Begin(); + return -1; + case 'B': + if (inclass) break; + if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); + return -1; + case 'C': + break; + case 'D': + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.DIGIT).complement() + : new Ctype(ASCII.DIGIT).complement(); + return -1; + case 'E': + case 'F': + break; + case 'G': + if (inclass) break; + if (create) root = new LastMatch(); + return -1; + case 'H': + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'O': + case 'P': + case 'Q': + case 'R': + break; + case 'S': + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.WHITE_SPACE).complement() + : new Ctype(ASCII.SPACE).complement(); + return -1; + case 'T': + case 'U': + case 'V': + break; + case 'W': + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.WORD).complement() + : new Ctype(ASCII.WORD).complement(); + return -1; + case 'X': + case 'Y': + break; + case 'Z': + if (inclass) break; + if (create) { + if (has(UNIX_LINES)) + root = new UnixDollar(false); + else + root = new Dollar(false); + } + return -1; + case 'a': + return '\007'; + case 'b': + if (inclass) break; + if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS)); + return -1; + case 'c': + return c(); + case 'd': + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.DIGIT) + : new Ctype(ASCII.DIGIT); + return -1; + case 'e': + return '\033'; + case 'f': + return '\f'; + case 'g': + case 'h': + case 'i': + case 'j': + break; + case 'k': + if (inclass) + break; + if (read() != '<') + throw error("\\k is not followed by '<' for named capturing group"); + String name = groupname(read()); + if (!namedGroups().containsKey(name)) + throw error("(named capturing group <"+ name+"> does not exit"); + if (create) { + if (has(CASE_INSENSITIVE)) + root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE)); + else + root = new BackRef(namedGroups().get(name)); + } + return -1; + case 'l': + case 'm': + break; + case 'n': + return '\n'; + case 'o': + case 'p': + case 'q': + break; + case 'r': + return '\r'; + case 's': + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.WHITE_SPACE) + : new Ctype(ASCII.SPACE); + return -1; + case 't': + return '\t'; + case 'u': + return u(); + case 'v': + return '\013'; + case 'w': + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.WORD) + : new Ctype(ASCII.WORD); + return -1; + case 'x': + return x(); + case 'y': + break; + case 'z': + if (inclass) break; + if (create) root = new End(); + return -1; + default: + return ch; + } + throw error("Illegal/unsupported escape sequence"); + } + + /** + * Parse a character class, and return the node that matches it. + * + * Consumes a ] on the way out if consume is true. Usually consume + * is true except for the case of [abc&&def] where def is a separate + * right hand node with "understood" brackets. + */ + private CharProperty clazz(boolean consume) { + CharProperty prev = null; + CharProperty node = null; + BitClass bits = new BitClass(); + boolean include = true; + boolean firstInClass = true; + int ch = next(); + for (;;) { + switch (ch) { + case '^': + // Negates if first char in a class, otherwise literal + if (firstInClass) { + if (temp[cursor-1] != '[') + break; + ch = next(); + include = !include; + continue; + } else { + // ^ not first in class, treat as literal + break; + } + case '[': + firstInClass = false; + node = clazz(true); + if (prev == null) + prev = node; + else + prev = union(prev, node); + ch = peek(); + continue; + case '&': + firstInClass = false; + ch = next(); + if (ch == '&') { + ch = next(); + CharProperty rightNode = null; + while (ch != ']' && ch != '&') { + if (ch == '[') { + if (rightNode == null) + rightNode = clazz(true); + else + rightNode = union(rightNode, clazz(true)); + } else { // abc&&def + unread(); + rightNode = clazz(false); + } + ch = peek(); + } + if (rightNode != null) + node = rightNode; + if (prev == null) { + if (rightNode == null) + throw error("Bad class syntax"); + else + prev = rightNode; + } else { + prev = intersection(prev, node); + } + } else { + // treat as a literal & + unread(); + break; + } + continue; + case 0: + firstInClass = false; + if (cursor >= patternLength) + throw error("Unclosed character class"); + break; + case ']': + firstInClass = false; + if (prev != null) { + if (consume) + next(); + return prev; + } + break; + default: + firstInClass = false; + break; + } + node = range(bits); + if (include) { + if (prev == null) { + prev = node; + } else { + if (prev != node) + prev = union(prev, node); + } + } else { + if (prev == null) { + prev = node.complement(); + } else { + if (prev != node) + prev = setDifference(prev, node); + } + } + ch = peek(); + } + } + + private CharProperty bitsOrSingle(BitClass bits, int ch) { + /* Bits can only handle codepoints in [u+0000-u+00ff] range. + Use "single" node instead of bits when dealing with unicode + case folding for codepoints listed below. + (1)Uppercase out of range: u+00ff, u+00b5 + toUpperCase(u+00ff) -> u+0178 + toUpperCase(u+00b5) -> u+039c + (2)LatinSmallLetterLongS u+17f + toUpperCase(u+017f) -> u+0053 + (3)LatinSmallLetterDotlessI u+131 + toUpperCase(u+0131) -> u+0049 + (4)LatinCapitalLetterIWithDotAbove u+0130 + toLowerCase(u+0130) -> u+0069 + (5)KelvinSign u+212a + toLowerCase(u+212a) ==> u+006B + (6)AngstromSign u+212b + toLowerCase(u+212b) ==> u+00e5 + */ + int d; + if (ch < 256 && + !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) && + (ch == 0xff || ch == 0xb5 || + ch == 0x49 || ch == 0x69 || //I and i + ch == 0x53 || ch == 0x73 || //S and s + ch == 0x4b || ch == 0x6b || //K and k + ch == 0xc5 || ch == 0xe5))) //A+ring + return bits.add(ch, flags()); + return newSingle(ch); + } + + /** + * Parse a single character or a character range in a character class + * and return its representative node. + */ + private CharProperty range(BitClass bits) { + int ch = peek(); + if (ch == '\\') { + ch = nextEscaped(); + if (ch == 'p' || ch == 'P') { // A property + boolean comp = (ch == 'P'); + boolean oneLetter = true; + // Consume { if present + ch = next(); + if (ch != '{') + unread(); + else + oneLetter = false; + return family(oneLetter, comp); + } else { // ordinary escape + unread(); + ch = escape(true, true); + if (ch == -1) + return (CharProperty) root; + } + } else { + ch = single(); + } + if (ch >= 0) { + if (peek() == '-') { + int endRange = temp[cursor+1]; + if (endRange == '[') { + return bitsOrSingle(bits, ch); + } + if (endRange != ']') { + next(); + int m = single(); + if (m < ch) + throw error("Illegal character range"); + if (has(CASE_INSENSITIVE)) + return caseInsensitiveRangeFor(ch, m); + else + return rangeFor(ch, m); + } + } + return bitsOrSingle(bits, ch); + } + throw error("Unexpected character '"+((char)ch)+"'"); + } + + private int single() { + int ch = peek(); + switch (ch) { + case '\\': + return escape(true, false); + default: + next(); + return ch; + } + } + + /** + * Parses a Unicode character family and returns its representative node. + */ + private CharProperty family(boolean singleLetter, + boolean maybeComplement) + { + next(); + String name; + CharProperty node = null; + + if (singleLetter) { + int c = temp[cursor]; + if (!Character.isSupplementaryCodePoint(c)) { + name = String.valueOf((char)c); + } else { + name = new String(temp, cursor, 1); + } + read(); + } else { + int i = cursor; + mark('}'); + while(read() != '}') { + } + mark('\000'); + int j = cursor; + if (j > patternLength) + throw error("Unclosed character family"); + if (i + 1 >= j) + throw error("Empty character family"); + name = new String(temp, i, j-i-1); + } + + int i = name.indexOf('='); + if (i != -1) { + // property construct \p{name=value} + String value = name.substring(i + 1); + name = name.substring(0, i).toLowerCase(Locale.ENGLISH); + if ("sc".equals(name) || "script".equals(name)) { + node = unicodeScriptPropertyFor(value); + } else if ("blk".equals(name) || "block".equals(name)) { + node = unicodeBlockPropertyFor(value); + } else if ("gc".equals(name) || "general_category".equals(name)) { + node = charPropertyNodeFor(value); + } else { + throw error("Unknown Unicode property {name=<" + name + ">, " + + "value=<" + value + ">}"); + } + } else { + if (name.startsWith("In")) { + // \p{inBlockName} + node = unicodeBlockPropertyFor(name.substring(2)); + } else if (name.startsWith("Is")) { + // \p{isGeneralCategory} and \p{isScriptName} + name = name.substring(2); + UnicodeProp uprop = UnicodeProp.forName(name); + if (uprop != null) + node = new Utype(uprop); + if (node == null) + node = CharPropertyNames.charPropertyFor(name); + if (node == null) + node = unicodeScriptPropertyFor(name); + } else { + if (has(UNICODE_CHARACTER_CLASS)) { + UnicodeProp uprop = UnicodeProp.forPOSIXName(name); + if (uprop != null) + node = new Utype(uprop); + } + if (node == null) + node = charPropertyNodeFor(name); + } + } + if (maybeComplement) { + if (node instanceof Category || node instanceof Block) + hasSupplementary = true; + node = node.complement(); + } + return node; + } + + + /** + * Returns a CharProperty matching all characters belong to + * a UnicodeScript. + */ + private CharProperty unicodeScriptPropertyFor(String name) { + final Character.UnicodeScript script; + try { + script = Character.UnicodeScript.forName(name); + } catch (IllegalArgumentException iae) { + throw error("Unknown character script name {" + name + "}"); + } + return new Script(script); + } + + /** + * Returns a CharProperty matching all characters in a UnicodeBlock. + */ + private CharProperty unicodeBlockPropertyFor(String name) { + final Character.UnicodeBlock block; + try { + block = Character.UnicodeBlock.forName(name); + } catch (IllegalArgumentException iae) { + throw error("Unknown character block name {" + name + "}"); + } + return new Block(block); + } + + /** + * Returns a CharProperty matching all characters in a named property. + */ + private CharProperty charPropertyNodeFor(String name) { + CharProperty p = CharPropertyNames.charPropertyFor(name); + if (p == null) + throw error("Unknown character property name {" + name + "}"); + return p; + } + + /** + * Parses and returns the name of a "named capturing group", the trailing + * ">" is consumed after parsing. + */ + private String groupname(int ch) { + StringBuilder sb = new StringBuilder(); + sb.append(Character.toChars(ch)); + while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) || + ASCII.isDigit(ch)) { + sb.append(Character.toChars(ch)); + } + if (sb.length() == 0) + throw error("named capturing group has 0 length name"); + if (ch != '>') + throw error("named capturing group is missing trailing '>'"); + return sb.toString(); + } + + /** + * Parses a group and returns the head node of a set of nodes that process + * the group. Sometimes a double return system is used where the tail is + * returned in root. + */ + private Node group0() { + boolean capturingGroup = false; + Node head = null; + Node tail = null; + int save = flags; + root = null; + int ch = next(); + if (ch == '?') { + ch = skip(); + switch (ch) { + case ':': // (?:xxx) pure group + head = createGroup(true); + tail = root; + head.next = expr(tail); + break; + case '=': // (?=xxx) and (?!xxx) lookahead + case '!': + head = createGroup(true); + tail = root; + head.next = expr(tail); + if (ch == '=') { + head = tail = new Pos(head); + } else { + head = tail = new Neg(head); + } + break; + case '>': // (?>xxx) independent group + head = createGroup(true); + tail = root; + head.next = expr(tail); + head = tail = new Ques(head, INDEPENDENT); + break; + case '<': // (? is already defined"); + capturingGroup = true; + head = createGroup(false); + tail = root; + namedGroups().put(name, capturingGroupCount-1); + head.next = expr(tail); + break; + } + int start = cursor; + head = createGroup(true); + tail = root; + head.next = expr(tail); + tail.next = lookbehindEnd; + TreeInfo info = new TreeInfo(); + head.study(info); + if (info.maxValid == false) { + throw error("Look-behind group does not have " + + "an obvious maximum length"); + } + boolean hasSupplementary = findSupplementary(start, patternLength); + if (ch == '=') { + head = tail = (hasSupplementary ? + new BehindS(head, info.maxLength, + info.minLength) : + new Behind(head, info.maxLength, + info.minLength)); + } else if (ch == '!') { + head = tail = (hasSupplementary ? + new NotBehindS(head, info.maxLength, + info.minLength) : + new NotBehind(head, info.maxLength, + info.minLength)); + } else { + throw error("Unknown look-behind group"); + } + break; + case '$': + case '@': + throw error("Unknown group type"); + default: // (?xxx:) inlined match flags + unread(); + addFlag(); + ch = read(); + if (ch == ')') { + return null; // Inline modifier only + } + if (ch != ':') { + throw error("Unknown inline modifier"); + } + head = createGroup(true); + tail = root; + head.next = expr(tail); + break; + } + } else { // (xxx) a regular group + capturingGroup = true; + head = createGroup(false); + tail = root; + head.next = expr(tail); + } + + accept(')', "Unclosed group"); + flags = save; + + // Check for quantifiers + Node node = closure(head); + if (node == head) { // No closure + root = tail; + return node; // Dual return + } + if (head == tail) { // Zero length assertion + root = node; + return node; // Dual return + } + + if (node instanceof Ques) { + Ques ques = (Ques) node; + if (ques.type == POSSESSIVE) { + root = node; + return node; + } + tail.next = new BranchConn(); + tail = tail.next; + if (ques.type == GREEDY) { + head = new Branch(head, null, tail); + } else { // Reluctant quantifier + head = new Branch(null, head, tail); + } + root = tail; + return head; + } else if (node instanceof Curly) { + Curly curly = (Curly) node; + if (curly.type == POSSESSIVE) { + root = node; + return node; + } + // Discover if the group is deterministic + TreeInfo info = new TreeInfo(); + if (head.study(info)) { // Deterministic + GroupTail temp = (GroupTail) tail; + head = root = new GroupCurly(head.next, curly.cmin, + curly.cmax, curly.type, + ((GroupTail)tail).localIndex, + ((GroupTail)tail).groupIndex, + capturingGroup); + return head; + } else { // Non-deterministic + int temp = ((GroupHead) head).localIndex; + Loop loop; + if (curly.type == GREEDY) + loop = new Loop(this.localCount, temp); + else // Reluctant Curly + loop = new LazyLoop(this.localCount, temp); + Prolog prolog = new Prolog(loop); + this.localCount += 1; + loop.cmin = curly.cmin; + loop.cmax = curly.cmax; + loop.body = head; + tail.next = loop; + root = loop; + return prolog; // Dual return + } + } + throw error("Internal logic error"); + } + + /** + * Create group head and tail nodes using double return. If the group is + * created with anonymous true then it is a pure group and should not + * affect group counting. + */ + private Node createGroup(boolean anonymous) { + int localIndex = localCount++; + int groupIndex = 0; + if (!anonymous) + groupIndex = capturingGroupCount++; + GroupHead head = new GroupHead(localIndex); + root = new GroupTail(localIndex, groupIndex); + if (!anonymous && groupIndex < 10) + groupNodes[groupIndex] = head; + return head; + } + + /** + * Parses inlined match flags and set them appropriately. + */ + private void addFlag() { + int ch = peek(); + for (;;) { + switch (ch) { + case 'i': + flags |= CASE_INSENSITIVE; + break; + case 'm': + flags |= MULTILINE; + break; + case 's': + flags |= DOTALL; + break; + case 'd': + flags |= UNIX_LINES; + break; + case 'u': + flags |= UNICODE_CASE; + break; + case 'c': + flags |= CANON_EQ; + break; + case 'x': + flags |= COMMENTS; + break; + case 'U': + flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE); + break; + case '-': // subFlag then fall through + ch = next(); + subFlag(); + default: + return; + } + ch = next(); + } + } + + /** + * Parses the second part of inlined match flags and turns off + * flags appropriately. + */ + private void subFlag() { + int ch = peek(); + for (;;) { + switch (ch) { + case 'i': + flags &= ~CASE_INSENSITIVE; + break; + case 'm': + flags &= ~MULTILINE; + break; + case 's': + flags &= ~DOTALL; + break; + case 'd': + flags &= ~UNIX_LINES; + break; + case 'u': + flags &= ~UNICODE_CASE; + break; + case 'c': + flags &= ~CANON_EQ; + break; + case 'x': + flags &= ~COMMENTS; + break; + case 'U': + flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE); + default: + return; + } + ch = next(); + } + } + + static final int MAX_REPS = 0x7FFFFFFF; + + static final int GREEDY = 0; + + static final int LAZY = 1; + + static final int POSSESSIVE = 2; + + static final int INDEPENDENT = 3; + + /** + * Processes repetition. If the next character peeked is a quantifier + * then new nodes must be appended to handle the repetition. + * Prev could be a single or a group, so it could be a chain of nodes. + */ + private Node closure(Node prev) { + Node atom; + int ch = peek(); + switch (ch) { + case '?': + ch = next(); + if (ch == '?') { + next(); + return new Ques(prev, LAZY); + } else if (ch == '+') { + next(); + return new Ques(prev, POSSESSIVE); + } + return new Ques(prev, GREEDY); + case '*': + ch = next(); + if (ch == '?') { + next(); + return new Curly(prev, 0, MAX_REPS, LAZY); + } else if (ch == '+') { + next(); + return new Curly(prev, 0, MAX_REPS, POSSESSIVE); + } + return new Curly(prev, 0, MAX_REPS, GREEDY); + case '+': + ch = next(); + if (ch == '?') { + next(); + return new Curly(prev, 1, MAX_REPS, LAZY); + } else if (ch == '+') { + next(); + return new Curly(prev, 1, MAX_REPS, POSSESSIVE); + } + return new Curly(prev, 1, MAX_REPS, GREEDY); + case '{': + ch = temp[cursor+1]; + if (ASCII.isDigit(ch)) { + skip(); + int cmin = 0; + do { + cmin = cmin * 10 + (ch - '0'); + } while (ASCII.isDigit(ch = read())); + int cmax = cmin; + if (ch == ',') { + ch = read(); + cmax = MAX_REPS; + if (ch != '}') { + cmax = 0; + while (ASCII.isDigit(ch)) { + cmax = cmax * 10 + (ch - '0'); + ch = read(); + } + } + } + if (ch != '}') + throw error("Unclosed counted closure"); + if (((cmin) | (cmax) | (cmax - cmin)) < 0) + throw error("Illegal repetition range"); + Curly curly; + ch = peek(); + if (ch == '?') { + next(); + curly = new Curly(prev, cmin, cmax, LAZY); + } else if (ch == '+') { + next(); + curly = new Curly(prev, cmin, cmax, POSSESSIVE); + } else { + curly = new Curly(prev, cmin, cmax, GREEDY); + } + return curly; + } else { + throw error("Illegal repetition"); + } + default: + return prev; + } + } + + /** + * Utility method for parsing control escape sequences. + */ + private int c() { + if (cursor < patternLength) { + return read() ^ 64; + } + throw error("Illegal control escape sequence"); + } + + /** + * Utility method for parsing octal escape sequences. + */ + private int o() { + int n = read(); + if (((n-'0')|('7'-n)) >= 0) { + int m = read(); + if (((m-'0')|('7'-m)) >= 0) { + int o = read(); + if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) { + return (n - '0') * 64 + (m - '0') * 8 + (o - '0'); + } + unread(); + return (n - '0') * 8 + (m - '0'); + } + unread(); + return (n - '0'); + } + throw error("Illegal octal escape sequence"); + } + + /** + * Utility method for parsing hexadecimal escape sequences. + */ + private int x() { + int n = read(); + if (ASCII.isHexDigit(n)) { + int m = read(); + if (ASCII.isHexDigit(m)) { + return ASCII.toDigit(n) * 16 + ASCII.toDigit(m); + } + } else if (n == '{' && ASCII.isHexDigit(peek())) { + int ch = 0; + while (ASCII.isHexDigit(n = read())) { + ch = (ch << 4) + ASCII.toDigit(n); + if (ch > Character.MAX_CODE_POINT) + throw error("Hexadecimal codepoint is too big"); + } + if (n != '}') + throw error("Unclosed hexadecimal escape sequence"); + return ch; + } + throw error("Illegal hexadecimal escape sequence"); + } + + /** + * Utility method for parsing unicode escape sequences. + */ + private int cursor() { + return cursor; + } + + private void setcursor(int pos) { + cursor = pos; + } + + private int uxxxx() { + int n = 0; + for (int i = 0; i < 4; i++) { + int ch = read(); + if (!ASCII.isHexDigit(ch)) { + throw error("Illegal Unicode escape sequence"); + } + n = n * 16 + ASCII.toDigit(ch); + } + return n; + } + + private int u() { + int n = uxxxx(); + if (Character.isHighSurrogate((char)n)) { + int cur = cursor(); + if (read() == '\\' && read() == 'u') { + int n2 = uxxxx(); + if (Character.isLowSurrogate((char)n2)) + return Character.toCodePoint((char)n, (char)n2); + } + setcursor(cur); + } + return n; + } + + // + // Utility methods for code point support + // + + private static final int countChars(CharSequence seq, int index, + int lengthInCodePoints) { + // optimization + if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) { + assert (index >= 0 && index < seq.length()); + return 1; + } + int length = seq.length(); + int x = index; + if (lengthInCodePoints >= 0) { + assert (index >= 0 && index < length); + for (int i = 0; x < length && i < lengthInCodePoints; i++) { + if (Character.isHighSurrogate(seq.charAt(x++))) { + if (x < length && Character.isLowSurrogate(seq.charAt(x))) { + x++; + } + } + } + return x - index; + } + + assert (index >= 0 && index <= length); + if (index == 0) { + return 0; + } + int len = -lengthInCodePoints; + for (int i = 0; x > 0 && i < len; i++) { + if (Character.isLowSurrogate(seq.charAt(--x))) { + if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) { + x--; + } + } + } + return index - x; + } + + private static final int countCodePoints(CharSequence seq) { + int length = seq.length(); + int n = 0; + for (int i = 0; i < length; ) { + n++; + if (Character.isHighSurrogate(seq.charAt(i++))) { + if (i < length && Character.isLowSurrogate(seq.charAt(i))) { + i++; + } + } + } + return n; + } + + /** + * Creates a bit vector for matching Latin-1 values. A normal BitClass + * never matches values above Latin-1, and a complemented BitClass always + * matches values above Latin-1. + */ + private static final class BitClass extends BmpCharProperty { + final boolean[] bits; + BitClass() { bits = new boolean[256]; } + private BitClass(boolean[] bits) { this.bits = bits; } + BitClass add(int c, int flags) { + assert c >= 0 && c <= 255; + if ((flags & CASE_INSENSITIVE) != 0) { + if (ASCII.isAscii(c)) { + bits[ASCII.toUpper(c)] = true; + bits[ASCII.toLower(c)] = true; + } else if ((flags & UNICODE_CASE) != 0) { + bits[Character.toLowerCase(c)] = true; + bits[Character.toUpperCase(c)] = true; + } + } + bits[c] = true; + return this; + } + boolean isSatisfiedBy(int ch) { + return ch < 256 && bits[ch]; + } + } + + /** + * Returns a suitably optimized, single character matcher. + */ + private CharProperty newSingle(final int ch) { + if (has(CASE_INSENSITIVE)) { + int lower, upper; + if (has(UNICODE_CASE)) { + upper = Character.toUpperCase(ch); + lower = Character.toLowerCase(upper); + if (upper != lower) + return new SingleU(lower); + } else if (ASCII.isAscii(ch)) { + lower = ASCII.toLower(ch); + upper = ASCII.toUpper(ch); + if (lower != upper) + return new SingleI(lower, upper); + } + } + if (isSupplementary(ch)) + return new SingleS(ch); // Match a given Unicode character + return new Single(ch); // Match a given BMP character + } + + /** + * Utility method for creating a string slice matcher. + */ + private Node newSlice(int[] buf, int count, boolean hasSupplementary) { + int[] tmp = new int[count]; + if (has(CASE_INSENSITIVE)) { + if (has(UNICODE_CASE)) { + for (int i = 0; i < count; i++) { + tmp[i] = Character.toLowerCase( + Character.toUpperCase(buf[i])); + } + return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp); + } + for (int i = 0; i < count; i++) { + tmp[i] = ASCII.toLower(buf[i]); + } + return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp); + } + for (int i = 0; i < count; i++) { + tmp[i] = buf[i]; + } + return hasSupplementary ? new SliceS(tmp) : new Slice(tmp); + } + + /** + * The following classes are the building components of the object + * tree that represents a compiled regular expression. The object tree + * is made of individual elements that handle constructs in the Pattern. + * Each type of object knows how to match its equivalent construct with + * the match() method. + */ + + /** + * Base class for all node classes. Subclasses should override the match() + * method as appropriate. This class is an accepting node, so its match() + * always returns true. + */ + static class Node extends Object { + Node next; + Node() { + next = Pattern.accept; + } + /** + * This method implements the classic accept node. + */ + boolean match(Matcher matcher, int i, CharSequence seq) { + matcher.last = i; + matcher.groups[0] = matcher.first; + matcher.groups[1] = matcher.last; + return true; + } + /** + * This method is good for all zero length assertions. + */ + boolean study(TreeInfo info) { + if (next != null) { + return next.study(info); + } else { + return info.deterministic; + } + } + } + + static class LastNode extends Node { + /** + * This method implements the classic accept node with + * the addition of a check to see if the match occurred + * using all of the input. + */ + boolean match(Matcher matcher, int i, CharSequence seq) { + if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to) + return false; + matcher.last = i; + matcher.groups[0] = matcher.first; + matcher.groups[1] = matcher.last; + return true; + } + } + + /** + * Used for REs that can start anywhere within the input string. + * This basically tries to match repeatedly at each spot in the + * input string, moving forward after each try. An anchored search + * or a BnM will bypass this node completely. + */ + static class Start extends Node { + int minLength; + Start(Node node) { + this.next = node; + TreeInfo info = new TreeInfo(); + next.study(info); + minLength = info.minLength; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + if (i > matcher.to - minLength) { + matcher.hitEnd = true; + return false; + } + int guard = matcher.to - minLength; + for (; i <= guard; i++) { + if (next.match(matcher, i, seq)) { + matcher.first = i; + matcher.groups[0] = matcher.first; + matcher.groups[1] = matcher.last; + return true; + } + } + matcher.hitEnd = true; + return false; + } + boolean study(TreeInfo info) { + next.study(info); + info.maxValid = false; + info.deterministic = false; + return false; + } + } + + /* + * StartS supports supplementary characters, including unpaired surrogates. + */ + static final class StartS extends Start { + StartS(Node node) { + super(node); + } + boolean match(Matcher matcher, int i, CharSequence seq) { + if (i > matcher.to - minLength) { + matcher.hitEnd = true; + return false; + } + int guard = matcher.to - minLength; + while (i <= guard) { + //if ((ret = next.match(matcher, i, seq)) || i == guard) + if (next.match(matcher, i, seq)) { + matcher.first = i; + matcher.groups[0] = matcher.first; + matcher.groups[1] = matcher.last; + return true; + } + if (i == guard) + break; + // Optimization to move to the next character. This is + // faster than countChars(seq, i, 1). + if (Character.isHighSurrogate(seq.charAt(i++))) { + if (i < seq.length() && + Character.isLowSurrogate(seq.charAt(i))) { + i++; + } + } + } + matcher.hitEnd = true; + return false; + } + } + + /** + * Node to anchor at the beginning of input. This object implements the + * match for a \A sequence, and the caret anchor will use this if not in + * multiline mode. + */ + static final class Begin extends Node { + boolean match(Matcher matcher, int i, CharSequence seq) { + int fromIndex = (matcher.anchoringBounds) ? + matcher.from : 0; + if (i == fromIndex && next.match(matcher, i, seq)) { + matcher.first = i; + matcher.groups[0] = i; + matcher.groups[1] = matcher.last; + return true; + } else { + return false; + } + } + } + + /** + * Node to anchor at the end of input. This is the absolute end, so this + * should not match at the last newline before the end as $ will. + */ + static final class End extends Node { + boolean match(Matcher matcher, int i, CharSequence seq) { + int endIndex = (matcher.anchoringBounds) ? + matcher.to : matcher.getTextLength(); + if (i == endIndex) { + matcher.hitEnd = true; + return next.match(matcher, i, seq); + } + return false; + } + } + + /** + * Node to anchor at the beginning of a line. This is essentially the + * object to match for the multiline ^. + */ + static final class Caret extends Node { + boolean match(Matcher matcher, int i, CharSequence seq) { + int startIndex = matcher.from; + int endIndex = matcher.to; + if (!matcher.anchoringBounds) { + startIndex = 0; + endIndex = matcher.getTextLength(); + } + // Perl does not match ^ at end of input even after newline + if (i == endIndex) { + matcher.hitEnd = true; + return false; + } + if (i > startIndex) { + char ch = seq.charAt(i-1); + if (ch != '\n' && ch != '\r' + && (ch|1) != '\u2029' + && ch != '\u0085' ) { + return false; + } + // Should treat /r/n as one newline + if (ch == '\r' && seq.charAt(i) == '\n') + return false; + } + return next.match(matcher, i, seq); + } + } + + /** + * Node to anchor at the beginning of a line when in unixdot mode. + */ + static final class UnixCaret extends Node { + boolean match(Matcher matcher, int i, CharSequence seq) { + int startIndex = matcher.from; + int endIndex = matcher.to; + if (!matcher.anchoringBounds) { + startIndex = 0; + endIndex = matcher.getTextLength(); + } + // Perl does not match ^ at end of input even after newline + if (i == endIndex) { + matcher.hitEnd = true; + return false; + } + if (i > startIndex) { + char ch = seq.charAt(i-1); + if (ch != '\n') { + return false; + } + } + return next.match(matcher, i, seq); + } + } + + /** + * Node to match the location where the last match ended. + * This is used for the \G construct. + */ + static final class LastMatch extends Node { + boolean match(Matcher matcher, int i, CharSequence seq) { + if (i != matcher.oldLast) + return false; + return next.match(matcher, i, seq); + } + } + + /** + * Node to anchor at the end of a line or the end of input based on the + * multiline mode. + * + * When not in multiline mode, the $ can only match at the very end + * of the input, unless the input ends in a line terminator in which + * it matches right before the last line terminator. + * + * Note that \r\n is considered an atomic line terminator. + * + * Like ^ the $ operator matches at a position, it does not match the + * line terminators themselves. + */ + static final class Dollar extends Node { + boolean multiline; + Dollar(boolean mul) { + multiline = mul; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int endIndex = (matcher.anchoringBounds) ? + matcher.to : matcher.getTextLength(); + if (!multiline) { + if (i < endIndex - 2) + return false; + if (i == endIndex - 2) { + char ch = seq.charAt(i); + if (ch != '\r') + return false; + ch = seq.charAt(i + 1); + if (ch != '\n') + return false; + } + } + // Matches before any line terminator; also matches at the + // end of input + // Before line terminator: + // If multiline, we match here no matter what + // If not multiline, fall through so that the end + // is marked as hit; this must be a /r/n or a /n + // at the very end so the end was hit; more input + // could make this not match here + if (i < endIndex) { + char ch = seq.charAt(i); + if (ch == '\n') { + // No match between \r\n + if (i > 0 && seq.charAt(i-1) == '\r') + return false; + if (multiline) + return next.match(matcher, i, seq); + } else if (ch == '\r' || ch == '\u0085' || + (ch|1) == '\u2029') { + if (multiline) + return next.match(matcher, i, seq); + } else { // No line terminator, no match + return false; + } + } + // Matched at current end so hit end + matcher.hitEnd = true; + // If a $ matches because of end of input, then more input + // could cause it to fail! + matcher.requireEnd = true; + return next.match(matcher, i, seq); + } + boolean study(TreeInfo info) { + next.study(info); + return info.deterministic; + } + } + + /** + * Node to anchor at the end of a line or the end of input based on the + * multiline mode when in unix lines mode. + */ + static final class UnixDollar extends Node { + boolean multiline; + UnixDollar(boolean mul) { + multiline = mul; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int endIndex = (matcher.anchoringBounds) ? + matcher.to : matcher.getTextLength(); + if (i < endIndex) { + char ch = seq.charAt(i); + if (ch == '\n') { + // If not multiline, then only possible to + // match at very end or one before end + if (multiline == false && i != endIndex - 1) + return false; + // If multiline return next.match without setting + // matcher.hitEnd + if (multiline) + return next.match(matcher, i, seq); + } else { + return false; + } + } + // Matching because at the end or 1 before the end; + // more input could change this so set hitEnd + matcher.hitEnd = true; + // If a $ matches because of end of input, then more input + // could cause it to fail! + matcher.requireEnd = true; + return next.match(matcher, i, seq); + } + boolean study(TreeInfo info) { + next.study(info); + return info.deterministic; + } + } + + /** + * Abstract node class to match one character satisfying some + * boolean property. + */ + private static abstract class CharProperty extends Node { + abstract boolean isSatisfiedBy(int ch); + CharProperty complement() { + return new CharProperty() { + boolean isSatisfiedBy(int ch) { + return ! CharProperty.this.isSatisfiedBy(ch);}}; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + if (i < matcher.to) { + int ch = Character.codePointAt(seq, i); + return isSatisfiedBy(ch) + && next.match(matcher, i+Character.charCount(ch), seq); + } else { + matcher.hitEnd = true; + return false; + } + } + boolean study(TreeInfo info) { + info.minLength++; + info.maxLength++; + return next.study(info); + } + } + + /** + * Optimized version of CharProperty that works only for + * properties never satisfied by Supplementary characters. + */ + private static abstract class BmpCharProperty extends CharProperty { + boolean match(Matcher matcher, int i, CharSequence seq) { + if (i < matcher.to) { + return isSatisfiedBy(seq.charAt(i)) + && next.match(matcher, i+1, seq); + } else { + matcher.hitEnd = true; + return false; + } + } + } + + /** + * Node class that matches a Supplementary Unicode character + */ + static final class SingleS extends CharProperty { + final int c; + SingleS(int c) { this.c = c; } + boolean isSatisfiedBy(int ch) { + return ch == c; + } + } + + /** + * Optimization -- matches a given BMP character + */ + static final class Single extends BmpCharProperty { + final int c; + Single(int c) { this.c = c; } + boolean isSatisfiedBy(int ch) { + return ch == c; + } + } + + /** + * Case insensitive matches a given BMP character + */ + static final class SingleI extends BmpCharProperty { + final int lower; + final int upper; + SingleI(int lower, int upper) { + this.lower = lower; + this.upper = upper; + } + boolean isSatisfiedBy(int ch) { + return ch == lower || ch == upper; + } + } + + /** + * Unicode case insensitive matches a given Unicode character + */ + static final class SingleU extends CharProperty { + final int lower; + SingleU(int lower) { + this.lower = lower; + } + boolean isSatisfiedBy(int ch) { + return lower == ch || + lower == Character.toLowerCase(Character.toUpperCase(ch)); + } + } + + + /** + * Node class that matches a Unicode block. + */ + static final class Block extends CharProperty { + final Character.UnicodeBlock block; + Block(Character.UnicodeBlock block) { + this.block = block; + } + boolean isSatisfiedBy(int ch) { + return block == Character.UnicodeBlock.of(ch); + } + } + + /** + * Node class that matches a Unicode script + */ + static final class Script extends CharProperty { + final Character.UnicodeScript script; + Script(Character.UnicodeScript script) { + this.script = script; + } + boolean isSatisfiedBy(int ch) { + return script == Character.UnicodeScript.of(ch); + } + } + + /** + * Node class that matches a Unicode category. + */ + static final class Category extends CharProperty { + final int typeMask; + Category(int typeMask) { this.typeMask = typeMask; } + boolean isSatisfiedBy(int ch) { + return (typeMask & (1 << Character.getType(ch))) != 0; + } + } + + /** + * Node class that matches a Unicode "type" + */ + static final class Utype extends CharProperty { + final UnicodeProp uprop; + Utype(UnicodeProp uprop) { this.uprop = uprop; } + boolean isSatisfiedBy(int ch) { + return uprop.is(ch); + } + } + + + /** + * Node class that matches a POSIX type. + */ + static final class Ctype extends BmpCharProperty { + final int ctype; + Ctype(int ctype) { this.ctype = ctype; } + boolean isSatisfiedBy(int ch) { + return ch < 128 && ASCII.isType(ch, ctype); + } + } + + /** + * Base class for all Slice nodes + */ + static class SliceNode extends Node { + int[] buffer; + SliceNode(int[] buf) { + buffer = buf; + } + boolean study(TreeInfo info) { + info.minLength += buffer.length; + info.maxLength += buffer.length; + return next.study(info); + } + } + + /** + * Node class for a case sensitive/BMP-only sequence of literal + * characters. + */ + static final class Slice extends SliceNode { + Slice(int[] buf) { + super(buf); + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int[] buf = buffer; + int len = buf.length; + for (int j=0; j= matcher.to) { + matcher.hitEnd = true; + return false; + } + if (buf[j] != seq.charAt(i+j)) + return false; + } + return next.match(matcher, i+len, seq); + } + } + + /** + * Node class for a case_insensitive/BMP-only sequence of literal + * characters. + */ + static class SliceI extends SliceNode { + SliceI(int[] buf) { + super(buf); + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int[] buf = buffer; + int len = buf.length; + for (int j=0; j= matcher.to) { + matcher.hitEnd = true; + return false; + } + int c = seq.charAt(i+j); + if (buf[j] != c && + buf[j] != ASCII.toLower(c)) + return false; + } + return next.match(matcher, i+len, seq); + } + } + + /** + * Node class for a unicode_case_insensitive/BMP-only sequence of + * literal characters. Uses unicode case folding. + */ + static final class SliceU extends SliceNode { + SliceU(int[] buf) { + super(buf); + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int[] buf = buffer; + int len = buf.length; + for (int j=0; j= matcher.to) { + matcher.hitEnd = true; + return false; + } + int c = seq.charAt(i+j); + if (buf[j] != c && + buf[j] != Character.toLowerCase(Character.toUpperCase(c))) + return false; + } + return next.match(matcher, i+len, seq); + } + } + + /** + * Node class for a case sensitive sequence of literal characters + * including supplementary characters. + */ + static final class SliceS extends SliceNode { + SliceS(int[] buf) { + super(buf); + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int[] buf = buffer; + int x = i; + for (int j = 0; j < buf.length; j++) { + if (x >= matcher.to) { + matcher.hitEnd = true; + return false; + } + int c = Character.codePointAt(seq, x); + if (buf[j] != c) + return false; + x += Character.charCount(c); + if (x > matcher.to) { + matcher.hitEnd = true; + return false; + } + } + return next.match(matcher, x, seq); + } + } + + /** + * Node class for a case insensitive sequence of literal characters + * including supplementary characters. + */ + static class SliceIS extends SliceNode { + SliceIS(int[] buf) { + super(buf); + } + int toLower(int c) { + return ASCII.toLower(c); + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int[] buf = buffer; + int x = i; + for (int j = 0; j < buf.length; j++) { + if (x >= matcher.to) { + matcher.hitEnd = true; + return false; + } + int c = Character.codePointAt(seq, x); + if (buf[j] != c && buf[j] != toLower(c)) + return false; + x += Character.charCount(c); + if (x > matcher.to) { + matcher.hitEnd = true; + return false; + } + } + return next.match(matcher, x, seq); + } + } + + /** + * Node class for a case insensitive sequence of literal characters. + * Uses unicode case folding. + */ + static final class SliceUS extends SliceIS { + SliceUS(int[] buf) { + super(buf); + } + int toLower(int c) { + return Character.toLowerCase(Character.toUpperCase(c)); + } + } + + private static boolean inRange(int lower, int ch, int upper) { + return lower <= ch && ch <= upper; + } + + /** + * Returns node for matching characters within an explicit value range. + */ + private static CharProperty rangeFor(final int lower, + final int upper) { + return new CharProperty() { + boolean isSatisfiedBy(int ch) { + return inRange(lower, ch, upper);}}; + } + + /** + * Returns node for matching characters within an explicit value + * range in a case insensitive manner. + */ + private CharProperty caseInsensitiveRangeFor(final int lower, + final int upper) { + if (has(UNICODE_CASE)) + return new CharProperty() { + boolean isSatisfiedBy(int ch) { + if (inRange(lower, ch, upper)) + return true; + int up = Character.toUpperCase(ch); + return inRange(lower, up, upper) || + inRange(lower, Character.toLowerCase(up), upper);}}; + return new CharProperty() { + boolean isSatisfiedBy(int ch) { + return inRange(lower, ch, upper) || + ASCII.isAscii(ch) && + (inRange(lower, ASCII.toUpper(ch), upper) || + inRange(lower, ASCII.toLower(ch), upper)); + }}; + } + + /** + * Implements the Unicode category ALL and the dot metacharacter when + * in dotall mode. + */ + static final class All extends CharProperty { + boolean isSatisfiedBy(int ch) { + return true; + } + } + + /** + * Node class for the dot metacharacter when dotall is not enabled. + */ + static final class Dot extends CharProperty { + boolean isSatisfiedBy(int ch) { + return (ch != '\n' && ch != '\r' + && (ch|1) != '\u2029' + && ch != '\u0085'); + } + } + + /** + * Node class for the dot metacharacter when dotall is not enabled + * but UNIX_LINES is enabled. + */ + static final class UnixDot extends CharProperty { + boolean isSatisfiedBy(int ch) { + return ch != '\n'; + } + } + + /** + * The 0 or 1 quantifier. This one class implements all three types. + */ + static final class Ques extends Node { + Node atom; + int type; + Ques(Node node, int type) { + this.atom = node; + this.type = type; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + switch (type) { + case GREEDY: + return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq)) + || next.match(matcher, i, seq); + case LAZY: + return next.match(matcher, i, seq) + || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq)); + case POSSESSIVE: + if (atom.match(matcher, i, seq)) i = matcher.last; + return next.match(matcher, i, seq); + default: + return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq); + } + } + boolean study(TreeInfo info) { + if (type != INDEPENDENT) { + int minL = info.minLength; + atom.study(info); + info.minLength = minL; + info.deterministic = false; + return next.study(info); + } else { + atom.study(info); + return next.study(info); + } + } + } + + /** + * Handles the curly-brace style repetition with a specified minimum and + * maximum occurrences. The * quantifier is handled as a special case. + * This class handles the three types. + */ + static final class Curly extends Node { + Node atom; + int type; + int cmin; + int cmax; + + Curly(Node node, int cmin, int cmax, int type) { + this.atom = node; + this.type = type; + this.cmin = cmin; + this.cmax = cmax; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int j; + for (j = 0; j < cmin; j++) { + if (atom.match(matcher, i, seq)) { + i = matcher.last; + continue; + } + return false; + } + if (type == GREEDY) + return match0(matcher, i, j, seq); + else if (type == LAZY) + return match1(matcher, i, j, seq); + else + return match2(matcher, i, j, seq); + } + // Greedy match. + // i is the index to start matching at + // j is the number of atoms that have matched + boolean match0(Matcher matcher, int i, int j, CharSequence seq) { + if (j >= cmax) { + // We have matched the maximum... continue with the rest of + // the regular expression + return next.match(matcher, i, seq); + } + int backLimit = j; + while (atom.match(matcher, i, seq)) { + // k is the length of this match + int k = matcher.last - i; + if (k == 0) // Zero length match + break; + // Move up index and number matched + i = matcher.last; + j++; + // We are greedy so match as many as we can + while (j < cmax) { + if (!atom.match(matcher, i, seq)) + break; + if (i + k != matcher.last) { + if (match0(matcher, matcher.last, j+1, seq)) + return true; + break; + } + i += k; + j++; + } + // Handle backing off if match fails + while (j >= backLimit) { + if (next.match(matcher, i, seq)) + return true; + i -= k; + j--; + } + return false; + } + return next.match(matcher, i, seq); + } + // Reluctant match. At this point, the minimum has been satisfied. + // i is the index to start matching at + // j is the number of atoms that have matched + boolean match1(Matcher matcher, int i, int j, CharSequence seq) { + for (;;) { + // Try finishing match without consuming any more + if (next.match(matcher, i, seq)) + return true; + // At the maximum, no match found + if (j >= cmax) + return false; + // Okay, must try one more atom + if (!atom.match(matcher, i, seq)) + return false; + // If we haven't moved forward then must break out + if (i == matcher.last) + return false; + // Move up index and number matched + i = matcher.last; + j++; + } + } + boolean match2(Matcher matcher, int i, int j, CharSequence seq) { + for (; j < cmax; j++) { + if (!atom.match(matcher, i, seq)) + break; + if (i == matcher.last) + break; + i = matcher.last; + } + return next.match(matcher, i, seq); + } + boolean study(TreeInfo info) { + // Save original info + int minL = info.minLength; + int maxL = info.maxLength; + boolean maxV = info.maxValid; + boolean detm = info.deterministic; + info.reset(); + + atom.study(info); + + int temp = info.minLength * cmin + minL; + if (temp < minL) { + temp = 0xFFFFFFF; // arbitrary large number + } + info.minLength = temp; + + if (maxV & info.maxValid) { + temp = info.maxLength * cmax + maxL; + info.maxLength = temp; + if (temp < maxL) { + info.maxValid = false; + } + } else { + info.maxValid = false; + } + + if (info.deterministic && cmin == cmax) + info.deterministic = detm; + else + info.deterministic = false; + + return next.study(info); + } + } + + /** + * Handles the curly-brace style repetition with a specified minimum and + * maximum occurrences in deterministic cases. This is an iterative + * optimization over the Prolog and Loop system which would handle this + * in a recursive way. The * quantifier is handled as a special case. + * If capture is true then this class saves group settings and ensures + * that groups are unset when backing off of a group match. + */ + static final class GroupCurly extends Node { + Node atom; + int type; + int cmin; + int cmax; + int localIndex; + int groupIndex; + boolean capture; + + GroupCurly(Node node, int cmin, int cmax, int type, int local, + int group, boolean capture) { + this.atom = node; + this.type = type; + this.cmin = cmin; + this.cmax = cmax; + this.localIndex = local; + this.groupIndex = group; + this.capture = capture; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int[] groups = matcher.groups; + int[] locals = matcher.locals; + int save0 = locals[localIndex]; + int save1 = 0; + int save2 = 0; + + if (capture) { + save1 = groups[groupIndex]; + save2 = groups[groupIndex+1]; + } + + // Notify GroupTail there is no need to setup group info + // because it will be set here + locals[localIndex] = -1; + + boolean ret = true; + for (int j = 0; j < cmin; j++) { + if (atom.match(matcher, i, seq)) { + if (capture) { + groups[groupIndex] = i; + groups[groupIndex+1] = matcher.last; + } + i = matcher.last; + } else { + ret = false; + break; + } + } + if (ret) { + if (type == GREEDY) { + ret = match0(matcher, i, cmin, seq); + } else if (type == LAZY) { + ret = match1(matcher, i, cmin, seq); + } else { + ret = match2(matcher, i, cmin, seq); + } + } + if (!ret) { + locals[localIndex] = save0; + if (capture) { + groups[groupIndex] = save1; + groups[groupIndex+1] = save2; + } + } + return ret; + } + // Aggressive group match + boolean match0(Matcher matcher, int i, int j, CharSequence seq) { + int[] groups = matcher.groups; + int save0 = 0; + int save1 = 0; + if (capture) { + save0 = groups[groupIndex]; + save1 = groups[groupIndex+1]; + } + for (;;) { + if (j >= cmax) + break; + if (!atom.match(matcher, i, seq)) + break; + int k = matcher.last - i; + if (k <= 0) { + if (capture) { + groups[groupIndex] = i; + groups[groupIndex+1] = i + k; + } + i = i + k; + break; + } + for (;;) { + if (capture) { + groups[groupIndex] = i; + groups[groupIndex+1] = i + k; + } + i = i + k; + if (++j >= cmax) + break; + if (!atom.match(matcher, i, seq)) + break; + if (i + k != matcher.last) { + if (match0(matcher, i, j, seq)) + return true; + break; + } + } + while (j > cmin) { + if (next.match(matcher, i, seq)) { + if (capture) { + groups[groupIndex+1] = i; + groups[groupIndex] = i - k; + } + i = i - k; + return true; + } + // backing off + if (capture) { + groups[groupIndex+1] = i; + groups[groupIndex] = i - k; + } + i = i - k; + j--; + } + break; + } + if (capture) { + groups[groupIndex] = save0; + groups[groupIndex+1] = save1; + } + return next.match(matcher, i, seq); + } + // Reluctant matching + boolean match1(Matcher matcher, int i, int j, CharSequence seq) { + for (;;) { + if (next.match(matcher, i, seq)) + return true; + if (j >= cmax) + return false; + if (!atom.match(matcher, i, seq)) + return false; + if (i == matcher.last) + return false; + if (capture) { + matcher.groups[groupIndex] = i; + matcher.groups[groupIndex+1] = matcher.last; + } + i = matcher.last; + j++; + } + } + // Possessive matching + boolean match2(Matcher matcher, int i, int j, CharSequence seq) { + for (; j < cmax; j++) { + if (!atom.match(matcher, i, seq)) { + break; + } + if (capture) { + matcher.groups[groupIndex] = i; + matcher.groups[groupIndex+1] = matcher.last; + } + if (i == matcher.last) { + break; + } + i = matcher.last; + } + return next.match(matcher, i, seq); + } + boolean study(TreeInfo info) { + // Save original info + int minL = info.minLength; + int maxL = info.maxLength; + boolean maxV = info.maxValid; + boolean detm = info.deterministic; + info.reset(); + + atom.study(info); + + int temp = info.minLength * cmin + minL; + if (temp < minL) { + temp = 0xFFFFFFF; // Arbitrary large number + } + info.minLength = temp; + + if (maxV & info.maxValid) { + temp = info.maxLength * cmax + maxL; + info.maxLength = temp; + if (temp < maxL) { + info.maxValid = false; + } + } else { + info.maxValid = false; + } + + if (info.deterministic && cmin == cmax) { + info.deterministic = detm; + } else { + info.deterministic = false; + } + + return next.study(info); + } + } + + /** + * A Guard node at the end of each atom node in a Branch. It + * serves the purpose of chaining the "match" operation to + * "next" but not the "study", so we can collect the TreeInfo + * of each atom node without including the TreeInfo of the + * "next". + */ + static final class BranchConn extends Node { + BranchConn() {}; + boolean match(Matcher matcher, int i, CharSequence seq) { + return next.match(matcher, i, seq); + } + boolean study(TreeInfo info) { + return info.deterministic; + } + } + + /** + * Handles the branching of alternations. Note this is also used for + * the ? quantifier to branch between the case where it matches once + * and where it does not occur. + */ + static final class Branch extends Node { + Node[] atoms = new Node[2]; + int size = 2; + Node conn; + Branch(Node first, Node second, Node branchConn) { + conn = branchConn; + atoms[0] = first; + atoms[1] = second; + } + + void add(Node node) { + if (size >= atoms.length) { + Node[] tmp = new Node[atoms.length*2]; + System.arraycopy(atoms, 0, tmp, 0, atoms.length); + atoms = tmp; + } + atoms[size++] = node; + } + + boolean match(Matcher matcher, int i, CharSequence seq) { + for (int n = 0; n < size; n++) { + if (atoms[n] == null) { + if (conn.next.match(matcher, i, seq)) + return true; + } else if (atoms[n].match(matcher, i, seq)) { + return true; + } + } + return false; + } + + boolean study(TreeInfo info) { + int minL = info.minLength; + int maxL = info.maxLength; + boolean maxV = info.maxValid; + + int minL2 = Integer.MAX_VALUE; //arbitrary large enough num + int maxL2 = -1; + for (int n = 0; n < size; n++) { + info.reset(); + if (atoms[n] != null) + atoms[n].study(info); + minL2 = Math.min(minL2, info.minLength); + maxL2 = Math.max(maxL2, info.maxLength); + maxV = (maxV & info.maxValid); + } + + minL += minL2; + maxL += maxL2; + + info.reset(); + conn.next.study(info); + + info.minLength += minL; + info.maxLength += maxL; + info.maxValid &= maxV; + info.deterministic = false; + return false; + } + } + + /** + * The GroupHead saves the location where the group begins in the locals + * and restores them when the match is done. + * + * The matchRef is used when a reference to this group is accessed later + * in the expression. The locals will have a negative value in them to + * indicate that we do not want to unset the group if the reference + * doesn't match. + */ + static final class GroupHead extends Node { + int localIndex; + GroupHead(int localCount) { + localIndex = localCount; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int save = matcher.locals[localIndex]; + matcher.locals[localIndex] = i; + boolean ret = next.match(matcher, i, seq); + matcher.locals[localIndex] = save; + return ret; + } + boolean matchRef(Matcher matcher, int i, CharSequence seq) { + int save = matcher.locals[localIndex]; + matcher.locals[localIndex] = ~i; // HACK + boolean ret = next.match(matcher, i, seq); + matcher.locals[localIndex] = save; + return ret; + } + } + + /** + * Recursive reference to a group in the regular expression. It calls + * matchRef because if the reference fails to match we would not unset + * the group. + */ + static final class GroupRef extends Node { + GroupHead head; + GroupRef(GroupHead head) { + this.head = head; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + return head.matchRef(matcher, i, seq) + && next.match(matcher, matcher.last, seq); + } + boolean study(TreeInfo info) { + info.maxValid = false; + info.deterministic = false; + return next.study(info); + } + } + + /** + * The GroupTail handles the setting of group beginning and ending + * locations when groups are successfully matched. It must also be able to + * unset groups that have to be backed off of. + * + * The GroupTail node is also used when a previous group is referenced, + * and in that case no group information needs to be set. + */ + static final class GroupTail extends Node { + int localIndex; + int groupIndex; + GroupTail(int localCount, int groupCount) { + localIndex = localCount; + groupIndex = groupCount + groupCount; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int tmp = matcher.locals[localIndex]; + if (tmp >= 0) { // This is the normal group case. + // Save the group so we can unset it if it + // backs off of a match. + int groupStart = matcher.groups[groupIndex]; + int groupEnd = matcher.groups[groupIndex+1]; + + matcher.groups[groupIndex] = tmp; + matcher.groups[groupIndex+1] = i; + if (next.match(matcher, i, seq)) { + return true; + } + matcher.groups[groupIndex] = groupStart; + matcher.groups[groupIndex+1] = groupEnd; + return false; + } else { + // This is a group reference case. We don't need to save any + // group info because it isn't really a group. + matcher.last = i; + return true; + } + } + } + + /** + * This sets up a loop to handle a recursive quantifier structure. + */ + static final class Prolog extends Node { + Loop loop; + Prolog(Loop loop) { + this.loop = loop; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + return loop.matchInit(matcher, i, seq); + } + boolean study(TreeInfo info) { + return loop.study(info); + } + } + + /** + * Handles the repetition count for a greedy Curly. The matchInit + * is called from the Prolog to save the index of where the group + * beginning is stored. A zero length group check occurs in the + * normal match but is skipped in the matchInit. + */ + static class Loop extends Node { + Node body; + int countIndex; // local count index in matcher locals + int beginIndex; // group beginning index + int cmin, cmax; + Loop(int countIndex, int beginIndex) { + this.countIndex = countIndex; + this.beginIndex = beginIndex; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + // Avoid infinite loop in zero-length case. + if (i > matcher.locals[beginIndex]) { + int count = matcher.locals[countIndex]; + + // This block is for before we reach the minimum + // iterations required for the loop to match + if (count < cmin) { + matcher.locals[countIndex] = count + 1; + boolean b = body.match(matcher, i, seq); + // If match failed we must backtrack, so + // the loop count should NOT be incremented + if (!b) + matcher.locals[countIndex] = count; + // Return success or failure since we are under + // minimum + return b; + } + // This block is for after we have the minimum + // iterations required for the loop to match + if (count < cmax) { + matcher.locals[countIndex] = count + 1; + boolean b = body.match(matcher, i, seq); + // If match failed we must backtrack, so + // the loop count should NOT be incremented + if (!b) + matcher.locals[countIndex] = count; + else + return true; + } + } + return next.match(matcher, i, seq); + } + boolean matchInit(Matcher matcher, int i, CharSequence seq) { + int save = matcher.locals[countIndex]; + boolean ret = false; + if (0 < cmin) { + matcher.locals[countIndex] = 1; + ret = body.match(matcher, i, seq); + } else if (0 < cmax) { + matcher.locals[countIndex] = 1; + ret = body.match(matcher, i, seq); + if (ret == false) + ret = next.match(matcher, i, seq); + } else { + ret = next.match(matcher, i, seq); + } + matcher.locals[countIndex] = save; + return ret; + } + boolean study(TreeInfo info) { + info.maxValid = false; + info.deterministic = false; + return false; + } + } + + /** + * Handles the repetition count for a reluctant Curly. The matchInit + * is called from the Prolog to save the index of where the group + * beginning is stored. A zero length group check occurs in the + * normal match but is skipped in the matchInit. + */ + static final class LazyLoop extends Loop { + LazyLoop(int countIndex, int beginIndex) { + super(countIndex, beginIndex); + } + boolean match(Matcher matcher, int i, CharSequence seq) { + // Check for zero length group + if (i > matcher.locals[beginIndex]) { + int count = matcher.locals[countIndex]; + if (count < cmin) { + matcher.locals[countIndex] = count + 1; + boolean result = body.match(matcher, i, seq); + // If match failed we must backtrack, so + // the loop count should NOT be incremented + if (!result) + matcher.locals[countIndex] = count; + return result; + } + if (next.match(matcher, i, seq)) + return true; + if (count < cmax) { + matcher.locals[countIndex] = count + 1; + boolean result = body.match(matcher, i, seq); + // If match failed we must backtrack, so + // the loop count should NOT be incremented + if (!result) + matcher.locals[countIndex] = count; + return result; + } + return false; + } + return next.match(matcher, i, seq); + } + boolean matchInit(Matcher matcher, int i, CharSequence seq) { + int save = matcher.locals[countIndex]; + boolean ret = false; + if (0 < cmin) { + matcher.locals[countIndex] = 1; + ret = body.match(matcher, i, seq); + } else if (next.match(matcher, i, seq)) { + ret = true; + } else if (0 < cmax) { + matcher.locals[countIndex] = 1; + ret = body.match(matcher, i, seq); + } + matcher.locals[countIndex] = save; + return ret; + } + boolean study(TreeInfo info) { + info.maxValid = false; + info.deterministic = false; + return false; + } + } + + /** + * Refers to a group in the regular expression. Attempts to match + * whatever the group referred to last matched. + */ + static class BackRef extends Node { + int groupIndex; + BackRef(int groupCount) { + super(); + groupIndex = groupCount + groupCount; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int j = matcher.groups[groupIndex]; + int k = matcher.groups[groupIndex+1]; + + int groupSize = k - j; + + // If the referenced group didn't match, neither can this + if (j < 0) + return false; + + // If there isn't enough input left no match + if (i + groupSize > matcher.to) { + matcher.hitEnd = true; + return false; + } + + // Check each new char to make sure it matches what the group + // referenced matched last time around + for (int index=0; index matcher.to) { + matcher.hitEnd = true; + return false; + } + + // Check each new char to make sure it matches what the group + // referenced matched last time around + int x = i; + for (int index=0; index matcher.to) { + matcher.hitEnd = true; + return false; + } + if (atom.match(matcher, i, seq)) { + return next.match(matcher, matcher.last, seq); + } + i += countChars(seq, i, 1); + matcher.first++; + } + } + boolean study(TreeInfo info) { + atom.study(info); + info.maxValid = false; + info.deterministic = false; + return next.study(info); + } + } + + static final class Conditional extends Node { + Node cond, yes, not; + Conditional(Node cond, Node yes, Node not) { + this.cond = cond; + this.yes = yes; + this.not = not; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + if (cond.match(matcher, i, seq)) { + return yes.match(matcher, i, seq); + } else { + return not.match(matcher, i, seq); + } + } + boolean study(TreeInfo info) { + int minL = info.minLength; + int maxL = info.maxLength; + boolean maxV = info.maxValid; + info.reset(); + yes.study(info); + + int minL2 = info.minLength; + int maxL2 = info.maxLength; + boolean maxV2 = info.maxValid; + info.reset(); + not.study(info); + + info.minLength = minL + Math.min(minL2, info.minLength); + info.maxLength = maxL + Math.max(maxL2, info.maxLength); + info.maxValid = (maxV & maxV2 & info.maxValid); + info.deterministic = false; + return next.study(info); + } + } + + /** + * Zero width positive lookahead. + */ + static final class Pos extends Node { + Node cond; + Pos(Node cond) { + this.cond = cond; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int savedTo = matcher.to; + boolean conditionMatched = false; + + // Relax transparent region boundaries for lookahead + if (matcher.transparentBounds) + matcher.to = matcher.getTextLength(); + try { + conditionMatched = cond.match(matcher, i, seq); + } finally { + // Reinstate region boundaries + matcher.to = savedTo; + } + return conditionMatched && next.match(matcher, i, seq); + } + } + + /** + * Zero width negative lookahead. + */ + static final class Neg extends Node { + Node cond; + Neg(Node cond) { + this.cond = cond; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int savedTo = matcher.to; + boolean conditionMatched = false; + + // Relax transparent region boundaries for lookahead + if (matcher.transparentBounds) + matcher.to = matcher.getTextLength(); + try { + if (i < matcher.to) { + conditionMatched = !cond.match(matcher, i, seq); + } else { + // If a negative lookahead succeeds then more input + // could cause it to fail! + matcher.requireEnd = true; + conditionMatched = !cond.match(matcher, i, seq); + } + } finally { + // Reinstate region boundaries + matcher.to = savedTo; + } + return conditionMatched && next.match(matcher, i, seq); + } + } + + /** + * For use with lookbehinds; matches the position where the lookbehind + * was encountered. + */ + static Node lookbehindEnd = new Node() { + boolean match(Matcher matcher, int i, CharSequence seq) { + return i == matcher.lookbehindTo; + } + }; + + /** + * Zero width positive lookbehind. + */ + static class Behind extends Node { + Node cond; + int rmax, rmin; + Behind(Node cond, int rmax, int rmin) { + this.cond = cond; + this.rmax = rmax; + this.rmin = rmin; + } + + boolean match(Matcher matcher, int i, CharSequence seq) { + int savedFrom = matcher.from; + boolean conditionMatched = false; + int startIndex = (!matcher.transparentBounds) ? + matcher.from : 0; + int from = Math.max(i - rmax, startIndex); + // Set end boundary + int savedLBT = matcher.lookbehindTo; + matcher.lookbehindTo = i; + // Relax transparent region boundaries for lookbehind + if (matcher.transparentBounds) + matcher.from = 0; + for (int j = i - rmin; !conditionMatched && j >= from; j--) { + conditionMatched = cond.match(matcher, j, seq); + } + matcher.from = savedFrom; + matcher.lookbehindTo = savedLBT; + return conditionMatched && next.match(matcher, i, seq); + } + } + + /** + * Zero width positive lookbehind, including supplementary + * characters or unpaired surrogates. + */ + static final class BehindS extends Behind { + BehindS(Node cond, int rmax, int rmin) { + super(cond, rmax, rmin); + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int rmaxChars = countChars(seq, i, -rmax); + int rminChars = countChars(seq, i, -rmin); + int savedFrom = matcher.from; + int startIndex = (!matcher.transparentBounds) ? + matcher.from : 0; + boolean conditionMatched = false; + int from = Math.max(i - rmaxChars, startIndex); + // Set end boundary + int savedLBT = matcher.lookbehindTo; + matcher.lookbehindTo = i; + // Relax transparent region boundaries for lookbehind + if (matcher.transparentBounds) + matcher.from = 0; + + for (int j = i - rminChars; + !conditionMatched && j >= from; + j -= j>from ? countChars(seq, j, -1) : 1) { + conditionMatched = cond.match(matcher, j, seq); + } + matcher.from = savedFrom; + matcher.lookbehindTo = savedLBT; + return conditionMatched && next.match(matcher, i, seq); + } + } + + /** + * Zero width negative lookbehind. + */ + static class NotBehind extends Node { + Node cond; + int rmax, rmin; + NotBehind(Node cond, int rmax, int rmin) { + this.cond = cond; + this.rmax = rmax; + this.rmin = rmin; + } + + boolean match(Matcher matcher, int i, CharSequence seq) { + int savedLBT = matcher.lookbehindTo; + int savedFrom = matcher.from; + boolean conditionMatched = false; + int startIndex = (!matcher.transparentBounds) ? + matcher.from : 0; + int from = Math.max(i - rmax, startIndex); + matcher.lookbehindTo = i; + // Relax transparent region boundaries for lookbehind + if (matcher.transparentBounds) + matcher.from = 0; + for (int j = i - rmin; !conditionMatched && j >= from; j--) { + conditionMatched = cond.match(matcher, j, seq); + } + // Reinstate region boundaries + matcher.from = savedFrom; + matcher.lookbehindTo = savedLBT; + return !conditionMatched && next.match(matcher, i, seq); + } + } + + /** + * Zero width negative lookbehind, including supplementary + * characters or unpaired surrogates. + */ + static final class NotBehindS extends NotBehind { + NotBehindS(Node cond, int rmax, int rmin) { + super(cond, rmax, rmin); + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int rmaxChars = countChars(seq, i, -rmax); + int rminChars = countChars(seq, i, -rmin); + int savedFrom = matcher.from; + int savedLBT = matcher.lookbehindTo; + boolean conditionMatched = false; + int startIndex = (!matcher.transparentBounds) ? + matcher.from : 0; + int from = Math.max(i - rmaxChars, startIndex); + matcher.lookbehindTo = i; + // Relax transparent region boundaries for lookbehind + if (matcher.transparentBounds) + matcher.from = 0; + for (int j = i - rminChars; + !conditionMatched && j >= from; + j -= j>from ? countChars(seq, j, -1) : 1) { + conditionMatched = cond.match(matcher, j, seq); + } + //Reinstate region boundaries + matcher.from = savedFrom; + matcher.lookbehindTo = savedLBT; + return !conditionMatched && next.match(matcher, i, seq); + } + } + + /** + * Returns the set union of two CharProperty nodes. + */ + private static CharProperty union(final CharProperty lhs, + final CharProperty rhs) { + return new CharProperty() { + boolean isSatisfiedBy(int ch) { + return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}}; + } + + /** + * Returns the set intersection of two CharProperty nodes. + */ + private static CharProperty intersection(final CharProperty lhs, + final CharProperty rhs) { + return new CharProperty() { + boolean isSatisfiedBy(int ch) { + return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}}; + } + + /** + * Returns the set difference of two CharProperty nodes. + */ + private static CharProperty setDifference(final CharProperty lhs, + final CharProperty rhs) { + return new CharProperty() { + boolean isSatisfiedBy(int ch) { + return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}}; + } + + /** + * Handles word boundaries. Includes a field to allow this one class to + * deal with the different types of word boundaries we can match. The word + * characters include underscores, letters, and digits. Non spacing marks + * can are also part of a word if they have a base character, otherwise + * they are ignored for purposes of finding word boundaries. + */ + static final class Bound extends Node { + static int LEFT = 0x1; + static int RIGHT= 0x2; + static int BOTH = 0x3; + static int NONE = 0x4; + int type; + boolean useUWORD; + Bound(int n, boolean useUWORD) { + type = n; + this.useUWORD = useUWORD; + } + + boolean isWord(int ch) { + return useUWORD ? UnicodeProp.WORD.is(ch) + : (ch == '_' || Character.isLetterOrDigit(ch)); + } + + int check(Matcher matcher, int i, CharSequence seq) { + int ch; + boolean left = false; + int startIndex = matcher.from; + int endIndex = matcher.to; + if (matcher.transparentBounds) { + startIndex = 0; + endIndex = matcher.getTextLength(); + } + if (i > startIndex) { + ch = Character.codePointBefore(seq, i); + left = (isWord(ch) || + ((Character.getType(ch) == Character.NON_SPACING_MARK) + && hasBaseCharacter(matcher, i-1, seq))); + } + boolean right = false; + if (i < endIndex) { + ch = Character.codePointAt(seq, i); + right = (isWord(ch) || + ((Character.getType(ch) == Character.NON_SPACING_MARK) + && hasBaseCharacter(matcher, i, seq))); + } else { + // Tried to access char past the end + matcher.hitEnd = true; + // The addition of another char could wreck a boundary + matcher.requireEnd = true; + } + return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE); + } + boolean match(Matcher matcher, int i, CharSequence seq) { + return (check(matcher, i, seq) & type) > 0 + && next.match(matcher, i, seq); + } + } + + /** + * Non spacing marks only count as word characters in bounds calculations + * if they have a base character. + */ + private static boolean hasBaseCharacter(Matcher matcher, int i, + CharSequence seq) + { + int start = (!matcher.transparentBounds) ? + matcher.from : 0; + for (int x=i; x >= start; x--) { + int ch = Character.codePointAt(seq, x); + if (Character.isLetterOrDigit(ch)) + return true; + if (Character.getType(ch) == Character.NON_SPACING_MARK) + continue; + return false; + } + return false; + } + + /** + * Attempts to match a slice in the input using the Boyer-Moore string + * matching algorithm. The algorithm is based on the idea that the + * pattern can be shifted farther ahead in the search text if it is + * matched right to left. + *

+ * The pattern is compared to the input one character at a time, from + * the rightmost character in the pattern to the left. If the characters + * all match the pattern has been found. If a character does not match, + * the pattern is shifted right a distance that is the maximum of two + * functions, the bad character shift and the good suffix shift. This + * shift moves the attempted match position through the input more + * quickly than a naive one position at a time check. + *

+ * The bad character shift is based on the character from the text that + * did not match. If the character does not appear in the pattern, the + * pattern can be shifted completely beyond the bad character. If the + * character does occur in the pattern, the pattern can be shifted to + * line the pattern up with the next occurrence of that character. + *

+ * The good suffix shift is based on the idea that some subset on the right + * side of the pattern has matched. When a bad character is found, the + * pattern can be shifted right by the pattern length if the subset does + * not occur again in pattern, or by the amount of distance to the + * next occurrence of the subset in the pattern. + * + * Boyer-Moore search methods adapted from code by Amy Yu. + */ + static class BnM extends Node { + int[] buffer; + int[] lastOcc; + int[] optoSft; + + /** + * Pre calculates arrays needed to generate the bad character + * shift and the good suffix shift. Only the last seven bits + * are used to see if chars match; This keeps the tables small + * and covers the heavily used ASCII range, but occasionally + * results in an aliased match for the bad character shift. + */ + static Node optimize(Node node) { + if (!(node instanceof Slice)) { + return node; + } + + int[] src = ((Slice) node).buffer; + int patternLength = src.length; + // The BM algorithm requires a bit of overhead; + // If the pattern is short don't use it, since + // a shift larger than the pattern length cannot + // be used anyway. + if (patternLength < 4) { + return node; + } + int i, j, k; + int[] lastOcc = new int[128]; + int[] optoSft = new int[patternLength]; + // Precalculate part of the bad character shift + // It is a table for where in the pattern each + // lower 7-bit value occurs + for (i = 0; i < patternLength; i++) { + lastOcc[src[i]&0x7F] = i + 1; + } + // Precalculate the good suffix shift + // i is the shift amount being considered +NEXT: for (i = patternLength; i > 0; i--) { + // j is the beginning index of suffix being considered + for (j = patternLength - 1; j >= i; j--) { + // Testing for good suffix + if (src[j] == src[j-i]) { + // src[j..len] is a good suffix + optoSft[j-1] = i; + } else { + // No match. The array has already been + // filled up with correct values before. + continue NEXT; + } + } + // This fills up the remaining of optoSft + // any suffix can not have larger shift amount + // then its sub-suffix. Why??? + while (j > 0) { + optoSft[--j] = i; + } + } + // Set the guard value because of unicode compression + optoSft[patternLength-1] = 1; + if (node instanceof SliceS) + return new BnMS(src, lastOcc, optoSft, node.next); + return new BnM(src, lastOcc, optoSft, node.next); + } + BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) { + this.buffer = src; + this.lastOcc = lastOcc; + this.optoSft = optoSft; + this.next = next; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int[] src = buffer; + int patternLength = src.length; + int last = matcher.to - patternLength; + + // Loop over all possible match positions in text +NEXT: while (i <= last) { + // Loop over pattern from right to left + for (int j = patternLength - 1; j >= 0; j--) { + int ch = seq.charAt(i+j); + if (ch != src[j]) { + // Shift search to the right by the maximum of the + // bad character shift and the good suffix shift + i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]); + continue NEXT; + } + } + // Entire pattern matched starting at i + matcher.first = i; + boolean ret = next.match(matcher, i + patternLength, seq); + if (ret) { + matcher.first = i; + matcher.groups[0] = matcher.first; + matcher.groups[1] = matcher.last; + return true; + } + i++; + } + // BnM is only used as the leading node in the unanchored case, + // and it replaced its Start() which always searches to the end + // if it doesn't find what it's looking for, so hitEnd is true. + matcher.hitEnd = true; + return false; + } + boolean study(TreeInfo info) { + info.minLength += buffer.length; + info.maxValid = false; + return next.study(info); + } + } + + /** + * Supplementary support version of BnM(). Unpaired surrogates are + * also handled by this class. + */ + static final class BnMS extends BnM { + int lengthInChars; + + BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) { + super(src, lastOcc, optoSft, next); + for (int x = 0; x < buffer.length; x++) { + lengthInChars += Character.charCount(buffer[x]); + } + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int[] src = buffer; + int patternLength = src.length; + int last = matcher.to - lengthInChars; + + // Loop over all possible match positions in text +NEXT: while (i <= last) { + // Loop over pattern from right to left + int ch; + for (int j = countChars(seq, i, patternLength), x = patternLength - 1; + j > 0; j -= Character.charCount(ch), x--) { + ch = Character.codePointBefore(seq, i+j); + if (ch != src[x]) { + // Shift search to the right by the maximum of the + // bad character shift and the good suffix shift + int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]); + i += countChars(seq, i, n); + continue NEXT; + } + } + // Entire pattern matched starting at i + matcher.first = i; + boolean ret = next.match(matcher, i + lengthInChars, seq); + if (ret) { + matcher.first = i; + matcher.groups[0] = matcher.first; + matcher.groups[1] = matcher.last; + return true; + } + i += countChars(seq, i, 1); + } + matcher.hitEnd = true; + return false; + } + } + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + + /** + * This must be the very first initializer. + */ + static Node accept = new Node(); + + static Node lastAccept = new LastNode(); + + private static class CharPropertyNames { + + static CharProperty charPropertyFor(String name) { + CharPropertyFactory m = map.get(name); + return m == null ? null : m.make(); + } + + private static abstract class CharPropertyFactory { + abstract CharProperty make(); + } + + private static void defCategory(String name, + final int typeMask) { + map.put(name, new CharPropertyFactory() { + CharProperty make() { return new Category(typeMask);}}); + } + + private static void defRange(String name, + final int lower, final int upper) { + map.put(name, new CharPropertyFactory() { + CharProperty make() { return rangeFor(lower, upper);}}); + } + + private static void defCtype(String name, + final int ctype) { + map.put(name, new CharPropertyFactory() { + CharProperty make() { return new Ctype(ctype);}}); + } + + private static abstract class CloneableProperty + extends CharProperty implements Cloneable + { + public CloneableProperty clone() { + try { + return (CloneableProperty) super.clone(); + } catch (CloneNotSupportedException e) { + throw new AssertionError(e); + } + } + } + + private static void defClone(String name, + final CloneableProperty p) { + map.put(name, new CharPropertyFactory() { + CharProperty make() { return p.clone();}}); + } + + private static final HashMap map + = new HashMap<>(); + + static { + // Unicode character property aliases, defined in + // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt + defCategory("Cn", 1<-1 if the index is not known + */ + public PatternSyntaxException(String desc, String regex, int index) { + this.desc = desc; + this.pattern = regex; + this.index = index; + } + + /** + * Retrieves the error index. + * + * @return The approximate index in the pattern of the error, + * or -1 if the index is not known + */ + public int getIndex() { + return index; + } + + /** + * Retrieves the description of the error. + * + * @return The description of the error + */ + public String getDescription() { + return desc; + } + + /** + * Retrieves the erroneous regular-expression pattern. + * + * @return The erroneous pattern + */ + public String getPattern() { + return pattern; + } + + private static final String nl = + java.security.AccessController + .doPrivileged(new GetPropertyAction("line.separator")); + + /** + * Returns a multi-line string containing the description of the syntax + * error and its index, the erroneous regular-expression pattern, and a + * visual indication of the error index within the pattern. + * + * @return The full detail message + */ + public String getMessage() { + StringBuffer sb = new StringBuffer(); + sb.append(desc); + if (index >= 0) { + sb.append(" near index "); + sb.append(index); + } + sb.append(nl); + sb.append(pattern); + if (index >= 0) { + sb.append(nl); + for (int i = 0; i < index; i++) sb.append(' '); + sb.append('^'); + } + return sb.toString(); + } + +} diff -r 588d5bf7a560 -r bca65655b36b rt/emul/compact/src/main/java/java/util/regex/UnicodeProp.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rt/emul/compact/src/main/java/java/util/regex/UnicodeProp.java Mon Oct 07 16:13:27 2013 +0200 @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package java.util.regex; + +import java.util.HashMap; +import java.util.Locale; + +enum UnicodeProp { + + ALPHABETIC { + public boolean is(int ch) { + return Character.isAlphabetic(ch); + } + }, + + LETTER { + public boolean is(int ch) { + return Character.isLetter(ch); + } + }, + + IDEOGRAPHIC { + public boolean is(int ch) { + return Character.isIdeographic(ch); + } + }, + + LOWERCASE { + public boolean is(int ch) { + return Character.isLowerCase(ch); + } + }, + + UPPERCASE { + public boolean is(int ch) { + return Character.isUpperCase(ch); + } + }, + + TITLECASE { + public boolean is(int ch) { + return Character.isTitleCase(ch); + } + }, + + WHITE_SPACE { + // \p{Whitespace} + public boolean is(int ch) { + return ((((1 << Character.SPACE_SEPARATOR) | + (1 << Character.LINE_SEPARATOR) | + (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1) + != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85); + } + }, + + CONTROL { + // \p{gc=Control} + public boolean is(int ch) { + return Character.getType(ch) == Character.CONTROL; + } + }, + + PUNCTUATION { + // \p{gc=Punctuation} + public boolean is(int ch) { + return ((((1 << Character.CONNECTOR_PUNCTUATION) | + (1 << Character.DASH_PUNCTUATION) | + (1 << Character.START_PUNCTUATION) | + (1 << Character.END_PUNCTUATION) | + (1 << Character.OTHER_PUNCTUATION) | + (1 << Character.INITIAL_QUOTE_PUNCTUATION) | + (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1) + != 0; + } + }, + + HEX_DIGIT { + // \p{gc=Decimal_Number} + // \p{Hex_Digit} -> PropList.txt: Hex_Digit + public boolean is(int ch) { + return DIGIT.is(ch) || + (ch >= 0x0030 && ch <= 0x0039) || + (ch >= 0x0041 && ch <= 0x0046) || + (ch >= 0x0061 && ch <= 0x0066) || + (ch >= 0xFF10 && ch <= 0xFF19) || + (ch >= 0xFF21 && ch <= 0xFF26) || + (ch >= 0xFF41 && ch <= 0xFF46); + } + }, + + ASSIGNED { + public boolean is(int ch) { + return Character.getType(ch) != Character.UNASSIGNED; + } + }, + + NONCHARACTER_CODE_POINT { + // PropList.txt:Noncharacter_Code_Point + public boolean is(int ch) { + return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef); + } + }, + + DIGIT { + // \p{gc=Decimal_Number} + public boolean is(int ch) { + return Character.isDigit(ch); + } + }, + + ALNUM { + // \p{alpha} + // \p{digit} + public boolean is(int ch) { + return ALPHABETIC.is(ch) || DIGIT.is(ch); + } + }, + + BLANK { + // \p{Whitespace} -- + // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85 + // \p{gc=Line_Separator} + // \p{gc=Paragraph_Separator}] + public boolean is(int ch) { + return Character.getType(ch) == Character.SPACE_SEPARATOR || + ch == 0x9; // \N{HT} + } + }, + + GRAPH { + // [^ + // \p{space} + // \p{gc=Control} + // \p{gc=Surrogate} + // \p{gc=Unassigned}] + public boolean is(int ch) { + return ((((1 << Character.SPACE_SEPARATOR) | + (1 << Character.LINE_SEPARATOR) | + (1 << Character.PARAGRAPH_SEPARATOR) | + (1 << Character.CONTROL) | + (1 << Character.SURROGATE) | + (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1) + == 0; + } + }, + + PRINT { + // \p{graph} + // \p{blank} + // -- \p{cntrl} + public boolean is(int ch) { + return (GRAPH.is(ch) || BLANK.is(ch)) && !CONTROL.is(ch); + } + }, + + WORD { + // \p{alpha} + // \p{gc=Mark} + // \p{digit} + // \p{gc=Connector_Punctuation} + + public boolean is(int ch) { + return ALPHABETIC.is(ch) || + ((((1 << Character.NON_SPACING_MARK) | + (1 << Character.ENCLOSING_MARK) | + (1 << Character.COMBINING_SPACING_MARK) | + (1 << Character.DECIMAL_DIGIT_NUMBER) | + (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1) + != 0; + } + }; + + private final static HashMap posix = new HashMap<>(); + private final static HashMap aliases = new HashMap<>(); + static { + posix.put("ALPHA", "ALPHABETIC"); + posix.put("LOWER", "LOWERCASE"); + posix.put("UPPER", "UPPERCASE"); + posix.put("SPACE", "WHITE_SPACE"); + posix.put("PUNCT", "PUNCTUATION"); + posix.put("XDIGIT","HEX_DIGIT"); + posix.put("ALNUM", "ALNUM"); + posix.put("CNTRL", "CONTROL"); + posix.put("DIGIT", "DIGIT"); + posix.put("BLANK", "BLANK"); + posix.put("GRAPH", "GRAPH"); + posix.put("PRINT", "PRINT"); + + aliases.put("WHITESPACE", "WHITE_SPACE"); + aliases.put("HEXDIGIT","HEX_DIGIT"); + aliases.put("NONCHARACTERCODEPOINT", "NONCHARACTER_CODE_POINT"); + } + + public static UnicodeProp forName(String propName) { + propName = propName.toUpperCase(Locale.ENGLISH); + String alias = aliases.get(propName); + if (alias != null) + propName = alias; + try { + return valueOf (propName); + } catch (IllegalArgumentException x) {} + return null; + } + + public static UnicodeProp forPOSIXName(String propName) { + propName = posix.get(propName.toUpperCase(Locale.ENGLISH)); + if (propName == null) + return null; + return valueOf (propName); + } + + public abstract boolean is(int ch); +} diff -r 588d5bf7a560 -r bca65655b36b rt/emul/compact/src/main/java/java/util/regex/package.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rt/emul/compact/src/main/java/java/util/regex/package.html Mon Oct 07 16:13:27 2013 +0200 @@ -0,0 +1,66 @@ + + + + + + + + +Classes for matching character sequences against patterns specified by regular +expressions. + +

An instance of the {@link java.util.regex.Pattern} class represents a +regular expression that is specified in string form in a syntax similar to +that used by Perl. + +

Instances of the {@link java.util.regex.Matcher} class are used to match +character sequences against a given pattern. Input is provided to matchers via +the {@link java.lang.CharSequence} interface in order to support matching +against characters from a wide variety of input sources.

+ +

Unless otherwise noted, passing a null argument to a method +in any class or interface in this package will cause a +{@link java.lang.NullPointerException NullPointerException} to be thrown. + +

Related Documentation

+ +

An excellent tutorial and overview of regular expressions is Mastering Regular +Expressions, Jeffrey E. F. Friedl, O'Reilly and Associates, 1997.

+ + + +@since 1.4 +@author Mike McCloskey +@author Mark Reinhold + + +