Automaton
.
Regular expressions are built from the following abstract syntax:
regexp        ::=  unionexp
                |
unionexp      ::=  interexp | unionexp          (union)
                |  interexp
interexp      ::=  concatexp & interexp         (intersection)  [OPTIONAL]
                |  concatexp
concatexp     ::=  repeatexp concatexp          (concatenation)
                |  repeatexp
repeatexp     ::=  repeatexp ?                  (zero or one occurrence)
                |  repeatexp *                  (zero or more occurrences)
                |  repeatexp +                  (one or more occurrences)
                |  repeatexp {n}                (n occurrences)
                |  repeatexp {n,}               (n or more occurrences)
                |  repeatexp {n,m}              (n to m occurrences, including both)
                |  complexp
complexp      ::=  ~ complexp                   (complement)  [OPTIONAL]
                |  charclassexp
charclassexp  ::=  [ charclasses ]              (character class)
                |  [^ charclasses ]             (negated character class)
                |  simpleexp
charclasses   ::=  charclass charclasses
                |  charclass
charclass     ::=  charexp - charexp            (character range, including end-points)
                |  charexp
simpleexp     ::=  charexp
                |  .                            (any single character)
                |  #                            (the empty language)  [OPTIONAL]
                |  @                            (any string)  [OPTIONAL]
                |  " <Unicode string without double-quotes> "   (a string)
                |  ( )                          (the empty string)
                |  ( unionexp )                 (precedence override)
                |  < <identifier> >             (named automaton)  [OPTIONAL]
                |  <n-m>                        (numerical interval)  [OPTIONAL]
charexp       ::=  <Unicode character>          (a single non-reserved character)
                |  \d                           (a digit [0-9])
                |  \D                           (a non-digit [^0-9])
                |  \s                           (whitespace [ \t\n\r])
                |  \S                           (non whitespace [^\s])
                |  \w                           (a word character [a-zA-Z_0-9])
                |  \W                           (a non word character [^\w])
                |  \ <Unicode character>        (a single character)
The productions marked [OPTIONAL] are only allowed if specified by the syntax flags passed to the RegExp constructor. The reserved characters used in the (enabled) syntax must be escaped with backslash (\) or double-quotes ("..."). (In contrast to other regexp syntaxes, this is required also in character classes.) Be aware that dash (-) has a special meaning in charclass expressions. An identifier is a string not containing right angle bracket (>) or dash (-). Numerical intervals are specified by non-negative decimal integers and include both end points, and if n and m have the same number of digits, then the conforming strings must have that length (i.e. prefixed by 0's).
-
Nested Class Summary
Nested ClassesModifier and TypeClassDescriptionstatic enum
The type of expression represented by a RegExp node. -
Field Summary
FieldsModifier and TypeFieldDescriptionstatic final int
Syntax flag, enables all optional regexp syntax.static final int
Syntax flag, enables anystring (@
).static final int
Allows case insensitive matching of ASCII characters.static final int
Syntax flag, enables named automata (<
identifier>
).final int
Character expressionstatic final int
Syntax flag, enables complement (~
).final int
Limits for repeatable type expressionsstatic final int
Syntax flag, enables empty language (#
).final RegExp
Child expressions held by a container type expressionfinal RegExp
Child expressions held by a container type expression(package private) final int
final int
Extents for range type expressionsstatic final int
Syntax flag, enables intersection (&
).static final int
Syntax flag, enables numerical intervals (<n-m>
).final RegExp.Kind
The type of expressionfinal int
Limits for repeatable type expressionsfinal int
Limits for repeatable type expressionsstatic final int
Syntax flag, enables no optional regexp syntax.private final String
(package private) int
final String
String expressionfinal int
Extents for range type expressions -
Constructor Summary
ConstructorsConstructorDescriptionRegExp
(int flags, RegExp.Kind kind, RegExp exp1, RegExp exp2, String s, int c, int min, int max, int digits, int from, int to) Constructs newRegExp
from a string.Constructs newRegExp
from a string.Constructs newRegExp
from a string. -
Method Summary
Modifier and TypeMethodDescriptionprivate boolean
check
(int flag) (package private) RegExp
private void
findLeaves
(RegExp exp, RegExp.Kind kind, List<Automaton> list, Map<String, Automaton> automata, AutomatonProvider automaton_provider, int determinizeWorkLimit) Returns set of automaton identifiers that occur in this regular expression.(package private) void
getIdentifiers
(Set<String> set) The string that was used to construct the regex.(package private) static RegExp
makeAnyChar
(int flags) (package private) static RegExp
makeAnyString
(int flags) (package private) static RegExp
makeAutomaton
(int flags, String s) (package private) static RegExp
makeChar
(int flags, int c) (package private) static RegExp
makeCharRange
(int flags, int from, int to) (package private) static RegExp
makeComplement
(int flags, RegExp exp) (package private) static RegExp
makeConcatenation
(int flags, RegExp exp1, RegExp exp2) (package private) static RegExp
makeEmpty
(int flags) (package private) static RegExp
makeIntersection
(int flags, RegExp exp1, RegExp exp2) (package private) static RegExp
makeInterval
(int flags, int min, int max, int digits) (package private) static RegExp
makeOptional
(int flags, RegExp exp) (package private) static RegExp
makeRepeat
(int flags, RegExp exp) (package private) static RegExp
makeRepeat
(int flags, RegExp exp, int min) (package private) static RegExp
makeRepeat
(int flags, RegExp exp, int min, int max) (package private) static RegExp
makeString
(int flags, String s) private static RegExp
makeString
(int flags, RegExp exp1, RegExp exp2) (package private) static RegExp
private boolean
match
(int c) (package private) final RegExp
private boolean
more()
(package private) static RegExp
newContainerNode
(int flags, RegExp.Kind kind, RegExp exp1, RegExp exp2) (package private) static RegExp
newLeafNode
(int flags, RegExp.Kind kind, String s, int c, int min, int max, int digits, int from, int to) (package private) static RegExp
newRepeatingNode
(int flags, RegExp.Kind kind, RegExp exp, int min, int max) private int
next()
(package private) final RegExp
(package private) final RegExp
(package private) final RegExp
(package private) final int
(package private) final RegExp
(package private) final RegExp
(package private) final RegExp
(package private) final RegExp
(package private) final RegExp
(package private) final RegExp
private boolean
Constructs newAutomaton
from thisRegExp
.toAutomaton
(int determinizeWorkLimit) Constructs newAutomaton
from thisRegExp
.toAutomaton
(Map<String, Automaton> automata, int determinizeWorkLimit) Constructs newAutomaton
from thisRegExp
.private Automaton
toAutomaton
(Map<String, Automaton> automata, AutomatonProvider automaton_provider, int determinizeWorkLimit) toAutomaton
(AutomatonProvider automaton_provider, int determinizeWorkLimit) Constructs newAutomaton
from thisRegExp
.private Automaton
toAutomatonInternal
(Map<String, Automaton> automata, AutomatonProvider automaton_provider, int determinizeWorkLimit) private Automaton
toCaseInsensitiveChar
(int codepoint, int determinizeWorkLimit) private Automaton
toCaseInsensitiveString
(int determinizeWorkLimit) toString()
Constructs string from parsed regular expression.(package private) void
Like toString, but more verbose (shows the hierarchy more clearly).(package private) void
toStringTree
(StringBuilder b, String indent)
-
Field Details
-
INTERSECTION
public static final int INTERSECTIONSyntax flag, enables intersection (&
).- See Also:
-
COMPLEMENT
public static final int COMPLEMENTSyntax flag, enables complement (~
).- See Also:
-
EMPTY
public static final int EMPTYSyntax flag, enables empty language (#
).- See Also:
-
ANYSTRING
public static final int ANYSTRINGSyntax flag, enables anystring (@
).- See Also:
-
AUTOMATON
public static final int AUTOMATONSyntax flag, enables named automata (<
identifier>
).- See Also:
-
INTERVAL
public static final int INTERVALSyntax flag, enables numerical intervals (<n-m>
).- See Also:
-
ALL
public static final int ALLSyntax flag, enables all optional regexp syntax.- See Also:
-
NONE
public static final int NONESyntax flag, enables no optional regexp syntax.- See Also:
-
ASCII_CASE_INSENSITIVE
public static final int ASCII_CASE_INSENSITIVEAllows case insensitive matching of ASCII characters.- See Also:
-
kind
The type of expression -
exp1
Child expressions held by a container type expression -
exp2
Child expressions held by a container type expression -
s
String expression -
c
public final int cCharacter expression -
min
public final int minLimits for repeatable type expressions -
max
public final int maxLimits for repeatable type expressions -
digits
public final int digitsLimits for repeatable type expressions -
from
public final int fromExtents for range type expressions -
to
public final int toExtents for range type expressions -
originalString
-
flags
final int flags -
pos
int pos
-
-
Constructor Details
-
RegExp
Constructs new RegExp from a string. Same as RegExp(s, ALL).
- Parameters:
s - regexp string
- Throws:
IllegalArgumentException - if an error occurred while parsing the regular expression
-
RegExp
Constructs new RegExp from a string.
- Parameters:
s - regexp string
syntax_flags - boolean 'or' of optional syntax constructs to be enabled
- Throws:
IllegalArgumentException - if an error occurred while parsing the regular expression
-
RegExp
Constructs new RegExp from a string.
- Parameters:
s - regexp string
syntax_flags - boolean 'or' of optional syntax constructs to be enabled
match_flags - boolean 'or' of match behavior options such as case insensitivity
- Throws:
IllegalArgumentException - if an error occurred while parsing the regular expression
-
RegExp
RegExp(int flags, RegExp.Kind kind, RegExp exp1, RegExp exp2, String s, int c, int min, int max, int digits, int from, int to)
-
-
Method Details
-
newContainerNode
-
newRepeatingNode
-
newLeafNode
static RegExp newLeafNode(int flags, RegExp.Kind kind, String s, int c, int min, int max, int digits, int from, int to) -
toAutomaton
Constructs new Automaton from this RegExp. Same as toAutomaton(null) (empty automaton map). -
toAutomaton
public Automaton toAutomaton(int determinizeWorkLimit) throws IllegalArgumentException, TooComplexToDeterminizeException
Constructs new Automaton from this RegExp. The constructed automaton is minimal and deterministic and has no transitions to dead states.
- Parameters:
determinizeWorkLimit - maximum effort to spend while determinizing the automata. If determinizing the automata would require more than this effort, TooComplexToDeterminizeException is thrown. Higher numbers require more space but can process more complex regexes. Use Operations.DEFAULT_DETERMINIZE_WORK_LIMIT as a decent default if you don't otherwise know what to specify.
- Throws:
IllegalArgumentException - if this regular expression uses a named identifier that is not available from the automaton provider
TooComplexToDeterminizeException - if determinizing this regexp requires more effort than determinizeWorkLimit allows
-
toAutomaton
public Automaton toAutomaton(AutomatonProvider automaton_provider, int determinizeWorkLimit) throws IllegalArgumentException, TooComplexToDeterminizeException
Constructs new Automaton from this RegExp. The constructed automaton is minimal and deterministic and has no transitions to dead states.
- Parameters:
automaton_provider - provider of automata for named identifiers
determinizeWorkLimit - maximum effort to spend while determinizing the automata. If determinizing the automata would require more than this effort, TooComplexToDeterminizeException is thrown. Higher numbers require more space but can process more complex regexes. Use Operations.DEFAULT_DETERMINIZE_WORK_LIMIT as a decent default if you don't otherwise know what to specify.
- Throws:
IllegalArgumentException - if this regular expression uses a named identifier that is not available from the automaton provider
TooComplexToDeterminizeException - if determinizing this regexp requires more effort than determinizeWorkLimit allows
-
toAutomaton
public Automaton toAutomaton(Map<String, Automaton> automata, int determinizeWorkLimit) throws IllegalArgumentException, TooComplexToDeterminizeException
Constructs new Automaton from this RegExp. The constructed automaton is minimal and deterministic and has no transitions to dead states.
- Parameters:
automata - a map from automaton identifiers to automata (of type Automaton).
determinizeWorkLimit - maximum effort to spend while determinizing the automata. If determinizing the automata would require more than this effort, TooComplexToDeterminizeException is thrown. Higher numbers require more space but can process more complex regexes.
- Throws:
IllegalArgumentException - if this regular expression uses a named identifier that does not occur in the automaton map
TooComplexToDeterminizeException - if determinizing this regexp requires more effort than determinizeWorkLimit allows
-
toAutomaton
private Automaton toAutomaton(Map<String, Automaton> automata, AutomatonProvider automaton_provider, int determinizeWorkLimit) throws IllegalArgumentException, TooComplexToDeterminizeException -
toAutomatonInternal
private Automaton toAutomatonInternal(Map<String, Automaton> automata, AutomatonProvider automaton_provider, int determinizeWorkLimit) throws IllegalArgumentException- Throws:
IllegalArgumentException
-
toCaseInsensitiveChar
-
toCaseInsensitiveString
-
findLeaves
private void findLeaves(RegExp exp, RegExp.Kind kind, List<Automaton> list, Map<String, Automaton> automata, AutomatonProvider automaton_provider, int determinizeWorkLimit) -
getOriginalString
The string that was used to construct the regex. Compare to toString. -
toString
Constructs string from parsed regular expression. -
toStringBuilder
-
toStringTree
Like toString, but more verbose (shows the hierarchy more clearly). -
toStringTree
-
getIdentifiers
Returns set of automaton identifiers that occur in this regular expression. -
getIdentifiers
-
makeUnion
-
makeConcatenation
-
makeString
-
makeIntersection
-
makeOptional
-
makeRepeat
-
makeRepeat
-
makeRepeat
-
makeComplement
-
makeChar
-
makeCharRange
-
makeAnyChar
-
makeEmpty
-
makeString
-
makeAnyString
-
makeAutomaton
-
makeInterval
-
peek
-
match
private boolean match(int c) -
more
private boolean more() -
next
- Throws:
IllegalArgumentException
-
check
private boolean check(int flag) -
parseUnionExp
- Throws:
IllegalArgumentException
-
parseInterExp
- Throws:
IllegalArgumentException
-
parseConcatExp
- Throws:
IllegalArgumentException
-
parseRepeatExp
- Throws:
IllegalArgumentException
-
parseComplExp
- Throws:
IllegalArgumentException
-
parseCharClassExp
- Throws:
IllegalArgumentException
-
parseCharClasses
- Throws:
IllegalArgumentException
-
parseCharClass
- Throws:
IllegalArgumentException
-
expandPredefined
RegExp expandPredefined() -
matchPredefinedCharacterClass
-
parseSimpleExp
- Throws:
IllegalArgumentException
-
parseCharExp
- Throws:
IllegalArgumentException
-