blob: d8528378e7974b86918ca19cfb020f6f7f9f471a [file] [log] [blame]
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#define NLBLOCK cb /* Block containing newline information */
#define PSSTART start_pattern /* Field containing processed string start */
#define PSEND end_pattern /* Field containing processed string end */
#include "pcre2_internal.h"
/* In rare error cases debugging might require calling pcre2_printint(). */
#if 0
#ifdef EBCDIC
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
#else
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
#endif
#include "pcre2_printint.c"
#define CALL_PRINTINT
#endif
/* There are a few things that vary with different code unit sizes. Handle them
by defining macros in order to minimize #if usage. */
/* STRING_UTFn_RIGHTPAR expands to TWO comma-separated initializer values: a
width-specific name string macro followed by a number. The pso_list table
uses it to fill both the name and the length fields of one entry, so the number
is the length of that name (presumably "UTF8)", "UTF16)" or "UTF32)" - confirm
against the STRING_UTFxx_RIGHTPAR definitions). */
#if PCRE2_CODE_UNIT_WIDTH == 8
#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
/* In the 8-bit library the table lookup is done directly, with no range
check. */
#define XDIGIT(c) xdigitab[c]
#else /* Either 16-bit or 32-bit */
/* For wider code units the table is consulted only when MAX_255(c) holds
(presumably "c is no greater than 255" - confirm in the internal header);
otherwise the result is 0xff, the table's value for "not a hex digit". */
#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
#if PCRE2_CODE_UNIT_WIDTH == 16
#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
#else /* 32-bit */
#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
#endif
#endif
/* Forward declarations (prototypes only) to allow mutual recursion. Both
functions are static, so their definitions must appear elsewhere in this
translation unit. */
static int
add_list_to_class(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *,
const uint32_t *, unsigned int);
static BOOL
compile_regex(uint32_t, PCRE2_UCHAR **, PCRE2_SPTR *, int *, BOOL, BOOL,
uint32_t, int, uint32_t *, int32_t *, uint32_t *, int32_t *,
branch_chain *, compile_block *, size_t *);
/*************************************************
* Code parameters and static tables *
*************************************************/
/* This value specifies the size of stack workspace, which is used in different
ways in the different pattern scans. The group-identifying pre-scan uses it to
handle nesting, and needs it to be 16-bit aligned.
During the first compiling phase, when determining how much memory is required,
the regex is partly compiled into this space, but the compiled parts are
discarded as soon as they can be, so that hopefully there will never be an
overrun. The code does, however, check for an overrun, which can occur for
pathological patterns. The size of the workspace depends on LINK_SIZE because
the length of compiled items varies with this.
In the real compile phase, the workspace is used for remembering data about
numbered groups, provided there are not too many of them (if there are, extra
memory is acquired). For this phase the memory must be 32-bit aligned. Having
defined the size in code units, we set up C32_WORK_SIZE as the number of
elements in the 32-bit vector. */
#define COMPILE_WORK_SIZE (2048*LINK_SIZE) /* Size in code units */
#define C32_WORK_SIZE \
((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint32_t))
/* The overrun tests check for a slightly smaller size so that they detect the
overrun before it actually does run off the end of the data block. */
#define WORK_SIZE_SAFETY_MARGIN (100)
/* This value determines the size of the initial vector that is used for
remembering named groups during the pre-compile. It is allocated on the stack,
but if it is too small, it is expanded, in a similar way to the workspace. The
value is the number of slots in the list. */
#define NAMED_GROUP_LIST_SIZE 20
/* The original PCRE required patterns to be zero-terminated, and it simplifies
the compiling code if it is guaranteed that there is a zero code unit at the
end of the pattern, because this means that tests for coding sequences such as
(*SKIP) or even just (?<= can check a sequence of code units without having to
keep checking for the end of the pattern. The new PCRE2 API allows zero code
units within patterns if a positive length is given, but in order to keep most
of the compiling code as it was, we copy such patterns and add a zero on the
end. This value determines the size of space on the stack that is used if the
pattern fits; if not, heap memory is used. */
#define COPIED_PATTERN_SIZE 1024
/* Maximum length value to check against when making sure that the variable
that holds the compiled pattern length does not overflow. We make it a bit less
than INT_MAX to allow for adding in group terminating bytes, so that we don't
have to check them every time. */
#define OFLOW_MAX (INT_MAX - 20)
/* Macro for setting individual bits in class bitmaps. It sets bit number (b)
in the bitmap (a), which is addressed in 8-bit elements: element (b)/8, bit
(b)&7, counting from the least significant bit. The array argument and the
whole expansion are parenthesized so that any pointer expression can be passed
and the macro can be used safely inside a larger expression; the shifted
constant is unsigned so that the shift never involves a sign bit. Note that (b)
is evaluated twice, so it must not have side effects. */
#define SETBIT(a,b) ((a)[(b)/8] |= (1u << ((b)&7)))
/* Private flags added to firstcu and reqcu. */
#define REQ_CASELESS (1 << 0) /* Indicates caselessness */
#define REQ_VARY (1 << 1) /* reqcu followed non-literal item */
/* Negative values for the firstcu and reqcu flags */
#define REQ_UNSET (-2) /* Not yet found anything */
#define REQ_NONE (-1) /* Found not fixed char */
/* These flags are used in the groupinfo vector. */
#define GI_SET_COULD_BE_EMPTY 0x80000000u
#define GI_COULD_BE_EMPTY 0x40000000u
#define GI_NOT_FIXED_LENGTH 0x20000000u
#define GI_SET_FIXED_LENGTH 0x10000000u
#define GI_FIXED_LENGTH_MASK 0x0000ffffu
/* This bit (which is greater than any UTF value) is used to indicate that a
variable contains a number of code units instead of an actual code point. */
#define UTF_LENGTH 0x10000000l
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
and is fast (a good compiler can turn it into a subtraction and unsigned
comparison). */
#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
/* Table to identify hex digits. The tables in chartables are dependent on the
locale, and may mark arbitrary characters as digits. We want to recognize only
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
costs 256 bytes, but it is a lot faster than doing character value tests (at
least in some simple cases I timed), and in some applications one wants PCRE to
compile efficiently as well as match efficiently. The value in the table is
the binary hex digit value, or 0xff for non-hex digits. */
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
UTF-8 mode. */
#ifndef EBCDIC
/* 256 entries, indexed by character value. Each row comment gives the range of
characters (or character codes) that the row covers. */
static const uint8_t xdigitab[] =
{
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - ' */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ( - / */
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 */
0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 8 - ? */
0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* @ - G */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H - O */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* P - W */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* X - _ */
0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* ` - g */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h - o */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* p - w */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* x -127 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
#else
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
/* 256 entries, indexed by EBCDIC character value. Each row comment gives the
range of characters (or character codes) that the row covers; on every second
row it ends with the hexadecimal value of the row's first index (0, 10, 20,
... F0). */
static const uint8_t xdigitab[] =
{
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 0 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 10 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 32- 39 20 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40- 47 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 48- 55 30 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 56- 63 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - 71 40 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 72- | */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* & - 87 50 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 88- 95 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - -103 60 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ? */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- " */
0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g 80 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h -143 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p 90 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* q -159 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x A0 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* y -175 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ^ -183 B0 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* { - G C0 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H -207 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* } - P D0 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Q -223 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* \ - X E0 */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Y -239 */
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 F0 */
0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/* 8 -255 */
#endif /* EBCDIC */
/* Table for handling alphanumeric escaped characters. Positive returns are
simple data values; negative values are for special things like \d and so on.
Zero means further processing is needed (for things like \x), or the escape is
invalid. */
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
in UTF-8 mode. It runs from '0' to 'z'. */
#ifndef EBCDIC
#define ESCAPES_FIRST CHAR_0
#define ESCAPES_LAST CHAR_z
#define UPPER_CASE(c) (c-32)
/* Entries are in ASCII order starting at '0', two per line. The comment at the
end of each line names the characters that the line's entries belong to. */
static const short int escapes[] = {
0, 0, /* '0' '1' */
0, 0, /* '2' '3' */
0, 0, /* '4' '5' */
0, 0, /* '6' '7' */
0, 0, /* '8' '9' */
CHAR_COLON, CHAR_SEMICOLON, /* ':' ';' */
CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* '<' '=' */
CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* '>' '?' */
CHAR_COMMERCIAL_AT, -ESC_A, /* '@' 'A' */
-ESC_B, -ESC_C, /* 'B' 'C' */
-ESC_D, -ESC_E, /* 'D' 'E' */
0, -ESC_G, /* 'F' 'G' */
-ESC_H, 0, /* 'H' 'I' */
0, -ESC_K, /* 'J' 'K' */
0, 0, /* 'L' 'M' */
-ESC_N, 0, /* 'N' 'O' */
-ESC_P, -ESC_Q, /* 'P' 'Q' */
-ESC_R, -ESC_S, /* 'R' 'S' */
0, 0, /* 'T' 'U' */
-ESC_V, -ESC_W, /* 'V' 'W' */
-ESC_X, 0, /* 'X' 'Y' */
-ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* 'Z' '[' */
CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* '\' ']' */
CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* '^' '_' */
CHAR_GRAVE_ACCENT, ESC_a, /* '`' 'a' */
-ESC_b, 0, /* 'b' 'c' */
-ESC_d, ESC_e, /* 'd' 'e' */
ESC_f, 0, /* 'f' 'g' */
-ESC_h, 0, /* 'h' 'i' */
0, -ESC_k, /* 'j' 'k' */
0, 0, /* 'l' 'm' */
ESC_n, 0, /* 'n' 'o' */
-ESC_p, 0, /* 'p' 'q' */
ESC_r, -ESC_s, /* 'r' 's' */
ESC_tee, 0, /* 't' 'u' */
-ESC_v, -ESC_w, /* 'v' 'w' */
0, 0, /* 'x' 'y' */
-ESC_z /* 'z' */
};
#else
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
because it is defined as 'a', which of course picks up the ASCII value. */
#if 'a' == 0x81 /* Check for a real EBCDIC environment */
#define ESCAPES_FIRST CHAR_a
#define ESCAPES_LAST CHAR_9
#define UPPER_CASE(c) (c+64)
#else /* Testing in an ASCII environment */
#define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */
#define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */
#define UPPER_CASE(c) (c-32)
#endif
static const short int escapes[] = {
/* 80 */ ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
/* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p,
/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
/* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
/* F8 */ 0, 0
};
/* We also need a table of characters that may follow \c in an EBCDIC
environment for characters 0-31. */
static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
#endif /* EBCDIC */
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
searched linearly. Put all the names into a single string, in order to reduce
the number of relocations when a shared library is dynamically linked. The
string is built from string macros so that it works in UTF-8 mode on EBCDIC
platforms. */
/* One entry of the verb table. The name itself is not stored here; only its
length is, and the names live in the verbnames string below. */
typedef struct verbitem {
int len; /* Length of verb name */
int op; /* Op when no arg, or -1 if arg mandatory */
int op_arg; /* Op when arg present, or -1 if not allowed */
} verbitem;
/* All the verb names in one string. NOTE(review): each STRING_xxx0 macro
presumably supplies the name followed by a zero terminator, and the last name
relies on the string literal's own terminator - confirm in the internal
header. */
static const char verbnames[] =
"\0" /* Empty name is a shorthand for MARK */
STRING_MARK0
STRING_ACCEPT0
STRING_COMMIT0
STRING_F0
STRING_FAIL0
STRING_PRUNE0
STRING_SKIP0
STRING_THEN;
/* The entries are in the same order as the names in verbnames; each len value
is the length of the corresponding name. */
static const verbitem verbs[] = {
{ 0, -1, OP_MARK }, /* empty name: argument mandatory */
{ 4, -1, OP_MARK }, /* MARK: argument mandatory */
{ 6, OP_ACCEPT, -1 }, /* ACCEPT: no argument allowed */
{ 6, OP_COMMIT, -1 }, /* COMMIT: no argument allowed */
{ 1, OP_FAIL, -1 }, /* F: no argument allowed */
{ 4, OP_FAIL, -1 }, /* FAIL: no argument allowed */
{ 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE: argument optional */
{ 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP: argument optional */
{ 4, OP_THEN, OP_THEN_ARG } /* THEN: argument optional */
};
/* Number of entries in verbs[]. */
static const int verbcount = sizeof(verbs)/sizeof(verbitem);
/* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
another regex library. */
/* Replacement for [[:<:]]: the zero-terminated pattern text \b(?=\w) */
static const PCRE2_UCHAR sub_start_of_word[] = {
CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
/* Replacement for [[:>:]]: the zero-terminated pattern text \b(?<=\w) */
static const PCRE2_UCHAR sub_end_of_word[] = {
CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
CHAR_RIGHT_PARENTHESIS, '\0' };
/* Tables of names of POSIX character classes and their lengths. The names are
now all in a single string, to reduce the number of relocations when a shared
library is dynamically loaded. The list of lengths is terminated by a zero
length entry. The first three must be alpha, lower, upper, as this is assumed
for handling case independence. The indices for graph, print, and punct are
needed, so identify them. */
/* Class names in index order: 0 alpha, 1 lower, 2 upper, 3 alnum, 4 ascii,
5 blank, 6 cntrl, 7 digit, 8 graph, 9 print, 10 punct, 11 space, 12 word,
13 xdigit. */
static const char posix_names[] =
STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
STRING_word0 STRING_xdigit;
/* One length per name, in the same order: twelve 5-character names, then
"word" (4) and "xdigit" (6), then the terminating zero. */
static const uint8_t posix_name_lengths[] = {
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
/* Positions of graph, print and punct in the two lists above. */
#define PC_GRAPH 8
#define PC_PRINT 9
#define PC_PUNCT 10
/* Table of class bit maps for each POSIX class. Each class is formed from a
base map, with an optional addition or removal of another map. Then, for some
classes, there is some additional tweaking: for [:blank:] the vertical space
characters are removed, and for [:alpha:] and [:alnum:] the underscore
character is removed. The triples in the table consist of the base map offset,
second map offset or -1 if no second map, and a non-negative value for map
addition or a negative value for map subtraction (if there are two maps). The
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
remove vertical space characters, 2 => remove underscore. */
/* One triple per class, in the order of the POSIX class names: base map,
second map (or -1), and tweak code whose sign selects addition (non-negative)
or subtraction (negative) of the second map. */
static const int posix_class_maps[] = {
cbit_word, cbit_digit, -2, /* alpha - word minus digit, without underscore */
cbit_lower, -1, 0, /* lower */
cbit_upper, -1, 0, /* upper */
cbit_word, -1, 2, /* alnum - word without underscore */
cbit_print, cbit_cntrl, 0, /* ascii - print plus cntrl */
cbit_space, -1, 1, /* blank - a GNU extension; space without vertical space */
cbit_cntrl, -1, 0, /* cntrl */
cbit_digit, -1, 0, /* digit */
cbit_graph, -1, 0, /* graph */
cbit_print, -1, 0, /* print */
cbit_punct, -1, 0, /* punct */
cbit_space, -1, 0, /* space */
cbit_word, -1, 0, /* word - a Perl extension */
cbit_xdigit,-1, 0 /* xdigit */
};
/* Table of substitutes for \d etc when PCRE2_UCP is set. They are replaced by
Unicode property escapes. */
#ifdef SUPPORT_UNICODE
static const PCRE2_UCHAR string_PNd[] = {
CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_pNd[] = {
CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_PXsp[] = {
CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_pXsp[] = {
CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_PXwd[] = {
CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_pXwd[] = {
CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static PCRE2_SPTR substitutes[] = {
string_PNd, /* \D */
string_pNd, /* \d */
string_PXsp, /* \S */ /* Xsp is Perl space, but from 8.34, Perl */
string_pXsp, /* \s */ /* space and POSIX space are the same. */
string_PXwd, /* \W */
string_pXwd /* \w */
};
/* The POSIX class substitutes must be in the order of the POSIX class names,
defined above, and there are both positive and negative cases. NULL means no
general substitute of a Unicode property escape (\p or \P). However, for some
POSIX classes (e.g. graph, print, punct) a special property code is compiled
directly. */
static const PCRE2_UCHAR string_pCc[] = {
CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_pL[] = {
CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_pLl[] = {
CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_pLu[] = {
CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_pXan[] = {
CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_h[] = {
CHAR_BACKSLASH, CHAR_h, '\0' };
static const PCRE2_UCHAR string_pXps[] = {
CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_PCc[] = {
CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_PL[] = {
CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_PLl[] = {
CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_PLu[] = {
CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_PXan[] = {
CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const PCRE2_UCHAR string_H[] = {
CHAR_BACKSLASH, CHAR_H, '\0' };
static const PCRE2_UCHAR string_PXps[] = {
CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static PCRE2_SPTR posix_substitutes[] = {
string_pL, /* alpha */
string_pLl, /* lower */
string_pLu, /* upper */
string_pXan, /* alnum */
NULL, /* ascii */
string_h, /* blank */
string_pCc, /* cntrl */
string_pNd, /* digit */
NULL, /* graph */
NULL, /* print */
NULL, /* punct */
string_pXps, /* space */ /* Xps is POSIX space, but from 8.34 */
string_pXwd, /* word */ /* Perl and POSIX space are the same */
NULL, /* xdigit */
/* Negated cases */
string_PL, /* ^alpha */
string_PLl, /* ^lower */
string_PLu, /* ^upper */
string_PXan, /* ^alnum */
NULL, /* ^ascii */
string_H, /* ^blank */
string_PCc, /* ^cntrl */
string_PNd, /* ^digit */
NULL, /* ^graph */
NULL, /* ^print */
NULL, /* ^punct */
string_PXps, /* ^space */ /* Xps is POSIX space, but from 8.34 */
string_PXwd, /* ^word */ /* Perl and POSIX space are the same */
NULL /* ^xdigit */
};
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(PCRE2_UCHAR *))
#endif /* SUPPORT_UNICODE */
/* Masks for checking option settings. */
#define PUBLIC_COMPILE_OPTIONS \
(PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \
PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE| \
PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \
PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \
PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \
PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_USE_OFFSET_LIMIT| \
PCRE2_UTF)
/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
eint2 in pcre2posix.c may need to be updated, and a new error text must be
added to compile_error_texts in pcre2_error.c. */
enum { ERR0 = COMPILE_ERROR_BASE,
ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88 };
/* Error codes that correspond to negative error codes returned by
find_fixedlength(). */
/* The entries line up with the FFL_xxx return codes of find_fixedlength():
entry n gives the compile error for return value -n, so the table is
presumably indexed by the negated return value (confirm at the point of use). */
static int fixed_length_errors[] =
{
ERR0, /* 0: Not an error */
ERR0, /* -1 FFL_LATER: Not an error; -1 is used for "process later" */
ERR25, /* -2 FFL_NOTFIXED: Lookbehind is not fixed length */
ERR36, /* -3 FFL_BACKSLASHC: \C in lookbehind is not allowed */
ERR87, /* -4 FFL_TOOLONG: Lookbehind is too long */
ERR86, /* -5 FFL_TOOCOMPLICATED: Pattern too complicated */
ERR70 /* -6 FFL_UNKNOWNOP: Internal error: unknown opcode encountered */
};
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
generic and always supported. */
/* The kinds of start-of-pattern item; each one determines how the value field
of a pso entry is interpreted. */
enum { PSO_OPT, /* Value is an option bit */
PSO_FLG, /* Value is a flag bit */
PSO_NL, /* Value is a newline type */
PSO_BSR, /* Value is a \R type */
PSO_LIMM, /* Read integer value for match limit */
PSO_LIMR }; /* Read integer value for recursion limit */
/* One start-of-pattern item. */
typedef struct pso {
const uint8_t *name; /* Item text; judging by the macro names used in
                        pso_list it ends with ')' or '=' */
uint16_t length; /* Length of the item text */
uint16_t type; /* One of the PSO_xxx values above */
uint32_t value; /* Meaning depends on type; 0 for the limit items, whose
                   number is read from the pattern */
} pso;
/* NB: STRING_UTFn_RIGHTPAR contains the length as well */
static pso pso_list[] = {
{ (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF },
{ (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF },
{ (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP },
{ (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET },
{ (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET },
{ (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
{ (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
{ (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },
{ (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
{ (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },
{ (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMR, 0 },
{ (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR },
{ (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF },
{ (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF },
{ (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY },
{ (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF },
{ (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF },
{ (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE }
};
/* This table is used when converting repeating opcodes into possessified
versions as a result of an explicit possessive quantifier such as ++. A zero
value means there is no possessified version - in those cases the item in
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
because all relevant opcodes are less than that. */
static const uint8_t opcode_possessify[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
0, /* NOTI */
OP_POSSTAR, 0, /* STAR, MINSTAR */
OP_POSPLUS, 0, /* PLUS, MINPLUS */
OP_POSQUERY, 0, /* QUERY, MINQUERY */
OP_POSUPTO, 0, /* UPTO, MINUPTO */
0, /* EXACT */
0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
OP_POSSTARI, 0, /* STARI, MINSTARI */
OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
0, /* EXACTI */
0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
0, /* NOTEXACT */
0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
0, /* NOTEXACTI */
0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
0, /* TYPEEXACT */
0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
0, 0, 0, /* CLASS, NCLASS, XCLASS */
0, 0, /* REF, REFI */
0, 0, /* DNREF, DNREFI */
0, 0 /* RECURSE, CALLOUT */
};
/*************************************************
* Free compiled code *
*************************************************/
/* Release a compiled pattern and everything that hangs off it. A NULL argument
is accepted and ignored.

Argument:
  code        the compiled pattern to free, or NULL
Returns:      nothing
*/
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code *code)
{
if (code == NULL) return;

/* JIT-compiled code, if any, is released first. */

if (code->executable_jit != NULL)
  PRIV(jit_free)(code->executable_jit, &code->memctl);

/* When PCRE2_DEREF_TABLES is set, the character tables were created by
deserialization and are shared between codes. A reference count is stored
immediately after them, at offset tables_length; it is expected to be greater
than zero on entry. The tables are freed by whichever code brings the count
down to zero. */

if ((code->flags & PCRE2_DEREF_TABLES) != 0)
  {
  PCRE2_SIZE *users = (PCRE2_SIZE *)(code->tables + tables_length);
  if (*users > 0 && --(*users) == 0)
    code->memctl.free((void *)code->tables, code->memctl.memory_data);
  }

/* Finally the code block itself, using its own memory-control functions. */

code->memctl.free(code, code->memctl.memory_data);
}
/*************************************************
* Insert an automatic callout point *
*************************************************/
/* This function is called when the PCRE2_AUTO_CALLOUT option is set, to insert
callout points before each pattern item.
Arguments:
code current code pointer
ptr current pattern pointer
cb general compile-time data
Returns: new code pointer
*/
/* Write an OP_CALLOUT item at the current code position. The item records the
offset of ptr from the start of the pattern and a length of zero, which is
filled in later once the following pattern item has been read. */
static PCRE2_UCHAR *
auto_callout(PCRE2_UCHAR *code, PCRE2_SPTR ptr, compile_block *cb)
{
/* The item has a fixed size, so the return value can be worked out up front. */
PCRE2_UCHAR *after_item = code + PRIV(OP_lengths)[OP_CALLOUT];

*code = OP_CALLOUT;
/* NOTE(review): 255 is presumably the callout number reserved for automatic
callouts - confirm against the OP_CALLOUT item layout. */
code[1 + 2*LINK_SIZE] = 255;
PUT(code, 1 + LINK_SIZE, 0);             /* Length, not yet known */
PUT(code, 1, ptr - cb->start_pattern);   /* Offset within the pattern */

return after_item;
}
/*************************************************
* Complete a callout item *
*************************************************/
/* A callout item contains the length of the next item in the pattern, which
we can't fill in till after we have reached the relevant point. This is used
for both automatic and manual callouts.
Arguments:
previous_callout points to previous callout item
ptr current pattern pointer
cb general compile-time data
Returns: nothing
*/
/* Fill in the length field of a callout item: the distance from the pattern
offset stored in the item to the current pattern position. */
static void
complete_callout(PCRE2_UCHAR *previous_callout, PCRE2_SPTR ptr,
  compile_block *cb)
{
size_t current_offset = ptr - cb->start_pattern;
size_t item_offset = GET(previous_callout, 1);   /* Stored when item was made */

PUT(previous_callout, 1 + LINK_SIZE, current_offset - item_offset);
}
/*************************************************
* Find the fixed length of a branch *
*************************************************/
/* Scan a branch and compute the fixed length of subject that will match it, if
the length is fixed. This is needed for dealing with lookbehind assertions. In
UTF mode, the result is in code units rather than bytes. The branch is
temporarily terminated with OP_END when this function is called.
This function is called when a lookbehind assertion is encountered, so that if
it fails, the error message can point to the correct place in the pattern.
However, we cannot do this when the assertion contains subroutine calls,
because they can be forward references. We solve this by remembering this case
and doing the check at the end; a flag specifies which mode we are running in.
Lookbehind lengths are held in 16-bit fields and the maximum value is defined
as LOOKBEHIND_MAX.
Arguments:
code points to the start of the pattern (the bracket)
utf TRUE in UTF mode
atend TRUE if called when the pattern is complete
cb the "compile data" structure
recurses chain of recurse_check to catch mutual recursion
countptr pointer to counter, to catch over-complexity
Returns: if non-negative, the fixed length,
or -1 if an OP_RECURSE item was encountered and atend is FALSE
or -2 if there is no fixed length,
or -3 if \C was encountered (in UTF-8 mode only)
or -4 if the length is too long
or -5 if the regex is too complicated
or -6 if an unknown opcode was encountered (internal error)
*/
/* Negative return values of find_fixedlength(). */
#define FFL_LATER (-1) /* OP_RECURSE seen while atend is FALSE */
#define FFL_NOTFIXED (-2) /* The length is not fixed */
#define FFL_BACKSLASHC (-3) /* \C was encountered */
#define FFL_TOOLONG (-4) /* Length exceeds LOOKBEHIND_MAX */
#define FFL_TOOCOMPLICATED (-5) /* Call counter went over its limit */
#define FFL_UNKNOWNOP (-6) /* Unknown opcode (internal error) */
static int
find_fixedlength(PCRE2_UCHAR *code, BOOL utf, BOOL atend, compile_block *cb,
recurse_check *recurses, int *countptr)
{
int length = -1;
uint32_t group = 0;
uint32_t groupinfo = 0;
recurse_check this_recurse;
register int branchlength = 0;
register PCRE2_UCHAR *cc = code + 1 + LINK_SIZE;
/* If this is a capturing group, we may have the answer cached, but we can only
use this information if there are no (?| groups in the pattern, because
otherwise group numbers are not unique. */
if (*code == OP_CBRA || *code == OP_CBRAPOS || *code == OP_SCBRA ||
*code == OP_SCBRAPOS)
{
group = GET2(cc, 0);
cc += IMM2_SIZE;
groupinfo = cb->groupinfo[group];
if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0)
{
if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return FFL_NOTFIXED;
if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
return groupinfo & GI_FIXED_LENGTH_MASK;
}
}
/* A large and/or complex regex can take too long to process. This can happen
more often when (?| groups are present in the pattern. */
if ((*countptr)++ > 2000) return FFL_TOOCOMPLICATED;
/* Scan along the opcodes for this branch. If we get to the end of the
branch, check the length against that of the other branches. */
for (;;)
{
int d;
PCRE2_UCHAR *ce, *cs;
register PCRE2_UCHAR op = *cc;
if (branchlength > LOOKBEHIND_MAX) return FFL_TOOLONG;
switch (op)
{
/* We only need to continue for OP_CBRA (normal capturing bracket) and
OP_BRA (normal non-capturing bracket) because the other variants of these
opcodes are all concerned with unlimited repeated groups, which of course
are not of fixed length. */
case OP_CBRA:
case OP_BRA:
case OP_ONCE:
case OP_ONCE_NC:
case OP_COND:
d = find_fixedlength(cc, utf, atend, cb, recurses, countptr);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
cc += 1 + LINK_SIZE;
break;
/* Reached end of a branch; if it's a ket it is the end of a nested call.
If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
an ALT. If it is END it's the end of the outer call. All can be handled by
the same code. Note that we must not include the OP_KETRxxx opcodes here,
because they all imply an unlimited repeat. */
case OP_ALT:
case OP_KET:
case OP_END:
case OP_ACCEPT:
case OP_ASSERT_ACCEPT:
if (length < 0) length = branchlength;
else if (length != branchlength) goto ISNOTFIXED;
if (*cc != OP_ALT)
{
if (group > 0)
{
groupinfo |= (GI_SET_FIXED_LENGTH | length);
cb->groupinfo[group] = groupinfo;
}
return length;
}
cc += 1 + LINK_SIZE;
branchlength = 0;
break;
/* A true recursion implies not fixed length, but a subroutine call may
be OK. If the subroutine is a forward reference, we can't deal with
it until the end of the pattern, so return FFL_LATER. */
case OP_RECURSE:
if (!atend) return FFL_LATER;
cs = ce = (PCRE2_UCHAR *)cb->start_code + GET(cc, 1); /* Start subpattern */
do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
if (cc > cs && cc < ce) goto ISNOTFIXED; /* Recursion */
else /* Check for mutual recursion */
{
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
}
this_recurse.prev = recurses;
this_recurse.group = cs;
d = find_fixedlength(cs, utf, atend, cb, &this_recurse, countptr);
if (d < 0) return d;
branchlength += d;
cc += 1 + LINK_SIZE;
break;
/* Skip over assertive subpatterns. Note that we must increment cc by
1 + LINK_SIZE at the end, not by OP_length[*cc] because in a recursive
situation this assertion may be the one that is ultimately being checked
for having a fixed length, in which case its terminating OP_KET will have
been temporarily replaced by OP_END. */
case OP_ASSERT:
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
do cc += GET(cc, 1); while (*cc == OP_ALT);
cc += 1 + LINK_SIZE;
break;
/* Skip over things that don't match chars */
case OP_MARK:
case OP_PRUNE_ARG:
case OP_SKIP_ARG:
case OP_THEN_ARG:
cc += cc[1] + PRIV(OP_lengths)[*cc];
break;
case OP_CALLOUT:
case OP_CIRC:
case OP_CIRCM:
case OP_CLOSE:
case OP_COMMIT:
case OP_CREF:
case OP_FALSE:
case OP_TRUE:
case OP_DNCREF:
case OP_DNRREF:
case OP_DOLL:
case OP_DOLLM:
case OP_EOD:
case OP_EODN:
case OP_FAIL:
case OP_NOT_WORD_BOUNDARY:
case OP_PRUNE:
case OP_REVERSE:
case OP_RREF:
case OP_SET_SOM:
case OP_SKIP:
case OP_SOD:
case OP_SOM:
case OP_THEN:
case OP_WORD_BOUNDARY:
cc += PRIV(OP_lengths)[*cc];
break;
case OP_CALLOUT_STR:
cc += GET(cc, 1 + 2*LINK_SIZE);
break;
/* Handle literal characters */
case OP_CHAR:
case OP_CHARI:
case OP_NOT:
case OP_NOTI:
branchlength++;
cc += 2;
#ifdef SUPPORT_UNICODE
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
/* Handle exact repetitions. The count is already in characters, but we
need to skip over a multibyte character in UTF8 mode. */
case OP_EXACT:
case OP_EXACTI:
case OP_NOTEXACT:
case OP_NOTEXACTI:
branchlength += (int)GET2(cc,1);
cc += 2 + IMM2_SIZE;
#ifdef SUPPORT_UNICODE
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
case OP_TYPEEXACT:
branchlength += GET2(cc,1);
if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
cc += 2;
cc += 1 + IMM2_SIZE + 1;
break;
/* Handle single-char matchers */
case OP_PROP:
case OP_NOTPROP:
cc += 2;
/* Fall through */
case OP_HSPACE:
case OP_VSPACE:
case OP_NOT_HSPACE:
case OP_NOT_VSPACE:
case OP_NOT_DIGIT:
case OP_DIGIT:
case OP_NOT_WHITESPACE:
case OP_WHITESPACE:
case OP_NOT_WORDCHAR:
case OP_WORDCHAR:
case OP_ANY:
case OP_ALLANY:
branchlength++;
cc++;
break;
/* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
otherwise \C is coded as OP_ALLANY. */
case OP_ANYBYTE:
return FFL_BACKSLASHC;
/* Check a class for variable quantification */
case OP_CLASS:
case OP_NCLASS:
#ifdef SUPPORT_WIDE_CHARS
case OP_XCLASS:
/* The original code caused an unsigned overflow in 64 bit systems,
so now we use a conditional statement. */
if (op == OP_XCLASS)
cc += GET(cc, 1);
else
cc += PRIV(OP_lengths)[OP_CLASS];
#else
cc += PRIV(OP_lengths)[OP_CLASS];
#endif
switch (*cc)
{
case OP_CRSTAR:
case OP_CRMINSTAR:
case OP_CRPLUS:
case OP_CRMINPLUS:
case OP_CRQUERY:
case OP_CRMINQUERY:
case OP_CRPOSSTAR:
case OP_CRPOSPLUS:
case OP_CRPOSQUERY:
goto ISNOTFIXED;
case OP_CRRANGE:
case OP_CRMINRANGE:
case OP_CRPOSRANGE:
if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) goto ISNOTFIXED;
branchlength += (int)GET2(cc,1);
cc += 1 + 2 * IMM2_SIZE;
break;
default:
branchlength++;
}
break;
/* Anything else is variable length */
case OP_ANYNL:
case OP_BRAMINZERO:
case OP_BRAPOS:
case OP_BRAPOSZERO:
case OP_BRAZERO:
case OP_CBRAPOS:
case OP_EXTUNI:
case OP_KETRMAX:
case OP_KETRMIN:
case OP_KETRPOS:
case OP_MINPLUS:
case OP_MINPLUSI:
case OP_MINQUERY:
case OP_MINQUERYI:
case OP_MINSTAR:
case OP_MINSTARI:
case OP_MINUPTO:
case OP_MINUPTOI:
case OP_NOTMINPLUS:
case OP_NOTMINPLUSI:
case OP_NOTMINQUERY:
case OP_NOTMINQUERYI:
case OP_NOTMINSTAR:
case OP_NOTMINSTARI:
case OP_NOTMINUPTO:
case OP_NOTMINUPTOI:
case OP_NOTPLUS:
case OP_NOTPLUSI:
case OP_NOTPOSPLUS:
case OP_NOTPOSPLUSI:
case OP_NOTPOSQUERY:
case OP_NOTPOSQUERYI:
case OP_NOTPOSSTAR:
case OP_NOTPOSSTARI:
case OP_NOTPOSUPTO:
case OP_NOTPOSUPTOI:
case OP_NOTQUERY:
case OP_NOTQUERYI:
case OP_NOTSTAR:
case OP_NOTSTARI:
case OP_NOTUPTO:
case OP_NOTUPTOI:
case OP_PLUS:
case OP_PLUSI:
case OP_POSPLUS:
case OP_POSPLUSI:
case OP_POSQUERY:
case OP_POSQUERYI:
case OP_POSSTAR:
case OP_POSSTARI:
case OP_POSUPTO:
case OP_POSUPTOI:
case OP_QUERY:
case OP_QUERYI:
case OP_REF:
case OP_REFI:
case OP_DNREF:
case OP_DNREFI:
case OP_SBRA:
case OP_SBRAPOS:
case OP_SCBRA:
case OP_SCBRAPOS:
case OP_SCOND:
case OP_SKIPZERO:
case OP_STAR:
case OP_STARI:
case OP_TYPEMINPLUS:
case OP_TYPEMINQUERY:
case OP_TYPEMINSTAR:
case OP_TYPEMINUPTO:
case OP_TYPEPLUS:
case OP_TYPEPOSPLUS:
case OP_TYPEPOSQUERY:
case OP_TYPEPOSSTAR:
case OP_TYPEPOSUPTO:
case OP_TYPEQUERY:
case OP_TYPESTAR:
case OP_TYPEUPTO:
case OP_UPTO:
case OP_UPTOI:
goto ISNOTFIXED;
/* Catch unrecognized opcodes so that when new ones are added they
are not forgotten, as has happened in the past. */
default:
return FFL_UNKNOWNOP;
}
}
/* Control never gets here except by goto. */
ISNOTFIXED:
if (group > 0)
{
groupinfo |= GI_NOT_FIXED_LENGTH;
cb->groupinfo[group] = groupinfo;
}
return FFL_NOTFIXED;
}
/*************************************************
* Find first significant op code *
*************************************************/
/* This is called by several functions that scan a compiled expression looking
for a fixed first character, or an anchoring op code etc. It skips over things
that do not influence this. For some calls, it makes sense to skip negative
forward and all backward assertions, and also the \b assertion; for others it
does not.
Arguments:
code pointer to the start of the group
skipassert TRUE if certain assertions are to be skipped
Returns: pointer to the first significant opcode
*/
static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code, BOOL skipassert)
{
for (;;)
  {
  const int op = (int)*code;

  /* A callout with a string argument holds the amount to skip inside the
  item itself, at offset 1 + 2*LINK_SIZE. */

  if (op == OP_CALLOUT_STR)
    {
    code += GET(code, 1 + 2*LINK_SIZE);
    continue;
    }

  switch (op)
    {
    /* Negative forward and all backward assertions are skipped only on
    request. Follow the chain of alternatives to the closing item, then step
    over that item using the opcode length table. */

    case OP_ASSERT_NOT:
    case OP_ASSERTBACK:
    case OP_ASSERTBACK_NOT:
    if (!skipassert) return code;
    do code += GET(code, 1); while (*code == OP_ALT);
    code += PRIV(OP_lengths)[*code];
    continue;

    /* \b and \B are likewise skipped only on request; when skipped they are
    stepped over by the common code after the switch. */

    case OP_WORD_BOUNDARY:
    case OP_NOT_WORD_BOUNDARY:
    if (!skipassert) return code;
    break;

    /* These are always skipped. */

    case OP_CALLOUT:
    case OP_CREF:
    case OP_DNCREF:
    case OP_RREF:
    case OP_DNRREF:
    case OP_FALSE:
    case OP_TRUE:
    break;

    /* Anything else is significant. */

    default:
    return code;
    }

  /* Step over an item whose length comes from the opcode length table. */

  code += PRIV(OP_lengths)[*code];
  }
/* Control never reaches here */
}
/*************************************************
* Scan compiled branch for non-emptiness *
*************************************************/
/* This function scans through a branch of a compiled pattern to see whether it
can match the empty string. It is called at the end of compiling to check the
entire pattern, and from compile_branch() when checking for an unlimited repeat
of a group that can match nothing. In the latter case it is called only when
doing the real compile, not during the pre-compile that measures the size of
the compiled pattern.
Note that first_significant_code() skips over backward and negative forward
assertions when its final argument is TRUE. If we hit an unclosed bracket, we
return "empty" - this means we've struck an inner bracket whose current branch
will already have been scanned.
Arguments:
code points to start of search
endcode points to where to stop
utf TRUE if in UTF mode
cb compile data
atend TRUE if being called to check an entire pattern
recurses chain of recurse_check to catch mutual recursion
countptr pointer to count to catch over-complicated pattern
Returns: 0 if what is matched cannot be empty
1 if what is matched could be empty
-1 if the pattern is too complicated
*/
/* Result codes returned by could_be_empty_branch(). */
#define CBE_NOTEMPTY 0
#define CBE_EMPTY 1
#define CBE_TOOCOMPLICATED (-1)
static int
could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf,
compile_block *cb, BOOL atend, recurse_check *recurses, int *countptr)
{
/* group and groupinfo remain zero unless "code" is a capturing bracket whose
cached information may be used; they control whether the result is stored in
cb->groupinfo[] on exit. c always holds the opcode at the current position. */
uint32_t group = 0;
uint32_t groupinfo = 0;
register PCRE2_UCHAR c;
recurse_check this_recurse;
/* If what we are checking has already been set as "could be empty", we know
the answer. */
if (*code >= OP_SBRA && *code <= OP_SCOND) return CBE_EMPTY;
/* If this is a capturing group, we may have the answer cached, but we can only
use this information if there are no (?| groups in the pattern, because
otherwise group numbers are not unique. */
if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0 &&
(*code == OP_CBRA || *code == OP_CBRAPOS))
{
group = GET2(code, 1 + LINK_SIZE);
groupinfo = cb->groupinfo[group];
if ((groupinfo & GI_SET_COULD_BE_EMPTY) != 0)
return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
}
/* A large and/or complex regex can take too long to process. We have to assume
it can match an empty string. This can happen more often when (?| groups are
present in the pattern and the caching is disabled. Setting the cap at 1100
allows the test for more than 1023 capturing patterns to work. */
if ((*countptr)++ > 1100) return CBE_TOOCOMPLICATED;
/* Scan the opcodes for this branch. The loop increment advances by the table
length of the opcode held in c, not by re-reading *code, so every path that
moves "code" and then reaches the increment (by "continue" or by falling out
of the switch) must leave c and "code" describing the item that is to be
stepped over. */
for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
code < endcode;
code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
{
PCRE2_SPTR ccode;
c = *code;
/* Skip over forward assertions; the other assertions are skipped by
first_significant_code() with a TRUE final argument. */
if (c == OP_ASSERT)
{
do code += GET(code, 1); while (*code == OP_ALT);
c = *code;
continue;
}
/* For a recursion/subroutine call we can scan the recursion when this
function is called at the end, to check a complete pattern. Before then,
recursions just have the group number as their argument and in any case may
be forward references. In that situation, we return CBE_EMPTY, just in case.
It means that unlimited repeats of groups that contain recursions are always
treated as "could be empty" - which just adds a bit more processing time
because of the runtime check. */
if (c == OP_RECURSE)
{
PCRE2_SPTR scode, endgroup;
BOOL empty_branch;
if (!atend) goto ISTRUE;
scode = cb->start_code + GET(code, 1);
endgroup = scode;
/* We need to detect whether this is a recursive call, as otherwise there
will be an infinite loop. If it is a recursion, just skip over it. Simple
recursions are easily detected. For mutual recursions we keep a chain on
the stack. */
do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
if (code >= scode && code <= endgroup) continue; /* Simple recursion */
else
{
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev)
if (r->group == scode) break;
if (r != NULL) continue; /* Mutual recursion */
}
/* Scan the referenced group, remembering it on the stack chain to detect
mutual recursions. The scan stops at the first branch that could be empty; a
negative (too complicated) result is passed straight up. */
empty_branch = FALSE;
this_recurse.prev = recurses;
this_recurse.group = scode;
do
{
int rc = could_be_empty_branch(scode, endcode, utf, cb, atend,
&this_recurse, countptr);
if (rc < 0) return rc;
if (rc > 0)
{
empty_branch = TRUE;
break;
}
scode += GET(scode, 1);
}
while (*scode == OP_ALT);
if (!empty_branch) goto ISFALSE; /* All branches are non-empty */
continue;
}
/* Groups with zero repeats can of course be empty; skip them. */
if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
c == OP_BRAPOSZERO)
{
code += PRIV(OP_lengths)[c];
do code += GET(code, 1); while (*code == OP_ALT);
c = *code;
continue;
}
/* A nested group that is already marked as "could be empty" can just be
skipped. */
if (c == OP_SBRA || c == OP_SBRAPOS ||
c == OP_SCBRA || c == OP_SCBRAPOS)
{
do code += GET(code, 1); while (*code == OP_ALT);
c = *code;
continue;
}
/* For other groups, scan the branches. */
if (c == OP_BRA || c == OP_BRAPOS ||
c == OP_CBRA || c == OP_CBRAPOS ||
c == OP_ONCE || c == OP_ONCE_NC ||
c == OP_COND || c == OP_SCOND)
{
BOOL empty_branch;
if (GET(code, 1) == 0) goto ISTRUE; /* Hit unclosed bracket */
/* If a conditional group has only one branch, there is a second, implied,
empty branch, so just skip over the conditional, because it could be empty.
Otherwise, scan the individual branches of the group. */
if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
code += GET(code, 1);
else
{
/* Once one branch is known to be possibly empty the remaining branches are
no longer scanned, but "code" is still advanced over each of them so that it
ends up at the item that closes the group. */
empty_branch = FALSE;
do
{
if (!empty_branch)
{
int rc = could_be_empty_branch(code, endcode, utf, cb, atend,
recurses, countptr);
if (rc < 0) return rc;
if (rc > 0) empty_branch = TRUE;
}
code += GET(code, 1);
}
while (*code == OP_ALT);
if (!empty_branch) goto ISFALSE; /* All branches are non-empty */
}
c = *code;
continue;
}
/* Handle the other opcodes */
switch (c)
{
/* Check for quantifiers after a class. XCLASS is used for classes that
cannot be represented just by a bit map. This includes negated single
high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
actual length is stored in the compiled code, so we must update "code"
here. */
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
case OP_XCLASS:
ccode = code += GET(code, 1);
goto CHECK_CLASS_REPEAT;
#endif
case OP_CLASS:
case OP_NCLASS:
ccode = code + PRIV(OP_lengths)[OP_CLASS];
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
CHECK_CLASS_REPEAT:
#endif
/* ccode points at whatever follows the class; only a repeat that allows
zero occurrences lets the scan carry on. */
switch (*ccode)
{
case OP_CRSTAR: /* These could be empty; continue */
case OP_CRMINSTAR:
case OP_CRQUERY:
case OP_CRMINQUERY:
case OP_CRPOSSTAR:
case OP_CRPOSQUERY:
break;
default: /* Non-repeat => class must match */
case OP_CRPLUS: /* These repeats aren't empty */
case OP_CRMINPLUS:
case OP_CRPOSPLUS:
goto ISFALSE;
case OP_CRRANGE:
case OP_CRMINRANGE:
case OP_CRPOSRANGE:
if (GET2(ccode, 1) > 0) goto ISFALSE; /* Minimum > 0 */
break;
}
break;
/* Opcodes that must match a character */
case OP_ANY:
case OP_ALLANY:
case OP_ANYBYTE:
case OP_PROP:
case OP_NOTPROP:
case OP_ANYNL:
case OP_NOT_HSPACE:
case OP_HSPACE:
case OP_NOT_VSPACE:
case OP_VSPACE:
case OP_EXTUNI:
case OP_NOT_DIGIT:
case OP_DIGIT:
case OP_NOT_WHITESPACE:
case OP_WHITESPACE:
case OP_NOT_WORDCHAR:
case OP_WORDCHAR:
case OP_CHAR:
case OP_CHARI:
case OP_NOT:
case OP_NOTI:
case OP_PLUS:
case OP_PLUSI:
case OP_MINPLUS:
case OP_MINPLUSI:
case OP_NOTPLUS:
case OP_NOTPLUSI:
case OP_NOTMINPLUS:
case OP_NOTMINPLUSI:
case OP_POSPLUS:
case OP_POSPLUSI:
case OP_NOTPOSPLUS:
case OP_NOTPOSPLUSI:
case OP_EXACT:
case OP_EXACTI:
case OP_NOTEXACT:
case OP_NOTEXACTI:
case OP_TYPEPLUS:
case OP_TYPEMINPLUS:
case OP_TYPEPOSPLUS:
case OP_TYPEEXACT:
goto ISFALSE;
/* These are going to continue, as they may be empty, but we have to
fudge the length for the \p and \P cases. */
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
case OP_TYPEPOSSTAR:
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
case OP_TYPEPOSQUERY:
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
break;
/* Same for these */
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEPOSUPTO:
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
code += 2;
break;
/* End of branch */
case OP_KET:
case OP_KETRMAX:
case OP_KETRMIN:
case OP_KETRPOS:
case OP_ALT:
goto ISTRUE;
/* In UTF-8 or UTF-16 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY,
POSQUERY, UPTO, MINUPTO, and POSUPTO and their caseless and negative
versions may be followed by a multibyte character. */
#ifdef MAYBE_UTF_MULTI
case OP_STAR:
case OP_STARI:
case OP_NOTSTAR:
case OP_NOTSTARI:
case OP_MINSTAR:
case OP_MINSTARI:
case OP_NOTMINSTAR:
case OP_NOTMINSTARI:
case OP_POSSTAR:
case OP_POSSTARI:
case OP_NOTPOSSTAR:
case OP_NOTPOSSTARI:
case OP_QUERY:
case OP_QUERYI:
case OP_NOTQUERY:
case OP_NOTQUERYI:
case OP_MINQUERY:
case OP_MINQUERYI:
case OP_NOTMINQUERY:
case OP_NOTMINQUERYI:
case OP_POSQUERY:
case OP_POSQUERYI:
case OP_NOTPOSQUERY:
case OP_NOTPOSQUERYI:
if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
break;
case OP_UPTO:
case OP_UPTOI:
case OP_NOTUPTO:
case OP_NOTUPTOI:
case OP_MINUPTO:
case OP_MINUPTOI:
case OP_NOTMINUPTO:
case OP_NOTMINUPTOI:
case OP_POSUPTO:
case OP_POSUPTOI:
case OP_NOTPOSUPTO:
case OP_NOTPOSUPTOI:
if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
break;
#endif /* MAYBE_UTF_MULTI */
/* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
string. */
case OP_MARK:
case OP_PRUNE_ARG:
case OP_SKIP_ARG:
case OP_THEN_ARG:
code += code[1];
break;
/* None of the remaining opcodes are required to match a character. */
default:
break;
}
}
/* ISTRUE is reached by goto or by running off the end of the scan, and falls
through into ISFALSE. The GI_COULD_BE_EMPTY bit in the local groupinfo word
carries the answer; it is written back to cb->groupinfo[] only when a group
number was loaded at the top of the function. */
ISTRUE:
groupinfo |= GI_COULD_BE_EMPTY;
ISFALSE:
if (group > 0) cb->groupinfo[group] = groupinfo | GI_SET_COULD_BE_EMPTY;
return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
}
/*************************************************
* Check for counted repeat *
*************************************************/
/* This function is called when a '{' is encountered in a place where it might
start a quantifier. It looks ahead to see if it really is a quantifier, that
is, one of the forms {ddd} {ddd,} or {ddd,ddd} where the ddds are digits.
Argument: pointer to the first char after '{'
Returns: TRUE or FALSE
*/
static BOOL
is_counted_repeat(PCRE2_SPTR p)
{
/* The minimum count is mandatory: one digit, then any further digits. */

if (!IS_DIGIT(*p)) return FALSE;
do p++; while (IS_DIGIT(*p));

/* {ddd} */

if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;

/* Only a comma may follow the minimum otherwise. */

if (*p != CHAR_COMMA) return FALSE;
p++;

/* {ddd,} */

if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;

/* {ddd,ddd}: a second run of digits, which must be closed by '}'. */

if (!IS_DIGIT(*p)) return FALSE;
do p++; while (IS_DIGIT(*p));

return (*p == CHAR_RIGHT_CURLY_BRACKET);
}
/*************************************************
* Handle escapes *
*************************************************/
/* This function is called when a \ has been encountered. It either returns a
positive value for a simple escape such as \d, or 0 for a data character, which
is placed in chptr. A backreference to group n is returned as negative n. On
entry, ptr is pointing at the \. On exit, it points to the final code unit of the
escape sequence.
This function is also called from pcre2_substitute() to handle escape sequences
in replacement strings. In this case, the cb argument is NULL, and only
sequences that define a data character are recognised. The isclass argument is
not relevant, but the options argument is the final value of the compiled
pattern's options.
There is one "trick" case: when a sequence such as [[:>:]] or \s in UCP mode is
processed, it is replaced by a nested alternative sequence. If this contains a
backslash (which it usually does), ptrend does not point to its end - it still
points to the end of the whole pattern. However, we can detect this case
because cb->nestptr[0] will be non-NULL. The nested sequences are all zero-
terminated and there are only ever two levels of nesting.
Arguments:
ptrptr points to the input position pointer
ptrend points to the end of the input
chptr points to a returned data character
errorcodeptr points to the errorcode variable (containing zero)
options the current options bits
isclass TRUE if inside a character class
cb compile data block
Returns: zero => a data character
positive => a special escape sequence
negative => a back reference
on error, errorcodeptr is set non-zero
*/
int
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb)
{
BOOL utf = (options & PCRE2_UTF) != 0;
PCRE2_SPTR ptr = *ptrptr + 1;
register uint32_t c, cc;
int escape = 0;
int i;
/* Find the end of a nested insert. */
if (cb != NULL && cb->nestptr[0] != NULL)
ptrend = ptr + PRIV(strlen)(ptr);
/* If backslash is at the end of the string, it's an error. */
if (ptr >= ptrend)
{
*errorcodeptr = ERR1;
return 0;
}
GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
ptr--; /* Set pointer back to the last code unit */
/* Non-alphanumerics are literals, so we just leave the value in c. An initial
value test saves a memory lookup for code points outside the alphanumeric
range. Otherwise, do a table lookup. A non-zero result is something that can be
returned immediately. Otherwise further processing is required. */
if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
{
if (i > 0) c = (uint32_t)i; else /* Positive is a data character */
{
escape = -i; /* Else return a special escape */
if (escape == ESC_P || escape == ESC_p || escape == ESC_X)
cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
}
}
/* Escapes that need further processing, including those that are unknown.
When called from pcre2_substitute(), only \c, \o, and \x are recognized (and \u
when BSUX is set). */
else
{
PCRE2_SPTR oldptr;
BOOL braced, negated, overflow;
unsigned int s;
/* Filter calls from pcre2_substitute(). */
if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x &&
(c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0))
{
*errorcodeptr = ERR3;
return 0;
}
switch (c)
{
/* A number of Perl escapes are not handled by PCRE. We give an explicit
error. */
case CHAR_l:
case CHAR_L:
*errorcodeptr = ERR37;
break;
/* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated
specially, \u must be followed by four hex digits. Otherwise it is a
lowercase u letter. */
case CHAR_u:
if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else
{
uint32_t xc;
if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
cc = (cc << 4) | xc;
if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
cc = (cc << 4) | xc;
if ((xc = XDIGIT(ptr[4])) == 0xff) break; /* Not a hex digit */
c = (cc << 4) | xc;
ptr += 4;
if (utf)
{
if (c > 0x10ffffU) *errorcodeptr = ERR77;
else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
}
else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
}
break;
case CHAR_U:
/* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an
upper case letter. */
if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37;
break;
/* In a character class, \g is just a literal "g". Outside a character
class, \g must be followed by one of a number of specific things:
(1) A number, either plain or braced. If positive, it is an absolute
backreference. If negative, it is a relative backreference. This is a Perl
5.10 feature.
(2) Perl 5.10 also supports \g{name} as a reference to a named group. This
is part of Perl's movement towards a unified syntax for back references. As
this is synonymous with \k{name}, we fudge it up by pretending it really
was \k.
(3) For Oniguruma compatibility we also support \g followed by a name or a
number either in angle brackets or in single quotes. However, these are
(possibly recursive) subroutine calls, _not_ backreferences. Just return
the ESC_g code (cf \k). */
case CHAR_g:
if (isclass) break;
if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
{
escape = ESC_g;
break;
}
/* Handle the Perl-compatible cases */
if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
{
PCRE2_SPTR p;
for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
{
escape = ESC_k;
break;
}
braced = TRUE;
ptr++;
}
else braced = FALSE;
if (ptr[1] == CHAR_MINUS)
{
negated = TRUE;
ptr++;
}
else negated = FALSE;
/* The integer range is limited by the machine's int representation. */
s = 0;
overflow = FALSE;
while (IS_DIGIT(ptr[1]))
{
if (s > INT_MAX / 10 - 1) /* Integer overflow */
{
overflow = TRUE;
break;
}
s = s * 10 + (int)(*(++ptr) - CHAR_0);
}
if (overflow) /* Integer overflow */
{
while (IS_DIGIT(ptr[1])) ptr++;
*errorcodeptr = ERR61;
break;
}
if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
{
*errorcodeptr = ERR57;
break;
}
if (s == 0)
{
*errorcodeptr = ERR58;
break;
}
if (negated)
{
if (s > cb->bracount)
{
*errorcodeptr = ERR15;
break;
}
s = cb->bracount - (s - 1);
}
escape = -(int)s;
break;
/* The handling of escape sequences consisting of a string of digits
starting with one that is not zero is not straightforward. Perl has changed
over the years. Nowadays \g{} for backreferences and \o{} for octal are
recommended to avoid the ambiguities in the old syntax.
Outside a character class, the digits are read as a decimal number. If the
number is less than 10, or if there are that many previous extracting left
brackets, it is a back reference. Otherwise, up to three octal digits are
read to form an escaped character code. Thus \123 is likely to be octal 123
(cf \0123, which is octal 012 followed by the literal 3).
Inside a character class, \ followed by a digit is always either a literal
8 or 9 or an octal number. */
case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
if (!isclass)
{
oldptr = ptr;
/* The integer range is limited by the machine's int representation. */
s = c - CHAR_0;
overflow = FALSE;
while (IS_DIGIT(ptr[1]))
{
if (s > INT_MAX / 10 - 1) /* Integer overflow */
{
overflow = TRUE;
break;
}
s = s * 10 + (int)(*(++ptr) - CHAR_0);
}
if (overflow) /* Integer overflow */
{
while (IS_DIGIT(ptr[1])) ptr++;
*errorcodeptr = ERR61;
break;
}
/* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
are octal escapes if there are not that many previous captures. */
if (s < 10 || *oldptr >= CHAR_8 || s <= cb->bracount)
{
escape = -(int)s; /* Indicates a back reference */
break;
}
ptr = oldptr; /* Put the pointer back and fall through */
}
/* Handle a digit following \ when the number is not a back reference, or
we are within a character class. If the first digit is 8 or 9, Perl used to
generate a binary zero byte and then treat the digit as a following
literal. At least by Perl 5.18 this changed so as not to insert the binary
zero. */
if ((c = *ptr) >= CHAR_8) break;
/* Fall through with a digit less than 8 */
/* \0 always starts an octal number, but we may drop through to here with a
larger first octal digit. The original code used just to take the least
significant 8 bits of octal numbers (I think this is what early Perls used
to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
but no more than 3 octal digits. */
case CHAR_0:
c -= CHAR_0;
while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
c = c * 8 + *(++ptr) - CHAR_0;
#if PCRE2_CODE_UNIT_WIDTH == 8
if (!utf && c > 0xff) *errorcodeptr = ERR51;
#endif
break;
/* \o is a relatively new Perl feature, supporting a more general way of
specifying character codes in octal. The only supported form is \o{ddd}. */
case CHAR_o:
if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR55; else
if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR78; else
{
ptr += 2;
c = 0;
overflow = FALSE;
while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
{
cc = *ptr++;
if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
#if PCRE2_CODE_UNIT_WIDTH == 32
if (c >= 0x20000000l) { overflow = TRUE; break; }
#endif
c = (c << 3) + (cc - CHAR_0);
#if PCRE2_CODE_UNIT_WIDTH == 8
if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
#elif PCRE2_CODE_UNIT_WIDTH == 16
if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
#elif PCRE2_CODE_UNIT_WIDTH == 32
if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
#endif
}
if (overflow)
{
while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
*errorcodeptr = ERR34;
}
else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
{
if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
}
else *errorcodeptr = ERR64;
}
break;
/* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by
two hexadecimal digits. Otherwise it is a lowercase x letter. */
case CHAR_x:
if ((options & PCRE2_ALT_BSUX) != 0)
{
uint32_t xc;
if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
c = (cc << 4) | xc;
ptr += 2;
} /* End PCRE2_ALT_BSUX handling */
/* Handle \x in Perl's style. \x{ddd} is a character number which can be
greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
digits. If not, { used to be treated as a data character. However, Perl
seems to read hex digits up to the first non-such, and ignore the rest, so
that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
now gives an error. */
else
{
if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
{
ptr += 2;
if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
{
*errorcodeptr = ERR78;
break;
}
c = 0;
overflow = FALSE;
while ((cc = XDIGIT(*ptr)) != 0xff)
{
ptr++;
if (c == 0 && cc == 0) continue; /* Leading zeroes */
#if PCRE2_CODE_UNIT_WIDTH == 32
if (c >= 0x10000000l) { overflow = TRUE; break; }
#endif
c = (c << 4) | cc;
if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
{
overflow = TRUE;
break;
}
}
if (overflow)
{
while (XDIGIT(*ptr) != 0xff) ptr++;
*errorcodeptr = ERR34;
}
else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
{
if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
}
/* If the sequence of hex digits does not end with '}', give an error.
We used just to recognize this construct and fall through to the normal
\x handling, but nowadays Perl gives an error, which seems much more
sensible, so we do too. */
else *errorcodeptr = ERR67;
} /* End of \x{} processing */
/* Read a single-byte hex-defined char (up to two hex digits after \x) */
else
{
c = 0;
if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
ptr++;
c = cc;
if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
ptr++;
c = (c << 4) | cc;
} /* End of \xdd handling */
} /* End of Perl-style \x handling */
break;
/* The handling of \c is different in ASCII and EBCDIC environments. In an
ASCII (or Unicode) environment, an error is given if the character
following \c is not a printable ASCII character. Otherwise, the following
character is upper-cased if it is a letter, and after that the 0x40 bit is
flipped. The result is the value of the escape.
In an EBCDIC environment the handling of \c is compatible with the
specification in the perlebcdic document. The following character must be
a letter or one of small number of special characters. These provide a
means of defining the character values 0-31.
For testing the EBCDIC handling of \c in an ASCII environment, recognize
the EBCDIC value of 'c' explicitly. */
#if defined EBCDIC && 'a' != 0x81
case 0x83:
#else
case CHAR_c:
#endif
c = *(++ptr);
if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
if (c == CHAR_NULL && ptr >= ptrend)
{
*errorcodeptr = ERR2;
break;
}
/* Handle \c in an ASCII/Unicode environment. */
#ifndef EBCDIC /* ASCII/UTF-8 coding */
if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
{
*errorcodeptr = ERR68;
break;
}
c ^= 0x40;
/* Handle \c in an EBCDIC environment. The special case \c? is converted to
255 (0xff) or 95 (0x5f) if other characters suggest we are using the POSIX-BC
encoding. (This is the way Perl indicates that it handles \c?.) The other
valid sequences correspond to a list of specific characters. */
#else
if (c == CHAR_QUESTION_MARK)
c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
else
{
for (i = 0; i < 32; i++)
{
if (c == ebcdic_escape_c[i]) break;
}
if (i < 32) c = i; else *errorcodeptr = ERR68;
}
#endif /* EBCDIC */
break;
/* Any other alphanumeric following \ is an error. Perl gives an error only
if in warning mode, but PCRE doesn't have a warning mode. */
default:
*errorcodeptr = ERR3;
break;
}
}
/* Perl supports \N{name} for character names, as well as plain \N for "not
newline". PCRE does not support \N{name}. However, it does support
quantification such as \N{2,3}. */
if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
!is_counted_repeat(ptr+2))
*errorcodeptr = ERR37;
/* If PCRE2_UCP is set, we change the values for \d etc. */
if ((options & PCRE2_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
escape += (ESC_DU - ESC_D);
/* Set the pointer to the final character before returning. */
*ptrptr = ptr;
*chptr = c;
return escape;
}
#ifdef SUPPORT_UNICODE
/*************************************************
* Handle \P and \p *
*************************************************/
/* This function is called after \P or \p has been encountered, provided that
PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
contents of ptrptr are pointing at the P or p. On exit, it is left pointing at
the final code unit of the escape sequence.
Arguments:
ptrptr the pattern position pointer
negptr a boolean that is set TRUE for negation else FALSE
ptypeptr an unsigned int that is set to the type value
pdataptr an unsigned int that is set to the detailed property value
errorcodeptr the error code variable
cb the compile data
Returns: TRUE if the type value was found, or FALSE for an invalid type
*/
static BOOL
get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, unsigned int *ptypeptr,
  unsigned int *pdataptr, int *errorcodeptr, compile_block *cb)
{
PCRE2_UCHAR name[32];
PCRE2_SPTR cursor = *ptrptr;
PCRE2_UCHAR ch;
int lo, hi;

*negptr = FALSE;
ch = *(++cursor);

/* Braced form: {name} or {^name}. The name is copied into the local buffer,
always leaving room for the terminating zero. */

if (ch == CHAR_LEFT_CURLY_BRACKET)
  {
  const int maxlen = (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1;
  int len = 0;

  if (cursor[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    cursor++;
    }

  while (len < maxlen)
    {
    ch = *(++cursor);
    if (ch == CHAR_NULL || ch == CHAR_RIGHT_CURLY_BRACKET) break;
    name[len++] = ch;
    }

  /* If the last code unit read is not '}', either the pattern ended or the
  name did not fit in the buffer. */

  if (ch != CHAR_RIGHT_CURLY_BRACKET) goto MALFORMED;
  name[len] = 0;
  }

/* Short form: exactly one following code unit, which must be classed as a
letter in the ctypes table. */

else
  {
  if (!MAX_255(ch) || (cb->ctypes[ch] & ctype_letter) == 0) goto MALFORMED;
  name[0] = ch;
  name[1] = 0;
  }

*ptrptr = cursor;

/* Binary chop through the table of property names. */

lo = 0;
hi = PRIV(utt_size);
while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[mid].name_offset);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *ptypeptr = PRIV(utt)[mid].type;
    *pdataptr = PRIV(utt)[mid].value;
    return TRUE;
    }
  }

*errorcodeptr = ERR47;          /* Name not in the table */
return FALSE;

MALFORMED:                      /* Syntax error in \P or \p */
*errorcodeptr = ERR46;
*ptrptr = cursor;
return FALSE;
}
#endif
/*************************************************
* Read repeat counts *
*************************************************/
/* Read an item of the form {n,m} and return the values. This is called only
after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
so the syntax is guaranteed to be correct, but we need to check the values.
Arguments:
p pointer to first char after '{'
minp pointer to int for min
maxp pointer to int for max
returned as -1 if no max
errorcodeptr points to error code variable
Returns: pointer to '}' on success;
current ptr on error, with errorcodeptr set non-zero
*/
static PCRE2_SPTR
read_repeat_counts(PCRE2_SPTR p, int *minp, int *maxp, int *errorcodeptr)
{
int lower = 0;
int upper = -1;                 /* Stays -1 when no maximum is given */

/* Accumulate the minimum, rejecting it as soon as it exceeds 65535. The
pointer has already moved past the offending digit when the error is raised. */

while (IS_DIGIT(*p))
  {
  lower = lower * 10 + (int)(*p - CHAR_0);
  p++;
  if (lower > 65535) goto TOO_BIG;
  }

/* "{n}" means exactly n. Otherwise step over the separator; if a closing
brace follows immediately the maximum is left open, else read the maximum. */

if (*p == CHAR_RIGHT_CURLY_BRACKET)
  {
  upper = lower;
  }
else if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
  {
  upper = 0;
  while (IS_DIGIT(*p))
    {
    upper = upper * 10 + (int)(*p - CHAR_0);
    p++;
    if (upper > 65535) goto TOO_BIG;
    }
  if (upper < lower)
    {
    *errorcodeptr = ERR4;
    return p;
    }
  }

/* The outputs are written only when both values are acceptable. */

*minp = lower;
*maxp = upper;
return p;

TOO_BIG:
*errorcodeptr = ERR5;
return p;
}
/*************************************************
* Scan compiled regex for recursion reference *
*************************************************/
/* This function scans through a compiled pattern until it finds an instance of
OP_RECURSE.
Arguments:
code points to start of expression
utf TRUE in UTF mode
Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
*/
static PCRE2_SPTR
find_recurse(PCRE2_SPTR code, BOOL utf)
{
/* Step through the compiled code one item at a time. The loop ends only by
returning: NULL when OP_END is reached, or the current position when an
OP_RECURSE opcode is found. */
for (;;)
{
register PCRE2_UCHAR c = *code;
if (c == OP_END) return NULL;
if (c == OP_RECURSE) return code;
/* XCLASS is used for classes that cannot be represented just by a bit map.
This includes negated single high-valued characters. CALLOUT_STR is used for
callouts with string arguments. In both cases the length in the table is
zero; the actual length is stored in the compiled code and is fetched with
GET(). */
if (c == OP_XCLASS) code += GET(code, 1);
else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
/* Otherwise, we can get the item's length from the table, except that for
repeated character types, we have to test for \p and \P, which have two extra
units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we must
add in its length. These adjustments are made first; the fixed table length
is added afterwards. */
else
{
switch(c)
{
/* Repeats whose type code immediately follows the opcode. */
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
case OP_TYPEPLUS:
case OP_TYPEMINPLUS:
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
case OP_TYPEPOSSTAR:
case OP_TYPEPOSPLUS:
case OP_TYPEPOSQUERY:
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
break;
/* Repeats that carry a count (IMM2_SIZE units) before the type code. */
case OP_TYPEPOSUPTO:
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEEXACT:
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
code += 2;
break;
/* Verbs with an argument: code[1] holds the amount to add for the argument. */
case OP_MARK:
case OP_PRUNE_ARG:
case OP_SKIP_ARG:
case OP_THEN_ARG:
code += code[1];
break;
}
/* Add in the fixed length from the table */
code += PRIV(OP_lengths)[c];
/* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
be followed by a multi-unit character. The length in the table is a
minimum, so we have to arrange to skip the extra units. Because code has
already been advanced by the table length, code[-1] is the last unit counted
so far; HAS_EXTRALEN/GET_EXTRALEN are applied to it to find how many further
units to skip. */
#ifdef MAYBE_UTF_MULTI
if (utf) switch(c)
{
case OP_CHAR:
case OP_CHARI:
case OP_NOT:
case OP_NOTI:
case OP_EXACT:
case OP_EXACTI:
case OP_NOTEXACT:
case OP_NOTEXACTI:
case OP_UPTO:
case OP_UPTOI:
case OP_NOTUPTO:
case OP_NOTUPTOI:
case OP_MINUPTO:
case OP_MINUPTOI:
case OP_NOTMINUPTO:
case OP_NOTMINUPTOI:
case OP_POSUPTO:
case OP_POSUPTOI:
case OP_NOTPOSUPTO:
case OP_NOTPOSUPTOI:
case OP_STAR:
case OP_STARI:
case OP_NOTSTAR:
case OP_NOTSTARI:
case OP_MINSTAR:
case OP_MINSTARI:
case OP_NOTMINSTAR:
case OP_NOTMINSTARI:
case OP_POSSTAR:
case OP_POSSTARI:
case OP_NOTPOSSTAR:
case OP_NOTPOSSTARI:
case OP_PLUS:
case OP_PLUSI:
case OP_NOTPLUS:
case OP_NOTPLUSI:
case OP_MINPLUS:
case OP_MINPLUSI:
case OP_NOTMINPLUS:
case OP_NOTMINPLUSI:
case OP_POSPLUS:
case OP_POSPLUSI:
case OP_NOTPOSPLUS:
case OP_NOTPOSPLUSI:
case OP_QUERY:
case OP_QUERYI:
case OP_NOTQUERY:
case OP_NOTQUERYI:
case OP_MINQUERY:
case OP_MINQUERYI:
case OP_NOTMINQUERY:
case OP_NOTMINQUERYI:
case OP_POSQUERY:
case OP_POSQUERYI:
case OP_NOTPOSQUERY:
case OP_NOTPOSQUERYI:
if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
break;
}
#else
(void)(utf); /* Keep compiler happy by referencing function argument */
#endif /* MAYBE_UTF_MULTI */
}
}
}
/*************************************************
* Check for POSIX class syntax *
*************************************************/
/* This function is called when the sequence "[:" or "[." or "[=" is
encountered in a character class. It checks whether this is followed by a
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
reach an unescaped ']' without the special preceding character, return FALSE.
Originally, this function only recognized a sequence of letters between the
terminators, but it seems that Perl recognizes any sequence of characters,
though of course unknown POSIX names are subsequently rejected. Perl gives an
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
didn't consider this to be a POSIX class. Likewise for [:1234:].
The problem in trying to be exactly like Perl is in the handling of escapes. We
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
below handles the special cases \\ and \], but does not try to do any other
escape processing. This makes it different from Perl for cases such as
[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
when Perl does, I think.
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
It seems that the appearance of a nested POSIX class supersedes an apparent
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
a digit. This is handled by returning FALSE if the start of a new group with
the same terminator is encountered, since the next closing sequence must close
the nested group, not the outer one.
In Perl, unescaped square brackets may also appear as part of class names. For
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
seem right at all. PCRE does not allow closing square brackets in POSIX class
names.
Arguments:
ptr pointer to the initial [
endptr where to return a pointer to the terminating ':', '.', or '='
Returns: TRUE or FALSE
*/
static BOOL
check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR *endptr)
{
PCRE2_UCHAR terminator;         /* Kept separate from its assignment; the */
PCRE2_SPTR s;                   /* Solaris cc compiler warns about */
                                /* "non-constant" initializers. */
terminator = ptr[1];            /* The ':', '.', or '=' that follows '[' */
s = ptr + 2;

while (*s != CHAR_NULL)
  {
  PCRE2_UCHAR cur = s[0];
  PCRE2_UCHAR nxt = s[1];       /* Safe: s[0] is not the terminating zero */

  /* Only \\ and \] are treated as escapes; skip both code units. */

  if (cur == CHAR_BACKSLASH &&
      (nxt == CHAR_RIGHT_SQUARE_BRACKET || nxt == CHAR_BACKSLASH))
    {
    s += 2;
    continue;
    }

  /* A bare ']' or the start of a nested group with the same terminator means
  that this is not a POSIX class. */

  if (cur == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
  if (cur == CHAR_LEFT_SQUARE_BRACKET && nxt == terminator) return FALSE;

  /* Found the matching closing pair. */

  if (cur == terminator && nxt == CHAR_RIGHT_SQUARE_BRACKET)
    {
    *endptr = s;
    return TRUE;
    }

  s++;
  }

return FALSE;                   /* Ran off the end of the pattern */
}
/*************************************************
* Check POSIX class name *
*************************************************/
/* This function is called to check the name given in a POSIX-style class entry
such as [:alnum:].
Arguments:
ptr points to the first letter
len the length of the name
Returns: a value representing the name, or -1 if unknown
*/
static int
check_posix_name(PCRE2_SPTR ptr, int len)
{
const char *entry = posix_names;
int idx;

/* The names are stored back to back in posix_names, each followed by one
extra byte; posix_name_lengths gives each length and ends with a zero. */

for (idx = 0; posix_name_lengths[idx] != 0; idx++)
  {
  int entrylen = posix_name_lengths[idx];
  if (entrylen == len &&
      PRIV(strncmp_c8)(ptr, entry, (unsigned int)len) == 0)
    return idx;
  entry += entrylen + 1;
  }

return -1;                      /* No name matched */
}
#ifdef SUPPORT_UNICODE
/*************************************************
* Get othercase range *
*************************************************/
/* This function is passed the start and end of a class range in UCT mode. It
searches up the characters, looking for ranges of characters in the "other"
case. Each call returns the next one, updating the start address. A character
with multiple other cases is returned on its own with a special return value.
Arguments:
cptr points to starting character value; updated
d end value
ocptr where to put start of othercase range
odptr where to put end of othercase range
Yield: -1 when no more
0 when a range is returned
>0 the CASESET offset for char with multiple other cases
in this case, ocptr contains the original
*/
static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t ch = *cptr;
uint32_t oc;
uint32_t expected;

/* Skip characters that are their own other case. A character that belongs to
a caseless set is handed back on its own, with the set offset as the yield. */

for (;;)
  {
  unsigned int caseset;

  if (ch > d) return -1;        /* Input range exhausted */

  caseset = UCD_CASESET(ch);
  if (caseset != 0)
    {
    *ocptr = ch;                /* The character that has the set */
    *cptr = ch + 1;             /* Where the next call resumes */
    return (int)caseset;
    }

  oc = UCD_OTHERCASE(ch);
  if (oc != ch) break;
  ch++;
  }

/* ch has exactly one other case. Extend the run for as long as following
characters are in range, have no caseless set, and map to consecutive
other-case values. */

*ocptr = oc;
expected = oc + 1;
ch++;

while (ch <= d && UCD_CASESET(ch) == 0 && UCD_OTHERCASE(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;          /* Last value of the other-case run */
*cptr = ch;                     /* Where the next call resumes */
return 0;
}
#endif /* SUPPORT_UNICODE */
/*************************************************
* Add a character or range to a class *
*************************************************/
/* This function packages up the logic of adding a character or range of
characters to a class. The character values in the arguments will be within the
valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
mutually recursive with the function immediately below.
Arguments:
classbits the bit map for characters < 256
uchardptr points to the pointer for extra data
options the options word
cb compile data
start start of range character
end end of range character
Returns: the number of < 256 characters added
the pointer to extra data is updated
*/
static int
add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
compile_block *cb, uint32_t start, uint32_t end)
{
uint32_t c;
/* classbits_end is the last value of the range that goes into the bit map: it
is end clamped to 0xff. */
uint32_t classbits_end = (end <= 0xff ? end : 0xff);
int n8 = 0;
/* If caseless matching is required, scan the range and process alternate
cases. In Unicode, there are 8-bit characters that have alternate cases that
are greater than 255 and vice-versa. Sometimes we can just extend the original
range. */
if ((options & PCRE2_CASELESS) != 0)
{
#ifdef SUPPORT_UNICODE
if ((options & PCRE2_UTF) != 0)
{
int rc;
uint32_t oc, od;
options &= ~PCRE2_CASELESS; /* Remove for recursive calls */
c = start;
/* Each call yields the next other-case item in start..end: rc > 0 for a
character with a caseless set (oc is that character), rc == 0 for an
other-case range oc..od, and rc < 0 when there are no more. */
while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
{
/* Handle a single character that has more than one other case. */
if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cb,
PRIV(ucd_caseless_sets) + rc, oc);
/* Do nothing if the other case range is within the original range. */
else if (oc >= start && od <= end) continue;
/* Extend the original range if there is overlap, noting that if oc < c, we
can't have od > end because a subrange is always shorter than the basic
range. Otherwise, use a recursive call to add the additional range. */
else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
else if (od > end && oc <= end + 1)
{
end = od; /* Extend upwards */
/* Keep the bit map limit in step with the extended end. */
if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
}
else n8 += add_to_class(classbits, uchardptr, options, cb, oc, od);
}
}
else
#endif /* SUPPORT_UNICODE */
/* Not UTF mode: set the bit for the fcc table entry of every character in
the bit map part of the range. */
for (c = start; c <= classbits_end; c++)
{
SETBIT(classbits, cb->fcc[c]);
n8++;
}
}
/* Now handle the original range. Adjust the final value according to the bit
length - this means that the same lists of (e.g.) horizontal spaces can be used
in all cases. */
if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
end = MAX_NON_UTF_CHAR;
/* Use the bitmap for characters < 256. Otherwise use extra data. */
for (c = start; c <= classbits_end; c++)
{
/* Regardless of start, c will always be <= 255. */
SETBIT(classbits, c);
n8++;
}
#ifdef SUPPORT_WIDE_CHARS
/* Whatever part of the range lies above 0xff is written as extra data: an
XCL_RANGE item for two or more characters, an XCL_SINGLE item for one. */
if (start <= 0xff) start = 0xff + 1;
if (end >= start)
{
PCRE2_UCHAR *uchardata = *uchardptr;
#ifdef SUPPORT_UNICODE
if ((options & PCRE2_UTF) != 0)
{
if (start < end)
{
*uchardata++ = XCL_RANGE;
uchardata += PRIV(ord2utf)(start, uchardata);
uchardata += PRIV(ord2utf)(end, uchardata);
}
else if (start == end)
{
*uchardata++ = XCL_SINGLE;
uchardata += PRIV(ord2utf)(start, uchardata);
}
}
else
#endif /* SUPPORT_UNICODE */
/* Without UTF support, character values are constrained by the bit length,
and can only be > 255 for 16-bit and 32-bit libraries. */
#if PCRE2_CODE_UNIT_WIDTH == 8
{}
#else
if (start < end)
{
*uchardata++ = XCL_RANGE;
*uchardata++ = start;
*uchardata++ = end;
}
else if (start == end)
{
*uchardata++ = XCL_SINGLE;
*uchardata++ = start;
}
#endif
*uchardptr = uchardata; /* Update extra data pointer */
}
#else
(void)uchardptr; /* Avoid compiler warning */
#endif /* SUPPORT_WIDE_CHARS */
return n8; /* Number of 8-bit characters */
}
/*************************************************
* Add a list of characters to a class *
*************************************************/
/* This function is used for adding a list of case-equivalent characters to a
class, and also for adding a list of horizontal or vertical whitespace. If the
list is in order (which it should be), ranges of characters are detected and
handled appropriately. This function is mutually recursive with the function
above.
Arguments:
classbits the bit map for characters < 256
uchardptr points to the pointer for extra data
options the options word
cb contains pointers to tables etc.
p points to row of 32-bit values, terminated by NOTACHAR
except character to omit; this is used when adding lists of
case-equivalent characters to avoid including the one we
already know about
Returns: the number of < 256 characters added
the pointer to extra data is updated
*/
static int
add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
  compile_block *cb, const uint32_t *p, unsigned int except)
{
const uint32_t *run = p;
int count = 0;

while (*run < NOTACHAR)
  {
  const uint32_t *last = run;

  /* The excluded character is skipped on its own. */

  if (*run == except)
    {
    run++;
    continue;
    }

  /* Gather a run of consecutive values and add it as a single range. */

  while (last[1] == last[0] + 1) last++;
  count += add_to_class(classbits, uchardptr, options, cb, *run, *last);
  run = last + 1;
  }

return count;
}
/*************************************************
* Add characters not in a list to a class *
*************************************************/
/* This function is used for adding the complement of a list of horizontal or
vertical whitespace to a class. The list must be in order.
Arguments:
classbits the bit map for characters < 256
uchardptr points to the pointer for extra data
options the options word
cb contains pointers to tables etc.
p points to row of 32-bit values, terminated by NOTACHAR
Returns: the number of < 256 characters added
the pointer to extra data is updated
*/
static int
add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
  uint32_t options, compile_block *cb, const uint32_t *p)
{
/* Upper limit of the final gap, used when the list terminator is reached. */
const uint32_t top = ((options & PCRE2_UTF) != 0)? 0x10ffffu : 0xffffffffu;
int total = 0;

/* Everything below the first list entry. */

if (p[0] != 0)
  total += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);

/* Then each gap that follows a run of consecutive list entries. */

for (; p[0] < NOTACHAR; p++)
  {
  uint32_t gap_start, gap_end;

  while (p[1] == p[0] + 1) p++;
  gap_start = p[0] + 1;
  gap_end = (p[1] == NOTACHAR)? top : p[1] - 1;
  total += add_to_class(classbits, uchardptr, options, cb, gap_start, gap_end);
  }

return total;
}
/*************************************************
* Process (*VERB) name for escapes *
*************************************************/
/* This function is called when the PCRE2_ALT_VERBNAMES option is set, to
process the characters in a verb's name argument. It is called twice, once with
codeptr == NULL, to find out the length of the processed name, and again to put
the name into memory.
Arguments:
ptrptr pointer to the input pointer
codeptr pointer to the compiled code pointer
errorcodeptr pointer to the error code
options the options bits
utf TRUE if processing UTF
cb compile data block
Returns: length of the processed name, or < 0 on error
*/
static int
process_verb_name(PCRE2_SPTR *ptrptr, PCRE2_UCHAR **codeptr, int *errorcodeptr,
  uint32_t options, BOOL utf, compile_block *cb)
{
int32_t arglen = 0;
BOOL inescq = FALSE;
PCRE2_SPTR ptr = *ptrptr;
PCRE2_UCHAR *code = (codeptr == NULL)? NULL : *codeptr;

/* Summary: scan the name from *ptrptr up to an unquoted ')' or the end of the
pattern. When codeptr is NULL only the length is computed; otherwise the
processed name is also written through *codeptr, which is advanced. On success
*ptrptr is left at the stopping position and the length is returned. On failure
*errorcodeptr is set, *ptrptr marks the error position, and -1 is returned. */

#ifndef SUPPORT_UNICODE
(void)utf;   /* Keep compiler happy by referencing function argument */
#endif

for (; ptr < cb->end_pattern; ptr++)
  {
  uint32_t x = *ptr;

  /* Inside \Q...\E every code unit is taken literally, except that \E ends
  the literal run and is itself discarded. */

  if (inescq)
    {
    if (x == CHAR_BACKSLASH && ptr[1] == CHAR_E)
      {
      inescq = FALSE;
      ptr++;
      continue;
      }
    }

  else   /* Not a literal character */
    {
    if (x == CHAR_RIGHT_PARENTHESIS) break;

    /* Skip over comments and whitespace in extended mode. */

    if ((options & PCRE2_EXTENDED) != 0)
      {
      PCRE2_SPTR wscptr = ptr;
      while (MAX_255(x) && (cb->ctypes[x] & ctype_space) != 0) x = *(++ptr);
      if (x == CHAR_NUMBER_SIGN)
        {
        ptr++;
        while (*ptr != CHAR_NULL || ptr < cb->end_pattern)
          {
          if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
            {                          /* IS_NEWLINE sets cb->nllen. */
            ptr += cb->nllen;
            break;
            }
          ptr++;
#ifdef SUPPORT_UNICODE
          if (utf) FORWARDCHAR(ptr);
#endif
          }
        }

      /* If we have skipped any characters, restart the loop. The pointer is
      backed up by one because the loop increment will advance it again. */

      if (ptr > wscptr)
        {
        ptr--;
        continue;
        }
      }

    /* Process escapes. A zero return from check_escape means that x now holds
    a data character. \Q switches to literal mode and \E is ignored; any other
    escape is an error. */

    if (x == CHAR_BACKSLASH)
      {
      int rc;
      *errorcodeptr = 0;
      rc = PRIV(check_escape)(&ptr, cb->end_pattern, &x, errorcodeptr, options,
        FALSE, cb);
      *ptrptr = ptr;   /* For possible error */
      if (*errorcodeptr != 0) return -1;
      if (rc != 0)
        {
        if (rc == ESC_Q)
          {
          inescq = TRUE;
          continue;
          }
        if (rc == ESC_E) continue;
        *errorcodeptr = ERR40;
        return -1;
        }
      }
    }

  /* We have the next character in the name. */

#ifdef SUPPORT_UNICODE
  if (utf)
    {
    if (code == NULL)   /* Just want the length */
      {
      /* Count the code units beyond the first that x needs; the first is
      counted by the arglen++ below. */
#if PCRE2_CODE_UNIT_WIDTH == 8
      int i;
      for (i = 0; i < PRIV(utf8_table1_size); i++)
        if ((int)x <= PRIV(utf8_table1)[i]) break;
      arglen += i;
#elif PCRE2_CODE_UNIT_WIDTH == 16
      if (x > 0xffff) arglen++;
#endif
      }
    else
      {
      PCRE2_UCHAR cbuff[8];
      x = PRIV(ord2utf)(x, cbuff);   /* x is reused for the encoded length */
      memcpy(code, cbuff, CU2BYTES(x));
      code += x;
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Not UTF */
    {
    if (code != NULL) *code++ = x;
    }

  arglen++;

  if ((unsigned int)arglen > MAX_MARK)
    {
    *errorcodeptr = ERR76;
    *ptrptr = ptr;
    return -1;
    }
  }

/* Update the pointers before returning. */

*ptrptr = ptr;
if (codeptr != NULL) *codeptr = code;
return arglen;
}