| /************************************************* |
| * Perl-Compatible Regular Expressions * |
| *************************************************/ |
| |
| /* PCRE is a library of functions to support regular expressions whose syntax |
| and semantics are as close as possible to those of the Perl 5 language. |
| |
| Written by Philip Hazel |
| Original API code Copyright (c) 1997-2012 University of Cambridge |
| New API code Copyright (c) 2016 University of Cambridge |
| |
| ----------------------------------------------------------------------------- |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions are met: |
| |
| * Redistributions of source code must retain the above copyright notice, |
| this list of conditions and the following disclaimer. |
| |
| * Redistributions in binary form must reproduce the above copyright |
| notice, this list of conditions and the following disclaimer in the |
| documentation and/or other materials provided with the distribution. |
| |
| * Neither the name of the University of Cambridge nor the names of its |
| contributors may be used to endorse or promote products derived from |
| this software without specific prior written permission. |
| |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| POSSIBILITY OF SUCH DAMAGE. |
| ----------------------------------------------------------------------------- |
| */ |
| |
| |
| #ifdef HAVE_CONFIG_H |
| #include "config.h" |
| #endif |
| |
| #define NLBLOCK cb /* Block containing newline information */ |
| #define PSSTART start_pattern /* Field containing processed string start */ |
| #define PSEND end_pattern /* Field containing processed string end */ |
| |
| #include "pcre2_internal.h" |
| |
| /* In rare error cases debugging might require calling pcre2_printint(). */ |
| |
| #if 0 |
| #ifdef EBCDIC |
| #define PRINTABLE(c) ((c) >= 64 && (c) < 255) |
| #else |
| #define PRINTABLE(c) ((c) >= 32 && (c) < 127) |
| #endif |
| #include "pcre2_printint.c" |
| #define CALL_PRINTINT |
| #endif |
| |
| /* There are a few things that vary with different code unit sizes. Handle them |
| by defining macros in order to minimize #if usage. |
| |
| STRING_UTFn_RIGHTPAR expands to TWO comma-separated items: the name string for |
| the current code unit width followed by its length (5 or 6). It is written this |
| way so that it can supply both the name and the length initializers of an entry |
| in pso_list. |
| |
| XDIGIT(c) looks c up in xdigitab, whose entries are the hex digit value or 0xff |
| for a non-hex digit. In the 8-bit library the lookup is direct. In the wider |
| libraries the lookup is done only when MAX_255(c) is true (presumably c <= 255; |
| the macro is defined elsewhere - confirm), otherwise the result is 0xff. The |
| wider form evaluates its argument twice, so c must not have side effects. */ |
| |
| #if PCRE2_CODE_UNIT_WIDTH == 8 |
| #define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5 |
| #define XDIGIT(c) xdigitab[c] |
| |
| #else /* Either 16-bit or 32-bit */ |
| #define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff) |
| |
| #if PCRE2_CODE_UNIT_WIDTH == 16 |
| #define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6 |
| |
| #else /* 32-bit */ |
| #define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6 |
| #endif |
| #endif |
| |
| /* Forward declarations (prototypes only, with unnamed parameters) to allow |
| mutual recursion. Both functions are static, so their definitions must appear |
| in this translation unit. */ |
| |
| static int |
| add_list_to_class(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *, |
| const uint32_t *, unsigned int); |
| |
| static BOOL |
| compile_regex(uint32_t, PCRE2_UCHAR **, PCRE2_SPTR *, int *, BOOL, BOOL, |
| uint32_t, int, uint32_t *, int32_t *, uint32_t *, int32_t *, |
| branch_chain *, compile_block *, size_t *); |
| |
| |
| |
| /************************************************* |
| * Code parameters and static tables * |
| *************************************************/ |
| |
| /* This value specifies the size of stack workspace, which is used in different |
| ways in the different pattern scans. The group-identifying pre-scan uses it to |
| handle nesting, and needs it to be 16-bit aligned. |
| |
| During the first compiling phase, when determining how much memory is required, |
| the regex is partly compiled into this space, but the compiled parts are |
| discarded as soon as they can be, so that hopefully there will never be an |
| overrun. The code does, however, check for an overrun, which can occur for |
| pathological patterns. The size of the workspace depends on LINK_SIZE because |
| the length of compiled items varies with this. |
| |
| In the real compile phase, the workspace is used for remembering data about |
| numbered groups, provided there are not too many of them (if there are, extra |
| memory is acquired). For this phase the memory must be 32-bit aligned. Having |
| defined the size in code units, we set up C32_WORK_SIZE as the number of |
| elements in the 32-bit vector. */ |
| |
| #define COMPILE_WORK_SIZE (2048*LINK_SIZE) /* Size in code units */ |
| |
| #define C32_WORK_SIZE \ |
| ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint32_t)) |
| |
| /* The overrun tests check for a slightly smaller size so that they detect the |
| overrun before it actually does run off the end of the data block. */ |
| |
| #define WORK_SIZE_SAFETY_MARGIN (100) |
| |
| /* This value determines the size of the initial vector that is used for |
| remembering named groups during the pre-compile. It is allocated on the stack, |
| but if it is too small, it is expanded, in a similar way to the workspace. The |
| value is the number of slots in the list. */ |
| |
| #define NAMED_GROUP_LIST_SIZE 20 |
| |
| /* The original PCRE required patterns to be zero-terminated, and it simplifies |
| the compiling code if it is guaranteed that there is a zero code unit at the |
| end of the pattern, because this means that tests for coding sequences such as |
| (*SKIP) or even just (?<= can check a sequence of code units without having to |
| keep checking for the end of the pattern. The new PCRE2 API allows zero code |
| units within patterns if a positive length is given, but in order to keep most |
| of the compiling code as it was, we copy such patterns and add a zero on the |
| end. This value determines the size of space on the stack that is used if the |
| pattern fits; if not, heap memory is used. */ |
| |
| #define COPIED_PATTERN_SIZE 1024 |
| |
| /* Maximum length value to check against when making sure that the variable |
| that holds the compiled pattern length does not overflow. We make it a bit less |
| than INT_MAX to allow for adding in group terminating bytes, so that we don't |
| have to check them every time. */ |
| |
| #define OFLOW_MAX (INT_MAX - 20) |
| |
| /* Macro for setting individual bits in class bitmaps: it sets bit (b & 7) of |
| the byte a[b/8]. The argument a is not parenthesized and the expansion is a |
| bare |= expression, so pass a plain array or pointer name for a and use the |
| macro only as a complete statement. */ |
| |
| #define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7)) |
| |
| /* Private flags added to firstcu and reqcu. */ |
| |
| #define REQ_CASELESS (1 << 0) /* Indicates caselessness */ |
| #define REQ_VARY (1 << 1) /* reqcu followed non-literal item */ |
| /* Negative values for the firstcu and reqcu flags */ |
| #define REQ_UNSET (-2) /* Not yet found anything */ |
| #define REQ_NONE (-1) /* Found not fixed char */ |
| |
| /* These flags are used in the groupinfo vector. */ |
| |
| #define GI_SET_COULD_BE_EMPTY 0x80000000u |
| #define GI_COULD_BE_EMPTY 0x40000000u |
| #define GI_NOT_FIXED_LENGTH 0x20000000u |
| #define GI_SET_FIXED_LENGTH 0x10000000u |
| #define GI_FIXED_LENGTH_MASK 0x0000ffffu |
| |
| /* This bit (which is greater than any UTF value) is used to indicate that a |
| variable contains a number of code units instead of an actual code point. Note |
| that the last character of the constant below is the letter 'l' (the suffix for |
| type long), not the digit one; the value is 0x10000000. */ |
| |
| #define UTF_LENGTH 0x10000000l |
| |
| /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC |
| and is fast (a good compiler can turn it into a subtraction and unsigned |
| comparison). The argument is evaluated twice, so it must not have side |
| effects. */ |
| |
| #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9) |
| |
| /* Table to identify hex digits. The tables in chartables are dependent on the |
| locale, and may mark arbitrary characters as digits. We want to recognize only |
| 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It |
| costs 256 bytes, but it is a lot faster than doing character value tests (at |
| least in some simple cases I timed), and in some applications one wants PCRE to |
| compile efficiently as well as match efficiently. The value in the table is |
| the binary hex digit value, or 0xff for non-hex digits. */ |
| |
| /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in |
| UTF-8 mode. */ |
| |
| #ifndef EBCDIC |
| /* Indexed by character value (0-255). Each entry is the binary value of the |
| hex digit, or 0xff when the character is not a hex digit. Only '0'-'9', 'A'-'F' |
| and 'a'-'f' have entries other than 0xff. */ |
| |
| static const uint8_t xdigitab[] = |
| { |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* space - ' */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ( - / */ |
| 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 */ |
| 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 8 - ? */ |
| 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* @ - G */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H - O */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* P - W */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* X - _ */ |
| 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* ` - g */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h - o */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* p - w */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* x -127 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */ |
| |
| #else |
| |
| /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */ |
| |
| static const uint8_t xdigitab[] = |
| { |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 0 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 10 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 32- 39 20 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40- 47 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 48- 55 30 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 56- 63 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - 71 40 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 72- | */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* & - 87 50 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 88- 95 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - -103 60 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ? */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- " */ |
| 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g 80 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h -143 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p 90 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* q -159 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x A0 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* y -175 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ^ -183 B0 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */ |
| 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* { - G C0 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H -207 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* } - P D0 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Q -223 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* \ - X E0 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Y -239 */ |
| 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 F0 */ |
| 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/* 8 -255 */ |
| #endif /* EBCDIC */ |
| |
| |
| /* Table for handling alphanumeric escaped characters. Positive returns are |
| simple data values; negative values are for special things like \d and so on. |
| Zero means further processing is needed (for things like \x), or the escape is |
| invalid. */ |
| |
| /* This is the "normal" table for ASCII systems or for EBCDIC systems running |
| in UTF-8 mode. It runs from '0' to 'z'. */ |
| |
| #ifndef EBCDIC |
| #define ESCAPES_FIRST CHAR_0 |
| #define ESCAPES_LAST CHAR_z |
| #define UPPER_CASE(c) (c-32) |
| |
| /* One entry per character from ESCAPES_FIRST ('0') to ESCAPES_LAST ('z'), two |
| entries per row; the comment on each row names the characters it covers. The |
| order is significant and must not be changed. */ |
| |
| static const short int escapes[] = { |
| 0, 0, /* '0' '1' */ |
| 0, 0, /* '2' '3' */ |
| 0, 0, /* '4' '5' */ |
| 0, 0, /* '6' '7' */ |
| 0, 0, /* '8' '9' */ |
| CHAR_COLON, CHAR_SEMICOLON, /* : ; */ |
| CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* < = */ |
| CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* > ? */ |
| CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */ |
| -ESC_B, -ESC_C, /* B C */ |
| -ESC_D, -ESC_E, /* D E */ |
| 0, -ESC_G, /* F G */ |
| -ESC_H, 0, /* H I */ |
| 0, -ESC_K, /* J K */ |
| 0, 0, /* L M */ |
| -ESC_N, 0, /* N O */ |
| -ESC_P, -ESC_Q, /* P Q */ |
| -ESC_R, -ESC_S, /* R S */ |
| 0, 0, /* T U */ |
| -ESC_V, -ESC_W, /* V W */ |
| -ESC_X, 0, /* X Y */ |
| -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */ |
| CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash ] */ |
| CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* ^ _ */ |
| CHAR_GRAVE_ACCENT, ESC_a, /* ` a */ |
| -ESC_b, 0, /* b c */ |
| -ESC_d, ESC_e, /* d e */ |
| ESC_f, 0, /* f g */ |
| -ESC_h, 0, /* h i */ |
| 0, -ESC_k, /* j k */ |
| 0, 0, /* l m */ |
| ESC_n, 0, /* n o */ |
| -ESC_p, 0, /* p q */ |
| ESC_r, -ESC_s, /* r s */ |
| ESC_tee, 0, /* t u */ |
| -ESC_v, -ESC_w, /* v w */ |
| 0, 0, /* x y */ |
| -ESC_z /* z */ |
| }; |
| |
| #else |
| |
| /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. |
| It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code |
| is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a |
| because it is defined as 'a', which of course picks up the ASCII value. */ |
| |
| #if 'a' == 0x81 /* Check for a real EBCDIC environment */ |
| #define ESCAPES_FIRST CHAR_a |
| #define ESCAPES_LAST CHAR_9 |
| #define UPPER_CASE(c) (c+64) |
| #else /* Testing in an ASCII environment */ |
| #define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */ |
| #define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */ |
| #define UPPER_CASE(c) (c-32) |
| #endif |
| |
| static const short int escapes[] = { |
| /* 80 */ ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, |
| /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0, |
| /* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p, |
| /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, |
| /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0, |
| /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, |
| /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, |
| /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', |
| /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, |
| /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0, |
| /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P, |
| /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, |
| /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X, |
| /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, |
| /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, |
| /* F8 */ 0, 0 |
| }; |
| |
| /* We also need a table of characters that may follow \c in an EBCDIC |
| environment for characters 0-31. */ |
| |
| static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"; |
| |
| #endif /* EBCDIC */ |
| |
| |
| /* Table of special "verbs" like (*PRUNE). This is a short table, so it is |
| searched linearly. Put all the names into a single string, in order to reduce |
| the number of relocations when a shared library is dynamically linked. The |
| string is built from string macros so that it works in UTF-8 mode on EBCDIC |
| platforms. */ |
| |
| /* One entry of the verb table. An op or op_arg value of -1 marks the form |
| (without or with an argument respectively) that is not available. */ |
| |
| typedef struct verbitem { |
| int len; /* Length of verb name */ |
| int op; /* Op when no arg, or -1 if arg mandatory */ |
| int op_arg; /* Op when arg present, or -1 if not allowed */ |
| } verbitem; |
| |
| /* The verb names, each terminated by a zero, concatenated into one string. The |
| order here is the same as the order of the entries in verbs[] below, whose len |
| fields give the length of each name. */ |
| |
| static const char verbnames[] = |
| "\0" /* Empty name is a shorthand for MARK */ |
| STRING_MARK0 |
| STRING_ACCEPT0 |
| STRING_COMMIT0 |
| STRING_F0 |
| STRING_FAIL0 |
| STRING_PRUNE0 |
| STRING_SKIP0 |
| STRING_THEN; |
| |
| static const verbitem verbs[] = { |
| { 0, -1, OP_MARK }, /* empty name: argument mandatory */ |
| { 4, -1, OP_MARK }, /* MARK: argument mandatory */ |
| { 6, OP_ACCEPT, -1 }, /* ACCEPT: no argument allowed */ |
| { 6, OP_COMMIT, -1 }, /* COMMIT: no argument allowed */ |
| { 1, OP_FAIL, -1 }, /* F: no argument allowed */ |
| { 4, OP_FAIL, -1 }, /* FAIL: no argument allowed */ |
| { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE: argument optional */ |
| { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP: argument optional */ |
| { 4, OP_THEN, OP_THEN_ARG } /* THEN: argument optional */ |
| }; |
| |
| /* Number of entries in verbs[]. */ |
| |
| static const int verbcount = sizeof(verbs)/sizeof(verbitem); |
| |
| |
| /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in |
| another regex library. */ |
| |
| /* Zero-terminated pattern text \b(?=\w) - a word boundary that is followed by |
| a word character. */ |
| |
| static const PCRE2_UCHAR sub_start_of_word[] = { |
| CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK, |
| CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' }; |
| |
| /* Zero-terminated pattern text \b(?<=\w) - a word boundary that is preceded by |
| a word character. */ |
| |
| static const PCRE2_UCHAR sub_end_of_word[] = { |
| CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK, |
| CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, |
| CHAR_RIGHT_PARENTHESIS, '\0' }; |
| |
| |
| /* Tables of names of POSIX character classes and their lengths. The names are |
| now all in a single string, to reduce the number of relocations when a shared |
| library is dynamically loaded. The list of lengths is terminated by a zero |
| length entry. The first three must be alpha, lower, upper, as this is assumed |
| for handling case independence. The indices for graph, print, and punct are |
| needed, so identify them. */ |
| |
| /* Class names in index order: 0 alpha, 1 lower, 2 upper, 3 alnum, 4 ascii, |
| 5 blank, 6 cntrl, 7 digit, 8 graph, 9 print, 10 punct, 11 space, 12 word, |
| 13 xdigit. */ |
| |
| static const char posix_names[] = |
| STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0 |
| STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0 |
| STRING_graph0 STRING_print0 STRING_punct0 STRING_space0 |
| STRING_word0 STRING_xdigit; |
| |
| /* Length of each name above, in the same order ("word" is 4, "xdigit" is 6, |
| all the others are 5), followed by the terminating zero entry. */ |
| |
| static const uint8_t posix_name_lengths[] = { |
| 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; |
| |
| /* Positions of graph, print and punct in the lists above. */ |
| |
| #define PC_GRAPH 8 |
| #define PC_PRINT 9 |
| #define PC_PUNCT 10 |
| |
| |
| /* Table of class bit maps for each POSIX class. Each class is formed from a |
| base map, with an optional addition or removal of another map. Then, for some |
| classes, there is some additional tweaking: for [:blank:] the vertical space |
| characters are removed, and for [:alpha:] and [:alnum:] the underscore |
| character is removed. The triples in the table consist of the base map offset, |
| second map offset or -1 if no second map, and a non-negative value for map |
| addition or a negative value for map subtraction (if there are two maps). The |
| absolute value of the third field has these meanings: 0 => no tweaking, 1 => |
| remove vertical space characters, 2 => remove underscore. */ |
| |
| /* Three values per class, in the same order as posix_names: base map offset, |
| second map offset (or -1), and the add/subtract-plus-tweak code. */ |
| |
| static const int posix_class_maps[] = { |
| cbit_word, cbit_digit, -2, /* alpha - word minus digit, without underscore */ |
| cbit_lower, -1, 0, /* lower */ |
| cbit_upper, -1, 0, /* upper */ |
| cbit_word, -1, 2, /* alnum - word without underscore */ |
| cbit_print, cbit_cntrl, 0, /* ascii - print plus cntrl */ |
| cbit_space, -1, 1, /* blank - a GNU extension; space without vertical space */ |
| cbit_cntrl, -1, 0, /* cntrl */ |
| cbit_digit, -1, 0, /* digit */ |
| cbit_graph, -1, 0, /* graph */ |
| cbit_print, -1, 0, /* print */ |
| cbit_punct, -1, 0, /* punct */ |
| cbit_space, -1, 0, /* space */ |
| cbit_word, -1, 0, /* word - a Perl extension */ |
| cbit_xdigit,-1, 0 /* xdigit */ |
| }; |
| |
| /* Table of substitutes for \d etc when PCRE2_UCP is set. They are replaced by |
| Unicode property escapes. */ |
| |
| #ifdef SUPPORT_UNICODE |
| /* Each array below is a zero-terminated piece of pattern text; the text is |
| shown in the comment that precedes it. */ |
| |
| /* \P{Nd} */ |
| static const PCRE2_UCHAR string_PNd[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \p{Nd} */ |
| static const PCRE2_UCHAR string_pNd[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \P{Xsp} */ |
| static const PCRE2_UCHAR string_PXsp[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \p{Xsp} */ |
| static const PCRE2_UCHAR string_pXsp[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \P{Xwd} */ |
| static const PCRE2_UCHAR string_PXwd[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \p{Xwd} */ |
| static const PCRE2_UCHAR string_pXwd[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| |
| /* Replacement text for each escape, in the order \D \d \S \s \W \w: each |
| upper case (negated) escape is immediately followed by its lower case |
| counterpart. */ |
| |
| static PCRE2_SPTR substitutes[] = { |
| string_PNd, /* \D */ |
| string_pNd, /* \d */ |
| string_PXsp, /* \S */ /* Xsp is Perl space, but from 8.34, Perl */ |
| string_pXsp, /* \s */ /* space and POSIX space are the same. */ |
| string_PXwd, /* \W */ |
| string_pXwd /* \w */ |
| }; |
| |
| /* The POSIX class substitutes must be in the order of the POSIX class names, |
| defined above, and there are both positive and negative cases. NULL means no |
| general substitute of a Unicode property escape (\p or \P). However, for some |
| POSIX classes (e.g. graph, print, punct) a special property code is compiled |
| directly. */ |
| |
| /* Each array below is a zero-terminated piece of pattern text; the text is |
| shown in the comment that precedes it. The lower case \p forms come first, |
| followed by the corresponding upper case (negated) \P forms. */ |
| |
| /* \p{Cc} */ |
| static const PCRE2_UCHAR string_pCc[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \p{L} */ |
| static const PCRE2_UCHAR string_pL[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \p{Ll} */ |
| static const PCRE2_UCHAR string_pLl[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \p{Lu} */ |
| static const PCRE2_UCHAR string_pLu[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \p{Xan} */ |
| static const PCRE2_UCHAR string_pXan[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \h */ |
| static const PCRE2_UCHAR string_h[] = { |
| CHAR_BACKSLASH, CHAR_h, '\0' }; |
| /* \p{Xps} */ |
| static const PCRE2_UCHAR string_pXps[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \P{Cc} */ |
| static const PCRE2_UCHAR string_PCc[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \P{L} */ |
| static const PCRE2_UCHAR string_PL[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \P{Ll} */ |
| static const PCRE2_UCHAR string_PLl[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \P{Lu} */ |
| static const PCRE2_UCHAR string_PLu[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \P{Xan} */ |
| static const PCRE2_UCHAR string_PXan[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| /* \H */ |
| static const PCRE2_UCHAR string_H[] = { |
| CHAR_BACKSLASH, CHAR_H, '\0' }; |
| /* \P{Xps} */ |
| static const PCRE2_UCHAR string_PXps[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| |
| /* The first 14 entries are the positive classes in posix_names order; the |
| second 14 are the negated classes in the same order. The digit and word entries |
| reuse the Nd and Xwd strings that are defined for the \d and \w substitutes. */ |
| |
| static PCRE2_SPTR posix_substitutes[] = { |
| string_pL, /* alpha */ |
| string_pLl, /* lower */ |
| string_pLu, /* upper */ |
| string_pXan, /* alnum */ |
| NULL, /* ascii */ |
| string_h, /* blank */ |
| string_pCc, /* cntrl */ |
| string_pNd, /* digit */ |
| NULL, /* graph */ |
| NULL, /* print */ |
| NULL, /* punct */ |
| string_pXps, /* space */ /* Xps is POSIX space, but from 8.34 */ |
| string_pXwd, /* word */ /* Perl and POSIX space are the same */ |
| NULL, /* xdigit */ |
| /* Negated cases */ |
| string_PL, /* ^alpha */ |
| string_PLl, /* ^lower */ |
| string_PLu, /* ^upper */ |
| string_PXan, /* ^alnum */ |
| NULL, /* ^ascii */ |
| string_H, /* ^blank */ |
| string_PCc, /* ^cntrl */ |
| string_PNd, /* ^digit */ |
| NULL, /* ^graph */ |
| NULL, /* ^print */ |
| NULL, /* ^punct */ |
| string_PXps, /* ^space */ /* Xps is POSIX space, but from 8.34 */ |
| string_PXwd, /* ^word */ /* Perl and POSIX space are the same */ |
| NULL /* ^xdigit */ |
| }; |
| |
| /* Number of entries (positive plus negated) in posix_substitutes. */ |
| |
| #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(PCRE2_UCHAR *)) |
| #endif /* SUPPORT_UNICODE */ |
| |
| /* Masks for checking option settings. */ |
| |
| #define PUBLIC_COMPILE_OPTIONS \ |
| (PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \ |
| PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \ |
| PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE| \ |
| PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \ |
| PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \ |
| PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \ |
| PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_USE_OFFSET_LIMIT| \ |
| PCRE2_UTF) |
| |
| /* Compile time error code numbers. They are given names so that they can more |
| easily be tracked. When a new number is added, the tables called eint1 and |
| eint2 in pcre2posix.c may need to be updated, and a new error text must be |
| added to compile_error_texts in pcre2_error.c. */ |
| |
| enum { ERR0 = COMPILE_ERROR_BASE, |
| ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10, |
| ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20, |
| ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30, |
| ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40, |
| ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, |
| ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, |
| ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, |
| ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80, |
| ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88 }; |
| |
| /* Error codes that correspond to negative error codes returned by |
| find_fixedlength(). */ |
| |
| /* Judging by the entry for -1 at index 1, the table is indexed by the negated |
| return value, so index n holds the compile error for a return of -n. The |
| lookup itself is not in this part of the file - confirm against the caller. */ |
| |
| static int fixed_length_errors[] = |
| { |
| ERR0, /* [0] Not an error */ |
| ERR0, /* [1] Not an error; -1 is used for "process later" */ |
| ERR25, /* [2] Lookbehind is not fixed length */ |
| ERR36, /* [3] \C in lookbehind is not allowed */ |
| ERR87, /* [4] Lookbehind is too long */ |
| ERR86, /* [5] Pattern too complicated */ |
| ERR70 /* [6] Internal error: unknown opcode encountered */ |
| }; |
| |
| /* This is a table of start-of-pattern options such as (*UTF) and settings such |
| as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward |
| compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is |
| generic and always supported. */ |
| |
| /* Kinds of start-of-pattern item; stored in the type field of a pso entry. */ |
| |
| enum { PSO_OPT, /* Value is an option bit */ |
| PSO_FLG, /* Value is a flag bit */ |
| PSO_NL, /* Value is a newline type */ |
| PSO_BSR, /* Value is a \R type */ |
| PSO_LIMM, /* Read integer value for match limit */ |
| PSO_LIMR }; /* Read integer value for recursion limit */ |
| |
| /* One start-of-pattern item. */ |
| |
| typedef struct pso { |
| const uint8_t *name; /* Item name, held as a string of bytes */ |
| uint16_t length; /* Length of name */ |
| uint16_t type; /* One of the PSO_ values above */ |
| uint32_t value; /* Bit or type to set; 0 for the limit items */ |
| } pso; |
| |
| /* NB: STRING_UTFn_RIGHTPAR contains the length as well, which is why the first |
| entry appears to have only three initializers. The limit entries end with "_EQ" |
| rather than "_RIGHTPAR"; presumably the numerical value follows the name in the |
| pattern (the code that reads it is not in this part of the file). */ |
| |
| static pso pso_list[] = { |
| { (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF }, |
| { (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF }, |
| { (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP }, |
| { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET }, |
| { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET }, |
| { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS }, |
| { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR }, |
| { (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT }, |
| { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE }, |
| { (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 }, |
| { (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMR, 0 }, |
| { (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR }, |
| { (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF }, |
| { (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF }, |
| { (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY }, |
| { (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF }, |
| { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF }, |
| { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE } |
| }; |
| |
| /* This table is used when converting repeating opcodes into possessified |
| versions as a result of an explicit possessive quantifier such as ++. A zero |
| value means there is no possessified version - in those cases the item in |
| question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT |
| because all relevant opcodes are less than that. */ |
| |
| static const uint8_t opcode_possessify[] = { |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */ |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */ |
| |
| 0, /* NOTI */ |
| OP_POSSTAR, 0, /* STAR, MINSTAR */ |
| OP_POSPLUS, 0, /* PLUS, MINPLUS */ |
| OP_POSQUERY, 0, /* QUERY, MINQUERY */ |
| OP_POSUPTO, 0, /* UPTO, MINUPTO */ |
| 0, /* EXACT */ |
| 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */ |
| |
| OP_POSSTARI, 0, /* STARI, MINSTARI */ |
| OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */ |
| OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */ |
| OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */ |
| 0, /* EXACTI */ |
| 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */ |
| |
| OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */ |
| OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */ |
| OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */ |
| OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */ |
| 0, /* NOTEXACT */ |
| 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */ |
| |
| OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */ |
| OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */ |
| OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */ |
| OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */ |
| 0, /* NOTEXACTI */ |
| 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */ |
| |
| OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */ |
| OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */ |
| OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */ |
| OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */ |
| 0, /* TYPEEXACT */ |
| 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */ |
| |
| OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */ |
| OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */ |
| OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */ |
| OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */ |
| 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */ |
| |
| 0, 0, 0, /* CLASS, NCLASS, XCLASS */ |
| 0, 0, /* REF, REFI */ |
| 0, 0, /* DNREF, DNREFI */ |
| 0, 0 /* RECURSE, CALLOUT */ |
| }; |
| |
| |
| |
| /************************************************* |
| * Free compiled code * |
| *************************************************/ |
| |
| /* Release a compiled pattern, together with its JIT data and, where the |
| pattern is flagged as owning a reference to them, its character tables. |
| |
| Argument: code points to the compiled pattern, or is NULL (nothing is done) |
| Returns: nothing |
| */ |
| |
| PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION |
| pcre2_code_free(pcre2_code *code) |
| { |
| if (code == NULL) return; |
| |
| /* Any JIT data is handed back first, using the pattern's own memory control. */ |
| |
| if (code->executable_jit != NULL) |
|   PRIV(jit_free)(code->executable_jit, &code->memctl); |
| |
| /* Decoded tables belong to the codes after deserialization, and are freed only |
| when no references to them remain. The reference count is at |
| code->tables + tables_length and is expected to be greater than zero; if it is |
| already zero, it is left alone and the tables are not freed. */ |
| |
| if ((code->flags & PCRE2_DEREF_TABLES) != 0) |
|   { |
|   PCRE2_SIZE *table_refs = (PCRE2_SIZE *)(code->tables + tables_length); |
|   if (*table_refs > 0 && --(*table_refs) == 0) |
|     code->memctl.free((void *)code->tables, code->memctl.memory_data); |
|   } |
| |
| /* The pattern block itself goes last, because the free function and its data |
| are read from it. */ |
| |
| code->memctl.free(code, code->memctl.memory_data); |
| } |
| |
| |
| |
| /************************************************* |
| * Insert an automatic callout point * |
| *************************************************/ |
| |
| /* This function is called when the PCRE2_AUTO_CALLOUT option is set, to insert |
| callout points before each pattern item. |
| |
| Arguments: |
| code current code pointer |
| ptr current pattern pointer |
| cb general compile-time data |
| |
| Returns: new code pointer |
| */ |
| |
| static PCRE2_UCHAR * |
| auto_callout(PCRE2_UCHAR *code, PCRE2_SPTR ptr, compile_block *cb) |
| { |
| code[0] = OP_CALLOUT; |
| PUT(code, 1, ptr - cb->start_pattern); /* Pattern offset */ |
| PUT(code, 1 + LINK_SIZE, 0); /* Default length */ |
| code[1 + 2*LINK_SIZE] = 255; |
| return code + PRIV(OP_lengths)[OP_CALLOUT]; |
| } |
| |
| |
| |
| /************************************************* |
| * Complete a callout item * |
| *************************************************/ |
| |
| /* A callout item contains the length of the next item in the pattern, which |
| we can't fill in till after we have reached the relevant point. This is used |
| for both automatic and manual callouts. |
| |
| Arguments: |
| previous_callout points to previous callout item |
| ptr current pattern pointer |
| cb general compile-time data |
| |
| Returns: nothing |
| */ |
| |
| static void |
| complete_callout(PCRE2_UCHAR *previous_callout, PCRE2_SPTR ptr, |
| compile_block *cb) |
| { |
| size_t length = ptr - cb->start_pattern - GET(previous_callout, 1); |
| PUT(previous_callout, 1 + LINK_SIZE, length); |
| } |
| |
| |
| |
| /************************************************* |
| * Find the fixed length of a branch * |
| *************************************************/ |
| |
| /* Scan a branch and compute the fixed length of subject that will match it, if |
| the length is fixed. This is needed for dealing with lookbehind assertions. In |
| UTF mode, the result is in code units rather than bytes. The branch is |
| temporarily terminated with OP_END when this function is called. |
| |
| This function is called when a lookbehind assertion is encountered, so that if |
| it fails, the error message can point to the correct place in the pattern. |
| However, we cannot do this when the assertion contains subroutine calls, |
| because they can be forward references. We solve this by remembering this case |
| and doing the check at the end; a flag specifies which mode we are running in. |
| |
| Lookbehind lengths are held in 16-bit fields and the maximum value is defined |
| as LOOKBEHIND_MAX. |
| |
| The function calls itself for nested groups and for subroutine calls. For a |
| capturing group the outcome (fixed length, or "not fixed") is stored in |
| cb->groupinfo[] so that later calls can reuse it. |
| |
| Arguments: |
| code points to the start of the pattern (the bracket) |
| utf TRUE in UTF mode |
| atend TRUE if called when the pattern is complete |
| cb the "compile data" structure |
| recurses chain of recurse_check to catch mutual recursion |
| countptr pointer to counter, to catch over-complexity |
| |
| Returns: if non-negative, the fixed length, |
| or -1 (FFL_LATER) if an OP_RECURSE item was encountered and atend is FALSE |
| or -2 (FFL_NOTFIXED) if there is no fixed length, |
| or -3 (FFL_BACKSLASHC) if \C was encountered (in UTF-8 mode only) |
| or -4 (FFL_TOOLONG) if the length is too long |
| or -5 (FFL_TOOCOMPLICATED) if the call counter exceeded its limit |
| or -6 (FFL_UNKNOWNOP) if an unknown opcode was encountered (internal error) |
| */ |
| |
| #define FFL_LATER (-1) /* OP_RECURSE found while atend is FALSE */ |
| #define FFL_NOTFIXED (-2) /* No fixed length */ |
| #define FFL_BACKSLASHC (-3) /* OP_ANYBYTE found */ |
| #define FFL_TOOLONG (-4) /* Branch length exceeds LOOKBEHIND_MAX */ |
| #define FFL_TOOCOMPLICATED (-5) /* Call counter exceeded its limit */ |
| #define FFL_UNKNOWNOP (-6) /* Opcode not handled by the switch */ |
| |
| static int |
| find_fixedlength(PCRE2_UCHAR *code, BOOL utf, BOOL atend, compile_block *cb, |
| recurse_check *recurses, int *countptr) |
| { |
| int length = -1; /* Length of first finished branch; -1 if none yet */ |
| uint32_t group = 0; /* Group number; stays 0 unless a capturing group */ |
| uint32_t groupinfo = 0; /* Copy of cb->groupinfo[group] */ |
| recurse_check this_recurse; /* Chain link used when following OP_RECURSE */ |
| register int branchlength = 0; /* Length of the branch being scanned */ |
| register PCRE2_UCHAR *cc = code + 1 + LINK_SIZE; /* Skip opcode and link */ |
| |
| /* If this is a capturing group, we may have the answer cached, but we can only |
| use this information if there are no (?| groups in the pattern, because |
| otherwise group numbers are not unique. Note that the group number is read, |
| and cc is advanced past it, whether or not the cache can be consulted. */ |
| |
| if (*code == OP_CBRA || *code == OP_CBRAPOS || *code == OP_SCBRA || |
| *code == OP_SCBRAPOS) |
| { |
| group = GET2(cc, 0); |
| cc += IMM2_SIZE; |
| groupinfo = cb->groupinfo[group]; |
| if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0) |
| { |
| if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return FFL_NOTFIXED; |
| if ((groupinfo & GI_SET_FIXED_LENGTH) != 0) |
| return groupinfo & GI_FIXED_LENGTH_MASK; |
| } |
| } |
| |
| /* A large and/or complex regex can take too long to process. This can happen |
| more often when (?| groups are present in the pattern. The counter is shared |
| by all nested calls through countptr. */ |
| |
| if ((*countptr)++ > 2000) return FFL_TOOCOMPLICATED; |
| |
| /* Scan along the opcodes for this branch. If we get to the end of the |
| branch, check the length against that of the other branches. */ |
| |
| for (;;) |
| { |
| int d; |
| PCRE2_UCHAR *ce, *cs; |
| register PCRE2_UCHAR op = *cc; |
| |
| if (branchlength > LOOKBEHIND_MAX) return FFL_TOOLONG; /* Tested before each item */ |
| |
| switch (op) |
| { |
| /* We only need to continue for OP_CBRA (normal capturing bracket) and |
| OP_BRA (normal non-capturing bracket) because the other variants of these |
| opcodes are all concerned with unlimited repeated groups, which of course |
| are not of fixed length. */ |
| |
| case OP_CBRA: |
| case OP_BRA: |
| case OP_ONCE: |
| case OP_ONCE_NC: |
| case OP_COND: |
| d = find_fixedlength(cc, utf, atend, cb, recurses, countptr); |
| if (d < 0) return d; /* Pass any failure code straight up */ |
| branchlength += d; |
| do cc += GET(cc, 1); while (*cc == OP_ALT); /* Move to the end of the group */ |
| cc += 1 + LINK_SIZE; /* Step past the group's terminating item */ |
| break; |
| |
| /* Reached end of a branch; if it's a ket it is the end of a nested call. |
| If it's ALT it is an alternation in a nested call. An ACCEPT is effectively |
| an ALT. If it is END it's the end of the outer call. All can be handled by |
| the same code. Note that we must not include the OP_KETRxxx opcodes here, |
| because they all imply an unlimited repeat. */ |
| |
| case OP_ALT: |
| case OP_KET: |
| case OP_END: |
| case OP_ACCEPT: |
| case OP_ASSERT_ACCEPT: |
| if (length < 0) length = branchlength; /* First branch sets the reference */ |
| else if (length != branchlength) goto ISNOTFIXED; /* Branches differ */ |
| if (*cc != OP_ALT) /* No further alternative: finished */ |
| { |
| if (group > 0) |
| { |
| groupinfo |= (GI_SET_FIXED_LENGTH | length); /* Cache the result */ |
| cb->groupinfo[group] = groupinfo; |
| } |
| return length; |
| } |
| cc += 1 + LINK_SIZE; /* Start scanning the next alternative */ |
| branchlength = 0; |
| break; |
| |
| /* A true recursion implies not fixed length, but a subroutine call may |
| be OK. If the subroutine is a forward reference, we can't deal with |
| it until the end of the pattern, so return FFL_LATER. */ |
| |
| case OP_RECURSE: |
| if (!atend) return FFL_LATER; |
| cs = ce = (PCRE2_UCHAR *)cb->start_code + GET(cc, 1); /* Start subpattern */ |
| do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */ |
| if (cc > cs && cc < ce) goto ISNOTFIXED; /* Recursion */ |
| else /* Check for mutual recursion */ |
| { |
| recurse_check *r = recurses; |
| for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; |
| if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */ |
| } |
| this_recurse.prev = recurses; /* Add the called group to the chain */ |
| this_recurse.group = cs; |
| d = find_fixedlength(cs, utf, atend, cb, &this_recurse, countptr); |
| if (d < 0) return d; |
| branchlength += d; |
| cc += 1 + LINK_SIZE; |
| break; |
| |
| /* Skip over assertive subpatterns. Note that we must increment cc by |
| 1 + LINK_SIZE at the end, not by OP_length[*cc] because in a recursive |
| situation this assertion may be the one that is ultimately being checked |
| for having a fixed length, in which case its terminating OP_KET will have |
| been temporarily replaced by OP_END. */ |
| |
| case OP_ASSERT: |
| case OP_ASSERT_NOT: |
| case OP_ASSERTBACK: |
| case OP_ASSERTBACK_NOT: |
| do cc += GET(cc, 1); while (*cc == OP_ALT); |
| cc += 1 + LINK_SIZE; |
| break; |
| |
| /* Skip over things that don't match chars. For these four the value in |
| cc[1] is added to the fixed item length. */ |
| |
| case OP_MARK: |
| case OP_PRUNE_ARG: |
| case OP_SKIP_ARG: |
| case OP_THEN_ARG: |
| cc += cc[1] + PRIV(OP_lengths)[*cc]; |
| break; |
| |
| case OP_CALLOUT: |
| case OP_CIRC: |
| case OP_CIRCM: |
| case OP_CLOSE: |
| case OP_COMMIT: |
| case OP_CREF: |
| case OP_FALSE: |
| case OP_TRUE: |
| case OP_DNCREF: |
| case OP_DNRREF: |
| case OP_DOLL: |
| case OP_DOLLM: |
| case OP_EOD: |
| case OP_EODN: |
| case OP_FAIL: |
| case OP_NOT_WORD_BOUNDARY: |
| case OP_PRUNE: |
| case OP_REVERSE: |
| case OP_RREF: |
| case OP_SET_SOM: |
| case OP_SKIP: |
| case OP_SOD: |
| case OP_SOM: |
| case OP_THEN: |
| case OP_WORD_BOUNDARY: |
| cc += PRIV(OP_lengths)[*cc]; |
| break; |
| |
| case OP_CALLOUT_STR: |
| cc += GET(cc, 1 + 2*LINK_SIZE); /* Advance by the length held in the item */ |
| break; |
| |
| /* Handle literal characters */ |
| |
| case OP_CHAR: |
| case OP_CHARI: |
| case OP_NOT: |
| case OP_NOTI: |
| branchlength++; |
| cc += 2; |
| #ifdef SUPPORT_UNICODE |
| if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); |
| #endif |
| break; |
| |
| /* Handle exact repetitions. The count is already in characters, but we |
| need to skip over a multibyte character in UTF8 mode. */ |
| |
| case OP_EXACT: |
| case OP_EXACTI: |
| case OP_NOTEXACT: |
| case OP_NOTEXACTI: |
| branchlength += (int)GET2(cc,1); |
| cc += 2 + IMM2_SIZE; |
| #ifdef SUPPORT_UNICODE |
| if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); |
| #endif |
| break; |
| |
| case OP_TYPEEXACT: |
| branchlength += GET2(cc,1); |
| if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) |
| cc += 2; /* Two extra units follow OP_PROP and OP_NOTPROP */ |
| cc += 1 + IMM2_SIZE + 1; |
| break; |
| |
| /* Handle single-char matchers */ |
| |
| case OP_PROP: |
| case OP_NOTPROP: |
| cc += 2; |
| /* Fall through */ |
| |
| case OP_HSPACE: |
| case OP_VSPACE: |
| case OP_NOT_HSPACE: |
| case OP_NOT_VSPACE: |
| case OP_NOT_DIGIT: |
| case OP_DIGIT: |
| case OP_NOT_WHITESPACE: |
| case OP_WHITESPACE: |
| case OP_NOT_WORDCHAR: |
| case OP_WORDCHAR: |
| case OP_ANY: |
| case OP_ALLANY: |
| branchlength++; |
| cc++; |
| break; |
| |
| /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode; |
| otherwise \C is coded as OP_ALLANY. */ |
| |
| case OP_ANYBYTE: |
| return FFL_BACKSLASHC; |
| |
| /* Check a class for variable quantification */ |
| |
| case OP_CLASS: |
| case OP_NCLASS: |
| #ifdef SUPPORT_WIDE_CHARS |
| case OP_XCLASS: |
| /* The original code caused an unsigned overflow in 64 bit systems, |
| so now we use a conditional statement. */ |
| if (op == OP_XCLASS) |
| cc += GET(cc, 1); |
| else |
| cc += PRIV(OP_lengths)[OP_CLASS]; |
| #else |
| cc += PRIV(OP_lengths)[OP_CLASS]; |
| #endif |
| |
| switch (*cc) |
| { |
| case OP_CRSTAR: |
| case OP_CRMINSTAR: |
| case OP_CRPLUS: |
| case OP_CRMINPLUS: |
| case OP_CRQUERY: |
| case OP_CRMINQUERY: |
| case OP_CRPOSSTAR: |
| case OP_CRPOSPLUS: |
| case OP_CRPOSQUERY: |
| goto ISNOTFIXED; |
| |
| case OP_CRRANGE: |
| case OP_CRMINRANGE: |
| case OP_CRPOSRANGE: |
| if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) goto ISNOTFIXED; /* The two counts differ */ |
| branchlength += (int)GET2(cc,1); |
| cc += 1 + 2 * IMM2_SIZE; |
| break; |
| |
| default: |
| branchlength++; /* No repeat follows: the class counts as one */ |
| } |
| break; |
| |
| /* Anything else is variable length */ |
| |
| case OP_ANYNL: |
| case OP_BRAMINZERO: |
| case OP_BRAPOS: |
| case OP_BRAPOSZERO: |
| case OP_BRAZERO: |
| case OP_CBRAPOS: |
| case OP_EXTUNI: |
| case OP_KETRMAX: |
| case OP_KETRMIN: |
| case OP_KETRPOS: |
| case OP_MINPLUS: |
| case OP_MINPLUSI: |
| case OP_MINQUERY: |
| case OP_MINQUERYI: |
| case OP_MINSTAR: |
| case OP_MINSTARI: |
| case OP_MINUPTO: |
| case OP_MINUPTOI: |
| case OP_NOTMINPLUS: |
| case OP_NOTMINPLUSI: |
| case OP_NOTMINQUERY: |
| case OP_NOTMINQUERYI: |
| case OP_NOTMINSTAR: |
| case OP_NOTMINSTARI: |
| case OP_NOTMINUPTO: |
| case OP_NOTMINUPTOI: |
| case OP_NOTPLUS: |
| case OP_NOTPLUSI: |
| case OP_NOTPOSPLUS: |
| case OP_NOTPOSPLUSI: |
| case OP_NOTPOSQUERY: |
| case OP_NOTPOSQUERYI: |
| case OP_NOTPOSSTAR: |
| case OP_NOTPOSSTARI: |
| case OP_NOTPOSUPTO: |
| case OP_NOTPOSUPTOI: |
| case OP_NOTQUERY: |
| case OP_NOTQUERYI: |
| case OP_NOTSTAR: |
| case OP_NOTSTARI: |
| case OP_NOTUPTO: |
| case OP_NOTUPTOI: |
| case OP_PLUS: |
| case OP_PLUSI: |
| case OP_POSPLUS: |
| case OP_POSPLUSI: |
| case OP_POSQUERY: |
| case OP_POSQUERYI: |
| case OP_POSSTAR: |
| case OP_POSSTARI: |
| case OP_POSUPTO: |
| case OP_POSUPTOI: |
| case OP_QUERY: |
| case OP_QUERYI: |
| case OP_REF: |
| case OP_REFI: |
| case OP_DNREF: |
| case OP_DNREFI: |
| case OP_SBRA: |
| case OP_SBRAPOS: |
| case OP_SCBRA: |
| case OP_SCBRAPOS: |
| case OP_SCOND: |
| case OP_SKIPZERO: |
| case OP_STAR: |
| case OP_STARI: |
| case OP_TYPEMINPLUS: |
| case OP_TYPEMINQUERY: |
| case OP_TYPEMINSTAR: |
| case OP_TYPEMINUPTO: |
| case OP_TYPEPLUS: |
| case OP_TYPEPOSPLUS: |
| case OP_TYPEPOSQUERY: |
| case OP_TYPEPOSSTAR: |
| case OP_TYPEPOSUPTO: |
| case OP_TYPEQUERY: |
| case OP_TYPESTAR: |
| case OP_TYPEUPTO: |
| case OP_UPTO: |
| case OP_UPTOI: |
| goto ISNOTFIXED; |
| |
| /* Catch unrecognized opcodes so that when new ones are added they |
| are not forgotten, as has happened in the past. */ |
| |
| default: |
| return FFL_UNKNOWNOP; |
| } |
| } |
| /* Control never gets here except by goto. For a capturing group the "not |
| fixed" outcome is recorded in cb->groupinfo[] before returning. */ |
| |
| ISNOTFIXED: |
| if (group > 0) |
| { |
| groupinfo |= GI_NOT_FIXED_LENGTH; |
| cb->groupinfo[group] = groupinfo; |
| } |
| return FFL_NOTFIXED; |
| } |
| |
| |
| |
| /************************************************* |
| * Find first significant op code * |
| *************************************************/ |
| |
| /* This is called by several functions that scan a compiled expression looking |
| for a fixed first character, or an anchoring op code etc. It skips over things |
| that do not influence this. For some calls, it makes sense to skip negative |
| forward and all backward assertions, and also the \b assertion; for others it |
| does not. |
| |
| Arguments: |
| code pointer to the start of the group |
| skipassert TRUE if certain assertions are to be skipped |
| |
| Returns: pointer to the first significant opcode |
| */ |
| |
| static const PCRE2_UCHAR* |
| first_significant_code(PCRE2_SPTR code, BOOL skipassert) |
| { |
| for (;;) |
| { |
| switch ((int)*code) |
| { |
| case OP_ASSERT_NOT: |
| case OP_ASSERTBACK: |
| case OP_ASSERTBACK_NOT: |
| if (!skipassert) return code; |
| do code += GET(code, 1); while (*code == OP_ALT); |
| code += PRIV(OP_lengths)[*code]; |
| break; |
| |
| case OP_WORD_BOUNDARY: |
| case OP_NOT_WORD_BOUNDARY: |
| if (!skipassert) return code; |
| /* Fall through */ |
| |
| case OP_CALLOUT: |
| case OP_CREF: |
| case OP_DNCREF: |
| case OP_RREF: |
| case OP_DNRREF: |
| case OP_FALSE: |
| case OP_TRUE: |
| code += PRIV(OP_lengths)[*code]; |
| break; |
| |
| case OP_CALLOUT_STR: |
| code += GET(code, 1 + 2*LINK_SIZE); |
| break; |
| |
| default: |
| return code; |
| } |
| } |
| /* Control never reaches here */ |
| } |
| |
| |
| |
| /************************************************* |
| * Scan compiled branch for non-emptiness * |
| *************************************************/ |
| |
| /* This function scans through a branch of a compiled pattern to see whether it |
| can match the empty string. It is called at the end of compiling to check the |
| entire pattern, and from compile_branch() when checking for an unlimited repeat |
| of a group that can match nothing. In the latter case it is called only when |
| doing the real compile, not during the pre-compile that measures the size of |
| the compiled pattern. |
| |
| Note that first_significant_code() skips over backward and negative forward |
| assertions when its final argument is TRUE. If we hit an unclosed bracket, we |
| return "empty" - this means we've struck an inner bracket whose current branch |
| will already have been scanned. |
| |
| The function calls itself for nested groups and for subroutine calls. When |
| group numbers are unique and the group is OP_CBRA or OP_CBRAPOS, the answer is |
| stored in cb->groupinfo[] so that later calls can reuse it. |
| |
| Arguments: |
| code points to start of search |
| endcode points to where to stop |
| utf TRUE if in UTF mode |
| cb compile data |
| atend TRUE if being called to check an entire pattern |
| recurses chain of recurse_check to catch mutual recursion |
| countptr pointer to count to catch over-complicated pattern |
| |
| Returns: 0 if what is matched cannot be empty |
| 1 if what is matched could be empty |
| -1 if the pattern is too complicated |
| */ |
| |
| #define CBE_NOTEMPTY 0 /* What is matched cannot be empty */ |
| #define CBE_EMPTY 1 /* What is matched could be empty */ |
| #define CBE_TOOCOMPLICATED (-1) /* Call counter exceeded its limit */ |
| |
| |
| static int |
| could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf, |
| compile_block *cb, BOOL atend, recurse_check *recurses, int *countptr) |
| { |
| uint32_t group = 0; /* Group number for caching; 0 means do not cache */ |
| uint32_t groupinfo = 0; /* Copy of cb->groupinfo[group] */ |
| register PCRE2_UCHAR c; /* Current opcode; also read by the loop increment */ |
| recurse_check this_recurse; /* Chain link used while scanning a called group */ |
| |
| /* If what we are checking has already been set as "could be empty", we know |
| the answer. */ |
| |
| if (*code >= OP_SBRA && *code <= OP_SCOND) return CBE_EMPTY; |
| |
| /* If this is a capturing group, we may have the answer cached, but we can only |
| use this information if there are no (?| groups in the pattern, because |
| otherwise group numbers are not unique. */ |
| |
| if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0 && |
| (*code == OP_CBRA || *code == OP_CBRAPOS)) |
| { |
| group = GET2(code, 1 + LINK_SIZE); |
| groupinfo = cb->groupinfo[group]; |
| if ((groupinfo & GI_SET_COULD_BE_EMPTY) != 0) |
| return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY; |
| } |
| |
| /* A large and/or complex regex can take too long to process. We have to assume |
| it can match an empty string. This can happen more often when (?| groups are |
| present in the pattern and the caching is disabled. Setting the cap at 1100 |
| allows the test for more than 1023 capturing patterns to work. */ |
| |
| if ((*countptr)++ > 1100) return CBE_TOOCOMPLICATED; |
| |
| /* Scan the opcodes for this branch. The loop increment steps over the item |
| whose opcode is in c, using the table length for that opcode; cases below that |
| advance code themselves either add only the extra amount or reset c first. */ |
| |
| for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); |
| code < endcode; |
| code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE)) |
| { |
| PCRE2_SPTR ccode; /* Position of a possible repeat after a class */ |
| |
| c = *code; |
| |
| /* Skip over forward assertions; the other assertions are skipped by |
| first_significant_code() with a TRUE final argument. */ |
| |
| if (c == OP_ASSERT) |
| { |
| do code += GET(code, 1); while (*code == OP_ALT); |
| c = *code; /* The loop increment then steps over this item */ |
| continue; |
| } |
| |
| /* For a recursion/subroutine call we can scan the recursion when this |
| function is called at the end, to check a complete pattern. Before then, |
| recursions just have the group number as their argument and in any case may |
| be forward references. In that situation, we return CBE_EMPTY, just in case. |
| It means that unlimited repeats of groups that contain recursions are always |
| treated as "could be empty" - which just adds a bit more processing time |
| because of the runtime check. */ |
| |
| if (c == OP_RECURSE) |
| { |
| PCRE2_SPTR scode, endgroup; |
| BOOL empty_branch; |
| |
| if (!atend) goto ISTRUE; |
| scode = cb->start_code + GET(code, 1); /* Start of the called group */ |
| endgroup = scode; |
| |
| /* We need to detect whether this is a recursive call, as otherwise there |
| will be an infinite loop. If it is a recursion, just skip over it. Simple |
| recursions are easily detected. For mutual recursions we keep a chain on |
| the stack. */ |
| |
| do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT); |
| if (code >= scode && code <= endgroup) continue; /* Simple recursion */ |
| else |
| { |
| recurse_check *r = recurses; |
| for (r = recurses; r != NULL; r = r->prev) |
| if (r->group == scode) break; |
| if (r != NULL) continue; /* Mutual recursion */ |
| } |
| |
| /* Scan the referenced group, remembering it on the stack chain to detect |
| mutual recursions. Scanning stops at the first branch found to be |
| possibly empty. */ |
| |
| empty_branch = FALSE; |
| this_recurse.prev = recurses; |
| this_recurse.group = scode; |
| |
| do |
| { |
| int rc = could_be_empty_branch(scode, endcode, utf, cb, atend, |
| &this_recurse, countptr); |
| if (rc < 0) return rc; /* Pass "too complicated" straight up */ |
| if (rc > 0) |
| { |
| empty_branch = TRUE; |
| break; |
| } |
| scode += GET(scode, 1); |
| } |
| while (*scode == OP_ALT); |
| |
| if (!empty_branch) goto ISFALSE; /* All branches are non-empty */ |
| continue; |
| } |
| |
| /* Groups with zero repeats can of course be empty; skip them. */ |
| |
| if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO || |
| c == OP_BRAPOSZERO) |
| { |
| code += PRIV(OP_lengths)[c]; |
| do code += GET(code, 1); while (*code == OP_ALT); |
| c = *code; |
| continue; |
| } |
| |
| /* A nested group that is already marked as "could be empty" can just be |
| skipped. */ |
| |
| if (c == OP_SBRA || c == OP_SBRAPOS || |
| c == OP_SCBRA || c == OP_SCBRAPOS) |
| { |
| do code += GET(code, 1); while (*code == OP_ALT); |
| c = *code; |
| continue; |
| } |
| |
| /* For other groups, scan the branches. */ |
| |
| if (c == OP_BRA || c == OP_BRAPOS || |
| c == OP_CBRA || c == OP_CBRAPOS || |
| c == OP_ONCE || c == OP_ONCE_NC || |
| c == OP_COND || c == OP_SCOND) |
| { |
| BOOL empty_branch; |
| if (GET(code, 1) == 0) goto ISTRUE; /* Hit unclosed bracket */ |
| |
| /* If a conditional group has only one branch, there is a second, implied, |
| empty branch, so just skip over the conditional, because it could be empty. |
| Otherwise, scan the individual branches of the group. */ |
| |
| if (c == OP_COND && code[GET(code, 1)] != OP_ALT) |
| code += GET(code, 1); |
| else |
| { |
| empty_branch = FALSE; |
| do |
| { |
| if (!empty_branch) /* Once one is found, just walk the rest */ |
| { |
| int rc = could_be_empty_branch(code, endcode, utf, cb, atend, |
| recurses, countptr); |
| if (rc < 0) return rc; |
| if (rc > 0) empty_branch = TRUE; |
| } |
| code += GET(code, 1); |
| } |
| while (*code == OP_ALT); |
| if (!empty_branch) goto ISFALSE; /* All branches are non-empty */ |
| } |
| |
| c = *code; |
| continue; |
| } |
| |
| /* Handle the other opcodes */ |
| |
| switch (c) |
| { |
| /* Check for quantifiers after a class. XCLASS is used for classes that |
| cannot be represented just by a bit map. This includes negated single |
| high-valued characters. The length in PRIV(OP_lengths)[] is zero; the |
| actual length is stored in the compiled code, so we must update "code" |
| here. */ |
| |
| #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 |
| case OP_XCLASS: |
| ccode = code += GET(code, 1); |
| goto CHECK_CLASS_REPEAT; |
| #endif |
| |
| case OP_CLASS: |
| case OP_NCLASS: |
| ccode = code + PRIV(OP_lengths)[OP_CLASS]; |
| |
| #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 |
| CHECK_CLASS_REPEAT: |
| #endif |
| |
| switch (*ccode) |
| { |
| case OP_CRSTAR: /* These could be empty; continue */ |
| case OP_CRMINSTAR: |
| case OP_CRQUERY: |
| case OP_CRMINQUERY: |
| case OP_CRPOSSTAR: |
| case OP_CRPOSQUERY: |
| break; |
| |
| default: /* Non-repeat => class must match */ |
| case OP_CRPLUS: /* These repeats aren't empty */ |
| case OP_CRMINPLUS: |
| case OP_CRPOSPLUS: |
| goto ISFALSE; |
| |
| case OP_CRRANGE: |
| case OP_CRMINRANGE: |
| case OP_CRPOSRANGE: |
| if (GET2(ccode, 1) > 0) goto ISFALSE; /* Minimum > 0 */ |
| break; |
| } |
| break; |
| |
| /* Opcodes that must match a character */ |
| |
| case OP_ANY: |
| case OP_ALLANY: |
| case OP_ANYBYTE: |
| |
| case OP_PROP: |
| case OP_NOTPROP: |
| case OP_ANYNL: |
| |
| case OP_NOT_HSPACE: |
| case OP_HSPACE: |
| case OP_NOT_VSPACE: |
| case OP_VSPACE: |
| case OP_EXTUNI: |
| |
| case OP_NOT_DIGIT: |
| case OP_DIGIT: |
| case OP_NOT_WHITESPACE: |
| case OP_WHITESPACE: |
| case OP_NOT_WORDCHAR: |
| case OP_WORDCHAR: |
| |
| case OP_CHAR: |
| case OP_CHARI: |
| case OP_NOT: |
| case OP_NOTI: |
| |
| case OP_PLUS: |
| case OP_PLUSI: |
| case OP_MINPLUS: |
| case OP_MINPLUSI: |
| |
| case OP_NOTPLUS: |
| case OP_NOTPLUSI: |
| case OP_NOTMINPLUS: |
| case OP_NOTMINPLUSI: |
| |
| case OP_POSPLUS: |
| case OP_POSPLUSI: |
| case OP_NOTPOSPLUS: |
| case OP_NOTPOSPLUSI: |
| |
| case OP_EXACT: |
| case OP_EXACTI: |
| case OP_NOTEXACT: |
| case OP_NOTEXACTI: |
| |
| case OP_TYPEPLUS: |
| case OP_TYPEMINPLUS: |
| case OP_TYPEPOSPLUS: |
| case OP_TYPEEXACT: |
| goto ISFALSE; |
| |
| /* These are going to continue, as they may be empty, but we have to |
| fudge the length for the \p and \P cases. */ |
| |
| case OP_TYPESTAR: |
| case OP_TYPEMINSTAR: |
| case OP_TYPEPOSSTAR: |
| case OP_TYPEQUERY: |
| case OP_TYPEMINQUERY: |
| case OP_TYPEPOSQUERY: |
| if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
| break; |
| |
| /* Same for these */ |
| |
| case OP_TYPEUPTO: |
| case OP_TYPEMINUPTO: |
| case OP_TYPEPOSUPTO: |
| if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) |
| code += 2; |
| break; |
| |
| /* End of branch */ |
| |
| case OP_KET: |
| case OP_KETRMAX: |
| case OP_KETRMIN: |
| case OP_KETRPOS: |
| case OP_ALT: |
| goto ISTRUE; |
| |
| /* In UTF-8 or UTF-16 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, |
| POSQUERY, UPTO, MINUPTO, and POSUPTO and their caseless and negative |
| versions may be followed by a multibyte character. */ |
| |
| #ifdef MAYBE_UTF_MULTI |
| case OP_STAR: |
| case OP_STARI: |
| case OP_NOTSTAR: |
| case OP_NOTSTARI: |
| |
| case OP_MINSTAR: |
| case OP_MINSTARI: |
| case OP_NOTMINSTAR: |
| case OP_NOTMINSTARI: |
| |
| case OP_POSSTAR: |
| case OP_POSSTARI: |
| case OP_NOTPOSSTAR: |
| case OP_NOTPOSSTARI: |
| |
| case OP_QUERY: |
| case OP_QUERYI: |
| case OP_NOTQUERY: |
| case OP_NOTQUERYI: |
| |
| case OP_MINQUERY: |
| case OP_MINQUERYI: |
| case OP_NOTMINQUERY: |
| case OP_NOTMINQUERYI: |
| |
| case OP_POSQUERY: |
| case OP_POSQUERYI: |
| case OP_NOTPOSQUERY: |
| case OP_NOTPOSQUERYI: |
| if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]); |
| break; |
| |
| case OP_UPTO: |
| case OP_UPTOI: |
| case OP_NOTUPTO: |
| case OP_NOTUPTOI: |
| |
| case OP_MINUPTO: |
| case OP_MINUPTOI: |
| case OP_NOTMINUPTO: |
| case OP_NOTMINUPTOI: |
| |
| case OP_POSUPTO: |
| case OP_POSUPTOI: |
| case OP_NOTPOSUPTO: |
| case OP_NOTPOSUPTOI: |
| if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]); |
| break; |
| #endif /* MAYBE_UTF_MULTI */ |
| |
| /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument |
| string. */ |
| |
| case OP_MARK: |
| case OP_PRUNE_ARG: |
| case OP_SKIP_ARG: |
| case OP_THEN_ARG: |
| code += code[1]; /* The loop increment adds the fixed part */ |
| break; |
| |
| /* None of the remaining opcodes are required to match a character. */ |
| |
| default: |
| break; |
| } |
| } |
| |
| ISTRUE: /* Also reached when the scan runs up to endcode */ |
| groupinfo |= GI_COULD_BE_EMPTY; |
| |
| ISFALSE: /* ISTRUE falls through to here; cache the answer if group is set */ |
| if (group > 0) cb->groupinfo[group] = groupinfo | GI_SET_COULD_BE_EMPTY; |
| |
| return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY; |
| } |
| |
| |
| |
| /************************************************* |
| * Check for counted repeat * |
| *************************************************/ |
| |
| /* This function is called when a '{' is encountered in a place where it might |
| start a quantifier. It looks ahead to see if it really is a quantifier, that |
| is, one of the forms {ddd} {ddd,} or {ddd,ddd} where the ddds are digits. |
| |
| Argument: pointer to the first char after '{' |
| Returns: TRUE or FALSE |
| */ |
| |
| static BOOL |
| is_counted_repeat(PCRE2_SPTR p) |
| { |
| if (!IS_DIGIT(*p)) return FALSE; |
| p++; |
| while (IS_DIGIT(*p)) p++; |
| if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; |
| |
| if (*p++ != CHAR_COMMA) return FALSE; |
| if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; |
| |
| if (!IS_DIGIT(*p)) return FALSE; |
| p++; |
| while (IS_DIGIT(*p)) p++; |
| |
| return (*p == CHAR_RIGHT_CURLY_BRACKET); |
| } |
| |
| |
| |
| /************************************************* |
| * Handle escapes * |
| *************************************************/ |
| |
| /* This function is called when a \ has been encountered. It either returns a |
| positive value for a simple escape such as \d, or 0 for a data character, which |
| is placed in chptr. A backreference to group n is returned as negative n. On |
| entry, ptr is pointing at the \. On exit, it points to the final code unit of |
| the escape sequence. |
| |
| This function is also called from pcre2_substitute() to handle escape sequences |
| in replacement strings. In this case, the cb argument is NULL, and only |
| sequences that define a data character are recognised. The isclass argument is |
| not relevant, but the options argument is the final value of the compiled |
| pattern's options. |
| |
| There is one "trick" case: when a sequence such as [[:>:]] or \s in UCP mode is |
| processed, it is replaced by a nested alternative sequence. If this contains a |
| backslash (which it usually does), ptrend does not point to its end - it still |
| points to the end of the whole pattern. However, we can detect this case |
| because cb->nestptr[0] will be non-NULL. The nested sequences are all zero- |
| terminated and there are only ever two levels of nesting. |
| |
| Arguments: |
| ptrptr points to the input position pointer |
| ptrend points to the end of the input |
| chptr points to a returned data character |
| errorcodeptr points to the errorcode variable (containing zero) |
| options the current options bits |
| isclass TRUE if inside a character class |
| cb compile data block |
| |
| Returns: zero => a data character |
| positive => a special escape sequence |
| negative => a back reference |
| on error, errorcodeptr is set non-zero |
| */ |
| |
| int |
| PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr, |
| int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb) |
| { |
| BOOL utf = (options & PCRE2_UTF) != 0; |
| PCRE2_SPTR ptr = *ptrptr + 1; |
| register uint32_t c, cc; |
| int escape = 0; |
| int i; |
| |
| /* Find the end of a nested insert. */ |
| |
| if (cb != NULL && cb->nestptr[0] != NULL) |
| ptrend = ptr + PRIV(strlen)(ptr); |
| |
| /* If backslash is at the end of the string, it's an error. */ |
| |
| if (ptr >= ptrend) |
| { |
| *errorcodeptr = ERR1; |
| return 0; |
| } |
| |
| GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ |
| ptr--; /* Set pointer back to the last code unit */ |
| |
| /* Non-alphanumerics are literals, so we just leave the value in c. An initial |
| value test saves a memory lookup for code points outside the alphanumeric |
| range. Otherwise, do a table lookup. A non-zero result is something that can be |
| returned immediately. Otherwise further processing is required. */ |
| |
| if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */ |
| |
| else if ((i = escapes[c - ESCAPES_FIRST]) != 0) |
| { |
| if (i > 0) c = (uint32_t)i; else /* Positive is a data character */ |
| { |
| escape = -i; /* Else return a special escape */ |
| if (escape == ESC_P || escape == ESC_p || escape == ESC_X) |
| cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */ |
| } |
| } |
| |
| /* Escapes that need further processing, including those that are unknown. |
| When called from pcre2_substitute(), only \c, \o, and \x are recognized (and \u |
| when BSUX is set). */ |
| |
| else |
| { |
| PCRE2_SPTR oldptr; |
| BOOL braced, negated, overflow; |
| unsigned int s; |
| |
| /* Filter calls from pcre2_substitute(). */ |
| |
| if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x && |
| (c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0)) |
| { |
| *errorcodeptr = ERR3; |
| return 0; |
| } |
| |
| switch (c) |
| { |
| /* A number of Perl escapes are not handled by PCRE. We give an explicit |
| error. */ |
| |
| case CHAR_l: |
| case CHAR_L: |
| *errorcodeptr = ERR37; |
| break; |
| |
| /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated |
| specially, \u must be followed by four hex digits. Otherwise it is a |
| lowercase u letter. */ |
| |
| case CHAR_u: |
| if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else |
| { |
| uint32_t xc; |
| if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ |
| if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ |
| cc = (cc << 4) | xc; |
| if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */ |
| cc = (cc << 4) | xc; |
| if ((xc = XDIGIT(ptr[4])) == 0xff) break; /* Not a hex digit */ |
| c = (cc << 4) | xc; |
| ptr += 4; |
| if (utf) |
| { |
| if (c > 0x10ffffU) *errorcodeptr = ERR77; |
| else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; |
| } |
| else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77; |
| } |
| break; |
| |
| case CHAR_U: |
| /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an |
| upper case letter. */ |
| if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; |
| break; |
| |
| /* In a character class, \g is just a literal "g". Outside a character |
| class, \g must be followed by one of a number of specific things: |
| |
| (1) A number, either plain or braced. If positive, it is an absolute |
| backreference. If negative, it is a relative backreference. This is a Perl |
| 5.10 feature. |
| |
| (2) Perl 5.10 also supports \g{name} as a reference to a named group. This |
| is part of Perl's movement towards a unified syntax for back references. As |
| this is synonymous with \k{name}, we fudge it up by pretending it really |
| was \k. |
| |
| (3) For Oniguruma compatibility we also support \g followed by a name or a |
| number either in angle brackets or in single quotes. However, these are |
| (possibly recursive) subroutine calls, _not_ backreferences. Just return |
| the ESC_g code (cf \k). */ |
| |
| case CHAR_g: |
| if (isclass) break; |
| if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE) |
| { |
| escape = ESC_g; |
| break; |
| } |
| |
| /* Handle the Perl-compatible cases */ |
| |
| if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) |
| { |
| PCRE2_SPTR p; |
| for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++) |
| if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break; |
| if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET) |
| { |
| escape = ESC_k; |
| break; |
| } |
| braced = TRUE; |
| ptr++; |
| } |
| else braced = FALSE; |
| |
| if (ptr[1] == CHAR_MINUS) |
| { |
| negated = TRUE; |
| ptr++; |
| } |
| else negated = FALSE; |
| |
| /* The integer range is limited by the machine's int representation. */ |
| s = 0; |
| overflow = FALSE; |
| while (IS_DIGIT(ptr[1])) |
| { |
| if (s > INT_MAX / 10 - 1) /* Integer overflow */ |
| { |
| overflow = TRUE; |
| break; |
| } |
| s = s * 10 + (int)(*(++ptr) - CHAR_0); |
| } |
| if (overflow) /* Integer overflow */ |
| { |
| while (IS_DIGIT(ptr[1])) ptr++; |
| *errorcodeptr = ERR61; |
| break; |
| } |
| |
| if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET) |
| { |
| *errorcodeptr = ERR57; |
| break; |
| } |
| |
| if (s == 0) |
| { |
| *errorcodeptr = ERR58; |
| break; |
| } |
| |
| if (negated) |
| { |
| if (s > cb->bracount) |
| { |
| *errorcodeptr = ERR15; |
| break; |
| } |
| s = cb->bracount - (s - 1); |
| } |
| |
| escape = -(int)s; |
| break; |
| |
| /* The handling of escape sequences consisting of a string of digits |
| starting with one that is not zero is not straightforward. Perl has changed |
| over the years. Nowadays \g{} for backreferences and \o{} for octal are |
| recommended to avoid the ambiguities in the old syntax. |
| |
| Outside a character class, the digits are read as a decimal number. If the |
| number is less than 10, or if there are that many previous extracting left |
| brackets, it is a back reference. Otherwise, up to three octal digits are |
| read to form an escaped character code. Thus \123 is likely to be octal 123 |
| (cf \0123, which is octal 012 followed by the literal 3). |
| |
| Inside a character class, \ followed by a digit is always either a literal |
| 8 or 9 or an octal number. */ |
| |
| case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: |
| case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: |
| |
| if (!isclass) |
| { |
| oldptr = ptr; |
| /* The integer range is limited by the machine's int representation. */ |
| s = c - CHAR_0; |
| overflow = FALSE; |
| while (IS_DIGIT(ptr[1])) |
| { |
| if (s > INT_MAX / 10 - 1) /* Integer overflow */ |
| { |
| overflow = TRUE; |
| break; |
| } |
| s = s * 10 + (int)(*(++ptr) - CHAR_0); |
| } |
| if (overflow) /* Integer overflow */ |
| { |
| while (IS_DIGIT(ptr[1])) ptr++; |
| *errorcodeptr = ERR61; |
| break; |
| } |
| |
| /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x |
| are octal escapes if there are not that many previous captures. */ |
| |
| if (s < 10 || *oldptr >= CHAR_8 || s <= cb->bracount) |
| { |
| escape = -(int)s; /* Indicates a back reference */ |
| break; |
| } |
| ptr = oldptr; /* Put the pointer back and fall through */ |
| } |
| |
| /* Handle a digit following \ when the number is not a back reference, or |
| we are within a character class. If the first digit is 8 or 9, Perl used to |
| generate a binary zero byte and then treat the digit as a following |
| literal. At least by Perl 5.18 this changed so as not to insert the binary |
| zero. */ |
| |
| if ((c = *ptr) >= CHAR_8) break; |
| |
| /* Fall through with a digit less than 8 */ |
| |
| /* \0 always starts an octal number, but we may drop through to here with a |
| larger first octal digit. The original code used just to take the least |
| significant 8 bits of octal numbers (I think this is what early Perls used |
| to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode, |
| but no more than 3 octal digits. */ |
| |
| case CHAR_0: |
| c -= CHAR_0; |
| while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) |
| c = c * 8 + *(++ptr) - CHAR_0; |
| #if PCRE2_CODE_UNIT_WIDTH == 8 |
| if (!utf && c > 0xff) *errorcodeptr = ERR51; |
| #endif |
| break; |
| |
| /* \o is a relatively new Perl feature, supporting a more general way of |
| specifying character codes in octal. The only supported form is \o{ddd}. */ |
| |
| case CHAR_o: |
| if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR55; else |
| if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR78; else |
| { |
| ptr += 2; |
| c = 0; |
| overflow = FALSE; |
| while (*ptr >= CHAR_0 && *ptr <= CHAR_7) |
| { |
| cc = *ptr++; |
| if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ |
| #if PCRE2_CODE_UNIT_WIDTH == 32 |
| if (c >= 0x20000000l) { overflow = TRUE; break; } |
| #endif |
| c = (c << 3) + (cc - CHAR_0); |
| #if PCRE2_CODE_UNIT_WIDTH == 8 |
| if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; } |
| #elif PCRE2_CODE_UNIT_WIDTH == 16 |
| if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; } |
| #elif PCRE2_CODE_UNIT_WIDTH == 32 |
| if (utf && c > 0x10ffffU) { overflow = TRUE; break; } |
| #endif |
| } |
| if (overflow) |
| { |
| while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++; |
| *errorcodeptr = ERR34; |
| } |
| else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) |
| { |
| if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; |
| } |
| else *errorcodeptr = ERR64; |
| } |
| break; |
| |
| /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by |
| two hexadecimal digits. Otherwise it is a lowercase x letter. */ |
| |
| case CHAR_x: |
| if ((options & PCRE2_ALT_BSUX) != 0) |
| { |
| uint32_t xc; |
| if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ |
| if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ |
| c = (cc << 4) | xc; |
| ptr += 2; |
| } /* End PCRE2_ALT_BSUX handling */ |
| |
| /* Handle \x in Perl's style. \x{ddd} is a character number which can be |
| greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex |
| digits. If not, { used to be treated as a data character. However, Perl |
| seems to read hex digits up to the first non-such, and ignore the rest, so |
| that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE |
| now gives an error. */ |
| |
| else |
| { |
| if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) |
| { |
| ptr += 2; |
| if (*ptr == CHAR_RIGHT_CURLY_BRACKET) |
| { |
| *errorcodeptr = ERR78; |
| break; |
| } |
| c = 0; |
| overflow = FALSE; |
| |
| while ((cc = XDIGIT(*ptr)) != 0xff) |
| { |
| ptr++; |
| if (c == 0 && cc == 0) continue; /* Leading zeroes */ |
| #if PCRE2_CODE_UNIT_WIDTH == 32 |
| if (c >= 0x10000000l) { overflow = TRUE; break; } |
| #endif |
| c = (c << 4) | cc; |
| if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR)) |
| { |
| overflow = TRUE; |
| break; |
| } |
| } |
| |
| if (overflow) |
| { |
| while (XDIGIT(*ptr) != 0xff) ptr++; |
| *errorcodeptr = ERR34; |
| } |
| else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) |
| { |
| if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; |
| } |
| |
| /* If the sequence of hex digits does not end with '}', give an error. |
| We used just to recognize this construct and fall through to the normal |
| \x handling, but nowadays Perl gives an error, which seems much more |
| sensible, so we do too. */ |
| |
| else *errorcodeptr = ERR67; |
| } /* End of \x{} processing */ |
| |
| /* Read a single-byte hex-defined char (up to two hex digits after \x) */ |
| |
| else |
| { |
| c = 0; |
| if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ |
| ptr++; |
| c = cc; |
| if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ |
| ptr++; |
| c = (c << 4) | cc; |
| } /* End of \xdd handling */ |
| } /* End of Perl-style \x handling */ |
| break; |
| |
| /* The handling of \c is different in ASCII and EBCDIC environments. In an |
| ASCII (or Unicode) environment, an error is given if the character |
| following \c is not a printable ASCII character. Otherwise, the following |
| character is upper-cased if it is a letter, and after that the 0x40 bit is |
| flipped. The result is the value of the escape. |
| |
| In an EBCDIC environment the handling of \c is compatible with the |
| specification in the perlebcdic document. The following character must be |
| a letter or one of a small number of special characters. These provide a |
| means of defining the character values 0-31. |
| |
| For testing the EBCDIC handling of \c in an ASCII environment, recognize |
| the EBCDIC value of 'c' explicitly. */ |
| |
| #if defined EBCDIC && 'a' != 0x81 |
| case 0x83: |
| #else |
| case CHAR_c: |
| #endif |
| |
| c = *(++ptr); |
| if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c); |
| if (c == CHAR_NULL && ptr >= ptrend) |
| { |
| *errorcodeptr = ERR2; |
| break; |
| } |
| |
| /* Handle \c in an ASCII/Unicode environment. */ |
| |
| #ifndef EBCDIC /* ASCII/UTF-8 coding */ |
| if (c < 32 || c > 126) /* Excludes all non-printable ASCII */ |
| { |
| *errorcodeptr = ERR68; |
| break; |
| } |
| c ^= 0x40; |
| |
| /* Handle \c in an EBCDIC environment. The special case \c? is converted to |
| 255 (0xff) or 95 (0x5f) if other characters suggest we are using the POSIX-BC |
| encoding. (This is the way Perl indicates that it handles \c?.) The other |
| valid sequences correspond to a list of specific characters. */ |
| |
| #else |
| if (c == CHAR_QUESTION_MARK) |
| c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff; |
| else |
| { |
| for (i = 0; i < 32; i++) |
| { |
| if (c == ebcdic_escape_c[i]) break; |
| } |
| if (i < 32) c = i; else *errorcodeptr = ERR68; |
| } |
| #endif /* EBCDIC */ |
| |
| break; |
| |
| /* Any other alphanumeric following \ is an error. Perl gives an error only |
| if in warning mode, but PCRE doesn't have a warning mode. */ |
| |
| default: |
| *errorcodeptr = ERR3; |
| break; |
| } |
| } |
| |
| /* Perl supports \N{name} for character names, as well as plain \N for "not |
| newline". PCRE does not support \N{name}. However, it does support |
| quantification such as \N{2,3}. */ |
| |
| if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET && |
| !is_counted_repeat(ptr+2)) |
| *errorcodeptr = ERR37; |
| |
| /* If PCRE2_UCP is set, we change the values for \d etc. */ |
| |
| if ((options & PCRE2_UCP) != 0 && escape >= ESC_D && escape <= ESC_w) |
| escape += (ESC_DU - ESC_D); |
| |
| /* Set the pointer to the final character before returning. */ |
| |
| *ptrptr = ptr; |
| *chptr = c; |
| return escape; |
| } |
| |
| |
| |
| #ifdef SUPPORT_UNICODE |
| /************************************************* |
| * Handle \P and \p * |
| *************************************************/ |
| |
| /* This function is called after \P or \p has been encountered, provided that |
| PCRE2 is compiled with support for UTF and Unicode properties. On entry, the |
| contents of ptrptr are pointing at the P or p. On exit, it is left pointing at |
| the final code unit of the escape sequence. |
| |
| Arguments: |
| ptrptr the pattern position pointer |
| negptr a boolean that is set TRUE for negation else FALSE |
| ptypeptr an unsigned int that is set to the type value |
| pdataptr an unsigned int that is set to the detailed property value |
| errorcodeptr the error code variable |
| cb the compile data |
| |
| Returns: TRUE if the type value was found, or FALSE for an invalid type |
| */ |
| |
| static BOOL |
| get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, unsigned int *ptypeptr, |
| unsigned int *pdataptr, int *errorcodeptr, compile_block *cb) |
| { |
| register PCRE2_UCHAR c; |
| int i, bot, top; |
| PCRE2_SPTR ptr = *ptrptr; |
| PCRE2_UCHAR name[32]; |
| |
| *negptr = FALSE; |
| c = *(++ptr); |
| |
| /* \P or \p can be followed by a name in {}, optionally preceded by ^ for |
| negation. */ |
| |
| if (c == CHAR_LEFT_CURLY_BRACKET) |
| { |
| if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT) |
| { |
| *negptr = TRUE; |
| ptr++; |
| } |
| for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++) |
| { |
| c = *(++ptr); |
| if (c == CHAR_NULL) goto ERROR_RETURN; |
| if (c == CHAR_RIGHT_CURLY_BRACKET) break; |
| name[i] = c; |
| } |
| if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; |
| name[i] = 0; |
| } |
| |
| /* Otherwise there is just one following character, which must be an ASCII |
| letter. */ |
| |
| else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0) |
| { |
| name[0] = c; |
| name[1] = 0; |
| } |
| else goto ERROR_RETURN; |
| |
| *ptrptr = ptr; |
| |
| /* Search for a recognized property name using binary chop. */ |
| |
| bot = 0; |
| top = PRIV(utt_size); |
| |
| while (bot < top) |
| { |
| int r; |
| i = (bot + top) >> 1; |
| r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset); |
| if (r == 0) |
| { |
| *ptypeptr = PRIV(utt)[i].type; |
| *pdataptr = PRIV(utt)[i].value; |
| return TRUE; |
| } |
| if (r > 0) bot = i + 1; else top = i; |
| } |
| *errorcodeptr = ERR47; /* Unrecognized name */ |
| return FALSE; |
| |
| ERROR_RETURN: /* Malformed \P or \p */ |
| *errorcodeptr = ERR46; |
| *ptrptr = ptr; |
| return FALSE; |
| } |
| #endif |
| |
| |
| |
| /************************************************* |
| * Read repeat counts * |
| *************************************************/ |
| |
| /* Read an item of the form {n,m} and return the values. This is called only |
| after is_counted_repeat() has confirmed that a repeat-count quantifier exists, |
| so the syntax is guaranteed to be correct, but we need to check the values. |
| |
| Arguments: |
| p pointer to first char after '{' |
| minp pointer to int for min |
| maxp pointer to int for max |
| returned as -1 if no max |
| errorcodeptr points to error code variable |
| |
| Returns: pointer to '}' on success; |
| current ptr on error, with errorcodeptr set non-zero |
| */ |
| |
| static PCRE2_SPTR |
| read_repeat_counts(PCRE2_SPTR p, int *minp, int *maxp, int *errorcodeptr) |
| { |
| int min = 0; |
| int max = -1; |
| |
| while (IS_DIGIT(*p)) |
| { |
| min = min * 10 + (int)(*p++ - CHAR_0); |
| if (min > 65535) |
| { |
| *errorcodeptr = ERR5; |
| return p; |
| } |
| } |
| |
| if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else |
| { |
| if (*(++p) != CHAR_RIGHT_CURLY_BRACKET) |
| { |
| max = 0; |
| while(IS_DIGIT(*p)) |
| { |
| max = max * 10 + (int)(*p++ - CHAR_0); |
| if (max > 65535) |
| { |
| *errorcodeptr = ERR5; |
| return p; |
| } |
| } |
| if (max < min) |
| { |
| *errorcodeptr = ERR4; |
| return p; |
| } |
| } |
| } |
| |
| *minp = min; |
| *maxp = max; |
| return p; |
| } |
| |
| |
| |
| /************************************************* |
| * Scan compiled regex for recursion reference * |
| *************************************************/ |
| |
| /* This function scans through a compiled pattern until it finds an instance of |
| OP_RECURSE. |
| |
| Arguments: |
| code points to start of expression |
| utf TRUE in UTF mode |
| |
| Returns: pointer to the opcode for OP_RECURSE, or NULL if not found |
| */ |
| |
| static PCRE2_SPTR |
| find_recurse(PCRE2_SPTR code, BOOL utf) |
| { |
| for (;;) |
| { |
| register PCRE2_UCHAR c = *code; |
| if (c == OP_END) return NULL; |
| if (c == OP_RECURSE) return code; |
| |
| /* XCLASS is used for classes that cannot be represented just by a bit map. |
| This includes negated single high-valued characters. CALLOUT_STR is used for |
| callouts with string arguments. In both cases the length in the table is |
| zero; the actual length is stored in the compiled code. */ |
| |
| if (c == OP_XCLASS) code += GET(code, 1); |
| else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); |
| |
| /* Otherwise, we can get the item's length from the table, except that for |
| repeated character types, we have to test for \p and \P, which have an extra |
| two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we |
| must add in its length. */ |
| |
| else |
| { |
| switch(c) |
| { |
| case OP_TYPESTAR: |
| case OP_TYPEMINSTAR: |
| case OP_TYPEPLUS: |
| case OP_TYPEMINPLUS: |
| case OP_TYPEQUERY: |
| case OP_TYPEMINQUERY: |
| case OP_TYPEPOSSTAR: |
| case OP_TYPEPOSPLUS: |
| case OP_TYPEPOSQUERY: |
| if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
| break; |
| |
| case OP_TYPEPOSUPTO: |
| case OP_TYPEUPTO: |
| case OP_TYPEMINUPTO: |
| case OP_TYPEEXACT: |
| if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) |
| code += 2; |
| break; |
| |
| case OP_MARK: |
| case OP_PRUNE_ARG: |
| case OP_SKIP_ARG: |
| case OP_THEN_ARG: |
| code += code[1]; |
| break; |
| } |
| |
| /* Add in the fixed length from the table */ |
| |
| code += PRIV(OP_lengths)[c]; |
| |
| /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may |
| be followed by a multi-unit character. The length in the table is a |
| minimum, so we have to arrange to skip the extra units. */ |
| |
| #ifdef MAYBE_UTF_MULTI |
| if (utf) switch(c) |
| { |
| case OP_CHAR: |
| case OP_CHARI: |
| case OP_NOT: |
| case OP_NOTI: |
| case OP_EXACT: |
| case OP_EXACTI: |
| case OP_NOTEXACT: |
| case OP_NOTEXACTI: |
| case OP_UPTO: |
| case OP_UPTOI: |
| case OP_NOTUPTO: |
| case OP_NOTUPTOI: |
| case OP_MINUPTO: |
| case OP_MINUPTOI: |
| case OP_NOTMINUPTO: |
| case OP_NOTMINUPTOI: |
| case OP_POSUPTO: |
| case OP_POSUPTOI: |
| case OP_NOTPOSUPTO: |
| case OP_NOTPOSUPTOI: |
| case OP_STAR: |
| case OP_STARI: |
| case OP_NOTSTAR: |
| case OP_NOTSTARI: |
| case OP_MINSTAR: |
| case OP_MINSTARI: |
| case OP_NOTMINSTAR: |
| case OP_NOTMINSTARI: |
| case OP_POSSTAR: |
| case OP_POSSTARI: |
| case OP_NOTPOSSTAR: |
| case OP_NOTPOSSTARI: |
| case OP_PLUS: |
| case OP_PLUSI: |
| case OP_NOTPLUS: |
| case OP_NOTPLUSI: |
| case OP_MINPLUS: |
| case OP_MINPLUSI: |
| case OP_NOTMINPLUS: |
| case OP_NOTMINPLUSI: |
| case OP_POSPLUS: |
| case OP_POSPLUSI: |
| case OP_NOTPOSPLUS: |
| case OP_NOTPOSPLUSI: |
| case OP_QUERY: |
| case OP_QUERYI: |
| case OP_NOTQUERY: |
| case OP_NOTQUERYI: |
| case OP_MINQUERY: |
| case OP_MINQUERYI: |
| case OP_NOTMINQUERY: |
| case OP_NOTMINQUERYI: |
| case OP_POSQUERY: |
| case OP_POSQUERYI: |
| case OP_NOTPOSQUERY: |
| case OP_NOTPOSQUERYI: |
| if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); |
| break; |
| } |
| #else |
| (void)(utf); /* Keep compiler happy by referencing function argument */ |
| #endif /* MAYBE_UTF_MULTI */ |
| } |
| } |
| } |
| |
| |
| |
| /************************************************* |
| * Check for POSIX class syntax * |
| *************************************************/ |
| |
| /* This function is called when the sequence "[:" or "[." or "[=" is |
| encountered in a character class. It checks whether this is followed by a |
| sequence of characters terminated by a matching ":]" or ".]" or "=]". If we |
| reach an unescaped ']' without the special preceding character, return FALSE. |
| |
| Originally, this function only recognized a sequence of letters between the |
| terminators, but it seems that Perl recognizes any sequence of characters, |
| though of course unknown POSIX names are subsequently rejected. Perl gives an |
| "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE |
| didn't consider this to be a POSIX class. Likewise for [:1234:]. |
| |
| The problem in trying to be exactly like Perl is in the handling of escapes. We |
| have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX |
| class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code |
| below handles the special cases \\ and \], but does not try to do any other |
| escape processing. This makes it different from Perl for cases such as |
| [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does |
| not recognize "l\ower". This is a lesser evil than not diagnosing bad classes |
| when Perl does, I think. |
| |
| A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. |
| It seems that the appearance of a nested POSIX class supersedes an apparent |
| external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or |
| a digit. This is handled by returning FALSE if the start of a new group with |
| the same terminator is encountered, since the next closing sequence must close |
| the nested group, not the outer one. |
| |
| In Perl, unescaped square brackets may also appear as part of class names. For |
| example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for |
| [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not |
| seem right at all. PCRE does not allow closing square brackets in POSIX class |
| names. |
| |
| Arguments: |
| ptr pointer to the initial [ |
| endptr where to return a pointer to the terminating ':', '.', or '=' |
| |
| Returns: TRUE or FALSE |
| */ |
| |
| static BOOL |
| check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR *endptr) |
| { |
| PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */ |
| terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ |
| |
| for (++ptr; *ptr != CHAR_NULL; ptr++) |
| { |
| if (*ptr == CHAR_BACKSLASH && |
| (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH)) |
| ptr++; |
| else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) || |
| *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; |
| else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) |
| { |
| *endptr = ptr; |
| return TRUE; |
| } |
| } |
| |
| return FALSE; |
| } |
| |
| |
| |
| /************************************************* |
| * Check POSIX class name * |
| *************************************************/ |
| |
| /* This function is called to check the name given in a POSIX-style class entry |
| such as [:alnum:]. |
| |
| Arguments: |
| ptr points to the first letter |
| len the length of the name |
| |
| Returns: a value representing the name, or -1 if unknown |
| */ |
| |
| static int |
| check_posix_name(PCRE2_SPTR ptr, int len) |
| { |
| const char *pn = posix_names; |
| register int yield = 0; |
| while (posix_name_lengths[yield] != 0) |
| { |
| if (len == posix_name_lengths[yield] && |
| PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield; |
| pn += posix_name_lengths[yield] + 1; |
| yield++; |
| } |
| return -1; |
| } |
| |
| |
| |
| #ifdef SUPPORT_UNICODE |
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the current position in a class range and the end of that range, find
the next run of characters whose "other case" values are themselves
consecutive, and report that run of other-case values. Each call reports one
item and moves *cptr past the input characters it has dealt with, so the
function is meant to be called repeatedly until it yields -1. A character that
has a set of several other cases (a non-zero UCD_CASESET value) is reported on
its own, by way of the return value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more; nothing is written through the pointers
               0 when a range is returned in *ocptr and *odptr
              >0 the CASESET offset for a character with multiple other cases;
                   in this case *ocptr holds the original character and
                   *odptr is not written
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
  uint32_t ch = *cptr;
  uint32_t other = 0;
  uint32_t last;

  /* Skip characters that are their own other case. Stop at the first one that
  either has a caseless set, which is reported at once, or has a single,
  different other case. */

  while (ch <= d) {
    unsigned int set_offset = UCD_CASESET(ch);

    if (set_offset != 0) {
      *ocptr = ch;                     /* Character that has the set */
      *cptr = ch + 1;                  /* Rest of input range */
      return (int)set_offset;
    }

    other = UCD_OTHERCASE(ch);
    if (other != ch) break;
    ch++;
  }

  if (ch > d) return -1;               /* Reached end of range */

  /* ch has exactly one other case. Extend the run for as long as each
  following character is within the input range, has no caseless set, and has
  an other case that is exactly one more than the previous one. */

  *ocptr = other;
  last = other;

  for (ch++; ch <= d; ch++) {
    if (UCD_CASESET(ch) != 0 || UCD_OTHERCASE(ch) != last + 1) break;
    last++;
  }

  *odptr = last;                       /* End of othercase range */
  *cptr = ch;                          /* Rest of input range */
  return 0;
}
| #endif /* SUPPORT_UNICODE */ |
| |
| |
| |
| /************************************************* |
| * Add a character or range to a class * |
| *************************************************/ |
| |
| /* This function packages up the logic of adding a character or range of |
| characters to a class. The character values in the arguments will be within the |
| valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is |
| mutually recursive with the function immediately below. |
| |
| Arguments: |
| classbits the bit map for characters < 256 |
| uchardptr points to the pointer for extra data |
| options the options word |
| cb compile data |
| start start of range character |
| end end of range character |
| |
| Returns: the number of < 256 characters added |
| the pointer to extra data is updated |
| */ |
| |
| static int |
| add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, |
| compile_block *cb, uint32_t start, uint32_t end) |
| { |
| uint32_t c; |
| uint32_t classbits_end = (end <= 0xff ? end : 0xff); |
| int n8 = 0; |
| |
| /* If caseless matching is required, scan the range and process alternate |
| cases. In Unicode, there are 8-bit characters that have alternate cases that |
| are greater than 255 and vice-versa. Sometimes we can just extend the original |
| range. */ |
| |
| if ((options & PCRE2_CASELESS) != 0) |
| { |
| #ifdef SUPPORT_UNICODE |
| if ((options & PCRE2_UTF) != 0) |
| { |
| int rc; |
| uint32_t oc, od; |
| |
| options &= ~PCRE2_CASELESS; /* Remove for recursive calls */ |
| c = start; |
| |
| while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0) |
| { |
| /* Handle a single character that has more than one other case. */ |
| |
| if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cb, |
| PRIV(ucd_caseless_sets) + rc, oc); |
| |
| /* Do nothing if the other case range is within the original range. */ |
| |
| else if (oc >= start && od <= end) continue; |
| |
| /* Extend the original range if there is overlap, noting that if oc < c, we |
| can't have od > end because a subrange is always shorter than the basic |
| range. Otherwise, use a recursive call to add the additional range. */ |
| |
| else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */ |
| else if (od > end && oc <= end + 1) |
| { |
| end = od; /* Extend upwards */ |
| if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff); |
| } |
| else n8 += add_to_class(classbits, uchardptr, options, cb, oc, od); |
| } |
| } |
| else |
| #endif /* SUPPORT_UNICODE */ |
| |
| /* Not UTF mode */ |
| |
| for (c = start; c <= classbits_end; c++) |
| { |
| SETBIT(classbits, cb->fcc[c]); |
| n8++; |
| } |
| } |
| |
| /* Now handle the original range. Adjust the final value according to the bit |
| length - this means that the same lists of (e.g.) horizontal spaces can be used |
| in all cases. */ |
| |
| if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR) |
| end = MAX_NON_UTF_CHAR; |
| |
| /* Use the bitmap for characters < 256. Otherwise use extra data.*/ |
| |
| for (c = start; c <= classbits_end; c++) |
| { |
| /* Regardless of start, c will always be <= 255. */ |
| SETBIT(classbits, c); |
| n8++; |
| } |
| |
| #ifdef SUPPORT_WIDE_CHARS |
| if (start <= 0xff) start = 0xff + 1; |
| |
| if (end >= start) |
| { |
| PCRE2_UCHAR *uchardata = *uchardptr; |
| |
| #ifdef SUPPORT_UNICODE |
| if ((options & PCRE2_UTF) != 0) |
| { |
| if (start < end) |
| { |
| *uchardata++ = XCL_RANGE; |
| uchardata += PRIV(ord2utf)(start, uchardata); |
| uchardata += PRIV(ord2utf)(end, uchardata); |
| } |
| else if (start == end) |
| { |
| *uchardata++ = XCL_SINGLE; |
| uchardata += PRIV(ord2utf)(start, uchardata); |
| } |
| } |
| else |
| #endif /* SUPPORT_UNICODE */ |
| |
| /* Without UTF support, character values are constrained by the bit length, |
| and can only be > 256 for 16-bit and 32-bit libraries. */ |
| |
| #if PCRE2_CODE_UNIT_WIDTH == 8 |
| {} |
| #else |
| if (start < end) |
| { |
| *uchardata++ = XCL_RANGE; |
| *uchardata++ = start; |
| *uchardata++ = end; |
| } |
| else if (start == end) |
| { |
| *uchardata++ = XCL_SINGLE; |
| *uchardata++ = start; |
| } |
| #endif |
| *uchardptr = uchardata; /* Updata extra data pointer */ |
| } |
| #else |
| (void)uchardptr; /* Avoid compiler warning */ |
| #endif /* SUPPORT_WIDE_CHARS */ |
| |
| return n8; /* Number of 8-bit characters */ |
| } |
| |
| |
| |
| /************************************************* |
| * Add a list of characters to a class * |
| *************************************************/ |
| |
| /* This function is used for adding a list of case-equivalent characters to a |
| class, and also for adding a list of horizontal or vertical whitespace. If the |
| list is in order (which it should be), ranges of characters are detected and |
| handled appropriately. This function is mutually recursive with the function |
| above. |
| |
| Arguments: |
| classbits the bit map for characters < 256 |
| uchardptr points to the pointer for extra data |
| options the options word |
| cb contains pointers to tables etc. |
| p points to row of 32-bit values, terminated by NOTACHAR |
| except character to omit; this is used when adding lists of |
| case-equivalent characters to avoid including the one we |
| already know about |
| |
| Returns: the number of < 256 characters added |
| the pointer to extra data is updated |
| */ |
| |
| static int |
| add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, |
| compile_block *cb, const uint32_t *p, unsigned int except) |
| { |
| int n8 = 0; |
| while (p[0] < NOTACHAR) |
| { |
| int n = 0; |
| if (p[0] != except) |
| { |
| while(p[n+1] == p[0] + n + 1) n++; |
| n8 += add_to_class(classbits, uchardptr, options, cb, p[0], p[n]); |
| } |
| p += n + 1; |
| } |
| return n8; |
| } |
| |
| |
| |
| /************************************************* |
| * Add characters not in a list to a class * |
| *************************************************/ |
| |
| /* This function is used for adding the complement of a list of horizontal or |
| vertical whitespace to a class. The list must be in order. |
| |
| Arguments: |
| classbits the bit map for characters < 256 |
| uchardptr points to the pointer for extra data |
| options the options word |
| cb contains pointers to tables etc. |
| p points to row of 32-bit values, terminated by NOTACHAR |
| |
| Returns: the number of < 256 characters added |
| the pointer to extra data is updated |
| */ |
| |
| static int |
| add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, |
| uint32_t options, compile_block *cb, const uint32_t *p) |
| { |
| BOOL utf = (options & PCRE2_UTF) != 0; |
| int n8 = 0; |
| if (p[0] > 0) |
| n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1); |
| while (p[0] < NOTACHAR) |
| { |
| while (p[1] == p[0] + 1) p++; |
| n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1, |
| (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1); |
| p++; |
| } |
| return n8; |
| } |
| |
| |
| |
| /************************************************* |
| * Process (*VERB) name for escapes * |
| *************************************************/ |
| |
| /* This function is called when the PCRE2_ALT_VERBNAMES option is set, to |
| process the characters in a verb's name argument. It is called twice, once with |
| codeptr == NULL, to find out the length of the processed name, and again to put |
| the name into memory. |
| |
| Arguments: |
| ptrptr pointer to the input pointer |
| codeptr pointer to the compiled code pointer |
| errorcodeptr pointer to the error code |
| options the options bits |
| utf TRUE if processing UTF |
| cb compile data block |
| |
| Returns: length of the processed name, or < 0 on error |
| */ |
| |
| static int |
| process_verb_name(PCRE2_SPTR *ptrptr, PCRE2_UCHAR **codeptr, int *errorcodeptr, |
| uint32_t options, BOOL utf, compile_block *cb) |
| { |
| int32_t arglen = 0; |
| BOOL inescq = FALSE; |
| PCRE2_SPTR ptr = *ptrptr; |
| PCRE2_UCHAR *code = (codeptr == NULL)? NULL : *codeptr; |
| |
| for (; ptr < cb->end_pattern; ptr++) |
| { |
| uint32_t x = *ptr; |
| |
| /* Skip over literals */ |
| |
| if (inescq) |
| { |
| if (x == CHAR_BACKSLASH && ptr[1] == CHAR_E) |
| { |
| inescq = FALSE; |
| ptr++;; |
| continue; |
| } |
| } |
| |
| else /* Not a literal character */ |
| { |
| if (x == CHAR_RIGHT_PARENTHESIS) break; |
| |
| /* Skip over comments and whitespace in extended mode. */ |
| |
| if ((options & PCRE2_EXTENDED) != 0) |
| { |
| PCRE2_SPTR wscptr = ptr; |
| while (MAX_255(x) && (cb->ctypes[x] & ctype_space) != 0) x = *(++ptr); |
| if (x == CHAR_NUMBER_SIGN) |
| { |
| ptr++; |
| while (*ptr != CHAR_NULL || ptr < cb->end_pattern) |
| { |
| if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */ |
| { /* IS_NEWLINE sets cb->nllen. */ |
| ptr += cb->nllen; |
| break; |
| } |
| ptr++; |
| #ifdef SUPPORT_UNICODE |
| if (utf) FORWARDCHAR(ptr); |
| #endif |
| } |
| } |
| |
| /* If we have skipped any characters, restart the loop. */ |
| |
| if (ptr > wscptr) |
| { |
| ptr--; |
| continue; |
| } |
| } |
| |
| /* Process escapes */ |
| |
| if (x == '\\') |
| { |
| int rc; |
| *errorcodeptr = 0; |
| rc = PRIV(check_escape)(&ptr, cb->end_pattern, &x, errorcodeptr, options, |
| FALSE, cb); |
| *ptrptr = ptr; /* For possible error */ |
| if (*errorcodeptr != 0) return -1; |
| if (rc != 0) |
| { |
| if (rc == ESC_Q) |
| { |
| inescq = TRUE; |
| continue; |
| } |
| if (rc == ESC_E) continue; |
| *errorcodeptr = ERR40; |
| return -1; |
| } |
| } |
| } |
| |
| /* We have the next character in the name. */ |
| |
| #ifdef SUPPORT_UNICODE |
| if (utf) |
| { |
| if (code == NULL) /* Just want the length */ |
| { |
| #if PCRE2_CODE_UNIT_WIDTH == 8 |
| int i; |
| for (i = 0; i < PRIV(utf8_table1_size); i++) |
| if ((int)x <= PRIV(utf8_table1)[i]) break; |
| arglen += i; |
| #elif PCRE2_CODE_UNIT_WIDTH == 16 |
| if (x > 0xffff) arglen++; |
| #endif |
| } |
| else |
| { |
| PCRE2_UCHAR cbuff[8]; |
| x = PRIV(ord2utf)(x, cbuff); |
| memcpy(code, cbuff, CU2BYTES(x)); |
| code += x; |
| } |
| } |
| else |
| #endif /* SUPPORT_UNICODE */ |
| |
| /* Not UTF */ |
| { |
| if (code != NULL) *code++ = x; |
| } |
| |
| arglen++; |
| |
| if ((unsigned int)arglen > MAX_MARK) |
| { |
| *errorcodeptr = ERR76; |
| *ptrptr = ptr; |
| return -1; |
| } |
| } |
| |
| /* Update the pointers before returning. */ |
| |
| *ptrptr = ptr; |
| if (codeptr != NULL) *codeptr = code; |
| return arglen; |
| } |
| |
|