| /* |
| * regex.c -- POSIX-conformant regular expression function set for the lsof |
| * library |
| * |
| * This file is used when the UNIX dialect does not have a POSIX-conformant |
| * regular expression function set. In that case USE_LIB_REGEX is defined. |
| * |
| * V. Abell <abe@purdue.edu> |
| * Purdue University Computing Center |
| */ |
| |
| |
| /* |
| * Copyright 2000 Purdue Research Foundation, West Lafayette, Indiana |
| * 47907. All rights reserved. |
| * |
| * Written by Victor A. Abell |
| * |
| * This software is not subject to any license of the American Telephone |
| * and Telegraph Company or the Regents of the University of California. |
| * |
| * This software has been adapted from snprintf.c in sendmail 8.9.3. It |
| * is subject to the sendmail copyright statements listed below, and the |
| * sendmail licensing terms stated in the sendmail LICENSE file comment |
| * section of this file. |
| * |
| * Permission is granted to anyone to use this software for any purpose on |
| * any computer system, and to alter it and redistribute it freely, subject |
| * to the following restrictions: |
| * |
| * 1. Neither the authors nor Purdue University are responsible for any |
| * consequences of the use of this software. |
| * |
| * 2. The origin of this software must not be misrepresented, either by |
| * explicit claim or by omission. Credit to the authors and Purdue |
| * University must appear in documentation and sources. |
| * |
| * 3. Altered versions must be plainly marked as such, and must not be |
| * misrepresented as being the original software. |
| * |
| * 4. This notice may not be removed or altered. |
| */ |
| |
| |
| #include "../machine.h" |
| |
| #ifdef USE_LIB_REGEX |
| /* |
| * This file comes from GLIBC 2.2. It is used when the UNIX dialect does not |
| * have a POSIX-conformant regular expression function set. In that case |
| * USE_LIB_REGEX is defined. |
| */ |
| |
| /* Extended regular expression matching and search library, |
| version 0.12. |
| (Implements POSIX draft P1003.2/D11.2, except for some of the |
| internationalization features.) |
| Copyright (C) 1993-1999, 2000 Free Software Foundation, Inc. |
| |
| The GNU C Library is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Library General Public License as |
| published by the Free Software Foundation; either version 2 of the |
| License, or (at your option) any later version. |
| |
| The GNU C Library is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Library General Public License for more details. |
| |
| You should have received a copy of the GNU Library General Public |
| License along with the GNU C Library; see the file COPYING.LIB. If not, |
| write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
| Boston, MA 02111-1307, USA. */ |
| |
| /* AIX requires this to be the first thing in the file. */ |
| #if defined _AIX && !defined REGEX_MALLOC |
| #pragma alloca |
| #endif |
| |
| #undef _GNU_SOURCE |
| #define _GNU_SOURCE |
| |
| #ifdef HAVE_CONFIG_H |
| # include <config.h> |
| #endif |
| |
| #ifndef PARAMS |
| # if defined __GNUC__ || (defined __STDC__ && __STDC__) |
| # define PARAMS(args) args |
| # else |
| # define PARAMS(args) () |
| # endif /* GCC. */ |
| #endif /* Not PARAMS. */ |
| |
| #if defined STDC_HEADERS && !defined emacs |
| # include <stddef.h> |
| #else |
| /* We need this for `regex.h', and perhaps for the Emacs include files. */ |
| # include <sys/types.h> |
| #endif |
| |
| #define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC) |
| |
| /* For platform which support the ISO C amendement 1 functionality we |
| support user defined character classes. */ |
| #if defined _LIBC || WIDE_CHAR_SUPPORT |
| /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ |
| # include <wchar.h> |
| # include <wctype.h> |
| #endif |
| |
| #ifdef _LIBC |
| /* We have to keep the namespace clean. */ |
| # define regfree(preg) __regfree (preg) |
| # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef) |
| # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags) |
| # define regerror(errcode, preg, errbuf, errbuf_size) \ |
| __regerror(errcode, preg, errbuf, errbuf_size) |
| # define re_set_registers(bu, re, nu, st, en) \ |
| __re_set_registers (bu, re, nu, st, en) |
| # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \ |
| __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) |
| # define re_match(bufp, string, size, pos, regs) \ |
| __re_match (bufp, string, size, pos, regs) |
| # define re_search(bufp, string, size, startpos, range, regs) \ |
| __re_search (bufp, string, size, startpos, range, regs) |
| # define re_compile_pattern(pattern, length, bufp) \ |
| __re_compile_pattern (pattern, length, bufp) |
| # define re_set_syntax(syntax) __re_set_syntax (syntax) |
| # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \ |
| __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop) |
| # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp) |
| |
| # define btowc __btowc |
| |
| /* We are also using some library internals. */ |
| # include <locale/localeinfo.h> |
| # include <locale/elem-hash.h> |
| # include <langinfo.h> |
| #endif |
| |
| /* This is for other GNU distributions with internationalized messages. */ |
| #if HAVE_LIBINTL_H || defined _LIBC |
| # include <libintl.h> |
| # ifdef _LIBC |
| # undef gettext |
| # define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES) |
| # endif |
| #else |
| # define gettext(msgid) (msgid) |
| #endif |
| |
| #ifndef gettext_noop |
| /* This define is so xgettext can find the internationalizable |
| strings. */ |
| # define gettext_noop(String) String |
| #endif |
| |
| /* The `emacs' switch turns on certain matching commands |
| that make sense only in Emacs. */ |
| #ifdef emacs |
| |
| # include "lisp.h" |
| # include "buffer.h" |
| # include "syntax.h" |
| |
| #else /* not emacs */ |
| |
| /* If we are not linking with Emacs proper, |
| we can't use the relocating allocator |
| even if config.h says that we can. */ |
| # undef REL_ALLOC |
| |
| # if defined STDC_HEADERS || defined _LIBC |
| # include <stdlib.h> |
| # else |
| char *malloc (); |
| char *realloc (); |
| # endif |
| |
| /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow. |
| If nothing else has been done, use the method below. */ |
| # ifdef INHIBIT_STRING_HEADER |
| # if !(defined HAVE_BZERO && defined HAVE_BCOPY) |
| # if !defined bzero && !defined bcopy |
| # undef INHIBIT_STRING_HEADER |
| # endif |
| # endif |
| # endif |
| |
| /* This is the normal way of making sure we have a bcopy and a bzero. |
| This is used in most programs--a few other programs avoid this |
| by defining INHIBIT_STRING_HEADER. */ |
| # ifndef INHIBIT_STRING_HEADER |
| # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC |
| # include <string.h> |
| # ifndef bzero |
| # ifndef _LIBC |
| # define bzero(s, n) (memset (s, '\0', n), (s)) |
| # else |
| # define bzero(s, n) __bzero (s, n) |
| # endif |
| # endif |
| # else |
| # include <strings.h> |
| # ifndef memcmp |
| # define memcmp(s1, s2, n) bcmp (s1, s2, n) |
| # endif |
| # ifndef memcpy |
| # define memcpy(d, s, n) (bcopy (s, d, n), (d)) |
| # endif |
| # endif |
| # endif |
| |
| /* Define the syntax stuff for \<, \>, etc. */ |
| |
| /* This must be nonzero for the wordchar and notwordchar pattern |
| commands in re_match_2. */ |
| # ifndef Sword |
| # define Sword 1 |
| # endif |
| |
| # ifdef SWITCH_ENUM_BUG |
| # define SWITCH_ENUM_CAST(x) ((int)(x)) |
| # else |
| # define SWITCH_ENUM_CAST(x) (x) |
| # endif |
| |
| #endif /* not emacs */ |
| |
| #if defined _LIBC || HAVE_LIMITS_H |
| # include <limits.h> |
| #endif |
| |
| #ifndef MB_LEN_MAX |
| # define MB_LEN_MAX 1 |
| #endif |
| |
| /* Get the interface, including the syntax bits. */ |
| /* Disabled by V. Abell on January 29, 2001: #include <regex.h> */ |
| #include "../regex.h" |
| |
| /* isalpha etc. are used for the character classes. */ |
| #include <ctype.h> |
| |
| /* Jim Meyering writes: |
| |
| "... Some ctype macros are valid only for character codes that |
| isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when |
| using /bin/cc or gcc but without giving an ansi option). So, all |
| ctype uses should be through macros like ISPRINT... If |
| STDC_HEADERS is defined, then autoconf has verified that the ctype |
| macros don't need to be guarded with references to isascii. ... |
| Defining isascii to 1 should let any compiler worth its salt |
| eliminate the && through constant folding." |
| Solaris defines some of these symbols so we must undefine them first. */ |
| |
| #undef ISASCII |
| #if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII) |
| # define ISASCII(c) 1 |
| #else |
| # define ISASCII(c) isascii(c) |
| #endif |
| |
| #ifdef isblank |
| # define ISBLANK(c) (ISASCII (c) && isblank (c)) |
| #else |
| # define ISBLANK(c) ((c) == ' ' || (c) == '\t') |
| #endif |
| #ifdef isgraph |
| # define ISGRAPH(c) (ISASCII (c) && isgraph (c)) |
| #else |
| # define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c)) |
| #endif |
| |
| #undef ISPRINT |
| #define ISPRINT(c) (ISASCII (c) && isprint (c)) |
| #define ISDIGIT(c) (ISASCII (c) && isdigit (c)) |
| #define ISALNUM(c) (ISASCII (c) && isalnum (c)) |
| #define ISALPHA(c) (ISASCII (c) && isalpha (c)) |
| #define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) |
| #define ISLOWER(c) (ISASCII (c) && islower (c)) |
| #define ISPUNCT(c) (ISASCII (c) && ispunct (c)) |
| #define ISSPACE(c) (ISASCII (c) && isspace (c)) |
| #define ISUPPER(c) (ISASCII (c) && isupper (c)) |
| #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) |
| |
| #ifdef _tolower |
| # define TOLOWER(c) _tolower(c) |
| #else |
| # define TOLOWER(c) tolower(c) |
| #endif |
| |
| #ifndef NULL |
| # define NULL (void *)0 |
| #endif |
| |
| /* We remove any previous definition of `SIGN_EXTEND_CHAR', |
| since ours (we hope) works properly with all combinations of |
| machines, compilers, `char' and `unsigned char' argument types. |
| (Per Bothner suggested the basic approach.) */ |
| #undef SIGN_EXTEND_CHAR |
| #if __STDC__ |
| # define SIGN_EXTEND_CHAR(c) ((signed char) (c)) |
| #else /* not __STDC__ */ |
| /* As in Harbison and Steele. */ |
| # define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) |
| #endif |
| |
| #ifndef emacs |
| /* How many characters in the character set. */ |
| # define CHAR_SET_SIZE 256 |
| |
| # ifdef SYNTAX_TABLE |
| |
| extern char *re_syntax_table; |
| |
| # else /* not SYNTAX_TABLE */ |
| |
| static char re_syntax_table[CHAR_SET_SIZE]; |
| |
| static void |
| init_syntax_once () |
| { |
| register int c; |
| static int done = 0; |
| |
| if (done) |
| return; |
| bzero (re_syntax_table, sizeof re_syntax_table); |
| |
| for (c = 0; c < CHAR_SET_SIZE; ++c) |
| if (ISALNUM (c)) |
| re_syntax_table[c] = Sword; |
| |
| re_syntax_table['_'] = Sword; |
| |
| done = 1; |
| } |
| |
| # endif /* not SYNTAX_TABLE */ |
| |
| # define SYNTAX(c) re_syntax_table[(unsigned char) (c)] |
| |
| #endif /* emacs */ |
| |
| /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we |
| use `alloca' instead of `malloc'. This is because using malloc in |
| re_search* or re_match* could cause memory leaks when C-g is used in |
| Emacs; also, malloc is slower and causes storage fragmentation. On |
| the other hand, malloc is more portable, and easier to debug. |
| |
| Because we sometimes use alloca, some routines have to be macros, |
| not functions -- `alloca'-allocated space disappears at the end of the |
| function it is called in. */ |
| |
| #ifdef REGEX_MALLOC |
| |
| # define REGEX_ALLOCATE malloc |
| # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) |
| # define REGEX_FREE free |
| |
| #else /* not REGEX_MALLOC */ |
| |
| /* Emacs already defines alloca, sometimes. */ |
| # ifndef alloca |
| |
| /* Make alloca work the best possible way. */ |
| # ifdef __GNUC__ |
| # define alloca __builtin_alloca |
| # else /* not __GNUC__ */ |
| # if HAVE_ALLOCA_H |
| # include <alloca.h> |
| # endif /* HAVE_ALLOCA_H */ |
| # endif /* not __GNUC__ */ |
| |
| # endif /* not alloca */ |
| |
| # define REGEX_ALLOCATE alloca |
| |
| /* Assumes a `char *destination' variable. */ |
| # define REGEX_REALLOCATE(source, osize, nsize) \ |
| (destination = (char *) alloca (nsize), \ |
| memcpy (destination, source, osize)) |
| |
| /* No need to do anything to free, after alloca. */ |
| # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */ |
| |
| #endif /* not REGEX_MALLOC */ |
| |
| /* Define how to allocate the failure stack. */ |
| |
| #if defined REL_ALLOC && defined REGEX_MALLOC |
| |
| # define REGEX_ALLOCATE_STACK(size) \ |
| r_alloc (&failure_stack_ptr, (size)) |
| # define REGEX_REALLOCATE_STACK(source, osize, nsize) \ |
| r_re_alloc (&failure_stack_ptr, (nsize)) |
| # define REGEX_FREE_STACK(ptr) \ |
| r_alloc_free (&failure_stack_ptr) |
| |
| #else /* not using relocating allocator */ |
| |
| # ifdef REGEX_MALLOC |
| |
| # define REGEX_ALLOCATE_STACK malloc |
| # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize) |
| # define REGEX_FREE_STACK free |
| |
| # else /* not REGEX_MALLOC */ |
| |
| # define REGEX_ALLOCATE_STACK alloca |
| |
| # define REGEX_REALLOCATE_STACK(source, osize, nsize) \ |
| REGEX_REALLOCATE (source, osize, nsize) |
| /* No need to explicitly free anything. */ |
| # define REGEX_FREE_STACK(arg) |
| |
| # endif /* not REGEX_MALLOC */ |
| #endif /* not using relocating allocator */ |
| |
| |
| /* True if `size1' is non-NULL and PTR is pointing anywhere inside |
| `string1' or just past its end. This works if PTR is NULL, which is |
| a good thing. */ |
| #define FIRST_STRING_P(ptr) \ |
| (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) |
| |
| /* (Re)Allocate N items of type T using malloc, or fail. */ |
| #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) |
| #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) |
| #define RETALLOC_IF(addr, n, t) \ |
| if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t) |
| #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) |
| |
| #define BYTEWIDTH 8 /* In bits. */ |
| |
| #define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) |
| |
| #undef MAX |
| #undef MIN |
| #define MAX(a, b) ((a) > (b) ? (a) : (b)) |
| #define MIN(a, b) ((a) < (b) ? (a) : (b)) |
| |
| typedef char boolean; |
| #define false 0 |
| #define true 1 |
| |
| static int re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp, |
| const char *string1, int size1, |
| const char *string2, int size2, |
| int pos, |
| struct re_registers *regs, |
| int stop)); |
| |
| /* These are the command codes that appear in compiled regular |
| expressions. Some opcodes are followed by argument bytes. A |
| command code can specify any interpretation whatsoever for its |
| arguments. Zero bytes may appear in the compiled regular expression. */ |
| |
| typedef enum |
| { |
| no_op = 0, |
| |
| /* Succeed right away--no more backtracking. */ |
| succeed, |
| |
| /* Followed by one byte giving n, then by n literal bytes. */ |
| exactn, |
| |
| /* Matches any (more or less) character. */ |
| anychar, |
| |
| /* Matches any one char belonging to specified set. First |
| following byte is number of bitmap bytes. Then come bytes |
| for a bitmap saying which chars are in. Bits in each byte |
| are ordered low-bit-first. A character is in the set if its |
| bit is 1. A character too large to have a bit in the map is |
| automatically not in the set. */ |
| charset, |
| |
| /* Same parameters as charset, but match any character that is |
| not one of those specified. */ |
| charset_not, |
| |
| /* Start remembering the text that is matched, for storing in a |
| register. Followed by one byte with the register number, in |
| the range 0 to one less than the pattern buffer's re_nsub |
| field. Then followed by one byte with the number of groups |
| inner to this one. (This last has to be part of the |
| start_memory only because we need it in the on_failure_jump |
| of re_match_2.) */ |
| start_memory, |
| |
| /* Stop remembering the text that is matched and store it in a |
| memory register. Followed by one byte with the register |
| number, in the range 0 to one less than `re_nsub' in the |
| pattern buffer, and one byte with the number of inner groups, |
| just like `start_memory'. (We need the number of inner |
| groups here because we don't have any easy way of finding the |
| corresponding start_memory when we're at a stop_memory.) */ |
| stop_memory, |
| |
| /* Match a duplicate of something remembered. Followed by one |
| byte containing the register number. */ |
| duplicate, |
| |
| /* Fail unless at beginning of line. */ |
| begline, |
| |
| /* Fail unless at end of line. */ |
| endline, |
| |
| /* Succeeds if at beginning of buffer (if emacs) or at beginning |
| of string to be matched (if not). */ |
| begbuf, |
| |
| /* Analogously, for end of buffer/string. */ |
| endbuf, |
| |
| /* Followed by two byte relative address to which to jump. */ |
| jump, |
| |
| /* Same as jump, but marks the end of an alternative. */ |
| jump_past_alt, |
| |
| /* Followed by two-byte relative address of place to resume at |
| in case of failure. */ |
| on_failure_jump, |
| |
| /* Like on_failure_jump, but pushes a placeholder instead of the |
| current string position when executed. */ |
| on_failure_keep_string_jump, |
| |
| /* Throw away latest failure point and then jump to following |
| two-byte relative address. */ |
| pop_failure_jump, |
| |
| /* Change to pop_failure_jump if know won't have to backtrack to |
| match; otherwise change to jump. This is used to jump |
| back to the beginning of a repeat. If what follows this jump |
| clearly won't match what the repeat does, such that we can be |
| sure that there is no use backtracking out of repetitions |
| already matched, then we change it to a pop_failure_jump. |
| Followed by two-byte address. */ |
| maybe_pop_jump, |
| |
| /* Jump to following two-byte address, and push a dummy failure |
| point. This failure point will be thrown away if an attempt |
| is made to use it for a failure. A `+' construct makes this |
| before the first repeat. Also used as an intermediary kind |
| of jump when compiling an alternative. */ |
| dummy_failure_jump, |
| |
| /* Push a dummy failure point and continue. Used at the end of |
| alternatives. */ |
| push_dummy_failure, |
| |
| /* Followed by two-byte relative address and two-byte number n. |
| After matching N times, jump to the address upon failure. */ |
| succeed_n, |
| |
| /* Followed by two-byte relative address, and two-byte number n. |
| Jump to the address N times, then fail. */ |
| jump_n, |
| |
| /* Set the following two-byte relative address to the |
| subsequent two-byte number. The address *includes* the two |
| bytes of number. */ |
| set_number_at, |
| |
| wordchar, /* Matches any word-constituent character. */ |
| notwordchar, /* Matches any char that is not a word-constituent. */ |
| |
| wordbeg, /* Succeeds if at word beginning. */ |
| wordend, /* Succeeds if at word end. */ |
| |
| wordbound, /* Succeeds if at a word boundary. */ |
| notwordbound /* Succeeds if not at a word boundary. */ |
| |
| #ifdef emacs |
| ,before_dot, /* Succeeds if before point. */ |
| at_dot, /* Succeeds if at point. */ |
| after_dot, /* Succeeds if after point. */ |
| |
| /* Matches any character whose syntax is specified. Followed by |
| a byte which contains a syntax code, e.g., Sword. */ |
| syntaxspec, |
| |
| /* Matches any character whose syntax is not that specified. */ |
| notsyntaxspec |
| #endif /* emacs */ |
| } re_opcode_t; |
| |
| /* Common operations on the compiled pattern. */ |
| |
| /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ |
| |
| #define STORE_NUMBER(destination, number) \ |
| do { \ |
| (destination)[0] = (number) & 0377; \ |
| (destination)[1] = (number) >> 8; \ |
| } while (0) |
| |
| /* Same as STORE_NUMBER, except increment DESTINATION to |
| the byte after where the number is stored. Therefore, DESTINATION |
| must be an lvalue. */ |
| |
| #define STORE_NUMBER_AND_INCR(destination, number) \ |
| do { \ |
| STORE_NUMBER (destination, number); \ |
| (destination) += 2; \ |
| } while (0) |
| |
| /* Put into DESTINATION a number stored in two contiguous bytes starting |
| at SOURCE. */ |
| |
| #define EXTRACT_NUMBER(destination, source) \ |
| do { \ |
| (destination) = *(source) & 0377; \ |
| (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ |
| } while (0) |
| |
| #ifdef DEBUG |
| static void extract_number _RE_ARGS ((int *dest, unsigned char *source)); |
| static void |
| extract_number (dest, source) |
| int *dest; |
| unsigned char *source; |
| { |
| int temp = SIGN_EXTEND_CHAR (*(source + 1)); |
| *dest = *source & 0377; |
| *dest += temp << 8; |
| } |
| |
| # ifndef EXTRACT_MACROS /* To debug the macros. */ |
| # undef EXTRACT_NUMBER |
| # define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) |
| # endif /* not EXTRACT_MACROS */ |
| |
| #endif /* DEBUG */ |
| |
| /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. |
| SOURCE must be an lvalue. */ |
| |
| #define EXTRACT_NUMBER_AND_INCR(destination, source) \ |
| do { \ |
| EXTRACT_NUMBER (destination, source); \ |
| (source) += 2; \ |
| } while (0) |
| |
| #ifdef DEBUG |
| static void extract_number_and_incr _RE_ARGS ((int *destination, |
| unsigned char **source)); |
| static void |
| extract_number_and_incr (destination, source) |
| int *destination; |
| unsigned char **source; |
| { |
| extract_number (destination, *source); |
| *source += 2; |
| } |
| |
| # ifndef EXTRACT_MACROS |
| # undef EXTRACT_NUMBER_AND_INCR |
| # define EXTRACT_NUMBER_AND_INCR(dest, src) \ |
| extract_number_and_incr (&dest, &src) |
| # endif /* not EXTRACT_MACROS */ |
| |
| #endif /* DEBUG */ |
| |
| /* If DEBUG is defined, Regex prints many voluminous messages about what |
| it is doing (if the variable `debug' is nonzero). If linked with the |
| main program in `iregex.c', you can enter patterns and strings |
| interactively. And if linked with the main program in `main.c' and |
| the other test files, you can run the already-written tests. */ |
| |
| #ifdef DEBUG |
| |
| /* We use standard I/O for debugging. */ |
| # include <stdio.h> |
| |
| /* It is useful to test things that ``must'' be true when debugging. */ |
| # include <assert.h> |
| |
| static int debug; |
| |
| # define DEBUG_STATEMENT(e) e |
| # define DEBUG_PRINT1(x) if (debug) printf (x) |
| # define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) |
| # define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) |
| # define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) |
| # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ |
| if (debug) print_partial_compiled_pattern (s, e) |
| # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ |
| if (debug) print_double_string (w, s1, sz1, s2, sz2) |
| |
| |
| /* Print the fastmap in human-readable form. */ |
| |
| void |
| print_fastmap (fastmap) |
| char *fastmap; |
| { |
| unsigned was_a_range = 0; |
| unsigned i = 0; |
| |
| while (i < (1 << BYTEWIDTH)) |
| { |
| if (fastmap[i++]) |
| { |
| was_a_range = 0; |
| putchar (i - 1); |
| while (i < (1 << BYTEWIDTH) && fastmap[i]) |
| { |
| was_a_range = 1; |
| i++; |
| } |
| if (was_a_range) |
| { |
| printf ("-"); |
| putchar (i - 1); |
| } |
| } |
| } |
| putchar ('\n'); |
| } |
| |
| |
| /* Print a compiled pattern string in human-readable form, starting at |
| the START pointer into it and ending just before the pointer END. */ |
| |
| void |
| print_partial_compiled_pattern (start, end) |
| unsigned char *start; |
| unsigned char *end; |
| { |
| int mcnt, mcnt2; |
| unsigned char *p1; |
| unsigned char *p = start; |
| unsigned char *pend = end; |
| |
| if (start == NULL) |
| { |
| printf ("(null)\n"); |
| return; |
| } |
| |
| /* Loop over pattern commands. */ |
| while (p < pend) |
| { |
| #ifdef _LIBC |
| printf ("%t:\t", p - start); |
| #else |
| printf ("%ld:\t", (long int) (p - start)); |
| #endif |
| |
| switch ((re_opcode_t) *p++) |
| { |
| case no_op: |
| printf ("/no_op"); |
| break; |
| |
| case exactn: |
| mcnt = *p++; |
| printf ("/exactn/%d", mcnt); |
| do |
| { |
| putchar ('/'); |
| putchar (*p++); |
| } |
| while (--mcnt); |
| break; |
| |
| case start_memory: |
| mcnt = *p++; |
| printf ("/start_memory/%d/%d", mcnt, *p++); |
| break; |
| |
| case stop_memory: |
| mcnt = *p++; |
| printf ("/stop_memory/%d/%d", mcnt, *p++); |
| break; |
| |
| case duplicate: |
| printf ("/duplicate/%d", *p++); |
| break; |
| |
| case anychar: |
| printf ("/anychar"); |
| break; |
| |
| case charset: |
| case charset_not: |
| { |
| register int c, last = -100; |
| register int in_range = 0; |
| |
| printf ("/charset [%s", |
| (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); |
| |
| assert (p + *p < pend); |
| |
| for (c = 0; c < 256; c++) |
| if (c / 8 < *p |
| && (p[1 + (c/8)] & (1 << (c % 8)))) |
| { |
| /* Are we starting a range? */ |
| if (last + 1 == c && ! in_range) |
| { |
| putchar ('-'); |
| in_range = 1; |
| } |
| /* Have we broken a range? */ |
| else if (last + 1 != c && in_range) |
| { |
| putchar (last); |
| in_range = 0; |
| } |
| |
| if (! in_range) |
| putchar (c); |
| |
| last = c; |
| } |
| |
| if (in_range) |
| putchar (last); |
| |
| putchar (']'); |
| |
| p += 1 + *p; |
| } |
| break; |
| |
| case begline: |
| printf ("/begline"); |
| break; |
| |
| case endline: |
| printf ("/endline"); |
| break; |
| |
| case on_failure_jump: |
| extract_number_and_incr (&mcnt, &p); |
| #ifdef _LIBC |
| printf ("/on_failure_jump to %t", p + mcnt - start); |
| #else |
| printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start)); |
| #endif |
| break; |
| |
| case on_failure_keep_string_jump: |
| extract_number_and_incr (&mcnt, &p); |
| #ifdef _LIBC |
| printf ("/on_failure_keep_string_jump to %t", p + mcnt - start); |
| #else |
| printf ("/on_failure_keep_string_jump to %ld", |
| (long int) (p + mcnt - start)); |
| #endif |
| break; |
| |
| case dummy_failure_jump: |
| extract_number_and_incr (&mcnt, &p); |
| #ifdef _LIBC |
| printf ("/dummy_failure_jump to %t", p + mcnt - start); |
| #else |
| printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start)); |
| #endif |
| break; |
| |
| case push_dummy_failure: |
| printf ("/push_dummy_failure"); |
| break; |
| |
| case maybe_pop_jump: |
| extract_number_and_incr (&mcnt, &p); |
| #ifdef _LIBC |
| printf ("/maybe_pop_jump to %t", p + mcnt - start); |
| #else |
| printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start)); |
| #endif |
| break; |
| |
| case pop_failure_jump: |
| extract_number_and_incr (&mcnt, &p); |
| #ifdef _LIBC |
| printf ("/pop_failure_jump to %t", p + mcnt - start); |
| #else |
| printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start)); |
| #endif |
| break; |
| |
| case jump_past_alt: |
| extract_number_and_incr (&mcnt, &p); |
| #ifdef _LIBC |
| printf ("/jump_past_alt to %t", p + mcnt - start); |
| #else |
| printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start)); |
| #endif |
| break; |
| |
| case jump: |
| extract_number_and_incr (&mcnt, &p); |
| #ifdef _LIBC |
| printf ("/jump to %t", p + mcnt - start); |
| #else |
| printf ("/jump to %ld", (long int) (p + mcnt - start)); |
| #endif |
| break; |
| |
| case succeed_n: |
| extract_number_and_incr (&mcnt, &p); |
| p1 = p + mcnt; |
| extract_number_and_incr (&mcnt2, &p); |
| #ifdef _LIBC |
| printf ("/succeed_n to %t, %d times", p1 - start, mcnt2); |
| #else |
| printf ("/succeed_n to %ld, %d times", |
| (long int) (p1 - start), mcnt2); |
| #endif |
| break; |
| |
| case jump_n: |
| extract_number_and_incr (&mcnt, &p); |
| p1 = p + mcnt; |
| extract_number_and_incr (&mcnt2, &p); |
| printf ("/jump_n to %d, %d times", p1 - start, mcnt2); |
| break; |
| |
| case set_number_at: |
| extract_number_and_incr (&mcnt, &p); |
| p1 = p + mcnt; |
| extract_number_and_incr (&mcnt2, &p); |
| #ifdef _LIBC |
| printf ("/set_number_at location %t to %d", p1 - start, mcnt2); |
| #else |
| printf ("/set_number_at location %ld to %d", |
| (long int) (p1 - start), mcnt2); |
| #endif |
| break; |
| |
| case wordbound: |
| printf ("/wordbound"); |
| break; |
| |
| case notwordbound: |
| printf ("/notwordbound"); |
| break; |
| |
| case wordbeg: |
| printf ("/wordbeg"); |
| break; |
| |
| case wordend: |
| printf ("/wordend"); |
| |
| # ifdef emacs |
| case before_dot: |
| printf ("/before_dot"); |
| break; |
| |
| case at_dot: |
| printf ("/at_dot"); |
| break; |
| |
| case after_dot: |
| printf ("/after_dot"); |
| break; |
| |
| case syntaxspec: |
| printf ("/syntaxspec"); |
| mcnt = *p++; |
| printf ("/%d", mcnt); |
| break; |
| |
| case notsyntaxspec: |
| printf ("/notsyntaxspec"); |
| mcnt = *p++; |
| printf ("/%d", mcnt); |
| break; |
| # endif /* emacs */ |
| |
| case wordchar: |
| printf ("/wordchar"); |
| break; |
| |
| case notwordchar: |
| printf ("/notwordchar"); |
| break; |
| |
| case begbuf: |
| printf ("/begbuf"); |
| break; |
| |
| case endbuf: |
| printf ("/endbuf"); |
| break; |
| |
| default: |
| printf ("?%d", *(p-1)); |
| } |
| |
| putchar ('\n'); |
| } |
| |
| #ifdef _LIBC |
| printf ("%t:\tend of pattern.\n", p - start); |
| #else |
| printf ("%ld:\tend of pattern.\n", (long int) (p - start)); |
| #endif |
| } |
| |
| |
| void |
| print_compiled_pattern (bufp) |
| struct re_pattern_buffer *bufp; |
| { |
| unsigned char *buffer = bufp->buffer; |
| |
| print_partial_compiled_pattern (buffer, buffer + bufp->used); |
| printf ("%ld bytes used/%ld bytes allocated.\n", |
| bufp->used, bufp->allocated); |
| |
| if (bufp->fastmap_accurate && bufp->fastmap) |
| { |
| printf ("fastmap: "); |
| print_fastmap (bufp->fastmap); |
| } |
| |
| #ifdef _LIBC |
| printf ("re_nsub: %Zd\t", bufp->re_nsub); |
| #else |
| printf ("re_nsub: %ld\t", (long int) bufp->re_nsub); |
| #endif |
| printf ("regs_alloc: %d\t", bufp->regs_allocated); |
| printf ("can_be_null: %d\t", bufp->can_be_null); |
| printf ("newline_anchor: %d\n", bufp->newline_anchor); |
| printf ("no_sub: %d\t", bufp->no_sub); |
| printf ("not_bol: %d\t", bufp->not_bol); |
| printf ("not_eol: %d\t", bufp->not_eol); |
| printf ("syntax: %lx\n", bufp->syntax); |
| /* Perhaps we should print the translate table? */ |
| } |
| |
| |
| void |
| print_double_string (where, string1, size1, string2, size2) |
| const char *where; |
| const char *string1; |
| const char *string2; |
| int size1; |
| int size2; |
| { |
| int this_char; |
| |
| if (where == NULL) |
| printf ("(null)"); |
| else |
| { |
| if (FIRST_STRING_P (where)) |
| { |
| for (this_char = where - string1; this_char < size1; this_char++) |
| putchar (string1[this_char]); |
| |
| where = string2; |
| } |
| |
| for (this_char = where - string2; this_char < size2; this_char++) |
| putchar (string2[this_char]); |
| } |
| } |
| |
| void |
| printchar (c) |
| int c; |
| { |
| putc (c, stderr); |
| } |
| |
| #else /* not DEBUG */ |
| |
| # undef assert |
| # define assert(e) |
| |
| # define DEBUG_STATEMENT(e) |
| # define DEBUG_PRINT1(x) |
| # define DEBUG_PRINT2(x1, x2) |
| # define DEBUG_PRINT3(x1, x2, x3) |
| # define DEBUG_PRINT4(x1, x2, x3, x4) |
| # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) |
| # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) |
| |
| #endif /* not DEBUG */ |
| |
| /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can |
| also be assigned to arbitrarily: each pattern buffer stores its own |
| syntax, so it can be changed between regex compilations. */ |
| /* This has no initializer because initialized variables in Emacs |
| become read-only after dumping. */ |
| reg_syntax_t re_syntax_options; |
| |
| |
| /* Specify the precise syntax of regexps for compilation. This provides |
| for compatibility for various utilities which historically have |
| different, incompatible syntaxes. |
| |
| The argument SYNTAX is a bit mask comprised of the various bits |
| defined in regex.h. We return the old syntax. */ |
| |
| reg_syntax_t |
| re_set_syntax (syntax) |
| reg_syntax_t syntax; |
| { |
| reg_syntax_t ret = re_syntax_options; |
| |
| re_syntax_options = syntax; |
| #ifdef DEBUG |
| if (syntax & RE_DEBUG) |
| debug = 1; |
| else if (debug) /* was on but now is not */ |
| debug = 0; |
| #endif /* DEBUG */ |
| return ret; |
| } |
| #ifdef _LIBC |
| weak_alias (__re_set_syntax, re_set_syntax) |
| #endif |
| |
| /* This table gives an error message for each of the error codes listed |
| in regex.h. Obviously the order here has to be same as there. |
| POSIX doesn't require that we do anything for REG_NOERROR, |
| but why not be nice? */ |
| |
| static const char re_error_msgid[] = |
| { |
| #define REG_NOERROR_IDX 0 |
| gettext_noop ("Success") /* REG_NOERROR */ |
| "\0" |
| #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") |
| gettext_noop ("No match") /* REG_NOMATCH */ |
| "\0" |
| #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") |
| gettext_noop ("Invalid regular expression") /* REG_BADPAT */ |
| "\0" |
| #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") |
| gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ |
| "\0" |
| #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") |
| gettext_noop ("Invalid character class name") /* REG_ECTYPE */ |
| "\0" |
| #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") |
| gettext_noop ("Trailing backslash") /* REG_EESCAPE */ |
| "\0" |
| #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") |
| gettext_noop ("Invalid back reference") /* REG_ESUBREG */ |
| "\0" |
| #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") |
| gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ |
| "\0" |
| #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") |
| gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ |
| "\0" |
| #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") |
| gettext_noop ("Unmatched \\{") /* REG_EBRACE */ |
| "\0" |
| #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") |
| gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ |
| "\0" |
| #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") |
| gettext_noop ("Invalid range end") /* REG_ERANGE */ |
| "\0" |
| #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") |
| gettext_noop ("Memory exhausted") /* REG_ESPACE */ |
| "\0" |
| #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") |
| gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ |
| "\0" |
| #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") |
| gettext_noop ("Premature end of regular expression") /* REG_EEND */ |
| "\0" |
| #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") |
| gettext_noop ("Regular expression too big") /* REG_ESIZE */ |
| "\0" |
| #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") |
| gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ |
| }; |
| |
| static const size_t re_error_msgid_idx[] = |
| { |
| REG_NOERROR_IDX, |
| REG_NOMATCH_IDX, |
| REG_BADPAT_IDX, |
| REG_ECOLLATE_IDX, |
| REG_ECTYPE_IDX, |
| REG_EESCAPE_IDX, |
| REG_ESUBREG_IDX, |
| REG_EBRACK_IDX, |
| REG_EPAREN_IDX, |
| REG_EBRACE_IDX, |
| REG_BADBR_IDX, |
| REG_ERANGE_IDX, |
| REG_ESPACE_IDX, |
| REG_BADRPT_IDX, |
| REG_EEND_IDX, |
| REG_ESIZE_IDX, |
| REG_ERPAREN_IDX |
| }; |
| |
| /* Avoiding alloca during matching, to placate r_alloc. */ |
| |
| /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the |
| searching and matching functions should not call alloca. On some |
| systems, alloca is implemented in terms of malloc, and if we're |
| using the relocating allocator routines, then malloc could cause a |
| relocation, which might (if the strings being searched are in the |
| ralloc heap) shift the data out from underneath the regexp |
| routines. |
| |
| Here's another reason to avoid allocation: Emacs |
| processes input from X in a signal handler; processing X input may |
| call malloc; if input arrives while a matching routine is calling |
| malloc, then we're scrod. But Emacs can't just block input while |
| calling matching routines; then we don't notice interrupts when |
| they come in. So, Emacs blocks input around all regexp calls |
| except the matching calls, which it leaves unprotected, in the |
| faith that they will not malloc. */ |
| |
| /* Normally, this is fine. */ |
| #define MATCH_MAY_ALLOCATE |
| |
| /* When using GNU C, we are not REALLY using the C alloca, no matter |
| what config.h may say. So don't take precautions for it. */ |
| #ifdef __GNUC__ |
| # undef C_ALLOCA |
| #endif |
| |
| /* The match routines may not allocate if (1) they would do it with malloc |
| and (2) it's not safe for them to use malloc. |
| Note that if REL_ALLOC is defined, matching would not use malloc for the |
| failure stack, but we would still use it for the register vectors; |
| so REL_ALLOC should not affect this. */ |
| #if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs |
| # undef MATCH_MAY_ALLOCATE |
| #endif |
| |
| |
| /* Failure stack declarations and macros; both re_compile_fastmap and |
| re_match_2 use a failure stack. These have to be macros because of |
| REGEX_ALLOCATE_STACK. */ |
| |
| |
| /* Number of failure points for which to initially allocate space |
| when matching. If this number is exceeded, we allocate more |
| space, so it is not a hard limit. */ |
| #ifndef INIT_FAILURE_ALLOC |
| # define INIT_FAILURE_ALLOC 5 |
| #endif |
| |
| /* Roughly the maximum number of failure points on the stack. Would be |
| exactly that if always used MAX_FAILURE_ITEMS items each time we failed. |
| This is a variable only so users of regex can assign to it; we never |
| change it ourselves. */ |
| |
| #ifdef INT_IS_16BIT |
| |
| # if defined MATCH_MAY_ALLOCATE |
| /* 4400 was enough to cause a crash on Alpha OSF/1, |
| whose default stack limit is 2mb. */ |
| long int re_max_failures = 4000; |
| # else |
| long int re_max_failures = 2000; |
| # endif |
| |
| union fail_stack_elt |
| { |
| unsigned char *pointer; |
| long int integer; |
| }; |
| |
| typedef union fail_stack_elt fail_stack_elt_t; |
| |
| typedef struct |
| { |
| fail_stack_elt_t *stack; |
| unsigned long int size; |
| unsigned long int avail; /* Offset of next open position. */ |
| } fail_stack_type; |
| |
| #else /* not INT_IS_16BIT */ |
| |
| # if defined MATCH_MAY_ALLOCATE |
| /* 4400 was enough to cause a crash on Alpha OSF/1, |
| whose default stack limit is 2mb. */ |
| int re_max_failures = 4000; |
| # else |
| int re_max_failures = 2000; |
| # endif |
| |
| union fail_stack_elt |
| { |
| unsigned char *pointer; |
| int integer; |
| }; |
| |
| typedef union fail_stack_elt fail_stack_elt_t; |
| |
| typedef struct |
| { |
| fail_stack_elt_t *stack; |
| unsigned size; |
| unsigned avail; /* Offset of next open position. */ |
| } fail_stack_type; |
| |
| #endif /* INT_IS_16BIT */ |
| |
| #define FAIL_STACK_EMPTY() (fail_stack.avail == 0) |
| #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) |
| #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) |
| |
| |
| /* Define macros to initialize and free the failure stack. |
| Do `return -2' if the alloc fails. */ |
| |
| #ifdef MATCH_MAY_ALLOCATE |
| # define INIT_FAIL_STACK() \ |
| do { \ |
| fail_stack.stack = (fail_stack_elt_t *) \ |
| REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ |
| \ |
| if (fail_stack.stack == NULL) \ |
| return -2; \ |
| \ |
| fail_stack.size = INIT_FAILURE_ALLOC; \ |
| fail_stack.avail = 0; \ |
| } while (0) |
| |
| # define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack) |
| #else |
| # define INIT_FAIL_STACK() \ |
| do { \ |
| fail_stack.avail = 0; \ |
| } while (0) |
| |
| # define RESET_FAIL_STACK() |
| #endif |
| |
| |
| /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. |
| |
| Return 1 if succeeds, and 0 if either ran out of memory |
| allocating space for it or it was already too large. |
| |
| REGEX_REALLOCATE_STACK requires `destination' be declared. */ |
| |
| #define DOUBLE_FAIL_STACK(fail_stack) \ |
| ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \ |
| ? 0 \ |
| : ((fail_stack).stack = (fail_stack_elt_t *) \ |
| REGEX_REALLOCATE_STACK ((fail_stack).stack, \ |
| (fail_stack).size * sizeof (fail_stack_elt_t), \ |
| ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ |
| \ |
| (fail_stack).stack == NULL \ |
| ? 0 \ |
| : ((fail_stack).size <<= 1, \ |
| 1))) |
| |
| |
| /* Push pointer POINTER on FAIL_STACK. |
| Return 1 if was able to do so and 0 if ran out of memory allocating |
| space to do so. */ |
| #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ |
| ((FAIL_STACK_FULL () \ |
| && !DOUBLE_FAIL_STACK (FAIL_STACK)) \ |
| ? 0 \ |
| : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ |
| 1)) |
| |
| /* Push a pointer value onto the failure stack. |
| Assumes the variable `fail_stack'. Probably should only |
| be called from within `PUSH_FAILURE_POINT'. */ |
| #define PUSH_FAILURE_POINTER(item) \ |
| fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) |
| |
| /* This pushes an integer-valued item onto the failure stack. |
| Assumes the variable `fail_stack'. Probably should only |
| be called from within `PUSH_FAILURE_POINT'. */ |
| #define PUSH_FAILURE_INT(item) \ |
| fail_stack.stack[fail_stack.avail++].integer = (item) |
| |
| /* Push a fail_stack_elt_t value onto the failure stack. |
| Assumes the variable `fail_stack'. Probably should only |
| be called from within `PUSH_FAILURE_POINT'. */ |
| #define PUSH_FAILURE_ELT(item) \ |
| fail_stack.stack[fail_stack.avail++] = (item) |
| |
| /* These three POP... operations complement the three PUSH... operations. |
| All assume that `fail_stack' is nonempty. */ |
| #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer |
| #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer |
| #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail] |
| |
| /* Used to omit pushing failure point id's when we're not debugging. */ |
| #ifdef DEBUG |
| # define DEBUG_PUSH PUSH_FAILURE_INT |
| # define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT () |
| #else |
| # define DEBUG_PUSH(item) |
| # define DEBUG_POP(item_addr) |
| #endif |
| |
| |
| /* Push the information about the state we will need |
| if we ever fail back to it. |
| |
| Requires variables fail_stack, regstart, regend, reg_info, and |
| num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination' |
| be declared. |
| |
| Does `return FAILURE_CODE' if runs out of memory. */ |
| |
| #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ |
| do { \ |
| char *destination; \ |
| /* Must be int, so when we don't save any registers, the arithmetic \ |
| of 0 + -1 isn't done as unsigned. */ \ |
| /* Can't be int, since there is not a shred of a guarantee that int \ |
| is wide enough to hold a value of something to which pointer can \ |
| be assigned */ \ |
| active_reg_t this_reg; \ |
| \ |
| DEBUG_STATEMENT (failure_id++); \ |
| DEBUG_STATEMENT (nfailure_points_pushed++); \ |
| DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ |
| DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ |
| DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ |
| \ |
| DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \ |
| DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ |
| \ |
| /* Ensure we have enough space allocated for what we will push. */ \ |
| while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ |
| { \ |
| if (!DOUBLE_FAIL_STACK (fail_stack)) \ |
| return failure_code; \ |
| \ |
| DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ |
| (fail_stack).size); \ |
| DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ |
| } \ |
| \ |
| /* Push the info, starting with the registers. */ \ |
| DEBUG_PRINT1 ("\n"); \ |
| \ |
| if (1) \ |
| for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ |
| this_reg++) \ |
| { \ |
| DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \ |
| DEBUG_STATEMENT (num_regs_pushed++); \ |
| \ |
| DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \ |
| PUSH_FAILURE_POINTER (regstart[this_reg]); \ |
| \ |
| DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \ |
| PUSH_FAILURE_POINTER (regend[this_reg]); \ |
| \ |
| DEBUG_PRINT2 (" info: %p\n ", \ |
| reg_info[this_reg].word.pointer); \ |
| DEBUG_PRINT2 (" match_null=%d", \ |
| REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ |
| DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ |
| DEBUG_PRINT2 (" matched_something=%d", \ |
| MATCHED_SOMETHING (reg_info[this_reg])); \ |
| DEBUG_PRINT2 (" ever_matched=%d", \ |
| EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ |
| DEBUG_PRINT1 ("\n"); \ |
| PUSH_FAILURE_ELT (reg_info[this_reg].word); \ |
| } \ |
| \ |
| DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\ |
| PUSH_FAILURE_INT (lowest_active_reg); \ |
| \ |
| DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\ |
| PUSH_FAILURE_INT (highest_active_reg); \ |
| \ |
| DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \ |
| DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ |
| PUSH_FAILURE_POINTER (pattern_place); \ |
| \ |
| DEBUG_PRINT2 (" Pushing string %p: `", string_place); \ |
| DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ |
| size2); \ |
| DEBUG_PRINT1 ("'\n"); \ |
| PUSH_FAILURE_POINTER (string_place); \ |
| \ |
| DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ |
| DEBUG_PUSH (failure_id); \ |
| } while (0) |
| |
| /* This is the number of items that are pushed and popped on the stack |
| for each register. */ |
| #define NUM_REG_ITEMS 3 |
| |
| /* Individual items aside from the registers. */ |
| #ifdef DEBUG |
| # define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ |
| #else |
| # define NUM_NONREG_ITEMS 4 |
| #endif |
| |
| /* We push at most this many items on the stack. */ |
| /* We used to use (num_regs - 1), which is the number of registers |
| this regexp will save; but that was changed to 5 |
| to avoid stack overflow for a regexp with lots of parens. */ |
| #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) |
| |
| /* We actually push this many items. */ |
| #define NUM_FAILURE_ITEMS \ |
| (((0 \ |
| ? 0 : highest_active_reg - lowest_active_reg + 1) \ |
| * NUM_REG_ITEMS) \ |
| + NUM_NONREG_ITEMS) |
| |
| /* How many items can still be added to the stack without overflowing it. */ |
| #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) |
| |
| |
| /* Pops what PUSH_FAIL_STACK pushes. |
| |
| We restore into the parameters, all of which should be lvalues: |
| STR -- the saved data position. |
| PAT -- the saved pattern position. |
| LOW_REG, HIGH_REG -- the highest and lowest active registers. |
| REGSTART, REGEND -- arrays of string positions. |
| REG_INFO -- array of information about each subexpression. |
| |
| Also assumes the variables `fail_stack' and (if debugging), `bufp', |
| `pend', `string1', `size1', `string2', and `size2'. */ |
| |
| #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ |
| { \ |
| DEBUG_STATEMENT (unsigned failure_id;) \ |
| active_reg_t this_reg; \ |
| const unsigned char *string_temp; \ |
| \ |
| assert (!FAIL_STACK_EMPTY ()); \ |
| \ |
| /* Remove failure points and point to how many regs pushed. */ \ |
| DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ |
| DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ |
| DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ |
| \ |
| assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ |
| \ |
| DEBUG_POP (&failure_id); \ |
| DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ |
| \ |
| /* If the saved string location is NULL, it came from an \ |
| on_failure_keep_string_jump opcode, and we want to throw away the \ |
| saved NULL, thus retaining our current position in the string. */ \ |
| string_temp = POP_FAILURE_POINTER (); \ |
| if (string_temp != NULL) \ |
| str = (const char *) string_temp; \ |
| \ |
| DEBUG_PRINT2 (" Popping string %p: `", str); \ |
| DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ |
| DEBUG_PRINT1 ("'\n"); \ |
| \ |
| pat = (unsigned char *) POP_FAILURE_POINTER (); \ |
| DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \ |
| DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ |
| \ |
| /* Restore register info. */ \ |
| high_reg = (active_reg_t) POP_FAILURE_INT (); \ |
| DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \ |
| \ |
| low_reg = (active_reg_t) POP_FAILURE_INT (); \ |
| DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \ |
| \ |
| if (1) \ |
| for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ |
| { \ |
| DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \ |
| \ |
| reg_info[this_reg].word = POP_FAILURE_ELT (); \ |
| DEBUG_PRINT2 (" info: %p\n", \ |
| reg_info[this_reg].word.pointer); \ |
| \ |
| regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \ |
| DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \ |
| \ |
| regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \ |
| DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \ |
| } \ |
| else \ |
| { \ |
| for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \ |
| { \ |
| reg_info[this_reg].word.integer = 0; \ |
| regend[this_reg] = 0; \ |
| regstart[this_reg] = 0; \ |
| } \ |
| highest_active_reg = high_reg; \ |
| } \ |
| \ |
| set_regs_matched_done = 0; \ |
| DEBUG_STATEMENT (nfailure_points_popped++); \ |
| } /* POP_FAILURE_POINT */ |
| |
| |
| |
| /* Structure for per-register (a.k.a. per-group) information. |
| Other register information, such as the |
| starting and ending positions (which are addresses), and the list of |
| inner groups (which is a bits list) are maintained in separate |
| variables. |
| |
| We are making a (strictly speaking) nonportable assumption here: that |
| the compiler will pack our bit fields into something that fits into |
| the type of `word', i.e., is something that fits into one item on the |
| failure stack. */ |
| |
| |
| /* Declarations and macros for re_match_2. */ |
| |
| typedef union |
| { |
| fail_stack_elt_t word; |
| struct |
| { |
| /* This field is one if this group can match the empty string, |
| zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ |
| #define MATCH_NULL_UNSET_VALUE 3 |
| unsigned match_null_string_p : 2; |
| unsigned is_active : 1; |
| unsigned matched_something : 1; |
| unsigned ever_matched_something : 1; |
| } bits; |
| } register_info_type; |
| |
| #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) |
| #define IS_ACTIVE(R) ((R).bits.is_active) |
| #define MATCHED_SOMETHING(R) ((R).bits.matched_something) |
| #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) |
| |
| |
| /* Call this when have matched a real character; it sets `matched' flags |
| for the subexpressions which we are currently inside. Also records |
| that those subexprs have matched. */ |
| #define SET_REGS_MATCHED() \ |
| do \ |
| { \ |
| if (!set_regs_matched_done) \ |
| { \ |
| active_reg_t r; \ |
| set_regs_matched_done = 1; \ |
| for (r = lowest_active_reg; r <= highest_active_reg; r++) \ |
| { \ |
| MATCHED_SOMETHING (reg_info[r]) \ |
| = EVER_MATCHED_SOMETHING (reg_info[r]) \ |
| = 1; \ |
| } \ |
| } \ |
| } \ |
| while (0) |
| |
| /* Registers are set to a sentinel when they haven't yet matched. */ |
| static char reg_unset_dummy; |
| #define REG_UNSET_VALUE (®_unset_dummy) |
| #define REG_UNSET(e) ((e) == REG_UNSET_VALUE) |
| |
| /* Subroutine declarations and macros for regex_compile. */ |
| |
| static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size, |
| reg_syntax_t syntax, |
| struct re_pattern_buffer *bufp)); |
| static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg)); |
| static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc, |
| int arg1, int arg2)); |
| static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, |
| int arg, unsigned char *end)); |
| static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc, |
| int arg1, int arg2, unsigned char *end)); |
| static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p, |
| reg_syntax_t syntax)); |
| static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend, |
| reg_syntax_t syntax)); |
| static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start, |
| const char **p_ptr, |
| const char *pend, |
| char *translate, |
| reg_syntax_t syntax, |
| unsigned char *b)); |
| |
| /* Fetch the next character in the uncompiled pattern---translating it |
| if necessary. Also cast from a signed character in the constant |
| string passed to us by the user to an unsigned char that we can use |
| as an array index (in, e.g., `translate'). */ |
| #ifndef PATFETCH |
| # define PATFETCH(c) \ |
| do {if (p == pend) return REG_EEND; \ |
| c = (unsigned char) *p++; \ |
| if (translate) c = (unsigned char) translate[c]; \ |
| } while (0) |
| #endif |
| |
| /* Fetch the next character in the uncompiled pattern, with no |
| translation. */ |
| #define PATFETCH_RAW(c) \ |
| do {if (p == pend) return REG_EEND; \ |
| c = (unsigned char) *p++; \ |
| } while (0) |
| |
| /* Go backwards one character in the pattern. */ |
| #define PATUNFETCH p-- |
| |
| |
| /* If `translate' is non-null, return translate[D], else just D. We |
| cast the subscript to translate because some data is declared as |
| `char *', to avoid warnings when a string constant is passed. But |
| when we use a character as a subscript we must make it unsigned. */ |
| #ifndef TRANSLATE |
| # define TRANSLATE(d) \ |
| (translate ? (char) translate[(unsigned char) (d)] : (d)) |
| #endif |
| |
| |
| /* Macros for outputting the compiled pattern into `buffer'. */ |
| |
| /* If the buffer isn't allocated when it comes in, use this. */ |
| #define INIT_BUF_SIZE 32 |
| |
| /* Make sure we have at least N more bytes of space in buffer. */ |
| #define GET_BUFFER_SPACE(n) \ |
| while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \ |
| EXTEND_BUFFER () |
| |
| /* Make sure we have one more byte of buffer space and then add C to it. */ |
| #define BUF_PUSH(c) \ |
| do { \ |
| GET_BUFFER_SPACE (1); \ |
| *b++ = (unsigned char) (c); \ |
| } while (0) |
| |
| |
| /* Ensure we have two more bytes of buffer space and then append C1 and C2. */ |
| #define BUF_PUSH_2(c1, c2) \ |
| do { \ |
| GET_BUFFER_SPACE (2); \ |
| *b++ = (unsigned char) (c1); \ |
| *b++ = (unsigned char) (c2); \ |
| } while (0) |
| |
| |
| /* As with BUF_PUSH_2, except for three bytes. */ |
| #define BUF_PUSH_3(c1, c2, c3) \ |
| do { \ |
| GET_BUFFER_SPACE (3); \ |
| *b++ = (unsigned char) (c1); \ |
| *b++ = (unsigned char) (c2); \ |
| *b++ = (unsigned char) (c3); \ |
| } while (0) |
| |
| |
| /* Store a jump with opcode OP at LOC to location TO. We store a |
| relative address offset by the three bytes the jump itself occupies. */ |
| #define STORE_JUMP(op, loc, to) \ |
| store_op1 (op, loc, (int) ((to) - (loc) - 3)) |
| |
| /* Likewise, for a two-argument jump. */ |
| #define STORE_JUMP2(op, loc, to, arg) \ |
| store_op2 (op, loc, (int) ((to) - (loc) - 3), arg) |
| |
| /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ |
| #define INSERT_JUMP(op, loc, to) \ |
| insert_op1 (op, loc, (int) ((to) - (loc) - 3), b) |
| |
| /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ |
| #define INSERT_JUMP2(op, loc, to, arg) \ |
| insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b) |
| |
| |
| /* This is not an arbitrary limit: the arguments which represent offsets |
| into the pattern are two bytes long. So if 2^16 bytes turns out to |
| be too small, many things would have to change. */ |
| /* Any other compiler which, like MSC, has allocation limit below 2^16 |
| bytes will have to use approach similar to what was done below for |
| MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up |
| reallocating to 0 bytes. Such thing is not going to work too well. |
| You have been warned!! */ |
| #if defined _MSC_VER && !defined WIN32 |
| /* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. |
| The REALLOC define eliminates a flurry of conversion warnings, |
| but is not required. */ |
| # define MAX_BUF_SIZE 65500L |
| # define REALLOC(p,s) realloc ((p), (size_t) (s)) |
| #else |
| # define MAX_BUF_SIZE (1L << 16) |
| # define REALLOC(p,s) realloc ((p), (s)) |
| #endif |
| |
| /* Extend the buffer by twice its current size via realloc and |
| reset the pointers that pointed into the old block to point to the |
| correct places in the new one. If extending the buffer results in it |
| being larger than MAX_BUF_SIZE, then flag memory exhausted. */ |
| #if __BOUNDED_POINTERS__ |
| # define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated) |
| # define MOVE_BUFFER_POINTER(P) \ |
| (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr) |
| # define ELSE_EXTEND_BUFFER_HIGH_BOUND \ |
| else \ |
| { \ |
| SET_HIGH_BOUND (b); \ |
| SET_HIGH_BOUND (begalt); \ |
| if (fixup_alt_jump) \ |
| SET_HIGH_BOUND (fixup_alt_jump); \ |
| if (laststart) \ |
| SET_HIGH_BOUND (laststart); \ |
| if (pending_exact) \ |
| SET_HIGH_BOUND (pending_exact); \ |
| } |
| #else |
| # define MOVE_BUFFER_POINTER(P) (P) += incr |
| # define ELSE_EXTEND_BUFFER_HIGH_BOUND |
| #endif |
| #define EXTEND_BUFFER() \ |
| do { \ |
| unsigned char *old_buffer = bufp->buffer; \ |
| if (bufp->allocated == MAX_BUF_SIZE) \ |
| return REG_ESIZE; \ |
| bufp->allocated <<= 1; \ |
| if (bufp->allocated > MAX_BUF_SIZE) \ |
| bufp->allocated = MAX_BUF_SIZE; \ |
| bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\ |
| if (bufp->buffer == NULL) \ |
| return REG_ESPACE; \ |
| /* If the buffer moved, move all the pointers into it. */ \ |
| if (old_buffer != bufp->buffer) \ |
| { \ |
| int incr = bufp->buffer - old_buffer; \ |
| MOVE_BUFFER_POINTER (b); \ |
| MOVE_BUFFER_POINTER (begalt); \ |
| if (fixup_alt_jump) \ |
| MOVE_BUFFER_POINTER (fixup_alt_jump); \ |
| if (laststart) \ |
| MOVE_BUFFER_POINTER (laststart); \ |
| if (pending_exact) \ |
| MOVE_BUFFER_POINTER (pending_exact); \ |
| } \ |
| ELSE_EXTEND_BUFFER_HIGH_BOUND \ |
| } while (0) |
| |
| |
| /* Since we have one byte reserved for the register number argument to |
| {start,stop}_memory, the maximum number of groups we can report |
| things about is what fits in that byte. */ |
| #define MAX_REGNUM 255 |
| |
| /* But patterns can have more than `MAX_REGNUM' registers. We just |
| ignore the excess. */ |
| typedef unsigned regnum_t; |
| |
| |
| /* Macros for the compile stack. */ |
| |
| /* Since offsets can go either forwards or backwards, this type needs to |
| be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ |
| /* int may be not enough when sizeof(int) == 2. */ |
| typedef long pattern_offset_t; |
| |
| typedef struct |
| { |
| pattern_offset_t begalt_offset; |
| pattern_offset_t fixup_alt_jump; |
| pattern_offset_t inner_group_offset; |
| pattern_offset_t laststart_offset; |
| regnum_t regnum; |
| } compile_stack_elt_t; |
| |
| |
| typedef struct |
| { |
| compile_stack_elt_t *stack; |
| unsigned size; |
| unsigned avail; /* Offset of next open position. */ |
| } compile_stack_type; |
| |
| |
| #define INIT_COMPILE_STACK_SIZE 32 |
| |
| #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) |
| #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) |
| |
| /* The next available element. */ |
| #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) |
| |
| |
| /* Set the bit for character C in a list. */ |
| #define SET_LIST_BIT(c) \ |
| (b[((unsigned char) (c)) / BYTEWIDTH] \ |
| |= 1 << (((unsigned char) c) % BYTEWIDTH)) |
| |
| |
| /* Get the next unsigned number in the uncompiled pattern. */ |
| #define GET_UNSIGNED_NUMBER(num) \ |
| { if (p != pend) \ |
| { \ |
| PATFETCH (c); \ |
| while ('0' <= c && c <= '9') \ |
| { \ |
| if (num < 0) \ |
| num = 0; \ |
| num = num * 10 + c - '0'; \ |
| if (p == pend) \ |
| break; \ |
| PATFETCH (c); \ |
| } \ |
| } \ |
| } |
| |
| #if defined _LIBC || WIDE_CHAR_SUPPORT |
| /* The GNU C library provides support for user-defined character classes |
| and the functions from ISO C amendement 1. */ |
| # ifdef CHARCLASS_NAME_MAX |
| # define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX |
| # else |
| /* This shouldn't happen but some implementation might still have this |
| problem. Use a reasonable default value. */ |
| # define CHAR_CLASS_MAX_LENGTH 256 |
| # endif |
| |
| # ifdef _LIBC |
| # define IS_CHAR_CLASS(string) __wctype (string) |
| # else |
| # define IS_CHAR_CLASS(string) wctype (string) |
| # endif |
| #else |
| # define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ |
| |
| # define IS_CHAR_CLASS(string) \ |
| (STREQ (string, "alpha") || STREQ (string, "upper") \ |
| || STREQ (string, "lower") || STREQ (string, "digit") \ |
| || STREQ (string, "alnum") || STREQ (string, "xdigit") \ |
| || STREQ (string, "space") || STREQ (string, "print") \ |
| || STREQ (string, "punct") || STREQ (string, "graph") \ |
| || STREQ (string, "cntrl") || STREQ (string, "blank")) |
| #endif |
| |
| #ifndef MATCH_MAY_ALLOCATE |
| |
| /* If we cannot allocate large objects within re_match_2_internal, |
| we make the fail stack and register vectors global. |
| The fail stack, we grow to the maximum size when a regexp |
| is compiled. |
| The register vectors, we adjust in size each time we |
| compile a regexp, according to the number of registers it needs. */ |
| |
| static fail_stack_type fail_stack; |
| |
| /* Size with which the following vectors are currently allocated. |
| That is so we can make them bigger as needed, |
| but never make them smaller. */ |
| static int regs_allocated_size; |
| |
| static const char ** regstart, ** regend; |
| static const char ** old_regstart, ** old_regend; |
| static const char **best_regstart, **best_regend; |
| static register_info_type *reg_info; |
| static const char **reg_dummy; |
| static register_info_type *reg_info_dummy; |
| |
| /* Make the register vectors big enough for NUM_REGS registers, |
| but don't make them smaller. */ |
| |
| static |
| regex_grow_registers (num_regs) |
| int num_regs; |
| { |
| if (num_regs > regs_allocated_size) |
| { |
| RETALLOC_IF (regstart, num_regs, const char *); |
| RETALLOC_IF (regend, num_regs, const char *); |
| RETALLOC_IF (old_regstart, num_regs, const char *); |
| RETALLOC_IF (old_regend, num_regs, const char *); |
| RETALLOC_IF (best_regstart, num_regs, const char *); |
| RETALLOC_IF (best_regend, num_regs, const char *); |
| RETALLOC_IF (reg_info, num_regs, register_info_type); |
| RETALLOC_IF (reg_dummy, num_regs, const char *); |
| RETALLOC_IF (reg_info_dummy, num_regs, register_info_type); |
| |
| regs_allocated_size = num_regs; |
| } |
| } |
| |
| #endif /* not MATCH_MAY_ALLOCATE */ |
| |
| static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type |
| compile_stack, |
| regnum_t regnum)); |
| |
| /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. |
| Returns one of error codes defined in `regex.h', or zero for success. |
| |
| Assumes the `allocated' (and perhaps `buffer') and `translate' |
| fields are set in BUFP on entry. |
| |
| If it succeeds, results are put in BUFP (if it returns an error, the |
| contents of BUFP are undefined): |
| `buffer' is the compiled pattern; |
| `syntax' is set to SYNTAX; |
| `used' is set to the length of the compiled pattern; |
| `fastmap_accurate' is zero; |
| `re_nsub' is the number of subexpressions in PATTERN; |
| `not_bol' and `not_eol' are zero; |
| |
| The `fastmap' and `newline_anchor' fields are neither |
| examined nor set. */ |
| |
| /* Return, freeing storage we allocated. */ |
| #define FREE_STACK_RETURN(value) \ |
| return (free (compile_stack.stack), value) |
| |
| static reg_errcode_t |
| regex_compile (pattern, size, syntax, bufp) |
| const char *pattern; |
| size_t size; |
| reg_syntax_t syntax; |
| struct re_pattern_buffer *bufp; |
| { |
| /* We fetch characters from PATTERN here. Even though PATTERN is |
| `char *' (i.e., signed), we declare these variables as unsigned, so |
| they can be reliably used as array indices. */ |
| register unsigned char c, c1; |
| |
| /* A random temporary spot in PATTERN. */ |
| const char *p1; |
| |
| /* Points to the end of the buffer, where we should append. */ |
| register unsigned char *b; |
| |
| /* Keeps track of unclosed groups. */ |
| compile_stack_type compile_stack; |
| |
| /* Points to the current (ending) position in the pattern. */ |
| const char *p = pattern; |
| const char *pend = pattern + size; |
| |
| /* How to translate the characters in the pattern. */ |
| RE_TRANSLATE_TYPE translate = bufp->translate; |
| |
| /* Address of the count-byte of the most recently inserted `exactn' |
| command. This makes it possible to tell if a new exact-match |
| character can be added to that command or if the character requires |
| a new `exactn' command. */ |
| unsigned char *pending_exact = 0; |
| |
| /* Address of start of the most recently finished expression. |
| This tells, e.g., postfix * where to find the start of its |
| operand. Reset at the beginning of groups and alternatives. */ |
| unsigned char *laststart = 0; |
| |
| /* Address of beginning of regexp, or inside of last group. */ |
| unsigned char *begalt; |
| |
| /* Place in the uncompiled pattern (i.e., the {) to |
| which to go back if the interval is invalid. */ |
| const char *beg_interval; |
| |
| /* Address of the place where a forward jump should go to the end of |
| the containing expression. Each alternative of an `or' -- except the |
| last -- ends with a forward jump of this sort. */ |
| unsigned char *fixup_alt_jump = 0; |
| |
| /* Counts open-groups as they are encountered. Remembered for the |
| matching close-group on the compile stack, so the same register |
| number is put in the stop_memory as the start_memory. */ |
| regnum_t regnum = 0; |
| |
| #ifdef DEBUG |
| DEBUG_PRINT1 ("\nCompiling pattern: "); |
| if (debug) |
| { |
| unsigned debug_count; |
| |
| for (debug_count = 0; debug_count < size; debug_count++) |
| putchar (pattern[debug_count]); |
| putchar ('\n'); |
| } |
| #endif /* DEBUG */ |
| |
| /* Initialize the compile stack. */ |
| compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); |
| if (compile_stack.stack == NULL) |
| return REG_ESPACE; |
| |
| compile_stack.size = INIT_COMPILE_STACK_SIZE; |
| compile_stack.avail = 0; |
| |
| /* Initialize the pattern buffer. */ |
| bufp->syntax = syntax; |
| bufp->fastmap_accurate = 0; |
| bufp->not_bol = bufp->not_eol = 0; |
| |
| /* Set `used' to zero, so that if we return an error, the pattern |
| printer (for debugging) will think there's no pattern. We reset it |
| at the end. */ |
| bufp->used = 0; |
| |
| /* Always count groups, whether or not bufp->no_sub is set. */ |
| bufp->re_nsub = 0; |
| |
| #if !defined emacs && !defined SYNTAX_TABLE |
| /* Initialize the syntax table. */ |
| init_syntax_once (); |
| #endif |
| |
| if (bufp->allocated == 0) |
| { |
| if (bufp->buffer) |
| { /* If zero allocated, but buffer is non-null, try to realloc |
| enough space. This loses if buffer's address is bogus, but |
| that is the user's responsibility. */ |
| RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); |
| } |
| else |
| { /* Caller did not allocate a buffer. Do it for them. */ |
| bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); |
| } |
| if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE); |
| |
| bufp->allocated = INIT_BUF_SIZE; |
| } |
| |
| begalt = b = bufp->buffer; |
| |
| /* Loop through the uncompiled pattern until we're at the end. */ |
| while (p != pend) |
| { |
| PATFETCH (c); |
| |
| switch (c) |
| { |
| case '^': |
| { |
| if ( /* If at start of pattern, it's an operator. */ |
| p == pattern + 1 |
| /* If context independent, it's an operator. */ |
| || syntax & RE_CONTEXT_INDEP_ANCHORS |
| /* Otherwise, depends on what's come before. */ |
| || at_begline_loc_p (pattern, p, syntax)) |
| BUF_PUSH (begline); |
| else |
| goto normal_char; |
| } |
| break; |
| |
| |
| case '$': |
| { |
| if ( /* If at end of pattern, it's an operator. */ |
| p == pend |
| /* If context independent, it's an operator. */ |
| || syntax & RE_CONTEXT_INDEP_ANCHORS |
| /* Otherwise, depends on what's next. */ |
| || at_endline_loc_p (p, pend, syntax)) |
| BUF_PUSH (endline); |
| else |
| goto normal_char; |
| } |
| break; |
| |
| |
| case '+': |
| case '?': |
| if ((syntax & RE_BK_PLUS_QM) |
| || (syntax & RE_LIMITED_OPS)) |
| goto normal_char; |
| handle_plus: |
| case '*': |
| /* If there is no previous pattern... */ |
| if (!laststart) |
| { |
| if (syntax & RE_CONTEXT_INVALID_OPS) |
| FREE_STACK_RETURN (REG_BADRPT); |
| else if (!(syntax & RE_CONTEXT_INDEP_OPS)) |
| goto normal_char; |
| } |
| |
| { |
| /* Are we optimizing this jump? */ |
| boolean keep_string_p = false; |
| |
| /* 1 means zero (many) matches is allowed. */ |
| char zero_times_ok = 0, many_times_ok = 0; |
| |
| /* If there is a sequence of repetition chars, collapse it |
| down to just one (the right one). We can't combine |
| interval operators with these because of, e.g., `a{2}*', |
| which should only match an even number of `a's. */ |
| |
| for (;;) |
| { |
| zero_times_ok |= c != '+'; |
| many_times_ok |= c != '?'; |
| |
| if (p == pend) |
| break; |
| |
| PATFETCH (c); |
| |
| if (c == '*' |
| || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) |
| ; |
| |
| else if (syntax & RE_BK_PLUS_QM && c == '\\') |
| { |
| if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); |
| |
| PATFETCH (c1); |
| if (!(c1 == '+' || c1 == '?')) |
| { |
| PATUNFETCH; |
| PATUNFETCH; |
| break; |
| } |
| |
| c = c1; |
| } |
| else |
| { |
| PATUNFETCH; |
| break; |
| } |
| |
| /* If we get here, we found another repeat character. */ |
| } |
| |
| /* Star, etc. applied to an empty pattern is equivalent |
| to an empty pattern. */ |
| if (!laststart) |
| break; |
| |
| /* Now we know whether or not zero matches is allowed |
| and also whether or not two or more matches is allowed. */ |
| if (many_times_ok) |
| { /* More than one repetition is allowed, so put in at the |
| end a backward relative jump from `b' to before the next |
| jump we're going to put in below (which jumps from |
| laststart to after this jump). |
| |
| But if we are at the `*' in the exact sequence `.*\n', |
| insert an unconditional jump backwards to the ., |
| instead of the beginning of the loop. This way we only |
| push a failure point once, instead of every time |
| through the loop. */ |
| assert (p - 1 > pattern); |
| |
| /* Allocate the space for the jump. */ |
| GET_BUFFER_SPACE (3); |
| |
| /* We know we are not at the first character of the pattern, |
| because laststart was nonzero. And we've already |
| incremented `p', by the way, to be the character after |
| the `*'. Do we have to do something analogous here |
| for null bytes, because of RE_DOT_NOT_NULL? */ |
| if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') |
| && zero_times_ok |
| && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') |
| && !(syntax & RE_DOT_NEWLINE)) |
| { /* We have .*\n. */ |
| STORE_JUMP (jump, b, laststart); |
| keep_string_p = true; |
| } |
| else |
| /* Anything else. */ |
| STORE_JUMP (maybe_pop_jump, b, laststart - 3); |
| |
| /* We've added more stuff to the buffer. */ |
| b += 3; |
| } |
| |
| /* On failure, jump from laststart to b + 3, which will be the |
| end of the buffer after this jump is inserted. */ |
| GET_BUFFER_SPACE (3); |
| INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump |
| : on_failure_jump, |
| laststart, b + 3); |
| pending_exact = 0; |
| b += 3; |
| |
| if (!zero_times_ok) |
| { |
| /* At least one repetition is required, so insert a |
| `dummy_failure_jump' before the initial |
| `on_failure_jump' instruction of the loop. This |
| effects a skip over that instruction the first time |
| we hit that loop. */ |
| GET_BUFFER_SPACE (3); |
| INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); |
| b += 3; |
| } |
| } |
| break; |
| |
| |
| case '.': |
| laststart = b; |
| BUF_PUSH (anychar); |
| break; |
| |
| |
| case '[': |
| { |
| boolean had_char_class = false; |
| unsigned int range_start = 0xffffffff; |
| |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| /* Ensure that we have enough space to push a charset: the |
| opcode, the length count, and the bitset; 34 bytes in all. */ |
| GET_BUFFER_SPACE (34); |
| |
| laststart = b; |
| |
| /* We test `*p == '^' twice, instead of using an if |
| statement, so we only need one BUF_PUSH. */ |
| BUF_PUSH (*p == '^' ? charset_not : charset); |
| if (*p == '^') |
| p++; |
| |
| /* Remember the first position in the bracket expression. */ |
| p1 = p; |
| |
| /* Push the number of bytes in the bitmap. */ |
| BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); |
| |
| /* Clear the whole map. */ |
| bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); |
| |
| /* charset_not matches newline according to a syntax bit. */ |
| if ((re_opcode_t) b[-2] == charset_not |
| && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) |
| SET_LIST_BIT ('\n'); |
| |
| /* Read in characters and ranges, setting map bits. */ |
| for (;;) |
| { |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| PATFETCH (c); |
| |
| /* \ might escape characters inside [...] and [^...]. */ |
| if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') |
| { |
| if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); |
| |
| PATFETCH (c1); |
| SET_LIST_BIT (c1); |
| range_start = c1; |
| continue; |
| } |
| |
| /* Could be the end of the bracket expression. If it's |
| not (i.e., when the bracket expression is `[]' so |
| far), the ']' character bit gets set way below. */ |
| if (c == ']' && p != p1 + 1) |
| break; |
| |
| /* Look ahead to see if it's a range when the last thing |
| was a character class. */ |
| if (had_char_class && c == '-' && *p != ']') |
| FREE_STACK_RETURN (REG_ERANGE); |
| |
| /* Look ahead to see if it's a range when the last thing |
| was a character: if this is a hyphen not at the |
| beginning or the end of a list, then it's the range |
| operator. */ |
| if (c == '-' |
| && !(p - 2 >= pattern && p[-2] == '[') |
| && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') |
| && *p != ']') |
| { |
| reg_errcode_t ret |
| = compile_range (range_start, &p, pend, translate, |
| syntax, b); |
| if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
| range_start = 0xffffffff; |
| } |
| |
| else if (p[0] == '-' && p[1] != ']') |
| { /* This handles ranges made up of characters only. */ |
| reg_errcode_t ret; |
| |
| /* Move past the `-'. */ |
| PATFETCH (c1); |
| |
| ret = compile_range (c, &p, pend, translate, syntax, b); |
| if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
| range_start = 0xffffffff; |
| } |
| |
| /* See if we're at the beginning of a possible character |
| class. */ |
| |
| else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') |
| { /* Leave room for the null. */ |
| char str[CHAR_CLASS_MAX_LENGTH + 1]; |
| |
| PATFETCH (c); |
| c1 = 0; |
| |
| /* If pattern is `[[:'. */ |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| for (;;) |
| { |
| PATFETCH (c); |
| if ((c == ':' && *p == ']') || p == pend) |
| break; |
| if (c1 < CHAR_CLASS_MAX_LENGTH) |
| str[c1++] = c; |
| else |
| /* This is in any case an invalid class name. */ |
| str[0] = '\0'; |
| } |
| str[c1] = '\0'; |
| |
| /* If isn't a word bracketed by `[:' and `:]': |
| undo the ending character, the letters, and leave |
| the leading `:' and `[' (but set bits for them). */ |
| if (c == ':' && *p == ']') |
| { |
| #if defined _LIBC || WIDE_CHAR_SUPPORT |
| boolean is_lower = STREQ (str, "lower"); |
| boolean is_upper = STREQ (str, "upper"); |
| wctype_t wt; |
| int ch; |
| |
| wt = IS_CHAR_CLASS (str); |
| if (wt == 0) |
| FREE_STACK_RETURN (REG_ECTYPE); |
| |
| /* Throw away the ] at the end of the character |
| class. */ |
| PATFETCH (c); |
| |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) |
| { |
| # ifdef _LIBC |
| if (__iswctype (__btowc (ch), wt)) |
| SET_LIST_BIT (ch); |
| # else |
| if (iswctype (btowc (ch), wt)) |
| SET_LIST_BIT (ch); |
| # endif |
| |
| if (translate && (is_upper || is_lower) |
| && (ISUPPER (ch) || ISLOWER (ch))) |
| SET_LIST_BIT (ch); |
| } |
| |
| had_char_class = true; |
| #else |
| int ch; |
| boolean is_alnum = STREQ (str, "alnum"); |
| boolean is_alpha = STREQ (str, "alpha"); |
| boolean is_blank = STREQ (str, "blank"); |
| boolean is_cntrl = STREQ (str, "cntrl"); |
| boolean is_digit = STREQ (str, "digit"); |
| boolean is_graph = STREQ (str, "graph"); |
| boolean is_lower = STREQ (str, "lower"); |
| boolean is_print = STREQ (str, "print"); |
| boolean is_punct = STREQ (str, "punct"); |
| boolean is_space = STREQ (str, "space"); |
| boolean is_upper = STREQ (str, "upper"); |
| boolean is_xdigit = STREQ (str, "xdigit"); |
| |
| if (!IS_CHAR_CLASS (str)) |
| FREE_STACK_RETURN (REG_ECTYPE); |
| |
| /* Throw away the ] at the end of the character |
| class. */ |
| PATFETCH (c); |
| |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| for (ch = 0; ch < 1 << BYTEWIDTH; ch++) |
| { |
| /* This was split into 3 if's to |
| avoid an arbitrary limit in some compiler. */ |
| if ( (is_alnum && ISALNUM (ch)) |
| || (is_alpha && ISALPHA (ch)) |
| || (is_blank && ISBLANK (ch)) |
| || (is_cntrl && ISCNTRL (ch))) |
| SET_LIST_BIT (ch); |
| if ( (is_digit && ISDIGIT (ch)) |
| || (is_graph && ISGRAPH (ch)) |
| || (is_lower && ISLOWER (ch)) |
| || (is_print && ISPRINT (ch))) |
| SET_LIST_BIT (ch); |
| if ( (is_punct && ISPUNCT (ch)) |
| || (is_space && ISSPACE (ch)) |
| || (is_upper && ISUPPER (ch)) |
| || (is_xdigit && ISXDIGIT (ch))) |
| SET_LIST_BIT (ch); |
| if ( translate && (is_upper || is_lower) |
| && (ISUPPER (ch) || ISLOWER (ch))) |
| SET_LIST_BIT (ch); |
| } |
| had_char_class = true; |
| #endif /* libc || wctype.h */ |
| } |
| else |
| { |
| c1++; |
| while (c1--) |
| PATUNFETCH; |
| SET_LIST_BIT ('['); |
| SET_LIST_BIT (':'); |
| range_start = ':'; |
| had_char_class = false; |
| } |
| } |
| else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=') |
| { |
| unsigned char str[MB_LEN_MAX + 1]; |
| #ifdef _LIBC |
| uint32_t nrules = |
| _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); |
| #endif |
| |
| PATFETCH (c); |
| c1 = 0; |
| |
| /* If pattern is `[[='. */ |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| for (;;) |
| { |
| PATFETCH (c); |
| if ((c == '=' && *p == ']') || p == pend) |
| break; |
| if (c1 < MB_LEN_MAX) |
| str[c1++] = c; |
| else |
| /* This is in any case an invalid class name. */ |
| str[0] = '\0'; |
| } |
| str[c1] = '\0'; |
| |
| if (c == '=' && *p == ']' && str[0] != '\0') |
| { |
| /* If we have no collation data we use the default |
| collation in which each character is in a class |
| by itself. It also means that ASCII is the |
| character set and therefore we cannot have character |
| with more than one byte in the multibyte |
| representation. */ |
| #ifdef _LIBC |
| if (nrules == 0) |
| #endif |
| { |
| if (c1 != 1) |
| FREE_STACK_RETURN (REG_ECOLLATE); |
| |
| /* Throw away the ] at the end of the equivalence |
| class. */ |
| PATFETCH (c); |
| |
| /* Set the bit for the character. */ |
| SET_LIST_BIT (str[0]); |
| } |
| #ifdef _LIBC |
| else |
| { |
| /* Try to match the byte sequence in `str' against |
| those known to the collate implementation. |
| First find out whether the bytes in `str' are |
| actually from exactly one character. */ |
| const int32_t *table; |
| const unsigned char *weights; |
| const unsigned char *extra; |
| const int32_t *indirect; |
| int32_t idx; |
| const unsigned char *cp = str; |
| int ch; |
| |
| /* This #include defines a local function! */ |
| # include <locale/weight.h> |
| |
| table = (const int32_t *) |
| _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); |
| weights = (const unsigned char *) |
| _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); |
| extra = (const unsigned char *) |
| _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); |
| indirect = (const int32_t *) |
| _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); |
| |
| idx = findidx (&cp); |
| if (idx == 0 || cp < str + c1) |
| /* This is no valid character. */ |
| FREE_STACK_RETURN (REG_ECOLLATE); |
| |
| /* Throw away the ] at the end of the equivalence |
| class. */ |
| PATFETCH (c); |
| |
| /* Now we have to go throught the whole table |
| and find all characters which have the same |
| first level weight. |
| |
| XXX Note that this is not entirely correct. |
| we would have to match multibyte sequences |
| but this is not possible with the current |
| implementation. */ |
| for (ch = 1; ch < 256; ++ch) |
| /* XXX This test would have to be changed if we |
| would allow matching multibyte sequences. */ |
| if (table[ch] > 0) |
| { |
| int32_t idx2 = table[ch]; |
| size_t len = weights[idx2]; |
| |
| /* Test whether the lenghts match. */ |
| if (weights[idx] == len) |
| { |
| /* They do. New compare the bytes of |
| the weight. */ |
| size_t cnt = 0; |
| |
| while (cnt < len |
| && (weights[idx + 1 + cnt] |
| == weights[idx2 + 1 + cnt])) |
| ++len; |
| |
| if (cnt == len) |
| /* They match. Mark the character as |
| acceptable. */ |
| SET_LIST_BIT (ch); |
| } |
| } |
| } |
| #endif |
| had_char_class = true; |
| } |
| else |
| { |
| c1++; |
| while (c1--) |
| PATUNFETCH; |
| SET_LIST_BIT ('['); |
| SET_LIST_BIT ('='); |
| range_start = '='; |
| had_char_class = false; |
| } |
| } |
| else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.') |
| { |
| unsigned char str[128]; /* Should be large enough. */ |
| #ifdef _LIBC |
| uint32_t nrules = |
| _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); |
| #endif |
| |
| PATFETCH (c); |
| c1 = 0; |
| |
| /* If pattern is `[[='. */ |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| for (;;) |
| { |
| PATFETCH (c); |
| if ((c == '.' && *p == ']') || p == pend) |
| break; |
| if (c1 < sizeof (str)) |
| str[c1++] = c; |
| else |
| /* This is in any case an invalid class name. */ |
| str[0] = '\0'; |
| } |
| str[c1] = '\0'; |
| |
| if (c == '.' && *p == ']' && str[0] != '\0') |
| { |
| /* If we have no collation data we use the default |
| collation in which each character is the name |
| for its own class which contains only the one |
| character. It also means that ASCII is the |
| character set and therefore we cannot have character |
| with more than one byte in the multibyte |
| representation. */ |
| #ifdef _LIBC |
| if (nrules == 0) |
| #endif |
| { |
| if (c1 != 1) |
| FREE_STACK_RETURN (REG_ECOLLATE); |
| |
| /* Throw away the ] at the end of the equivalence |
| class. */ |
| PATFETCH (c); |
| |
| /* Set the bit for the character. */ |
| SET_LIST_BIT (str[0]); |
| range_start = ((const unsigned char *) str)[0]; |
| } |
| #ifdef _LIBC |
| else |
| { |
| /* Try to match the byte sequence in `str' against |
| those known to the collate implementation. |
| First find out whether the bytes in `str' are |
| actually from exactly one character. */ |
| int32_t table_size; |
| const int32_t *symb_table; |
| const unsigned char *extra; |
| int32_t idx; |
| int32_t elem; |
| int32_t second; |
| int32_t hash; |
| |
| table_size = |
| _NL_CURRENT_WORD (LC_COLLATE, |
| _NL_COLLATE_SYMB_HASH_SIZEMB); |
| symb_table = (const int32_t *) |
| _NL_CURRENT (LC_COLLATE, |
| _NL_COLLATE_SYMB_TABLEMB); |
| extra = (const unsigned char *) |
| _NL_CURRENT (LC_COLLATE, |
| _NL_COLLATE_SYMB_EXTRAMB); |
| |
| /* Locate the character in the hashing table. */ |
| hash = elem_hash (str, c1); |
| |
| idx = 0; |
| elem = hash % table_size; |
| second = hash % (table_size - 2); |
| while (symb_table[2 * elem] != 0) |
| { |
| /* First compare the hashing value. */ |
| if (symb_table[2 * elem] == hash |
| && c1 == extra[symb_table[2 * elem + 1]] |
| && memcmp (str, |
| &extra[symb_table[2 * elem + 1] |
| + 1], |
| c1) == 0) |
| { |
| /* Yep, this is the entry. */ |
| idx = symb_table[2 * elem + 1]; |
| idx += 1 + extra[idx]; |
| break; |
| } |
| |
| /* Next entry. */ |
| elem += second; |
| } |
| |
| if (symb_table[2 * elem] == 0) |
| /* This is no valid character. */ |
| FREE_STACK_RETURN (REG_ECOLLATE); |
| |
| /* Throw away the ] at the end of the equivalence |
| class. */ |
| PATFETCH (c); |
| |
| /* Now add the multibyte character(s) we found |
| to the accept list. |
| |
| XXX Note that this is not entirely correct. |
| we would have to match multibyte sequences |
| but this is not possible with the current |
| implementation. Also, we have to match |
| collating symbols, which expand to more than |
| one file, as a whole and not allow the |
| individual bytes. */ |
| c1 = extra[idx++]; |
| if (c1 == 1) |
| range_start = extra[idx]; |
| while (c1-- > 0) |
| { |
| SET_LIST_BIT (extra[idx]); |
| ++idx; |
| } |
| } |
| #endif |
| had_char_class = false; |
| } |
| else |
| { |
| c1++; |
| while (c1--) |
| PATUNFETCH; |
| SET_LIST_BIT ('['); |
| SET_LIST_BIT ('.'); |
| range_start = '.'; |
| had_char_class = false; |
| } |
| } |
| else |
| { |
| had_char_class = false; |
| SET_LIST_BIT (c); |
| range_start = c; |
| } |
| } |
| |
| /* Discard any (non)matching list bytes that are all 0 at the |
| end of the map. Decrease the map-length byte too. */ |
| while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) |
| b[-1]--; |
| b += b[-1]; |
| } |
| break; |
| |
| |
| case '(': |
| if (syntax & RE_NO_BK_PARENS) |
| goto handle_open; |
| else |
| goto normal_char; |
| |
| |
| case ')': |
| if (syntax & RE_NO_BK_PARENS) |
| goto handle_close; |
| else |
| goto normal_char; |
| |
| |
| case '\n': |
| if (syntax & RE_NEWLINE_ALT) |
| goto handle_alt; |
| else |
| goto normal_char; |
| |
| |
| case '|': |
| if (syntax & RE_NO_BK_VBAR) |
| goto handle_alt; |
| else |
| goto normal_char; |
| |
| |
| case '{': |
| if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) |
| goto handle_interval; |
| else |
| goto normal_char; |
| |
| |
| case '\\': |
| if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); |
| |
| /* Do not translate the character after the \, so that we can |
| distinguish, e.g., \B from \b, even if we normally would |
| translate, e.g., B to b. */ |
| PATFETCH_RAW (c); |
| |
| switch (c) |
| { |
| case '(': |
| if (syntax & RE_NO_BK_PARENS) |
| goto normal_backslash; |
| |
| handle_open: |
| bufp->re_nsub++; |
| regnum++; |
| |
| if (COMPILE_STACK_FULL) |
| { |
| RETALLOC (compile_stack.stack, compile_stack.size << 1, |
| compile_stack_elt_t); |
| if (compile_stack.stack == NULL) return REG_ESPACE; |
| |
| compile_stack.size <<= 1; |
| } |
| |
| /* These are the values to restore when we hit end of this |
| group. They are all relative offsets, so that if the |
| whole pattern moves because of realloc, they will still |
| be valid. */ |
| COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; |
| COMPILE_STACK_TOP.fixup_alt_jump |
| = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; |
| COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; |
| COMPILE_STACK_TOP.regnum = regnum; |
| |
| /* We will eventually replace the 0 with the number of |
| groups inner to this one. But do not push a |
| start_memory for groups beyond the last one we can |
| represent in the compiled pattern. */ |
| if (regnum <= MAX_REGNUM) |
| { |
| COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; |
| BUF_PUSH_3 (start_memory, regnum, 0); |
| } |
| |
| compile_stack.avail++; |
| |
| fixup_alt_jump = 0; |
| laststart = 0; |
| begalt = b; |
| /* If we've reached MAX_REGNUM groups, then this open |
| won't actually generate any code, so we'll have to |
| clear pending_exact explicitly. */ |
| pending_exact = 0; |
| break; |
| |
| |
| case ')': |
| if (syntax & RE_NO_BK_PARENS) goto normal_backslash; |
| |
| if (COMPILE_STACK_EMPTY) |
| { |
| if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) |
| goto normal_backslash; |
| else |
| FREE_STACK_RETURN (REG_ERPAREN); |
| } |
| |
| handle_close: |
| if (fixup_alt_jump) |
| { /* Push a dummy failure point at the end of the |
| alternative for a possible future |
| `pop_failure_jump' to pop. See comments at |
| `push_dummy_failure' in `re_match_2'. */ |
| BUF_PUSH (push_dummy_failure); |
| |
| /* We allocated space for this jump when we assigned |
| to `fixup_alt_jump', in the `handle_alt' case below. */ |
| STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); |
| } |
| |
| /* See similar code for backslashed left paren above. */ |
| if (COMPILE_STACK_EMPTY) |
| { |
| if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) |
| goto normal_char; |
| else |
| FREE_STACK_RETURN (REG_ERPAREN); |
| } |
| |
| /* Since we just checked for an empty stack above, this |
| ``can't happen''. */ |
| assert (compile_stack.avail != 0); |
| { |
| /* We don't just want to restore into `regnum', because |
| later groups should continue to be numbered higher, |
| as in `(ab)c(de)' -- the second group is #2. */ |
| regnum_t this_group_regnum; |
| |
| compile_stack.avail--; |
| begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; |
| fixup_alt_jump |
| = COMPILE_STACK_TOP.fixup_alt_jump |
| ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 |
| : 0; |
| laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; |
| this_group_regnum = COMPILE_STACK_TOP.regnum; |
| /* If we've reached MAX_REGNUM groups, then this open |
| won't actually generate any code, so we'll have to |
| clear pending_exact explicitly. */ |
| pending_exact = 0; |
| |
| /* We're at the end of the group, so now we know how many |
| groups were inside this one. */ |
| if (this_group_regnum <= MAX_REGNUM) |
| { |
| unsigned char *inner_group_loc |
| = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; |
| |
| *inner_group_loc = regnum - this_group_regnum; |
| BUF_PUSH_3 (stop_memory, this_group_regnum, |
| regnum - this_group_regnum); |
| } |
| } |
| break; |
| |
| |
| case '|': /* `\|'. */ |
| if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) |
| goto normal_backslash; |
| handle_alt: |
| if (syntax & RE_LIMITED_OPS) |
| goto normal_char; |
| |
| /* Insert before the previous alternative a jump which |
| jumps to this alternative if the former fails. */ |
| GET_BUFFER_SPACE (3); |
| INSERT_JUMP (on_failure_jump, begalt, b + 6); |
| pending_exact = 0; |
| b += 3; |
| |
| /* The alternative before this one has a jump after it |
| which gets executed if it gets matched. Adjust that |
| jump so it will jump to this alternative's analogous |
| jump (put in below, which in turn will jump to the next |
| (if any) alternative's such jump, etc.). The last such |
| jump jumps to the correct final destination. A picture: |
| _____ _____ |
| | | | | |
| | v | v |
| a | b | c |
| |
| If we are at `b', then fixup_alt_jump right now points to a |
| three-byte space after `a'. We'll put in the jump, set |
| fixup_alt_jump to right after `b', and leave behind three |
| bytes which we'll fill in when we get to after `c'. */ |
| |
| if (fixup_alt_jump) |
| STORE_JUMP (jump_past_alt, fixup_alt_jump, b); |
| |
| /* Mark and leave space for a jump after this alternative, |
| to be filled in later either by next alternative or |
| when know we're at the end of a series of alternatives. */ |
| fixup_alt_jump = b; |
| GET_BUFFER_SPACE (3); |
| b += 3; |
| |
| laststart = 0; |
| begalt = b; |
| break; |
| |
| |
| case '{': |
| /* If \{ is a literal. */ |
| if (!(syntax & RE_INTERVALS) |
| /* If we're at `\{' and it's not the open-interval |
| operator. */ |
| || (syntax & RE_NO_BK_BRACES)) |
| goto normal_backslash; |
| |
| handle_interval: |
| { |
| /* If got here, then the syntax allows intervals. */ |
| |
| /* At least (most) this many matches must be made. */ |
| int lower_bound = -1, upper_bound = -1; |
| |
| beg_interval = p - 1; |
| |
| if (p == pend) |
| { |
| if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) |
| goto unfetch_interval; |
| else |
| FREE_STACK_RETURN (REG_EBRACE); |
| } |
| |
| GET_UNSIGNED_NUMBER (lower_bound); |
| |
| if (c == ',') |
| { |
| GET_UNSIGNED_NUMBER (upper_bound); |
| if ((!(syntax & RE_NO_BK_BRACES) && c != '\\') |
| || ((syntax & RE_NO_BK_BRACES) && c != '}')) |
| FREE_STACK_RETURN (REG_BADBR); |
| |
| if (upper_bound < 0) |
| upper_bound = RE_DUP_MAX; |
| } |
| else |
| /* Interval such as `{1}' => match exactly once. */ |
| upper_bound = lower_bound; |
| |
| if (lower_bound < 0 || upper_bound > RE_DUP_MAX |
| || lower_bound > upper_bound) |
| { |
| if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) |
| goto unfetch_interval; |
| else |
| FREE_STACK_RETURN (REG_BADBR); |
| } |
| |
| if (!(syntax & RE_NO_BK_BRACES)) |
| { |
| if (c != '\\') FREE_STACK_RETURN (REG_EBRACE); |
| |
| PATFETCH (c); |
| } |
| |
| if (c != '}') |
| { |
| if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) |
| goto unfetch_interval; |
| else |
| FREE_STACK_RETURN (REG_BADBR); |
| } |
| |
| /* We just parsed a valid interval. */ |
| |
| /* If it's invalid to have no preceding re. */ |
| if (!laststart) |
| { |
| if (syntax & RE_CONTEXT_INVALID_OPS) |
| FREE_STACK_RETURN (REG_BADRPT); |
| else if (syntax & RE_CONTEXT_INDEP_OPS) |
| laststart = b; |
| else |
| goto unfetch_interval; |
| } |
| |
| /* If the upper bound is zero, don't want to succeed at |
| all; jump from `laststart' to `b + 3', which will be |
| the end of the buffer after we insert the jump. */ |
| if (upper_bound == 0) |
| { |
| GET_BUFFER_SPACE (3); |
| INSERT_JUMP (jump, laststart, b + 3); |
| b += 3; |
| } |
| |
| /* Otherwise, we have a nontrivial interval. When |
| we're all done, the pattern will look like: |
| set_number_at <jump count> <upper bound> |
| set_number_at <succeed_n count> <lower bound> |
| succeed_n <after jump addr> <succeed_n count> |
| <body of loop> |
| jump_n <succeed_n addr> <jump count> |
| (The upper bound and `jump_n' are omitted if |
| `upper_bound' is 1, though.) */ |
| else |
| { /* If the upper bound is > 1, we need to insert |
| more at the end of the loop. */ |
| unsigned nbytes = 10 + (upper_bound > 1) * 10; |
| |
| GET_BUFFER_SPACE (nbytes); |
| |
| /* Initialize lower bound of the `succeed_n', even |
| though it will be set during matching by its |
| attendant `set_number_at' (inserted next), |
| because `re_compile_fastmap' needs to know. |
| Jump to the `jump_n' we might insert below. */ |
| INSERT_JUMP2 (succeed_n, laststart, |
| b + 5 + (upper_bound > 1) * 5, |
| lower_bound); |
| b += 5; |
| |
| /* Code to initialize the lower bound. Insert |
| before the `succeed_n'. The `5' is the last two |
| bytes of this `set_number_at', plus 3 bytes of |
| the following `succeed_n'. */ |
| insert_op2 (set_number_at, laststart, 5, lower_bound, b); |
| b += 5; |
| |
| if (upper_bound > 1) |
| { /* More than one repetition is allowed, so |
| append a backward jump to the `succeed_n' |
| that starts this interval. |
| |
| When we've reached this during matching, |
| we'll have matched the interval once, so |
| jump back only `upper_bound - 1' times. */ |
| STORE_JUMP2 (jump_n, b, laststart + 5, |
| upper_bound - 1); |
| b += 5; |
| |
| /* The location we want to set is the second |
| parameter of the `jump_n'; that is `b-2' as |
| an absolute address. `laststart' will be |
| the `set_number_at' we're about to insert; |
| `laststart+3' the number to set, the source |
| for the relative address. But we are |
| inserting into the middle of the pattern -- |
| so everything is getting moved up by 5. |
| Conclusion: (b - 2) - (laststart + 3) + 5, |
| i.e., b - laststart. |
| |
| We insert this at the beginning of the loop |
| so that if we fail during matching, we'll |
| reinitialize the bounds. */ |
| insert_op2 (set_number_at, laststart, b - laststart, |
| upper_bound - 1, b); |
| b += 5; |
| } |
| } |
| pending_exact = 0; |
| beg_interval = NULL; |
| } |
| break; |
| |
| unfetch_interval: |
| /* If an invalid interval, match the characters as literals. */ |
| assert (beg_interval); |
| p = beg_interval; |
| beg_interval = NULL; |
| |
| /* normal_char and normal_backslash need `c'. */ |
| PATFETCH (c); |
| |
| if (!(syntax & RE_NO_BK_BRACES)) |
| { |
| if (p > pattern && p[-1] == '\\') |
| goto normal_backslash; |
| } |
| goto normal_char; |
| |
| #ifdef emacs |
| /* There is no way to specify the before_dot and after_dot |
| operators. rms says this is ok. --karl */ |
| case '=': |
| BUF_PUSH (at_dot); |
| break; |
| |
| case 's': |
| laststart = b; |
| PATFETCH (c); |
| BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); |
| break; |
| |
| case 'S': |
| laststart = b; |
| PATFETCH (c); |
| BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); |
| break; |
| #endif /* emacs */ |
| |
| |
| case 'w': |
| if (syntax & RE_NO_GNU_OPS) |
| goto normal_char; |
| laststart = b; |
| BUF_PUSH (wordchar); |
| break; |
| |
| |
| case 'W': |
| if (syntax & RE_NO_GNU_OPS) |
| goto normal_char; |
| laststart = b; |
| BUF_PUSH (notwordchar); |
| break; |
| |
| |
| case '<': |
| if (syntax & RE_NO_GNU_OPS) |
| goto normal_char; |
| BUF_PUSH (wordbeg); |
| break; |
| |
| case '>': |
| if (syntax & RE_NO_GNU_OPS) |
| goto normal_char; |
| BUF_PUSH (wordend); |
| break; |
| |
| case 'b': |
| if (syntax & RE_NO_GNU_OPS) |
| goto normal_char; |
| BUF_PUSH (wordbound); |
| break; |
| |
| case 'B': |
| if (syntax & RE_NO_GNU_OPS) |
| goto normal_char; |
| BUF_PUSH (notwordbound); |
| break; |
| |
| case '`': |
| if (syntax & RE_NO_GNU_OPS) |
| goto normal_char; |
| BUF_PUSH (begbuf); |
| break; |
| |
| case '\'': |
| if (syntax & RE_NO_GNU_OPS) |
| goto normal_char; |
| BUF_PUSH (endbuf); |
| break; |
| |
| case '1': case '2': case '3': case '4': case '5': |
| case '6': case '7': case '8': case '9': |
| if (syntax & RE_NO_BK_REFS) |
| goto normal_char; |
| |
| c1 = c - '0'; |
| |
| if (c1 > regnum) |
| FREE_STACK_RETURN (REG_ESUBREG); |
| |
| /* Can't back reference to a subexpression if inside of it. */ |
| if (group_in_compile_stack (compile_stack, (regnum_t) c1)) |
| goto normal_char; |
| |
| laststart = b; |
| BUF_PUSH_2 (duplicate, c1); |
| break; |
| |
| |
| case '+': |
| case '?': |
| if (syntax & RE_BK_PLUS_QM) |
| goto handle_plus; |
| else |
| goto normal_backslash; |
| |
| default: |
| normal_backslash: |
| /* You might think it would be useful for \ to mean |
| not to translate; but if we don't translate it |
| it will never match anything. */ |
| c = TRANSLATE (c); |
| goto normal_char; |
| } |
| break; |
| |
| |
| default: |
| /* Expects the character in `c'. */ |
| normal_char: |
| |