| /* |
| * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved. |
| * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. |
| * |
| * This file is part of the device-mapper userspace tools. |
| * |
| * This copyrighted material is made available to anyone wishing to use, |
| * modify, copy, or redistribute it subject to the terms and conditions |
| * of the GNU Lesser General Public License v.2.1. |
| * |
| * You should have received a copy of the GNU Lesser General Public License |
| * along with this program; if not, write to the Free Software Foundation, |
| * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| */ |
| |
| #include "dmlib.h" |
| #include "parse_rx.h" |
| |
| #ifdef DEBUG |
| #include <ctype.h> |
| |
| static void _regex_print(struct rx_node *rx, int depth, unsigned show_nodes) |
| { |
| int i, numchars; |
| |
| if (rx->left) { |
| if (rx->left->type != CHARSET && (show_nodes || (!((rx->type == CAT || rx->type == OR) && rx->left->type == CAT)))) |
| printf("("); |
| |
| _regex_print(rx->left, depth + 1, show_nodes); |
| |
| if (rx->left->type != CHARSET && (show_nodes || (!((rx->type == CAT || rx->type == OR) && rx->left->type == CAT)))) |
| printf(")"); |
| } |
| |
| /* display info about the node */ |
| switch (rx->type) { |
| case CAT: |
| break; |
| |
| case OR: |
| printf("|"); |
| break; |
| |
| case STAR: |
| printf("*"); |
| break; |
| |
| case PLUS: |
| printf("+"); |
| break; |
| |
| case QUEST: |
| printf("?"); |
| break; |
| |
| case CHARSET: |
| numchars = 0; |
| for (i = 0; i < 256; i++) |
| if (dm_bit(rx->charset, i) && (isprint(i) || i == HAT_CHAR || i == DOLLAR_CHAR)) |
| numchars++; |
| if (numchars == 97) { |
| printf("."); |
| break; |
| } |
| if (numchars > 1) |
| printf("["); |
| for (i = 0; i < 256; i++) |
| if (dm_bit(rx->charset, i)) { |
| if (isprint(i)) |
| printf("%c", (char) i); |
| else if (i == HAT_CHAR) |
| printf("^"); |
| else if (i == DOLLAR_CHAR) |
| printf("$"); |
| } |
| if (numchars > 1) |
| printf("]"); |
| break; |
| |
| default: |
| fprintf(stderr, "Unknown type"); |
| } |
| |
| if (rx->right) { |
| if (rx->right->type != CHARSET && (show_nodes || (!(rx->type == CAT && rx->right->type == CAT) && rx->right->right))) |
| printf("("); |
| _regex_print(rx->right, depth + 1, show_nodes); |
| if (rx->right->type != CHARSET && (show_nodes || (!(rx->type == CAT && rx->right->type == CAT) && rx->right->right))) |
| printf(")"); |
| } |
| |
| if (!depth) |
| printf("\n"); |
| } |
| #endif /* DEBUG */ |
| |
| struct parse_sp { /* scratch pad for the parsing process */ |
| struct dm_pool *mem; |
| int type; /* token type, 0 indicates a charset */ |
| dm_bitset_t charset; /* The current charset */ |
| const char *cursor; /* where we are in the regex */ |
| const char *rx_end; /* 1pte for the expression being parsed */ |
| }; |
| |
| static struct rx_node *_or_term(struct parse_sp *ps); |
| |
| static void _single_char(struct parse_sp *ps, unsigned int c, const char *ptr) |
| { |
| ps->type = 0; |
| ps->cursor = ptr + 1; |
| dm_bit_clear_all(ps->charset); |
| dm_bit_set(ps->charset, c); |
| } |
| |
| /* |
| * Get the next token from the regular expression. |
| * Returns: 1 success, 0 end of input, -1 error. |
| */ |
| static int _rx_get_token(struct parse_sp *ps) |
| { |
| int neg = 0, range = 0; |
| char c, lc = 0; |
| const char *ptr = ps->cursor; |
| if (ptr == ps->rx_end) { /* end of input ? */ |
| ps->type = -1; |
| return 0; |
| } |
| |
| switch (*ptr) { |
| /* charsets and ncharsets */ |
| case '[': |
| ptr++; |
| if (*ptr == '^') { |
| dm_bit_set_all(ps->charset); |
| |
| /* never transition on zero */ |
| dm_bit_clear(ps->charset, 0); |
| neg = 1; |
| ptr++; |
| |
| } else |
| dm_bit_clear_all(ps->charset); |
| |
| while ((ptr < ps->rx_end) && (*ptr != ']')) { |
| if (*ptr == '\\') { |
| /* an escaped character */ |
| ptr++; |
| switch (*ptr) { |
| case 'n': |
| c = '\n'; |
| break; |
| case 'r': |
| c = '\r'; |
| break; |
| case 't': |
| c = '\t'; |
| break; |
| default: |
| c = *ptr; |
| } |
| } else if (*ptr == '-' && lc) { |
| /* we've got a range on our hands */ |
| range = 1; |
| ptr++; |
| if (ptr == ps->rx_end) { |
| log_error("Incomplete range" |
| "specification"); |
| return -1; |
| } |
| c = *ptr; |
| } else |
| c = *ptr; |
| |
| if (range) { |
| /* add lc - c into the bitset */ |
| if (lc > c) { |
| char tmp = c; |
| c = lc; |
| lc = tmp; |
| } |
| |
| for (; lc <= c; lc++) { |
| if (neg) |
| dm_bit_clear(ps->charset, lc); |
| else |
| dm_bit_set(ps->charset, lc); |
| } |
| range = 0; |
| } else { |
| /* add c into the bitset */ |
| if (neg) |
| dm_bit_clear(ps->charset, c); |
| else |
| dm_bit_set(ps->charset, c); |
| } |
| ptr++; |
| lc = c; |
| } |
| |
| if (ptr >= ps->rx_end) { |
| ps->type = -1; |
| return -1; |
| } |
| |
| ps->type = 0; |
| ps->cursor = ptr + 1; |
| break; |
| |
| /* These characters are special, we just return their ASCII |
| codes as the type. Sorted into ascending order to help the |
| compiler */ |
| case '(': |
| case ')': |
| case '*': |
| case '+': |
| case '?': |
| case '|': |
| ps->type = (int) *ptr; |
| ps->cursor = ptr + 1; |
| break; |
| |
| case '^': |
| _single_char(ps, HAT_CHAR, ptr); |
| break; |
| |
| case '$': |
| _single_char(ps, DOLLAR_CHAR, ptr); |
| break; |
| |
| case '.': |
| /* The 'all but newline' character set */ |
| ps->type = 0; |
| ps->cursor = ptr + 1; |
| dm_bit_set_all(ps->charset); |
| dm_bit_clear(ps->charset, (int) '\n'); |
| dm_bit_clear(ps->charset, (int) '\r'); |
| dm_bit_clear(ps->charset, 0); |
| break; |
| |
| case '\\': |
| /* escaped character */ |
| ptr++; |
| if (ptr >= ps->rx_end) { |
| log_error("Badly quoted character at end " |
| "of expression"); |
| ps->type = -1; |
| return -1; |
| } |
| |
| ps->type = 0; |
| ps->cursor = ptr + 1; |
| dm_bit_clear_all(ps->charset); |
| switch (*ptr) { |
| case 'n': |
| dm_bit_set(ps->charset, (int) '\n'); |
| break; |
| case 'r': |
| dm_bit_set(ps->charset, (int) '\r'); |
| break; |
| case 't': |
| dm_bit_set(ps->charset, (int) '\t'); |
| break; |
| default: |
| dm_bit_set(ps->charset, (int) *ptr); |
| } |
| break; |
| |
| default: |
| /* add a single character to the bitset */ |
| ps->type = 0; |
| ps->cursor = ptr + 1; |
| dm_bit_clear_all(ps->charset); |
| dm_bit_set(ps->charset, (int) (unsigned char) *ptr); |
| break; |
| } |
| |
| return 1; |
| } |
| |
| static struct rx_node *_node(struct dm_pool *mem, int type, |
| struct rx_node *l, struct rx_node *r) |
| { |
| struct rx_node *n = dm_pool_zalloc(mem, sizeof(*n)); |
| |
| if (n) { |
| if (type == CHARSET && !(n->charset = dm_bitset_create(mem, 256))) { |
| dm_pool_free(mem, n); |
| return NULL; |
| } |
| |
| n->type = type; |
| n->left = l; |
| n->right = r; |
| } |
| |
| return n; |
| } |
| |
| static struct rx_node *_term(struct parse_sp *ps) |
| { |
| struct rx_node *n; |
| |
| switch (ps->type) { |
| case 0: |
| if (!(n = _node(ps->mem, CHARSET, NULL, NULL))) |
| return_NULL; |
| |
| dm_bit_copy(n->charset, ps->charset); |
| _rx_get_token(ps); /* match charset */ |
| break; |
| |
| case '(': |
| _rx_get_token(ps); /* match '(' */ |
| n = _or_term(ps); |
| if (ps->type != ')') { |
| log_error("missing ')' in regular expression"); |
| return 0; |
| } |
| _rx_get_token(ps); /* match ')' */ |
| break; |
| |
| default: |
| n = 0; |
| } |
| |
| return n; |
| } |
| |
| static struct rx_node *_closure_term(struct parse_sp *ps) |
| { |
| struct rx_node *l, *n; |
| |
| if (!(l = _term(ps))) |
| return NULL; |
| |
| for (;;) { |
| switch (ps->type) { |
| case '*': |
| n = _node(ps->mem, STAR, l, NULL); |
| break; |
| |
| case '+': |
| n = _node(ps->mem, PLUS, l, NULL); |
| break; |
| |
| case '?': |
| n = _node(ps->mem, QUEST, l, NULL); |
| break; |
| |
| default: |
| return l; |
| } |
| |
| if (!n) |
| return_NULL; |
| |
| _rx_get_token(ps); |
| l = n; |
| } |
| |
| return n; |
| } |
| |
| static struct rx_node *_cat_term(struct parse_sp *ps) |
| { |
| struct rx_node *l, *r, *n; |
| |
| if (!(l = _closure_term(ps))) |
| return NULL; |
| |
| if (ps->type == '|') |
| return l; |
| |
| if (!(r = _cat_term(ps))) |
| return l; |
| |
| if (!(n = _node(ps->mem, CAT, l, r))) |
| stack; |
| |
| return n; |
| } |
| |
| static struct rx_node *_or_term(struct parse_sp *ps) |
| { |
| struct rx_node *l, *r, *n; |
| |
| if (!(l = _cat_term(ps))) |
| return NULL; |
| |
| if (ps->type != '|') |
| return l; |
| |
| _rx_get_token(ps); /* match '|' */ |
| |
| if (!(r = _or_term(ps))) { |
| log_error("Badly formed 'or' expression"); |
| return NULL; |
| } |
| |
| if (!(n = _node(ps->mem, OR, l, r))) |
| stack; |
| |
| return n; |
| } |
| |
| /*----------------------------------------------------------------*/ |
| |
| /* Macros for left and right nodes. Inverted if 'leftmost' is set. */ |
| #define LEFT(a) (leftmost ? (a)->left : (a)->right) |
| #define RIGHT(a) (leftmost ? (a)->right : (a)->left) |
| |
| /* |
| * The optimiser spots common prefixes on either side of an 'or' node, and |
| * lifts them outside the 'or' with a 'cat'. |
| */ |
| static unsigned _depth(struct rx_node *r, unsigned leftmost) |
| { |
| int count = 1; |
| |
| while (r->type != CHARSET && LEFT(r) && (leftmost || r->type != OR)) { |
| count++; |
| r = LEFT(r); |
| } |
| |
| return count; |
| } |
| |
| /* |
| * FIXME: a unique key could be built up as part of the parse, to make the |
| * comparison quick. Alternatively we could use cons-hashing, and then |
| * this would simply be a pointer comparison. |
| */ |
| static int _nodes_equal(struct rx_node *l, struct rx_node *r) |
| { |
| if (l->type != r->type) |
| return 0; |
| |
| switch (l->type) { |
| case CAT: |
| case OR: |
| return _nodes_equal(l->left, r->left) && |
| _nodes_equal(l->right, r->right); |
| |
| case STAR: |
| case PLUS: |
| case QUEST: |
| return _nodes_equal(l->left, r->left); |
| |
| case CHARSET: |
| /* |
| * Never change anything containing TARGET_TRANS |
| * used by matcher as boundary marker between concatenated |
| * expressions. |
| */ |
| return (!dm_bit(l->charset, TARGET_TRANS) && dm_bitset_equal(l->charset, r->charset)); |
| } |
| |
| /* NOTREACHED */ |
| return_0; |
| } |
| |
| static int _find_leftmost_common(struct rx_node *or, |
| struct rx_node **l, |
| struct rx_node **r, |
| unsigned leftmost) |
| { |
| struct rx_node *left = or->left, *right = or->right; |
| unsigned left_depth = _depth(left, leftmost); |
| unsigned right_depth = _depth(right, leftmost); |
| |
| while (left_depth > right_depth && left->type != OR) { |
| left = LEFT(left); |
| left_depth--; |
| } |
| |
| while (right_depth > left_depth && right->type != OR) { |
| right = LEFT(right); |
| right_depth--; |
| } |
| |
| if (left_depth != right_depth) |
| return 0; |
| |
| while (left_depth) { |
| if (left->type == CAT && right->type == CAT) { |
| if (_nodes_equal(LEFT(left), LEFT(right))) { |
| *l = left; |
| *r = right; |
| return 1; |
| } |
| } |
| if (left->type == OR || right->type == OR) |
| break; |
| left = LEFT(left); |
| right = LEFT(right); |
| left_depth--; |
| } |
| |
| return 0; |
| } |
| |
| /* If top node is OR, rotate (leftmost example) from ((ab)|((ac)|d)) to (((ab)|(ac))|d) */ |
| static int _rotate_ors(struct rx_node *r, unsigned leftmost) |
| { |
| struct rx_node *old_node; |
| |
| if (r->type != OR || RIGHT(r)->type != OR) |
| return 0; |
| |
| old_node = RIGHT(r); |
| |
| if (leftmost) { |
| r->right = RIGHT(old_node); |
| old_node->right = LEFT(old_node); |
| old_node->left = LEFT(r); |
| r->left = old_node; |
| } else { |
| r->left = RIGHT(old_node); |
| old_node->left = LEFT(old_node); |
| old_node->right = LEFT(r); |
| r->right = old_node; |
| } |
| |
| return 1; |
| } |
| |
| static struct rx_node *_exchange_nodes(struct dm_pool *mem, struct rx_node *r, |
| struct rx_node *left_cat, struct rx_node *right_cat, |
| unsigned leftmost) |
| { |
| struct rx_node *new_r; |
| |
| if (leftmost) |
| new_r = _node(mem, CAT, LEFT(left_cat), r); |
| else |
| new_r = _node(mem, CAT, r, LEFT(right_cat)); |
| |
| if (!new_r) |
| return_NULL; |
| |
| memcpy(left_cat, RIGHT(left_cat), sizeof(*left_cat)); |
| memcpy(right_cat, RIGHT(right_cat), sizeof(*right_cat)); |
| |
| return new_r; |
| } |
| |
| static struct rx_node *_pass(struct dm_pool *mem, |
| struct rx_node *r, |
| int *changed) |
| { |
| struct rx_node *left, *right; |
| |
| /* |
| * walk the tree, optimising every 'or' node. |
| */ |
| switch (r->type) { |
| case CAT: |
| if (!(r->left = _pass(mem, r->left, changed))) |
| return_NULL; |
| |
| if (!(r->right = _pass(mem, r->right, changed))) |
| return_NULL; |
| |
| break; |
| |
| case STAR: |
| case PLUS: |
| case QUEST: |
| if (!(r->left = _pass(mem, r->left, changed))) |
| return_NULL; |
| |
| break; |
| case OR: |
| /* It's important we optimise sub nodes first */ |
| if (!(r->left = _pass(mem, r->left, changed))) |
| return_NULL; |
| |
| if (!(r->right = _pass(mem, r->right, changed))) |
| return_NULL; |
| /* |
| * If rotate_ors changes the tree, left and right are stale, |
| * so just set 'changed' to repeat the search. |
| * |
| * FIXME Check we can't 'bounce' between left and right rotations here. |
| */ |
| if (_find_leftmost_common(r, &left, &right, 1)) { |
| if (!_rotate_ors(r, 1)) |
| r = _exchange_nodes(mem, r, left, right, 1); |
| *changed = 1; |
| } else if (_find_leftmost_common(r, &left, &right, 0)) { |
| if (!_rotate_ors(r, 0)) |
| r = _exchange_nodes(mem, r, left, right, 0); |
| *changed = 1; |
| } |
| break; |
| |
| case CHARSET: |
| break; |
| } |
| |
| return r; |
| } |
| |
| static struct rx_node *_optimise(struct dm_pool *mem, struct rx_node *r) |
| { |
| /* |
| * We're looking for (or (... (cat <foo> a)) (... (cat <foo> b))) |
| * and want to turn it into (cat <foo> (or (... a) (... b))) |
| * |
| * (fa)|(fb) becomes f(a|b) |
| */ |
| |
| /* |
| * Initially done as an inefficient multipass algorithm. |
| */ |
| int changed; |
| |
| do { |
| changed = 0; |
| r = _pass(mem, r, &changed); |
| } while (r && changed); |
| |
| return r; |
| } |
| |
| /*----------------------------------------------------------------*/ |
| |
| struct rx_node *rx_parse_tok(struct dm_pool *mem, |
| const char *begin, const char *end) |
| { |
| struct rx_node *r; |
| struct parse_sp *ps = dm_pool_zalloc(mem, sizeof(*ps)); |
| |
| if (!ps) |
| return_NULL; |
| |
| ps->mem = mem; |
| if (!(ps->charset = dm_bitset_create(mem, 256))) { |
| log_error("Regex charset allocation failed"); |
| dm_pool_free(mem, ps); |
| return NULL; |
| } |
| ps->cursor = begin; |
| ps->rx_end = end; |
| _rx_get_token(ps); /* load the first token */ |
| |
| if (!(r = _or_term(ps))) { |
| log_error("Parse error in regex"); |
| dm_pool_free(mem, ps); |
| return NULL; |
| } |
| |
| if (!(r = _optimise(mem, r))) { |
| log_error("Regex optimisation error"); |
| dm_pool_free(mem, ps); |
| return NULL; |
| } |
| |
| return r; |
| } |
| |
| struct rx_node *rx_parse_str(struct dm_pool *mem, const char *str) |
| { |
| return rx_parse_tok(mem, str, str + strlen(str)); |
| } |