/* * simple_regex.c * * Simle regex library. * * Copyright (C) 2004,2005 Martin Schlemmer * * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation version 2 of the License. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 675 Mass Ave, Cambridge, MA 02139, USA. * * $Header$ */ /* * Some notes: * * - This is a very simple regex library (read: return a match if some string * matches some regex). It is probably not POSIX (if there are a POSIX or * other standard) compatible. * * - I primarily wrote it to _not_ use glibc type regex functions, in case we * might want to use it in code that have to be linked agaist klibc, etc. * * - It really is not optimized in any way yet. * * - Supported operators are: * * '.', '?', '*', '+' - So called 'wildcards' * '[a-z]', '[^a-z]' - Basic 'lists'. Note that 'a-z' just specify that * it supports basic lists as well as sequences .. * The '^' is for an inverted list of course. * '^', '$' - The 'from start' and 'to end' operators. If these * are not used at the start ('^') or end ('$') of the * regex, they will be treated as normal characters * (this of course exclude the use of '^' in a 'list'). * * - If an invalid argument was passed, the functions returns 0 with * 'regex_data-match == 0' (no error with no match) rather than -1. It may * not be consistant with other practices, but I personally do not feel it is * a critical error for these types of functions, and there are debugging you * can enable to verify that there are no such issues. * * - __somefunction() is usually a helper function for somefunction(). I guess * recursion might be an alternative, but I try to avoid it. * * - In general if we are matching a 'wildcard' ('*', '+' or '?'), a 'word' * (read: some part of the regex that do not contain a 'wildcard' or 'list') * will have a greater 'weight' than the 'wildcard'. This means that we * will only continue to evaluate the 'wildcard' until the following 'word' * (if any) matches. Currently this do not hold true for a 'list' not * followed by a 'wildcard' - I might fix this in future. * */ #include #include #include #include #include "debug.h" #include "misc.h" #include "simple-regex.h" /* Macro to check if a regex_data_t pointer is valid */ #define CHECK_REGEX_DATA_P(_regex_data, _on_error) \ do { \ if ((NULL == _regex_data) || \ (NULL == _regex_data->data) || \ /* We do not check for this, as it might still \ * provide a match ('*' or '?' wildcard) */ \ /* (0 == strlen(_regex_data->data)) || */ \ (NULL == _regex_data->regex) || \ (0 == strlen(_regex_data->regex))) {\ DBG_MSG("Invalid argument passed!\n"); \ goto _on_error; \ } \ } while (0) size_t get_word(const char *regex, char **r_word); int match_word(regex_data_t *regex_data); size_t get_list_size(const char *regex); size_t get_list(const char *regex, char **r_list); int __match_list(regex_data_t *regex_data); int match_list(regex_data_t *regex_data); size_t get_wildcard(const char *regex, char *r_wildcard); int __match_wildcard(regex_data_t *regex_data, int (*match_func)(regex_data_t *regex_data), const char *regex); int match_wildcard(regex_data_t *regex_data); int __match(regex_data_t *regex_data); /* * Return values for match_* functions * * 0 - There was no error. If there was a match, regex_data->match * - will be > 0 (this is the definitive check - if not true, the * - other values of the struct may be bogus), regex_data->count * - will be the amount of data that was matched (might be 0 for * - some wildcards), and regex_data->r_count will be > 0. * * -1 - An error occured. Check errno for more info. * */ size_t get_word(const char *regex, char **r_word) { char *r_list; char *tmp_p; size_t count = 0; size_t tmp_count; /* NULL string means we do not have a word */ if ((NULL == regex) || (0 == strlen(regex))) { DBG_MSG("Invalid argument passed!\n"); return 0; } *r_word = malloc(strlen(regex) + 1); if (NULL == r_word) { DBG_MSG("Failed to allocate buffer!\n"); return 0; } tmp_p = *r_word; while (strlen(regex) > 0) { switch (regex[0]) { case '*': case '+': case '?': /* If its a wildcard, backup one step */ *--tmp_p = '\0'; count--; return count; case '[': tmp_count = get_list(regex, &r_list); free(r_list); /* In theory should not happen, but you never know * what may happen in future ... */ if (-1 == tmp_count) goto error; /* Bail if we have a list */ if (tmp_count > 0) { tmp_p[0] = '\0'; return count; } default: *tmp_p++ = *regex++; count++; break; } } tmp_p[0] = '\0'; return count; error: free(*r_word); return -1; } int match_word(regex_data_t *regex_data) { char *data_p = regex_data->data; char *r_word = NULL, *r_word_p; size_t count = 0; CHECK_REGEX_DATA_P(regex_data, exit); count = get_word(regex_data->regex, &r_word); if (-1 == count) goto error; if (0 == count) goto exit; r_word_p = r_word; while ((strlen(data_p) > 0) && (strlen(r_word_p) > 0 )) { /* If 'r_word' is not 100% part of 'string', we do not have * a match. If its a '.', it matches no matter what. */ if ((data_p[0] != r_word_p[0]) && (r_word_p[0] != '.')) { count = 0; goto exit; } data_p++; r_word_p++; } /* If 'string' is shorter than 'r_word', we do not have a match */ if ((0 == strlen(data_p)) && (0 < strlen(r_word_p))) { count = 0; goto exit; } exit: /* Fill in our structure */ if (0 == count) regex_data->match = REGEX_NO_MATCH; else if (strlen(regex_data->data) == count) regex_data->match = REGEX_FULL_MATCH; else regex_data->match = REGEX_PARTIAL_MATCH; if (regex_data->match != REGEX_NO_MATCH) regex_data->where = regex_data->data; else regex_data->where = NULL; regex_data->count = count; regex_data->r_count = count; free(r_word); return 0; error: regex_data->match = REGEX_NO_MATCH; free(r_word); return -1; } size_t get_list_size(const char *regex) { size_t count = 0; /* NULL string means we do not have a list */ if ((NULL == regex) || (0 == strlen(regex)) || (regex[0] != '[')) { DBG_MSG("Invalid argument passed!\n"); return 0; } regex++; while ((strlen(regex) > 0) && (regex[0] != ']')) { /* We have a sequence (x-y) */ if ((regex[0] == '-') && (regex[1] != ']') && (strlen(regex) >= 2) && (regex[-1] < regex[1])) { /* Add current + diff in sequence */ count += regex[1] - regex[-1]; /* Take care of '-' and next char */ regex += 2; } else { regex++; count++; } } return count; } size_t get_list(const char *regex, char **r_list) { char *tmp_buf = NULL; size_t count = 0; size_t size; /* NULL string means we do not have a list */ if ((NULL == regex) || (0 == strlen(regex))) { DBG_MSG("Invalid argument passed!\n"); return 0; } /* Bail if we do not have a list. Do not add debugging, as * it is very noisy (used a lot when we call match_list() in * __match() and match() to test for list matching) */ if (regex[0] != '[') return 0; size = get_list_size(regex); if (0 == size) { /* Should not be an issue, but just in case */ DBG_MSG("0 returned by get_list_size.\n"); return 0; } *r_list = malloc(size + 1); if (NULL == *r_list) { DBG_MSG("Failed to allocate buffer!\n"); return -1; } tmp_buf = *r_list; /* Take care of '[' */ regex++; count++; while ((strlen(regex) > 0) && (regex[0] != ']')) { /* We have a sequence (x-y) */ if ((regex[0] == '-') && (regex[1] != ']') && (strlen(regex) >= 2) && (regex[-1] < regex[1])) { /* Fill in missing chars in sequence */ while (tmp_buf[-1] < regex[1]) { tmp_buf[0] = (char)(tmp_buf[-1] + 1); tmp_buf++; /* We do not increase count */ } /* Take care of '-' and next char */ count += 2; regex += 2; } else { *tmp_buf++ = *regex++; count++; } } tmp_buf[0] = '\0'; /* Take care of ']' */ count++; /* We do not have a list as it does not end in ']' */ if (regex[0] != ']') { count = 0; free(*r_list); } return count; } /* If the first is the '^' character, everything but the list is matched * NOTE: We only evaluate _ONE_ data character at a time!! */ int __match_list(regex_data_t *regex_data) { regex_data_t tmp_data; char *data_p = regex_data->data; char *list_p = regex_data->regex; char test_regex[2] = { '\0', '\0' }; int invert = 0; int match; int retval; CHECK_REGEX_DATA_P(regex_data, failed); if (list_p[0] == '^') { /* We need to invert the match */ invert = 1; /* Make sure '^' is not part of our list */ list_p++; } if (invert) /* All should be a match if not in the list */ match = 1; else /* We only have a match if in the list */ match = 0; while (strlen(list_p) > 0) { test_regex[0] = list_p[0]; FILL_REGEX_DATA(tmp_data, data_p, test_regex); retval = match_word(&tmp_data); if (-1 == retval) goto error; if (REGEX_MATCH(tmp_data)) { if (invert) /* If we exclude the list from * characters we try to match, we * have a match until one of the * list is found. */ match = 0; else /* If not, we have to keep looking * until one from the list match * before we have a match */ match = 1; break; } list_p++; } /* Fill in our structure */ if (match) { regex_data->match = REGEX_PARTIAL_MATCH; regex_data->where = regex_data->data; regex_data->count = 1; /* This one is more cosmetic, as match_list() will * do the right thing */ regex_data->r_count = 0; /* strlen(regex_data->regex); */ } else { failed: regex_data->match = REGEX_NO_MATCH; regex_data->where = NULL; regex_data->count = 0; regex_data->r_count = 0; } return 0; error: regex_data->match = REGEX_NO_MATCH; return -1; } int match_list(regex_data_t *regex_data) { regex_data_t tmp_data; char *data_p = regex_data->data; char *list_p = regex_data->regex; char *r_list = NULL; size_t r_count = 0; int retval; CHECK_REGEX_DATA_P(regex_data, failed); r_count = get_list(list_p, &r_list); if (-1 == r_count) goto error; if (0 == r_count) goto failed; FILL_REGEX_DATA(tmp_data, data_p, &list_p[r_count-1]); retval = __match_wildcard(&tmp_data, __match_list, r_list); if (-1 == retval) goto error; if (REGEX_MATCH(tmp_data)) { /* This should be 2 ('word' + 'wildcard'), so just remove * the wildcard */ tmp_data.r_count--; goto exit; } FILL_REGEX_DATA(tmp_data, data_p, r_list); retval = __match_list(&tmp_data); if (-1 == retval) goto error; if (REGEX_MATCH(tmp_data)) goto exit; failed: /* We will fill in regex_data below */ tmp_data.match = REGEX_NO_MATCH; tmp_data.where = NULL; tmp_data.count = 0; tmp_data.r_count = 0; exit: /* Fill in our structure */ regex_data->match = tmp_data.match; regex_data->where = tmp_data.where; regex_data->count = tmp_data.count; if (regex_data->match != REGEX_NO_MATCH) /* tmp_data.r_count for __match_wildcard will take care of the * wildcard, and tmp_data.r_count for __match_list will be 0 */ regex_data->r_count = r_count + tmp_data.r_count; else regex_data->r_count = 0; free(r_list); return 0; error: regex_data->match = REGEX_NO_MATCH; free(r_list); return -1; } size_t get_wildcard(const char *regex, char *r_wildcard) { /* NULL regex means we do not have a wildcard */ if ((NULL == regex) || (0 == strlen(regex))) { DBG_MSG("Invalid argument passed!\n"); return 0; } r_wildcard[0] = regex[0]; r_wildcard[2] = '\0'; switch (regex[1]) { case '*': case '+': case '?': r_wildcard[1] = regex[1]; break; default: r_wildcard[0] = '\0'; return 0; } return strlen(r_wildcard); } int __match_wildcard(regex_data_t *regex_data, int (*match_func)(regex_data_t *regex_data), const char *regex) { regex_data_t tmp_data; char *data_p = regex_data->data; char *wildcard_p = regex_data->regex; char r_wildcard[3]; size_t count = 0; size_t r_count = 0; int is_match = 0; int retval; CHECK_REGEX_DATA_P(regex_data, exit); if (NULL == match_func) { DBG_MSG("NULL match_func was passed!\n"); goto exit; } r_count = get_wildcard(wildcard_p, r_wildcard); if (0 == r_count) goto exit; FILL_REGEX_DATA(tmp_data, data_p, (char *)regex); retval = match_func(&tmp_data); if (-1 == retval) goto error; switch (r_wildcard[1]) { case '*': case '?': /* '*' and '?' always matches */ is_match = 1; case '+': /* We need to match all of them */ do { /* If we have at least one match for '+', or none * for '*' or '?', check if we have a word or list match. * We do this because a word weights more than a wildcard */ if ((strlen(wildcard_p) > 2) && ((count > 0) || (r_wildcard[1] == '*') || (r_wildcard[1] == '?'))) { regex_data_t tmp_data2; #if 0 printf("data_p = %s, wildcard_p = %s\n", data_p, wildcard_p); #endif FILL_REGEX_DATA(tmp_data2, data_p, &wildcard_p[2]); retval = match(&tmp_data2); if (-1 == retval) goto error; if (/* '.' might be a special case ... */ /* (wildcard_p[2] != '.') && */ (REGEX_MATCH(tmp_data2) && (REGEX_FULL_MATCH == tmp_data2.match))) { goto exit; } } if (REGEX_MATCH(tmp_data)) { data_p += tmp_data.count; count += tmp_data.count; is_match = 1; FILL_REGEX_DATA(tmp_data, data_p, (char *)regex); retval = match_func(&tmp_data); if (-1 == retval) goto error; } /* Only once for '?' */ } while ((REGEX_MATCH(tmp_data)) && (r_wildcard[1] != '?')); break; default: /* No wildcard */ break; } exit: /* Fill in our structure */ /* We can still have a match ('*' and '?'), although count == 0 */ if ((0 == count) && (0 == is_match)) regex_data->match = REGEX_NO_MATCH; else if (strlen(regex_data->data) == count) regex_data->match = REGEX_FULL_MATCH; else regex_data->match = REGEX_PARTIAL_MATCH; if (regex_data->match != REGEX_NO_MATCH) regex_data->where = regex_data->data; else regex_data->where = NULL; regex_data->count = count; regex_data->r_count = r_count; return 0; error: regex_data->match = REGEX_NO_MATCH; return -1; } int match_wildcard(regex_data_t *regex_data) { regex_data_t tmp_data; char *data_p = regex_data->data; char *wildcard_p = regex_data->regex; char r_wildcard[3]; size_t r_count; int retval; CHECK_REGEX_DATA_P(regex_data, failed); /* Invalid wildcard - we need a character + a regex operator */ if (strlen(wildcard_p) < 2) goto failed; r_count = get_wildcard(wildcard_p, r_wildcard); if (0 == r_count) goto failed; /* Needed so that match_word() will not bail if it sees the wildcard */ r_wildcard[1] = '\0'; FILL_REGEX_DATA(tmp_data, data_p, wildcard_p); retval = __match_wildcard(&tmp_data, match_word, r_wildcard); if (-1 == retval) goto error; if (REGEX_MATCH(tmp_data)) goto exit; failed: /* We will fill in regex_data below */ tmp_data.match = REGEX_NO_MATCH; tmp_data.where = NULL; tmp_data.count = 0; tmp_data.r_count = 0; exit: /* Fill in our structure */ regex_data->match = tmp_data.match; regex_data->where = tmp_data.where; regex_data->count = tmp_data.count; regex_data->r_count = tmp_data.r_count; return 0; error: regex_data->match = REGEX_NO_MATCH; return -1; } int __match(regex_data_t *regex_data) { regex_data_t tmp_data; char *data_p = regex_data->data; char *regex_p = regex_data->regex; size_t count = 0; size_t r_count = 0; int match = 0; int retval; CHECK_REGEX_DATA_P(regex_data, failed); while (strlen(regex_p) > 0) { #if 0 printf("data_p = '%s', regex_p = '%s'\n", data_p, regex_p); #endif FILL_REGEX_DATA(tmp_data, data_p, regex_p); retval = match_list(&tmp_data); if (-1 == retval) goto error; if (REGEX_MATCH(tmp_data)) goto match; FILL_REGEX_DATA(tmp_data, data_p, regex_p); retval = match_wildcard(&tmp_data); if (-1 == retval) goto error; if (REGEX_MATCH(tmp_data)) goto match; FILL_REGEX_DATA(tmp_data, data_p, regex_p); retval = match_word(&tmp_data); if (-1 == retval) goto error; if (REGEX_MATCH(tmp_data)) goto match; break; match: data_p += tmp_data.count; count += tmp_data.count; regex_p += tmp_data.r_count; r_count += tmp_data.r_count; match = 1; /* Check that we do not go out of bounds */ if (((data_p - regex_data->data) > strlen(regex_data->data)) || ((regex_p - regex_data->regex) > strlen(regex_data->regex))) goto failed; } /* We could not match the whole regex (data too short?) */ if (0 != strlen(regex_p)) goto failed; goto exit; failed: /* We will fill in regex_data below */ count = 0; r_count = 0; match = 0; exit: /* Fill in our structure */ /* We can still have a match ('*' and '?'), although count == 0 */ if ((0 == count) && (0 == match)) regex_data->match = REGEX_NO_MATCH; else if (strlen(regex_data->data) == count) regex_data->match = REGEX_FULL_MATCH; else regex_data->match = REGEX_PARTIAL_MATCH; if (regex_data->match != REGEX_NO_MATCH) regex_data->where = regex_data->data; else regex_data->where = NULL; regex_data->count = count; regex_data->r_count = r_count; return 0; error: regex_data->match = REGEX_NO_MATCH; return -1; } int match(regex_data_t *regex_data) { regex_data_t tmp_data; char *data_p = regex_data->data; char *regex_p; char *tmp_buf = NULL; int from_start = 0; int to_end = 0; int retval; CHECK_REGEX_DATA_P(regex_data, failed); /* We might be modifying regex_p, so make a copy */ tmp_buf = strndup(regex_data->regex, strlen(regex_data->regex)); if (NULL == tmp_buf) { DBG_MSG("Failed to allocate temporary buffer!\n"); goto error; } regex_p = tmp_buf; /* Should we only match from the start? */ if (regex_p[0] == '^') { regex_p++; from_start = 1; } /* Should we match up to the end? */ if (regex_p[strlen(regex_p) - 1] == '$') { regex_p[strlen(regex_p) - 1] = '\0'; to_end = 1; } do { FILL_REGEX_DATA(tmp_data, data_p, regex_p); retval = __match(&tmp_data); if (-1 == retval) goto error; } while ((strlen(data_p++) > 0) && (!REGEX_MATCH(tmp_data)) && (0 == from_start)); /* Compensate for above extra inc */ data_p--; /* Fill in our structure */ if (REGEX_MATCH(tmp_data)) { /* Check if we had an '$' at the end of the regex, and * verify that we still have a match */ if ((1 == to_end) && (tmp_data.count != strlen(data_p))) { goto failed; } if ((data_p == regex_data->data) && (tmp_data.match == REGEX_FULL_MATCH)) regex_data->match = REGEX_FULL_MATCH; else regex_data->match = REGEX_PARTIAL_MATCH; regex_data->where = data_p; regex_data->count = tmp_data.count; regex_data->r_count = tmp_data.r_count; if (1 == from_start) regex_data->r_count++; if (1 == to_end) regex_data->r_count++; } else { failed: regex_data->match = REGEX_NO_MATCH; regex_data->where = NULL; regex_data->count = 0; regex_data->r_count = 0; } free(tmp_buf); return 0; error: regex_data->match = REGEX_NO_MATCH; free(tmp_buf); return -1; } #if 0 int main() { regex_data_t tmp_data; FILE *rcscript; char regex[] = "^[ \t]*[d-p]+d[ \t]*(+)[ \t]*{$"; char tempstr[255]; int retval; rcscript = fopen("acpid", "r"); if (NULL == rcscript) { printf("%s", "Error opening file!"); return 1; } while (0 != fgets(tempstr, 254, rcscript)) { if (tempstr[strlen(tempstr) - 1] == '\n') tempstr[strlen(tempstr) - 1] = '\0'; FILL_REGEX_DATA(tmp_data, tempstr, regex); retval = match(&tmp_data); if (-1 != retval) { if (REGEX_MATCH(tmp_data)) { printf("*** string = '%s' ***\n", tempstr); printf("*** regex = '%s' ***\n", regex); if (REGEX_FULL_MATCH == tmp_data.match) printf("match (full): '%s', %i\n", tmp_data.where, tmp_data.count); else printf("match: '%s', %i\n", tmp_data.where, tmp_data.count); } else { printf("%s", "No match\n"); } } else { printf("%s", "Error during match\n"); } } fclose(rcscript); return 0; } #endif