/* * regutf8.c: UTF-8 regexp routines. * * Author: * Morten Welinder (terra@gnome.org) */ #include #include "regutf8.h" #include "goffice/cut-n-paste/pcre/pcre.h" #include "go-glib-extras.h" #include #include #include /* ------------------------------------------------------------------------- */ void go_regfree (GORegexp *gor) { if (gor->ppcre) { pcre_free (gor->ppcre); gor->ppcre = NULL; } } size_t go_regerror (int errcode, const GORegexp *gor, char *dst, size_t dstsize) { const char *err; size_t errlen; switch (errcode) { case REG_NOERROR: err = "?"; break; case REG_NOMATCH: err = _("Pattern not found."); break; default: case REG_BADPAT: err = _("Invalid pattern."); break; case REG_ECOLLATE: err = _("Invalid collating element."); break; case REG_ECTYPE: err = _("Invalid character class name."); break; case REG_EESCAPE: err = _("Trailing backslash."); break; case REG_ESUBREG: err = _("Invalid back reference."); break; case REG_EBRACK: err = _("Unmatched left bracket."); break; case REG_EPAREN: err = _("Parenthesis imbalance."); break; case REG_EBRACE: err = _("Unmatched \\{."); break; case REG_BADBR: err = _("Invalid contents of \\{\\}."); break; case REG_ERANGE: err = _("Invalid range end."); break; case REG_ESPACE: err = _("Out of memory."); break; case REG_BADRPT: err = _("Invalid repetition operator."); break; case REG_EEND: err = _("Premature end of pattern."); break; case REG_ESIZE: err = _("Pattern is too big."); break; case REG_ERPAREN: err = _("Unmatched ) or \\)"); break; }; errlen = strlen (err); if (dstsize > 0) { size_t copylen = MIN (errlen, dstsize - 1); memcpy (dst, err, copylen); dst[copylen] = 0; } return errlen; } int go_regcomp (GORegexp *gor, const char *pat , int cflags) { const char *errorptr; int errorofs, errorcode; pcre *r; int coptions = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | ((cflags & REG_ICASE) ? PCRE_CASELESS : 0) | ((cflags & REG_NEWLINE) ? PCRE_MULTILINE : 0); gor->ppcre = r = pcre_compile2 (pat, coptions, &errorcode, &errorptr, &errorofs, NULL); if (r == NULL) { switch (errorcode) { case 1: case 2: case 3: case 37: return REG_EESCAPE; case 4: case 5: return REG_EBRACE; case 6: return REG_EBRACK; case 7: case 30: return REG_ECTYPE; case 8: return REG_ERANGE; case 9: case 10: return REG_BADRPT; case 14: case 18: case 22: return REG_EPAREN; case 15: return REG_ESUBREG; case 19: case 20: return REG_ESIZE; case 21: return REG_ESPACE; default: return REG_BADPAT; } } else { gor->re_nsub = pcre_info (r, NULL, NULL); gor->nosub = (cflags & REG_NOSUB) != 0; return 0; } } int go_regexec (const GORegexp *gor, const char *txt, size_t nmatch, GORegmatch *pmatch, int eflags) { size_t txtlen = strlen (txt); int eoptions = ((eflags & REG_NOTBOL) ? PCRE_NOTBOL : 0) | ((eflags & REG_NOTEOL) ? PCRE_NOTEOL : 0); int res; int *offsets, *allocated; int offsetcount; /* We need to totally ignore nmatch and pmatch in this case. */ if (gor->nosub) nmatch = 0; if (nmatch > 0) { /* Paranoia. */ if (nmatch >= G_MAXINT / sizeof (int) / 3) return REG_ESPACE; offsetcount = nmatch * 3; /* FIXME: we really want g_try_new but that is recent. */ offsets = allocated = g_new (int, offsetcount); if (!offsets) return REG_ESPACE; } else { offsets = allocated = NULL; offsetcount = 0; } res = pcre_exec (gor->ppcre, NULL, txt, txtlen, 0, eoptions, offsets, offsetcount); if (res >= 0) { int i; if (res == 0) res = nmatch; for (i = 0; i < res; i++) { pmatch[i].rm_so = offsets[i * 2]; pmatch[i].rm_eo = offsets[i * 2 + 1]; } for (; i < (int)nmatch; i++) { pmatch[i].rm_so = -1; pmatch[i].rm_eo = -1; } g_free (allocated); return REG_NOERROR; } g_free (allocated); switch (res) { case PCRE_ERROR_NOMATCH: return REG_NOMATCH; case PCRE_ERROR_BADUTF8: case PCRE_ERROR_BADUTF8_OFFSET: /* POSIX doesn't seem to foresee this kind of error. */ return REG_BADPAT; default: return REG_ESPACE; } } /* ------------------------------------------------------------------------- */ static GObjectClass *parent_class; enum { PROP_0, PROP_SEARCH_TEXT, PROP_REPLACE_TEXT, PROP_IS_REGEXP, PROP_IGNORE_CASE, PROP_PRESERVE_CASE, PROP_MATCH_WORDS }; /* ------------------------------------------------------------------------- */ GQuark go_search_replace_error_quark (void) { static GQuark q = 0; if (q == 0) q = g_quark_from_static_string ("go-search-replace-error-quark"); return q; } /* ------------------------------------------------------------------------- */ static void kill_compiled (GoSearchReplace *sr) { if (sr->comp_search) { go_regfree (sr->comp_search); g_free (sr->comp_search); sr->comp_search = NULL; } } /* ------------------------------------------------------------------------- */ static int go_search_replace_compile (GoSearchReplace *sr) { const char *pattern; char *tmp; int flags = 0; int res; g_return_val_if_fail (sr && sr->search_text, REG_EEND); kill_compiled (sr); if (sr->is_regexp) { pattern = sr->search_text; tmp = NULL; sr->plain_replace = (sr->replace_text && g_utf8_strchr (sr->replace_text, -1, '$') == 0 && g_utf8_strchr (sr->replace_text, -1, '\\') == 0); } else { /* * Create a regular expression equivalent to the search * string. (Thus hoping the regular expression search * routines are pretty good.) */ GString *regexp = g_string_new (NULL); go_regexp_quote (regexp, sr->search_text); pattern = tmp = g_string_free (regexp, FALSE); sr->plain_replace = TRUE; } if (sr->ignore_case) flags |= REG_ICASE; sr->comp_search = g_new0 (GORegexp, 1); res = go_regcomp (sr->comp_search, pattern, flags); g_free (tmp); return res; } /* ------------------------------------------------------------------------- */ /** * go_search_replace_verify: * @sr: Search-and-Replace info to be checked * @repl: Check replacement part too. * @err: Location to store error message. * * Checks that validity of the search-and-replace data and returns TRUE * on success. **/ gboolean go_search_replace_verify (GoSearchReplace *sr, gboolean repl, GError **err) { int comp_err; g_return_val_if_fail (sr != NULL, err ? ((*err = NULL), FALSE) : FALSE); if (!sr->search_text || sr->search_text[0] == 0) { if (err) g_set_error (err, go_search_replace_error_quark (), 0, _("Search string must not be empty.")); return FALSE; } if (repl && !sr->replace_text) { if (err) g_set_error (err, go_search_replace_error_quark (), 0, _("Search string must not be empty.")); return FALSE; } comp_err = go_search_replace_compile (sr); if (comp_err) { if (err) { char buf[500]; go_regerror (comp_err, sr->comp_search, buf, sizeof (buf)); g_set_error (err, go_search_replace_error_quark (), 0, _("Invalid search pattern (%s)"), buf); } return FALSE; } if (repl && !sr->plain_replace) { const char *s; for (s = sr->replace_text; *s; s = g_utf8_next_char (s)) { switch (*s) { case '$': s++; switch (*s) { case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if ((*s - '0') <= (int)sr->comp_search->re_nsub) break; /* Fall through */ default: if (err) g_set_error (err, go_search_replace_error_quark (), 0, _("Invalid $-specification in replacement.")); return FALSE; } break; case '\\': if (s[1] == 0) { if (err) g_set_error (err, go_search_replace_error_quark (), 0, _("Invalid trailing backslash in replacement.")); return FALSE; } s++; break; } } } return TRUE; } /* ------------------------------------------------------------------------- */ /* * Quote a single UTF-8 encoded character from s into target and return the * location of the next character in s. */ const char * go_regexp_quote1 (GString *target, const char *s) { g_return_val_if_fail (target != NULL, NULL); g_return_val_if_fail (s != NULL, NULL); switch (*s) { case '.': case '[': case '\\': case '*': case '+': case '{': case '?': case '^': case '$': case '(': case '|': case ')': g_string_append_c (target, '\\'); g_string_append_c (target, *s); return s + 1; case 0: return s; default: do { g_string_append_c (target, *s); s++; } while ((*s & 0xc0) == 0x80); return s; } } /* ------------------------------------------------------------------------- */ /* * Regexp quote a UTF-8 string. */ void go_regexp_quote (GString *target, const char *s) { g_return_if_fail (target != NULL); g_return_if_fail (s != NULL); while (*s) s = go_regexp_quote1 (target, s); } /* ------------------------------------------------------------------------- */ static gboolean match_is_word (const char *src, const GORegmatch *pm, gboolean bolp) { /* The empty string is not a word. */ if (pm->rm_so == pm->rm_eo) return FALSE; if (pm->rm_so > 0 || !bolp) { /* We get here when something actually preceded the match. */ gunichar c_pre = g_utf8_get_char (g_utf8_prev_char (src + pm->rm_so)); if (g_unichar_isalnum (c_pre)) return FALSE; } { gunichar c_post = g_utf8_get_char (src + pm->rm_eo); if (c_post != 0 && g_unichar_isalnum (c_post)) return FALSE; } return TRUE; } /* ------------------------------------------------------------------------- */ typedef enum { SC_Upper, /* At least one letter. No lower case. */ SC_Capital, /* Something Like: This */ SC_Other } SearchCase; static SearchCase inspect_case (const char *p, const char *pend) { gboolean is_upper = TRUE; gboolean is_capital = TRUE; gboolean has_letter = FALSE; gboolean expect_upper = TRUE; for (; p < pend; p = g_utf8_next_char (p)) { gunichar c = g_utf8_get_char (p); if (g_unichar_isalpha (c)) { has_letter = TRUE; if (!g_unichar_isupper (c)) { is_upper = FALSE; } if (expect_upper ? !g_unichar_isupper (c) : !g_unichar_islower (c)) { is_capital = FALSE; } expect_upper = FALSE; } else expect_upper = TRUE; } if (has_letter) { if (is_upper) return SC_Upper; if (is_capital) return SC_Capital; } return SC_Other; } static char * calculate_replacement (GoSearchReplace *sr, const char *src, const GORegmatch *pm) { char *res; if (sr->plain_replace) { res = g_strdup (sr->replace_text); } else { const char *s; GString *gres = g_string_sized_new (strlen (sr->replace_text)); for (s = sr->replace_text; *s; s = g_utf8_next_char (s)) { switch (*s) { case '$': { int n = s[1] - '0'; s++; g_assert (n > 0 && n <= (int)sr->comp_search->re_nsub); g_string_append_len (gres, src + pm[n].rm_so, pm[n].rm_eo - pm[n].rm_so); break; } case '\\': s++; g_assert (*s != 0); g_string_append_unichar (gres, g_utf8_get_char (s)); break; default: g_string_append_unichar (gres, g_utf8_get_char (s)); break; } } res = gres->str; g_string_free (gres, FALSE); } /* * Try to preserve the case during replacement, i.e., do the * following substitutions: * * search -> replace * Search -> Replace * SEARCH -> REPLACE * TheSearch -> TheReplace */ if (sr->preserve_case) { SearchCase sc = inspect_case (src + pm->rm_so, src + pm->rm_eo); switch (sc) { case SC_Upper: { char *newres = g_utf8_strup (res, -1); g_free (res); res = newres; break; } case SC_Capital: { char *newres = go_utf8_strcapital (res, -1); g_free (res); res = newres; break; } case SC_Other: break; #ifndef DEBUG_SWITCH_ENUM default: g_assert_not_reached (); #endif } } return res; } /* ------------------------------------------------------------------------- */ gboolean go_search_match_string (GoSearchReplace *sr, const char *src) { int flags = 0; g_return_val_if_fail (sr, FALSE); if (!sr->comp_search) { go_search_replace_compile (sr); g_return_val_if_fail (sr->comp_search, FALSE); } while (1) { GORegmatch match; int ret = go_regexec (sr->comp_search, src, 1, &match, flags); switch (ret) { case 0: if (!sr->match_words) return TRUE; if (match_is_word (src, &match, (flags & REG_NOTBOL) != 0)) return TRUE; /* * We had a match, but it's not a word. Pretend we saw * a one-character match and continue after that. */ flags |= REG_NOTBOL; src = g_utf8_next_char (src + match.rm_so); break; case REG_NOMATCH: return FALSE; default: g_error ("Unexpected error code from regexec: %d.", ret); return FALSE; } } } /* ------------------------------------------------------------------------- */ /* * Returns NULL if nothing changed, or a g_malloc string otherwise. */ char * go_search_replace_string (GoSearchReplace *sr, const char *src) { int nmatch; GORegmatch *pmatch; GString *res = NULL; int ret; int flags = 0; g_return_val_if_fail (sr, NULL); g_return_val_if_fail (sr->replace_text, NULL); if (!sr->comp_search) { go_search_replace_compile (sr); g_return_val_if_fail (sr->comp_search, NULL); } nmatch = 1 + sr->comp_search->re_nsub; pmatch = g_new (GORegmatch, nmatch); while ((ret = go_regexec (sr->comp_search, src, nmatch, pmatch, flags)) == 0) { if (!res) { /* The size here is a bit arbitrary. */ int size = strlen (src) + 10 * strlen (sr->replace_text); res = g_string_sized_new (size); } if (pmatch[0].rm_so > 0) { g_string_append_len (res, src, pmatch[0].rm_so); } if (sr->match_words && !match_is_word (src, pmatch, (flags & REG_NOTBOL) != 0)) { /* We saw a fake match. */ if (pmatch[0].rm_so < pmatch[0].rm_eo) { const char *p = src + pmatch[0].rm_so; gunichar c = g_utf8_get_char (p); g_string_append_unichar (res, c); /* Pretend we saw a one-character match. */ pmatch[0].rm_eo = pmatch[0].rm_so + (g_utf8_next_char (p) - p); } } else { char *replacement = calculate_replacement (sr, src, pmatch); g_string_append (res, replacement); g_free (replacement); if (src[pmatch[0].rm_eo] == 0) { /* * We matched and replaced the last character * of the string. Do not continue as we might * then match the empty string at the end and * re-add the replacement. This would happen, * for example, if you searched for ".*". */ src = ""; break; } } if (pmatch[0].rm_eo > 0) { src += pmatch[0].rm_eo; flags |= REG_NOTBOL; } if (pmatch[0].rm_so == pmatch[0].rm_eo) { /* * We have matched a null string at the current point. * This might happen searching for just an anchor, for * example. Don't loop forever... */ break; } } g_free (pmatch); if (res) { if (*src) g_string_append (res, src); return g_string_free (res, FALSE); } else { return NULL; } } /* ------------------------------------------------------------------------- */ static void go_search_replace_init (GObject *obj) { } /* ------------------------------------------------------------------------- */ static void go_search_replace_finalize (GObject *obj) { GoSearchReplace *sr = (GoSearchReplace *)obj; kill_compiled (sr); g_free (sr->search_text); g_free (sr->replace_text); G_OBJECT_CLASS (parent_class)->finalize (obj); } /* ------------------------------------------------------------------------- */ static void go_search_replace_get_property (GObject *object, guint property_id, GValue *value, GParamSpec *pspec) { GoSearchReplace *sr = (GoSearchReplace *)object; switch (property_id) { case PROP_SEARCH_TEXT: g_value_set_string (value, sr->search_text); break; case PROP_REPLACE_TEXT: g_value_set_string (value, sr->replace_text); break; case PROP_IS_REGEXP: g_value_set_boolean (value, sr->is_regexp); break; case PROP_IGNORE_CASE: g_value_set_boolean (value, sr->ignore_case); break; case PROP_PRESERVE_CASE: g_value_set_boolean (value, sr->preserve_case); break; case PROP_MATCH_WORDS: g_value_set_boolean (value, sr->match_words); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID (object, property_id, pspec); break; } } /* ------------------------------------------------------------------------- */ static void go_search_replace_set_search_text (GoSearchReplace *sr, const char *text) { char *text_copy = g_strdup (text); g_free (sr->search_text); sr->search_text = text_copy; kill_compiled (sr); } static void go_search_replace_set_replace_text (GoSearchReplace *sr, const char *text) { char *text_copy = g_strdup (text); g_free (sr->replace_text); sr->replace_text = text_copy; kill_compiled (sr); } static void go_search_replace_set_property (GObject *object, guint property_id, GValue const *value, GParamSpec *pspec) { GoSearchReplace *sr = (GoSearchReplace *)object; switch (property_id) { case PROP_SEARCH_TEXT: go_search_replace_set_search_text (sr, g_value_get_string (value)); break; case PROP_REPLACE_TEXT: go_search_replace_set_replace_text (sr, g_value_get_string (value)); break; case PROP_IS_REGEXP: sr->is_regexp = g_value_get_boolean (value); kill_compiled (sr); break; case PROP_IGNORE_CASE: sr->ignore_case = g_value_get_boolean (value); kill_compiled (sr); break; case PROP_PRESERVE_CASE: sr->preserve_case = g_value_get_boolean (value); kill_compiled (sr); break; case PROP_MATCH_WORDS: sr->match_words = g_value_get_boolean (value); kill_compiled (sr); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID (object, property_id, pspec); break; } } /* ------------------------------------------------------------------------- */ static void go_search_replace_class_init (GObjectClass *gobject_class) { parent_class = g_type_class_peek_parent (gobject_class); gobject_class->finalize = go_search_replace_finalize; gobject_class->get_property = go_search_replace_get_property; gobject_class->set_property = go_search_replace_set_property; g_object_class_install_property (gobject_class, PROP_SEARCH_TEXT, g_param_spec_string ("search-text", _("Search Text"), _("The text to search for"), NULL, GSF_PARAM_STATIC | G_PARAM_READWRITE)); g_object_class_install_property (gobject_class, PROP_REPLACE_TEXT, g_param_spec_string ("replace-text", _("Replacement Text"), _("The text to replace with"), NULL, GSF_PARAM_STATIC | G_PARAM_READWRITE)); g_object_class_install_property (gobject_class, PROP_IS_REGEXP, g_param_spec_boolean ("is-regexp", _("Is Regular Expression"), _("Is the search text a regular expression."), FALSE, GSF_PARAM_STATIC | G_PARAM_READWRITE)); g_object_class_install_property (gobject_class, PROP_IGNORE_CASE, g_param_spec_boolean ("ignore-case", _("Ignore Case"), _("Ignore the case of letters."), FALSE, GSF_PARAM_STATIC | G_PARAM_READWRITE)); g_object_class_install_property (gobject_class, PROP_PRESERVE_CASE, g_param_spec_boolean ("preserve-case", _("Preserve Case"), _("Preserve the case of letters."), FALSE, GSF_PARAM_STATIC | G_PARAM_READWRITE)); g_object_class_install_property (gobject_class, PROP_MATCH_WORDS, g_param_spec_boolean ("match-words", _("Match Words"), _("Match whole words only."), FALSE, GSF_PARAM_STATIC | G_PARAM_READWRITE)); } /* ------------------------------------------------------------------------- */ GSF_CLASS (GoSearchReplace, go_search_replace, go_search_replace_class_init, go_search_replace_init, G_TYPE_OBJECT)