From ff7a532bf596d9fa8f93f5ff15f83231dd2ff431 Mon Sep 17 00:00:00 2001 From: Tim Waugh Date: Dec 31 2004 15:43:57 +0000 Subject: - Jakub Jelinek's much improved -Fi algorithm. --- diff --git a/grep-2.5.1-egf-speedup.patch b/grep-2.5.1-egf-speedup.patch index 88fdeea..0728bfd 100644 --- a/grep-2.5.1-egf-speedup.patch +++ b/grep-2.5.1-egf-speedup.patch @@ -1,6 +1,12 @@ ---- grep-2.5.1/src/search.c 2004-12-21 13:37:15.700555594 +0000 -+++ grep-2.5.1/src/search.c 2004-12-21 13:49:05.873811016 +0000 -@@ -21,6 +21,7 @@ +--- grep-2.5.1/src/search.c.egf-speedup 2004-12-31 15:26:52.187199404 +0000 ++++ grep-2.5.1/src/search.c 2004-12-31 15:28:35.720391036 +0000 +@@ -18,9 +18,13 @@ + + /* Written August 1992 by Mike Haertel. */ + ++#ifndef _GNU_SOURCE ++# define _GNU_SOURCE 1 ++#endif #ifdef HAVE_CONFIG_H # include #endif @@ -8,7 +14,7 @@ #include #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC /* We can handle multibyte string. */ -@@ -39,6 +40,9 @@ +@@ -39,6 +43,9 @@ #ifdef HAVE_LIBPCRE # include #endif @@ -18,7 +24,7 @@ #define NCHAR (UCHAR_MAX + 1) -@@ -70,9 +74,10 @@ +@@ -70,9 +77,10 @@ call the regexp matcher at all. */ static int kwset_exact_matches; @@ -32,7 +38,7 @@ static void kwsinit PARAMS ((void)); static void kwsmusts PARAMS ((void)); static void Gcompile PARAMS ((char const *, size_t)); -@@ -84,6 +89,15 @@ +@@ -84,6 +92,15 @@ static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); void @@ -48,7 +54,7 @@ dfaerror (char const *mesg) { error (2, 0, mesg); -@@ -141,47 +155,6 @@ +@@ -141,47 +158,6 @@ } } @@ -96,7 +102,7 @@ static void Gcompile (char const *pattern, size_t size) { -@@ -190,6 +163,7 @@ +@@ -190,6 +166,7 @@ size_t total = size; char const *motif = pattern; @@ -104,7 +110,7 @@ re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0)); dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); -@@ -266,6 +240,7 @@ +@@ -266,6 +243,7 @@ size_t total = size; char const *motif = pattern; @@ -112,7 +118,7 @@ if (strcmp (matcher, "awk") == 0) { re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0)); -@@ -350,18 +325,8 @@ +@@ -350,18 +328,8 @@ struct kwsmatch kwsm; size_t i, ret_val; #ifdef MBS_SUPPORT @@ -133,7 +139,7 @@ #endif /* MBS_SUPPORT */ buflim = buf + size; -@@ -373,21 +338,63 @@ +@@ -373,21 +341,63 @@ if (kwset) { /* Find a possible match using the KWset matcher. */ @@ -201,7 +207,7 @@ goto success_in_beg_and_end; if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) continue; -@@ -395,13 +402,47 @@ +@@ -395,13 +405,47 @@ else { /* No good fixed strings; start with DFA. */ @@ -249,7 +255,7 @@ while (beg > buf && beg[-1] != eol) --beg; } -@@ -469,15 +510,6 @@ +@@ -469,15 +513,6 @@ } /* for (beg = end ..) */ failure: @@ -265,7 +271,7 @@ return (size_t) -1; success_in_beg_and_end: -@@ -486,24 +518,125 @@ +@@ -486,24 +521,143 @@ /* FALLTHROUGH */ success_in_start_and_len: @@ -282,10 +288,15 @@ return start; } -+static wchar_t **f_pattern; -+static char *f_initial_byte; -+static size_t f_pattern_count; ++#ifdef MBS_SUPPORT +static int f_i_multibyte; /* whether we're using the new -Fi MB method */ ++static struct ++{ ++ wchar_t **patterns; ++ size_t count, maxlen; ++ unsigned char *match; ++} Fimb; ++#endif + static void Fcompile (char const *pattern, size_t size) @@ -297,101 +308,114 @@ + /* Support -F -i for UTF-8 input. */ + if (match_icase && MB_CUR_MAX > 1) + { -+ size_t in = 0; -+ -+ while (f_i_multibyte != -1 && in < size) ++ mbstate_t mbs; ++ wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t)); ++ const char *patternend = pattern; ++ size_t wcsize; ++ kwset_t fimb_kwset = NULL; ++ char *starts = NULL; ++ wchar_t *wcbeg, *wclim; ++ size_t allocated = 0; ++ ++ memset (&mbs, '\0', sizeof (mbs)); ++# ifdef __GNU_LIBRARY__ ++ wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs); ++ if (patternend != pattern + size) ++ wcsize = (size_t) -1; ++# else ++ { ++ char *patterncopy = xmalloc (size + 1); ++ ++ memcpy (patterncopy, pattern, size); ++ patterncopy[size] = '\0'; ++ patternend = patterncopy; ++ wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs); ++ if (patternend != patterncopy + size) ++ wcsize = (size_t) -1; ++ free (patterncopy); ++ } ++# endif ++ if (wcsize + 2 <= 2) + { -+ wchar_t *f_this_pattern; -+ size_t f_this_pattern_allocated = sizeof (wchar_t) * 1000; -+ mbstate_t mbs; -+ size_t out = 0; -+ f_pattern_count++; -+ f_pattern = xrealloc (f_pattern, -+ sizeof (wchar_t *) * f_pattern_count); -+ f_initial_byte = xrealloc (f_initial_byte, -+ sizeof (char) * -+ (2 * f_pattern_count + 1)); -+ if (f_pattern_count == 1) -+ f_initial_byte[0] = '\0'; -+ -+ /* Convert pattern into wchar_t*, storing them in this_pattern. -+ Don't read more than we're given. */ -+ f_this_pattern = xmalloc (f_this_pattern_allocated); -+ memset (&mbs, '\0', sizeof (mbs)); -+ while (in < size) ++fimb_fail: ++ free (wcpattern); ++ free (starts); ++ if (fimb_kwset) ++ kwsfree (fimb_kwset); ++ free (Fimb.patterns); ++ Fimb.patterns = NULL; ++ } ++ else ++ { ++ if (!(fimb_kwset = kwsalloc (NULL))) ++ error (2, 0, _("memory exhausted")); ++ ++ starts = xmalloc (MB_CUR_MAX * 3); ++ wcbeg = wcpattern; ++ do + { -+ size_t c; -+ wchar_t this_wc; -+ if (out == f_this_pattern_allocated) -+ { -+ f_this_pattern_allocated *= 2; -+ f_this_pattern = xrealloc (f_this_pattern, -+ f_this_pattern_allocated); -+ } ++ int i; ++ size_t wclen; + -+ c = mbrtowc (&this_wc, pattern + in, size - in, &mbs); -+ if (c < 1) ++ if (Fimb.count >= allocated) + { -+ /* Fall back to old method. */ -+ f_i_multibyte = -1; -+ while (f_pattern_count--) -+ free (f_pattern[f_pattern_count]); -+ free (f_pattern); -+ f_pattern = NULL; -+ break; ++ if (allocated == 0) ++ allocated = 128; ++ else ++ allocated *= 2; ++ Fimb.patterns = xrealloc (Fimb.patterns, ++ sizeof (wchar_t *) * allocated); + } -+ -+ f_this_pattern[out] = towlower (this_wc); -+ if (out == 0) ++ Fimb.patterns[Fimb.count++] = wcbeg; ++ for (wclim = wcbeg; ++ wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim) ++ *wclim = towlower (*wclim); ++ *wclim = L'\0'; ++ wclen = wclim - wcbeg; ++ if (wclen > Fimb.maxlen) ++ Fimb.maxlen = wclen; ++ if (wclen > 3) ++ wclen = 3; ++ if (wclen == 0) + { -+ /* First character. Work out the first byte of upper and -+ lower case multibyte strings for the first character. */ -+ wchar_t wc; -+ char mbs[MB_CUR_MAX]; -+ mbstate_t ps; -+ -+ if (iswupper (this_wc)) -+ { -+ wc = towlower (this_wc); -+ } -+ else -+ { -+ wc = towupper (this_wc); -+ } -+ -+ memset (&ps, '\0', sizeof (ps)); -+ wcrtomb (mbs, this_wc, &ps); -+ mbs[1] = '\0'; -+ strcat (f_initial_byte, mbs); -+ -+ memset (&ps, '\0', sizeof (ps)); -+ wcrtomb (mbs, wc, &ps); -+ mbs[1] = '\0'; -+ strcat (f_initial_byte, mbs); ++ if ((err = kwsincr (fimb_kwset, "", 0)) != 0) ++ error (2, 0, err); + } -+ -+ in += c; -+ -+ if (this_wc == L'\n') -+ break; -+ -+ out++; -+ } -+ -+ if (f_i_multibyte == -1) -+ break; -+ -+ /* Nul-terminate it. */ -+ if (out == f_this_pattern_allocated) -+ { -+ f_this_pattern_allocated++; -+ f_this_pattern = xrealloc (f_this_pattern, -+ f_this_pattern_allocated); ++ else ++ for (i = 0; i < (1 << wclen); i++) ++ { ++ char *p = starts; ++ int j, k; ++ ++ for (j = 0; j < wclen; ++j) ++ { ++ wchar_t wc = wcbeg[j]; ++ if (i & (1 << j)) ++ { ++ wc = towupper (wc); ++ if (wc == wcbeg[j]) ++ continue; ++ } ++ k = wctomb (p, wc); ++ if (k <= 0) ++ goto fimb_fail; ++ p += k; ++ } ++ if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0) ++ error (2, 0, err); ++ } ++ if (wclim < wcpattern + wcsize) ++ ++wclim; ++ wcbeg = wclim; + } -+ -+ f_this_pattern[out] = L'\0'; -+ f_pattern[f_pattern_count - 1] = f_this_pattern; ++ while (wcbeg < wcpattern + wcsize); + f_i_multibyte = 1; ++ kwset = fimb_kwset; ++ free (starts); ++ Fimb.match = xmalloc (Fimb.count); ++ if ((err = kwsprep (kwset)) != 0) ++ error (2, 0, err); ++ return; + } + } +#endif /* MBS_SUPPORT */ @@ -400,95 +424,84 @@ kwsinit (); beg = pattern; do -@@ -523,6 +656,87 @@ +@@ -522,6 +676,76 @@ + error (2, 0, err); } - static size_t -+Fimbexec (const char *buf, size_t size, size_t *plen) ++#ifdef MBS_SUPPORT ++static int ++Fimbexec (const char *buf, size_t size, size_t *plen, int exact) +{ -+ char const *beg; -+ size_t len; ++ size_t len, letter, i; ++ int ret = -1; + mbstate_t mbs; ++ wchar_t wc; ++ int patterns_left; + + assert (match_icase && f_i_multibyte == 1); + assert (MB_CUR_MAX > 1); + + memset (&mbs, '\0', sizeof (mbs)); -+ beg = buf; -+ len = 0; -+ while (beg < buf + size) ++ memset (Fimb.match, '\1', Fimb.count); ++ letter = len = 0; ++ patterns_left = 1; ++ while (patterns_left && len <= size) + { -+ wchar_t wc; -+ char const *p; -+ char const *next_char; -+ unsigned char match[f_pattern_count]; -+ size_t i, letter; -+ int patterns_left; -+ -+ for (p = beg; -+ (p < buf + size) && !strchr (f_initial_byte, *p); -+ p++) -+ ; -+ -+ if (p == NULL || p == buf + size) -+ break; -+ -+ /* First byte matches, now check the rest */ -+ beg = p; -+ letter = len = 0; -+ memset (match, '\1', f_pattern_count); -+ patterns_left = 1; -+ while (patterns_left) -+ { -+ size_t c; -+ -+ patterns_left = 0; -+ -+ c = mbrtowc (&wc, beg + len, size - (beg - buf) - len, &mbs); -+ if (c < 1) -+ { -+ memset (&mbs, '\0', sizeof (mbs)); -+ next_char = beg + 1; -+ break; -+ } ++ size_t c; + -+ if (!len) -+ next_char = beg + c; ++ patterns_left = 0; ++ if (len < size) ++ { ++ c = mbrtowc (&wc, buf + len, size - len, &mbs); ++ if (c + 2 <= 2) ++ return ret; + + wc = towlower (wc); -+ for (i = 0; i < f_pattern_count; i++) ++ } ++ else ++ { ++ c = 1; ++ wc = L'\0'; ++ } ++ ++ for (i = 0; i < Fimb.count; i++) ++ { ++ if (Fimb.match[i]) + { -+ if (match[i]) ++ if (Fimb.patterns[i][letter] == L'\0') + { -+ if (f_pattern[i][letter] == L'\0') ++ /* Found a match. */ ++ *plen = len; ++ if (!exact && !match_words) ++ return 0; ++ else + { -+ /* Found a match. */ -+ *plen = len; -+ return beg - buf; ++ /* For -w or exact look for longest match. */ ++ ret = 0; ++ Fimb.match[i] = '\0'; ++ continue; + } -+ -+ if (f_pattern[i][letter] == wc) -+ patterns_left = 1; -+ else -+ match[i] = '\0'; + } -+ } + -+ len += c; -+ letter++; ++ if (Fimb.patterns[i][letter] == wc) ++ patterns_left = 1; ++ else ++ Fimb.match[i] = '\0'; ++ } + } + -+ beg = next_char; ++ len += c; ++ letter++; + } -+ -+ return -1; ++ ++ return ret; +} ++#endif /* MBS_SUPPORT */ + -+static size_t + static size_t Fexecute (char const *buf, size_t size, size_t *match_size, int exact) { - register char const *beg, *try, *end; -@@ -531,27 +745,50 @@ +@@ -531,80 +755,258 @@ struct kwsmatch kwsmatch; size_t ret_val; #ifdef MBS_SUPPORT @@ -503,19 +516,16 @@ - } - mb_properties = check_multibyte_string(buf, size); - } ++ int mb_cur_max = MB_CUR_MAX; + mbstate_t mbs; + memset (&mbs, '\0', sizeof (mbstate_t)); ++ const char *last_char = NULL; #endif /* MBS_SUPPORT */ for (beg = buf; beg <= buf + size; ++beg) { - size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); + size_t offset; -+#ifdef MBS_SUPPORT -+ if (match_icase && f_i_multibyte == 1) -+ offset = Fimbexec (beg, buf + size - beg, &kwsmatch.size[0]); -+ else -+#endif /* MBS_SUPPORT */ + offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); + if (offset == (size_t) -1) @@ -523,13 +533,15 @@ #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) - continue; /* It is a part of multibyte character. */ -+ if (MB_CUR_MAX > 1 && !using_utf8) ++ if (mb_cur_max > 1 && !using_utf8) + { + size_t bytes_left = offset; + while (bytes_left) + { -+ size_t len = mbrlen (beg, bytes_left, &mbs); -+ if (len == (size_t) -1 || len == 0) ++ size_t mlen = mbrlen (beg, bytes_left, &mbs); ++ ++ last_char = beg; ++ if (mlen == (size_t) -1 || mlen == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); @@ -538,12 +550,12 @@ + continue; + } + -+ if (len == (size_t) -2) ++ if (mlen == (size_t) -2) + /* Offset points inside multibyte character: no good. */ + break; + -+ beg += len; -+ bytes_left -= len; ++ beg += mlen; ++ bytes_left -= mlen; + } + + if (bytes_left) @@ -552,29 +564,128 @@ + else #endif /* MBS_SUPPORT */ beg += offset; - len = kwsmatch.size[0]; -@@ -583,10 +820,46 @@ - { - /* Try a shorter length anchored at the same place. */ - --len; +#ifdef MBS_SUPPORT -+ if (match_icase && f_i_multibyte == 1) -+ offset = Fimbexec (beg, len, &kwsmatch.size[0]); ++ /* For f_i_multibyte, the string at beg now matches first 3 chars of ++ one of the search strings (less if there are shorter search strings). ++ See if this is a real match. */ ++ if (f_i_multibyte ++ && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact)) ++ goto next_char; ++#endif /* MBS_SUPPORT */ + len = kwsmatch.size[0]; + if (exact && !match_words) + goto success_in_beg_and_len; + if (match_lines) + { + if (beg > buf && beg[-1] != eol) +- continue; ++ goto next_char; + if (beg + len < buf + size && beg[len] != eol) +- continue; ++ goto next_char; + goto success; + } + else if (match_words) +- for (try = beg; len; ) +- { +- if (try > buf && WCHAR((unsigned char) try[-1])) +- break; +- if (try + len < buf + size && WCHAR((unsigned char) try[len])) +- { +- offset = kwsexec (kwset, beg, --len, &kwsmatch); +- if (offset == (size_t) -1) +- { ++ { ++ while (1) ++ { ++ int word_match = 0; ++ if (beg > buf) ++ { + #ifdef MBS_SUPPORT +- if (MB_CUR_MAX > 1) +- free (mb_properties); ++ if (mb_cur_max > 1) ++ { ++ const char *s; ++ int mr; ++ wchar_t pwc; ++ ++ if (using_utf8) ++ { ++ s = beg - 1; ++ while (s > buf ++ && (unsigned char) *s >= 0x80 ++ && (unsigned char) *s <= 0xbf) ++ --s; ++ } ++ else ++ s = last_char; ++ mr = mbtowc (&pwc, s, beg - s); ++ if (mr <= 0) ++ memset (&mbs, '\0', sizeof (mbstate_t)); ++ else if ((iswalnum (pwc) || pwc == L'_') ++ && mr == (int) (beg - s)) ++ goto next_char; ++ } + else + #endif /* MBS_SUPPORT */ +- return offset; +- } +- try = beg + offset; +- len = kwsmatch.size[0]; +- } +- else +- goto success; +- } ++ if (!WCHAR ((unsigned char) beg[-1])) ++ goto next_char; ++ } ++#ifdef MBS_SUPPORT ++ if (mb_cur_max > 1) ++ { ++ wchar_t nwc; ++ int mr; ++ ++ mr = mbtowc (&nwc, beg + len, buf + size - beg - len); ++ if (mr <= 0) ++ { ++ memset (&mbs, '\0', sizeof (mbstate_t)); ++ word_match = 1; ++ } ++ else if (!iswalnum (nwc) && nwc != L'_') ++ word_match = 1; ++ } ++ else +#endif /* MBS_SUPPORT */ - offset = kwsexec (kwset, beg, len, &kwsmatch); ++ if (beg + len >= buf + size && !WCHAR ((unsigned char) beg[len])) ++ word_match = 1; ++ if (word_match) ++ { ++ if (!exact) ++ /* Returns the whole line now we know there's a word match. */ ++ goto success; ++ else ++ /* Returns just this word match. */ ++ goto success_in_beg_and_len; ++ } ++ if (len > 0) ++ { ++ /* Try a shorter length anchored at the same place. */ ++ --len; ++ offset = kwsexec (kwset, beg, len, &kwsmatch); + - if (offset == -1) { - break; /* Try a different anchor. */ - } ++ if (offset == -1) ++ goto next_char; /* Try a different anchor. */ +#ifdef MBS_SUPPORT -+ if (MB_CUR_MAX > 1 && !using_utf8) ++ if (mb_cur_max > 1 && !using_utf8) + { + size_t bytes_left = offset; + while (bytes_left) + { -+ size_t len = mbrlen (beg, bytes_left, &mbs); -+ if (len == (size_t) -1 || len == 0) ++ size_t mlen = mbrlen (beg, bytes_left, &mbs); ++ ++ last_char = beg; ++ if (mlen == (size_t) -1 || mlen == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); @@ -583,24 +694,76 @@ + continue; + } + -+ if (len == (size_t) -2) -+ /* Offset points inside multibyte character: -+ * no good. */ -+ break; ++ if (mlen == (size_t) -2) ++ { ++ /* Offset points inside multibyte character: ++ * no good. */ ++ break; ++ } + -+ beg += len; -+ bytes_left -= len; ++ beg += mlen; ++ bytes_left -= mlen; + } + + if (bytes_left) -+ break; /* Try a different anchor. */ ++ { ++ memset (&mbs, '\0', sizeof (mbstate_t)); ++ goto next_char; /* Try a different anchor. */ ++ } + } + else +#endif /* MBS_SUPPORT */ - beg += offset; - len = kwsmatch.size[0]; - } -@@ -597,19 +870,31 @@ ++ beg += offset; ++#ifdef MBS_SUPPORT ++ /* The string at beg now matches first 3 chars of one of ++ the search strings (less if there are shorter search ++ strings). See if this is a real match. */ ++ if (f_i_multibyte ++ && Fimbexec (beg, len - offset, &kwsmatch.size[0], ++ exact)) ++ goto next_char; ++#endif /* MBS_SUPPORT */ ++ len = kwsmatch.size[0]; ++ } ++ } ++ } + else + goto success; ++next_char:; ++#ifdef MBS_SUPPORT ++ /* Advance to next character. For MB_CUR_MAX == 1 case this is handled ++ by ++beg above. */ ++ if (mb_cur_max > 1) ++ { ++ if (using_utf8) ++ { ++ unsigned char c = *beg; ++ if (c >= 0xc2) ++ { ++ if (c < 0xe0) ++ ++beg; ++ else if (c < 0xf0) ++ beg += 2; ++ else if (c < 0xf8) ++ beg += 3; ++ else if (c < 0xfc) ++ beg += 4; ++ else if (c < 0xfe) ++ beg += 5; ++ } ++ } ++ else ++ { ++ size_t l = mbrlen (beg, buf + size - beg, &mbs); ++ ++ last_char = beg; ++ if (l + 2 >= 2) ++ beg += l - 1; ++ else ++ memset (&mbs, '\0', sizeof (mbstate_t)); ++ } ++ } ++#endif /* MBS_SUPPORT */ } failure: @@ -609,7 +772,7 @@ + success: #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) -+ if (MB_CUR_MAX > 1 && !using_utf8) ++ if (mb_cur_max > 1 && !using_utf8) { - if (match_icase) - free((char *) buf); @@ -640,7 +803,7 @@ end++; while (buf < beg && beg[-1] != eol) --beg; -@@ -618,15 +903,6 @@ +@@ -613,15 +1015,6 @@ success_in_beg_and_len: *match_size = len; diff --git a/grep.spec b/grep.spec index 565f812..3295939 100644 --- a/grep.spec +++ b/grep.spec @@ -86,6 +86,7 @@ fi %changelog * Fri Dec 31 2004 Tim Waugh +- Jakub Jelinek's much improved -Fi algorithm. - Removed bogus part of grep-2.5.1-fgrep patch. * Tue Dec 21 2004 Tim Waugh 2.5.1-44