--- grep-2.5.1/src/search.c.egf-speedup 2004-11-05 12:50:25.934736684 +0000 +++ grep-2.5.1/src/search.c 2004-11-05 13:52:33.819394140 +0000 @@ -70,9 +70,6 @@ call the regexp matcher at all. */ static int kwset_exact_matches; -#if defined(MBS_SUPPORT) -static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); -#endif static void kwsinit PARAMS ((void)); static void kwsmusts PARAMS ((void)); static void Gcompile PARAMS ((char const *, size_t)); @@ -141,47 +138,6 @@ } } -#ifdef MBS_SUPPORT -/* This function allocate the array which correspond to "buf". - Then this check multibyte string and mark on the positions which - are not singlebyte character nor the first byte of a multibyte - character. Caller must free the array. */ -static char* -check_multibyte_string(char const *buf, size_t size) -{ - char *mb_properties = xmalloc(size); - mbstate_t cur_state; - wchar_t wc; - int i; - memset(&cur_state, 0, sizeof(mbstate_t)); - memset(mb_properties, 0, sizeof(char)*size); - for (i = 0; i < size ;) - { - size_t mbclen; - mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state); - - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) - { - /* An invalid sequence, or a truncated multibyte character. - We treat it as a singlebyte character. */ - mbclen = 1; - } - else if (match_icase) - { - if (iswupper((wint_t)wc)) - { - wc = towlower((wint_t)wc); - wcrtomb(buf + i, wc, &cur_state); - } - } - mb_properties[i] = mbclen; - i += mbclen; - } - - return mb_properties; -} -#endif - static void Gcompile (char const *pattern, size_t size) { @@ -350,18 +306,8 @@ struct kwsmatch kwsm; size_t i, ret_val; #ifdef MBS_SUPPORT - char *mb_properties = NULL; - if (MB_CUR_MAX > 1) - { - if (match_icase) - { - char *case_buf = xmalloc(size); - memcpy(case_buf, buf, size); - buf = case_buf; - } - if (kwset) - mb_properties = check_multibyte_string(buf, size); - } + mbstate_t mbs; + memset (&mbs, '\0', sizeof (mbstate_t)); #endif /* MBS_SUPPORT */ buflim = buf + size; @@ -373,18 +319,48 @@ if (kwset) { /* Find a possible match using the KWset matcher. */ +#ifdef MBS_SUPPORT + size_t bytes_left = 0; +#endif /* MBS_SUPPORT */ size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); if (offset == (size_t) -1) goto failure; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + bytes_left = offset; + while (bytes_left) + { + size_t len = mbrlen (beg, bytes_left, &mbs); + if (len == (size_t) -1 || len == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (len == (size_t) -2) + /* Offset points inside multibyte character: + * no good. */ + break; + + beg += len; + bytes_left -= len; + } + } + else +#endif /* MBS_SUPPORT */ beg += offset; /* Narrow down to the line containing the candidate, and run it through DFA. */ end = memchr(beg, eol, buflim - beg); end++; #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) + if (MB_CUR_MAX > 1 && bytes_left) continue; -#endif +#endif /* MBS_SUPPORT */ while (beg > buf && beg[-1] != eol) --beg; if (kwsm.index < kwset_exact_matches) @@ -395,13 +371,47 @@ else { /* No good fixed strings; start with DFA. */ +#ifdef MBS_SUPPORT + size_t bytes_left = 0; +#endif /* MBS_SUPPORT */ size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); if (offset == (size_t) -1) break; /* Narrow down to the line we've found. */ +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + bytes_left = offset; + while (bytes_left) + { + size_t len = mbrlen (beg, bytes_left, &mbs); + if (len == (size_t) -1 || len == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (len == (size_t) -2) + /* Offset points inside multibyte character: + * no good. */ + break; + + beg += len; + bytes_left -= len; + } + } + else +#endif /* MBS_SUPPORT */ beg += offset; end = memchr (beg, eol, buflim - beg); end++; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && bytes_left) + continue; +#endif /* MBS_SUPPORT */ while (beg > buf && beg[-1] != eol) --beg; } @@ -469,15 +479,6 @@ } /* for (beg = end ..) */ failure: -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (mb_properties) - free (mb_properties); - if (match_icase) - free ((char *) buf); - } -#endif /* MBS_SUPPORT */ return (size_t) -1; success_in_beg_and_end: @@ -486,15 +487,6 @@ /* FALLTHROUGH */ success_in_start_and_len: -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (mb_properties) - free (mb_properties); - if (match_icase) - free ((char *) buf); - } -#endif /* MBS_SUPPORT */ *match_size = len; return start; } @@ -531,17 +523,8 @@ struct kwsmatch kwsmatch; size_t ret_val; #ifdef MBS_SUPPORT - char *mb_properties = NULL; - if (MB_CUR_MAX > 1) - { - if (match_icase) - { - char *case_buf = xmalloc(size); - memcpy(case_buf, buf, size); - buf = case_buf; - } - mb_properties = check_multibyte_string(buf, size); - } + mbstate_t mbs; + memset (&mbs, '\0', sizeof (mbstate_t)); #endif /* MBS_SUPPORT */ for (beg = buf; beg <= buf + size; ++beg) @@ -550,8 +533,33 @@ if (offset == (size_t) -1) goto failure; #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) - continue; /* It is a part of multibyte character. */ + if (MB_CUR_MAX > 1) + { + size_t bytes_left = offset; + while (bytes_left) + { + size_t len = mbrlen (beg, bytes_left, &mbs); + if (len == (size_t) -1 || len == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (len == (size_t) -2) + /* Offset points inside multibyte character: no good. */ + break; + + beg += len; + bytes_left -= len; + } + + if (bytes_left) + continue; + } + else #endif /* MBS_SUPPORT */ beg += offset; len = kwsmatch.size[0]; @@ -587,6 +595,36 @@ if (offset == -1) { break; /* Try a different anchor. */ } +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + size_t bytes_left = offset; + while (bytes_left) + { + size_t len = mbrlen (beg, bytes_left, &mbs); + if (len == (size_t) -1 || len == 0) + { + /* Incomplete character: treat as single-byte. */ + memset (&mbs, '\0', sizeof (mbstate_t)); + beg++; + bytes_left--; + continue; + } + + if (len == (size_t) -2) + /* Offset points inside multibyte character: + * no good. */ + break; + + beg += len; + bytes_left -= len; + } + + if (bytes_left) + break; /* Try a different anchor. */ + } + else +#endif /* MBS_SUPPORT */ beg += offset; len = kwsmatch.size[0]; } @@ -597,20 +635,30 @@ } failure: -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (match_icase) - free((char *) buf); - if (mb_properties) - free(mb_properties); - } -#endif /* MBS_SUPPORT */ return -1; success: +#ifdef MBS_SUPPORT + end = beg + len; + while (end < buf + size) + { + size_t len = mbrlen (end, buf + size - end, &mbs); + if (len == (size_t) -1 || len == (size_t) -2 || len == 0) + { + memset (&mbs, '\0', sizeof (mbstate_t)); + continue; + } + if (len == 1 && *end == eol) + break; + + end += len; + } + end++; +#else end = memchr (beg + len, eol, (buf + size) - (beg + len)); end++; +#endif /* MBS_SUPPORT */ + /* Hmm, is this correct for multibyte? */ while (buf < beg && beg[-1] != eol) --beg; len = end - beg; @@ -618,15 +666,6 @@ success_in_beg_and_len: *match_size = len; -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (mb_properties) - free (mb_properties); - if (match_icase) - free ((char *) buf); - } -#endif /* MBS_SUPPORT */ return beg - buf; }