From e7b4da674280d47f787ce48c21e7b879dc5e1362 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Aug 03 2006 13:36:23 +0000 Subject: - remove superfluous multibyte processing in str_append for UTF-8 encoding (thanks Paolo Bonzini, #177246) --- diff --git a/sed-4.1.5-utf8performance.patch b/sed-4.1.5-utf8performance.patch new file mode 100644 index 0000000..b7c0dc2 --- /dev/null +++ b/sed-4.1.5-utf8performance.patch @@ -0,0 +1,113 @@ +* looking for bonzini@gnu.org--2004b/sed--stable--4.1--patch-69 to compare with +* comparing to bonzini@gnu.org--2004b/sed--stable--4.1--patch-69 +M sed/mbcs.c +M sed/sed.h +M sed/execute.c + +* modified files + +--- orig/sed/execute.c ++++ mod/sed/execute.c +@@ -235,25 +235,26 @@ str_append(to, string, length) + to->length = new_length; + + #ifdef HAVE_MBRTOWC +- if (mb_cur_max == 1) +- return; +- +- while (length) +- { +- int n = MBRLEN (string, length, &to->mbstate); ++ if (mb_cur_max > 1 && !is_utf8) ++ while (length) ++ { ++ size_t n = MBRLEN (string, length, &to->mbstate); + +- /* An invalid sequence is treated like a singlebyte character. */ +- if (n == -1) +- { +- memset (&to->mbstate, 0, sizeof (to->mbstate)); +- n = 1; +- } ++ /* An invalid sequence is treated like a singlebyte character. */ ++ if (n == (size_t) -1) ++ { ++ memset (&to->mbstate, 0, sizeof (to->mbstate)); ++ n = 1; ++ } + +- if (n > 0) +- length -= n; +- else +- break; +- } ++ if (n > 0) ++ { ++ string += n; ++ length -= n; ++ } ++ else ++ break; ++ } + #endif + } + + + +--- orig/sed/mbcs.c ++++ mod/sed/mbcs.c +@@ -18,7 +18,12 @@ + #include "sed.h" + #include + ++#ifdef HAVE_LANGINFO_CODESET ++#include ++#endif ++ + int mb_cur_max; ++bool is_utf8; + + #ifdef HAVE_MBRTOWC + /* Add a byte to the multibyte character represented by the state +@@ -47,6 +52,26 @@ int brlen (ch, cur_stat) + void + initialize_mbcs () + { ++ /* For UTF-8, we know that the encoding is stateless. 
*/ ++ const char *codeset_name; ++ ++#ifdef HAVE_LANGINFO_CODESET ++ codeset_name = nl_langinfo (CODESET); ++#else ++ codeset_name = getenv ("LC_ALL"); ++ if (codeset_name == NULL || codeset_name[0] == '\0') ++ codeset_name = getenv ("LC_CTYPE"); ++ if (codeset_name == NULL || codeset_name[0] == '\0') ++ codeset_name = getenv ("LANG"); ++ if (codeset_name == NULL) ++ codeset_name = ""; ++ else if (strchr (codeset_name, '.') != NULL) ++ codeset_name = strchr (codeset_name, '.') + 1; ++#endif ++ ++ is_utf8 = (strcasecmp (codeset_name, "UTF-8") == 0 ++ || strcasecmp (codeset_name, "UTF8") == 0); ++ ++#ifdef HAVE_MBRTOWC ++ mb_cur_max = MB_CUR_MAX; ++#else + + +--- orig/sed/sed.h ++++ mod/sed/sed.h +@@ -233,6 +233,7 @@ extern bool use_extended_syntax_p; + + /* Declarations for multibyte character sets. */ + extern int mb_cur_max; ++extern bool is_utf8; + + #ifdef HAVE_MBRTOWC + #ifdef HAVE_BTOWC + + + diff --git a/sed.spec b/sed.spec index f57ba43..72d7919 100644 --- a/sed.spec +++ b/sed.spec @@ -10,8 +10,8 @@ License: GPL Group: Applications/Text Source0: ftp://ftp.gnu.org/pub/gnu/sed/sed-%{version}.tar.gz Source1: http://sed.sourceforge.net/sedfaq.txt -Patch0: sed-4.1.5-bz185374.patch -Patch1: sed-4.1.5-str_append.patch +Patch0: sed-4.1.5-utf8performance.patch +Patch1: sed-4.1.5-bz185374.patch Prereq: /sbin/install-info Prefix: %{_prefix} Buildroot: %{_tmppath}/%{name}-root @@ -67,8 +67,9 @@ rm -rf ${RPM_BUILD_ROOT} %{_mandir}/man*/* %changelog -* Wed Aug 2 2006 Petr Machata - 4.1.5-4 -- remove superfluous multibyte processing in str_append (#177246) +* Thu Aug 3 2006 Petr Machata - 4.1.5-4 +- remove superfluous multibyte processing in str_append for UTF-8 + encoding (thanks Paolo Bonzini, #177246) * Mon Jul 17 2006 Petr Machata - 4.1.5-3 - use dist tag