diff -up ./src/expand-core.c.orig ./src/expand-core.c --- ./src/expand-core.c.orig 2016-06-28 14:44:18.281619000 +0200 +++ ./src/expand-core.c 2016-06-30 11:46:50.025109755 +0200 @@ -18,6 +18,7 @@ #include #include +#include #include "system.h" #include "error.h" @@ -27,6 +28,119 @@ #include "expand-core.h" +extern inline int +set_utf_locale (void) +{ + /* Predefined UTF-8 locale names to try. */ + const char* predef_locales[] = {"C.UTF8","en_US.UTF8","en_GB.UTF8"}; + /* NOTE(review): this copy of the patch looks truncated between the loop header below and the body of check_bom - confirm against the original patch. */ + const int predef_locales_count=3; + for (int i=0;ibufcount=0; + if (c == 0xEF) /* first BOM byte */ + { + c=fgetc(fp); + } + else + { + if (c != EOF) + { + ungetc(c,fp); + } + return false; + } + /* Second BOM byte. On a mismatch the already consumed 0xEF is stored in mbf->buf, or pushed back on fp if the input ended. */ + if (c == 0xBB) + { + c=fgetc(fp); + } + else + { + if ( c!= EOF ) + { + mbf->buf[0]=(unsigned char) 0xEF; + mbf->bufcount=1; + ungetc(c,fp); + return false; + } + else + { + ungetc(0xEF,fp); + return false; + } + } + if (c == 0xBF) /* third BOM byte: the whole BOM has been consumed */ + { + mbf->bufcount=0; + return true; + } + else + { + if (c != EOF) + { + mbf->buf[0]=(unsigned char) 0xEF; + mbf->buf[1]=(unsigned char) 0xBB; + mbf->bufcount=2; + ungetc(c,fp); + return false; + } + else + { + mbf->buf[0]=(unsigned char) 0xEF; + mbf->bufcount=1; + ungetc(0xBB,fp); + return false; + } + } + return false; /* not reached: both branches above return */ +} +/* Write the three UTF-8 BOM bytes (0xEF 0xBB 0xBF) to stdout. */ +extern inline void +print_bom(void) +{ + putc (0xEF, stdout); + putc (0xBB, stdout); + putc (0xBF, stdout); +} + /* Add the comma or blank separated list of tab stops STOPS to the list of tab stops. */ diff -up ./src/expand-core.h.orig ./src/expand-core.h --- ./src/expand-core.h.orig 2016-06-28 14:44:18.281619000 +0200 +++ ./src/expand-core.h 2016-06-30 11:47:18.929437205 +0200 @@ -15,7 +15,7 @@ along with this program. If not, see . 
*/ #ifndef EXPAND_CORE_H_ -# define EXPAND_CORE_H_ +#define EXPAND_CORE_H_ extern size_t first_free_tab; @@ -29,6 +29,18 @@ extern char **file_list; extern bool have_read_stdin; +inline int +set_utf_locale (void); + +bool +check_utf_locale(void); + +bool +check_bom(FILE* fp, mb_file_t *mbf); + +inline void +print_bom(void); + void parse_tab_stops (char const *stops, void (*add_tab_stop)(uintmax_t)); diff -up ./src/expand.c.orig ./src/expand.c --- ./src/expand.c.orig 2016-06-28 14:44:18.286619000 +0200 +++ ./src/expand.c 2016-06-30 11:50:15.077312947 +0200 @@ -149,11 +149,33 @@ expand (void) FILE *fp = next_file (NULL); mb_file_t mbf; mbf_char_t c; - + /* True if the starting locale is utf8. */ + bool using_utf_locale; + + /* True if the first file contains BOM header. */ + bool found_bom; + using_utf_locale=check_utf_locale(); + if (!fp) return; - mbf_init (mbf, fp); + found_bom=check_bom(fp,&mbf); + + if (using_utf_locale == false && found_bom == true) + { + /*try using some predefined locale */ + + if (set_utf_locale () != 0) + { + error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale")); + } + } + + + if (found_bom == true) + { + print_bom(); + } while (true) { @@ -178,6 +200,27 @@ expand (void) if ((mb_iseof (c)) && (fp = next_file (fp))) { mbf_init (mbf, fp); + if (fp!=NULL) + { + if (check_bom(fp,&mbf)==true) + { + /*Not the first file - check BOM header*/ + if (using_utf_locale==false && found_bom==false) + { + /*BOM header in subsequent file but not in the first one. */ + error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); + } + } + else + { + if(using_utf_locale==false && found_bom==true) + { + /*First file conatined BOM header - locale was switched to UTF + /*all subsequent files should contain BOM. 
*/ + error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); + } + } + } continue; } else diff -up ./src/unexpand.c.orig ./src/unexpand.c --- ./src/unexpand.c.orig 2016-06-28 17:39:22.894259000 +0200 +++ ./src/unexpand.c 2016-07-07 09:48:07.659924755 +0200 @@ -172,16 +172,36 @@ unexpand (void) include characters other than spaces, so the blanks must be stored, not merely counted. */ mbf_char_t *pending_blank; + /* True if the starting locale is utf8. */ + bool using_utf_locale; + + /* True if the first file contains BOM header. */ + bool found_bom; + using_utf_locale=check_utf_locale(); if (!fp) return; + mbf_init (mbf, fp); + found_bom=check_bom(fp,&mbf); + if (using_utf_locale == false && found_bom == true) + { + /*try using some predefined locale */ + + if (set_utf_locale () != 0) + { + error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale")); + } + } /* The worst case is a non-blank character, then one blank, then a tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so allocate MAX_COLUMN_WIDTH bytes to store the blanks. */ pending_blank = xmalloc (max_column_width * sizeof (mbf_char_t)); - mbf_init (mbf, fp); + if (found_bom == true) + { + print_bom(); + } while (true) { @@ -225,6 +245,27 @@ unexpand (void) if ((mb_iseof (c)) && (fp = next_file (fp))) { mbf_init (mbf, fp); + if (fp!=NULL) + { + if (check_bom(fp,&mbf)==true) + { + /*Not the first file - check BOM header*/ + if (using_utf_locale==false && found_bom==false) + { + /*BOM header in subsequent file but not in the first one. */ + error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); + } + } + else + { + if(using_utf_locale==false && found_bom==true) + { + /*First file conatined BOM header - locale was switched to UTF + /*all subsequent files should contain BOM. 
*/ + error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); + } + } + } continue; } else diff -up ./tests/expand/mb.sh.orig ./tests/expand/mb.sh --- ./tests/expand/mb.sh.orig 2016-06-28 14:44:18.287619000 +0200 +++ ./tests/expand/mb.sh 2016-06-30 11:57:10.038407216 +0200 @@ -109,4 +109,75 @@ äbcdef\xFF | expand < in > out || fail=1 compare exp out > /dev/null 2>&1 || fail=1 + + +#BOM header test 1 +printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . +EOF +env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ + +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + + +expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LANG=C expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LC_ALL=C expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + + +printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . +EOF +env printf ' äöü\t. öüä. \tä xx\n' >> in1 || framework_failure_ + + +printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. 
ä xx +EOF + +expand in1 in1 > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LANG=C expand in1 in1 > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LC_ALL=C expand in1 in1 > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + exit $fail diff -up ./tests/unexpand/mb.sh.orig ./tests/unexpand/mb.sh --- ./tests/unexpand/mb.sh.orig 2016-06-28 17:39:22.895259000 +0200 +++ ./tests/unexpand/mb.sh 2016-07-07 09:55:00.098281917 +0200 @@ -111,3 +111,62 @@ äbcdef\xFF\t| unexpand -a < in > out || fail=1 compare exp out > /dev/null 2>&1 || fail=1 + +#BOM header test 1 +printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF +env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ + +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +unexpand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LANG=C unexpand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LC_ALL=C unexpand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + + +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + + +unexpand in in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LANG=C unexpand in in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LC_ALL=C unexpand in in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1