Blob Blame History Raw
diff -rupN binutils.orig/gas/NEWS binutils-2.37/gas/NEWS
--- binutils.orig/gas/NEWS	2021-11-18 16:50:39.104088534 +0000
+++ binutils-2.37/gas/NEWS	2021-11-18 16:51:16.340948280 +0000
@@ -1,5 +1,13 @@
 -*- text -*-
 
+* The --multibyte-handling=[allow|warn|warn-sym-only] option tells the
+  assembler what to when it encoutners multibyte characters in the input.  The
+  default is to allow them.  Setting the option to "warn" will generate a
+  warning message whenever any multibyte character is encountered.  Using the
+  option to "warn-sym-only" will make the assembler generate a warning whenever a
+  symbol is defined containing multibyte characters.  (References to undefined
+  symbols will not generate warnings).
+
 Changes in 2.37:
 
 * arm-symbianelf support removed.
diff -rupN binutils.orig/gas/app.c binutils-2.37/gas/app.c
--- binutils.orig/gas/app.c	2021-11-18 16:50:39.104088534 +0000
+++ binutils-2.37/gas/app.c	2021-11-18 16:50:42.530075630 +0000
@@ -345,6 +345,55 @@ process_escape (int ch)
     }
 }
 
+#define MULTIBYTE_WARN_COUNT_LIMIT 10
+static unsigned int multibyte_warn_count = 0;
+
+bool
+scan_for_multibyte_characters (const unsigned char *  start,
+			       const unsigned char *  end,
+			       bool                   warn)
+{
+  if (end <= start)
+    return false;
+
+  if (warn && multibyte_warn_count > MULTIBYTE_WARN_COUNT_LIMIT)
+    return false;
+
+  bool found = false;
+
+  while (start < end)
+    {
+      unsigned char c;
+
+      if ((c = * start++) <= 0x7f)
+	continue;
+
+      if (!warn)
+	return true;
+
+      found = true;
+
+      const char * filename;
+      unsigned int lineno;
+
+      filename = as_where (& lineno);
+      if (filename == NULL)
+	as_warn (_("multibyte character (%#x) encountered in input"), c);
+      else if (lineno == 0)
+	as_warn (_("multibyte character (%#x) encountered in %s"), c, filename);
+      else
+	as_warn (_("multibyte character (%#x) encountered in %s at or near line %u"), c, filename, lineno);
+
+      if (++ multibyte_warn_count == MULTIBYTE_WARN_COUNT_LIMIT)
+	{
+	  as_warn (_("further multibyte character warnings suppressed"));
+	  break;
+	}
+    }
+
+  return found;
+}
+
 /* This function is called to process input characters.  The GET
    parameter is used to retrieve more input characters.  GET should
    set its parameter to point to a buffer, and return the length of
@@ -463,6 +512,11 @@ do_scrub_chars (size_t (*get) (char *, s
 	return 0;
       from = input_buffer;
       fromend = from + fromlen;
+
+      if (multibyte_handling == multibyte_warn)
+	(void) scan_for_multibyte_characters ((const unsigned char *) from,
+					      (const unsigned char* ) fromend,
+					      true /* Generate warnings.  */);
     }
 
   while (1)
diff -rupN binutils.orig/gas/as.c binutils-2.37/gas/as.c
--- binutils.orig/gas/as.c	2021-11-18 16:50:39.104088534 +0000
+++ binutils-2.37/gas/as.c	2021-11-18 16:50:42.531075627 +0000
@@ -474,7 +474,7 @@ parse_args (int * pargc, char *** pargv)
       OPTION_DEBUG_PREFIX_MAP,
       OPTION_DEFSYM,
       OPTION_LISTING_LHS_WIDTH,
-      OPTION_LISTING_LHS_WIDTH2,
+      OPTION_LISTING_LHS_WIDTH2, /* = STD_BASE + 10 */
       OPTION_LISTING_RHS_WIDTH,
       OPTION_LISTING_CONT_LINES,
       OPTION_DEPFILE,
@@ -484,7 +484,7 @@ parse_args (int * pargc, char *** pargv)
       OPTION_GDWARF_3,
       OPTION_GDWARF_4,
       OPTION_GDWARF_5,
-      OPTION_GDWARF_SECTIONS,
+      OPTION_GDWARF_SECTIONS, /* = STD_BASE + 20 */
       OPTION_GDWARF_CIE_VERSION,
       OPTION_STRIP_LOCAL_ABSOLUTE,
       OPTION_TRADITIONAL_FORMAT,
@@ -494,7 +494,7 @@ parse_args (int * pargc, char *** pargv)
       OPTION_NOEXECSTACK,
       OPTION_SIZE_CHECK,
       OPTION_ELF_STT_COMMON,
-      OPTION_ELF_BUILD_NOTES,
+      OPTION_ELF_BUILD_NOTES, /* = STD_BASE + 30 */
       OPTION_SECTNAME_SUBST,
       OPTION_ALTERNATE,
       OPTION_AL,
@@ -503,7 +503,8 @@ parse_args (int * pargc, char *** pargv)
       OPTION_WARN_FATAL,
       OPTION_COMPRESS_DEBUG,
       OPTION_NOCOMPRESS_DEBUG,
-      OPTION_NO_PAD_SECTIONS /* = STD_BASE + 40 */
+      OPTION_NO_PAD_SECTIONS,
+      OPTION_MULTIBYTE_HANDLING  /* = STD_BASE + 40 */
     /* When you add options here, check that they do
        not collide with OPTION_MD_BASE.  See as.h.  */
     };
@@ -581,6 +582,7 @@ parse_args (int * pargc, char *** pargv)
     ,{"target-help", no_argument, NULL, OPTION_TARGET_HELP}
     ,{"traditional-format", no_argument, NULL, OPTION_TRADITIONAL_FORMAT}
     ,{"warn", no_argument, NULL, OPTION_WARN}
+    ,{"multibyte-handling", required_argument, NULL, OPTION_MULTIBYTE_HANDLING}
   };
 
   /* Construct the option lists from the standard list and the target
@@ -683,6 +685,19 @@ parse_args (int * pargc, char *** pargv)
 	  flag_traditional_format = 1;
 	  break;
 
+	case OPTION_MULTIBYTE_HANDLING:
+	  if (strcmp (optarg, "allow") == 0)
+	    multibyte_handling = multibyte_allow;
+	  else if (strcmp (optarg, "warn") == 0)
+	    multibyte_handling = multibyte_warn;
+	  else if (strcmp (optarg, "warn-sym-only") == 0)
+	    multibyte_handling = multibyte_warn_syms;
+	  else if (strcmp (optarg, "warn_sym_only") == 0)
+	    multibyte_handling = multibyte_warn_syms;
+	  else
+	    as_fatal (_("unexpected argument to --multibyte-input-option: '%s'"), optarg);
+	  break;
+
 	case OPTION_VERSION:
 	  /* This output is intended to follow the GNU standards document.  */
 	  printf (_("GNU assembler %s\n"), BFD_VERSION_STRING);
diff -rupN binutils.orig/gas/as.h binutils-2.37/gas/as.h
--- binutils.orig/gas/as.h	2021-11-18 16:50:38.834089551 +0000
+++ binutils-2.37/gas/as.h	2021-11-18 16:50:42.531075627 +0000
@@ -344,6 +344,14 @@ COMMON int linkrelax;
 
 COMMON int do_not_pad_sections_to_alignment;
 
+enum multibyte_input_handling
+{
+  multibyte_allow = 0,
+  multibyte_warn,
+  multibyte_warn_syms
+};
+COMMON enum multibyte_input_handling multibyte_handling;
+
 /* TRUE if we should produce a listing.  */
 extern int listing;
 
@@ -450,6 +458,7 @@ void   input_scrub_insert_file (char *);
 char * input_scrub_new_file (const char *);
 char * input_scrub_next_buffer (char **bufp);
 size_t do_scrub_chars (size_t (*get) (char *, size_t), char *, size_t);
+bool   scan_for_multibyte_characters (const unsigned char *, const unsigned char *, bool);
 int    gen_to_words (LITTLENUM_TYPE *, int, long);
 int    had_err (void);
 int    ignore_input (void);
diff -rupN binutils.orig/gas/doc/as.texi binutils-2.37/gas/doc/as.texi
--- binutils.orig/gas/doc/as.texi	2021-11-18 16:50:38.838089536 +0000
+++ binutils-2.37/gas/doc/as.texi	2021-11-18 16:50:42.535075612 +0000
@@ -245,6 +245,7 @@ gcc(1), ld(1), and the Info entries for
  [@b{--sectname-subst}] [@b{--size-check=[error|warning]}]
  [@b{--elf-stt-common=[no|yes]}]
  [@b{--generate-missing-build-notes=[no|yes]}]
+ [@b{--multibyte-handling=[allow|warn|warn-sym-only]}]
  [@b{--target-help}] [@var{target-options}]
  [@b{--}|@var{files} @dots{}]
 @c
@@ -866,6 +867,18 @@ Set the maximum width of an input source
 Set the maximum number of lines printed in a listing for a single line of input
 to @var{number} + 1.
 
+@item --multibyte-handling=allow
+@itemx --multibyte-handling=warn
+@itemx --multibyte-handling=warn-sym-only
+Controls how the assembler handles multibyte characters in the input.  The
+default (which can be restored by using the @option{allow} argument) is to
+allow such characters without complaint.  Using the @option{warn} argument will
+make the assembler generate a warning message whenever any multibyte character
+is encountered.  Using the @option{warn-sym-only} argument will only cause a
+warning to be generated when a symbol is defined with a name that contains
+multibyte characters.  (References to undefined symbols will not generate a
+warning).
+
 @item --no-pad-sections
 Stop the assembler for padding the ends of output sections to the alignment
 of that section.  The default is to pad the sections, but this can waste space
@@ -2942,9 +2955,11 @@ are noted in @ref{Machine Dependencies}.
 @end ifset
 No symbol may begin with a digit.  Case is significant.
 There is no length limit; all characters are significant.  Multibyte characters
-are supported.  Symbols are delimited by characters not in that set, or by the
-beginning of a file (since the source program must end with a newline, the end
-of a file is not a possible symbol delimiter).  @xref{Symbols}.
+are supported, but note that the setting of the
+@option{--multibyte-handling} option might prevent their use.  Symbols
+are delimited by characters not in that set, or by the beginning of a file
+(since the source program must end with a newline, the end of a file is not a
+possible symbol delimiter).  @xref{Symbols}.
 
 Symbol names may also be enclosed in double quote @code{"} characters.  In such
 cases any characters are allowed, except for the NUL character.  If a double
@@ -3834,11 +3849,18 @@ than @code{Foo}.
 Symbol names do not start with a digit.  An exception to this rule is made for
 Local Labels.  See below.
 
-Multibyte characters are supported.  To generate a symbol name containing
+Multibyte characters are supported, but note that the setting of the
+@option{multibyte-handling} option might prevent their use.
+To generate a symbol name containing
 multibyte characters enclose it within double quotes and use escape codes. cf
 @xref{Strings}.  Generating a multibyte symbol name from a label is not
 currently supported.
 
+Since multibyte symbol names are unusual, and could possibly be used
+maliciously, @command{@value{AS}} provides a command line option
+(@option{--multibyte-handling=warn-sym-only}) which can be used to generate a
+warning message whenever a symbol name containing multibyte characters is defined.
+
 Each symbol has exactly one name.  Each name in an assembly language program
 refers to exactly one symbol.  You may use that symbol name any number of times
 in a program.
diff -rupN binutils.orig/gas/input-scrub.c binutils-2.37/gas/input-scrub.c
--- binutils.orig/gas/input-scrub.c	2021-11-18 16:50:38.835089547 +0000
+++ binutils-2.37/gas/input-scrub.c	2021-11-18 16:50:42.535075612 +0000
@@ -377,6 +377,11 @@ input_scrub_next_buffer (char **bufp)
 	  ++p;
 	}
 
+      if (multibyte_handling == multibyte_warn)
+	(void) scan_for_multibyte_characters ((const unsigned char *) p,
+					      (const unsigned char *) limit,
+					      true /* Generate warnings */);
+
       /* We found a newline in the newly read chars.  */
       partial_where = p;
       partial_size = limit - p;
diff -rupN binutils.orig/gas/symbols.c binutils-2.37/gas/symbols.c
--- binutils.orig/gas/symbols.c	2021-11-18 16:50:39.105088530 +0000
+++ binutils-2.37/gas/symbols.c	2021-11-18 16:52:17.980716107 +0000
@@ -78,6 +78,10 @@ struct symbol_flags
      before.  It is cleared as soon as any direct reference to the
      symbol is present.  */
   unsigned int weakrefd : 1;
+
+  /* Set when a warning about the symbol containing multibyte characters
+     is generated.  */
+  unsigned int multibyte_warned : 1;
 };
 
 /* A pointer in the symbol may point to either a complete symbol
@@ -194,7 +198,7 @@ static void *
 symbol_entry_find (htab_t table, const char *name)
 {
   hashval_t hash = htab_hash_string (name);
-  symbol_entry_t needle = { { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  symbol_entry_t needle = { { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
 			      hash, name, 0, 0, 0 } };
   return htab_find_with_hash (table, &needle, hash);
 }
@@ -305,6 +309,18 @@ symbol_init (symbolS *symbolP, const cha
   symbolP->bsym->name = name;
   symbolP->bsym->section = sec;
 
+  if (multibyte_handling == multibyte_warn_syms
+      && ! symbolP->flags.local_symbol
+      && sec != undefined_section
+      && ! symbolP->flags.multibyte_warned
+      && scan_for_multibyte_characters ((const unsigned char *) name,
+					(const unsigned char *) name + strlen (name),
+					false /* Do not warn.  */))
+    {
+      as_warn (_("symbol '%s' contains multibyte characters"), name);
+      symbolP->flags.multibyte_warned = 1;
+    }
+
   S_SET_VALUE (symbolP, valu);
 
   symbol_clear_list_pointers (symbolP);
@@ -2413,7 +2429,21 @@ S_SET_SEGMENT (symbolS *s, segT seg)
 	abort ();
     }
   else
-    s->bsym->section = seg;
+    {
+      if (multibyte_handling == multibyte_warn_syms
+	  && ! s->flags.local_symbol
+	  && seg != undefined_section
+	  && ! s->flags.multibyte_warned
+	  && scan_for_multibyte_characters ((const unsigned char *) s->name,
+					    (const unsigned char *) s->name + strlen (s->name),
+					    false))
+	{
+	  as_warn (_("symbol '%s' contains multibyte characters"), s->name);
+	  s->flags.multibyte_warned = 1;
+	}
+
+      s->bsym->section = seg;
+    }
 }
 
 void
diff -rupN binutils.orig/gas/testsuite/gas/all/gas.exp binutils-2.37/gas/testsuite/gas/all/gas.exp
--- binutils.orig/gas/testsuite/gas/all/gas.exp	2021-11-18 16:50:39.101088545 +0000
+++ binutils-2.37/gas/testsuite/gas/all/gas.exp	2021-11-18 16:50:42.538075600 +0000
@@ -494,3 +494,5 @@ run_dump_test "nop"
 run_dump_test "asciz"
 run_dump_test "pr27384"
 run_dump_test "pr27381"
+run_dump_test "multibyte1"
+run_dump_test "multibyte2"
diff -rupN binutils.orig/testsuite/gas/all/multibyte.s binutils-2.37/testsuite/gas/all/multibyte.s
--- binutils.orig/testsuite/gas/all/multibyte.s	1970-01-01 01:00:00.000000000 +0100
+++ binutils-2.37/testsuite/gas/all/multibyte.s	2021-11-18 16:50:42.541075589 +0000
@@ -0,0 +1,8 @@
+	.text
+	.globl	he‮oll‬
+he‮oll‬:
+	.nop
+	
+	.globl	hello
+hello:
+	.nop
diff -rupN binutils.orig/testsuite/gas/all/multibyte1.d binutils-2.37/testsuite/gas/all/multibyte1.d
--- binutils.orig/testsuite/gas/all/multibyte1.d	1970-01-01 01:00:00.000000000 +0100
+++ binutils-2.37/testsuite/gas/all/multibyte1.d	2021-11-18 16:50:42.541075589 +0000
@@ -0,0 +1,3 @@
+#source: multibyte.s
+#as: --multibyte-handling=warn
+#warning_output: multibyte1.l
diff -rupN binutils.orig/testsuite/gas/all/multibyte1.l binutils-2.37/testsuite/gas/all/multibyte1.l
--- binutils.orig/testsuite/gas/all/multibyte1.l	1970-01-01 01:00:00.000000000 +0100
+++ binutils-2.37/testsuite/gas/all/multibyte1.l	2021-11-18 16:50:42.541075589 +0000
@@ -0,0 +1,12 @@
+[^:]*: Assembler messages:
+[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0x80\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0xae\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0x80\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0xac\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0x80\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0xae\) encountered in .*multibyte.s
+[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s
+[^:]*: Warning: further multibyte character warnings suppressed
diff -rupN binutils.orig/testsuite/gas/all/multibyte2.d binutils-2.37/testsuite/gas/all/multibyte2.d
--- binutils.orig/testsuite/gas/all/multibyte2.d	1970-01-01 01:00:00.000000000 +0100
+++ binutils-2.37/testsuite/gas/all/multibyte2.d	2021-11-18 16:50:42.542075585 +0000
@@ -0,0 +1,3 @@
+#source: multibyte.s
+#as: --multibyte-handling=warn-sym-only
+#warning_output: multibyte2.l
diff -rupN binutils.orig/testsuite/gas/all/multibyte2.l binutils-2.37/testsuite/gas/all/multibyte2.l
--- binutils.orig/testsuite/gas/all/multibyte2.l	1970-01-01 01:00:00.000000000 +0100
+++ binutils-2.37/testsuite/gas/all/multibyte2.l	2021-11-18 16:50:42.541075589 +0000
@@ -0,0 +1,2 @@
+[^:]*: Assembler messages:
+[^:]*:3: Warning: symbol '.*' contains multibyte characters