From d5c106a95c49508f5e214f2fa174968eee2352fc Mon Sep 17 00:00:00 2001
From: christos <christos>
Date: Sat, 6 Jun 2015 21:19:07 +0000
Subject: [PATCH] PR/437: Fix handling of invalid unicode characters. tcsh uses
 the high order bits to encode attributes in the prompt and the high bit in
 regular characters. Make the drawing routines take an argument indicating if
 we are drawing the prompt or not, so that we can decide how to deal with the
 high bits. This solution is the minimum diff and does not allow "large valued"
 unicode characters to be in the prompt (because they would conflict with the
 attribute bits). A better solution would be to have a struct for each
 character so we could encode extra attributes.

---
 Fixes        |  1 +
 ed.chared.c  |  2 +-
 ed.refresh.c | 54 ++++++++++++++++++++++++++++++++++++++++++------------
 ed.xmap.c    |  2 +-
 sh.file.c    |  2 +-
 sh.glob.c    |  9 +++++++--
 sh.h         | 15 ++++++++++++---
 sh.hist.c    |  2 +-
 sh.misc.c    | 22 ++++++++++++++++++----
 tc.func.c    |  7 +++++++
 tc.nls.c     | 40 ++++++++++++++++++++++++++++++----------
 tc.nls.h     |  3 ++-
 tc.printf.c  |  2 +-
 tc.str.c     | 22 ++++++++++++++++++----
 tw.parse.c   |  7 ++++++-
 15 files changed, 148 insertions(+), 42 deletions(-)

diff --git a/Fixes b/Fixes
index 7d0ceac..aa779b1 100644
--- a/Fixes
+++ b/Fixes
@@ -1,3 +1,4 @@
+  2. PR/437: Fix handling of invalid unicode characters.
   1. PR/451: Fix error messages containing %c to be always '%c'
 
  41. V6.19.00 - 20150521
diff --git a/ed.chared.c b/ed.chared.c
index bade211..c0bd41b 100644
--- a/ed.chared.c
+++ b/ed.chared.c
@@ -3387,7 +3387,7 @@ e_stuff_char(Char c)
          (void) Cookedmode();
 
      (void) xwrite(SHIN, "\n", 1);
-     len = one_wctomb(buf, c & CHAR);
+     len = one_wctomb(buf, c);
      for (i = 0; i < len; i++)
 	 (void) ioctl(SHIN, TIOCSTI, (ioctl_t) &buf[i]);
 
diff --git a/ed.refresh.c b/ed.refresh.c
index 9e6da00..a88c5e5 100644
--- a/ed.refresh.c
+++ b/ed.refresh.c
@@ -46,7 +46,7 @@ static int vcursor_h, vcursor_v;
 static int rprompt_h, rprompt_v;
 
 static	int	MakeLiteral		(Char *, int, Char);
-static	int	Draw 			(Char *, int);
+static	int	Draw 			(Char *, int, int);
 static	void	Vdraw 			(Char, int);
 static	void	RefreshPromptpart	(Char *);
 static	void	update_line 		(Char *, Char *, int);
@@ -159,15 +159,44 @@ static int MakeLiteral(Char *str, int len, Char addlit)
     return i | LITERAL;
 }
 
+/* draw char at cp, expand tabs, ctl chars */
 static int
-Draw(Char *cp, int nocomb)	/* draw char at cp, expand tabs, ctl chars */
+Draw(Char *cp, int nocomb, int drawPrompt)
 {
     int w, i, lv, lh;
     Char c, attr;
 
+#ifdef WIDE_STRINGS
+    if (!drawPrompt) {			/* draw command-line */
+	attr = 0;
+	c = *cp;
+    } else {				/* draw prompt */
+	/* prompt with attributes(UNDER,BOLD,STANDOUT) */
+	if (*cp & (UNDER | BOLD | STANDOUT)) {		/* *cp >= STANDOUT */
+
+	    /* example)
+	     * We can't distinguish whether (*cp=)0x02ffffff is
+	     * U+02FFFFFF or U+00FFFFFF|STANDOUT.
+	     * We handle as U+00FFFFFF|STANDOUT, only when drawing prompt. */
+	    attr = (*cp & ATTRIBUTES);
+	    /* ~(UNDER | BOLD | STANDOUT) = 0xf1ffffff */
+	    c = *cp & ~(UNDER | BOLD | STANDOUT);
+
+	    /* if c is a control code, we handle *cp as having no attributes */
+	    if ((c < 0x20 && c >= 0) || c == 0x7f) {
+		attr = 0;
+		c = *cp;
+	    }
+	} else {			/* prompt without attributes */
+	    attr = 0;
+	    c = *cp;
+	}
+    }
+#else
     attr = *cp & ~CHAR;
     c = *cp & CHAR;
-    w = NLSClassify(c, nocomb);
+#endif
+    w = NLSClassify(c, nocomb, drawPrompt);
     switch (w) {
 	case NLSCLASS_NL:
 	    Vdraw('\0', 0);		/* assure end of line	 */
@@ -201,10 +230,11 @@ Draw(Char *cp, int nocomb)	/* draw char at cp, expand tabs, ctl chars */
 	case NLSCLASS_ILLEGAL2:
 	case NLSCLASS_ILLEGAL3:
 	case NLSCLASS_ILLEGAL4:
-	    Vdraw('\\' | attr, 1);
-	    Vdraw('U' | attr, 1);
-	    Vdraw('+' | attr, 1);
-	    for (i = 8 * NLSCLASS_ILLEGAL_SIZE(w) - 4; i >= 0; i -= 4)
+	case NLSCLASS_ILLEGAL5:
+	    Vdraw('\\', 1);
+	    Vdraw('U', 1);
+	    Vdraw('+', 1);
+	    for (i = 16 + 4 * (-w-5); i >= 0; i -= 4)
 		Vdraw("0123456789ABCDEF"[(c >> i) & 15] | attr, 1);
 	    break;
 	case 0:
@@ -302,7 +332,7 @@ RefreshPromptpart(Char *buf)
 	    }
 	}
 	else
-	    cp += Draw(cp, cp == buf);
+	    cp += Draw(cp, cp == buf, 1);
     }
 }
 
@@ -354,7 +384,7 @@ Refresh(void)
 	    cur_v = vcursor_v;
 	    Cursor = cp;
 	}
-	cp += Draw(cp, cp == InputBuf);
+	cp += Draw(cp, cp == InputBuf, 0);
     }
 
     if (cur_h == -1) {		/* if I haven't been set yet, I'm at the end */
@@ -1126,7 +1156,7 @@ RefCursor(void)
 	    cp++;
 	    continue;
 	}
-	w = NLSClassify(*cp & CHAR, cp == Prompt);
+	w = NLSClassify(*cp & CHAR, cp == Prompt, 0);
 	cp++;
 	switch(w) {
 	    case NLSCLASS_NL:
@@ -1158,7 +1188,7 @@ RefCursor(void)
     }
 
     for (cp = InputBuf; cp < Cursor;) {	/* do input buffer to Cursor */
-	w = NLSClassify(*cp & CHAR, cp == InputBuf);
+	w = NLSClassify(*cp & CHAR, cp == InputBuf, 0);
 	cp++;
 	switch(w) {
 	    case NLSCLASS_NL:
@@ -1251,7 +1281,7 @@ RefPlusOne(int l)
     }
     cp = Cursor - l;
     c = *cp & CHAR;
-    w = NLSClassify(c, cp == InputBuf);
+    w = NLSClassify(c, cp == InputBuf, 0);
     switch(w) {
 	case NLSCLASS_CTRL:
 	    PutPlusOne('^', 1);
diff --git a/ed.xmap.c b/ed.xmap.c
index 6e1d56e..36bce1e 100644
--- a/ed.xmap.c
+++ b/ed.xmap.c
@@ -743,7 +743,7 @@ unparsestring(const CStr *str, const Char *sep)
 	    *b++ = (unsigned char) p;
 	}
 	else if (p == ' ' || (Isprint(p) && !Isspace(p)))
-	    b += one_wctomb((char *)b, p & CHAR);
+	    b += one_wctomb((char *)b, p);
 	else {
 	    *b++ = '\\';
 	    *b++ = ((p >> 6) & 7) + '0';
diff --git a/sh.file.c b/sh.file.c
index 343b774..3989d8a 100644
--- a/sh.file.c
+++ b/sh.file.c
@@ -249,7 +249,7 @@ pushback(const Char *string)
 	char buf[MB_LEN_MAX];
 	size_t i, len;
 
-	len = one_wctomb(buf, *p & CHAR);
+	len = one_wctomb(buf, *p);
 	for (i = 0; i < len; i++)
 	    (void) ioctl(SHOUT, TIOCSTI, (ioctl_t) &buf[i]);
     }
diff --git a/sh.glob.c b/sh.glob.c
index fc510bf..7d008aa 100644
--- a/sh.glob.c
+++ b/sh.glob.c
@@ -594,8 +594,13 @@ trim(Char **t)
     Char *p;
 
     while ((p = *t++) != '\0')
-	while (*p)
-	    *p++ &= TRIM;
+	while (*p) {
+#if INVALID_BYTE != 0
+	    if ((*p & INVALID_BYTE) != INVALID_BYTE)	/* *p < INVALID_BYTE */
+#endif
+		*p &= TRIM;
+	    p++;
+	}
 }
 
 int
diff --git a/sh.h b/sh.h
index e71a24e..75de557 100644
--- a/sh.h
+++ b/sh.h
@@ -707,14 +707,21 @@ extern struct sigaction parterm;	/* Parents terminate catch */
 #define		ASCII		0177
 #ifdef WIDE_STRINGS		/* Implies SHORT_STRINGS */
 /* 31st char bit used for 'ing (not 32nd, we want all values nonnegative) */
-# define	QUOTE		0x40000000
-# define	TRIM		0x3FFFFFFF /* Mask to strip quote bit */
+/*
+ * Notice
+ *
+ * Because of the fix for handling unicode file names, the 32nd bit is used.
+ * We need to use '&' instead of '>' or '<' when comparing with INVALID_BYTE etc.
+ * Casting to uChar is not recommended,
+ *  because Char is 4 bytes but uChar is 8 bytes on I32LP64. */
+# define	QUOTE		0x80000000
+# define	TRIM		0x7FFFFFFF /* Mask to strip quote bit */
 # define	UNDER		0x08000000 /* Underline flag */
 # define	BOLD		0x04000000 /* Bold flag */
 # define	STANDOUT	0x02000000 /* Standout flag */
 # define	LITERAL		0x01000000 /* Literal character flag */
 # define	ATTRIBUTES	0x0F000000 /* The bits used for attributes */
-# define	INVALID_BYTE	0x00800000 /* Invalid character on input */
+# define	INVALID_BYTE	0xF0000000 /* Invalid character on input */
 # ifdef SOLARIS2
 #  define	CHAR		0x30FFFFFF /* Mask to mask out the character */
 # else
@@ -743,6 +750,8 @@ extern struct sigaction parterm;	/* Parents terminate catch */
 #endif
 #define		CHAR_DBWIDTH	(LITERAL|(LITERAL-1))
 
+# define 	MAX_UTF32	0x7FFFFFFF	/* max UTF32 is U+7FFFFFFF */
+
 EXTERN int     AsciiOnly;	/* If set only 7 bits expected in characters */
 
 /*
diff --git a/sh.hist.c b/sh.hist.c
index b8f71b7..c0eded5 100644
--- a/sh.hist.c
+++ b/sh.hist.c
@@ -1199,7 +1199,7 @@ fmthist(int fmt, ptr_t ptr)
 	    buf = xmalloc(Strlen(istr) * MB_LEN_MAX + 1);
 
 	    for (p = buf, ip = istr; *ip != '\0'; ip++)
-		p += one_wctomb(p, CHAR & *ip);
+		p += one_wctomb(p, *ip);
 
 	    *p = '\0';
 	    xfree(istr);
diff --git a/sh.misc.c b/sh.misc.c
index 7232b12..233ba5f 100644
--- a/sh.misc.c
+++ b/sh.misc.c
@@ -450,8 +450,13 @@ strip(Char *cp)
 
     if (!cp)
 	return (cp);
-    while ((*dp++ &= TRIM) != '\0')
-	continue;
+    while (*dp != '\0') {
+#if INVALID_BYTE != 0
+	if ((*dp & INVALID_BYTE) != INVALID_BYTE)    /* *dp < INVALID_BYTE */
+#endif
+		*dp &= TRIM;
+	dp++;
+    }
     return (cp);
 }
 
@@ -462,8 +467,17 @@ quote(Char *cp)
 
     if (!cp)
 	return (cp);
-    while (*dp != '\0')
-	*dp++ |= QUOTE;
+    while (*dp != '\0') {
+#ifdef WIDE_STRINGS
+	if ((*dp & 0xffffff80) == 0)	/* *dp < 0x80 */
+#elif defined SHORT_STRINGS
+	if ((*dp & 0xff80) == 0)	/* *dp < 0x80 */
+#else
+	if ((*dp & 0x80) == 0)		/* *dp < 0x80 */
+#endif
+	    *dp |= QUOTE;
+	dp++;
+    }
     return (cp);
 }
 
diff --git a/tc.func.c b/tc.func.c
index 2b28a68..5a909d6 100644
--- a/tc.func.c
+++ b/tc.func.c
@@ -124,7 +124,14 @@ expand_lex(const struct wordent *sp0, int from, int to)
 			(((*s & TRIM) == '\\') && (prev_c != '\\')))) {
 		    Strbuf_append1(&buf, '\\');
 		}
+#if INVALID_BYTE != 0
+		if ((*s & INVALID_BYTE) != INVALID_BYTE) /* *s < INVALID_BYTE */
+		    Strbuf_append1(&buf, *s & TRIM);
+		else
+		    Strbuf_append1(&buf, *s);
+#else
 		Strbuf_append1(&buf, *s & TRIM);
+#endif
 		prev_c = *s;
 	    }
 	    Strbuf_append1(&buf, ' ');
diff --git a/tc.nls.c b/tc.nls.c
index 2c38f3f..22ad173 100644
--- a/tc.nls.c
+++ b/tc.nls.c
@@ -64,7 +64,11 @@ NLSWidth(Char c)
 {
 # ifdef HAVE_WCWIDTH
     int l;
+#if INVALID_BYTE != 0
+    if ((c & INVALID_BYTE) == INVALID_BYTE)	/* c >= INVALID_BYTE */
+#else
     if (c & INVALID_BYTE)
+#endif
 	return 1;
     l = xwcwidth((wchar_t) c);
     return l >= 0 ? l : 0;
@@ -116,12 +120,36 @@ NLSChangeCase(const Char *p, int mode)
 }
 
 int
-NLSClassify(Char c, int nocomb)
+NLSClassify(Char c, int nocomb, int drawPrompt)
 {
     int w;
-    if (c & INVALID_BYTE)
+#ifndef SHORT_STRINGS
+    if ((c & 0x80) != 0)		/* c >= 0x80 */
 	return NLSCLASS_ILLEGAL;
+#endif
+    if (!drawPrompt) {			/* draw command-line */
+#if INVALID_BYTE != 0
+	if ((c & INVALID_BYTE) == INVALID_BYTE)		/* c >= INVALID_BYTE */
+	    return NLSCLASS_ILLEGAL;
+	if ((c & INVALID_BYTE) == QUOTE && (c & 0x80) == 0)	/* c >= QUOTE */
+	    return 1;
+	if (c >= 0x10000000)		/* U+10000000 = FC 90 80 80 80 80 */
+	    return NLSCLASS_ILLEGAL5;
+	if (c >= 0x1000000)		/*  U+1000000 = F9 80 80 80 80 */
+	    return NLSCLASS_ILLEGAL4;
+	if (c >= 0x100000)		/*   U+100000 = F4 80 80 80 */
+	    return NLSCLASS_ILLEGAL3;
+#endif
+	if (c >= 0x10000)		/*    U+10000 = F0 90 80 80 */
+	    return NLSCLASS_ILLEGAL2;
+    }
     w = NLSWidth(c);
+    if (drawPrompt) {			/* draw prompt */
+	if (w > 0)
+	    return w;
+	if (w == 0)
+	    return 1;
+    }
     if ((w > 0 && !(Iscntrl(c) && (c & CHAR) < 0x100)) || (Isprint(c) && !nocomb))
 	return w;
     if (Iscntrl(c) && (c & CHAR) < 0x100) {
@@ -131,13 +159,5 @@ NLSClassify(Char c, int nocomb)
 	    return NLSCLASS_TAB;
 	return NLSCLASS_CTRL;
     }
-#ifdef WIDE_STRINGS
-    if (c >= 0x1000000)
-	return NLSCLASS_ILLEGAL4;
-    if (c >= 0x10000)
-	return NLSCLASS_ILLEGAL3;
-#endif
-    if (c >= 0x100)
-	return NLSCLASS_ILLEGAL2;
     return NLSCLASS_ILLEGAL;
 }
diff --git a/tc.nls.h b/tc.nls.h
index 4d27741..6930682 100644
--- a/tc.nls.h
+++ b/tc.nls.h
@@ -43,7 +43,7 @@ extern int NLSStringWidth (const Char *);
 #endif
 
 extern Char *NLSChangeCase (const Char *, int);
-extern int NLSClassify (Char, int);
+extern int NLSClassify (Char, int, int);
 
 #define NLSCLASS_CTRL		(-1)
 #define NLSCLASS_TAB		(-2)
@@ -52,6 +52,7 @@ extern int NLSClassify (Char, int);
 #define NLSCLASS_ILLEGAL2	(-5)
 #define NLSCLASS_ILLEGAL3	(-6)
 #define NLSCLASS_ILLEGAL4	(-7)
+#define NLSCLASS_ILLEGAL5	(-8)
 
 #define NLSCLASS_ILLEGAL_SIZE(x) (-(x) - (-(NLSCLASS_ILLEGAL) - 1))
 
diff --git a/tc.printf.c b/tc.printf.c
index 7f2612d..c6be145 100644
--- a/tc.printf.c
+++ b/tc.printf.c
@@ -289,7 +289,7 @@ doprnt(void (*addchar) (int), const char *sfmt, va_list ap)
 			(*addchar) ('\\' | attributes);
 			count++;
 		    }
-		    len = one_wctomb(cbuf, *Bp & CHAR);
+		    len = one_wctomb(cbuf, *Bp);
 		    for (pos = 0; pos < len; pos++) {
 			(*addchar) ((unsigned char)cbuf[pos] | attributes
 				    | (*Bp & ATTRIBUTES));
diff --git a/tc.str.c b/tc.str.c
index c407cb8..c2b5ac8 100644
--- a/tc.str.c
+++ b/tc.str.c
@@ -66,10 +66,24 @@ one_wctomb(char *s, Char wchar)
 {
     int len;
 
-    if (wchar & INVALID_BYTE) {
-	s[0] = wchar & 0xFF;
+#if INVALID_BYTE != 0
+    if ((wchar & INVALID_BYTE) == INVALID_BYTE) {    /* wchar >= INVALID_BYTE */
+	/* invalid char
+	 * example)
+	 * if wchar = f0000090(=90|INVALID_BYTE), then *s = ffffff90 */
+	*s = (char)wchar;
 	len = 1;
+#else
+    if (wchar & (CHAR & INVALID_BYTE)) {
+	s[0] = wchar & (CHAR & 0xFF);
+	len = 1;
+#endif
     } else {
+#if INVALID_BYTE != 0
+	wchar &= MAX_UTF32;
+#else
+	wchar &= CHAR;
+#endif
 #ifdef UTF16_STRINGS
 	if (wchar >= 0x10000) {
 	    /* UTF-16 systems can't handle these values directly in calls to
@@ -224,7 +238,7 @@ short2str(const Char *src)
     dst = sdst;
     edst = &dst[dstsize];
     while (*src) {
-	dst += one_wctomb(dst, *src & CHAR);
+	dst += one_wctomb(dst, *src);
 	src++;
 	if (dst >= edst) {
 	    char *wdst = dst;
@@ -544,7 +558,7 @@ short2qstr(const Char *src)
 		dst = &edst[-MALLOC_INCR];
 	    }
 	}
-	dst += one_wctomb(dst, *src & CHAR);
+	dst += one_wctomb(dst, *src);
 	src++;
 	if (dst >= edst) {
 	    ptrdiff_t i = dst - edst;
diff --git a/tw.parse.c b/tw.parse.c
index 8309ed8..94982d6 100644
--- a/tw.parse.c
+++ b/tw.parse.c
@@ -618,7 +618,14 @@ insert_meta(const Char *cp, const Char *cpend, const Char *word,
 	    break;
 
 	wq = w & QUOTE;
-	w &= ~QUOTE;
+#if INVALID_BYTE != 0
+	/*
+	 * Keep invalid-byte markers intact: with WIDE_STRINGS, INVALID_BYTE
+	 * (0xF0000000) contains the QUOTE bit, so only strip QUOTE otherwise.
+	 */
+	if ((w & INVALID_BYTE) != INVALID_BYTE)		/* w < INVALID_BYTE */
+#endif
+	    w &= ~QUOTE;
 
 	if (cmap(w, _ESC | _QF))
 	    wq = QUOTE;		/* quotes are always quoted */
-- 
2.5.5