Tree - rpms/python3 - src.fedoraproject.org

rpms / python3

Blame python3-r80382-r80385-lone-surrogate-and-utf8-error-handler.patch

Blob History Raw

		e9848ab	`Index: Python-3.1.2/Objects/unicodeobject.c`
		211f42f	`===================================================================`
		e9848ab	`--- Python-3.1.2.orig/Objects/unicodeobject.c`
		e9848ab	`+++ Python-3.1.2/Objects/unicodeobject.c`
		e9848ab	`@@ -159,6 +159,12 @@ static PyObject *unicode_encode_call_err`
		211f42f	`const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,`
		211f42f	`Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);`
		211f42f
		211f42f	`+static void raise_encode_exception(PyObject **exceptionObject,`
		211f42f	`+ const char *encoding,`
		211f42f	`+ const Py_UNICODE *unicode, Py_ssize_t size,`
		211f42f	`+ Py_ssize_t startpos, Py_ssize_t endpos,`
		211f42f	`+ const char *reason);`
		211f42f	`+`
		211f42f	`/* Same for linebreaks */`
		211f42f	`static unsigned char ascii_linebreak[] = {`
		211f42f	`0, 0, 0, 0, 0, 0, 0, 0,`
		e9848ab	`@@ -2453,67 +2459,98 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s`
		e9848ab	`for (i = 0; i < size;) {`
		e9848ab	`Py_UCS4 ch = s[i++];`
		e9848ab
		e9848ab	`- if (ch < 0x80)`
		e9848ab	`+ if (ch < 0x80) {`
		e9848ab	`/* Encode ASCII */`
		e9848ab	`*p++ = (char) ch;`
		e9848ab
		e9848ab	`- else if (ch < 0x0800) {`
		e9848ab	`+ } else if (ch < 0x0800) {`
		211f42f	`/* Encode Latin-1 */`
		211f42f	`*p++ = (char)(0xc0 \| (ch >> 6));`
		211f42f	`*p++ = (char)(0x80 \| (ch & 0x3f));`
		211f42f	`- }`
		211f42f	`- else {`
		211f42f	`- /* Encode UCS2 Unicode ordinals */`
		211f42f	`- if (ch < 0x10000) {`
		211f42f	`+ } else if (0xD800 <= ch && ch <= 0xDFFF) {`
		211f42f	`#ifndef Py_UNICODE_WIDE`
		211f42f	`- /* Special case: check for high surrogate */`
		211f42f	`- if (0xD800 <= ch && ch <= 0xDBFF && i != size) {`
		211f42f	`- Py_UCS4 ch2 = s[i];`
		211f42f	`- /* Check for low surrogate and combine the two to`
		211f42f	`- form a UCS4 value */`
		211f42f	`- if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {`
		211f42f	`- ch = ((ch - 0xD800) << 10 \| (ch2 - 0xDC00)) + 0x10000;`
		211f42f	`- i++;`
		211f42f	`- goto encodeUCS4;`
		211f42f	`- }`
		211f42f	`- /* Fall through: handles isolated high surrogates */`
		211f42f	`- }`
		211f42f	`+ /* Special case: check for high and low surrogate */`
		211f42f	`+ if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {`
		211f42f	`+ Py_UCS4 ch2 = s[i];`
		211f42f	`+ /* Combine the two surrogates to form a UCS4 value */`
		211f42f	`+ ch = ((ch - 0xD800) << 10 \| (ch2 - 0xDC00)) + 0x10000;`
		211f42f	`+ i++;`
		211f42f	`+`
		211f42f	`+ /* Encode UCS4 Unicode ordinals */`
		211f42f	`+ *p++ = (char)(0xf0 \| (ch >> 18));`
		211f42f	`+ *p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));`
		211f42f	`+ *p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));`
		211f42f	`+ *p++ = (char)(0x80 \| (ch & 0x3f));`
		211f42f	`+`
		e9848ab	`+ } else {`
		211f42f	`#endif`
		211f42f	`- if (ch >= 0xd800 && ch <= 0xdfff) {`
		211f42f	`- Py_ssize_t newpos;`
		211f42f	`- PyObject *rep;`
		211f42f	`- char *prep;`
		211f42f	`- int k;`
		211f42f	`- rep = unicode_encode_call_errorhandler`
		211f42f	`- (errors, &errorHandler, "utf-8", "surrogates not allowed",`
		211f42f	`- s, size, &exc, i-1, i, &newpos);`
		211f42f	`- if (!rep)`
		e9848ab	`- goto error;`
		e9848ab	`- /* Implementation limitations: only support error handler that return`
		e9848ab	`- bytes, and only support up to four replacement bytes. */`
		e9848ab	`- if (!PyBytes_Check(rep)) {`
		e9848ab	`- PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");`
		e9848ab	`- Py_DECREF(rep);`
		211f42f	`+ Py_ssize_t newpos;`
		211f42f	`+ PyObject *rep;`
		211f42f	`+ Py_ssize_t repsize, k;`
		211f42f	`+ rep = unicode_encode_call_errorhandler`
		211f42f	`+ (errors, &errorHandler, "utf-8", "surrogates not allowed",`
		211f42f	`+ s, size, &exc, i-1, i, &newpos);`
		211f42f	`+ if (!rep)`
		211f42f	`+ goto error;`
		211f42f	`+`
		211f42f	`+ if (PyBytes_Check(rep))`
		211f42f	`+ repsize = PyBytes_GET_SIZE(rep);`
		211f42f	`+ else`
		211f42f	`+ repsize = PyUnicode_GET_SIZE(rep);`
		211f42f	`+`
		211f42f	`+ if (repsize > 4) {`
		211f42f	`+ Py_ssize_t offset;`
		211f42f	`+`
		211f42f	`+ if (result == NULL)`
		211f42f	`+ offset = p - stackbuf;`
		211f42f	`+ else`
		211f42f	`+ offset = p - PyBytes_AS_STRING(result);`
		211f42f	`+`
		211f42f	`+ if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {`
		211f42f	`+ /* integer overflow */`
		211f42f	`+ PyErr_NoMemory();`
		211f42f	`goto error;`
		211f42f	`}`
		211f42f	`- if (PyBytes_Size(rep) > 4) {`
		211f42f	`- PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");`
		211f42f	`- Py_DECREF(rep);`
		211f42f	`- goto error;`
		211f42f	`+ nallocated += repsize - 4;`
		211f42f	`+ if (result != NULL) {`
		211f42f	`+ if (_PyBytes_Resize(&result, nallocated) < 0)`
		211f42f	`+ goto error;`
		211f42f	`+ } else {`
		211f42f	`+ result = PyBytes_FromStringAndSize(NULL, nallocated);`
		211f42f	`+ if (result == NULL)`
		211f42f	`+ goto error;`
		211f42f	`+ Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);`
		211f42f	`}`
		211f42f	`- prep = PyBytes_AsString(rep);`
		211f42f	`- for(k = PyBytes_Size(rep); k > 0; k--)`
		211f42f	`+ p = PyBytes_AS_STRING(result) + offset;`
		211f42f	`+ }`
		211f42f	`+`
		211f42f	`+ if (PyBytes_Check(rep)) {`
		211f42f	`+ char *prep = PyBytes_AS_STRING(rep);`
		211f42f	`+ for(k = repsize; k > 0; k--)`
		211f42f	`*p++ = *prep++;`
		211f42f	`- Py_DECREF(rep);`
		211f42f	`- continue;`
		211f42f	`-`
		211f42f	`+ } else /* rep is unicode */ {`
		211f42f	`+ Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);`
		211f42f	`+ Py_UNICODE c;`
		211f42f	`+`
		211f42f	`+ for(k=0; k<repsize; k++) {`
		211f42f	`+ c = prep[k];`
		211f42f	`+ if (0x80 <= c) {`
		211f42f	`+ raise_encode_exception(&exc, "utf-8", s, size,`
		211f42f	`+ i-1, i, "surrogates not allowed");`
		211f42f	`+ goto error;`
		211f42f	`+ }`
		211f42f	`+ *p++ = (char)prep[k];`
		211f42f	`+ }`
		211f42f	`}`
		211f42f	`- *p++ = (char)(0xe0 \| (ch >> 12));`
		211f42f	`- *p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));`
		211f42f	`- *p++ = (char)(0x80 \| (ch & 0x3f));`
		211f42f	`- continue;`
		211f42f	`+ Py_DECREF(rep);`
		e9848ab	`+#ifndef Py_UNICODE_WIDE`
		211f42f	`}`
		211f42f	`- encodeUCS4:`
		e9848ab	`+#endif`
		211f42f	`+ } else if (ch < 0x10000) {`
		211f42f	`+ *p++ = (char)(0xe0 \| (ch >> 12));`
		211f42f	`+ *p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));`
		211f42f	`+ *p++ = (char)(0x80 \| (ch & 0x3f));`
		211f42f	`+ } else /* ch >= 0x10000 */ {`
		211f42f	`/* Encode UCS4 Unicode ordinals */`
		211f42f	`*p++ = (char)(0xf0 \| (ch >> 18));`
		211f42f	`*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));`
		e9848ab	`Index: Python-3.1.2/Lib/test/test_codecs.py`
		211f42f	`===================================================================`
		e9848ab	`--- Python-3.1.2.orig/Lib/test/test_codecs.py`
		e9848ab	`+++ Python-3.1.2/Lib/test/test_codecs.py`
		e9848ab	`@@ -571,6 +571,16 @@ class UTF8Test(ReadTest):`
		211f42f	`def test_lone_surrogates(self):`
		211f42f	`self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")`
		211f42f	`self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")`
		211f42f	`+ self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),`
		211f42f	`+ b'[\\udc80]')`
		211f42f	`+ self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),`
		211f42f	`+ b'[&#56448;]')`
		211f42f	`+ self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),`
		211f42f	`+ b'[\x80]')`
		211f42f	`+ self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),`
		211f42f	`+ b'[]')`
		211f42f	`+ self.assertEqual("[\uDC80]".encode("utf-8", "replace"),`
		211f42f	`+ b'[?]')`
		211f42f
		211f42f	`def test_surrogatepass_handler(self):`
		211f42f	`self.assertEquals("abc\ud800def".encode("utf-8", "surrogatepass"),`

rpms / python3

Source Code

Blame python3-r80382-r80385-lone-surrogate-and-utf8-error-handler.patch