churchyard / rpms / python38

Forked from rpms/python38 2 years ago
Clone
3b36b4
diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst
31fe33
index 195f63f..0d0a127 100644
3b36b4
--- a/Doc/using/cmdline.rst
3b36b4
+++ b/Doc/using/cmdline.rst
31fe33
@@ -713,6 +713,40 @@ conflict.
3b36b4
 
3b36b4
    .. versionadded:: 3.6
3b36b4
 
3b36b4
+
3b36b4
+.. envvar:: PYTHONCOERCECLOCALE
3b36b4
+
31fe33
+   If set to the value ``0``, causes the main Python command line application
3b36b4
+   to skip coercing the legacy ASCII-based C locale to a more capable UTF-8
3b36b4
+   based alternative. Note that this setting is checked even when the
3b36b4
+   :option:`-E` or :option:`-I` options are used, as it is handled prior to
3b36b4
+   the processing of command line options.
3b36b4
+
31fe33
+   If this variable is *not* set, or is set to a value other than ``0``, and
31fe33
+   the current locale reported for the ``LC_CTYPE`` category is the default
31fe33
+   ``C`` locale, then the Python CLI will attempt to configure one of the
31fe33
+   following locales for the given locale categories before loading the
31fe33
+   interpreter runtime:
3b36b4
+
31fe33
+   * ``C.UTF-8`` (``LC_ALL``)
31fe33
+   * ``C.utf8`` (``LC_ALL``)
31fe33
+   * ``UTF-8`` (``LC_CTYPE``)
3b36b4
+
3b36b4
+   If setting one of these locale categories succeeds, then the matching
31fe33
+   environment variables will be set (both ``LC_ALL`` and ``LANG`` for the
31fe33
+   ``LC_ALL`` category, and ``LC_CTYPE`` for the ``LC_CTYPE`` category) in
31fe33
+   the current process environment before the Python runtime is initialized.
31fe33
+
31fe33
+   Configuring one of these locales (either explicitly or via the above
31fe33
+   implicit locale coercion) will automatically set the error handler for
31fe33
+   :data:`sys.stdin` and :data:`sys.stdout` to ``surrogateescape``. This
31fe33
+   behavior can be overridden using :envvar:`PYTHONIOENCODING` as usual.
3b36b4
+
3b36b4
+   Availability: \*nix
3b36b4
+
3b36b4
+   .. versionadded:: 3.7
3b36b4
+      See :pep:`538` for more details.
3b36b4
+
3b36b4
 Debug-mode variables
3b36b4
 ~~~~~~~~~~~~~~~~~~~~
3b36b4
 
3b36b4
diff --git a/Lib/test/support/script_helper.py b/Lib/test/support/script_helper.py
31fe33
index ca5f9c2..7aa460b 100644
3b36b4
--- a/Lib/test/support/script_helper.py
3b36b4
+++ b/Lib/test/support/script_helper.py
3b36b4
@@ -51,8 +51,35 @@ def interpreter_requires_environment():
3b36b4
     return __cached_interp_requires_environment
3b36b4
 
3b36b4
 
3b36b4
-_PythonRunResult = collections.namedtuple("_PythonRunResult",
3b36b4
-                                          ("rc", "out", "err"))
3b36b4
+class _PythonRunResult(collections.namedtuple("_PythonRunResult",
3b36b4
+                                          ("rc", "out", "err"))):
3b36b4
+    """Helper for reporting Python subprocess run results"""
3b36b4
+    def fail(self, cmd_line):
3b36b4
+        """Provide helpful details about failed subcommand runs"""
3b36b4
+        # Limit each stream to its last 80 * 100 bytes, decoded as ASCII
3b36b4
+        maxlen = 80 * 100
3b36b4
+        out, err = self.out, self.err
3b36b4
+        if len(out) > maxlen:
3b36b4
+            out = b'(... truncated stdout ...)' + out[-maxlen:]
3b36b4
+        if len(err) > maxlen:
3b36b4
+            err = b'(... truncated stderr ...)' + err[-maxlen:]
3b36b4
+        out = out.decode('ascii', 'replace').rstrip()
3b36b4
+        err = err.decode('ascii', 'replace').rstrip()
3b36b4
+        raise AssertionError("Process return code is %d\n"
3b36b4
+                             "command line: %r\n"
3b36b4
+                             "\n"
3b36b4
+                             "stdout:\n"
3b36b4
+                             "---\n"
3b36b4
+                             "%s\n"
3b36b4
+                             "---\n"
3b36b4
+                             "\n"
3b36b4
+                             "stderr:\n"
3b36b4
+                             "---\n"
3b36b4
+                             "%s\n"
3b36b4
+                             "---"
3b36b4
+                             % (self.rc, cmd_line,
3b36b4
+                                out,
3b36b4
+                                err))
3b36b4
 
3b36b4
 
3b36b4
 # Executing the interpreter in a subprocess
31fe33
@@ -110,30 +137,7 @@ def run_python_until_end(*args, **env_vars):
3b36b4
 def _assert_python(expected_success, *args, **env_vars):
3b36b4
     res, cmd_line = run_python_until_end(*args, **env_vars)
3b36b4
     if (res.rc and expected_success) or (not res.rc and not expected_success):
3b36b4
-        # Limit to 80 lines to ASCII characters
3b36b4
-        maxlen = 80 * 100
3b36b4
-        out, err = res.out, res.err
3b36b4
-        if len(out) > maxlen:
3b36b4
-            out = b'(... truncated stdout ...)' + out[-maxlen:]
3b36b4
-        if len(err) > maxlen:
3b36b4
-            err = b'(... truncated stderr ...)' + err[-maxlen:]
3b36b4
-        out = out.decode('ascii', 'replace').rstrip()
3b36b4
-        err = err.decode('ascii', 'replace').rstrip()
3b36b4
-        raise AssertionError("Process return code is %d\n"
3b36b4
-                             "command line: %r\n"
3b36b4
-                             "\n"
3b36b4
-                             "stdout:\n"
3b36b4
-                             "---\n"
3b36b4
-                             "%s\n"
3b36b4
-                             "---\n"
3b36b4
-                             "\n"
3b36b4
-                             "stderr:\n"
3b36b4
-                             "---\n"
3b36b4
-                             "%s\n"
3b36b4
-                             "---"
3b36b4
-                             % (res.rc, cmd_line,
3b36b4
-                                out,
3b36b4
-                                err))
3b36b4
+        res.fail(cmd_line)
3b36b4
     return res
3b36b4
 
3b36b4
 def assert_python_ok(*args, **env_vars):
3b36b4
diff --git a/Lib/test/test_capi.py b/Lib/test/test_capi.py
31fe33
index 2a53f3d..391ca15 100644
3b36b4
--- a/Lib/test/test_capi.py
3b36b4
+++ b/Lib/test/test_capi.py
31fe33
@@ -369,14 +369,15 @@ class EmbeddingTests(unittest.TestCase):
31fe33
     def tearDown(self):
31fe33
         os.chdir(self.oldcwd)
31fe33
 
31fe33
-    def run_embedded_interpreter(self, *args):
31fe33
+    def run_embedded_interpreter(self, *args, env=None):
31fe33
         """Runs a test in the embedded interpreter"""
31fe33
         cmd = [self.test_exe]
31fe33
         cmd.extend(args)
31fe33
         p = subprocess.Popen(cmd,
31fe33
                              stdout=subprocess.PIPE,
31fe33
                              stderr=subprocess.PIPE,
31fe33
-                             universal_newlines=True)
31fe33
+                             universal_newlines=True,
31fe33
+                             env=env)
31fe33
         (out, err) = p.communicate()
31fe33
         self.assertEqual(p.returncode, 0,
31fe33
                          "bad returncode %d, stderr is %r" %
31fe33
@@ -386,7 +387,7 @@ class EmbeddingTests(unittest.TestCase):
3b36b4
     def test_subinterps(self):
3b36b4
         # This is just a "don't crash" test
3b36b4
         out, err = self.run_embedded_interpreter("repeated_init_and_subinterpreters")
3b36b4
-        if support.verbose:
3b36b4
+        if support.verbose > 1:
3b36b4
             print()
3b36b4
             print(out)
3b36b4
             print(err)
31fe33
@@ -403,13 +404,14 @@ class EmbeddingTests(unittest.TestCase):
31fe33
 
3b36b4
     def test_forced_io_encoding(self):
3b36b4
         # Checks forced configuration of embedded interpreter IO streams
31fe33
-        out, err = self.run_embedded_interpreter("forced_io_encoding")
3b36b4
-        if support.verbose:
31fe33
+        env = {"PYTHONIOENCODING": "UTF-8:surrogateescape"}
31fe33
+        out, err = self.run_embedded_interpreter("forced_io_encoding", env=env)
3b36b4
+        if support.verbose > 1:
3b36b4
             print()
3b36b4
             print(out)
3b36b4
             print(err)
3b36b4
-        expected_errors = sys.__stdout__.errors
3b36b4
-        expected_stdin_encoding = sys.__stdin__.encoding
3b36b4
+        expected_errors = "surrogateescape"
3b36b4
+        expected_stdin_encoding = "UTF-8"
3b36b4
         expected_pipe_encoding = self._get_default_pipe_encoding()
3b36b4
         expected_output = '\n'.join([
3b36b4
         "--- Use defaults ---",
3b36b4
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
31fe33
index ae2bcd4..0a302ff 100644
3b36b4
--- a/Lib/test/test_cmd_line.py
3b36b4
+++ b/Lib/test/test_cmd_line.py
3b36b4
@@ -9,8 +9,9 @@ import sys
3b36b4
 import subprocess
3b36b4
 import tempfile
Iryna Shcherbina aba719
 from test.support import script_helper, is_android
3b36b4
-from test.support.script_helper import (spawn_python, kill_python, assert_python_ok,
3b36b4
-    assert_python_failure)
3b36b4
+from test.support.script_helper import (
3b36b4
+    spawn_python, kill_python, assert_python_ok, assert_python_failure
3b36b4
+)
3b36b4
 
3b36b4
 
3b36b4
 # XXX (ncoghlan): Move to script_helper and make consistent with run_python
3b36b4
@@ -151,6 +152,7 @@ class CmdLineTest(unittest.TestCase):
3b36b4
         env = os.environ.copy()
3b36b4
         # Use C locale to get ascii for the locale encoding
3b36b4
         env['LC_ALL'] = 'C'
3b36b4
+        env['PYTHONCOERCECLOCALE'] = '0'
3b36b4
         code = (
3b36b4
             b'import locale; '
3b36b4
             b'print(ascii("' + undecodable + b'"), '
3b36b4
diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
3b36b4
index df9ebd4..63145e4 100644
3b36b4
--- a/Lib/test/test_sys.py
3b36b4
+++ b/Lib/test/test_sys.py
3b36b4
@@ -680,6 +680,7 @@ class SysModuleTest(unittest.TestCase):
3b36b4
         # Force the POSIX locale
3b36b4
         env = os.environ.copy()
3b36b4
         env["LC_ALL"] = "C"
3b36b4
+        env["PYTHONCOERCECLOCALE"] = "0"
3b36b4
         code = '\n'.join((
3b36b4
             'import sys',
3b36b4
             'def dump(name):',
3b36b4
diff --git a/Programs/_testembed.c b/Programs/_testembed.c
31fe33
index a68d4fa..e28de1c 100644
3b36b4
--- a/Programs/_testembed.c
3b36b4
+++ b/Programs/_testembed.c
3b36b4
@@ -1,4 +1,5 @@
3b36b4
-#include <Python.h>
3b36b4
+#include "Python.h"
3b36b4
+#include "pyconfig.h"
3b36b4
 #include <stdio.h>
3b36b4
 
3b36b4
 /*********************************************************
31fe33
@@ -126,6 +127,20 @@ static int test_forced_io_encoding(void)
3b36b4
     return 0;
3b36b4
 }
3b36b4
 
3b36b4
+static int test_c_locale_warning(void)
3b36b4
+{
3b36b4
+#ifdef PY_WARN_ON_C_LOCALE
3b36b4
+    /* Force use of the C locale */
3b36b4
+    setenv("LC_ALL", "C", 1);
3b36b4
+
3b36b4
+    _testembed_Py_Initialize();
3b36b4
+    Py_Finalize();
3b36b4
+#else
3b36b4
+    printf("C locale compatibility warning disabled at compile time\n");
3b36b4
+#endif
3b36b4
+    return 0;
3b36b4
+}
3b36b4
+
3b36b4
 /* *********************************************************
3b36b4
  * List of test cases and the function that implements it.
3b36b4
  * 
31fe33
@@ -147,6 +162,7 @@ struct TestCase
3b36b4
 static struct TestCase TestCases[] = {
3b36b4
     { "forced_io_encoding", test_forced_io_encoding },
3b36b4
     { "repeated_init_and_subinterpreters", test_repeated_init_and_subinterpreters },
3b36b4
+    { "c_locale_warning", test_c_locale_warning },
3b36b4
     { NULL, NULL }
3b36b4
 };
3b36b4
 
3b36b4
diff --git a/Programs/python.c b/Programs/python.c
31fe33
index a7afbc7..03f8295 100644
3b36b4
--- a/Programs/python.c
3b36b4
+++ b/Programs/python.c
31fe33
@@ -15,6 +15,21 @@ wmain(int argc, wchar_t **argv)
3b36b4
 }
3b36b4
 #else
3b36b4
 
31fe33
+/* Access private pylifecycle helper API to better handle the legacy C locale
3b36b4
+ *
3b36b4
+ * The legacy C locale assumes ASCII as the default text encoding, which
3b36b4
+ * causes problems not only for the CPython runtime, but also other
3b36b4
+ * components like GNU readline.
3b36b4
+ *
3b36b4
+ * Accordingly, when the CLI detects it, it attempts to coerce it to a
3b36b4
+ * more capable UTF-8 based alternative.
3b36b4
+ *
3b36b4
+ * See the documentation of the PYTHONCOERCECLOCALE setting for more details.
3b36b4
+ *
3b36b4
+ */
31fe33
+extern int _Py_LegacyLocaleDetected(void);
31fe33
+extern void _Py_CoerceLegacyLocale(void);
3b36b4
+
31fe33
 int
31fe33
 main(int argc, char **argv)
31fe33
 {
31fe33
@@ -25,7 +40,11 @@ main(int argc, char **argv)
31fe33
     char *oldloc;
31fe33
 
31fe33
     /* Force malloc() allocator to bootstrap Python */
31fe33
+#ifdef Py_DEBUG
31fe33
+    (void)_PyMem_SetupAllocators("malloc_debug");
31fe33
+#  else
31fe33
     (void)_PyMem_SetupAllocators("malloc");
31fe33
+#  endif
31fe33
 
31fe33
     argv_copy = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1));
31fe33
     argv_copy2 = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1));
31fe33
@@ -49,7 +68,21 @@ main(int argc, char **argv)
31fe33
         return 1;
31fe33
     }
31fe33
 
31fe33
+#ifdef __ANDROID__
31fe33
+    /* Passing "" to setlocale() on Android requests the C locale rather
31fe33
+     * than checking environment variables, so request C.UTF-8 explicitly
31fe33
+     */
31fe33
+    setlocale(LC_ALL, "C.UTF-8");
31fe33
+#else
31fe33
+    /* Reconfigure the locale to the default for this process */
31fe33
     setlocale(LC_ALL, "");
31fe33
+#endif
31fe33
+
31fe33
+    if (_Py_LegacyLocaleDetected()) {
31fe33
+        _Py_CoerceLegacyLocale();
31fe33
+    }
31fe33
+
31fe33
+    /* Convert from char to wchar_t based on the locale settings */
31fe33
     for (i = 0; i < argc; i++) {
31fe33
         argv_copy[i] = Py_DecodeLocale(argv[i], NULL);
31fe33
         if (!argv_copy[i]) {
31fe33
@@ -70,7 +103,11 @@ main(int argc, char **argv)
31fe33
 
31fe33
     /* Force again malloc() allocator to release memory blocks allocated
31fe33
        before Py_Main() */
31fe33
+#ifdef Py_DEBUG
31fe33
+    (void)_PyMem_SetupAllocators("malloc_debug");
31fe33
+#  else
31fe33
     (void)_PyMem_SetupAllocators("malloc");
31fe33
+#  endif
31fe33
 
31fe33
     for (i = 0; i < argc; i++) {
31fe33
         PyMem_RawFree(argv_copy2[i]);
31fe33
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
31fe33
index a4f7f82..261ed34 100644
31fe33
--- a/Python/pylifecycle.c
31fe33
+++ b/Python/pylifecycle.c
31fe33
@@ -167,6 +167,7 @@ Py_SetStandardStreamEncoding(const char *encoding, const char *errors)
31fe33
     return 0;
31fe33
 }
31fe33
 
31fe33
+
31fe33
 /* Global initializations.  Can be undone by Py_FinalizeEx().  Don't
31fe33
    call this twice without an intervening Py_FinalizeEx() call.  When
31fe33
    initializations fail, a fatal error is issued and the function does
31fe33
@@ -301,6 +302,173 @@ import_init(PyInterpreterState *interp, PyObject *sysmod)
31fe33
 }
31fe33
 
31fe33
 
31fe33
+/* Helper functions to better handle the legacy C locale
31fe33
+ *
31fe33
+ * The legacy C locale assumes ASCII as the default text encoding, which
31fe33
+ * causes problems not only for the CPython runtime, but also other
31fe33
+ * components like GNU readline.
31fe33
+ *
31fe33
+ * Accordingly, when the CLI detects it, it attempts to coerce it to a
31fe33
+ * more capable UTF-8 based alternative as follows:
31fe33
+ *
31fe33
+ *     if (_Py_LegacyLocaleDetected()) {
31fe33
+ *         _Py_CoerceLegacyLocale();
31fe33
+ *     }
31fe33
+ *
31fe33
+ * See the documentation of the PYTHONCOERCECLOCALE setting for more details.
31fe33
+ *
31fe33
+ * Locale coercion also impacts the default error handler for the standard
31fe33
+ * streams: while the usual default is "strict", the default for the legacy
31fe33
+ * C locale and for any of the coercion target locales is "surrogateescape".
31fe33
+ */
31fe33
+
31fe33
+int
31fe33
+_Py_LegacyLocaleDetected(void)
31fe33
+{
31fe33
+    const char *ctype_loc = setlocale(LC_CTYPE, NULL);
31fe33
+    return ctype_loc != NULL && strcmp(ctype_loc, "C") == 0;
31fe33
+}
3b36b4
+
3b36b4
+typedef struct _CandidateLocale {
3b36b4
+    const char *locale_name;
3b36b4
+    int category;
3b36b4
+} _LocaleCoercionTarget;
3b36b4
+
3b36b4
+static _LocaleCoercionTarget _TARGET_LOCALES[] = {
3b36b4
+    { "C.UTF-8", LC_ALL },
3b36b4
+    { "C.utf8", LC_ALL },
3b36b4
+    { "UTF-8", LC_CTYPE },
3b36b4
+    { NULL, 0 }
3b36b4
+};
3b36b4
+
31fe33
+static char *
31fe33
+get_default_standard_stream_error_handler(void)
31fe33
+{
31fe33
+    const char *ctype_loc = setlocale(LC_CTYPE, NULL);
31fe33
+    if (ctype_loc != NULL) {
31fe33
+        /* "surrogateescape" is the default in the legacy C locale */
31fe33
+        if (strcmp(ctype_loc, "C") == 0) {
31fe33
+            return "surrogateescape";
31fe33
+        }
31fe33
+
31fe33
+        /* "surrogateescape" is the default in locale coercion target locales */
31fe33
+        const _LocaleCoercionTarget *target = NULL;
31fe33
+        for (target = _TARGET_LOCALES; target->locale_name; target++) {
31fe33
+            if (strcmp(ctype_loc, target->locale_name) == 0) {
31fe33
+                return "surrogateescape";
31fe33
+            }
31fe33
+        }
31fe33
+   }
31fe33
+
31fe33
+   /* Otherwise return NULL to request the typical default error handler */
31fe33
+   return NULL;
31fe33
+}
31fe33
+
31fe33
+#ifdef PY_COERCE_C_LOCALE
31fe33
+static const char *_C_LOCALE_COERCION_WARNING =
31fe33
+    "Python detected LC_CTYPE=C: %.20s coerced to %.20s (set another locale "
31fe33
+    "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior).\n";
31fe33
+
31fe33
+static void
3b36b4
+_coerce_default_locale_settings(const _LocaleCoercionTarget *target)
3b36b4
+{
3b36b4
+    const char *newloc = target->locale_name;
3b36b4
+    int category = target->category;
3b36b4
+
3b36b4
+    /* Reset locale back to currently configured defaults */
3b36b4
+    setlocale(LC_ALL, "");
3b36b4
+
3b36b4
+    /* Set the relevant locale environment variables */
3b36b4
+    if (category == LC_ALL) {
3b36b4
+        const char *env_vars_updated = "LC_ALL & LANG";
3b36b4
+        if (setenv("LC_ALL", newloc, 1)) {
3b36b4
+            fprintf(stderr,
3b36b4
+                    "Error setting LC_ALL, skipping C locale coercion\n");
3b36b4
+            return;
3b36b4
+        }
3b36b4
+        if (setenv("LANG", newloc, 1)) {
3b36b4
+            fprintf(stderr,
3b36b4
+                    "Error setting LANG during C locale coercion\n");
3b36b4
+            env_vars_updated = "LC_ALL";
3b36b4
+        }
3b36b4
+        fprintf(stderr, _C_LOCALE_COERCION_WARNING, env_vars_updated, newloc);
3b36b4
+    } else if (category == LC_CTYPE) {
3b36b4
+        if (setenv("LC_CTYPE", newloc, 1)) {
3b36b4
+            fprintf(stderr,
3b36b4
+                    "Error setting LC_CTYPE, skipping C locale coercion\n");
3b36b4
+            return;
3b36b4
+        }
3b36b4
+        fprintf(stderr, _C_LOCALE_COERCION_WARNING, "LC_CTYPE", newloc);
3b36b4
+    } else {
3b36b4
+        fprintf(stderr, "Locale coercion must target LC_ALL or LC_CTYPE\n");
3b36b4
+        return;
3b36b4
+    }
3b36b4
+
3b36b4
+    /* Reconfigure with the overridden environment variables */
3b36b4
+    setlocale(LC_ALL, "");
3b36b4
+}
3b36b4
+
31fe33
+static int
31fe33
+c_locale_coercion_is_expected(void)
3b36b4
+{
31fe33
+    /* This may be called prior to Py_Initialize, so we don't call any other
31fe33
+     * Python APIs, and we ignore the -E and -I flags
31fe33
+     */
3b36b4
+    const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE");
31fe33
+    if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) {
31fe33
+        return 1;
31fe33
+    }
31fe33
+    return 0;
31fe33
+}
31fe33
+#endif
31fe33
+
31fe33
+void
31fe33
+_Py_CoerceLegacyLocale(void)
31fe33
+{
31fe33
+#ifdef PY_COERCE_C_LOCALE
31fe33
+    /* We ignore the Python -E and -I flags here, as the CLI needs to sort out
3b36b4
+     * the locale settings *before* we try to do anything with the command
3b36b4
+     * line arguments. For cross-platform debugging purposes, we also need
3b36b4
+     * to give end users a way to force even scripts that are otherwise
3b36b4
+     * isolated from their environment to use the legacy ASCII-centric C
3b36b4
+     * locale.
3b36b4
+     */
31fe33
+    if (c_locale_coercion_is_expected()) {
3b36b4
+        /* PYTHONCOERCECLOCALE is not set, or is not set to exactly "0" */
3b36b4
+        const _LocaleCoercionTarget *target = NULL;
3b36b4
+        for (target = _TARGET_LOCALES; target->locale_name; target++) {
3b36b4
+            const char *reconfigured_locale = setlocale(target->category,
3b36b4
+                                                        target->locale_name);
3b36b4
+            if (reconfigured_locale != NULL) {
3b36b4
+                /* Successfully configured locale, so make it the default */
3b36b4
+                _coerce_default_locale_settings(target);
3b36b4
+                return;
3b36b4
+            }
3b36b4
+        }
3b36b4
+    }
3b36b4
+    /* No warning here; Py_Initialize emits it if PY_WARN_ON_C_LOCALE is set */
3b36b4
+#endif
31fe33
+}
3b36b4
+
3b36b4
+
3b36b4
+#ifdef PY_WARN_ON_C_LOCALE
3b36b4
+static const char *_C_LOCALE_WARNING =
3b36b4
+    "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
3b36b4
+    "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
3b36b4
+    "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
3b36b4
+    "locales is recommended.\n";
3b36b4
+
3b36b4
+static void
3b36b4
+_emit_stderr_warning_for_c_locale(void)
3b36b4
+{
31fe33
+    if (c_locale_coercion_is_expected()) {
31fe33
+        if (_Py_LegacyLocaleDetected()) {
3b36b4
+            fprintf(stderr, "%s", _C_LOCALE_WARNING);
3b36b4
+        }
3b36b4
+    }
3b36b4
+}
3b36b4
+#endif
3b36b4
+
3b36b4
 void
3b36b4
 _Py_InitializeEx_Private(int install_sigs, int install_importlib)
3b36b4
 {
31fe33
@@ -315,11 +483,19 @@ _Py_InitializeEx_Private(int install_sigs, int install_importlib)
3b36b4
     initialized = 1;
3b36b4
     _Py_Finalizing = NULL;
3b36b4
 
3b36b4
-#ifdef HAVE_SETLOCALE
3b36b4
+#ifdef __ANDROID__
3b36b4
+    /* Passing "" to setlocale() on Android requests the C locale rather
3b36b4
+     * than checking environment variables, so request C.UTF-8 explicitly
3b36b4
+     */
3b36b4
+    setlocale(LC_CTYPE, "C.UTF-8");
3b36b4
+#else
3b36b4
     /* Set up the LC_CTYPE locale, so we can obtain
3b36b4
        the locale's charset without having to switch
3b36b4
        locales. */
3b36b4
     setlocale(LC_CTYPE, "");
3b36b4
+#ifdef PY_WARN_ON_C_LOCALE
3b36b4
+    _emit_stderr_warning_for_c_locale();
3b36b4
+#endif
3b36b4
 #endif
3b36b4
 
3b36b4
     if ((p = Py_GETENV("PYTHONDEBUG")) && *p != '\0')
31fe33
@@ -1242,12 +1418,8 @@ initstdio(void)
31fe33
             }
31fe33
         }
31fe33
         if (!errors && !(pythonioencoding && *pythonioencoding)) {
31fe33
-            /* When the LC_CTYPE locale is the POSIX locale ("C locale"),
31fe33
-               stdin and stdout use the surrogateescape error handler by
31fe33
-               default, instead of the strict error handler. */
31fe33
-            char *loc = setlocale(LC_CTYPE, NULL);
31fe33
-            if (loc != NULL && strcmp(loc, "C") == 0)
31fe33
-                errors = "surrogateescape";
31fe33
+            /* Choose the default error handler based on the current locale */
31fe33
+            errors = get_default_standard_stream_error_handler();
31fe33
         }
31fe33
     }
31fe33
 
3b36b4
diff --git a/configure b/configure
31fe33
index 2915246..39e5a27 100755
3b36b4
--- a/configure
3b36b4
+++ b/configure
3b36b4
@@ -834,6 +834,8 @@ with_thread
3b36b4
 enable_ipv6
3b36b4
 with_doc_strings
3b36b4
 with_pymalloc
3b36b4
+with_c_locale_coercion
3b36b4
+with_c_locale_warning
3b36b4
 with_valgrind
3b36b4
 with_dtrace
3b36b4
 with_fpectl
3b36b4
@@ -1527,6 +1529,12 @@ Optional Packages:
3b36b4
                           deprecated; use --with(out)-threads
3b36b4
   --with(out)-doc-strings disable/enable documentation strings
3b36b4
   --with(out)-pymalloc    disable/enable specialized mallocs
3b36b4
+  --with(out)-c-locale-coercion
3b36b4
+                          disable/enable C locale coercion to a UTF-8 based
3b36b4
+                          locale
3b36b4
+  --with(out)-c-locale-warning
3b36b4
+                          disable/enable locale compatibility warning in the C
3b36b4
+                          locale
3b36b4
   --with-valgrind         Enable Valgrind support
3b36b4
   --with(out)-dtrace      disable/enable DTrace support
3b36b4
   --with-fpectl           enable SIGFPE catching
31fe33
@@ -11010,6 +11018,52 @@ fi
3b36b4
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_pymalloc" >&5
3b36b4
 $as_echo "$with_pymalloc" >&6; }
3b36b4
 
3b36b4
+# Check for --with-c-locale-coercion
3b36b4
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-c-locale-coercion" >&5
3b36b4
+$as_echo_n "checking for --with-c-locale-coercion... " >&6; }
3b36b4
+
3b36b4
+# Check whether --with-c-locale-coercion was given.
3b36b4
+if test "${with_c_locale_coercion+set}" = set; then :
3b36b4
+  withval=$with_c_locale_coercion;
3b36b4
+fi
3b36b4
+
3b36b4
+
3b36b4
+if test -z "$with_c_locale_coercion"
3b36b4
+then
3b36b4
+    with_c_locale_coercion="yes"
3b36b4
+fi
3b36b4
+if test "$with_c_locale_coercion" != "no"
3b36b4
+then
3b36b4
+
3b36b4
+$as_echo "#define PY_COERCE_C_LOCALE 1" >>confdefs.h
3b36b4
+
3b36b4
+fi
3b36b4
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_c_locale_coercion" >&5
3b36b4
+$as_echo "$with_c_locale_coercion" >&6; }
3b36b4
+
3b36b4
+# Check for --with-c-locale-warning
3b36b4
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-c-locale-warning" >&5
3b36b4
+$as_echo_n "checking for --with-c-locale-warning... " >&6; }
3b36b4
+
3b36b4
+# Check whether --with-c-locale-warning was given.
3b36b4
+if test "${with_c_locale_warning+set}" = set; then :
3b36b4
+  withval=$with_c_locale_warning;
3b36b4
+fi
3b36b4
+
3b36b4
+
3b36b4
+if test -z "$with_c_locale_warning"
3b36b4
+then
3b36b4
+    with_c_locale_warning="yes"
3b36b4
+fi
3b36b4
+if test "$with_c_locale_warning" != "no"
3b36b4
+then
3b36b4
+
3b36b4
+$as_echo "#define PY_WARN_ON_C_LOCALE 1" >>confdefs.h
3b36b4
+
3b36b4
+fi
3b36b4
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_c_locale_warning" >&5
3b36b4
+$as_echo "$with_c_locale_warning" >&6; }
3b36b4
+
3b36b4
 # Check for Valgrind support
3b36b4
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-valgrind" >&5
3b36b4
 $as_echo_n "checking for --with-valgrind... " >&6; }
3b36b4
diff --git a/configure.ac b/configure.ac
31fe33
index 67dfba3..b9c9f04 100644
3b36b4
--- a/configure.ac
3b36b4
+++ b/configure.ac
31fe33
@@ -3279,6 +3279,40 @@ then
3b36b4
 fi
3b36b4
 AC_MSG_RESULT($with_pymalloc)
3b36b4
 
3b36b4
+# Check for --with-c-locale-coercion
3b36b4
+AC_MSG_CHECKING(for --with-c-locale-coercion)
3b36b4
+AC_ARG_WITH(c-locale-coercion,
3b36b4
+            AS_HELP_STRING([--with(out)-c-locale-coercion],
3b36b4
+              [disable/enable C locale coercion to a UTF-8 based locale]))
3b36b4
+
3b36b4
+if test -z "$with_c_locale_coercion"
3b36b4
+then
3b36b4
+    with_c_locale_coercion="yes"
3b36b4
+fi
3b36b4
+if test "$with_c_locale_coercion" != "no"
3b36b4
+then
3b36b4
+    AC_DEFINE(PY_COERCE_C_LOCALE, 1,
3b36b4
+      [Define if you want to coerce the C locale to a UTF-8 based locale])
3b36b4
+fi
3b36b4
+AC_MSG_RESULT($with_c_locale_coercion)
3b36b4
+
3b36b4
+# Check for --with-c-locale-warning
3b36b4
+AC_MSG_CHECKING(for --with-c-locale-warning)
3b36b4
+AC_ARG_WITH(c-locale-warning,
3b36b4
+            AS_HELP_STRING([--with(out)-c-locale-warning],
3b36b4
+              [disable/enable locale compatibility warning in the C locale]))
3b36b4
+
3b36b4
+if test -z "$with_c_locale_warning"
3b36b4
+then
3b36b4
+    with_c_locale_warning="yes"
3b36b4
+fi
3b36b4
+if test "$with_c_locale_warning" != "no"
3b36b4
+then
3b36b4
+    AC_DEFINE(PY_WARN_ON_C_LOCALE, 1,
3b36b4
+      [Define to emit a locale compatibility warning in the C locale])
3b36b4
+fi
3b36b4
+AC_MSG_RESULT($with_c_locale_warning)
3b36b4
+
3b36b4
 # Check for Valgrind support
3b36b4
 AC_MSG_CHECKING([for --with-valgrind])
3b36b4
 AC_ARG_WITH([valgrind],
3b36b4
diff --git a/pyconfig.h.in b/pyconfig.h.in
31fe33
index b10c57f..0a6f3e2 100644
3b36b4
--- a/pyconfig.h.in
3b36b4
+++ b/pyconfig.h.in
31fe33
@@ -1244,9 +1244,15 @@
3b36b4
 /* Define as the preferred size in bits of long digits */
3b36b4
 #undef PYLONG_BITS_IN_DIGIT
3b36b4
 
3b36b4
+/* Define if you want to coerce the C locale to a UTF-8 based locale */
3b36b4
+#undef PY_COERCE_C_LOCALE
3b36b4
+
3b36b4
 /* Define to printf format modifier for Py_ssize_t */
3b36b4
 #undef PY_FORMAT_SIZE_T
3b36b4
 
3b36b4
+/* Define to emit a locale compatibility warning in the C locale */
3b36b4
+#undef PY_WARN_ON_C_LOCALE
3b36b4
+
3b36b4
 /* Define if you want to build an interpreter with many run-time checks. */
3b36b4
 #undef Py_DEBUG
3b36b4