churchyard / rpms / python3

Forked from rpms/python3 3 years ago
Clone

Blame 00262-pep538_coerce_legacy_c_locale.patch

3b36b49
diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst
2529623
index 195f63f..8ecd70f 100644
3b36b49
--- a/Doc/using/cmdline.rst
3b36b49
+++ b/Doc/using/cmdline.rst
2529623
@@ -713,6 +713,45 @@ conflict.
3b36b49
 
3b36b49
    .. versionadded:: 3.6
3b36b49
 
3b36b49
+
3b36b49
+.. envvar:: PYTHONCOERCECLOCALE
3b36b49
+
31fe33b
+   If set to the value ``0``, causes the main Python command line application
3b36b49
+   to skip coercing the legacy ASCII-based C locale to a more capable UTF-8
3b36b49
+   based alternative. Note that this setting is checked even when the
3b36b49
+   :option:`-E` or :option:`-I` options are used, as it is handled prior to
3b36b49
+   the processing of command line options.
3b36b49
+
31fe33b
+   If this variable is *not* set, or is set to a value other than ``0``, and
31fe33b
+   the current locale reported for the ``LC_CTYPE`` category is the default
31fe33b
+   ``C`` locale, then the Python CLI will attempt to configure one of the
31fe33b
+   following locales for the given locale categories before loading the
31fe33b
+   interpreter runtime:
3b36b49
+
31fe33b
+   * ``C.UTF-8`` (``LC_ALL``)
31fe33b
+   * ``C.utf8`` (``LC_ALL``)
31fe33b
+   * ``UTF-8`` (``LC_CTYPE``)
3b36b49
+
3b36b49
+   If setting one of these locale categories succeeds, then the matching
31fe33b
+   environment variables will be set (both ``LC_ALL`` and ``LANG`` for the
31fe33b
+   ``LC_ALL`` category, and ``LC_CTYPE`` for the ``LC_CTYPE`` category) in
31fe33b
+   the current process environment before the Python runtime is initialized.
31fe33b
+
31fe33b
+   Configuring one of these locales (either explicitly or via the above
31fe33b
+   implicit locale coercion) will automatically set the error handler for
31fe33b
+   :data:`sys.stdin` and :data:`sys.stdout` to ``surrogateescape``. This
31fe33b
+   behavior can be overridden using :envvar:`PYTHONIOENCODING` as usual.
3b36b49
+
2529623
+   For debugging purposes, setting ``PYTHONCOERCECLOCALE=warn`` will cause
2529623
+   Python to emit warning messages on ``stderr`` if either the locale coercion
2529623
+   activates, or else if a locale that *would* have triggered coercion is
2529623
+   still active when the Python runtime is initialized.
2529623
+
3b36b49
+   Availability: \*nix
3b36b49
+
3b36b49
+   .. versionadded:: 3.7
3b36b49
+      See :pep:`538` for more details.
3b36b49
+
3b36b49
 Debug-mode variables
3b36b49
 ~~~~~~~~~~~~~~~~~~~~
3b36b49
 
3b36b49
diff --git a/Lib/test/support/script_helper.py b/Lib/test/support/script_helper.py
2529623
index ca5f9c2..7aa460b 100644
3b36b49
--- a/Lib/test/support/script_helper.py
3b36b49
+++ b/Lib/test/support/script_helper.py
2529623
@@ -51,8 +51,35 @@ def interpreter_requires_environment():
3b36b49
     return __cached_interp_requires_environment
3b36b49
 
3b36b49
 
3b36b49
-_PythonRunResult = collections.namedtuple("_PythonRunResult",
3b36b49
-                                          ("rc", "out", "err"))
3b36b49
+class _PythonRunResult(collections.namedtuple("_PythonRunResult",
3b36b49
+                                          ("rc", "out", "err"))):
3b36b49
+    """Helper for reporting Python subprocess run results"""
3b36b49
+    def fail(self, cmd_line):
3b36b49
+        """Provide helpful details about failed subcommand runs"""
3b36b49
+        # Keep at most the last 80 * 100 bytes of each stream, decoded as ASCII
3b36b49
+        maxlen = 80 * 100
3b36b49
+        out, err = self.out, self.err
3b36b49
+        if len(out) > maxlen:
3b36b49
+            out = b'(... truncated stdout ...)' + out[-maxlen:]
3b36b49
+        if len(err) > maxlen:
3b36b49
+            err = b'(... truncated stderr ...)' + err[-maxlen:]
3b36b49
+        out = out.decode('ascii', 'replace').rstrip()
3b36b49
+        err = err.decode('ascii', 'replace').rstrip()
3b36b49
+        raise AssertionError("Process return code is %d\n"
3b36b49
+                             "command line: %r\n"
3b36b49
+                             "\n"
3b36b49
+                             "stdout:\n"
3b36b49
+                             "---\n"
3b36b49
+                             "%s\n"
3b36b49
+                             "---\n"
3b36b49
+                             "\n"
3b36b49
+                             "stderr:\n"
3b36b49
+                             "---\n"
3b36b49
+                             "%s\n"
3b36b49
+                             "---"
3b36b49
+                             % (self.rc, cmd_line,
3b36b49
+                                out,
3b36b49
+                                err))
3b36b49
 
3b36b49
 
3b36b49
 # Executing the interpreter in a subprocess
2529623
@@ -110,30 +137,7 @@ def run_python_until_end(*args, **env_vars):
3b36b49
 def _assert_python(expected_success, *args, **env_vars):
3b36b49
     res, cmd_line = run_python_until_end(*args, **env_vars)
3b36b49
     if (res.rc and expected_success) or (not res.rc and not expected_success):
3b36b49
-        # Limit to 80 lines to ASCII characters
3b36b49
-        maxlen = 80 * 100
3b36b49
-        out, err = res.out, res.err
3b36b49
-        if len(out) > maxlen:
3b36b49
-            out = b'(... truncated stdout ...)' + out[-maxlen:]
3b36b49
-        if len(err) > maxlen:
3b36b49
-            err = b'(... truncated stderr ...)' + err[-maxlen:]
3b36b49
-        out = out.decode('ascii', 'replace').rstrip()
3b36b49
-        err = err.decode('ascii', 'replace').rstrip()
3b36b49
-        raise AssertionError("Process return code is %d\n"
3b36b49
-                             "command line: %r\n"
3b36b49
-                             "\n"
3b36b49
-                             "stdout:\n"
3b36b49
-                             "---\n"
3b36b49
-                             "%s\n"
3b36b49
-                             "---\n"
3b36b49
-                             "\n"
3b36b49
-                             "stderr:\n"
3b36b49
-                             "---\n"
3b36b49
-                             "%s\n"
3b36b49
-                             "---"
3b36b49
-                             % (res.rc, cmd_line,
3b36b49
-                                out,
3b36b49
-                                err))
3b36b49
+        res.fail(cmd_line)
3b36b49
     return res
3b36b49
 
3b36b49
 def assert_python_ok(*args, **env_vars):
51bb7c4
diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py
51bb7c4
new file mode 100644
f13050e
index 0000000..635c98f
51bb7c4
--- /dev/null
51bb7c4
+++ b/Lib/test/test_c_locale_coercion.py
f13050e
@@ -0,0 +1,371 @@
51bb7c4
+# Tests the attempted automatic coercion of the C locale to a UTF-8 locale
51bb7c4
+
51bb7c4
+import unittest
f13050e
+import locale
51bb7c4
+import os
51bb7c4
+import sys
51bb7c4
+import sysconfig
51bb7c4
+import shutil
51bb7c4
+import subprocess
51bb7c4
+from collections import namedtuple
51bb7c4
+
51bb7c4
+import test.support
51bb7c4
+from test.support.script_helper import (
51bb7c4
+    run_python_until_end,
51bb7c4
+    interpreter_requires_environment,
51bb7c4
+)
51bb7c4
+
2529623
+# Set our expectation for the default encoding used in the C locale
2529623
+# for the filesystem encoding and the standard streams
f13050e
+
f13050e
+# AIX uses iso8859-1 in the C locale, other *nix platforms use ASCII
f13050e
+if sys.platform.startswith("aix"):
f13050e
+    C_LOCALE_STREAM_ENCODING = "iso8859-1"
f13050e
+else:
f13050e
+    C_LOCALE_STREAM_ENCODING = "ascii"
f13050e
+
f13050e
+# FS encoding is UTF-8 on macOS, other *nix platforms use the locale encoding
2529623
+if sys.platform == "darwin":
2529623
+    C_LOCALE_FS_ENCODING = "utf-8"
2529623
+else:
2529623
+    C_LOCALE_FS_ENCODING = C_LOCALE_STREAM_ENCODING
2529623
+
2529623
+# Note that the above is probably still wrong in some cases, such as:
2529623
+# * Windows when PYTHONLEGACYWINDOWSFSENCODING is set
2529623
+# * AIX and any other platforms that use latin-1 in the C locale
2529623
+#
2529623
+# Options for dealing with this:
2529623
+# * Don't set PY_COERCE_C_LOCALE on such platforms (e.g. Windows doesn't)
2529623
+# * Fix the test expectations to match the actual platform behaviour
2529623
+
51bb7c4
+# In order to get the warning messages to match up as expected, the candidate
51bb7c4
+# order here must match the target locale order in Python/pylifecycle.c
f13050e
+_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8", "UTF-8")
51bb7c4
+
51bb7c4
+# There's no reliable cross-platform way of checking locale alias
51bb7c4
+# lists, so the only way of knowing which of these locales will work
51bb7c4
+# is to try them with locale.setlocale(). We do that in a subprocess
51bb7c4
+# to avoid altering the locale of the test runner.
f13050e
+#
f13050e
+# If the relevant locale module attributes exist, and we're not on a platform
f13050e
+# where we expect it to always succeed, we also check that
f13050e
+# `locale.nl_langinfo(locale.CODESET)` works, as if it fails, the interpreter
f13050e
+# will skip locale coercion for that particular target locale
f13050e
+_check_nl_langinfo_CODESET = bool(
f13050e
+    sys.platform not in ("darwin", "linux") and
f13050e
+    hasattr(locale, "nl_langinfo") and
f13050e
+    hasattr(locale, "CODESET")
f13050e
+)
f13050e
+
51bb7c4
+def _set_locale_in_subprocess(locale_name):
51bb7c4
+    cmd_fmt = "import locale; print(locale.setlocale(locale.LC_CTYPE, '{}'))"
f13050e
+    if _check_nl_langinfo_CODESET:
f13050e
+        # If there's no valid CODESET, we expect coercion to be skipped
f13050e
+        cmd_fmt += "; import sys; sys.exit(not locale.nl_langinfo(locale.CODESET))"
51bb7c4
+    cmd = cmd_fmt.format(locale_name)
51bb7c4
+    result, py_cmd = run_python_until_end("-c", cmd, __isolated=True)
51bb7c4
+    return result.rc == 0
51bb7c4
+
f13050e
+
f13050e
+
2529623
+_fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all"
2529623
+_EncodingDetails = namedtuple("EncodingDetails", _fields)
51bb7c4
+
51bb7c4
+class EncodingDetails(_EncodingDetails):
2529623
+    # XXX (ncoghlan): Using JSON for child state reporting may be less fragile
51bb7c4
+    CHILD_PROCESS_SCRIPT = ";".join([
2529623
+        "import sys, os",
51bb7c4
+        "print(sys.getfilesystemencoding())",
51bb7c4
+        "print(sys.stdin.encoding + ':' + sys.stdin.errors)",
51bb7c4
+        "print(sys.stdout.encoding + ':' + sys.stdout.errors)",
51bb7c4
+        "print(sys.stderr.encoding + ':' + sys.stderr.errors)",
2529623
+        "print(os.environ.get('LANG', 'not set'))",
2529623
+        "print(os.environ.get('LC_CTYPE', 'not set'))",
2529623
+        "print(os.environ.get('LC_ALL', 'not set'))",
51bb7c4
+    ])
51bb7c4
+
51bb7c4
+    @classmethod
2529623
+    def get_expected_details(cls, coercion_expected, fs_encoding, stream_encoding, env_vars):
51bb7c4
+        """Returns expected child process details for a given encoding"""
2529623
+        _stream = stream_encoding + ":{}"
51bb7c4
+        # stdin and stdout should use surrogateescape either because the
51bb7c4
+        # coercion triggered, or because the C locale was detected
51bb7c4
+        stream_info = 2*[_stream.format("surrogateescape")]
51bb7c4
+        # stderr should always use backslashreplace
51bb7c4
+        stream_info.append(_stream.format("backslashreplace"))
2529623
+        expected_lang = env_vars.get("LANG", "not set").lower()
2529623
+        if coercion_expected:
2529623
+            expected_lc_ctype = CLI_COERCION_TARGET.lower()
2529623
+        else:
2529623
+            expected_lc_ctype = env_vars.get("LC_CTYPE", "not set").lower()
2529623
+        expected_lc_all = env_vars.get("LC_ALL", "not set").lower()
2529623
+        env_info = expected_lang, expected_lc_ctype, expected_lc_all
2529623
+        return dict(cls(fs_encoding, *stream_info, *env_info)._asdict())
51bb7c4
+
51bb7c4
+    @staticmethod
51bb7c4
+    def _handle_output_variations(data):
51bb7c4
+        """Adjust the output to handle platform specific idiosyncrasies
51bb7c4
+
51bb7c4
+        * Some platforms report ASCII as ANSI_X3.4-1968
51bb7c4
+        * Some platforms report ASCII as US-ASCII
51bb7c4
+        * Some platforms report UTF-8 instead of utf-8
51bb7c4
+        """
51bb7c4
+        data = data.replace(b"ANSI_X3.4-1968", b"ascii")
51bb7c4
+        data = data.replace(b"US-ASCII", b"ascii")
51bb7c4
+        data = data.lower()
51bb7c4
+        return data
51bb7c4
+
51bb7c4
+    @classmethod
51bb7c4
+    def get_child_details(cls, env_vars):
51bb7c4
+        """Retrieves fsencoding and standard stream details from a child process
51bb7c4
+
51bb7c4
+        Returns (encoding_details, stderr_lines):
51bb7c4
+
51bb7c4
+        - encoding_details: EncodingDetails fields as a dict (eagerly decoded)
51bb7c4
+        - stderr_lines: result of calling splitlines() on the stderr output
51bb7c4
+
51bb7c4
+        The child is run in isolated mode if the current interpreter supports
51bb7c4
+        that.
51bb7c4
+        """
51bb7c4
+        result, py_cmd = run_python_until_end(
51bb7c4
+            "-c", cls.CHILD_PROCESS_SCRIPT,
51bb7c4
+            __isolated=True,
51bb7c4
+            **env_vars
51bb7c4
+        )
51bb7c4
+        if not result.rc == 0:
51bb7c4
+            result.fail(py_cmd)
51bb7c4
+        # All subprocess outputs in this test case should be pure ASCII
51bb7c4
+        adjusted_output = cls._handle_output_variations(result.out)
2529623
+        stdout_lines = adjusted_output.decode("ascii").splitlines()
51bb7c4
+        child_encoding_details = dict(cls(*stdout_lines)._asdict())
51bb7c4
+        stderr_lines = result.err.decode("ascii").rstrip().splitlines()
51bb7c4
+        return child_encoding_details, stderr_lines
51bb7c4
+
51bb7c4
+
51bb7c4
+# Details of the shared library warning emitted at runtime
2529623
+LEGACY_LOCALE_WARNING = (
51bb7c4
+    "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
51bb7c4
+    "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
51bb7c4
+    "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
51bb7c4
+    "locales is recommended."
51bb7c4
+)
51bb7c4
+
51bb7c4
+# Details of the CLI locale coercion warning emitted at runtime
51bb7c4
+CLI_COERCION_WARNING_FMT = (
2529623
+    "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale "
51bb7c4
+    "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)."
51bb7c4
+)
51bb7c4
+
51bb7c4
+
2529623
+AVAILABLE_TARGETS = None
2529623
+CLI_COERCION_TARGET = None
2529623
+CLI_COERCION_WARNING = None
51bb7c4
+
2529623
+def setUpModule():
2529623
+    global AVAILABLE_TARGETS
2529623
+    global CLI_COERCION_TARGET
2529623
+    global CLI_COERCION_WARNING
2529623
+
2529623
+    if AVAILABLE_TARGETS is not None:
2529623
+        # initialization already done
2529623
+        return
2529623
+    AVAILABLE_TARGETS = []
2529623
+
2529623
+    # Find the target locales available in the current system
2529623
+    for target_locale in _C_UTF8_LOCALES:
2529623
+        if _set_locale_in_subprocess(target_locale):
2529623
+            AVAILABLE_TARGETS.append(target_locale)
2529623
+
2529623
+    if AVAILABLE_TARGETS:
2529623
+        # Coercion is expected to use the first available target locale
2529623
+        CLI_COERCION_TARGET = AVAILABLE_TARGETS[0]
2529623
+        CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET)
2529623
+
2529623
+
2529623
+class _LocaleHandlingTestCase(unittest.TestCase):
2529623
+    # Base class to check expected locale handling behaviour
2529623
+
2529623
+    def _check_child_encoding_details(self,
2529623
+                                      env_vars,
2529623
+                                      expected_fs_encoding,
2529623
+                                      expected_stream_encoding,
2529623
+                                      expected_warnings,
2529623
+                                      coercion_expected):
2529623
+        """Check the C locale handling for the given process environment
2529623
+
2529623
+        Parameters:
2529623
+            expected_fs_encoding: expected sys.getfilesystemencoding() result
2529623
+            expected_stream_encoding: expected encoding for standard streams
2529623
+            expected_warnings: list of stderr lines to expect (None for none)
2529623
+        """
2529623
+        result = EncodingDetails.get_child_details(env_vars)
2529623
+        encoding_details, stderr_lines = result
2529623
+        expected_details = EncodingDetails.get_expected_details(
2529623
+            coercion_expected,
2529623
+            expected_fs_encoding,
2529623
+            expected_stream_encoding,
2529623
+            env_vars
51bb7c4
+        )
2529623
+        self.assertEqual(encoding_details, expected_details)
2529623
+        if expected_warnings is None:
2529623
+            expected_warnings = []
2529623
+        self.assertEqual(stderr_lines, expected_warnings)
51bb7c4
+
51bb7c4
+
2529623
+class LocaleConfigurationTests(_LocaleHandlingTestCase):
51bb7c4
+    # Test explicit external configuration via the process environment
51bb7c4
+
2529623
+    def setUpClass():
2529623
+        # This relies on setUpModule() having been run, so it can't be
2529623
+        # handled via the @unittest.skipUnless decorator
2529623
+        if not AVAILABLE_TARGETS:
2529623
+            raise unittest.SkipTest("No C-with-UTF-8 locale available")
2529623
+
51bb7c4
+    def test_external_target_locale_configuration(self):
2529623
+
51bb7c4
+        # Explicitly setting a target locale should give the same behaviour as
51bb7c4
+        # is seen when implicitly coercing to that target locale
51bb7c4
+        self.maxDiff = None
51bb7c4
+
2529623
+        expected_fs_encoding = "utf-8"
2529623
+        expected_stream_encoding = "utf-8"
51bb7c4
+
51bb7c4
+        base_var_dict = {
51bb7c4
+            "LANG": "",
51bb7c4
+            "LC_CTYPE": "",
51bb7c4
+            "LC_ALL": "",
51bb7c4
+        }
51bb7c4
+        for env_var in ("LANG", "LC_CTYPE"):
2529623
+            for locale_to_set in AVAILABLE_TARGETS:
2529623
+                # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
2529623
+                #                 expected, so skip that combination for now
2529623
+                # See https://bugs.python.org/issue30672 for discussion
2529623
+                if env_var == "LANG" and locale_to_set == "UTF-8":
2529623
+                    continue
2529623
+
51bb7c4
+                with self.subTest(env_var=env_var,
51bb7c4
+                                  configured_locale=locale_to_set):
51bb7c4
+                    var_dict = base_var_dict.copy()
51bb7c4
+                    var_dict[env_var] = locale_to_set
51bb7c4
+                    self._check_child_encoding_details(var_dict,
2529623
+                                                       expected_fs_encoding,
2529623
+                                                       expected_stream_encoding,
2529623
+                                                       expected_warnings=None,
2529623
+                                                       coercion_expected=False)
51bb7c4
+
51bb7c4
+
51bb7c4
+
51bb7c4
+@test.support.cpython_only
51bb7c4
+@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
51bb7c4
+                     "C locale coercion disabled at build time")
2529623
+class LocaleCoercionTests(_LocaleHandlingTestCase):
51bb7c4
+    # Test implicit reconfiguration of the environment during CLI startup
51bb7c4
+
2529623
+    def _check_c_locale_coercion(self,
2529623
+                                 fs_encoding, stream_encoding,
2529623
+                                 coerce_c_locale,
2529623
+                                 expected_warnings=None,
2529623
+                                 coercion_expected=True,
2529623
+                                 **extra_vars):
51bb7c4
+        """Check the C locale handling for various configurations
51bb7c4
+
51bb7c4
+        Parameters:
2529623
+            fs_encoding: expected sys.getfilesystemencoding() result
2529623
+            stream_encoding: expected encoding for standard streams
2529623
+            coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
51bb7c4
+              None: don't set the variable at all
51bb7c4
+              str: the value set in the child's environment
2529623
+            expected_warnings: expected warning lines on stderr
2529623
+            extra_vars: additional environment variables to set in subprocess
51bb7c4
+        """
51bb7c4
+        self.maxDiff = None
51bb7c4
+
2529623
+        if not AVAILABLE_TARGETS:
2529623
+            # Locale coercion is disabled when there aren't any target locales
2529623
+            fs_encoding = C_LOCALE_FS_ENCODING
2529623
+            stream_encoding = C_LOCALE_STREAM_ENCODING
2529623
+            coercion_expected = False
2529623
+            if expected_warnings:
2529623
+                expected_warnings = [LEGACY_LOCALE_WARNING]
51bb7c4
+
51bb7c4
+        base_var_dict = {
51bb7c4
+            "LANG": "",
51bb7c4
+            "LC_CTYPE": "",
51bb7c4
+            "LC_ALL": "",
51bb7c4
+        }
2529623
+        base_var_dict.update(extra_vars)
51bb7c4
+        for env_var in ("LANG", "LC_CTYPE"):
51bb7c4
+            for locale_to_set in ("", "C", "POSIX", "invalid.ascii"):
2529623
+                # XXX (ncoghlan): *BSD platforms don't behave as expected in the
2529623
+                #                 POSIX locale, so we skip that for now
2529623
+                # See https://bugs.python.org/issue30672 for discussion
2529623
+                if locale_to_set == "POSIX":
2529623
+                    continue
51bb7c4
+                with self.subTest(env_var=env_var,
51bb7c4
+                                  nominal_locale=locale_to_set,
51bb7c4
+                                  PYTHONCOERCECLOCALE=coerce_c_locale):
51bb7c4
+                    var_dict = base_var_dict.copy()
51bb7c4
+                    var_dict[env_var] = locale_to_set
51bb7c4
+                    if coerce_c_locale is not None:
51bb7c4
+                        var_dict["PYTHONCOERCECLOCALE"] = coerce_c_locale
2529623
+                    # Check behaviour on successful coercion
51bb7c4
+                    self._check_child_encoding_details(var_dict,
2529623
+                                                       fs_encoding,
2529623
+                                                       stream_encoding,
2529623
+                                                       expected_warnings,
2529623
+                                                       coercion_expected)
51bb7c4
+
51bb7c4
+    def test_test_PYTHONCOERCECLOCALE_not_set(self):
51bb7c4
+        # This should coerce to the first available target locale by default
2529623
+        self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None)
51bb7c4
+
51bb7c4
+    def test_PYTHONCOERCECLOCALE_not_zero(self):
2529623
+        # *Any* string other than "0" is considered "set" for our purposes
51bb7c4
+        # and hence should result in the locale coercion being enabled
51bb7c4
+        for setting in ("", "1", "true", "false"):
2529623
+            self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=setting)
2529623
+
2529623
+    def test_PYTHONCOERCECLOCALE_set_to_warn(self):
2529623
+        # PYTHONCOERCECLOCALE=warn enables runtime warnings for legacy locales
2529623
+        self._check_c_locale_coercion("utf-8", "utf-8",
2529623
+                                      coerce_c_locale="warn",
2529623
+                                      expected_warnings=[CLI_COERCION_WARNING])
2529623
+
51bb7c4
+
51bb7c4
+    def test_PYTHONCOERCECLOCALE_set_to_zero(self):
51bb7c4
+        # The setting "0" should result in the locale coercion being disabled
2529623
+        self._check_c_locale_coercion(C_LOCALE_FS_ENCODING,
2529623
+                                      C_LOCALE_STREAM_ENCODING,
2529623
+                                      coerce_c_locale="0",
2529623
+                                      coercion_expected=False)
2529623
+        # Setting LC_ALL=C shouldn't make any difference to the behaviour
2529623
+        self._check_c_locale_coercion(C_LOCALE_FS_ENCODING,
2529623
+                                      C_LOCALE_STREAM_ENCODING,
2529623
+                                      coerce_c_locale="0",
2529623
+                                      LC_ALL="C",
2529623
+                                      coercion_expected=False)
2529623
+
2529623
+    def test_LC_ALL_set_to_C(self):
2529623
+        # Setting LC_ALL should render the locale coercion ineffective
2529623
+        self._check_c_locale_coercion(C_LOCALE_FS_ENCODING,
2529623
+                                      C_LOCALE_STREAM_ENCODING,
2529623
+                                      coerce_c_locale=None,
2529623
+                                      LC_ALL="C",
2529623
+                                      coercion_expected=False)
2529623
+        # And result in a warning about a lack of locale compatibility
2529623
+        self._check_c_locale_coercion(C_LOCALE_FS_ENCODING,
2529623
+                                      C_LOCALE_STREAM_ENCODING,
2529623
+                                      coerce_c_locale="warn",
2529623
+                                      LC_ALL="C",
2529623
+                                      expected_warnings=[LEGACY_LOCALE_WARNING],
2529623
+                                      coercion_expected=False)
51bb7c4
+
51bb7c4
+def test_main():
51bb7c4
+    test.support.run_unittest(
51bb7c4
+        LocaleConfigurationTests,
2529623
+        LocaleCoercionTests
51bb7c4
+    )
51bb7c4
+    test.support.reap_children()
51bb7c4
+
51bb7c4
+if __name__ == "__main__":
51bb7c4
+    test_main()
3b36b49
diff --git a/Lib/test/test_capi.py b/Lib/test/test_capi.py
2529623
index 6c3625d..009f542 100644
3b36b49
--- a/Lib/test/test_capi.py
3b36b49
+++ b/Lib/test/test_capi.py
2529623
@@ -369,14 +369,21 @@ class EmbeddingTests(unittest.TestCase):
31fe33b
     def tearDown(self):
31fe33b
         os.chdir(self.oldcwd)
31fe33b
 
31fe33b
-    def run_embedded_interpreter(self, *args):
31fe33b
+    def run_embedded_interpreter(self, *args, env=None):
31fe33b
         """Runs a test in the embedded interpreter"""
31fe33b
         cmd = [self.test_exe]
31fe33b
         cmd.extend(args)
2529623
+        if env is not None and sys.platform == 'win32':
2529623
+            # Windows requires at least the SYSTEMROOT environment variable to
2529623
+            # start Python.
2529623
+            env = env.copy()
2529623
+            env['SYSTEMROOT'] = os.environ['SYSTEMROOT']
2529623
+
31fe33b
         p = subprocess.Popen(cmd,
31fe33b
                              stdout=subprocess.PIPE,
31fe33b
                              stderr=subprocess.PIPE,
31fe33b
-                             universal_newlines=True)
31fe33b
+                             universal_newlines=True,
31fe33b
+                             env=env)
31fe33b
         (out, err) = p.communicate()
31fe33b
         self.assertEqual(p.returncode, 0,
31fe33b
                          "bad returncode %d, stderr is %r" %
2529623
@@ -386,31 +393,21 @@ class EmbeddingTests(unittest.TestCase):
3b36b49
     def test_subinterps(self):
3b36b49
         # This is just a "don't crash" test
2529623
         out, err = self.run_embedded_interpreter()
3b36b49
-        if support.verbose:
3b36b49
+        if support.verbose > 1:
3b36b49
             print()
3b36b49
             print(out)
3b36b49
             print(err)
31fe33b
 
2529623
-    @staticmethod
2529623
-    def _get_default_pipe_encoding():
2529623
-        rp, wp = os.pipe()
2529623
-        try:
2529623
-            with os.fdopen(wp, 'w') as w:
2529623
-                default_pipe_encoding = w.encoding
2529623
-        finally:
2529623
-            os.close(rp)
2529623
-        return default_pipe_encoding
2529623
-
3b36b49
     def test_forced_io_encoding(self):
3b36b49
         # Checks forced configuration of embedded interpreter IO streams
31fe33b
-        out, err = self.run_embedded_interpreter("forced_io_encoding")
3b36b49
-        if support.verbose:
2529623
+        env = dict(os.environ, PYTHONIOENCODING="utf-8:surrogateescape")
31fe33b
+        out, err = self.run_embedded_interpreter("forced_io_encoding", env=env)
3b36b49
+        if support.verbose > 1:
3b36b49
             print()
3b36b49
             print(out)
3b36b49
             print(err)
3b36b49
-        expected_errors = sys.__stdout__.errors
3b36b49
-        expected_stdin_encoding = sys.__stdin__.encoding
2529623
-        expected_pipe_encoding = self._get_default_pipe_encoding()
2529623
+        expected_stream_encoding = "utf-8"
3b36b49
+        expected_errors = "surrogateescape"
3b36b49
         expected_output = '\n'.join([
3b36b49
         "--- Use defaults ---",
2529623
         "Expected encoding: default",
2529623
@@ -437,8 +434,8 @@ class EmbeddingTests(unittest.TestCase):
2529623
         "stdout: latin-1:replace",
2529623
         "stderr: latin-1:backslashreplace"])
2529623
         expected_output = expected_output.format(
2529623
-                                in_encoding=expected_stdin_encoding,
2529623
-                                out_encoding=expected_pipe_encoding,
2529623
+                                in_encoding=expected_stream_encoding,
2529623
+                                out_encoding=expected_stream_encoding,
2529623
                                 errors=expected_errors)
2529623
         # This is useful if we ever trip over odd platform behaviour
2529623
         self.maxDiff = None
3b36b49
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
2529623
index ae2bcd4..0a302ff 100644
3b36b49
--- a/Lib/test/test_cmd_line.py
3b36b49
+++ b/Lib/test/test_cmd_line.py
2529623
@@ -9,8 +9,9 @@ import sys
3b36b49
 import subprocess
3b36b49
 import tempfile
Iryna Shcherbina aba719b
 from test.support import script_helper, is_android
3b36b49
-from test.support.script_helper import (spawn_python, kill_python, assert_python_ok,
3b36b49
-    assert_python_failure)
3b36b49
+from test.support.script_helper import (
3b36b49
+    spawn_python, kill_python, assert_python_ok, assert_python_failure
3b36b49
+)
3b36b49
 
3b36b49
 
3b36b49
 # XXX (ncoghlan): Move to script_helper and make consistent with run_python
2529623
@@ -151,6 +152,7 @@ class CmdLineTest(unittest.TestCase):
3b36b49
         env = os.environ.copy()
3b36b49
         # Use C locale to get ascii for the locale encoding
3b36b49
         env['LC_ALL'] = 'C'
3b36b49
+        env['PYTHONCOERCECLOCALE'] = '0'
3b36b49
         code = (
3b36b49
             b'import locale; '
3b36b49
             b'print(ascii("' + undecodable + b'"), '
3b36b49
diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
2529623
index df9ebd4..63145e4 100644
3b36b49
--- a/Lib/test/test_sys.py
3b36b49
+++ b/Lib/test/test_sys.py
2529623
@@ -680,6 +680,7 @@ class SysModuleTest(unittest.TestCase):
3b36b49
         # Force the POSIX locale
3b36b49
         env = os.environ.copy()
3b36b49
         env["LC_ALL"] = "C"
3b36b49
+        env["PYTHONCOERCECLOCALE"] = "0"
3b36b49
         code = '\n'.join((
3b36b49
             'import sys',
3b36b49
             'def dump(name):',
2529623
diff --git a/Modules/main.c b/Modules/main.c
2529623
index dd50211..f20cf24 100644
2529623
--- a/Modules/main.c
2529623
+++ b/Modules/main.c
2529623
@@ -105,7 +105,11 @@ static const char usage_6[] =
2529623
 "   predictable seed.\n"
2529623
 "PYTHONMALLOC: set the Python memory allocators and/or install debug hooks\n"
2529623
 "   on Python memory allocators. Use PYTHONMALLOC=debug to install debug\n"
2529623
-"   hooks.\n";
2529623
+"   hooks.\n"
2529623
+
2529623
+"PYTHONCOERCECLOCALE: if this variable is set to 0, it disables the locale\n"
2529623
+"   coercion behavior. Use PYTHONCOERCECLOCALE=warn to request display of\n"
2529623
+"   locale coercion and locale compatibility warnings on stderr.\n";
2529623
 
2529623
 static int
2529623
 usage(int exitcode, const wchar_t* program)
3b36b49
diff --git a/Programs/_testembed.c b/Programs/_testembed.c
2529623
index 3968399..1bd2bbf 100644
3b36b49
--- a/Programs/_testembed.c
3b36b49
+++ b/Programs/_testembed.c
3b36b49
@@ -1,4 +1,5 @@
3b36b49
-#include <Python.h>
3b36b49
+#include "Python.h"
3b36b49
+#include "pyconfig.h"
3b36b49
 #include <stdio.h>
3b36b49
 
3b36b49
 /*********************************************************
3b36b49
diff --git a/Programs/python.c b/Programs/python.c
31fe33b
index a7afbc7..03f8295 100644
3b36b49
--- a/Programs/python.c
3b36b49
+++ b/Programs/python.c
31fe33b
@@ -15,6 +15,21 @@ wmain(int argc, wchar_t **argv)
3b36b49
 }
3b36b49
 #else
3b36b49
 
31fe33b
+/* Access private pylifecycle helper API to better handle the legacy C locale
3b36b49
+ *
3b36b49
+ * The legacy C locale assumes ASCII as the default text encoding, which
3b36b49
+ * causes problems not only for the CPython runtime, but also other
3b36b49
+ * components like GNU readline.
3b36b49
+ *
3b36b49
+ * Accordingly, when the CLI detects it, it attempts to coerce it to a
3b36b49
+ * more capable UTF-8 based alternative.
3b36b49
+ *
3b36b49
+ * See the documentation of the PYTHONCOERCECLOCALE setting for more details.
3b36b49
+ *
3b36b49
+ */
31fe33b
+extern int _Py_LegacyLocaleDetected(void);
31fe33b
+extern void _Py_CoerceLegacyLocale(void);
3b36b49
+
31fe33b
 int
31fe33b
 main(int argc, char **argv)
31fe33b
 {
31fe33b
@@ -25,7 +40,11 @@ main(int argc, char **argv)
31fe33b
     char *oldloc;
31fe33b
 
31fe33b
     /* Force malloc() allocator to bootstrap Python */
31fe33b
+#ifdef Py_DEBUG
31fe33b
+    (void)_PyMem_SetupAllocators("malloc_debug");
31fe33b
+#  else
31fe33b
     (void)_PyMem_SetupAllocators("malloc");
31fe33b
+#  endif
31fe33b
 
31fe33b
     argv_copy = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1));
31fe33b
     argv_copy2 = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1));
31fe33b
@@ -49,7 +68,21 @@ main(int argc, char **argv)
31fe33b
         return 1;
31fe33b
     }
31fe33b
 
31fe33b
+#ifdef __ANDROID__
31fe33b
+    /* Passing "" to setlocale() on Android requests the C locale rather
31fe33b
+     * than checking environment variables, so request C.UTF-8 explicitly
31fe33b
+     */
31fe33b
+    setlocale(LC_ALL, "C.UTF-8");
31fe33b
+#else
31fe33b
+    /* Reconfigure the locale to the default for this process */
31fe33b
     setlocale(LC_ALL, "");
31fe33b
+#endif
31fe33b
+
31fe33b
+    if (_Py_LegacyLocaleDetected()) {
31fe33b
+        _Py_CoerceLegacyLocale();
31fe33b
+    }
31fe33b
+
31fe33b
+    /* Convert from char to wchar_t based on the locale settings */
31fe33b
     for (i = 0; i < argc; i++) {
31fe33b
         argv_copy[i] = Py_DecodeLocale(argv[i], NULL);
31fe33b
         if (!argv_copy[i]) {
31fe33b
@@ -70,7 +103,11 @@ main(int argc, char **argv)
31fe33b
 
31fe33b
     /* Force again malloc() allocator to release memory blocks allocated
31fe33b
        before Py_Main() */
31fe33b
+#ifdef Py_DEBUG
31fe33b
+    (void)_PyMem_SetupAllocators("malloc_debug");
31fe33b
+#  else
31fe33b
     (void)_PyMem_SetupAllocators("malloc");
31fe33b
+#  endif
31fe33b
 
31fe33b
     for (i = 0; i < argc; i++) {
31fe33b
         PyMem_RawFree(argv_copy2[i]);
31fe33b
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
f13050e
index a4f7f82..3843297 100644
31fe33b
--- a/Python/pylifecycle.c
31fe33b
+++ b/Python/pylifecycle.c
31fe33b
@@ -167,6 +167,7 @@ Py_SetStandardStreamEncoding(const char *encoding, const char *errors)
31fe33b
     return 0;
31fe33b
 }
31fe33b
 
31fe33b
+
31fe33b
 /* Global initializations.  Can be undone by Py_FinalizeEx().  Don't
31fe33b
    call this twice without an intervening Py_FinalizeEx() call.  When
31fe33b
    initializations fail, a fatal error is issued and the function does
f13050e
@@ -301,6 +302,183 @@ import_init(PyInterpreterState *interp, PyObject *sysmod)
31fe33b
 }
31fe33b
 
31fe33b
 
31fe33b
+/* Helper functions to better handle the legacy C locale
31fe33b
+ *
31fe33b
+ * The legacy C locale assumes ASCII as the default text encoding, which
31fe33b
+ * causes problems not only for the CPython runtime, but also other
31fe33b
+ * components like GNU readline.
31fe33b
+ *
31fe33b
+ * Accordingly, when the CLI detects it, it attempts to coerce it to a
31fe33b
+ * more capable UTF-8 based alternative as follows:
31fe33b
+ *
31fe33b
+ *     if (_Py_LegacyLocaleDetected()) {
31fe33b
+ *         _Py_CoerceLegacyLocale();
31fe33b
+ *     }
31fe33b
+ *
31fe33b
+ * See the documentation of the PYTHONCOERCECLOCALE setting for more details.
31fe33b
+ *
31fe33b
+ * Locale coercion also impacts the default error handler for the standard
31fe33b
+ * streams: while the usual default is "strict", the default for the legacy
31fe33b
+ * C locale and for any of the coercion target locales is "surrogateescape".
31fe33b
+ */
31fe33b
+
31fe33b
+int
31fe33b
+_Py_LegacyLocaleDetected(void)
31fe33b
+{
2529623
+#ifndef MS_WINDOWS
2529623
+    /* On non-Windows systems, the C locale is considered a legacy locale */
2529623
+    /* XXX (ncoghlan): some platforms (notably Mac OS X) don't appear to treat
2529623
+     *                 the POSIX locale as a simple alias for the C locale, so
2529623
+     *                 we may also want to check for that explicitly.
2529623
+     */
31fe33b
+    const char *ctype_loc = setlocale(LC_CTYPE, NULL);
31fe33b
+    return ctype_loc != NULL && strcmp(ctype_loc, "C") == 0;
2529623
+#else
2529623
+    /* Windows uses code pages instead of locales, so no locale is legacy */
2529623
+    return 0;
2529623
+#endif
2529623
+}
2529623
+
2529623
+
2529623
+static const char *_C_LOCALE_WARNING =
2529623
+    "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
2529623
+    "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
2529623
+    "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
2529623
+    "locales is recommended.\n";
2529623
+
2529623
+static int
2529623
+_legacy_locale_warnings_enabled(void)
2529623
+{
2529623
+    const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE");
2529623
+    return (coerce_c_locale != NULL &&
2529623
+            strncmp(coerce_c_locale, "warn", 5) == 0);
2529623
+}
2529623
+
2529623
+static void
2529623
+_emit_stderr_warning_for_legacy_locale(void)
2529623
+{
2529623
+    if (_legacy_locale_warnings_enabled()) {
2529623
+        if (_Py_LegacyLocaleDetected()) {
2529623
+            fprintf(stderr, "%s", _C_LOCALE_WARNING);
2529623
+        }
2529623
+    }
31fe33b
+}
3b36b49
+
3b36b49
+typedef struct _CandidateLocale {
51bb7c4
+    const char *locale_name; /* The locale to try as a coercion target */
3b36b49
+} _LocaleCoercionTarget;
3b36b49
+
3b36b49
+static _LocaleCoercionTarget _TARGET_LOCALES[] = {
2529623
+    {"C.UTF-8"},
2529623
+    {"C.utf8"},
f13050e
+    {"UTF-8"},
2529623
+    {NULL}
3b36b49
+};
3b36b49
+
31fe33b
+static char *
31fe33b
+get_default_standard_stream_error_handler(void)
31fe33b
+{
31fe33b
+    const char *ctype_loc = setlocale(LC_CTYPE, NULL);
31fe33b
+    if (ctype_loc != NULL) {
31fe33b
+        /* "surrogateescape" is the default in the legacy C locale */
31fe33b
+        if (strcmp(ctype_loc, "C") == 0) {
31fe33b
+            return "surrogateescape";
31fe33b
+        }
31fe33b
+
2529623
+#ifdef PY_COERCE_C_LOCALE
31fe33b
+        /* "surrogateescape" is the default in locale coercion target locales */
31fe33b
+        const _LocaleCoercionTarget *target = NULL;
31fe33b
+        for (target = _TARGET_LOCALES; target->locale_name; target++) {
31fe33b
+            if (strcmp(ctype_loc, target->locale_name) == 0) {
31fe33b
+                return "surrogateescape";
31fe33b
+            }
31fe33b
+        }
2529623
+#endif
31fe33b
+   }
31fe33b
+
31fe33b
+   /* Otherwise return NULL to request the typical default error handler */
31fe33b
+   return NULL;
31fe33b
+}
31fe33b
+
31fe33b
+#ifdef PY_COERCE_C_LOCALE
31fe33b
+static const char *_C_LOCALE_COERCION_WARNING =
2529623
+    "Python detected LC_CTYPE=C: LC_CTYPE coerced to %.20s (set another locale "
31fe33b
+    "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior).\n";
31fe33b
+
31fe33b
+static void
3b36b49
+_coerce_default_locale_settings(const _LocaleCoercionTarget *target)
3b36b49
+{
2529623
+
3b36b49
+    const char *newloc = target->locale_name;
3b36b49
+
3b36b49
+    /* Reset locale back to currently configured defaults */
3b36b49
+    setlocale(LC_ALL, "");
3b36b49
+
2529623
+    /* Set the relevant locale environment variable */
51bb7c4
+    if (setenv("LC_CTYPE", newloc, 1)) {
51bb7c4
+        fprintf(stderr,
51bb7c4
+                "Error setting LC_CTYPE, skipping C locale coercion\n");
51bb7c4
+        return;
51bb7c4
+    }
2529623
+    if (_legacy_locale_warnings_enabled()) {
2529623
+        fprintf(stderr, _C_LOCALE_COERCION_WARNING, newloc);
3b36b49
+    }
3b36b49
+
3b36b49
+    /* Reconfigure with the overridden environment variables */
3b36b49
+    setlocale(LC_ALL, "");
3b36b49
+}
31fe33b
+#endif
31fe33b
+
2529623
+
31fe33b
+void
31fe33b
+_Py_CoerceLegacyLocale(void)
31fe33b
+{
31fe33b
+#ifdef PY_COERCE_C_LOCALE
31fe33b
+    /* We ignore the Python -E and -I flags here, as the CLI needs to sort out
3b36b49
+     * the locale settings *before* we try to do anything with the command
3b36b49
+     * line arguments. For cross-platform debugging purposes, we also need
3b36b49
+     * to give end users a way to force even scripts that are otherwise
3b36b49
+     * isolated from their environment to use the legacy ASCII-centric C
3b36b49
+     * locale.
2529623
+     *
2529623
+     * Ignoring -E and -I is safe from a security perspective, as we only use
2529623
+     * the setting to turn *off* the implicit locale coercion, and anyone with
2529623
+     * access to the process environment already has the ability to set
2529623
+     * `LC_ALL=C` to override the C level locale settings anyway.
2529623
+     */
2529623
+    const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE");
2529623
+    if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) {
2529623
+        /* PYTHONCOERCECLOCALE is not set, or is set to something other than "0" */
51bb7c4
+        const char *locale_override = getenv("LC_ALL");
51bb7c4
+        if (locale_override == NULL || *locale_override == '\0') {
51bb7c4
+            /* LC_ALL is also not set (or is set to an empty string) */
51bb7c4
+            const _LocaleCoercionTarget *target = NULL;
51bb7c4
+            for (target = _TARGET_LOCALES; target->locale_name; target++) {
51bb7c4
+                const char *new_locale = setlocale(LC_CTYPE,
51bb7c4
+                                                   target->locale_name);
51bb7c4
+                if (new_locale != NULL) {
f13050e
+#if !defined(__APPLE__) && defined(HAVE_LANGINFO_H) && defined(CODESET)
f13050e
+                    /* Also ensure that nl_langinfo works in this locale */
f13050e
+                    char *codeset = nl_langinfo(CODESET);
f13050e
+                    if (!codeset || *codeset == '\0') {
f13050e
+                        /* CODESET is unset or empty, so try the next target */
f13050e
+                        new_locale = NULL;
f13050e
+                        setlocale(LC_CTYPE, "");
f13050e
+                        continue;
f13050e
+                    }
f13050e
+#endif
51bb7c4
+                    /* Successfully configured locale, so make it the default */
51bb7c4
+                    _coerce_default_locale_settings(target);
51bb7c4
+                    return;
51bb7c4
+                }
3b36b49
+            }
3b36b49
+        }
3b36b49
+    }
3b36b49
+    /* No warning here; Py_Initialize emits it for PYTHONCOERCECLOCALE=warn */
3b36b49
+#endif
31fe33b
+}
3b36b49
+
3b36b49
+
3b36b49
 void
3b36b49
 _Py_InitializeEx_Private(int install_sigs, int install_importlib)
3b36b49
 {
f13050e
@@ -315,11 +493,19 @@ _Py_InitializeEx_Private(int install_sigs, int install_importlib)
3b36b49
     initialized = 1;
3b36b49
     _Py_Finalizing = NULL;
3b36b49
 
3b36b49
-#ifdef HAVE_SETLOCALE
3b36b49
+#ifdef __ANDROID__
3b36b49
+    /* Passing "" to setlocale() on Android requests the C locale rather
3b36b49
+     * than checking environment variables, so request C.UTF-8 explicitly
3b36b49
+     */
3b36b49
+    setlocale(LC_CTYPE, "C.UTF-8");
3b36b49
+#else
2529623
+#ifndef MS_WINDOWS
3b36b49
     /* Set up the LC_CTYPE locale, so we can obtain
3b36b49
        the locale's charset without having to switch
3b36b49
        locales. */
3b36b49
     setlocale(LC_CTYPE, "");
2529623
+    _emit_stderr_warning_for_legacy_locale();
3b36b49
+#endif
3b36b49
 #endif
3b36b49
 
3b36b49
     if ((p = Py_GETENV("PYTHONDEBUG")) && *p != '\0')
f13050e
@@ -1242,12 +1428,8 @@ initstdio(void)
31fe33b
             }
31fe33b
         }
31fe33b
         if (!errors && !(pythonioencoding && *pythonioencoding)) {
31fe33b
-            /* When the LC_CTYPE locale is the POSIX locale ("C locale"),
31fe33b
-               stdin and stdout use the surrogateescape error handler by
31fe33b
-               default, instead of the strict error handler. */
31fe33b
-            char *loc = setlocale(LC_CTYPE, NULL);
31fe33b
-            if (loc != NULL && strcmp(loc, "C") == 0)
31fe33b
-                errors = "surrogateescape";
31fe33b
+            /* Choose the default error handler based on the current locale */
31fe33b
+            errors = get_default_standard_stream_error_handler();
31fe33b
         }
31fe33b
     }
31fe33b
 
3b36b49
diff --git a/configure b/configure
2529623
index 2915246..39e5a27 100755
3b36b49
--- a/configure
3b36b49
+++ b/configure
3b36b49
@@ -834,6 +834,8 @@ with_thread
3b36b49
 enable_ipv6
3b36b49
 with_doc_strings
3b36b49
 with_pymalloc
3b36b49
+with_c_locale_coercion
3b36b49
+with_c_locale_warning
3b36b49
 with_valgrind
3b36b49
 with_dtrace
3b36b49
 with_fpectl
2529623
@@ -1527,6 +1529,12 @@ Optional Packages:
3b36b49
                           deprecated; use --with(out)-threads
3b36b49
   --with(out)-doc-strings disable/enable documentation strings
3b36b49
   --with(out)-pymalloc    disable/enable specialized mallocs
3b36b49
+  --with(out)-c-locale-coercion
3b36b49
+                          disable/enable C locale coercion to a UTF-8 based
3b36b49
+                          locale
3b36b49
+  --with(out)-c-locale-warning
3b36b49
+                          disable/enable locale compatibility warning in the C
3b36b49
+                          locale
3b36b49
   --with-valgrind         Enable Valgrind support
3b36b49
   --with(out)-dtrace      disable/enable DTrace support
3b36b49
   --with-fpectl           enable SIGFPE catching
2529623
@@ -11010,6 +11018,52 @@ fi
3b36b49
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_pymalloc" >&5
3b36b49
 $as_echo "$with_pymalloc" >&6; }
3b36b49
 
3b36b49
+# Check for --with-c-locale-coercion
3b36b49
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-c-locale-coercion" >&5
3b36b49
+$as_echo_n "checking for --with-c-locale-coercion... " >&6; }
3b36b49
+
3b36b49
+# Check whether --with-c-locale-coercion was given.
3b36b49
+if test "${with_c_locale_coercion+set}" = set; then :
3b36b49
+  withval=$with_c_locale_coercion;
3b36b49
+fi
3b36b49
+
3b36b49
+
3b36b49
+if test -z "$with_c_locale_coercion"
3b36b49
+then
3b36b49
+    with_c_locale_coercion="yes"
3b36b49
+fi
3b36b49
+if test "$with_c_locale_coercion" != "no"
3b36b49
+then
3b36b49
+
3b36b49
+$as_echo "#define PY_COERCE_C_LOCALE 1" >>confdefs.h
3b36b49
+
3b36b49
+fi
3b36b49
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_c_locale_coercion" >&5
3b36b49
+$as_echo "$with_c_locale_coercion" >&6; }
3b36b49
+
3b36b49
+# Check for --with-c-locale-warning
3b36b49
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-c-locale-warning" >&5
3b36b49
+$as_echo_n "checking for --with-c-locale-warning... " >&6; }
3b36b49
+
3b36b49
+# Check whether --with-c-locale-warning was given.
3b36b49
+if test "${with_c_locale_warning+set}" = set; then :
3b36b49
+  withval=$with_c_locale_warning;
3b36b49
+fi
3b36b49
+
3b36b49
+
3b36b49
+if test -z "$with_c_locale_warning"
3b36b49
+then
3b36b49
+    with_c_locale_warning="yes"
3b36b49
+fi
3b36b49
+if test "$with_c_locale_warning" != "no"
3b36b49
+then
3b36b49
+
3b36b49
+$as_echo "#define PY_WARN_ON_C_LOCALE 1" >>confdefs.h
3b36b49
+
3b36b49
+fi
3b36b49
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_c_locale_warning" >&5
3b36b49
+$as_echo "$with_c_locale_warning" >&6; }
3b36b49
+
3b36b49
 # Check for Valgrind support
3b36b49
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-valgrind" >&5
3b36b49
 $as_echo_n "checking for --with-valgrind... " >&6; }
3b36b49
diff --git a/configure.ac b/configure.ac
2529623
index 67dfba3..b9c9f04 100644
3b36b49
--- a/configure.ac
3b36b49
+++ b/configure.ac
2529623
@@ -3279,6 +3279,40 @@ then
3b36b49
 fi
3b36b49
 AC_MSG_RESULT($with_pymalloc)
3b36b49
 
3b36b49
+# Check for --with-c-locale-coercion
3b36b49
+AC_MSG_CHECKING(for --with-c-locale-coercion)
3b36b49
+AC_ARG_WITH(c-locale-coercion,
3b36b49
+            AS_HELP_STRING([--with(out)-c-locale-coercion],
3b36b49
+              [disable/enable C locale coercion to a UTF-8 based locale]))
3b36b49
+
3b36b49
+if test -z "$with_c_locale_coercion"
3b36b49
+then
3b36b49
+    with_c_locale_coercion="yes"
3b36b49
+fi
3b36b49
+if test "$with_c_locale_coercion" != "no"
3b36b49
+then
3b36b49
+    AC_DEFINE(PY_COERCE_C_LOCALE, 1,
3b36b49
+      [Define if you want to coerce the C locale to a UTF-8 based locale])
3b36b49
+fi
3b36b49
+AC_MSG_RESULT($with_c_locale_coercion)
3b36b49
+
3b36b49
+# Check for --with-c-locale-warning
3b36b49
+AC_MSG_CHECKING(for --with-c-locale-warning)
3b36b49
+AC_ARG_WITH(c-locale-warning,
3b36b49
+            AS_HELP_STRING([--with(out)-c-locale-warning],
3b36b49
+              [disable/enable locale compatibility warning in the C locale]))
3b36b49
+
3b36b49
+if test -z "$with_c_locale_warning"
3b36b49
+then
3b36b49
+    with_c_locale_warning="yes"
3b36b49
+fi
3b36b49
+if test "$with_c_locale_warning" != "no"
3b36b49
+then
3b36b49
+    AC_DEFINE(PY_WARN_ON_C_LOCALE, 1,
3b36b49
+      [Define to emit a locale compatibility warning in the C locale])
3b36b49
+fi
3b36b49
+AC_MSG_RESULT($with_c_locale_warning)
3b36b49
+
3b36b49
 # Check for Valgrind support
3b36b49
 AC_MSG_CHECKING([for --with-valgrind])
3b36b49
 AC_ARG_WITH([valgrind],
3b36b49
diff --git a/pyconfig.h.in b/pyconfig.h.in
2529623
index b10c57f..0a6f3e2 100644
3b36b49
--- a/pyconfig.h.in
3b36b49
+++ b/pyconfig.h.in
2529623
@@ -1244,9 +1244,15 @@
3b36b49
 /* Define as the preferred size in bits of long digits */
3b36b49
 #undef PYLONG_BITS_IN_DIGIT
3b36b49
 
3b36b49
+/* Define if you want to coerce the C locale to a UTF-8 based locale */
3b36b49
+#undef PY_COERCE_C_LOCALE
3b36b49
+
3b36b49
 /* Define to printf format modifier for Py_ssize_t */
3b36b49
 #undef PY_FORMAT_SIZE_T
3b36b49
 
3b36b49
+/* Define to emit a locale compatibility warning in the C locale */
3b36b49
+#undef PY_WARN_ON_C_LOCALE
3b36b49
+
3b36b49
 /* Define if you want to build an interpreter with many run-time checks. */
3b36b49
 #undef Py_DEBUG
3b36b49