Skip to content

Commit c46db92

Browse files
bpo-30863: Rewrite PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). (GH-2599)
They no longer cache the wchar_t* representation of string objects.
1 parent df13df4 commit c46db92

File tree

Image for: File tree

2 files changed

Image for: 2 files changed
+123
-123
lines changed

2 files changed

Image for: 2 files changed
+123
-123
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
:c:func:`PyUnicode_AsWideChar` and :c:func:`PyUnicode_AsWideCharString` no
2+
longer cache the ``wchar_t*`` representation of string objects.

‎Objects/unicodeobject.c

Lines changed: 121 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -2921,6 +2921,83 @@ PyUnicode_FromFormat(const char *format, ...)
29212921
return ret;
29222922
}
29232923

2924+
static Py_ssize_t
2925+
unicode_get_widechar_size(PyObject *unicode)
2926+
{
2927+
Py_ssize_t res;
2928+
2929+
assert(unicode != NULL);
2930+
assert(_PyUnicode_CHECK(unicode));
2931+
2932+
if (_PyUnicode_WSTR(unicode) != NULL) {
2933+
return PyUnicode_WSTR_LENGTH(unicode);
2934+
}
2935+
assert(PyUnicode_IS_READY(unicode));
2936+
2937+
res = _PyUnicode_LENGTH(unicode);
2938+
#if SIZEOF_WCHAR_T == 2
2939+
if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
2940+
const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2941+
const Py_UCS4 *end = s + res;
2942+
for (; s < end; ++s) {
2943+
if (*s > 0xFFFF) {
2944+
++res;
2945+
}
2946+
}
2947+
}
2948+
#endif
2949+
return res;
2950+
}
2951+
2952+
static void
2953+
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
2954+
{
2955+
const wchar_t *wstr;
2956+
2957+
assert(unicode != NULL);
2958+
assert(_PyUnicode_CHECK(unicode));
2959+
2960+
wstr = _PyUnicode_WSTR(unicode);
2961+
if (wstr != NULL) {
2962+
memcpy(w, wstr, size * sizeof(wchar_t));
2963+
return;
2964+
}
2965+
assert(PyUnicode_IS_READY(unicode));
2966+
2967+
if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
2968+
const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
2969+
for (; size--; ++s, ++w) {
2970+
*w = *s;
2971+
}
2972+
}
2973+
else {
2974+
#if SIZEOF_WCHAR_T == 4
2975+
assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
2976+
const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
2977+
for (; size--; ++s, ++w) {
2978+
*w = *s;
2979+
}
2980+
#else
2981+
assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2982+
const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2983+
for (; size--; ++s, ++w) {
2984+
Py_UCS4 ch = *s;
2985+
if (ch > 0xFFFF) {
2986+
assert(ch <= MAX_UNICODE);
2987+
/* encode surrogate pair in this case */
2988+
*w++ = Py_UNICODE_HIGH_SURROGATE(ch);
2989+
if (!size--)
2990+
break;
2991+
*w = Py_UNICODE_LOW_SURROGATE(ch);
2992+
}
2993+
else {
2994+
*w = ch;
2995+
}
2996+
}
2997+
#endif
2998+
}
2999+
}
3000+
29243001
#ifdef HAVE_WCHAR_H
29253002

29263003
/* Convert a Unicode object to a wide character string.
@@ -2937,59 +3014,63 @@ PyUnicode_AsWideChar(PyObject *unicode,
29373014
Py_ssize_t size)
29383015
{
29393016
Py_ssize_t res;
2940-
const wchar_t *wstr;
29413017

29423018
if (unicode == NULL) {
29433019
PyErr_BadInternalCall();
29443020
return -1;
29453021
}
2946-
wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2947-
if (wstr == NULL)
3022+
if (!PyUnicode_Check(unicode)) {
3023+
PyErr_BadArgument();
29483024
return -1;
2949-
2950-
if (w != NULL) {
2951-
if (size > res)
2952-
size = res + 1;
2953-
else
2954-
res = size;
2955-
memcpy(w, wstr, size * sizeof(wchar_t));
2956-
return res;
29573025
}
2958-
else
3026+
3027+
res = unicode_get_widechar_size(unicode);
3028+
if (w == NULL) {
29593029
return res + 1;
3030+
}
3031+
3032+
if (size > res) {
3033+
size = res + 1;
3034+
}
3035+
else {
3036+
res = size;
3037+
}
3038+
unicode_copy_as_widechar(unicode, w, size);
3039+
return res;
29603040
}
29613041

29623042
/* PyUnicode_AsWideCharString: return a newly allocated wchar_t copy of
   `unicode`.

   NOTE: this span is a rendered diff.  The code line that follows an
   "NNNN-" marker is pre-commit text that was removed, the line that follows
   an "NNNN+" marker was added, and a marker made of two fused line numbers
   precedes unchanged context.  The description below covers the resulting
   version (context plus added lines).

   - unicode == NULL: PyErr_BadInternalCall() is called and NULL returned.
   - PyUnicode_Check(unicode) fails: PyErr_BadArgument(), NULL returned.
   - buflen is taken from unicode_get_widechar_size(); a buffer of
     buflen + 1 wchar_t units is obtained with PyMem_NEW and filled by
     unicode_copy_as_widechar().  Allocation failure: PyErr_NoMemory(),
     NULL returned.
   - size != NULL: *size receives buflen.
   - size == NULL: wcslen(buffer) must equal buflen; otherwise the buffer is
     released with PyMem_FREE, ValueError("embedded null character") is set
     and NULL is returned.

   The removed lines obtained the data through PyUnicode_AsUnicodeAndSize()
   and copied it with memcpy instead.

   NOTE(review): the buffer comes from PyMem_NEW, so the caller presumably
   releases it with the matching PyMem free call -- not shown here, confirm
   against the API documentation. */
wchar_t*
29633043
PyUnicode_AsWideCharString(PyObject *unicode,
29643044
Py_ssize_t *size)
29653045
{
2966-
const wchar_t *wstr;
29673046
wchar_t *buffer;
29683047
Py_ssize_t buflen;
29693048

29703049
if (unicode == NULL) {
29713050
PyErr_BadInternalCall();
29723051
return NULL;
29733052
}
2974-
2975-
wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
2976-
if (wstr == NULL) {
2977-
return NULL;
2978-
}
2979-
if (size == NULL && wcslen(wstr) != (size_t)buflen) {
2980-
PyErr_SetString(PyExc_ValueError,
2981-
"embedded null character");
3053+
if (!PyUnicode_Check(unicode)) {
3054+
PyErr_BadArgument();
29823055
return NULL;
29833056
}
29843057

2985-
buffer = PyMem_NEW(wchar_t, buflen + 1);
3058+
buflen = unicode_get_widechar_size(unicode);
3059+
buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
29863060
if (buffer == NULL) {
29873061
PyErr_NoMemory();
29883062
return NULL;
29893063
}
2990-
memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
2991-
if (size != NULL)
3064+
unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3065+
if (size != NULL) {
29923066
*size = buflen;
3067+
}
3068+
else if (wcslen(buffer) != (size_t)buflen) {
3069+
PyMem_FREE(buffer);
3070+
PyErr_SetString(PyExc_ValueError,
3071+
"embedded null character");
3072+
return NULL;
3073+
}
29933074
return buffer;
29943075
}
29953076

@@ -3781,118 +3862,35 @@ PyUnicode_AsUTF8(PyObject *unicode)
37813862
/* PyUnicode_AsUnicodeAndSize: return the wchar_t (Py_UNICODE)
   representation stored on `unicode`, creating it on first use.

   NOTE: this span is a rendered diff.  The code line that follows an
   "NNNN-" marker is pre-commit text that was removed, the line that follows
   an "NNNN+" marker was added, and a marker made of two fused line numbers
   precedes unchanged context.  The description below covers the resulting
   version (context plus added lines).

   - PyUnicode_Check(unicode) fails: PyErr_BadArgument(), NULL returned.
   - If _PyUnicode_WSTR(unicode) is already set, it is returned as is.
   - Otherwise wlen = unicode_get_widechar_size(unicode); if wlen + 1
     elements of wchar_t would not fit in PY_SSIZE_T_MAX bytes,
     PyErr_NoMemory() is set and NULL returned.  A buffer of wlen + 1
     wchar_t is obtained with PyObject_MALLOC (failure: PyErr_NoMemory(),
     NULL), filled by unicode_copy_as_widechar(), and stored back into
     _PyUnicode_WSTR(unicode); _PyUnicode_WSTR_LENGTH is set to wlen unless
     the object is compact ASCII.
   - size != NULL: *size receives PyUnicode_WSTR_LENGTH(unicode).

   The removed lines performed the per-kind conversion (including the
   surrogate counting and encoding for 2-byte wchar_t) inline; the added
   lines delegate that work to the two static helpers. */
Py_UNICODE *
37823863
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
37833864
{
3784-
const unsigned char *one_byte;
3785-
#if SIZEOF_WCHAR_T == 4
3786-
const Py_UCS2 *two_bytes;
3787-
#else
3788-
const Py_UCS4 *four_bytes;
3789-
const Py_UCS4 *ucs4_end;
3790-
Py_ssize_t num_surrogates;
3791-
#endif
3792-
wchar_t *w;
3793-
wchar_t *wchar_end;
3794-
37953865
if (!PyUnicode_Check(unicode)) {
37963866
PyErr_BadArgument();
37973867
return NULL;
37983868
}
3799-
if (_PyUnicode_WSTR(unicode) == NULL) {
3869+
Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3870+
if (w == NULL) {
38003871
/* Non-ASCII compact unicode object */
3801-
assert(_PyUnicode_KIND(unicode) != 0);
3872+
assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
38023873
assert(PyUnicode_IS_READY(unicode));
38033874

3804-
if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3805-
#if SIZEOF_WCHAR_T == 2
3806-
four_bytes = PyUnicode_4BYTE_DATA(unicode);
3807-
ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3808-
num_surrogates = 0;
3809-
3810-
for (; four_bytes < ucs4_end; ++four_bytes) {
3811-
if (*four_bytes > 0xFFFF)
3812-
++num_surrogates;
3813-
}
3814-
3815-
_PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3816-
sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3817-
if (!_PyUnicode_WSTR(unicode)) {
3818-
PyErr_NoMemory();
3819-
return NULL;
3820-
}
3821-
_PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
3822-
3823-
w = _PyUnicode_WSTR(unicode);
3824-
wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3825-
four_bytes = PyUnicode_4BYTE_DATA(unicode);
3826-
for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3827-
if (*four_bytes > 0xFFFF) {
3828-
assert(*four_bytes <= MAX_UNICODE);
3829-
/* encode surrogate pair in this case */
3830-
*w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3831-
*w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
3832-
}
3833-
else
3834-
*w = *four_bytes;
3835-
3836-
if (w > wchar_end) {
3837-
Py_UNREACHABLE();
3838-
}
3839-
}
3840-
*w = 0;
3841-
#else
3842-
/* sizeof(wchar_t) == 4 */
3843-
Py_FatalError("Impossible unicode object state, wstr and str "
3844-
"should share memory already.");
3875+
Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3876+
if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3877+
PyErr_NoMemory();
38453878
return NULL;
3846-
#endif
38473879
}
3848-
else {
3849-
if ((size_t)_PyUnicode_LENGTH(unicode) >
3850-
PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3851-
PyErr_NoMemory();
3852-
return NULL;
3853-
}
3854-
_PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3855-
(_PyUnicode_LENGTH(unicode) + 1));
3856-
if (!_PyUnicode_WSTR(unicode)) {
3857-
PyErr_NoMemory();
3858-
return NULL;
3859-
}
3860-
if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3861-
_PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3862-
w = _PyUnicode_WSTR(unicode);
3863-
wchar_end = w + _PyUnicode_LENGTH(unicode);
3864-
3865-
if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3866-
one_byte = PyUnicode_1BYTE_DATA(unicode);
3867-
for (; w < wchar_end; ++one_byte, ++w)
3868-
*w = *one_byte;
3869-
/* null-terminate the wstr */
3870-
*w = 0;
3871-
}
3872-
else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
3873-
#if SIZEOF_WCHAR_T == 4
3874-
two_bytes = PyUnicode_2BYTE_DATA(unicode);
3875-
for (; w < wchar_end; ++two_bytes, ++w)
3876-
*w = *two_bytes;
3877-
/* null-terminate the wstr */
3878-
*w = 0;
3879-
#else
3880-
/* sizeof(wchar_t) == 2 */
3881-
PyObject_FREE(_PyUnicode_WSTR(unicode));
3882-
_PyUnicode_WSTR(unicode) = NULL;
3883-
Py_FatalError("Impossible unicode object state, wstr "
3884-
"and str should share memory already.");
3885-
return NULL;
3886-
#endif
3887-
}
3888-
else {
3889-
Py_UNREACHABLE();
3890-
}
3880+
w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3881+
if (w == NULL) {
3882+
PyErr_NoMemory();
3883+
return NULL;
3884+
}
3885+
unicode_copy_as_widechar(unicode, w, wlen + 1);
3886+
_PyUnicode_WSTR(unicode) = w;
3887+
if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3888+
_PyUnicode_WSTR_LENGTH(unicode) = wlen;
38913889
}
38923890
}
38933891
if (size != NULL)
38943892
*size = PyUnicode_WSTR_LENGTH(unicode);
3895-
return _PyUnicode_WSTR(unicode);
3893+
return w;
38963894
}
38973895

38983896
Py_UNICODE *

0 commit comments

Image for: 0 commit comments
Comments
 (0)