@@ -2921,6 +2921,83 @@ PyUnicode_FromFormat(const char *format, ...)
2921
2921
return ret ;
2922
2922
}
2923
2923
2924
+ static Py_ssize_t
2925
+ unicode_get_widechar_size (PyObject * unicode )
2926
+ {
2927
+ Py_ssize_t res ;
2928
+
2929
+ assert (unicode != NULL );
2930
+ assert (_PyUnicode_CHECK (unicode ));
2931
+
2932
+ if (_PyUnicode_WSTR (unicode ) != NULL ) {
2933
+ return PyUnicode_WSTR_LENGTH (unicode );
2934
+ }
2935
+ assert (PyUnicode_IS_READY (unicode ));
2936
+
2937
+ res = _PyUnicode_LENGTH (unicode );
2938
+ #if SIZEOF_WCHAR_T == 2
2939
+ if (PyUnicode_KIND (unicode ) == PyUnicode_4BYTE_KIND ) {
2940
+ const Py_UCS4 * s = PyUnicode_4BYTE_DATA (unicode );
2941
+ const Py_UCS4 * end = s + res ;
2942
+ for (; s < end ; ++ s ) {
2943
+ if (* s > 0xFFFF ) {
2944
+ ++ res ;
2945
+ }
2946
+ }
2947
+ }
2948
+ #endif
2949
+ return res ;
2950
+ }
2951
+
2952
+ static void
2953
+ unicode_copy_as_widechar (PyObject * unicode , wchar_t * w , Py_ssize_t size )
2954
+ {
2955
+ const wchar_t * wstr ;
2956
+
2957
+ assert (unicode != NULL );
2958
+ assert (_PyUnicode_CHECK (unicode ));
2959
+
2960
+ wstr = _PyUnicode_WSTR (unicode );
2961
+ if (wstr != NULL ) {
2962
+ memcpy (w , wstr , size * sizeof (wchar_t ));
2963
+ return ;
2964
+ }
2965
+ assert (PyUnicode_IS_READY (unicode ));
2966
+
2967
+ if (PyUnicode_KIND (unicode ) == PyUnicode_1BYTE_KIND ) {
2968
+ const Py_UCS1 * s = PyUnicode_1BYTE_DATA (unicode );
2969
+ for (; size -- ; ++ s , ++ w ) {
2970
+ * w = * s ;
2971
+ }
2972
+ }
2973
+ else {
2974
+ #if SIZEOF_WCHAR_T == 4
2975
+ assert (PyUnicode_KIND (unicode ) == PyUnicode_2BYTE_KIND );
2976
+ const Py_UCS2 * s = PyUnicode_2BYTE_DATA (unicode );
2977
+ for (; size -- ; ++ s , ++ w ) {
2978
+ * w = * s ;
2979
+ }
2980
+ #else
2981
+ assert (PyUnicode_KIND (unicode ) == PyUnicode_4BYTE_KIND );
2982
+ const Py_UCS4 * s = PyUnicode_4BYTE_DATA (unicode );
2983
+ for (; size -- ; ++ s , ++ w ) {
2984
+ Py_UCS4 ch = * s ;
2985
+ if (ch > 0xFFFF ) {
2986
+ assert (ch <= MAX_UNICODE );
2987
+ /* encode surrogate pair in this case */
2988
+ * w ++ = Py_UNICODE_HIGH_SURROGATE (ch );
2989
+ if (!size -- )
2990
+ break ;
2991
+ * w = Py_UNICODE_LOW_SURROGATE (ch );
2992
+ }
2993
+ else {
2994
+ * w = ch ;
2995
+ }
2996
+ }
2997
+ #endif
2998
+ }
2999
+ }
3000
+
2924
3001
#ifdef HAVE_WCHAR_H
2925
3002
2926
3003
/* Convert a Unicode object to a wide character string.
@@ -2937,59 +3014,63 @@ PyUnicode_AsWideChar(PyObject *unicode,
2937
3014
Py_ssize_t size )
2938
3015
{
2939
3016
Py_ssize_t res ;
2940
- const wchar_t * wstr ;
2941
3017
2942
3018
if (unicode == NULL ) {
2943
3019
PyErr_BadInternalCall ();
2944
3020
return -1 ;
2945
3021
}
2946
- wstr = PyUnicode_AsUnicodeAndSize ( unicode , & res );
2947
- if ( wstr == NULL )
3022
+ if (! PyUnicode_Check ( unicode )) {
3023
+ PyErr_BadArgument ();
2948
3024
return -1 ;
2949
-
2950
- if (w != NULL ) {
2951
- if (size > res )
2952
- size = res + 1 ;
2953
- else
2954
- res = size ;
2955
- memcpy (w , wstr , size * sizeof (wchar_t ));
2956
- return res ;
2957
3025
}
2958
- else
3026
+
3027
+ res = unicode_get_widechar_size (unicode );
3028
+ if (w == NULL ) {
2959
3029
return res + 1 ;
3030
+ }
3031
+
3032
+ if (size > res ) {
3033
+ size = res + 1 ;
3034
+ }
3035
+ else {
3036
+ res = size ;
3037
+ }
3038
+ unicode_copy_as_widechar (unicode , w , size );
3039
+ return res ;
2960
3040
}
2961
3041
2962
3042
wchar_t *
2963
3043
PyUnicode_AsWideCharString (PyObject * unicode ,
2964
3044
Py_ssize_t * size )
2965
3045
{
2966
- const wchar_t * wstr ;
2967
3046
wchar_t * buffer ;
2968
3047
Py_ssize_t buflen ;
2969
3048
2970
3049
if (unicode == NULL ) {
2971
3050
PyErr_BadInternalCall ();
2972
3051
return NULL ;
2973
3052
}
2974
-
2975
- wstr = PyUnicode_AsUnicodeAndSize (unicode , & buflen );
2976
- if (wstr == NULL ) {
2977
- return NULL ;
2978
- }
2979
- if (size == NULL && wcslen (wstr ) != (size_t )buflen ) {
2980
- PyErr_SetString (PyExc_ValueError ,
2981
- "embedded null character" );
3053
+ if (!PyUnicode_Check (unicode )) {
3054
+ PyErr_BadArgument ();
2982
3055
return NULL ;
2983
3056
}
2984
3057
2985
- buffer = PyMem_NEW (wchar_t , buflen + 1 );
3058
+ buflen = unicode_get_widechar_size (unicode );
3059
+ buffer = (wchar_t * ) PyMem_NEW (wchar_t , (buflen + 1 ));
2986
3060
if (buffer == NULL ) {
2987
3061
PyErr_NoMemory ();
2988
3062
return NULL ;
2989
3063
}
2990
- memcpy ( buffer , wstr , ( buflen + 1 ) * sizeof ( wchar_t ) );
2991
- if (size != NULL )
3064
+ unicode_copy_as_widechar ( unicode , buffer , buflen + 1 );
3065
+ if (size != NULL ) {
2992
3066
* size = buflen ;
3067
+ }
3068
+ else if (wcslen (buffer ) != (size_t )buflen ) {
3069
+ PyMem_FREE (buffer );
3070
+ PyErr_SetString (PyExc_ValueError ,
3071
+ "embedded null character" );
3072
+ return NULL ;
3073
+ }
2993
3074
return buffer ;
2994
3075
}
2995
3076
@@ -3781,118 +3862,35 @@ PyUnicode_AsUTF8(PyObject *unicode)
3781
3862
Py_UNICODE *
3782
3863
PyUnicode_AsUnicodeAndSize (PyObject * unicode , Py_ssize_t * size )
3783
3864
{
3784
- const unsigned char * one_byte ;
3785
- #if SIZEOF_WCHAR_T == 4
3786
- const Py_UCS2 * two_bytes ;
3787
- #else
3788
- const Py_UCS4 * four_bytes ;
3789
- const Py_UCS4 * ucs4_end ;
3790
- Py_ssize_t num_surrogates ;
3791
- #endif
3792
- wchar_t * w ;
3793
- wchar_t * wchar_end ;
3794
-
3795
3865
if (!PyUnicode_Check (unicode )) {
3796
3866
PyErr_BadArgument ();
3797
3867
return NULL ;
3798
3868
}
3799
- if (_PyUnicode_WSTR (unicode ) == NULL ) {
3869
+ Py_UNICODE * w = _PyUnicode_WSTR (unicode );
3870
+ if (w == NULL ) {
3800
3871
/* Non-ASCII compact unicode object */
3801
- assert (_PyUnicode_KIND (unicode ) != 0 );
3872
+ assert (_PyUnicode_KIND (unicode ) != PyUnicode_WCHAR_KIND );
3802
3873
assert (PyUnicode_IS_READY (unicode ));
3803
3874
3804
- if (PyUnicode_KIND (unicode ) == PyUnicode_4BYTE_KIND ) {
3805
- #if SIZEOF_WCHAR_T == 2
3806
- four_bytes = PyUnicode_4BYTE_DATA (unicode );
3807
- ucs4_end = four_bytes + _PyUnicode_LENGTH (unicode );
3808
- num_surrogates = 0 ;
3809
-
3810
- for (; four_bytes < ucs4_end ; ++ four_bytes ) {
3811
- if (* four_bytes > 0xFFFF )
3812
- ++ num_surrogates ;
3813
- }
3814
-
3815
- _PyUnicode_WSTR (unicode ) = (wchar_t * ) PyObject_MALLOC (
3816
- sizeof (wchar_t ) * (_PyUnicode_LENGTH (unicode ) + 1 + num_surrogates ));
3817
- if (!_PyUnicode_WSTR (unicode )) {
3818
- PyErr_NoMemory ();
3819
- return NULL ;
3820
- }
3821
- _PyUnicode_WSTR_LENGTH (unicode ) = _PyUnicode_LENGTH (unicode ) + num_surrogates ;
3822
-
3823
- w = _PyUnicode_WSTR (unicode );
3824
- wchar_end = w + _PyUnicode_WSTR_LENGTH (unicode );
3825
- four_bytes = PyUnicode_4BYTE_DATA (unicode );
3826
- for (; four_bytes < ucs4_end ; ++ four_bytes , ++ w ) {
3827
- if (* four_bytes > 0xFFFF ) {
3828
- assert (* four_bytes <= MAX_UNICODE );
3829
- /* encode surrogate pair in this case */
3830
- * w ++ = Py_UNICODE_HIGH_SURROGATE (* four_bytes );
3831
- * w = Py_UNICODE_LOW_SURROGATE (* four_bytes );
3832
- }
3833
- else
3834
- * w = * four_bytes ;
3835
-
3836
- if (w > wchar_end ) {
3837
- Py_UNREACHABLE ();
3838
- }
3839
- }
3840
- * w = 0 ;
3841
- #else
3842
- /* sizeof(wchar_t) == 4 */
3843
- Py_FatalError ("Impossible unicode object state, wstr and str "
3844
- "should share memory already." );
3875
+ Py_ssize_t wlen = unicode_get_widechar_size (unicode );
3876
+ if ((size_t )wlen > PY_SSIZE_T_MAX / sizeof (wchar_t ) - 1 ) {
3877
+ PyErr_NoMemory ();
3845
3878
return NULL ;
3846
- #endif
3847
3879
}
3848
- else {
3849
- if ((size_t )_PyUnicode_LENGTH (unicode ) >
3850
- PY_SSIZE_T_MAX / sizeof (wchar_t ) - 1 ) {
3851
- PyErr_NoMemory ();
3852
- return NULL ;
3853
- }
3854
- _PyUnicode_WSTR (unicode ) = (wchar_t * ) PyObject_MALLOC (sizeof (wchar_t ) *
3855
- (_PyUnicode_LENGTH (unicode ) + 1 ));
3856
- if (!_PyUnicode_WSTR (unicode )) {
3857
- PyErr_NoMemory ();
3858
- return NULL ;
3859
- }
3860
- if (!PyUnicode_IS_COMPACT_ASCII (unicode ))
3861
- _PyUnicode_WSTR_LENGTH (unicode ) = _PyUnicode_LENGTH (unicode );
3862
- w = _PyUnicode_WSTR (unicode );
3863
- wchar_end = w + _PyUnicode_LENGTH (unicode );
3864
-
3865
- if (PyUnicode_KIND (unicode ) == PyUnicode_1BYTE_KIND ) {
3866
- one_byte = PyUnicode_1BYTE_DATA (unicode );
3867
- for (; w < wchar_end ; ++ one_byte , ++ w )
3868
- * w = * one_byte ;
3869
- /* null-terminate the wstr */
3870
- * w = 0 ;
3871
- }
3872
- else if (PyUnicode_KIND (unicode ) == PyUnicode_2BYTE_KIND ) {
3873
- #if SIZEOF_WCHAR_T == 4
3874
- two_bytes = PyUnicode_2BYTE_DATA (unicode );
3875
- for (; w < wchar_end ; ++ two_bytes , ++ w )
3876
- * w = * two_bytes ;
3877
- /* null-terminate the wstr */
3878
- * w = 0 ;
3879
- #else
3880
- /* sizeof(wchar_t) == 2 */
3881
- PyObject_FREE (_PyUnicode_WSTR (unicode ));
3882
- _PyUnicode_WSTR (unicode ) = NULL ;
3883
- Py_FatalError ("Impossible unicode object state, wstr "
3884
- "and str should share memory already." );
3885
- return NULL ;
3886
- #endif
3887
- }
3888
- else {
3889
- Py_UNREACHABLE ();
3890
- }
3880
+ w = (wchar_t * ) PyObject_MALLOC (sizeof (wchar_t ) * (wlen + 1 ));
3881
+ if (w == NULL ) {
3882
+ PyErr_NoMemory ();
3883
+ return NULL ;
3884
+ }
3885
+ unicode_copy_as_widechar (unicode , w , wlen + 1 );
3886
+ _PyUnicode_WSTR (unicode ) = w ;
3887
+ if (!PyUnicode_IS_COMPACT_ASCII (unicode )) {
3888
+ _PyUnicode_WSTR_LENGTH (unicode ) = wlen ;
3891
3889
}
3892
3890
}
3893
3891
if (size != NULL )
3894
3892
* size = PyUnicode_WSTR_LENGTH (unicode );
3895
- return _PyUnicode_WSTR ( unicode ) ;
3893
+ return w ;
3896
3894
}
3897
3895
3898
3896
Py_UNICODE *
0 commit comments