From 02a4d57263a9846de35b0db12763ff9e7326f62c Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Thu, 27 Feb 2020 13:48:59 +0900 Subject: [PATCH] bpo-39087: Optimize PyUnicode_AsUTF8AndSize() (GH-18327) Avoid using temporary bytes object. --- .../2020-02-03-21-12-39.bpo-39087.YnbUpL.rst | 2 + Objects/stringlib/codecs.h | 35 ++++--- Objects/unicodeobject.c | 98 ++++++++++++++----- 3 files changed, 92 insertions(+), 43 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-02-03-21-12-39.bpo-39087.YnbUpL.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-02-03-21-12-39.bpo-39087.YnbUpL.rst b/Misc/NEWS.d/next/Core and Builtins/2020-02-03-21-12-39.bpo-39087.YnbUpL.rst new file mode 100644 index 00000000000000..847f78f5b182ea --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-02-03-21-12-39.bpo-39087.YnbUpL.rst @@ -0,0 +1,2 @@ +Optimize :c:func:`PyUnicode_AsUTF8` and :c:func:`PyUnicode_AsUTF8AndSize` +slightly when they need to create internal UTF-8 cache. diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index 269a5581f70055..eb42e071751d76 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -256,8 +256,9 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end, /* UTF-8 encoder specialized for a Unicode kind to avoid the slow PyUnicode_READ() macro. Delete some parts of the code depending on the kind: UCS-1 strings don't need to handle surrogates for example. */ -Py_LOCAL_INLINE(PyObject *) -STRINGLIB(utf8_encoder)(PyObject *unicode, +Py_LOCAL_INLINE(char *) +STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, + PyObject *unicode, STRINGLIB_CHAR *data, Py_ssize_t size, _Py_error_handler error_handler, @@ -277,17 +278,16 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, #else /* STRINGLIB_SIZEOF_CHAR == 4 */ const Py_ssize_t max_char_size = 4; #endif - _PyBytesWriter writer; assert(size >= 0); - _PyBytesWriter_Init(&writer); - if (size > PY_SSIZE_T_MAX / max_char_size) { /* integer overflow */ - return PyErr_NoMemory(); + PyErr_NoMemory(); + return NULL; } - p = _PyBytesWriter_Alloc(&writer, size * max_char_size); + _PyBytesWriter_Init(writer); + p = _PyBytesWriter_Alloc(writer, size * max_char_size); if (p == NULL) return NULL; @@ -323,7 +323,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, endpos++; /* Only overallocate the buffer if it's not the last write */ - writer.overallocate = (endpos < size); + writer->overallocate = (endpos < size); switch (error_handler) { @@ -347,8 +347,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, case _Py_ERROR_BACKSLASHREPLACE: /* subtract preallocated bytes */ - writer.min_size -= max_char_size * (endpos - startpos); - p = backslashreplace(&writer, p, + writer->min_size -= max_char_size * (endpos - startpos); + p = backslashreplace(writer, p, unicode, startpos, endpos); if (p == NULL) goto error; @@ -357,8 +357,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, case _Py_ERROR_XMLCHARREFREPLACE: /* subtract preallocated bytes */ - writer.min_size -= max_char_size * (endpos - startpos); - p = xmlcharrefreplace(&writer, p, + writer->min_size -= max_char_size * (endpos - startpos); + p = xmlcharrefreplace(writer, p, unicode, startpos, endpos); if (p == NULL) goto error; @@ -387,10 +387,10 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, goto error; /* subtract preallocated bytes */ - writer.min_size -= max_char_size * (newpos - startpos); + writer->min_size -= max_char_size * (newpos - startpos); if (PyBytes_Check(rep)) { - p = _PyBytesWriter_WriteBytes(&writer, p, + p = _PyBytesWriter_WriteBytes(writer, p, PyBytes_AS_STRING(rep), PyBytes_GET_SIZE(rep)); } @@ -406,7 +406,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, goto error; } - p = _PyBytesWriter_WriteBytes(&writer, p, + p = _PyBytesWriter_WriteBytes(writer, p, PyUnicode_DATA(rep), PyUnicode_GET_LENGTH(rep)); } @@ -420,7 +420,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, /* If overallocation was disabled, ensure that it was the last write. Otherwise, we missed an optimization */ - assert(writer.overallocate || i == size); + assert(writer->overallocate || i == size); } else #if STRINGLIB_SIZEOF_CHAR > 2 @@ -449,14 +449,13 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, Py_XDECREF(error_handler_obj); Py_XDECREF(exc); #endif - return _PyBytesWriter_Finish(&writer, p); + return p; #if STRINGLIB_SIZEOF_CHAR > 1 error: Py_XDECREF(rep); Py_XDECREF(error_handler_obj); Py_XDECREF(exc); - _PyBytesWriter_Dealloc(&writer); return NULL; #endif } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index ee6d3dfd3945bd..e0a666f70da366 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3991,11 +3991,11 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr) } +static int unicode_fill_utf8(PyObject *unicode); + const char * PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) { - PyObject *bytes; - if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); return NULL; @@ -4004,21 +4004,9 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) return NULL; if (PyUnicode_UTF8(unicode) == NULL) { - assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); - bytes = _PyUnicode_AsUTF8String(unicode, NULL); - if (bytes == NULL) - return NULL; - _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); - if (_PyUnicode_UTF8(unicode) == NULL) { - PyErr_NoMemory(); - Py_DECREF(bytes); + if (unicode_fill_utf8(unicode) == -1) { return NULL; } - _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); - memcpy(_PyUnicode_UTF8(unicode), - PyBytes_AS_STRING(bytes), - _PyUnicode_UTF8_LENGTH(unicode) + 1); - Py_DECREF(bytes); } if (psize) @@ -5381,10 +5369,6 @@ static PyObject * unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, const char *errors) { - enum PyUnicode_Kind kind; - void *data; - Py_ssize_t size; - if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); return NULL; @@ -5397,9 +5381,12 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), PyUnicode_UTF8_LENGTH(unicode)); - kind = PyUnicode_KIND(unicode); - data = PyUnicode_DATA(unicode); - size = PyUnicode_GET_LENGTH(unicode); + enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); + void *data = PyUnicode_DATA(unicode); + Py_ssize_t size = PyUnicode_GET_LENGTH(unicode); + + _PyBytesWriter writer; + char *end; switch (kind) { default: @@ -5407,12 +5394,73 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, case PyUnicode_1BYTE_KIND: /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ assert(!PyUnicode_IS_ASCII(unicode)); - return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors); + end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors); + break; + case PyUnicode_2BYTE_KIND: + end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors); + break; + case PyUnicode_4BYTE_KIND: + end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors); + break; + } + + if (end == NULL) { + _PyBytesWriter_Dealloc(&writer); + return NULL; + } + return _PyBytesWriter_Finish(&writer, end); +} + +static int +unicode_fill_utf8(PyObject *unicode) +{ + /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ + assert(!PyUnicode_IS_ASCII(unicode)); + + enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); + void *data = PyUnicode_DATA(unicode); + Py_ssize_t size = PyUnicode_GET_LENGTH(unicode); + + _PyBytesWriter writer; + char *end; + + switch (kind) { + default: + Py_UNREACHABLE(); + case PyUnicode_1BYTE_KIND: + end = ucs1lib_utf8_encoder(&writer, unicode, data, size, + _Py_ERROR_STRICT, NULL); + break; case PyUnicode_2BYTE_KIND: - return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors); + end = ucs2lib_utf8_encoder(&writer, unicode, data, size, + _Py_ERROR_STRICT, NULL); + break; case PyUnicode_4BYTE_KIND: - return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors); + end = ucs4lib_utf8_encoder(&writer, unicode, data, size, + _Py_ERROR_STRICT, NULL); + break; + } + if (end == NULL) { + _PyBytesWriter_Dealloc(&writer); + return -1; + } + + char *start = writer.use_small_buffer ? writer.small_buffer : + PyBytes_AS_STRING(writer.buffer); + Py_ssize_t len = end - start; + + char *cache = PyObject_MALLOC(len + 1); + if (cache == NULL) { + _PyBytesWriter_Dealloc(&writer); + PyErr_NoMemory(); + return -1; } + _PyUnicode_UTF8(unicode) = cache; + _PyUnicode_UTF8_LENGTH(unicode) = len; + memcpy(cache, start, len); + cache[len] = '\0'; + _PyBytesWriter_Dealloc(&writer); + return 0; } PyObject *