Skip to content

Commit

Permalink
bpo-39087: Optimize PyUnicode_AsUTF8AndSize() (pythonGH-18327)
Browse files Browse the repository at this point in the history
Avoid using temporary bytes object.
  • Loading branch information
methane authored Feb 27, 2020
1 parent 0c6e3aa commit 02a4d57
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 43 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Optimize :c:func:`PyUnicode_AsUTF8` and :c:func:`PyUnicode_AsUTF8AndSize`
slightly when they need to create internal UTF-8 cache.
35 changes: 17 additions & 18 deletions Objects/stringlib/codecs.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,9 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
UCS-1 strings don't need to handle surrogates for example. */
Py_LOCAL_INLINE(PyObject *)
STRINGLIB(utf8_encoder)(PyObject *unicode,
Py_LOCAL_INLINE(char *)
STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
PyObject *unicode,
STRINGLIB_CHAR *data,
Py_ssize_t size,
_Py_error_handler error_handler,
Expand All @@ -277,17 +278,16 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
#else /* STRINGLIB_SIZEOF_CHAR == 4 */
const Py_ssize_t max_char_size = 4;
#endif
_PyBytesWriter writer;

assert(size >= 0);
_PyBytesWriter_Init(&writer);

if (size > PY_SSIZE_T_MAX / max_char_size) {
/* integer overflow */
return PyErr_NoMemory();
PyErr_NoMemory();
return NULL;
}

p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
_PyBytesWriter_Init(writer);
p = _PyBytesWriter_Alloc(writer, size * max_char_size);
if (p == NULL)
return NULL;

Expand Down Expand Up @@ -323,7 +323,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
endpos++;

/* Only overallocate the buffer if it's not the last write */
writer.overallocate = (endpos < size);
writer->overallocate = (endpos < size);

switch (error_handler)
{
Expand All @@ -347,8 +347,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,

case _Py_ERROR_BACKSLASHREPLACE:
/* subtract preallocated bytes */
writer.min_size -= max_char_size * (endpos - startpos);
p = backslashreplace(&writer, p,
writer->min_size -= max_char_size * (endpos - startpos);
p = backslashreplace(writer, p,
unicode, startpos, endpos);
if (p == NULL)
goto error;
Expand All @@ -357,8 +357,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,

case _Py_ERROR_XMLCHARREFREPLACE:
/* subtract preallocated bytes */
writer.min_size -= max_char_size * (endpos - startpos);
p = xmlcharrefreplace(&writer, p,
writer->min_size -= max_char_size * (endpos - startpos);
p = xmlcharrefreplace(writer, p,
unicode, startpos, endpos);
if (p == NULL)
goto error;
Expand Down Expand Up @@ -387,10 +387,10 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
goto error;

/* subtract preallocated bytes */
writer.min_size -= max_char_size * (newpos - startpos);
writer->min_size -= max_char_size * (newpos - startpos);

if (PyBytes_Check(rep)) {
p = _PyBytesWriter_WriteBytes(&writer, p,
p = _PyBytesWriter_WriteBytes(writer, p,
PyBytes_AS_STRING(rep),
PyBytes_GET_SIZE(rep));
}
Expand All @@ -406,7 +406,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
goto error;
}

p = _PyBytesWriter_WriteBytes(&writer, p,
p = _PyBytesWriter_WriteBytes(writer, p,
PyUnicode_DATA(rep),
PyUnicode_GET_LENGTH(rep));
}
Expand All @@ -420,7 +420,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,

/* If overallocation was disabled, ensure that it was the last
write. Otherwise, we missed an optimization */
assert(writer.overallocate || i == size);
assert(writer->overallocate || i == size);
}
else
#if STRINGLIB_SIZEOF_CHAR > 2
Expand Down Expand Up @@ -449,14 +449,13 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
#endif
return _PyBytesWriter_Finish(&writer, p);
return p;

#if STRINGLIB_SIZEOF_CHAR > 1
error:
Py_XDECREF(rep);
Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
_PyBytesWriter_Dealloc(&writer);
return NULL;
#endif
}
Expand Down
98 changes: 73 additions & 25 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -3991,11 +3991,11 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
}


static int unicode_fill_utf8(PyObject *unicode);

const char *
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
{
PyObject *bytes;

if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
return NULL;
Expand All @@ -4004,21 +4004,9 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
return NULL;

if (PyUnicode_UTF8(unicode) == NULL) {
assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
bytes = _PyUnicode_AsUTF8String(unicode, NULL);
if (bytes == NULL)
return NULL;
_PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
if (_PyUnicode_UTF8(unicode) == NULL) {
PyErr_NoMemory();
Py_DECREF(bytes);
if (unicode_fill_utf8(unicode) == -1) {
return NULL;
}
_PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
memcpy(_PyUnicode_UTF8(unicode),
PyBytes_AS_STRING(bytes),
_PyUnicode_UTF8_LENGTH(unicode) + 1);
Py_DECREF(bytes);
}

if (psize)
Expand Down Expand Up @@ -5381,10 +5369,6 @@ static PyObject *
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
const char *errors)
{
enum PyUnicode_Kind kind;
void *data;
Py_ssize_t size;

if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
return NULL;
Expand All @@ -5397,22 +5381,86 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
PyUnicode_UTF8_LENGTH(unicode));

kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
size = PyUnicode_GET_LENGTH(unicode);
enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
void *data = PyUnicode_DATA(unicode);
Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);

_PyBytesWriter writer;
char *end;

switch (kind) {
default:
Py_UNREACHABLE();
case PyUnicode_1BYTE_KIND:
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
assert(!PyUnicode_IS_ASCII(unicode));
return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
break;
case PyUnicode_2BYTE_KIND:
end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
break;
case PyUnicode_4BYTE_KIND:
end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
break;
}

if (end == NULL) {
_PyBytesWriter_Dealloc(&writer);
return NULL;
}
return _PyBytesWriter_Finish(&writer, end);
}

static int
unicode_fill_utf8(PyObject *unicode)
{
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
assert(!PyUnicode_IS_ASCII(unicode));

enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
void *data = PyUnicode_DATA(unicode);
Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);

_PyBytesWriter writer;
char *end;

switch (kind) {
default:
Py_UNREACHABLE();
case PyUnicode_1BYTE_KIND:
end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
_Py_ERROR_STRICT, NULL);
break;
case PyUnicode_2BYTE_KIND:
return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
_Py_ERROR_STRICT, NULL);
break;
case PyUnicode_4BYTE_KIND:
return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
_Py_ERROR_STRICT, NULL);
break;
}
if (end == NULL) {
_PyBytesWriter_Dealloc(&writer);
return -1;
}

char *start = writer.use_small_buffer ? writer.small_buffer :
PyBytes_AS_STRING(writer.buffer);
Py_ssize_t len = end - start;

char *cache = PyObject_MALLOC(len + 1);
if (cache == NULL) {
_PyBytesWriter_Dealloc(&writer);
PyErr_NoMemory();
return -1;
}
_PyUnicode_UTF8(unicode) = cache;
_PyUnicode_UTF8_LENGTH(unicode) = len;
memcpy(cache, start, len);
cache[len] = '\0';
_PyBytesWriter_Dealloc(&writer);
return 0;
}

PyObject *
Expand Down

0 comments on commit 02a4d57

Please sign in to comment.