Skip to content

Commit

Permalink
escape: Handle surrogate pairs
Browse files Browse the repository at this point in the history
@vit-zikmund's suggestion was very helpful in getting the handling of
surrogate pairs to work.

Signed-off-by: Hiroshi Hatake <[email protected]>
Co-authored-by: Vit Zikmund <[email protected]>
  • Loading branch information
cosmo0920 and vit-zikmund committed Jan 6, 2025
1 parent 09214eb commit 2d2fe7d
Showing 1 changed file with 51 additions and 3 deletions.
54 changes: 51 additions & 3 deletions src/flb_unescape.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,23 @@ static int u8_wc_toutf8(char *dest, uint32_t ch)
return 0;
}

/* Returns 1 when ch lies in the high (leading) surrogate range
 * 0xD800..0xDBFF, 0 otherwise.
 *
 * The range is 1024 code units wide and starts on a 1024 boundary, so
 * clearing the low 10 bits and comparing against the base is the same
 * as the two-sided range check. */
static int u8_high_surrogate(uint32_t ch)
{
    const uint32_t block = ch & ~(uint32_t) 0x3FF;

    return block == 0xD800;
}

/* Returns 1 when ch lies in the low (trailing) surrogate range
 * 0xDC00..0xDFFF, 0 otherwise.
 *
 * The range is 1024 code units wide and starts on a 1024 boundary, so
 * clearing the low 10 bits and comparing against the base is the same
 * as the two-sided range check. */
static int u8_low_surrogate(uint32_t ch)
{
    const uint32_t block = ch & ~(uint32_t) 0x3FF;

    return block == 0xDC00;
}

/* Merges a surrogate pair into a single code point.
 *
 * The offset of `high` from 0xD800 supplies the upper 10 bits and the
 * offset of `low` from 0xDC00 the lower 10 bits; the result is biased
 * by 0x10000.
 *
 * No validation is done here: the caller must pass `high` in
 * 0xD800..0xDBFF and `low` in 0xDC00..0xDFFF. */
static uint32_t u8_combine_surrogates(uint32_t high, uint32_t low)
{
    const uint32_t upper_bits = (high - 0xD800) << 10;
    const uint32_t lower_bits = low - 0xDC00;

    return 0x10000 + (upper_bits | lower_bits);
}

/* assumes that src points to the character after a backslash
returns number of input characters processed */
static int u8_read_escape_sequence(const char *str, int size, uint32_t *dest)
{
uint32_t ch;
uint32_t ch = 0;
char digs[9]="\0\0\0\0\0\0\0\0";
int dno=0, i=1;

Expand Down Expand Up @@ -107,8 +119,41 @@ static int u8_read_escape_sequence(const char *str, int size, uint32_t *dest)
while (i < size && hex_digit(str[i]) && dno < 4) {
digs[dno++] = str[i++];
}
if (dno > 0) {
ch = strtol(digs, NULL, 16);
if (dno != 4) {
/* Incomplete \u escape sequence */
goto invalid_sequence;
}
ch = strtol(digs, NULL, 16);
if (u8_low_surrogate(ch)) {
/* Invalid: low surrogate without preceding high surrogate */
goto invalid_sequence;
}
else if (u8_high_surrogate(ch)) {
/* Handle a surrogate pair.
* Note that i is already incremented with 4 here. */
if (i + 2 < size && str[i] == '\\' && str[i + 1] == 'u') {
dno = 0;
i += 2; /* Skip "\u" */
while (i < size && hex_digit(str[i]) && dno < 4) {
digs[dno++] = str[i++];
}
if (dno != 4) {
/* Incomplete low surrogate */
goto invalid_sequence;
}
uint32_t low = strtol(digs, NULL, 16);
if (u8_low_surrogate(low)) {
ch = u8_combine_surrogates(ch, low);
}
else {
/* Invalid: high surrogate not followed by low surrogate */
goto invalid_sequence;
}
}
else {
/* Invalid: high surrogate not followed by \u */
goto invalid_sequence;
}
}
}
else if (str[0] == 'U') {
Expand All @@ -122,6 +167,9 @@ static int u8_read_escape_sequence(const char *str, int size, uint32_t *dest)
*dest = ch;

return i;

invalid_sequence:
return -1;
}

int flb_unescape_string_utf8(const char *in_buf, int sz, char *out_buf)
Expand Down

0 comments on commit 2d2fe7d

Please sign in to comment.