Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add StringUtil::toUTF8() #13

Merged
merged 4 commits into from
Feb 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .github/workflows/validate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,9 @@ jobs:
- "8.1"
- "8.2"
- "8.3"

dependencies:
- lowest
- highest

illuminate:
- ^8.73
- ^9
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ See [GitHub releases](https://github.com/mll-lab/php-utils/releases).

## Unreleased

### Added

- Add `StringUtil::toUTF8()`

## v1.11.0

### Added
Expand Down
6 changes: 3 additions & 3 deletions src/QxManager/FilledRow.php
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ public function __construct(
string $targetName,
string $signalCh1,
string $signalCh2,
int $referenceCopies = null,
string $wellNotes = null,
string $rdqConversionFactor = null
?int $referenceCopies = null,
?string $wellNotes = null,
?string $rdqConversionFactor = null
) {
$this->targetName = $targetName;
$this->signalCh1 = $signalCh1;
Expand Down
78 changes: 78 additions & 0 deletions src/StringUtil.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,21 @@

final class StringUtil
{
/**
 * Byte order mark (BOM) that may prefix UTF-8 encoded text: the three bytes EF BB BF.
 *
 * @see https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8
 */
public const UTF_8_BOM = "\xEF\xBB\xBF";

/**
 * Byte order mark that prefixes big-endian UTF-16 text: the two bytes FE FF.
 *
 * @see https://en.wikipedia.org/wiki/Byte_order_mark#UTF-16
 */
public const UTF_16_BIG_ENDIAN_BOM = "\xFE\xFF";

/**
 * Byte order mark that prefixes little-endian UTF-16 text: the two bytes FF FE.
 *
 * Note that these two bytes are also the first two bytes of UTF_32_LITTLE_ENDIAN_BOM.
 *
 * @see https://en.wikipedia.org/wiki/Byte_order_mark#UTF-16
 */
public const UTF_16_LITTLE_ENDIAN_BOM = "\xFF\xFE";

/**
 * Byte order mark that prefixes big-endian UTF-32 text: the four bytes 00 00 FE FF.
 *
 * @see https://en.wikipedia.org/wiki/Byte_order_mark#UTF-32
 */
public const UTF_32_BIG_ENDIAN_BOM = "\x00\x00\xFE\xFF";

/**
 * Byte order mark that prefixes little-endian UTF-32 text: the four bytes FF FE 00 00.
 *
 * @see https://en.wikipedia.org/wiki/Byte_order_mark#UTF-32
 */
public const UTF_32_LITTLE_ENDIAN_BOM = "\xFF\xFE\x00\x00";


/** @param iterable<string|null> $parts */
public static function joinNonEmpty(string $glue, iterable $parts): string
{
Expand Down Expand Up @@ -82,6 +97,69 @@
return \Safe\preg_replace("/\r\n|\r|\n/", $to, $input);
}

/**
 * Convert a string of unknown encoding (UTF-8, UTF-16BE, ...) to UTF-8.
 *
 * The source encoding is taken from a strict mb_detect_encoding() call; when that
 * fails, self::guessEncoding() is consulted instead.
 *
 * @throws \ErrorException when mb_convert_encoding() does not return a string
 */
public static function toUTF8(string $string): string
{
    $sourceEncoding = mb_detect_encoding($string, null, true);
    if ($sourceEncoding === false) {
        $sourceEncoding = self::guessEncoding($string);
    }

    // Reset the error state so that error_get_last() below can only report the conversion itself.
    error_clear_last();
    // @phpstan-ignore-next-line \Safe\mb_convert_encoding is not available in older PHP versions
    $result = mb_convert_encoding($string, 'UTF-8', $sourceEncoding);
    // @phpstan-ignore-next-line mb_convert_encoding can return false in older PHP versions
    if (is_string($result)) {
        return $result;
    }

    $lastError = error_get_last();
    $notString = gettype($result);
    $message = $lastError['message'] ?? "Expected mb_convert_encoding to return string, got {$notString}.";
    $severity = $lastError['type'] ?? 1;

    throw new \ErrorException($message, 0, $severity);
}

/**
 * Guess the encoding of a text from its byte order mark or, failing that, from its byte ranges.
 *
 * Byte order marks are checked first. Without one, the text is assumed to be
 * Windows-1252 or ISO-8859-1, unless it contains bytes that make that guess unreliable.
 *
 * @throws \Exception when no byte order mark is present and the text contains bytes
 *                    that are control characters or invalid for ISO-8859-1 / CP-1252
 *
 * @return string an encoding name: UTF-8, UTF-32BE, UTF-32LE, UTF-16BE, UTF-16LE, Windows-1252 or ISO-8859-1
 */
private static function guessEncoding(string $text): string
{
    // @see https://www.php.net/manual/en/function.mb-detect-encoding.php#91051
    $first3 = substr($text, 0, 3);
    if ($first3 === self::UTF_8_BOM) {
        return 'UTF-8';
    }

    // The UTF-32 marks are 4 bytes long, so 4 bytes have to be read to ever match them.
    // They must also be checked before UTF-16: the UTF-32LE mark (FF FE 00 00)
    // starts with the UTF-16LE mark (FF FE).
    $first4 = substr($text, 0, 4);
    if ($first4 === self::UTF_32_BIG_ENDIAN_BOM) {
        return 'UTF-32BE';
    }
    if ($first4 === self::UTF_32_LITTLE_ENDIAN_BOM) {
        return 'UTF-32LE';
    }

    $first2 = substr($text, 0, 2);
    if ($first2 === self::UTF_16_BIG_ENDIAN_BOM) {
        return 'UTF-16BE';
    }
    if ($first2 === self::UTF_16_LITTLE_ENDIAN_BOM) {
        return 'UTF-16LE';
    }

    // https://kence.org/2019/11/27/detecting-windows-1252-encoding
    // If the string contains characters in ranges that are either control characters
    // or invalid for ISO-8859-1 or CP-1252, we are unable to reliably guess.
    if (\Safe\preg_match('/[\x00-\x08\x0E-\x1F\x81\x8D\x8F\x90\x9D]/', $text) !== 0) {
        throw new \Exception("Can not determine UTF encoding of text: {$text}");
    }

    // If we get here, we're going to assume it's either Windows-1252 or ISO-8859-1.
    // If the string contains characters in the ISO-8859-1 reserved range, that's probably Windows-1252.
    if (\Safe\preg_match('/[\x80-\x9F]/', $text) !== 0) {
        return 'Windows-1252';
    }

    // Give up and return ISO-8859-1.
    return 'ISO-8859-1';
}

/**
* Pad a number with leading zeros.
*
Expand Down
41 changes: 41 additions & 0 deletions tests/StringUtilTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,47 @@ public function testNormalizeLineEndings(): void
);
}

/** Text that is already UTF-8 must come out of toUTF8() unchanged. */
public function testUTF8(): void
{
    $fixtureContents = \Safe\file_get_contents(__DIR__ . '/StringUtilTestData/UTF-8.csv');
    $expected = 'test';

    // Guard: the fixture itself already holds the expected bytes.
    self::assertSame($expected, $fixtureContents);

    $converted = StringUtil::toUTF8($fixtureContents);
    self::assertSame($expected, $converted);
}

/** UTF-16LE input must be converted to UTF-8, with its leading U+FEFF carried over. */
public function testUTF16LE(): void
{
    // The zero width no-break space (ZWNBSP) is a deprecated use of the Unicode character at code point U+FEFF.
    // Character U+FEFF is intended for use as a Byte Order Mark (BOM) at the start of a file
    // -> https://unicode-explorer.com/c/FEFF
    // It is written as an explicit escape here because the raw character is invisible
    // and is easily lost or overlooked when the file is edited or copied.
    $expectedUTF8 = "\u{FEFF}test";

    $string = \Safe\file_get_contents(__DIR__ . '/StringUtilTestData/UTF-16LE.csv');
    self::assertNotSame($expectedUTF8, $string);
    self::assertSame($expectedUTF8, StringUtil::toUTF8($string));
}

/** A Windows-1252 encoded CSV export must convert to the equivalent UTF-8 text. */
public function testWindows1252(): void
{
    $expected = <<<CSV
FileName,WellId,Sample Description,From [bp],To [bp],Average Size [bp],Conc. [ng/µl],Region Molarity [nmol/l],% of Total,Region Comment
2023-05-16 - 13.01.27.D1000,A12,RNA_191_23-049780_A1,170,550,312,23.7,121,95.50,IDT
2023-05-16 - 13.01.27.D1000,B12,RNA_191_23-049782_B1,170,550,308,16.1,82.5,92.27,IDT
2023-05-16 - 13.01.27.D1000,C12,RNA_191_23-049776_C1,170,550,310,16.7,85.3,93.76,IDT
2023-05-16 - 13.01.27.D1000,D12,RNA_191_23-049778_D1,170,550,307,11.4,58.6,91.65,IDT
2023-05-16 - 13.01.27.D1000,E12,RNA_191_NTC_E1,170,550,304,9.63,50.0,90.88,IDT

CSV;

    $rawContents = \Safe\file_get_contents(__DIR__ . '/StringUtilTestData/windows-1252.csv');

    // Guard: the raw fixture bytes must differ from the UTF-8 expectation.
    self::assertNotSame($expected, $rawContents);

    $converted = StringUtil::toUTF8($rawContents);

    // Line endings are normalized on both sides so only the character encoding is compared.
    $normalizedExpected = StringUtil::normalizeLineEndings($expected);
    $normalizedConverted = StringUtil::normalizeLineEndings($converted);
    self::assertSame($normalizedExpected, $normalizedConverted);
}

public function testLeftPadNumber(): void
{
self::assertSame(
Expand Down
Binary file added tests/StringUtilTestData/UTF-16LE.csv
Binary file not shown.
1 change: 1 addition & 0 deletions tests/StringUtilTestData/UTF-8.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
test
6 changes: 6 additions & 0 deletions tests/StringUtilTestData/windows-1252.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
FileName,WellId,Sample Description,From [bp],To [bp],Average Size [bp],Conc. [ng/µl],Region Molarity [nmol/l],% of Total,Region Comment
2023-05-16 - 13.01.27.D1000,A12,RNA_191_23-049780_A1,170,550,312,23.7,121,95.50,IDT
2023-05-16 - 13.01.27.D1000,B12,RNA_191_23-049782_B1,170,550,308,16.1,82.5,92.27,IDT
2023-05-16 - 13.01.27.D1000,C12,RNA_191_23-049776_C1,170,550,310,16.7,85.3,93.76,IDT
2023-05-16 - 13.01.27.D1000,D12,RNA_191_23-049778_D1,170,550,307,11.4,58.6,91.65,IDT
2023-05-16 - 13.01.27.D1000,E12,RNA_191_NTC_E1,170,550,304,9.63,50.0,90.88,IDT
Loading