From a0504247f5a0c551d0f7218b27329008d0fc1b14 Mon Sep 17 00:00:00 2001 From: Khushboo Desai Date: Tue, 24 Dec 2024 07:03:58 +0530 Subject: [PATCH 1/2] Adds support to string coercions --- .../hive/formats/ion/IonDecoderFactory.java | 107 +++++++++++++++++- .../trino/hive/formats/ion/TestIonFormat.java | 36 ++++++ 2 files changed, 139 insertions(+), 4 deletions(-) diff --git a/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/ion/IonDecoderFactory.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/ion/IonDecoderFactory.java index 6ddfc58a64fd..76a9bc0feee7 100644 --- a/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/ion/IonDecoderFactory.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/ion/IonDecoderFactory.java @@ -56,6 +56,7 @@ import java.math.BigDecimal; import java.math.BigInteger; import java.math.RoundingMode; +import java.nio.charset.StandardCharsets; import java.time.LocalDate; import java.util.Arrays; import java.util.HashSet; @@ -127,8 +128,8 @@ private static BlockDecoder decoderForType(Type type) case BooleanType t -> wrapDecoder(boolDecoder, t, IonType.BOOL); case DateType t -> wrapDecoder(dateDecoder, t, IonType.TIMESTAMP); case TimestampType t -> wrapDecoder(timestampDecoder(t), t, IonType.TIMESTAMP); - case VarcharType t -> wrapDecoder(varcharDecoder(t), t, IonType.STRING, IonType.SYMBOL); - case CharType t -> wrapDecoder(charDecoder(t), t, IonType.STRING, IonType.SYMBOL); + case VarcharType t -> wrapDecoderWithTextCoercion(varcharDecoder(t), t, IonType.STRING, IonType.SYMBOL); + case CharType t -> wrapDecoderWithTextCoercion(charDecoder(t), t, IonType.STRING, IonType.SYMBOL); case VarbinaryType t -> wrapDecoder(binaryDecoder, t, IonType.BLOB, IonType.CLOB); case RowType t -> wrapDecoder(RowDecoder.forFields(t.getFields()), t, IonType.STRUCT); case ArrayType t -> wrapDecoder(new ArrayDecoder(decoderForType(t.getElementType())), t, IonType.LIST, IonType.SEXP); @@ -149,13 +150,52 @@ private static BlockDecoder decoderForType(Type type) private static BlockDecoder wrapDecoder(BlockDecoder decoder, Type trinoType, IonType... allowedTypes) { Set allowedWithNull = new HashSet<>(Arrays.asList(allowedTypes)); + return createConfigurableDecoder(decoder, trinoType, false, allowedTypes); + } + + /** + * Wraps decoders for common handling logic. + *

+ * Handles un-typed and correctly typed null values. + * Throws for mistyped values, whether null or not. + * Delegates to Decoder for correctly-typed, non-null values. + * Handles text coercion for Varchar and Char types. + *

+ * This code treats all values as nullable. + */ + private static BlockDecoder wrapDecoderWithTextCoercion(BlockDecoder decoder, Type trinoType, IonType... allowedTypes) + { + return createConfigurableDecoder(decoder, trinoType, true, allowedTypes); + } + + /** + * Wraps decoders for common handling logic. + *

+ * Handles un-typed and correctly typed null values. + * Throws for mistyped values, whether null or not. + * Delegates to Decoder for correctly-typed, non-null values. + * Handles text coercion for Varchar and Char types. + *

+ * This code treats all values as nullable. + */ + private static BlockDecoder createConfigurableDecoder(BlockDecoder decoder, Type trinoType, boolean textCoercion, + IonType... allowedTypes) + { + final Set allowedWithNull = new HashSet<>(Arrays.asList(allowedTypes)); allowedWithNull.add(IonType.NULL); return (reader, builder) -> { final IonType ionType = reader.getType(); if (!allowedWithNull.contains(ionType)) { - throw new TrinoException(StandardErrorCode.GENERIC_USER_ERROR, - "Cannot coerce IonType %s to Trino type %s".formatted(ionType, trinoType)); + if (textCoercion) { + String coercedValue = coerceToString(reader, ionType); + VarcharType.VARCHAR.writeSlice(builder, Slices.utf8Slice(coercedValue)); + return; + } + else { + throw new TrinoException(StandardErrorCode.GENERIC_USER_ERROR, + "Cannot coerce IonType %s to Trino type %s".formatted(ionType, trinoType)); + } } if (reader.isNullValue()) { builder.appendNull(); @@ -166,6 +206,65 @@ private static BlockDecoder wrapDecoder(BlockDecoder decoder, Type trinoType, Io }; } + /** + * Coerces an Ion value to its string representation. + * + * This method handles all IonTypes and converts them to a string format. + * For complex types (LIST, SEXP, STRUCT), it recursively processes their elements. + * + * @param reader The IonReader containing the value to be coerced. + * @param type The IonType of the value to be coerced. + * @return A string representation of the Ion value. + * @throws IllegalArgumentException if the IonType is not supported for text coercion. + * @throws IonException if there's an error reading from the IonReader. + */ + private static String coerceToString(IonReader reader, IonType type) + { + switch (type) { + case BOOL: + return Boolean.toString(reader.booleanValue()); + case INT: + return Long.toString(reader.longValue()); + case FLOAT: + return Double.toString(reader.doubleValue()); + case DECIMAL: + return reader.decimalValue().toString(); + case TIMESTAMP: + return reader.timestampValue().toString(); + case SYMBOL: + case STRING: + return reader.stringValue(); + case CLOB: + case BLOB: + return new String(reader.newBytes(), StandardCharsets.UTF_8); + case LIST: + case SEXP: + StringBuilder sb = new StringBuilder("["); + reader.stepIn(); + while (reader.next() != null) { + if (sb.length() > 1) { + sb.append(", "); + } + sb.append(coerceToString(reader, reader.getType())); + } + reader.stepOut(); + return sb.append("]").toString(); + case STRUCT: + sb = new StringBuilder("{"); + reader.stepIn(); + while (reader.next() != null) { + if (sb.length() > 1) { + sb.append(", "); + } + sb.append(reader.getFieldName()).append(": ").append(coerceToString(reader, reader.getType())); + } + reader.stepOut(); + return sb.append("}").toString(); + default: + throw new IllegalArgumentException(String.format("Text coercion is not supported for IonType: %s", type)); + } + } + /** * The RowDecoder is used as the BlockDecoder for nested RowTypes and is used for decoding * top-level structs into pages. diff --git a/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/ion/TestIonFormat.java b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/ion/TestIonFormat.java index bb9b3539e566..49feeb8c6888 100644 --- a/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/ion/TestIonFormat.java +++ b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/ion/TestIonFormat.java @@ -186,6 +186,42 @@ public void testCaseInsensitivityOfKeys() List.of(31, "baz")); } + @Test + public void testStringCoercions() + throws IOException + { + assertValues( + RowType.rowType( + field("foo", VARCHAR)), + "{ foo: true }", + List.of("true")); + assertValues( + RowType.rowType( + field("foo", VARCHAR)), + "{ foo: 31 }", + List.of("31")); + assertValues( + RowType.rowType( + field("foo", VARCHAR)), + "{ foo: 31.50 }", + List.of("31.50")); + assertValues( + RowType.rowType( + field("foo", VARCHAR)), + "{ foo: [1, 2, 3] }", + List.of("[1, 2, 3]")); + assertValues( + RowType.rowType( + field("foo", VARCHAR)), + "{ foo: \"bar\" }", + List.of("bar")); + assertValues( + RowType.rowType( + field("foo", VARCHAR)), + "{ foo: { nested_foo: 12 } }", + List.of("{nested_foo: 12}")); + } + @Test public void testCaseInsensitivityOfDuplicateKeys() throws IOException From b3c8a5a4566d2b42d755e533eadfb7cf41b77656 Mon Sep 17 00:00:00 2001 From: Khushboo Desai Date: Mon, 13 Jan 2025 11:58:30 -0800 Subject: [PATCH 2/2] Adds suggested changes --- .../hive/formats/ion/IonDecoderFactory.java | 154 ++++++------------ .../trino/hive/formats/ion/TestIonFormat.java | 4 +- 2 files changed, 49 insertions(+), 109 deletions(-) diff --git a/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/ion/IonDecoderFactory.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/ion/IonDecoderFactory.java index 76a9bc0feee7..255882bb43ef 100644 --- a/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/ion/IonDecoderFactory.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/ion/IonDecoderFactory.java @@ -16,7 +16,9 @@ import com.amazon.ion.IonException; import com.amazon.ion.IonReader; import com.amazon.ion.IonType; +import com.amazon.ion.IonWriter; import com.amazon.ion.Timestamp; +import com.amazon.ion.system.IonTextWriterBuilder; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import io.airlift.slice.Slices; @@ -53,10 +55,10 @@ import io.trino.spi.type.VarcharType; import io.trino.spi.type.Varchars; +import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; import java.math.RoundingMode; -import java.nio.charset.StandardCharsets; import java.time.LocalDate; import java.util.Arrays; import java.util.HashSet; @@ -128,8 +130,8 @@ private static BlockDecoder decoderForType(Type type) case BooleanType t -> wrapDecoder(boolDecoder, t, IonType.BOOL); case DateType t -> wrapDecoder(dateDecoder, t, IonType.TIMESTAMP); case TimestampType t -> wrapDecoder(timestampDecoder(t), t, IonType.TIMESTAMP); - case VarcharType t -> wrapDecoderWithTextCoercion(varcharDecoder(t), t, IonType.STRING, IonType.SYMBOL); - case CharType t -> wrapDecoderWithTextCoercion(charDecoder(t), t, IonType.STRING, IonType.SYMBOL); + case VarcharType t -> wrapDecoder(varcharDecoder(t), t, IonType.values()); + case CharType t -> wrapDecoder(charDecoder(t), t, IonType.values()); case VarbinaryType t -> wrapDecoder(binaryDecoder, t, IonType.BLOB, IonType.CLOB); case RowType t -> wrapDecoder(RowDecoder.forFields(t.getFields()), t, IonType.STRUCT); case ArrayType t -> wrapDecoder(new ArrayDecoder(decoderForType(t.getElementType())), t, IonType.LIST, IonType.SEXP); @@ -148,38 +150,6 @@ private static BlockDecoder decoderForType(Type type) * This code treats all values as nullable. */ private static BlockDecoder wrapDecoder(BlockDecoder decoder, Type trinoType, IonType... allowedTypes) - { - Set allowedWithNull = new HashSet<>(Arrays.asList(allowedTypes)); - return createConfigurableDecoder(decoder, trinoType, false, allowedTypes); - } - - /** - * Wraps decoders for common handling logic. - *

- * Handles un-typed and correctly typed null values. - * Throws for mistyped values, whether null or not. - * Delegates to Decoder for correctly-typed, non-null values. - * Handles text coercion for Varchar and Char types. - *

- * This code treats all values as nullable. - */ - private static BlockDecoder wrapDecoderWithTextCoercion(BlockDecoder decoder, Type trinoType, IonType... allowedTypes) - { - return createConfigurableDecoder(decoder, trinoType, true, allowedTypes); - } - - /** - * Wraps decoders for common handling logic. - *

- * Handles un-typed and correctly typed null values. - * Throws for mistyped values, whether null or not. - * Delegates to Decoder for correctly-typed, non-null values. - * Handles text coercion for Varchar and Char types. - *

- * This code treats all values as nullable. - */ - private static BlockDecoder createConfigurableDecoder(BlockDecoder decoder, Type trinoType, boolean textCoercion, - IonType... allowedTypes) { final Set allowedWithNull = new HashSet<>(Arrays.asList(allowedTypes)); allowedWithNull.add(IonType.NULL); @@ -187,15 +157,8 @@ private static BlockDecoder createConfigurableDecoder(BlockDecoder decoder, Type return (reader, builder) -> { final IonType ionType = reader.getType(); if (!allowedWithNull.contains(ionType)) { - if (textCoercion) { - String coercedValue = coerceToString(reader, ionType); - VarcharType.VARCHAR.writeSlice(builder, Slices.utf8Slice(coercedValue)); - return; - } - else { - throw new TrinoException(StandardErrorCode.GENERIC_USER_ERROR, - "Cannot coerce IonType %s to Trino type %s".formatted(ionType, trinoType)); - } + throw new TrinoException(StandardErrorCode.GENERIC_USER_ERROR, + "Cannot coerce IonType %s to Trino type %s".formatted(ionType, trinoType)); } if (reader.isNullValue()) { builder.appendNull(); @@ -206,65 +169,6 @@ private static BlockDecoder createConfigurableDecoder(BlockDecoder decoder, Type }; } - /** - * Coerces an Ion value to its string representation. - * - * This method handles all IonTypes and converts them to a string format. - * For complex types (LIST, SEXP, STRUCT), it recursively processes their elements. - * - * @param reader The IonReader containing the value to be coerced. - * @param type The IonType of the value to be coerced. - * @return A string representation of the Ion value. - * @throws IllegalArgumentException if the IonType is not supported for text coercion. - * @throws IonException if there's an error reading from the IonReader. - */ - private static String coerceToString(IonReader reader, IonType type) - { - switch (type) { - case BOOL: - return Boolean.toString(reader.booleanValue()); - case INT: - return Long.toString(reader.longValue()); - case FLOAT: - return Double.toString(reader.doubleValue()); - case DECIMAL: - return reader.decimalValue().toString(); - case TIMESTAMP: - return reader.timestampValue().toString(); - case SYMBOL: - case STRING: - return reader.stringValue(); - case CLOB: - case BLOB: - return new String(reader.newBytes(), StandardCharsets.UTF_8); - case LIST: - case SEXP: - StringBuilder sb = new StringBuilder("["); - reader.stepIn(); - while (reader.next() != null) { - if (sb.length() > 1) { - sb.append(", "); - } - sb.append(coerceToString(reader, reader.getType())); - } - reader.stepOut(); - return sb.append("]").toString(); - case STRUCT: - sb = new StringBuilder("{"); - reader.stepIn(); - while (reader.next() != null) { - if (sb.length() > 1) { - sb.append(", "); - } - sb.append(reader.getFieldName()).append(": ").append(coerceToString(reader, reader.getType())); - } - reader.stepOut(); - return sb.append("}").toString(); - default: - throw new IllegalArgumentException(String.format("Text coercion is not supported for IonType: %s", type)); - } - } - /** * The RowDecoder is used as the BlockDecoder for nested RowTypes and is used for decoding * top-level structs into pages. @@ -459,16 +363,52 @@ private static BlockDecoder decimalDecoder(DecimalType type) }; } + private static String getCoercedValue(IonReader ionReader) + { + IonTextWriterBuilder textWriterBuilder = IonTextWriterBuilder.standard(); + StringBuilder stringBuilder = new StringBuilder(); + IonWriter writer = textWriterBuilder.build(stringBuilder); + try { + writer.writeValue(ionReader); + } + catch (IOException e) { + throw new RuntimeException(e); + } + return stringBuilder.toString(); + } + private static BlockDecoder varcharDecoder(VarcharType type) { - return (ionReader, blockBuilder) -> - type.writeSlice(blockBuilder, Varchars.truncateToLength(Slices.utf8Slice(ionReader.stringValue()), type)); + return (ionReader, blockBuilder) -> { + IonType valueType = ionReader.getType(); + String value; + + if (valueType == IonType.SYMBOL || valueType == IonType.STRING) { + value = ionReader.stringValue(); + } + else { + // For any types other than IonType.SYMBOL and IonType.STRING, performs text coercion + value = getCoercedValue(ionReader); + } + type.writeSlice(blockBuilder, Varchars.truncateToLength(Slices.utf8Slice(value), type)); + }; } private static BlockDecoder charDecoder(CharType type) { - return (ionReader, blockBuilder) -> - type.writeSlice(blockBuilder, Chars.truncateToLengthAndTrimSpaces(Slices.utf8Slice(ionReader.stringValue()), type)); + return (ionReader, blockBuilder) -> { + IonType valueType = ionReader.getType(); + String value; + + if (valueType == IonType.SYMBOL || valueType == IonType.STRING) { + value = ionReader.stringValue(); + } + else { + // For any types other than IonType.SYMBOL and IonType.STRING, performs text coercion + value = getCoercedValue(ionReader); + } + type.writeSlice(blockBuilder, Chars.truncateToLengthAndTrimSpaces(Slices.utf8Slice(value), type)); + }; } private static final BlockDecoder byteDecoder = (ionReader, blockBuilder) -> diff --git a/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/ion/TestIonFormat.java b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/ion/TestIonFormat.java index 49feeb8c6888..ead606fe3e1b 100644 --- a/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/ion/TestIonFormat.java +++ b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/ion/TestIonFormat.java @@ -209,7 +209,7 @@ public void testStringCoercions() RowType.rowType( field("foo", VARCHAR)), "{ foo: [1, 2, 3] }", - List.of("[1, 2, 3]")); + List.of("[1,2,3]")); assertValues( RowType.rowType( field("foo", VARCHAR)), @@ -219,7 +219,7 @@ public void testStringCoercions() RowType.rowType( field("foo", VARCHAR)), "{ foo: { nested_foo: 12 } }", - List.of("{nested_foo: 12}")); + List.of("{nested_foo:12}")); } @Test