Skip to content

Commit

Permalink
Fix issue regarding empty tags and refactor code (#8)
Browse files Browse the repository at this point in the history
* Fix issue regarding empty tags
* Set correct encoding after removing empty tags
  • Loading branch information
BolZer authored Feb 7, 2025
1 parent e3de098 commit efdc7be
Show file tree
Hide file tree
Showing 7 changed files with 434 additions and 57 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,8 @@ public class InvalidXmlException extends ValidatorException {
public InvalidXmlException() {
super();
}

public InvalidXmlException(Throwable cause) {
super("the xml is invalid", cause);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package io.github.easybill.Exceptions;

/**
 * Raised when an XML payload cannot be sanitized. It always wraps the
 * underlying failure as its cause.
 */
public class XmlSanitizationException extends ValidatorException {

    /** Fixed detail message used for every instance. */
    private static final String MESSAGE =
        "could not sanitize the xml accordingly";

    /**
     * Creates the exception with the fixed detail message and the given cause.
     *
     * @param cause the failure that interrupted sanitization
     */
    public XmlSanitizationException(Throwable cause) {
        super(MESSAGE, cause);
    }
}
61 changes: 7 additions & 54 deletions src/main/java/io/github/easybill/Services/ValidationService.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Pattern;
import net.sf.saxon.type.ValidationException;
import org.checkerframework.checker.nullness.qual.NonNull;
import org.mozilla.universalchardet.UniversalDetector;

Expand Down Expand Up @@ -66,13 +66,14 @@ public final class ValidationService implements IValidationService {

var xml = new String(bytesFromSteam, charset);

if (isXmlInvalid(xml)) {
if (xml.isBlank()) {
throw new InvalidXmlException();
}

xml = removeBOM(xml);
xml = XMLSanitizer.sanitize(xml, charset);

var xmlSyntaxType = determineXmlSyntax(xml)
var xmlSyntaxType = XMLSyntaxGuesser
.tryGuessSyntax(xml)
.orElseThrow(InvalidXmlException::new);

var report = innerValidateSchematron(
Expand Down Expand Up @@ -144,56 +145,6 @@ private Charset determineCharsetForXmlPayload(byte[] bytes)
throw new InvalidXmlException();
}

/**
 * Tells whether the payload should be rejected before validation.
 *
 * @param xml the decoded payload
 * @return {@code true} when the payload is blank or matches neither the
 *         UBL nor the CII marker check
 */
private boolean isXmlInvalid(@NonNull String xml) {
    if (xml.isBlank()) {
        return true;
    }

    // Same short-circuit order as before: UBL first, CII only if UBL failed.
    if (checkIfUblXml(xml)) {
        return false;
    }

    return !checkIfCiiXml(xml);
}

/**
 * Determines which invoice syntax the payload uses.
 *
 * @param xml the decoded payload
 * @return {@link XMLSyntaxType#CII} when the CII check matches, otherwise
 *         {@link XMLSyntaxType#UBL} when the UBL check matches, otherwise
 *         an empty {@link Optional}
 */
private Optional<XMLSyntaxType> determineXmlSyntax(@NonNull String xml) {
    XMLSyntaxType detected = null;

    // CII takes precedence over UBL when both checks would match.
    if (checkIfCiiXml(xml)) {
        detected = XMLSyntaxType.CII;
    } else if (checkIfUblXml(xml)) {
        detected = XMLSyntaxType.UBL;
    }

    return Optional.ofNullable(detected);
}

/**
 * Checks whether the payload contains {@code CrossIndustryInvoice}
 * directly preceded by {@code <} or {@code :}.
 *
 * @param payload the text to search
 * @return {@code true} when the marker occurs anywhere in the payload
 */
private boolean checkIfCiiXml(@NonNull CharSequence payload) {
    Pattern ciiMarker = Pattern.compile("[<:](CrossIndustryInvoice)");
    var matcher = ciiMarker.matcher(payload);

    return matcher.find();
}

/**
 * Checks whether the payload contains {@code Invoice} or
 * {@code CreditNote} directly preceded by {@code <} or {@code :}.
 *
 * @param payload the text to search
 * @return {@code true} when either marker occurs anywhere in the payload
 */
private boolean checkIfUblXml(@NonNull CharSequence payload) {
    Pattern ublMarker = Pattern.compile("[<:](Invoice|CreditNote)");
    var matcher = ublMarker.matcher(payload);

    return matcher.find();
}

/**
 * Strips a single leading byte-order-mark character from the payload.
 *
 * @param $payload the decoded payload
 * @return the payload without its first character when that character is
 *         U+FEFF or U+FFFE; otherwise the payload unchanged
 */
private @NonNull String removeBOM(@NonNull String $payload) {
    // U+FEFF covers what the original named UTF8_BOM and UTF16BE_BOM;
    // U+FFFE is what it named UTF16LE_BOM.
    final String[] byteOrderMarks = { "\uFEFF", "\uFFFE" };

    for (String byteOrderMark : byteOrderMarks) {
        // An empty payload never matches, so it is returned untouched below.
        if ($payload.startsWith(byteOrderMark)) {
            return $payload.substring(1);
        }
    }

    return $payload;
}

private Optional<SchematronOutputType> innerValidateSchematron(
@NonNull XMLSyntaxType xmlSyntaxType,
byte[] bytes
Expand All @@ -213,6 +164,8 @@ private Optional<SchematronOutputType> innerValidateSchematron(
};
} catch (IllegalArgumentException exception) {
throw new ParsingException(exception);
} catch (ValidationException exception) {
throw new InvalidXmlException(exception);
}
}
}
149 changes: 149 additions & 0 deletions src/main/java/io/github/easybill/Services/XMLSanitizer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
package io.github.easybill.Services;

import io.github.easybill.Exceptions.XmlSanitizationException;
import java.io.*;
import java.nio.charset.Charset;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.checkerframework.checker.nullness.qual.NonNull;
import org.w3c.dom.*;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * Static helpers that clean up an XML payload before validation.
 *
 * <p>Sanitization runs three steps in order: strip a leading byte-order
 * mark, drop anything in front of the XML declaration, and remove empty
 * elements by round-tripping the document through a DOM.
 */
public final class XMLSanitizer {

    /** Text that marks the start of the XML declaration. */
    private static final String XML_DECLARATION_START = "<?xml version";

    /** The byte-order-mark character as it appears after decoding. */
    private static final String BOM = "\uFEFF";

    /** The byte-swapped form of the byte-order mark. */
    private static final String SWAPPED_BOM = "\uFFFE";

    /**
     * Sanitizes the given XML payload.
     *
     * @param xml the decoded XML payload
     * @param charset the charset used to re-encode the payload for parsing
     *        and declared as the output encoding of the result
     * @return the re-serialized document without BOM, without leading junk
     *         before the XML declaration and without empty elements
     * @throws XmlSanitizationException when any step fails, e.g. because
     *         the payload is not well-formed; the failure is kept as cause
     */
    public static @NonNull String sanitize(
        @NonNull String xml,
        @NonNull Charset charset
    ) throws XmlSanitizationException {
        try {
            return removeEmptyTags(
                removeInvalidCharsFromProlog(removeBOM(xml)),
                charset
            );
        } catch (Exception exception) {
            // Deliberately broad: parser, transformer and runtime failures
            // all surface to the caller as one sanitization error.
            throw new XmlSanitizationException(exception);
        }
    }

    /**
     * Drops every character that precedes the XML declaration.
     *
     * @param payload the payload, already stripped of a BOM
     * @return the payload starting at {@code <?xml version}; the payload
     *         unchanged when the declaration is already at index 0 or is
     *         absent altogether
     */
    private static @NonNull String removeInvalidCharsFromProlog(
        @NonNull String payload
    ) {
        var indexOfXmlIntro = payload.indexOf(XML_DECLARATION_START);

        // indexOf returns -1 when there is no declaration. The declaration
        // is optional in XML, so pass the payload on instead of calling
        // substring(-1), which would throw.
        if (indexOfXmlIntro <= 0) {
            return payload;
        }

        return payload.substring(indexOfXmlIntro);
    }

    /**
     * Strips a single leading byte-order-mark character.
     *
     * @param xml the decoded payload
     * @return the payload without its first character when that character
     *         is U+FEFF or U+FFFE; otherwise the payload unchanged
     */
    private static @NonNull String removeBOM(@NonNull String xml) {
        // startsWith is false for an empty string, so no separate guard.
        if (xml.startsWith(BOM) || xml.startsWith(SWAPPED_BOM)) {
            return xml.substring(1);
        }

        return xml;
    }

    /**
     * Parses the payload into a DOM, removes empty elements below the root
     * and serializes the document again.
     *
     * @param xml the payload to parse
     * @param charset the charset used for the byte round trip and written
     *        as output encoding
     * @return the serialized, indented document
     * @throws ParserConfigurationException when the parser cannot be set up
     * @throws IOException when reading or writing the in-memory streams fails
     * @throws SAXException when the payload is not well-formed
     * @throws TransformerException when serialization fails
     */
    private static @NonNull String removeEmptyTags(
        @NonNull String xml,
        @NonNull Charset charset
    )
        throws ParserConfigurationException, IOException, SAXException, TransformerException {
        byte[] xmlBytes = xml.getBytes(charset);

        var builderFactory = DocumentBuilderFactory.newInstance();
        builderFactory.setNamespaceAware(true);
        builderFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
        // The payload is untrusted: never resolve external entities (XXE).
        builderFactory.setFeature(
            "http://xml.org/sax/features/external-general-entities",
            false
        );
        builderFactory.setFeature(
            "http://xml.org/sax/features/external-parameter-entities",
            false
        );
        builderFactory.setXIncludeAware(false);

        DocumentBuilder db = builderFactory.newDocumentBuilder();

        try (
            InputStream inputStream = new ByteArrayInputStream(xmlBytes);
            Reader reader = new InputStreamReader(inputStream, charset)
        ) {
            // Parsing from a Reader makes the parser use the given charset
            // instead of the encoding named in the XML declaration.
            Document document = db.parse(new InputSource(reader));

            removeEmptyElements(document.getDocumentElement());

            TransformerFactory transformerFactory =
                TransformerFactory.newInstance();

            Transformer transformer = transformerFactory.newTransformer();
            transformer.setOutputProperty(OutputKeys.ENCODING, charset.name());
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            transformer.setOutputProperty(OutputKeys.METHOD, "xml");

            try (
                ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
                Writer writer = new OutputStreamWriter(outputStream, charset)
            ) {
                transformer.transform(
                    new DOMSource(document),
                    new StreamResult(writer)
                );
                writer.flush();

                return outputStream.toString(charset);
            }
        }
    }

    /**
     * Recursively removes empty child elements of the given element.
     *
     * <p>Children are visited depth-first and from last to first, so
     * removing a child does not shift the indices still to be visited.
     * The given element itself is never removed.
     *
     * @param element the element whose descendants are cleaned
     */
    private static void removeEmptyElements(Element element) {
        NodeList children = element.getChildNodes();

        for (int i = children.getLength() - 1; i >= 0; i--) {
            Node child = children.item(i);

            if (child == null || child.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }

            Element childElement = (Element) child;

            // Clean the subtree first so a child that became empty through
            // the removal of its own children is removed as well.
            removeEmptyElements(childElement);

            if (isEmptyElement(childElement)) {
                element.removeChild(child);
            }
        }
    }

    /**
     * Tells whether an element has no content.
     *
     * <p>NOTE(review): an element left with two or more whitespace-only
     * text nodes (as happens after removing a child from indented XML) is
     * not considered empty here - confirm whether that is intended.
     *
     * @param element the element to inspect
     * @return {@code true} when the element has no child nodes, or exactly
     *         one child that is a text node containing only whitespace
     */
    private static boolean isEmptyElement(Element element) {
        int childCount = element.getChildNodes().getLength();

        if (childCount == 0) {
            return true;
        }

        if (childCount != 1) {
            return false;
        }

        Node onlyChild = element.getFirstChild();

        return (
            onlyChild != null &&
            onlyChild.getNodeType() == Node.TEXT_NODE &&
            onlyChild.getTextContent() != null &&
            onlyChild.getTextContent().trim().isEmpty()
        );
    }
}
35 changes: 35 additions & 0 deletions src/main/java/io/github/easybill/Services/XMLSyntaxGuesser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package io.github.easybill.Services;

import io.github.easybill.Enums.XMLSyntaxType;
import java.util.Optional;
import java.util.regex.Pattern;
import org.checkerframework.checker.nullness.qual.NonNull;

/**
 * Heuristically guesses which invoice syntax an XML payload uses.
 *
 * <p>The guess is a plain text search, not an XML parse: a marker name
 * counts as found when it is directly preceded by {@code <} or {@code :}
 * anywhere in the payload.
 */
public final class XMLSyntaxGuesser {

    /** Marker for CII documents; compiled once because Pattern is immutable. */
    private static final Pattern CII_MARKER_PATTERN = Pattern.compile(
        "[<:](CrossIndustryInvoice)"
    );

    /** Marker for UBL documents; compiled once because Pattern is immutable. */
    private static final Pattern UBL_MARKER_PATTERN = Pattern.compile(
        "[<:](Invoice|CreditNote)"
    );

    /**
     * Guesses the syntax of the given payload.
     *
     * @param xml the decoded XML payload
     * @return {@link XMLSyntaxType#CII} when the CII marker is found,
     *         otherwise {@link XMLSyntaxType#UBL} when a UBL marker is
     *         found, otherwise an empty {@link Optional}
     */
    public static Optional<XMLSyntaxType> tryGuessSyntax(@NonNull String xml) {
        // CII is checked first and therefore wins when both markers occur.
        if (checkIfCiiXml(xml)) {
            return Optional.of(XMLSyntaxType.CII);
        }

        if (checkIfUblXml(xml)) {
            return Optional.of(XMLSyntaxType.UBL);
        }

        return Optional.empty();
    }

    /**
     * @param payload the text to search
     * @return {@code true} when the CII marker occurs in the payload
     */
    private static boolean checkIfCiiXml(@NonNull CharSequence payload) {
        return CII_MARKER_PATTERN.matcher(payload).find();
    }

    /**
     * @param payload the text to search
     * @return {@code true} when a UBL marker occurs in the payload
     */
    private static boolean checkIfUblXml(@NonNull CharSequence payload) {
        return UBL_MARKER_PATTERN.matcher(payload).find();
    }
}
26 changes: 23 additions & 3 deletions src/test/java/io/github/easybill/ValidationControllerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,27 @@ void testValidationEndpointWithPayloadIncludingCharsInProlog()
.when()
.post("/validation")
.then()
.statusCode(422);
.statusCode(200);
}

/**
 * Posts a CII fixture containing empty tags to the validation endpoint and
 * expects a processed (HTTP 200) JSON report that marks it as invalid.
 */
@ParameterizedTest
@ValueSource(strings = { "CII/CII_empty_tags.xml" })
void testDocumentWithEmptyTags(@NonNull String fixtureFileName)
    throws IOException {
    var request = given()
        .body(loadFixtureFileAsStream(fixtureFileName))
        .contentType(ContentType.XML);

    var response = request.when().post("/validation").then();

    // The document must be accepted for processing, not rejected outright.
    response.statusCode(200).contentType(ContentType.JSON);

    response.body("is_valid", equalTo(false));
    response.body(
        "meta.validation_profile",
        equalTo(XMLSyntaxType.CII.toString())
    );
    response.body("errors", not(empty()));
}

@ParameterizedTest
Expand Down Expand Up @@ -147,7 +167,6 @@ void testValidationEndpointWithValidUblDocuments(
"CII/CII_ZUGFeRD_23_XRECHNUNG_Einfach.xml",
"CII/CII_ZUGFeRD_23_XRECHNUNG_Elektron.xml",
"CII/CII_ZUGFeRD_23_XRECHNUNG_Reisekostenabrechnung.xml",
"CII/XRechnung-O.xml",
"CII/CII_ZUGFeRD_23_EXTENDED_Rechnungskorrektur.xml",
}
)
Expand Down Expand Up @@ -188,7 +207,8 @@ static Stream<Arguments> providerValuesValidationEndpointWithInvalidPayload() {
Arguments.of(
"CII/CII_ZUGFeRD_23_EXTENDED_Projektabschlussrechnung.xml"
),
Arguments.of("CII/CII_ZUGFeRD_23_EXTENDED_Warenrechnung.xml")
Arguments.of("CII/CII_ZUGFeRD_23_EXTENDED_Warenrechnung.xml"),
Arguments.of("CII/XRechnung-O.xml")
);
}

Expand Down
Loading

0 comments on commit efdc7be

Please sign in to comment.