Skip to content

Commit

Permalink
feat: improve automatic format detection
Browse files Browse the repository at this point in the history
Add more patterns to automatic format detector, pick a format that produces maximum date

fixes #103
  • Loading branch information
vlsi committed Aug 18, 2017
1 parent 6f3f929 commit 2340c61
Show file tree
Hide file tree
Showing 3 changed files with 182 additions and 28 deletions.
97 changes: 69 additions & 28 deletions src/main/java/net/atomique/ksar/AllParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
package net.atomique.ksar;

import net.atomique.ksar.xml.OSConfig;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -14,26 +15,49 @@
import java.time.LocalTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.HashMap;
import java.util.Map;
import java.util.List;
import java.util.Locale;
import java.util.TreeSet;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public abstract class AllParser {

private static final Logger log = LoggerFactory.getLogger(AllParser.class);
// Lookup table from a regular expression describing a date layout to the
// DateTimeFormatter pattern string associated with that layout.
// The alphabetic classes are lowercase-only ([a-z]); the String-returning
// determineDateFormat lowercases its input before matching against these keys.
// NOTE(review): this appears to be the pre-change table shown on the removed side
// of the diff (the DATE_FORMATS list below supersedes it) — confirm.
private static final Map<String, String> DATE_FORMAT_REGEXPS = new HashMap<String, String>() {
{
put("^\\d{8}$", "yyyyMMdd");
put("^\\d{1,2}-\\d{1,2}-\\d{4}$", "dd-MM-yyyy");
put("^\\d{4}-\\d{1,2}-\\d{1,2}$", "yyyy-MM-dd");
put("^\\d{1,2}/\\d{1,2}/\\d{4}$", "MM/dd/yyyy");
put("^\\d{4}/\\d{1,2}/\\d{1,2}$", "yyyy/MM/dd");
put("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}$", "dd MMM yyyy");
put("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}$", "dd MMMM yyyy");
put("^\\d{1,2}-\\d{1,2}-\\d{2}$", "dd-MM-yy");
put("^\\d{1,2}/\\d{1,2}/\\d{2}$", "MM/dd/yy");
}
};

/**
 * Candidate formatters tried by {@code determineDateFormat} during automatic detection.
 *
 * <p>Every pattern is compiled with {@link Locale#US}. The order of the entries is
 * significant: when two candidates parse a string to the same date,
 * {@code determineDateFormat} keeps the one that appears later in this list.
 */
private static final List<DateTimeFormatter> DATE_FORMATS = Stream.of(
"MM dd, yy",
"MM-dd-yy",
"MM/dd/yy",
"MM-dd-yyyy",
"MM/dd/yyyy",
"dd-MM-yy",
"dd.MM.yy",
"dd/MM/yy",
"dd.MM.yy.",
"dd-MM-yyyy",
"dd.MM.yyyy",
"dd/MM/yyyy",
"dd.MM.yyyy.",
"yy. MM. dd",
"yy-MM-dd",
"yy.MM.dd",
"yy/MM/dd",
"yy年MM月dd日",
"yy.dd.MM",
"yyyy. MM. dd",
"yyyy-MM-dd",
"yyyy.MM.dd",
"yyyy/MM/dd",
"yyyy.MM.dd.",
"yyyy年MM月dd日",
"yyyy.dd.MM",
"yyyyMMdd",
"dd MMM yyyy",
"dd MMMM yyyy",
"MMM dd yyyy",
"MMMM dd yyyy"
).map(p -> DateTimeFormatter.ofPattern(p, Locale.US)).collect(Collectors.toList());

public AllParser() {

Expand Down Expand Up @@ -80,14 +104,7 @@ public boolean setDate(String s) {
}

try {
DateTimeFormatter formatter;
if ("Automatic Detection".equals(dateFormat)) {
formatter = DateTimeFormatter.ofPattern(determineDateFormat(s));

} else {
formatter = DateTimeFormatter.ofPattern(dateFormat);
}

DateTimeFormatter formatter = getDateFormatter(s);
currentDate = LocalDate.parse(s, formatter);

parsedate = currentDate;
Expand All @@ -109,6 +126,20 @@ public boolean setDate(String s) {
return true;
}

/**
 * Returns the formatter used to parse dates, resolving and caching it on first use.
 *
 * <p>When {@code dateFormat} equals {@code "Automatic Detection"} the formatter is
 * chosen by {@link #determineDateFormat(String)} from the given sample; otherwise
 * {@code dateFormat} itself is compiled as a pattern. Once a non-null formatter has
 * been resolved it is reused for every later call.
 *
 * @param s sample date text, consulted only for automatic detection
 * @return the cached or freshly resolved formatter; {@code null} if detection found none
 */
private DateTimeFormatter getDateFormatter(String s) {
  if (dateFormatter == null) {
    // A null result is not sticky: the next call simply tries to resolve again.
    dateFormatter = "Automatic Detection".equals(dateFormat)
        ? determineDateFormat(s)
        : DateTimeFormatter.ofPattern(dateFormat);
  }
  return dateFormatter;
}

public String getDate() {
if (sarStartDate.equals(sarEndDate)) {
return sarStartDate;
Expand All @@ -125,13 +156,21 @@ public String getCurrentStat() {
return currentStat;
}

public static String determineDateFormat(String dateString) {
for (String regexp : DATE_FORMAT_REGEXPS.keySet()) {
if (dateString.toLowerCase().matches(regexp)) {
return DATE_FORMAT_REGEXPS.get(regexp);
/**
 * Detects the date format of {@code dateString} by trying every entry of
 * {@code DATE_FORMATS} in order.
 *
 * <p>Each candidate that parses the string yields a date; the candidate producing the
 * latest date is selected, and when several candidates produce the same date the one
 * that appears later in the list is kept.
 *
 * @param dateString the date text to analyse
 * @return the selected formatter, or {@code null} when no candidate parses the text
 */
public static DateTimeFormatter determineDateFormat(String dateString) {
DateTimeFormatter best = null;
LocalDate bestDate = null;
for (DateTimeFormatter format : DATE_FORMATS) {
try {
LocalDate nextDate = LocalDate.parse(dateString, format);
// ">=" (rather than ">") lets a later list entry replace an earlier one on equal dates.
if (bestDate == null || nextDate.compareTo(bestDate) >= 0) {
bestDate = nextDate;
best = format;
}
} catch (DateTimeParseException e) {
/* this candidate does not fit the text; move on to the next one */
}
}
// NOTE(review): the "return null" below looks like the removed pre-change line that
// the diff view interleaves with the new "return best" — confirm against the real file.
return null; // Unknown format.
return best;
}

protected String sarStartDate = null;
Expand Down Expand Up @@ -159,4 +198,6 @@ public static String determineDateFormat(String dateString) {
protected String dateFormat = "MM/dd/yy";
protected String timeFormat = "HH:mm:ss";
protected int timeColumn = 1;

private DateTimeFormatter dateFormatter;
}
42 changes: 42 additions & 0 deletions src/test/java/net/atomique/ksar/parser/DateFormatHelperTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package net.atomique.ksar.parser;

import org.junit.Ignore;
import org.junit.Test;

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.time.format.FormatStyle;
import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Pattern;

/**
 * Developer helper that prints the distinct numeric SHORT/MEDIUM date layouts used by
 * the locales available in the running JVM, in a form that can be pasted into a
 * string list of date patterns.
 */
public class DateFormatHelperTest {

  /**
   * Formats a fixed sample date (2017-10-18) with every available locale, turns the
   * result back into a pattern by substituting the known year/month/day digits, and
   * prints the sorted, de-duplicated patterns as quoted, comma-terminated lines.
   *
   * <p>Ignored by default: it produces console output only and asserts nothing.
   */
  @Test
  @Ignore
  public void generateTests() throws Exception {
    Set<String> allFormats = new HashSet<>();
    // Year, month and day are pairwise distinct so each can be mapped back unambiguously.
    LocalDate date = LocalDate.of(2017, 10, 18);
    // Three or more consecutive characters that are neither digits nor one of the
    // separators " . / : -" indicate a textual month/era, which is skipped below.
    // The hyphen is placed last so that it is a literal; written as "/-\\:" it formed
    // a range from '/' to ':' and the hyphen itself was not excluded.
    Predicate<String> nonPunctuation = Pattern.compile("[^ ./:0-9-]{3,}").asPredicate();
    for (Locale locale : Locale.getAvailableLocales()) {
      for (FormatStyle style : EnumSet.of(FormatStyle.SHORT, FormatStyle.MEDIUM)) {
        DateTimeFormatter f = DateTimeFormatter.ofLocalizedDate(style).withLocale(locale);
        String str = f.format(date);
        if (nonPunctuation.test(str)) {
          continue;
        }
        // "2017" must be replaced before "17", otherwise the four-digit year would
        // be turned into "20yy".
        String v = str.replace("2017", "yyyy").replace("17", "yy")
            .replace("18", "dd").replace("10", "MM");
        allFormats.add(v);
      }
    }
    List<String> formats = new ArrayList<>(allFormats);
    // Group patterns that differ only in their separators, then order them by exact text.
    formats.sort(Comparator.<String, String>comparing(v -> v.replaceAll("[^\\w]", "-"))
        .thenComparing(Function.identity()));

    for (String format : formats) {
      System.out.println('"' + format + "\",");
    }
  }

}
71 changes: 71 additions & 0 deletions src/test/java/net/atomique/ksar/parser/DateFormatTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package net.atomique.ksar.parser;

import net.atomique.ksar.AllParser;

import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;

/**
 * Parameterized check of {@link AllParser#determineDateFormat(String)}: every sample
 * text must be detected as the expected pattern and must parse back to the date it was
 * produced from.
 */
@RunWith(Parameterized.class)
public class DateFormatTest {
  private final String text;
  private final LocalDate date;
  private final String expected;

  /**
   * @param date the date that {@code text} represents
   * @param text the textual date handed to the detector
   * @param expected the pattern the detector is expected to select for {@code text}
   */
  public DateFormatTest(LocalDate date, String text, String expected) {
    this.text = text;
    this.date = date;
    this.expected = expected;
  }

  /**
   * Builds the test matrix: one sample date rendered with each listed pattern, plus
   * the ambiguous day/month sample from issue 103.
   */
  @Parameterized.Parameters(name = "{1} -> {2}")
  public static Iterable<Object[]> params() {
    Collection<Object[]> res = new ArrayList<>();

    // See DateFormatHelperTest.generateTests
    // Day 16 cannot be mistaken for a month, which keeps most samples unambiguous.
    LocalDate date = LocalDate.of(2017, 5, 16);
    for (String format : Arrays.asList(
        "MM-dd-yy",
        "MM/dd/yy",
        "dd-MM-yy",
        "dd.MM.yy",
        "dd/MM/yy",
        "dd.MM.yy.",
        "dd-MM-yyyy",
        "dd.MM.yyyy",
        "dd/MM/yyyy",
        "yy. MM. dd",
        "yy-MM-dd",
        "yy.MM.dd",
        "yy/MM/dd",
        "yy年MM月dd日",
        "yy.dd.MM",
        "yyyy-MM-dd",
        "yyyy.MM.dd",
        "yyyy/MM/dd",
        "yyyy.MM.dd."
    )) {
      DateTimeFormatter df = DateTimeFormatter.ofPattern(format);
      res.add(new Object[]{date, df.format(date), format});
    }
    // See https://github.com/vlsi/ksar/issues/103
    LocalDate aug4 = LocalDate.of(2017, 8, 4);
    res.add(new Object[]{aug4, "04/08/17", "dd/MM/yy"});
    return res;
  }

  @Test
  public void run() {
    DateTimeFormatter format = AllParser.determineDateFormat(text);
    // Fail with a readable message instead of a NullPointerException inside parse().
    Assert.assertNotNull("No date format detected for " + text, format);
    // Formatters built from the same pattern have the same textual description,
    // which pins the detected pattern and not only the resulting date.
    Assert.assertEquals("Detected format for " + text,
        DateTimeFormatter.ofPattern(expected).toString(), format.toString());
    LocalDate date = LocalDate.parse(text, format);
    Assert.assertEquals(text, this.date, date);
  }

}

0 comments on commit 2340c61

Please sign in to comment.