diff --git a/Cargo.lock b/Cargo.lock index fa0c768b8..4ee21b82d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -789,6 +789,21 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "clap" +version = "2.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" +dependencies = [ + "ansi_term", + "atty", + "bitflags", + "strsim 0.8.0", + "textwrap", + "unicode-width", + "vec_map", +] + [[package]] name = "concurrent-queue" version = "1.2.2" @@ -1015,7 +1030,7 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim", + "strsim 0.10.0", "syn", ] @@ -1099,7 +1114,7 @@ dependencies = [ "lazy_static", "regex", "serde", - "strsim", + "strsim 0.10.0", ] [[package]] @@ -1184,6 +1199,12 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "fixedbitset" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "279fb028e20b3c4c320317955b77c5e0c9701f05a1d309905d6fc702cdc5053e" + [[package]] name = "flate2" version = "1.0.22" @@ -1451,6 +1472,24 @@ dependencies = [ "smallvec", ] +[[package]] +name = "grex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3616b65969ace16c2092a553702e2db26cad939dbbb920335a5b6b482b36d86f" +dependencies = [ + "atty", + "itertools", + "lazy_static", + "ndarray", + "petgraph", + "regex", + "structopt", + "unic-char-range", + "unic-ucd-category", + "unicode-segmentation", +] + [[package]] name = "h2" version = "0.2.7" @@ -1929,6 +1968,15 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" +[[package]] +name = "matrixmultiply" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "add85d4dd35074e6fedc608f8c8f513a3548619a9024b751949ef0e8e45a4d84" +dependencies = [ + "rawpointer", +] + [[package]] name = 
"memchr" version = "2.4.1" @@ -2039,6 +2087,19 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "ndarray" +version = "0.15.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec23e6762830658d2b3d385a75aa212af2f67a4586d4442907144f3bb6a1ca8" +dependencies = [ + "matrixmultiply", + "num-complex 0.4.0", + "num-integer", + "num-traits", + "rawpointer", +] + [[package]] name = "net2" version = "0.2.37" @@ -2376,6 +2437,16 @@ dependencies = [ "sha-1 0.8.2", ] +[[package]] +name = "petgraph" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" +dependencies = [ + "fixedbitset", + "indexmap", +] + [[package]] name = "pin-project" version = "0.4.29" @@ -2459,6 +2530,30 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + [[package]] name = "proc-macro-hack" version = "0.5.19" @@ -2561,6 +2656,7 @@ dependencies = [ "filetime", "flexi_logger", "governor", + "grex", "hlua", "indicatif", "itertools", @@ -2581,7 +2677,7 @@ dependencies = [ "self_update", "serde", "serde_json", - "strsim", + "strsim 0.10.0", "tabwriter", "test-data-generation", "thousands", @@ -2750,6 +2846,12 @@ dependencies = [ "bitflags", ] +[[package]] +name = "rawpointer" 
+version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + [[package]] name = "rayon" version = "1.5.1" @@ -3237,12 +3339,42 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0" +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + [[package]] name = "strsim" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "structopt" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" +dependencies = [ + "clap", + "lazy_static", + "structopt-derive", +] + +[[package]] +name = "structopt-derive" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "syn" version = "1.0.86" @@ -3308,6 +3440,15 @@ dependencies = [ "yaml-rust", ] +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + [[package]] name = "thiserror" version = "1.0.30" @@ -3602,6 +3743,48 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" +[[package]] +name = "unic-char-property" +version = "0.9.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221" +dependencies = [ + "unic-char-range", +] + +[[package]] +name = "unic-char-range" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc" + +[[package]] +name = "unic-common" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc" + +[[package]] +name = "unic-ucd-category" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8d4591f5fcfe1bd4453baaf803c40e1b1e69ff8455c47620440b46efef91c0" +dependencies = [ + "matches", + "unic-char-property", + "unic-char-range", + "unic-ucd-version", +] + +[[package]] +name = "unic-ucd-version" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4" +dependencies = [ + "unic-common", +] + [[package]] name = "unicase" version = "2.6.0" @@ -3699,6 +3882,12 @@ dependencies = [ "version_check", ] +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + [[package]] name = "version_check" version = "0.9.4" diff --git a/Cargo.toml b/Cargo.toml index 1aff07c6d..6c99c0f51 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -60,6 +60,7 @@ eudex = { version = "0.1", optional = true } filetime = "0.2" flexi_logger = { version = "0.22", features = ["compress"] } governor = "0.4" +grex = "1.3.0" hlua = { version = "0.4", optional = true } indicatif = "0.16" itertools = "0.10" diff --git a/resources/test/adur-public-toilets.csv b/resources/test/adur-public-toilets.csv index c5d040006..1823c9432 100644 --- 
a/resources/test/adur-public-toilets.csv +++ b/resources/test/adur-public-toilets.csv @@ -1,16 +1,16 @@ ExtractDate,OrganisationURI,OrganisationLabel,ServiceTypeURI,ServiceTypeLabel,LocationText,CoordinateReferenceSystem,GeoX,GeoY,GeoPointLicensingURL,Category,AccessibleCategory,RADARKeyNeeded,BabyChange,FamilyToilet,ChangingPlace,AutomaticPublicConvenience,FullTimeStaffing,PartOfCommunityScheme,CommunitySchemeName,ChargeAmount,InfoURL,OpeningHours,ManagedBy,ReportEmail,ReportTel,Notes,UPRN,Postcode,StreetAddress,GeoAreaURI,GeoAreaLabel - ,http://opendatacommunities.org/id/district-council/adur,,http://id.esd.org.uk/service/579,Public toilets,BEACH GREEN PUBLIC CONVENIENCES BRIGHTON ROAD LANCING,OSGB36,518072,103649,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00 ",ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60001449,,BEACH GREEN PUBLIC CONVENIENCES BRIGHTON ROAD LANCING,, -07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES MONKS RECREATION GROUND CRABTREE LANE LANCING,OSGB36,518225,104730,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,None,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 15:00 W = 09:00 - 15:00",ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60002210,,PUBLIC CONVENIENCES MONKS RECREATION GROUND CRABTREE LANE LANCING,, -2014-07-07 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES SHOPSDAM ROAD 
LANCING,OSGB3,518915,103795,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Mens,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60007428,,,, -07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES YEW TREE CLOSE LANCING,OSGB36,518222,104168,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60008859,,PUBLIC CONVENIENCES YEW TREE CLOSE LANCING,, -07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES BEACH GREEN SHOREHAM-BY-SEA,OSGB36,521299,104515,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60009402,,PUBLIC CONVENIENCES BEACH GREEN SHOREHAM-BY-SEA,, -07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES ADUR RECREATION GROUND BRIGHTON ROAD SHOREHAM-BY-SEA,OSGB36,521048,104977,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 08:00 - 21:00 W = 08:00 
- 17:00",ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60009666,,PUBLIC CONVENIENCES ADUR RECREATION GROUND BRIGHTON ROAD SHOREHAM-BY-SEA,, -07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES FORTHAVEN SHOREHAM-BY-SEA,OSGB36,523294,104588,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60011970,,PUBLIC CONVENIENCES FORTHAVEN SHOREHAM-BY-SEA,, -07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES MIDDLE STREET SHOREHAM-BY-SEA,OSGB36,521515,105083,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60014163,,PUBLIC CONVENIENCES MIDDLE STREET SHOREHAM-BY-SEA,, -07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES CEMETERY MILL LANE SHOREHAM-BY-SEA,OSGB36,521440,105725,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,None,No,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,,ADC,surveyors@adur-worthing.gov.uk,01903 221471,Grounds staff only not public,60014340,,PUBLIC CONVENIENCES CEMETERY MILL LANE SHOREHAM-BY-SEA,, -07/07/2014 
00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES SOUTH PAVILION BUCKINGHAM PARK UPPER SHOREHAM ROAD SHOREHAM-BY-SEA,OSGB36,522118,105939,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,None,No,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60017866,,PUBLIC CONVENIENCES SOUTH PAVILION BUCKINGHAM PARK UPPER SHOREHAM ROAD SHOREHAM-BY-SEA,, -07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCE SOUTHWICK STREET SOUTHWICK,OSGB36,524401,105405,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 08:00 - 21:00 W = 08:00 - 17:00",ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60026354,,PUBLIC CONVENIENCE SOUTHWICK STREET SOUTHWICK,, -07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,WEST BEACH PUBLIC CONVENIENCES WEST BEACH ROAD LANCING,OSGB36,520354,104246,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",,surveyors@adur-worthing.gov.uk,01903 221471,,60028994,,WEST BEACH PUBLIC CONVENIENCES WEST BEACH ROAD LANCING,, -07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,BEACH TOILETS BASIN ROAD SOUTH 
SOUTHWICK,OSGB36,524375,104753,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60029181,,BEACH TOILETS BASIN ROAD SOUTH SOUTHWICK,, -07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,BEACH TOILETS BASIN ROAD SOUTH SOUTHWICK,OSGB36,522007,106062,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,None,No,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,,ADC,surveyors@adur-worthing.gov.uk,01903 221471,Grounds staff only not public,60032527,,PUBLIC CONVENIENCE NORTH PAVILION BUCKINGHAM PARK UPPER SHOREHAM ROAD SHOREHAM-BY-SEA,, -07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,BEACH TOILETS BASIN ROAD SOUTH SOUTHWICK,OSGB36,522083,105168,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,09.00 - 17.00,ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60034215,,PUBLIC CONVENIENCES CIVIC CENTRE HAM ROAD SHOREHAM-BY-SEA,, \ No newline at end of file + ,http://opendatacommunities.org/id/district-council/adur,,http://id.esd.org.uk/service/579,Public toilets,BEACH GREEN PUBLIC CONVENIENCES BRIGHTON ROAD LANCING,OSGB36,518072,103649,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and 
male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00 ",ADC,surveyor_1@adur-worthing.gov.uk,01903 221471,,60001449,,BEACH GREEN PUBLIC CONVENIENCES BRIGHTON ROAD LANCING,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES MONKS RECREATION GROUND CRABTREE LANE LANCING,OSGB36,518225,104730,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,None,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 15:00 W = 09:00 - 15:00",ADC,surveyor_2@adur-worthing.gov.uk,01903 221471,,60002210,,PUBLIC CONVENIENCES MONKS RECREATION GROUND CRABTREE LANE LANCING,, +2014-07-07 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES SHOPSDAM ROAD LANCING,OSGB3,518915,103795,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Mens,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",ADC,surveyor_3@adur-worthing.gov.uk,01903 221471,,60007428,,,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES YEW TREE CLOSE LANCING,OSGB36,518222,104168,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",ADC,surveyor_4@adur-worthing.gov.uk,01903 221471,,60008859,,PUBLIC CONVENIENCES YEW TREE CLOSE LANCING,, +07/07/2014 
00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES BEACH GREEN SHOREHAM-BY-SEA,OSGB36,521299,104515,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",ADC,surveyor_5@adur-worthing.gov.uk,01903 221471,,60009402,,PUBLIC CONVENIENCES BEACH GREEN SHOREHAM-BY-SEA,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES ADUR RECREATION GROUND BRIGHTON ROAD SHOREHAM-BY-SEA,OSGB36,521048,104977,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 08:00 - 21:00 W = 08:00 - 17:00",ADC,surveyor_6@adur-worthing.gov.uk,01903 221471,,60009666,,PUBLIC CONVENIENCES ADUR RECREATION GROUND BRIGHTON ROAD SHOREHAM-BY-SEA,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES FORTHAVEN SHOREHAM-BY-SEA,OSGB36,523294,104588,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",ADC,surveyor_7@adur-worthing.gov.uk,01903 221471,,60011970,,PUBLIC CONVENIENCES FORTHAVEN SHOREHAM-BY-SEA,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES MIDDLE STREET 
SHOREHAM-BY-SEA,OSGB36,521515,105083,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",ADC,surveyor_8@adur-worthing.gov.uk,01903 221471,,60014163,,PUBLIC CONVENIENCES MIDDLE STREET SHOREHAM-BY-SEA,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES CEMETERY MILL LANE SHOREHAM-BY-SEA,OSGB36,521440,105725,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,None,No,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,,ADC,surveyor_9@adur-worthing.gov.uk,01903 221471,Grounds staff only not public,60014340,,PUBLIC CONVENIENCES CEMETERY MILL LANE SHOREHAM-BY-SEA,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES SOUTH PAVILION BUCKINGHAM PARK UPPER SHOREHAM ROAD SHOREHAM-BY-SEA,OSGB36,522118,105939,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,None,No,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",ADC,surveyor_10@adur-worthing.gov.uk,01903 221471,,60017866,,PUBLIC CONVENIENCES SOUTH PAVILION BUCKINGHAM PARK UPPER SHOREHAM ROAD SHOREHAM-BY-SEA,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCE SOUTHWICK STREET SOUTHWICK,OSGB36,524401,105405,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female 
and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 08:00 - 21:00 W = 08:00 - 17:00",ADC,surveyor_11@adur-worthing.gov.uk,01903 221471,,60026354,,PUBLIC CONVENIENCE SOUTHWICK STREET SOUTHWICK,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,WEST BEACH PUBLIC CONVENIENCES WEST BEACH ROAD LANCING,OSGB36,520354,104246,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",,surveyor_12@adur-worthing.gov.uk,01903 221471,,60028994,,WEST BEACH PUBLIC CONVENIENCES WEST BEACH ROAD LANCING,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,BEACH TOILETS BASIN ROAD SOUTH SOUTHWICK,OSGB36,524375,104753,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,"S = 09:00 - 21:00 W = 09:00 - 17:00",ADC,surveyor_13@adur-worthing.gov.uk,01903 221471,,60029181,,BEACH TOILETS BASIN ROAD SOUTH SOUTHWICK,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,BEACH TOILETS BASIN ROAD SOUTH SOUTHWICK,OSGB36,522007,106062,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,None,No,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,,ADC,surveyor_14@adur-worthing.gov.uk,01903 221471,Grounds staff only not public,60032527,,PUBLIC CONVENIENCE NORTH PAVILION BUCKINGHAM PARK UPPER SHOREHAM 
ROAD SHOREHAM-BY-SEA,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,BEACH TOILETS BASIN ROAD SOUTH SOUTHWICK,OSGB36,522083,105168,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,09.00 - 17.00,ADC,surveyor_15@adur-worthing.gov.uk,01903 221471,,60034215,,PUBLIC CONVENIENCES CIVIC CENTRE HAM ROAD SHOREHAM-BY-SEA,, \ No newline at end of file diff --git a/resources/test/adur-public-toilets.csv.schema-with-value-constraints.expected.json b/resources/test/adur-public-toilets.csv.schema-with-value-constraints.expected.json index e9788943c..23c24d849 100644 --- a/resources/test/adur-public-toilets.csv.schema-with-value-constraints.expected.json +++ b/resources/test/adur-public-toilets.csv.schema-with-value-constraints.expected.json @@ -280,14 +280,12 @@ }, "ReportEmail": { "description": "ReportEmail column from adur-public-toilets.csv", - "minLength": 30, - "maxLength": 30, + "minLength": 31, + "maxLength": 32, "type": [ "string" ], - "enum": [ - "surveyors@adur-worthing.gov.uk" - ] + "pattern": "^\\w{9}\\d(?:\\d)?@\\w{4}\\-\\w{8}\\.\\w{3}\\.\\w\\w$" }, "ReportTel": { "description": "ReportTel column from adur-public-toilets.csv", diff --git a/src/cmd/fetch.rs b/src/cmd/fetch.rs index 0c4c8e55b..9c4432304 100644 --- a/src/cmd/fetch.rs +++ b/src/cmd/fetch.rs @@ -152,9 +152,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { #[allow(unused_assignments)] let mut record = csv::ByteRecord::new(); - for row in rdr.byte_records() { - record = row?; - + while rdr.read_byte_record(&mut record)? 
{ if !args.flag_quiet { progress.inc(1); } diff --git a/src/cmd/schema.rs b/src/cmd/schema.rs index ca1db2d72..39de3dd70 100644 --- a/src/cmd/schema.rs +++ b/src/cmd/schema.rs @@ -1,15 +1,16 @@ use crate::cmd::stats::Stats; -use crate::config::Delimiter; +use crate::config::{Config, Delimiter}; use crate::select::SelectColumns; use crate::util; use crate::CliError; use crate::CliResult; use csv::ByteRecord; +use grex::RegExpBuilder; use log::{debug, error, info, warn}; use serde::Deserialize; use serde_json::{json, value::Number, Map, Value}; use stats::Frequencies; -use std::{collections::hash_map::HashMap, fs::File, io::Write, path::Path}; +use std::{collections::hash_map::HashMap, collections::HashSet, fs::File, io::Write, path::Path}; macro_rules! fail { ($mesg:expr) => { @@ -34,8 +35,8 @@ Usage: qsv schema [options] [] Schema options: - --enum-threshold NUM Cardinality threshold for adding enum constraints [default: 12] - --pattern-columns Select columns to add pattern constraints [default: none] + --enum-threshold NUM Cardinality threshold for adding enum constraints [default: 50] + --pattern-columns Select columns to add pattern constraints Common options: -h, --help Display this message @@ -47,7 +48,6 @@ Common options: Must be a single character. 
[default: ,] "; -#[allow(dead_code)] #[derive(Deserialize, Debug)] struct Args { flag_enum_threshold: usize, @@ -75,19 +75,32 @@ pub fn run(argv: &[&str]) -> CliResult<()> { let mut schema_output_file = File::create(&schema_output_filename).expect("unable to create schema output file"); - let properties_map: Map = match infer_schema_from_stats(&args, input_filename) { - Ok(map) => map, - Err(e) => { - let msg = format!("Failed to infer schema via stats and frequency: {e}"); - fail!(msg); - } - }; + // build schema for each field by their inferred type, min/max value/length, and unique values + let mut properties_map: Map = + match infer_schema_from_stats(&args, input_filename) { + Ok(map) => map, + Err(e) => { + let msg = format!("Failed to infer schema via stats and frequency: {e}"); + fail!(msg); + } + }; - let mut fields: Vec = Vec::new(); - for key in properties_map.keys() { - fields.push(Value::String(key.clone())); + // generate regex pattern for selected String columns + let pattern_map = generate_string_patterns(&args, &properties_map)?; + + // enrich properties map with pattern constraint for String fields + for (field_name, field_def) in properties_map.iter_mut() { + // dbg!(&field_name, &field_def); + if pattern_map.contains_key(field_name) && should_emit_pattern_constraint(field_def) { + let field_def_map = field_def.as_object_mut().unwrap(); + let pattern = Value::String(pattern_map[field_name].clone()); + field_def_map.insert("pattern".to_string(), pattern); + } } + // generate list of required fields + let required_fields = get_required_fields(&properties_map); + // create final JSON object for output let schema = json!({ "$schema": "https://json-schema.org/draft-07/schema", @@ -95,7 +108,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { "description": "Inferred JSON Schema from QSV schema command", "type": "object", "properties": Value::Object(properties_map), - "required": Value::Array(fields) + "required": Value::Array(required_fields) }); let 
schema_pretty = serde_json::to_string_pretty(&schema).expect("prettify schema json"); @@ -112,121 +125,14 @@ pub fn run(argv: &[&str]) -> CliResult<()> { Ok(()) } -/// get stats records from cmd::stats -/// returns tuple (csv_fields, csv_stats, stats_col_index_map) -fn get_stats_records(args: &Args) -> CliResult<(ByteRecord, Vec, HashMap)> { - let stats_args = crate::cmd::stats::Args { - arg_input: args.arg_input.clone(), - flag_select: crate::select::SelectColumns::parse("").unwrap(), - flag_everything: false, - flag_mode: false, - flag_cardinality: true, - flag_median: false, - flag_quartiles: false, - flag_nulls: false, - flag_nullcount: true, - flag_jobs: util::max_jobs() as isize, - flag_output: None, - flag_no_headers: args.flag_no_headers, - flag_delimiter: args.flag_delimiter, - }; - - let (csv_fields, csv_stats) = match stats_args.rconfig().indexed() { - Ok(o) => match o { - None => { - info!("no index, triggering sequential stats"); - stats_args.sequential_stats() - } - Some(idx) => { - info!("has index, triggering parallel stats"); - stats_args.parallel_stats(idx) - } - }, - Err(e) => { - warn!("error determining if indexed, triggering sequential stats: {e}"); - stats_args.sequential_stats() - } - }?; - - let stats_columns = stats_args.stat_headers(); - debug!("stats columns: {stats_columns:?}"); - - let mut stats_col_index_map = HashMap::new(); - - for (i, col) in stats_columns.iter().enumerate() { - if col != "field" { - // need offset by 1 due to extra "field" column in headers that's not in stats records - stats_col_index_map.insert(col.to_owned(), i - 1); - } - } - - Ok((csv_fields, csv_stats, stats_col_index_map)) -} - -/// get frequency tables from cmd::stats -/// returns tuple (csv_fields, csv_stats, stats_col_index_map) -fn get_frequency_tables( - args: &Args, - column_select_arg: &str, -) -> CliResult<(ByteRecord, Vec>>)> { - let freq_args = crate::cmd::frequency::Args { - arg_input: args.arg_input.clone(), - flag_select: 
crate::select::SelectColumns::parse(column_select_arg).unwrap(), - flag_limit: args.flag_enum_threshold, - flag_asc: false, - flag_no_nulls: true, - flag_jobs: util::max_jobs() as isize, - flag_output: None, - flag_no_headers: args.flag_no_headers, - flag_delimiter: args.flag_delimiter, - }; - - let (headers, ftables) = match freq_args.rconfig().indexed()? { - Some(ref mut idx) => freq_args.parallel_ftables(idx), - _ => freq_args.sequential_ftables(), - }?; - - Ok((headers, ftables)) -} - -// get column selector arg for low cardinality columns -fn build_low_cardinality_column_selector_arg( - enum_cardinality_threshold: usize, - csv_fields: &ByteRecord, - csv_stats: &[Stats], - stats_col_index_map: &HashMap, -) -> String { - let mut low_cardinality_column_indices = Vec::new(); - - // identify low cardinality columns - for i in 0..csv_fields.len() { - // grab stats record for current column - let stats_record = csv_stats.get(i).unwrap().clone().to_record(); - - // get Cardinality - let col_cardinality = match stats_record.get(stats_col_index_map["cardinality"]) { - Some(s) => s.parse::().unwrap_or(0_usize), - None => 0_usize, - }; - // debug!("column_{i}: cardinality={col_cardinality}"); - - if col_cardinality <= enum_cardinality_threshold { - // column selector uses 1-based index - low_cardinality_column_indices.push(i + 1); - }; - } - - debug!("low cardinality columns: {low_cardinality_column_indices:?}"); - - use itertools::Itertools; - let column_select_arg: String = low_cardinality_column_indices - .iter() - .map(ToString::to_string) - .join(","); - - column_select_arg -} - +/// Builds JSON MAP object that corresponds to the "properties" object of JSON Schema (Draft 7) by looking at CSV value stats +/// Supported JSON Schema validation vocabularies: +/// * type +/// * enum +/// * minLength +/// * maxLength +/// * min +/// * max #[allow(clippy::len_zero)] fn infer_schema_from_stats(args: &Args, input_filename: &str) -> CliResult> { // invoke cmd::stats @@ -240,64 
+146,17 @@ fn infer_schema_from_stats(args: &Args, input_filename: &str) -> CliResult> = HashMap::new(); - - // iterate through fields and gather unique values for each field - for (i, header) in freq_csv_fields.iter().enumerate() { - let mut unique_values = Vec::new(); - - for (val_byte_vec, _count) in frequency_tables[i].most_frequent() { - match std::str::from_utf8(val_byte_vec) { - Ok(s) => { - unique_values.push(s.to_string()); - } - Err(e) => { - let msg = format!("Can't read value from column {i} as utf8: {e}"); - error!("{msg}"); - fail!(msg); - } - }; - } - - // convert csv header to string - let header_string: String = match std::str::from_utf8(header) { - Ok(s) => s.to_string(), - Err(e) => { - let msg = format!("Can't read header from column {i} as utf8: {e}"); - error!("{msg}"); - fail!(msg); - } - }; - - // sort the values so enum list so schema can be diff'ed between runs - unique_values.sort(); - - debug!( - "enum[{header_string}]: len={}, val={:?}", - unique_values.len(), - unique_values - ); - unique_values_map.insert(header_string, unique_values); - } - - // dbg!(&unique_values_map); + // invoke cmd::frequency to get unique values for each field + let unique_values_map = get_unique_values(args, &column_select_arg)?; // map holds "properties" object of json schema let mut properties_map: Map = Map::new(); // generate definition for each CSV column/field and add to properties_map for i in 0..csv_fields.len() { - let header = csv_fields.get(i).unwrap(); + let header_byte_slice = csv_fields.get(i).unwrap(); // convert csv header to string - let header_string: String = match std::str::from_utf8(header) { - Ok(s) => s.to_string(), - Err(e) => { - fail!(format!("Can't read header from column {i} as utf8: {e}")); - } - }; + let header_string = convert_to_string(header_byte_slice)?; // grab stats record for current column let stats_record = csv_stats.get(i).unwrap().clone().to_record(); @@ -426,3 +285,267 @@ fn infer_schema_from_stats(args: &Args, 
input_filename: &str) -> CliResult CliResult<(ByteRecord, Vec, HashMap)> { + let stats_args = crate::cmd::stats::Args { + arg_input: args.arg_input.clone(), + flag_select: crate::select::SelectColumns::parse("").unwrap(), + flag_everything: false, + flag_mode: false, + flag_cardinality: true, + flag_median: false, + flag_quartiles: false, + flag_nulls: false, + flag_nullcount: true, + flag_jobs: util::max_jobs() as isize, + flag_output: None, + flag_no_headers: args.flag_no_headers, + flag_delimiter: args.flag_delimiter, + }; + + let (csv_fields, csv_stats) = match stats_args.rconfig().indexed() { + Ok(o) => match o { + None => { + info!("no index, triggering sequential stats"); + stats_args.sequential_stats() + } + Some(idx) => { + info!("has index, triggering parallel stats"); + stats_args.parallel_stats(idx) + } + }, + Err(e) => { + warn!("error determining if indexed, triggering sequential stats: {e}"); + stats_args.sequential_stats() + } + }?; + + let stats_columns = stats_args.stat_headers(); + debug!("stats columns: {stats_columns:?}"); + + let mut stats_col_index_map = HashMap::new(); + + for (i, col) in stats_columns.iter().enumerate() { + if col != "field" { + // need offset by 1 due to extra "field" column in headers that's not in stats records + stats_col_index_map.insert(col.to_owned(), i - 1); + } + } + + Ok((csv_fields, csv_stats, stats_col_index_map)) +} + +/// get column selector argument string for low cardinality columns +fn build_low_cardinality_column_selector_arg( + enum_cardinality_threshold: usize, + csv_fields: &ByteRecord, + csv_stats: &[Stats], + stats_col_index_map: &HashMap, +) -> String { + let mut low_cardinality_column_indices = Vec::new(); + + // identify low cardinality columns + for i in 0..csv_fields.len() { + // grab stats record for current column + let stats_record = csv_stats.get(i).unwrap().clone().to_record(); + + // get Cardinality + let col_cardinality = match stats_record.get(stats_col_index_map["cardinality"]) { + 
Some(s) => s.parse::().unwrap_or(0_usize), + None => 0_usize, + }; + // debug!("column_{i}: cardinality={col_cardinality}"); + + if col_cardinality <= enum_cardinality_threshold { + // column selector uses 1-based index + low_cardinality_column_indices.push(i + 1); + }; + } + + debug!("low cardinality columns: {low_cardinality_column_indices:?}"); + + use itertools::Itertools; + let column_select_arg: String = low_cardinality_column_indices + .iter() + .map(ToString::to_string) + .join(","); + + column_select_arg +} + +/// get frequency tables from cmd::stats +/// returns map of unique valules keyed by header +fn get_unique_values( + args: &Args, + column_select_arg: &str, +) -> CliResult>> { + // prepare arg for invoking cmd::frequency + let freq_args = crate::cmd::frequency::Args { + arg_input: args.arg_input.clone(), + flag_select: crate::select::SelectColumns::parse(column_select_arg).unwrap(), + flag_limit: args.flag_enum_threshold, + flag_asc: false, + flag_no_nulls: true, + flag_jobs: util::max_jobs() as isize, + flag_output: None, + flag_no_headers: args.flag_no_headers, + flag_delimiter: args.flag_delimiter, + }; + + let (headers, ftables) = match freq_args.rconfig().indexed()? 
{ + Some(ref mut idx) => freq_args.parallel_ftables(idx), + _ => freq_args.sequential_ftables(), + }?; + + let unique_values_map = construct_map_of_unique_values(headers, ftables)?; + + Ok(unique_values_map) +} + +/// construct map of unique values keyed by header +fn construct_map_of_unique_values( + freq_csv_fields: ByteRecord, + frequency_tables: Vec>>, +) -> CliResult>> { + let mut unique_values_map: HashMap> = HashMap::new(); + + // iterate through fields and gather unique values for each field + for (i, header_byte_slice) in freq_csv_fields.iter().enumerate() { + let mut unique_values = Vec::new(); + + for (val_byte_vec, _count) in frequency_tables[i].most_frequent() { + let val_string = convert_to_string(val_byte_vec.as_slice())?; + unique_values.push(val_string); + } + + let header_string = convert_to_string(header_byte_slice)?; + + // sort the values so enum list so schema can be diff'ed between runs + unique_values.sort(); + + debug!( + "enum[{header_string}]: len={}, val={:?}", + unique_values.len(), + unique_values + ); + unique_values_map.insert(header_string, unique_values); + } + + // dbg!(&unique_values_map); + + Ok(unique_values_map) +} + +/// convert byte slice to UTF8 String +fn convert_to_string(byte_slice: &[u8]) -> CliResult { + // convert csv header to string + let string: String = match std::str::from_utf8(byte_slice) { + Ok(s) => s.to_string(), + Err(e) => { + let msg = + format!("Can't convert byte slice to utf8 string. 
slice={byte_slice:?}, error={e}"); + error!("{msg}"); + fail!(msg); + } + }; + + Ok(string) +} + +/// determine required fields +fn get_required_fields(properties_map: &Map) -> Vec { + let mut fields: Vec = Vec::new(); + + // for CSV, all columns in original input file are assume required + for key in properties_map.keys() { + fields.push(Value::String(key.clone())); + } + + fields +} + +/// generate map of regex patterns from selected String column of CSV +fn generate_string_patterns( + args: &Args, + properties_map: &Map, +) -> CliResult> { + // standard boiler-plate for reading CSV + + let rconfig = Config::new(&args.arg_input) + .delimiter(args.flag_delimiter) + .no_headers(args.flag_no_headers) + .select(args.flag_pattern_columns.clone()); + + let mut rdr = rconfig.reader()?; + + let headers = rdr.byte_headers()?.clone(); + let sel = rconfig.selection(&headers)?; + + let mut pattern_map: HashMap = HashMap::new(); + + // return empty pattern map when: + // * no columns are selected + // * all columns are selected (by default, all columns are selected when no columns are explicitly specified) + if sel.len() == 0 || sel.len() == headers.len() { + debug!("no pattern columns selected"); + return Ok(pattern_map); + } + + // Map each Header to its unique Set of values + let mut unique_values_map: HashMap> = HashMap::new(); + + #[allow(unused_assignments)] + let mut record = csv::ByteRecord::new(); + while rdr.read_byte_record(&mut record)? 
{ + for (i, value_byte_slice) in sel.select(&record).enumerate() { + // get header based on column index in Selection array + let header_byte_slice: &[u8] = headers.get(sel[i]).unwrap(); + + // convert header and value byte arrays to UTF8 strings + let header_string: String = convert_to_string(header_byte_slice)?; + + // pattern validation only applies to String type, so skip if not String + if !should_emit_pattern_constraint(&properties_map[&header_string]) { + continue; + } + + let value_string: String = convert_to_string(value_byte_slice)?; + + let set = unique_values_map + .entry(header_string) + .or_insert_with(HashSet::::new); + set.insert(value_string); + } + } + + debug!("unique values for eligible pattern columns: {unique_values_map:?}"); + + for (header, value_set) in unique_values_map.iter() { + // Convert Set to Vector + let values: Vec<&String> = Vec::from_iter(value_set); + + // build regex based on unique values + let regexp: String = RegExpBuilder::from(&values) + .with_conversion_of_digits() + .with_conversion_of_words() + .with_conversion_of_repetitions() + .with_minimum_repetitions(2) + .build(); + + pattern_map.insert(header.to_owned(), regexp); + } + + debug!("pattern map: {pattern_map:?}"); + + Ok(pattern_map) +} + +// only emit "pattern" constraint for String fields without enum constraint +fn should_emit_pattern_constraint(field_def: &Value) -> bool { + let type_list = field_def[&"type"].as_array().unwrap(); + let has_enum = field_def.get(&"enum").is_some(); + + type_list.contains(&Value::String("string".to_string())) && !has_enum +} diff --git a/src/cmd/validate.rs b/src/cmd/validate.rs index 87084c410..5f1f9c740 100644 --- a/src/cmd/validate.rs +++ b/src/cmd/validate.rs @@ -157,7 +157,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { debug!("instance[{}]: {:?}", &row_index, &instance); match validate_json_instance(&instance, &schema_compiled) { - Ok(validation_result) => { + Ok(mut validation_result) => { let results = 
&validation_result["valid"]; debug!("validation[{row_index}]: {results:?}"); @@ -198,18 +198,13 @@ pub fn run(argv: &[&str]) -> CliResult<()> { invalid_wtr.write_byte_record(&record)?; // write to error report - let mut enriched_results_map = validation_result - .as_object() - .expect("get validation results as map") - .clone(); - let _ = enriched_results_map.insert( + validation_result.as_object_mut().unwrap().insert( "row_index".to_string(), Value::Number(Number::from(row_index)), ); - let enriched_results: Value = Value::Object(enriched_results_map); error_report_file - .write_all(format!("{enriched_results}\n").as_bytes()) + .write_all(format!("{validation_result}\n").as_bytes()) .expect("unable to write to validation error report"); // for fail-fast, just break out of loop diff --git a/tests/test_schema.rs b/tests/test_schema.rs index 8ce2d3db0..71f9a26b7 100644 --- a/tests/test_schema.rs +++ b/tests/test_schema.rs @@ -16,6 +16,8 @@ fn generate_schema_with_value_constraints_then_feed_into_validate() { cmd.arg("adur-public-toilets.csv"); cmd.arg("--enum-threshold"); cmd.arg("13"); + cmd.arg("--pattern-columns"); + cmd.arg("ReportEmail,OpeningHours"); wrk.output(&mut cmd); // load output schema file diff --git a/tests/test_validate.rs b/tests/test_validate.rs index 2b3547d65..29b712964 100644 --- a/tests/test_validate.rs +++ b/tests/test_validate.rs @@ -61,8 +61,8 @@ fn validate_adur_public_toilets_dataset_with_json_schema() { // row 3: wrong value for CoordinateReferenceSystem and Category // note: removed unnecessary quotes for string column "OpeningHours" let invalid_expected = 
r#"ExtractDate,OrganisationURI,OrganisationLabel,ServiceTypeURI,ServiceTypeLabel,LocationText,CoordinateReferenceSystem,GeoX,GeoY,GeoPointLicensingURL,Category,AccessibleCategory,RADARKeyNeeded,BabyChange,FamilyToilet,ChangingPlace,AutomaticPublicConvenience,FullTimeStaffing,PartOfCommunityScheme,CommunitySchemeName,ChargeAmount,InfoURL,OpeningHours,ManagedBy,ReportEmail,ReportTel,Notes,UPRN,Postcode,StreetAddress,GeoAreaURI,GeoAreaLabel - ,http://opendatacommunities.org/id/district-council/adur,,http://id.esd.org.uk/service/579,Public toilets,BEACH GREEN PUBLIC CONVENIENCES BRIGHTON ROAD LANCING,OSGB36,518072,103649,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00 ,ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60001449,,BEACH GREEN PUBLIC CONVENIENCES BRIGHTON ROAD LANCING,, -2014-07-07 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES SHOPSDAM ROAD LANCING,OSGB3,518915,103795,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Mens,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00,ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60007428,,,, + ,http://opendatacommunities.org/id/district-council/adur,,http://id.esd.org.uk/service/579,Public toilets,BEACH GREEN PUBLIC CONVENIENCES BRIGHTON ROAD LANCING,OSGB36,518072,103649,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00 
,ADC,surveyor_1@adur-worthing.gov.uk,01903 221471,,60001449,,BEACH GREEN PUBLIC CONVENIENCES BRIGHTON ROAD LANCING,, +2014-07-07 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES SHOPSDAM ROAD LANCING,OSGB3,518915,103795,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Mens,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00,ADC,surveyor_3@adur-worthing.gov.uk,01903 221471,,60007428,,,, "#; let invalid_output: String = wrk.from_str(&wrk.path("data.csv.invalid")); assert_eq!(invalid_expected.to_string(), invalid_output); @@ -95,8 +95,8 @@ fn validate_adur_public_toilets_dataset_with_json_schema_url() { wrk.output(&mut cmd); let invalid_expected = r#"ExtractDate,OrganisationURI,OrganisationLabel,ServiceTypeURI,ServiceTypeLabel,LocationText,CoordinateReferenceSystem,GeoX,GeoY,GeoPointLicensingURL,Category,AccessibleCategory,RADARKeyNeeded,BabyChange,FamilyToilet,ChangingPlace,AutomaticPublicConvenience,FullTimeStaffing,PartOfCommunityScheme,CommunitySchemeName,ChargeAmount,InfoURL,OpeningHours,ManagedBy,ReportEmail,ReportTel,Notes,UPRN,Postcode,StreetAddress,GeoAreaURI,GeoAreaLabel - ,http://opendatacommunities.org/id/district-council/adur,,http://id.esd.org.uk/service/579,Public toilets,BEACH GREEN PUBLIC CONVENIENCES BRIGHTON ROAD LANCING,OSGB36,518072,103649,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00 ,ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60001449,,BEACH GREEN PUBLIC CONVENIENCES BRIGHTON ROAD LANCING,, -2014-07-07 
00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES SHOPSDAM ROAD LANCING,OSGB3,518915,103795,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Mens,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00,ADC,surveyors@adur-worthing.gov.uk,01903 221471,,60007428,,,, + ,http://opendatacommunities.org/id/district-council/adur,,http://id.esd.org.uk/service/579,Public toilets,BEACH GREEN PUBLIC CONVENIENCES BRIGHTON ROAD LANCING,OSGB36,518072,103649,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00 ,ADC,surveyor_1@adur-worthing.gov.uk,01903 221471,,60001449,,BEACH GREEN PUBLIC CONVENIENCES BRIGHTON ROAD LANCING,, +2014-07-07 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES SHOPSDAM ROAD LANCING,OSGB3,518915,103795,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Mens,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00,ADC,surveyor_3@adur-worthing.gov.uk,01903 221471,,60007428,,,, "#; let invalid_output: String = wrk.from_str(&wrk.path("data.csv.invalid")); assert_eq!(invalid_expected.to_string(), invalid_output);