Skip to content

Commit

Permalink
pat: Correct number matching (again) (#330)
Browse files Browse the repository at this point in the history
* pat: correct numeric matching

addresses #27

Signed-off-by: Tim Bray <[email protected]>

* back off from math/rand/v2 to math/rand for go 1.19

Signed-off-by: Tim Bray <[email protected]>

* patch flaws revealed by concurrency_test.go

Signed-off-by: Tim Bray <[email protected]>

---------

Signed-off-by: Tim Bray <[email protected]>
  • Loading branch information
timbray authored Jul 9, 2024
1 parent b1c32f5 commit 20480ea
Show file tree
Hide file tree
Showing 12 changed files with 455 additions and 287 deletions.
13 changes: 13 additions & 0 deletions PATTERNS.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,19 @@ Thus, the following Pattern would match both JSON events above:
{"alpha": {"beta": [1]}}
```

### Numeric Values

It would be convenient if Quamina knew, for matching purposes, that 35,
35.00, and 3.5e1 were all the same number.

In many cases, Quamina can manage this. Specifically, for numbers that:

* are between -5.0e9 and 5.0e9 inclusive.
* have five or fewer fractional digits.

Numbers which do not meet these criteria will be treated as strings, which
usually produces good results.

## Extended Patterns
An **Extended Pattern** **MUST** be a JSON object containing
a single field whose name is called the **Pattern Type**.
Expand Down
9 changes: 2 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,8 @@ The `"exists":true` and `"exists":false` patterns
have corner cases; details are covered in
[Patterns in Quamina](PATTERNS.md).

Number matching is weak - the number has to appear
exactly the same in the Pattern and the Event. I.e.,
Quamina doesn't know that 35, 35.000, and 3.5e1 are the
same number. There's a fix for this in the code which
is not yet activated because it causes a
significant performance penalty, so the API needs to
be enhanced to only ask for it when you need it.
Quamina can match numeric values correctly, subject to
certain limits; details are in [Patterns in Quamina](PATTERNS.md).

## Flattening and Matching

Expand Down
128 changes: 64 additions & 64 deletions cl2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,127 +16,93 @@ var (
cl2LineCount int
)

/* This test adopted, with thanks, from aws/event-ruler */

func getCL2Lines(t *testing.T) [][]byte {
t.Helper()

cl2Lock.Lock()
defer cl2Lock.Unlock()
if cl2Lines != nil {
return cl2Lines
}
file, err := os.Open("testdata/citylots2.json.gz")
if err != nil {
t.Fatalf("Can't open citylots2.json.gz: %v", err.Error())
}
defer func(file *os.File) {
_ = file.Close()
}(file)
zr, err := gzip.NewReader(file)
if err != nil {
t.Fatalf("Can't open zip reader: %v", err.Error())
}

scanner := bufio.NewScanner(zr)
buf := make([]byte, oneMeg)
scanner.Buffer(buf, oneMeg)
for scanner.Scan() {
cl2LineCount++
cl2Lines = append(cl2Lines, []byte(scanner.Text()))
}
return cl2Lines
}

func TestRulerCl2(t *testing.T) {
exactRules := []string{
// test rules pulled out so that it's easy to write test funcs focusing in on one set of htem.
var (
anythingButRules = []string{
"{\n" +
" \"properties\": {\n" +
" \"MAPBLKLOT\": [ \"1430022\" ]\n" +
" }" +
" \"STREET\": [ { \"anything-but\": [ \"FULTON\" ] } ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"MAPBLKLOT\": [ \"2607117\" ]\n" +
" \"STREET\": [ { \"anything-but\": [ \"MASON\" ] } ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"MAPBLKLOT\": [ \"2607218\" ]\n" +
" \"ST_TYPE\": [ { \"anything-but\": [ \"ST\" ] } ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"MAPBLKLOT\": [ \"3745012\" ]\n" +
" \"geometry\": {\n" +
" \"type\": [ {\"anything-but\": [ \"Polygon\" ] } ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"MAPBLKLOT\": [ \"VACSTWIL\" ]\n" +
" \"FROM_ST\": [ { \"anything-but\": [ \"441\" ] } ]\n" +
" }\n" +
"}",
}
exactMatches := []int{1, 101, 35, 655, 1}

prefixRules := []string{
anythingButMatches = []int{211158, 210411, 96682, 120, 210615}
exactRules = []string{
"{\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"prefix\": \"AC\" } ]\n" +
" }\n" +
" \"MAPBLKLOT\": [ \"1430022\" ]\n" +
" }" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"prefix\": \"BL\" } ]\n" +
" \"MAPBLKLOT\": [ \"2607117\" ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"prefix\": \"DR\" } ]\n" +
" \"MAPBLKLOT\": [ \"2607218\" ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"prefix\": \"FU\" } ]\n" +
" \"MAPBLKLOT\": [ \"3745012\" ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"prefix\": \"RH\" } ]\n" +
" \"MAPBLKLOT\": [ \"VACSTWIL\" ]\n" +
" }\n" +
"}",
}
prefixMatches := []int{24, 442, 38, 2387, 328}

anythingButRules := []string{
exactMatches = []int{1, 101, 35, 655, 1}
prefixRules = []string{
"{\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"anything-but\": [ \"FULTON\" ] } ]\n" +
" \"STREET\": [ { \"prefix\": \"AC\" } ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"anything-but\": [ \"MASON\" ] } ]\n" +
" \"STREET\": [ { \"prefix\": \"BL\" } ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"ST_TYPE\": [ { \"anything-but\": [ \"ST\" ] } ]\n" +
" \"STREET\": [ { \"prefix\": \"DR\" } ]\n" +
" }\n" +
"}",
"{\n" +
" \"geometry\": {\n" +
" \"type\": [ {\"anything-but\": [ \"Polygon\" ] } ]\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"prefix\": \"FU\" } ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"FROM_ST\": [ { \"anything-but\": [ \"441\" ] } ]\n" +
" \"STREET\": [ { \"prefix\": \"RH\" } ]\n" +
" }\n" +
"}",
}
anythingButMatches := []int{211158, 210411, 96682, 120, 210615}

shellstyleRules := []string{
prefixMatches = []int{24, 442, 38, 2387, 328}
shellstyleRules = []string{
"{\n" +
" \"properties\": {\n" +
" \"MAPBLKLOT\": [ { \"shellstyle\": \"143*\" } ]\n" +
Expand All @@ -163,8 +129,7 @@ func TestRulerCl2(t *testing.T) {
" }\n" +
"}",
}
shellstyleMatches := []int{490, 713, 43, 2540, 1}

shellstyleMatches = []int{490, 713, 43, 2540, 1}
/* will add when we have numeric
complexArraysRules := []string{
"{\n" +
Expand Down Expand Up @@ -211,6 +176,41 @@ func TestRulerCl2(t *testing.T) {
complexArraysMatches := []int{227, 2, 149444, 64368, 127485}
*/

)

/* This test adopted, with thanks, from aws/event-ruler */

func getCL2Lines(t *testing.T) [][]byte {
t.Helper()

cl2Lock.Lock()
defer cl2Lock.Unlock()
if cl2Lines != nil {
return cl2Lines
}
file, err := os.Open("testdata/citylots2.json.gz")
if err != nil {
t.Fatalf("Can't open citylots2.json.gz: %v", err.Error())
}
defer func(file *os.File) {
_ = file.Close()
}(file)
zr, err := gzip.NewReader(file)
if err != nil {
t.Fatalf("Can't open zip reader: %v", err.Error())
}

scanner := bufio.NewScanner(zr)
buf := make([]byte, oneMeg)
scanner.Buffer(buf, oneMeg)
for scanner.Scan() {
cl2LineCount++
cl2Lines = append(cl2Lines, []byte(scanner.Text()))
}
return cl2Lines
}

func TestRulerCl2(t *testing.T) {
lines := getCL2Lines(t)
fmt.Printf("lines: %d\n", len(lines))

Expand Down
18 changes: 1 addition & 17 deletions field_matcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,21 +132,6 @@ func (m *fieldMatcher) addTransition(field *patternField, printer printer) []*fi
var nextFieldMatchers []*fieldMatcher
for _, val := range field.vals {
nextFieldMatchers = append(nextFieldMatchers, vm.addTransition(val, printer))

// if the val is a number, let's add a transition on the canonicalized number
// TODO: Only do this if asked
/*
if val.vType == numberType {
c, err := canonicalize([]byte(val.val))
if err == nil {
number := typedVal{
vType: literalType,
val: c,
}
nextFieldMatchers = append(nextFieldMatchers, vm.addTransition(number))
}
}
*/
}
m.update(freshStart)
return nextFieldMatchers
Expand All @@ -162,6 +147,5 @@ func (m *fieldMatcher) transitionOn(field *Field, bufs *bufpair) []*fieldMatcher
if !ok {
return nil
}

return valMatcher.transitionOn(field.Val, bufs)
return valMatcher.transitionOn(field, bufs)
}
Loading

0 comments on commit 20480ea

Please sign in to comment.