-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathProbability.ecl
277 lines (264 loc) · 11.9 KB
/
Probability.ecl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
IMPORT $ AS HPCC_Causality;
IMPORT HPCC_Causality.Types;
IMPORT ML_Core.Types AS cTypes;
IMPORT Python3 AS Python;
IMPORT HPCC_Causality.internal.ProbSpace;
ProbQuery := Types.ProbQuery;
ProbSpec := Types.ProbSpec;
Distr := Types.Distribution;
DatasetSummary := Types.DatasetSummary;
NumericField := cTypes.NumericField;
AnyField := Types.AnyField;
nlQuery := Types.nlQuery;
/**
* Probability Module
*
* Contains a set of probability functions to execute against a multivariate dataset.
* The dataset consists of a set of variable names, and a set of observations for
* each variable.
* The observations are in AnyField format, with the field number corresponding to
* the order of the variable names.
*
* Probability functions include:
* - "Natural Language" probability queries. These queries are much easier to specify
* than the "structured" queries below. See README.md for query syntax.
* - Query(...) -- Queries for probabilities or expectations, returning scalar values.
* - QueryDistr(...) -- Queries returning a univariate distribution.
* - "Structured" probability queries. These use nested structures to represent the
* query. They are more difficult to use, but may lend themselves to programmatically
* built queries.
* - P(...) -- Unconditional, Conditional and Joint Numerical Probabilities.
* - E(...) -- Unconditional and Conditional Expectations
* - Distr(...) -- Unconditional and Conditional Distributions
* - Dependence(...) -- Test of Dependence and Conditional Dependence Between Variables
* - isIndependent(...) -- Boolean Independence and Conditional Independence Test.
* - Predict(...) -- Machine Learning style regression without training required.
* - Classify(...) -- Machine Learning style classification without training required.
*
* @param ds -- Set of multivariate observations in AnyField format. Each observation
* shares an id (1 - numObservations), and field numbers correspond to the
* order of variable names in the varNames parameter. This dataset can
* be produced from a record-oriented dataset using the ToAnyField() macro.
* See README.md for details.
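* @param varNames -- An ordered list of variable name strings. This set may be extracted
* from the ToAnyField() macro. See README.md.
* @param categoricals -- Optional set of strings (default empty) that is passed through to
* ProbSpace.Init; presumably the names of the variables to be treated as
* categorical -- confirm against ProbSpace.Init.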
*/
EXPORT Probability(DATASET(AnyField) ds, SET OF STRING varNames, SET OF STRING categoricals=[]) := MODULE
// This is a module-level initialized ProbSpace, built from the module's
// parameters via ProbSpace.Init. It is the default psid of every function
// in this module.
// Initialization happens when the first probability function is called.
// At that point, the dataset is sent to each node.
EXPORT PS := ProbSpace.Init(ds, varNames, categoricals);
/**
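* Summary -- Return the DatasetSummary reported by ProbSpace.getSummary for the
* given probability space (the main space, PS, by default).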
*/
EXPORT DatasetSummary Summary(UNSIGNED psId=PS) := FUNCTION
  // Delegate to the internal ProbSpace module; psId selects which
  // probability space (main space or a subspace) is summarized.
  spaceSummary := ProbSpace.getSummary(psId);
  RETURN spaceSummary;
END;
/**
* Produce a Subspace from the indicated probability space by applying
* a filter. The filter is specified in the same way as the conditional
* portion of a Natural Language probability query.
* The returned ProbabilitySpace Id can be used in subsequent calls to
* any of the probability functions.
* SubSpaces essentially create a multivariate conditional distribution
* of the original Probability Space by filtering the set of records.
* This can be useful for cleaning a probability space (i.e. by eliminating
* records where certain attributes have certain values), or for generating
* a conditional space to issue multiple queries against.
*
* Example:
* // Generate a subspace of married men in very good or excellent health.
* new_psid := prob.SubSpace('gender=male, married=yes, genhealth in [4-verygood, 5-excellent]', prob.PS);
*
* @param filter A bound query condition.
* @param psid The probability space id of the space to be subspaced.
* @return A new probability space id for use in future queries.
*
*/
EXPORT UNSIGNED SubSpace(STRING filter, UNSIGNED psId=PS) := FUNCTION
  // Wrap the filter text in a single nlQuery record with id 1.
  filterRec := DATASET([{1, filter}], nlQuery);
  // DISTRIBUTE(..., ALL) is applied before the call; presumably so that the
  // filter record is available on every node -- TODO confirm.
  spaceIds := ProbSpace.SubSpace(DISTRIBUTE(filterRec, ALL), psId);
  // The new probability space id is the largest id among the returned records.
  RETURN MAX(spaceIds, id);
END;
// Natural Language Queries
/**
* Natural Language Probability or Expectation query.
*
* @param queries A set of "natural" mathematical query strings. See README.md
* for details on query syntax.
* @param psid An UNSIGNED identifier from the main probability space (probspace.PS)
* or a SubSpace.
* @return A list of numeric or string results in Types.AnyField format.
*
*/
EXPORT DATASET(AnyField) Query(SET OF STRING queries, UNSIGNED psid=PS) := FUNCTION
  // One-row seed dataset; NORMALIZE expands it into one record per query string.
  seedRow := DATASET([{1}], {UNSIGNED d});
  // Build the nlQuery record for the idx-th query string (ids start at 1).
  nlQuery makeQueryRec(UNSIGNED idx) := TRANSFORM
    SELF.id := idx;
    SELF.query := queries[idx];
  END;
  nlRecs := NORMALIZE(seedRow, COUNT(queries), makeQueryRec(COUNTER));
  // Spread the query records across nodes by id, then evaluate them
  // against the selected probability space.
  answers := ProbSpace.Query(DISTRIBUTE(nlRecs, id), psid);
  // Return the results ordered by query id.
  RETURN SORT(answers, id);
END;
/**
* Natural Language Probability Distribution query
*
* @param queries A set of "natural" mathematical query strings. See README.md
* for details on query syntax.
* @param psid An UNSIGNED identifier from the main probability space (probspace.PS)
* or a SubSpace.
* @return A dataset of Types.Distribution records, summarizing each requested
* distribution.
*
*/
EXPORT DATASET(Distr) QueryDistr(SET OF STRING queries, UNSIGNED psid=PS) := FUNCTION
  // One-row seed dataset; NORMALIZE expands it into one record per query string.
  seedRow := DATASET([{1}], {UNSIGNED d});
  // Build the nlQuery record for the idx-th query string (ids start at 1).
  nlQuery makeQueryRec(UNSIGNED idx) := TRANSFORM
    SELF.id := idx;
    SELF.query := queries[idx];
  END;
  nlRecs := NORMALIZE(seedRow, COUNT(queries), makeQueryRec(COUNTER));
  // Spread the query records across nodes by id, then request the
  // distributions from the selected probability space.
  distrRecs := ProbSpace.QueryDistr(DISTRIBUTE(nlRecs, id), psid);
  // Return the distributions ordered by query id.
  RETURN SORT(distrRecs, id);
END;
// End Natural Language Queries
// Structured Queries
/**
* Calculate a series of numerical probabilities.
*
* Queries are of the form:
* - Exact Query -- P(Var = Val | List of Conditions)
* - Range Query -- P(Val1 <= Var <= Val2 | List of Conditions)
* - Joint Probability -- P([Exact or Range Query 1, ...] | List of Conditions)
*
* @param queries A list of queries. One or more targets may be specified for each
* query, and the targets must be bound (i.e. with 1 or 2 arguments).
*
* @return A set of NumericField records, with value being the probability
* of the query as field-number 1.
*/
EXPORT DATASET(NumericField) P(DATASET(ProbQuery) queries, UNSIGNED psid=PS) := FUNCTION
  // Spread the queries across nodes by id and evaluate the probabilities
  // against the selected probability space.
  probVals := ProbSpace.P(DISTRIBUTE(queries, id), psid);
  // Return the results ordered by query id.
  RETURN SORT(probVals, id);
END;
/**
* Calculate a series of numerical expected values.
*
* Expectations are of the form:
* - E(Var | List of Conditions)
*
* @param queries A list of queries. Exactly 1 target per query must be specified,
* and the target must be unbound (i.e. with zero arguments).
*
* @return A set of NumericField records, with value being the Expected Value of each
* query.
*/
EXPORT DATASET(NumericField) E(DATASET(ProbQuery) queries, UNSIGNED psid=PS) := FUNCTION
  // Spread the queries across nodes by id and evaluate the expectations
  // against the selected probability space.
  expVals := ProbSpace.E(DISTRIBUTE(queries, id), psid);
  // Return the results ordered by query id.
  RETURN SORT(expVals, id);
END;
/**
* Calculate a series of Distributions.
*
* Distributions are of the form:
* - Distr(Var | List of Conditions)
*
* @param queries A list of queries. Exactly 1 target per query must be specified,
* and the target must be unbound (i.e. with zero arguments).
*
* @return A set of Types.Distribution records, describing each of the queried distributions.
*/
EXPORT DATASET(Distr) Distr(DATASET(ProbQuery) queries, UNSIGNED psid=PS) := FUNCTION
  // Spread the queries across nodes by id and compute the requested
  // distributions against the selected probability space.
  distrRecs := ProbSpace.Distr(DISTRIBUTE(queries, id), psid);
  // Return the distributions ordered by query id.
  RETURN SORT(distrRecs, id);
END;
// End Structured Queries
/**
* Perform a series of dependency tests.
*
* Form:
* - Dependency(target1, target2 | List of conditions)
*
* @param queries A list of queries. Exactly 2 targets per query must be specified.
*
* @return a list of p-values with .5 confidence, in NumericField
* format.
* Values less than .5 indicate probable independence.
* Values greater than .5 indicate probable dependence
*/
EXPORT DATASET(NumericField) Dependence(DATASET(ProbQuery) queries, UNSIGNED psid=PS) := FUNCTION
  // Spread the queries across nodes by id and run the dependence tests
  // against the selected probability space.
  depScores := ProbSpace.Dependence(DISTRIBUTE(queries, id), psid);
  // Return the test results ordered by query id.
  RETURN SORT(depScores, id);
END;
/**
* Perform a series of dependency tests and evaluate the results
* as a Boolean.
*
* Form:
* - isIndependent(target1, target2 | List of conditions)
*
* @param queries A list of queries. Exactly 2 targets per query must be specified.
*
* @return A list of results as NumericField. Result of 1 indicates that the two
* targets are most likely independent. 0 indicates probable dependence.
*
*/
EXPORT DATASET(NumericField) isIndependent(DATASET(ProbQuery) queries, UNSIGNED psid=PS) := FUNCTION
  // Run the same dependence test that Dependence() uses.
  depScores := ProbSpace.Dependence(DISTRIBUTE(queries, id), psid);
  depRec := RECORDOF(depScores);
  // Collapse each score to a flag: 0 when the score exceeds .5, 1 otherwise.
  // All other fields are carried over unchanged.
  depRec toFlag(depRec score) := TRANSFORM
    SELF.value := IF(score.value > .5, 0, 1);
    SELF := score;
  END;
  // LOCAL: each node converts only the records it already holds.
  flags := PROJECT(depScores, toFlag(LEFT), LOCAL);
  // Return the flags ordered by query id.
  RETURN SORT(flags, id);
END;
/**
* Perform a set of regression style predictions on a continuous variable.
*
* Form:
* - E(target | conditions)
*
* @param target The dependent variable (i.e. prediction target). The target should be a continuous
* variable.
* @param varNames The names of the independent variables to be used for prediction
* @param varDat The values of the conditional variables in NumericField format. The field numbers
* correspond to the order of the varNames list.
* @return A DATASET(NumericField) with the prediction values in field number 1.
*
*/
EXPORT DATASET(NumericField) Predict(STRING target, SET OF STRING varNames, DATASET(NumericField) varDat, UNSIGNED psid=PS) := FUNCTION
  // Distribute the observations by id, then order each node's portion by
  // id and field number (LOCAL sort, no cross-node movement).
  obsPerNode := SORT(DISTRIBUTE(varDat, id), id, number, LOCAL);
  // Note the argument order expected by ProbSpace.Predict: data, names, target, psid.
  predicted := ProbSpace.Predict(obsPerNode, varNames, target, psid);
  // Return the predictions ordered by observation id.
  RETURN SORT(predicted, id);
END;
/**
* Perform a set of classification predictions on a discrete target variable.
*
* Form:
* - E(target | conditions)
*
* @param target The dependent variable (i.e. prediction target). The target should be a discrete
* variable.
* @param varNames The names of the independent variables to be used for prediction
* @param varDat The values of the conditional variables in NumericField format. The field numbers
* correspond to the order of the varNames list.
* @return A DATASET(NumericField) with the prediction values in field number 1.
*
*/
EXPORT DATASET(NumericField) Classify(STRING target, SET OF STRING varNames, DATASET(NumericField) varDat, UNSIGNED psid=PS) := FUNCTION
  // Distribute the observations by id, then order each node's portion by
  // id and field number (LOCAL sort, no cross-node movement).
  obsPerNode := SORT(DISTRIBUTE(varDat, id), id, number, LOCAL);
  // Note the argument order expected by ProbSpace.Classify: data, names, target, psid.
  classified := ProbSpace.Classify(obsPerNode, varNames, target, psid);
  // Return the class predictions ordered by observation id.
  RETURN SORT(classified, id);
END;
END; // Probability Module