-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy patharegex.c
312 lines (275 loc) · 9.79 KB
/
aregex.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
/*
* aregex.c - Gawk extension to access the TRE approximate regex.
* Copyright (C) 2018-9 Cam Webb, <[email protected]>
* Distributed under the GNU Public Licence v3
*/
// Minimal headers:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <gawkextlib.h>
#include <tre/tre.h>
// Compile-time configuration for the extension.
// MAXNSUBMATCH: capacity of the regmatch_t array handed to TRE by do_amatch().
#define MAXNSUBMATCH 20 // Max Number of parenthetical substring matches
// DEFMAXCOST: initial value of max_cost, max_del, max_ins, max_subst and
// max_err when the caller supplies no 3rd argument (or omits a key).
#define DEFMAXCOST 5 // Default max_cost for match
// DEBUG: when non-zero, do_amatch() emits a warning for every cost value it
// reads from the caller's costs array.
#define DEBUG 0 // Print debug info
// PACKAGE_STRING: name/version prefix used when building ext_version.
#define PACKAGE_STRING "aregex-1.1.0"
// Gawkextlib boilerplate:
// `api` and `ext_id` are used implicitly by the gawk API macros in this file
// (warning, fatal, get_argument, ...). They are not assigned anywhere in the
// visible code; presumably the dl_load_func() expansion at the end of the
// file sets them -- confirm against gawkapi.h.
static const gawk_api_t *api;
static awk_ext_id_t ext_id;
// NOTE(review): by name, this is the symbol gawk looks for to accept the
// plugin as GPL-compatible; it is never read in this file.
int plugin_is_GPL_compatible;
/* regex hash table */
/* Cache of compiled regex_t objects keyed by the pattern text. Created in
 * aregex_init_func(), filled by tre_regex_lookup(), and torn down in
 * aregex_awk_atexit(). */
static strhash *ht_regex;
#ifdef AREGEX_MEM_DEBUG
/* Bytes of regex_t storage currently allocated by this file; only maintained
 * when AREGEX_MEM_DEBUG is defined and reported at exit if non-zero. */
static int ht_regex_n_alloced = 0;
#endif
/*
 * he_data_destroy - per-entry destructor handed to strhash_destroy().
 *
 * Releases the compiled regex stored in the entry's `data` slot: the TRE
 * internals via tre_regfree(), then the regex_t itself via gawk_free().
 * The slot is reset to NULL afterwards. Entries that are NULL or carry no
 * data are ignored. `data`, `opaque` and `ht` are not used.
 */
static void
he_data_destroy (void *data, void *opaque, strhash *ht, strhash_entry *he)
{
    regex_t *compiled;

    if (he == NULL || he->data == NULL)
        return;

    compiled = he->data;
    tre_regfree (compiled);
    gawk_free (compiled);
    he->data = NULL;
#ifdef AREGEX_MEM_DEBUG
    ht_regex_n_alloced -= sizeof(regex_t);
#endif
}
/*
 * tre_regex_lookup - return the compiled form of a pattern, compiling and
 * caching it on first use.
 *
 * pattern      pattern text (pattern_len bytes are used)
 * pattern_len  length of the pattern in bytes
 *
 * Returns the cached regex_t on success. Returns NULL when the regex_t cannot
 * be allocated, when TRE rejects the pattern, or when the cache entry cannot
 * be created; in each of those cases ERRNO is set via update_ERRNO_string().
 * Failed patterns are not cached, so they are recompiled (and fail again)
 * on every call.
 *
 * The returned object is owned by the cache (ht_regex); callers must not
 * free it.
 */
static regex_t *
tre_regex_lookup (const char* pattern, size_t pattern_len)
{
    strhash_entry *he;
    regex_t *rx;
    int rc;

    /* Fast path: the pattern has been compiled before. */
    he = strhash_get (ht_regex, pattern, pattern_len, 0);
    if (he)
        return he->data;

    rx = gawk_calloc (1, sizeof *rx);
    if (rx == NULL) {
        update_ERRNO_string("aregex: out of memory while compiling regex");
        return NULL;
    }
#ifdef AREGEX_MEM_DEBUG
    ht_regex_n_alloced += sizeof *rx;
#endif

    rc = tre_regncomp (rx, pattern, pattern_len, REG_EXTENDED);
    if (rc != REG_OK) {
        /* regexp compilation failed */
        char tre_err_buf[128], err_buf[256];
        /* The pattern is length-delimited, so print it with an explicit
         * precision; anything beyond the buffer size is cut off by snprintf
         * anyway, which also keeps the int conversion safe. */
        int shown = pattern_len < sizeof err_buf ? (int) pattern_len
                                                 : (int) sizeof err_buf;

        tre_regerror(rc, rx, tre_err_buf, sizeof tre_err_buf);
        snprintf(err_buf, sizeof err_buf, "aregex: tre: %s in /%.*s/",
                 tre_err_buf, shown, pattern);
        update_ERRNO_string(err_buf);
        gawk_free (rx);
#ifdef AREGEX_MEM_DEBUG
        ht_regex_n_alloced -= sizeof *rx;
#endif
        return NULL;
    }

    /* Insert only successfully compiled patterns into the cache. */
    he = strhash_get (ht_regex, pattern, pattern_len, 1);
    if (he == NULL) {
        /* Not expected when creation is requested, but do not dereference
         * NULL or leak the compiled regex if it ever happens. */
        tre_regfree (rx);
        gawk_free (rx);
#ifdef AREGEX_MEM_DEBUG
        ht_regex_n_alloced -= sizeof *rx;
#endif
        update_ERRNO_string("aregex: could not cache compiled regex");
        return NULL;
    }
    he->data = rx;
    return rx;
}
// Main amatch() function definition
static awk_value_t * do_amatch(int nargs, awk_value_t *result \
, struct awk_ext_func *unused)
{
int i;
// 1. Set default costs
const char *parami[8];
int paramv[8];
parami[0] = "cost_ins"; paramv[0] = 1;
parami[1] = "cost_del"; paramv[1] = 1;
parami[2] = "cost_subst"; paramv[2] = 1;
parami[3] = "max_cost"; paramv[3] = DEFMAXCOST;
parami[4] = "max_del"; paramv[4] = DEFMAXCOST;
parami[5] = "max_ins"; paramv[5] = DEFMAXCOST;
parami[6] = "max_subst"; paramv[6] = DEFMAXCOST;
parami[7] = "max_err"; paramv[7] = DEFMAXCOST;
// 2. Read 3rd, 'costs' argument, if present
// (these variable declarations outside, because needed during output: )
awk_value_t costs;
awk_value_t simplecost;
awk_value_t costindex;
awk_value_t costval;
awk_bool_t hascostarr = 0;
if (nargs > 2) {
// if just a simple integer for 3rd argument:
if (get_argument(2, AWK_NUMBER, &simplecost)) {
paramv[3] = (int) simplecost.num_value;
paramv[4] = (int) simplecost.num_value;
paramv[5] = (int) simplecost.num_value;
paramv[6] = (int) simplecost.num_value;
paramv[7] = (int) simplecost.num_value;
}
else if (get_argument(2, AWK_ARRAY, &costs)) {
hascostarr = 1;
for (i = 0; i < 8; i++) {
// create an index for reading array
make_const_string(parami[i], strlen(parami[i]), &costindex);
// if there is an array element with that index
if (get_array_element(costs.array_cookie, &costindex, \
AWK_STRING, &costval)) {
// update the cost value
paramv[i] = atoi(costval.str_value.str);
if (DEBUG) {
warning(ext_id, "cost %s = %d", parami[i], atoi(costval.str_value.str));
}
}
}
}
else fatal(ext_id, "amatch: 3rd argument present, but could not be read.");
}
// 3. Read the string and regex arguments (1st and 2nd)
awk_value_t re;
awk_value_t str;
if (!get_argument(0, AWK_STRING, &str))
fatal(ext_id, "amatch: 1st param., the string, not found");
if (!get_argument(1, AWK_STRING, &re))
fatal(ext_id, "amatch: 2nd param., the regex, not found");
// ( for wchar_t:
// wchar_t rew[] = L"";
// swprintf(rew, strlen(re.str_value.str), L"%ls", re.str_value.str); )
// 4. Compile regex
regex_t *preg;
preg = tre_regex_lookup(re.str_value.str, re.str_value.len);
if(!preg) return make_number(-1, result);
// ( for wchar_t:
// tre_regwcomp(&preg, rew, REG_EXTENDED); )
// 5. Do the match
// set approx match params
regaparams_t params = { 0 };
params.cost_ins = paramv[0];
params.cost_del = paramv[1];
params.cost_subst = paramv[2];
params.max_cost = paramv[3];
params.max_del = paramv[4];
params.max_ins = paramv[5];
params.max_subst = paramv[6];
params.max_err = paramv[7];
// create necessary structure for details of match
regamatch_t match ;
regmatch_t pmatch[MAXNSUBMATCH];
match.nmatch = MAXNSUBMATCH;
match.pmatch = &pmatch[0];
// do the approx regexp itself!
int treret;
treret = tre_regaexec(preg, str.str_value.str, &match, params, 0);
// ( for wchar_t:
// treret = tre_regawexec(&pregw, rew, &match, params, 0); )
// set the amatch() return value depending on tre_regaexec() return
// 1 if success, 0 if no match
int rval = 1;
if (treret == REG_NOMATCH) rval = 0;
// Catch a "mem. not. allocated" return from tre_regaexec()
if (treret == REG_ESPACE) {
warning(ext_id, \
"amatch: TRE err., mem. insufficient to complete the match.");
return make_null_string(result);
}
// 6. If there is a cost array, set some return values (if a match)
if ((hascostarr) && (rval)) {
int n;
char matchcost[20]; // Single integers, max width ~= 10
#define COST_LEN 4
#define NUM_INS_LEN 7
#define NUM_DEL_LEN 7
#define NUM_SUBST_LEN 9
// cost
del_array_element(costs.array_cookie, \
make_const_string("cost", COST_LEN, &costindex));
n = sprintf(matchcost, "%d", match.cost);
set_array_element(costs.array_cookie, \
make_const_string("cost", COST_LEN, &costindex), \
make_const_string(matchcost, n, &costval));
// num_ins
del_array_element(costs.array_cookie, \
make_const_string("num_ins", NUM_INS_LEN, &costindex));
n = sprintf(matchcost, "%d", match.num_ins);
set_array_element(costs.array_cookie, \
make_const_string("num_ins", NUM_INS_LEN, &costindex), \
make_const_string(matchcost, n, &costval));
// num_del
del_array_element(costs.array_cookie, \
make_const_string("num_del", NUM_DEL_LEN, &costindex));
n = sprintf(matchcost, "%d", match.num_del);
set_array_element(costs.array_cookie, \
make_const_string("num_del", NUM_DEL_LEN, &costindex), \
make_const_string(matchcost, n, &costval));
// num_subst
del_array_element(costs.array_cookie, \
make_const_string("num_subst", NUM_SUBST_LEN, &costindex));
n = sprintf(matchcost, "%d", match.num_subst);
set_array_element(costs.array_cookie, \
make_const_string("num_subst", NUM_SUBST_LEN, &costindex), \
make_const_string(matchcost, n, &costval));
}
// 7. Set 4th argument array, for matched substrings, if present
// and if a match found
if ((nargs == 4) && (rval)) {
int n,m;
awk_value_t substr;
// read 4th argument
if (!get_argument(3, AWK_ARRAY, &substr)) {
warning(ext_id, "amatch: Could not read 4th argument.");
}
else clear_array(substr.array_cookie);
// hand the TRE substrings over to the gawk substring array
char outindexc[20];
int maxsubsize = 0;
// first, find max size of substring
for (i = 0; i < MAXNSUBMATCH ; i++)
if (match.pmatch[i].rm_so != -1)
if (maxsubsize < match.pmatch[i].rm_eo - match.pmatch[i].rm_so)
maxsubsize = match.pmatch[i].rm_eo - match.pmatch[i].rm_so;
// dimension the substring:
char outvalc[maxsubsize+1];
awk_value_t outindexp;
awk_value_t outvalp;
for (i = 0 ; i < (int) match.nmatch; i++) {
if (match.pmatch[i].rm_so != -1) {
n = sprintf(outindexc, "%d", i);
// ( "%d %.*s", match.pmatch[i].rm_so+1, ... gives position
// by bytes, not by chars )
m = sprintf(outvalc, "%.*s", \
match.pmatch[i].rm_eo - match.pmatch[i].rm_so, \
str.str_value.str + match.pmatch[i].rm_so);
set_array_element(substr.array_cookie,
make_const_string(outindexc, n, &outindexp), \
make_const_string(outvalc, m, &outvalp));
}
}
}
return make_number(rval, result);
}
// Gawkextlib boilerplate:
// Table of awk-callable functions provided by this extension; it is passed
// to dl_load_func() at the end of the file. The single entry exposes
// do_amatch() under the awk name "amatch".
// NOTE(review): the numeric fields presumably follow the awk_ext_func_t
// layout (max. expected args = 4, min. required args = 2, suppress-lint
// flag, private data pointer) -- confirm against gawkapi.h.
static awk_ext_func_t func_table[] = \
{
{ "amatch", do_amatch, 4, 2, awk_false, NULL },
};
/*
 * aregex_awk_atexit - exit hook registered through awk_atexit().
 *
 * Destroys the regex cache, using he_data_destroy() as the per-entry
 * destructor. When built with AREGEX_MEM_DEBUG, a warning is printed if the
 * allocation counter has not returned to zero. Both parameters are unused.
 */
static void
aregex_awk_atexit (void* data, int exit_status)
{
    strhash_destroy (ht_regex, he_data_destroy, NULL);
#ifdef AREGEX_MEM_DEBUG
    if (ht_regex_n_alloced != 0) {
        warning(ext_id,"aregex: memory leakage: %d bytes", ht_regex_n_alloced);
    }
#endif
}
/* initialize extension */
/* Version banner of the extension; filled in by set_ext_version(). */
static char ext_version[512];

/*
 * set_ext_version - format "<PACKAGE_STRING> (<TRE version>)" into
 * ext_version, truncating if it does not fit.
 */
static void set_ext_version(void)
{
    const char *tre_ver = tre_version();

    snprintf(ext_version, sizeof(ext_version), "%s (%s)", PACKAGE_STRING, tre_ver);
}
static awk_bool_t
aregex_init_func (void)
{
ht_regex = strhash_create (0);
awk_atexit (aregex_awk_atexit, NULL);
set_ext_version();
return awk_true;
}
/* Initialisation hook: points at aregex_init_func().
 * NOTE(review): presumably picked up by the dl_load_func() expansion below,
 * which is not visible here -- confirm against gawkapi.h. */
static awk_bool_t (*init_func)(void) = aregex_init_func;
/* Gawk API macro invoked with the function table, the extension name
 * "amatch" and an empty third argument (presumably the awk namespace). */
dl_load_func(func_table, amatch, "")