-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
33 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,11 @@ | ||
/* | ||
* aregex.c - Gawk extension to access the TRE approximate regex. | ||
* Copyright (C) 2018 Cam Webb | ||
* Distributed under the Gnu Pulbic Licence v3 | ||
* Copyright (C) 2018 Cam Webb, <[email protected]> | ||
* Distributed under the GNU General Public License v3 | ||
*/ | ||
|
||
#include "common.h" | ||
#include <tre/tre.h> | ||
#include <tre/tre.h> // From the TRE regex lib. https://laurikari.net/tre/ | ||
|
||
static awk_value_t * do_aregex(int nargs, awk_value_t *result API_FINFO_ARG) | ||
{ | ||
|
@@ -19,18 +19,19 @@ static awk_value_t * do_aregex(int nargs, awk_value_t *result API_FINFO_ARG) | |
lintwarn(ext_id, _("aregex() called with >3 arguments")); | ||
} | ||
|
||
// set the cost | ||
// Set the default max cost | ||
int defcost = 2; | ||
if (get_argument(2, AWK_NUMBER, &incost)) defcost = incost.num_value ; | ||
|
||
// If the string arguments are read | ||
// If the string arguments are read... | ||
if ((get_argument(0, AWK_STRING, &re)) && \ | ||
(get_argument(1, AWK_STRING, &str))) { | ||
|
||
// Compile RE | ||
regex_t preg; | ||
tre_regcomp(&preg, re.str_value.str, REG_EXTENDED); | ||
|
||
|
||
// Set approx grep params | ||
regaparams_t params = { 0 }; | ||
params.cost_ins = 1; | ||
params.cost_del = 1; | ||
|
@@ -42,29 +43,34 @@ static awk_value_t * do_aregex(int nargs, awk_value_t *result API_FINFO_ARG) | |
params.max_err = defcost; | ||
|
||
regamatch_t match; | ||
match.nmatch = 0; | ||
match.pmatch = 0; | ||
match.nmatch = 0; // No partial match arrays needed | ||
match.pmatch = 0; // - ditto - | ||
|
||
int treret = 0; | ||
int rval = 0; | ||
|
||
|
||
// Do the approx regexp | ||
treret = tre_regaexec(&preg, str.str_value.str, &match, params, 0); | ||
|
||
if (treret == REG_NOMATCH) rval = -1; | ||
else if (treret == REG_ESPACE) { | ||
rval = -999; | ||
warning(ext_id, _("aregex: TRE error: temporary memory could not be allocated\n to complete the matching operation.")); | ||
warning(ext_id, \ | ||
_("aregex: TRE err.: mem. insufficient to complete the match.")); | ||
} | ||
else rval = match.cost; | ||
|
||
// Return: cost (Levenshtein distance) if success, -1 if no match, | ||
// -999 if memory error | ||
|
||
return make_number(rval, result); | ||
} | ||
|
||
// On string argument failure: | ||
return make_null_string(result); | ||
} | ||
|
||
// Boilerplate: | ||
// Gawkextlib boilerplate: | ||
|
||
static awk_ext_func_t func_table[] = \ | ||
{ | ||
|
@@ -81,3 +87,4 @@ static awk_bool_t (*init_func)(void) = init_my_module; | |
static const char *ext_version = PACKAGE_STRING; | ||
|
||
dl_load_func(func_table, fmatch, "") | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
.TH AREGEX 3am "Nov 24 2018" "Free Software Foundation" "GNU Awk Extension Modules" | ||
.SH NAME | ||
aregex \- access approximate (fuzzy) regex matching from TRE library | ||
\fBaregex\fR \- Approximate (fuzzy) regular expression or plain string matching using the TRE library | ||
.SH SYNOPSIS | ||
.ft CW | ||
@load "aregex" | ||
|
@@ -14,45 +14,28 @@ extension adds a function named | |
.BR aregex() | ||
as follows: | ||
.TP | ||
.B aregex() | ||
This function takes an integer argument and returns the result from calling | ||
the | ||
.IR aregex (3) | ||
C library function. If the argument is not numeric, it will return an empty string. | ||
\fBaregex(\fIr \fB, \fIs \fR[\fB, \fIc\fR]\fB)\fR | ||
.TP | ||
This function takes two mandatory string arguments, and an optional third integer argument. \fIr \fRis an extended regular expression or plain string to be matched against string \fIs\fR. The regex (\fIr\fR) is bounded by double-quotes, not by the usual \fIgawk\fR slashes. \fIc\fR is the optional maximum total cost of insertions, deletions and substitutions (each of cost = 1) allowed for a match. Default value of \fIc\fR is 2. | ||
.TP | ||
The function returns an integer: (i) \fI0\fR or \fI>0\fR on a successful match, with the value being the Levenshtein distance between string \fIs\fR and the closest string expanded from the regex \fIr\fR, (ii) \fI-1\fR for a failure to match (i.e., total cost of approximate match > \fIc\fR), or (iii) \fI-999\fR for an error of the TRE library. | ||
.TP | ||
Place plain text (or regex) between `^' and `$' in \fIr\fR to match two full strings or whole words (i.e., so that \fIr\fR does not just act as a substring). | ||
.\" .SH NOTES | ||
.\" .SH BUGS | ||
.SH EXAMPLE | ||
.ft CW | ||
.nf | ||
@load "aregex" | ||
\&... | ||
printf "The mapped value of 11 is %s\en", aregex(11) | ||
print "The distance between 'abcdef' and 'abdex' is " \\ | ||
aregex("abcdef", "abdex", 3) | ||
.fi | ||
.ft R | ||
.SH "SEE ALSO" | ||
.BR aregex (3) | ||
.SH AUTHOR | ||
Cam Webb | ||
.SH COPYING PERMISSIONS | ||
Copyright \(co 2018, | ||
Free Software Foundation, Inc. | ||
.PP | ||
Permission is granted to make and distribute verbatim copies of | ||
this manual page provided the copyright notice and this permission | ||
notice are preserved on all copies. | ||
.ig | ||
Permission is granted to process this file through troff and print the | ||
results, provided the printed document carries copying permission | ||
notice identical to this one except for the removal of this paragraph | ||
(this paragraph not being relevant to the printed manual page). | ||
.. | ||
.PP | ||
Permission is granted to copy and distribute modified versions of this | ||
manual page under the conditions for verbatim copying, provided that | ||
the entire resulting derived work is distributed under the terms of a | ||
permission notice identical to this one. | ||
The Gawk extension lib: https://sourceforge.net/projects/gawkextlib/ | ||
.PP | ||
Permission is granted to copy and distribute translations of this | ||
manual page into another language, under the above conditions for | ||
modified versions, except that this permission notice may be stated in | ||
a translation approved by the Foundation. | ||
TRE library: https://laurikari.net/tre/ | ||
.SH AUTHORS | ||
Cam Webb <[email protected]>, @laurikari for the TRE library, the \fIgawkextlib\fR authors | ||
.\" .SH COPYING PERMISSIONS |