wireshark/epan/wslua/lrexlib_glib.c

403 lines
13 KiB
C

/* lrexlib_glib.c - Lua binding of GLib Regex library */
/* This is similar to Lrexlib's PCRE implementation, but has been changed
* for GLib's pcre implementation, which is different.
*
* The changes made by me, Hadriel Kaplan, are in the Public Domain, or
* under the MIT license if your country does not allow Public Domain.
Copyright (c) 2014 Hadriel Kaplan
SPDX-License-Identifier: MIT
* Changes relative to Lrexlib-PCRE:
* - No chartables or locale handling
* - dfa_exec doesn't take 'ovecsize' nor 'wscount' args
* - dfa_exec returns boolean true for partial match, without subcapture info
* - named subgroups do not return a table of name-keyed entries, because
* GLib doesn't provide a way to learn that information
* - there is no 'config()' function, since GLib doesn't offer such info
* - the 'flags()' function still works, returning all flags, but two new
* functions 'compile_flags()' and 'match_flags()' return just their respective
* flags, since GLib has a different and smaller set of such flags, for
* regex compile vs. match functions
* - Using POSIX character classes against strings with non-ASCII characters
* might match high-order characters, because glib always sets PCRE_UCP
* even if G_REGEX_RAW is set. For example, '[:alpha:]' and '\w' match certain
* non-ASCII bytes.
* - obviously quite a bit else is changed to interface to GLib's regex instead
* of PCRE, but hopefully those changes aren't visible to user/caller
*/
#include <stdlib.h>
#include <string.h>
#include <locale.h>
#include <glib.h>
#include "lua.h"
#include "lauxlib.h"
#include "lrexlib.h"
/* Flag table declared here and defined elsewhere; generate_error() looks up
 * a GError code in it to obtain a printable name. */
extern flag_pair gregex_error_flags[];
/* These 2 settings may be redefined from the command-line or the makefile.
* They should be kept in sync between themselves and with the target name.
*/
#ifndef REX_LIBNAME
# ifdef LREXLIB_WIRESHARK
# define REX_LIBNAME "GRegex"
# else
# define REX_LIBNAME "rex_glib"
# endif
#endif
/* Type name derived from the library name; printed by Gregex_tostring(). */
#define REX_TYPENAME REX_LIBNAME"_regex"
/* Defaults: getcflags() returns ALG_CFLAGS_DFLT when no compile flags are
 * given, and ALG_GETEFLAGS falls back to ALG_EFLAGS_DFLT for match flags. */
#define ALG_CFLAGS_DFLT G_REGEX_RAW
#define ALG_EFLAGS_DFLT 0
/* Adapter macros mapping generic ALG_* names onto GLib's regex API.
 * NOTE(review): presumably these are consumed by lrexlib_algo.h, which is
 * included further down - confirm against that header. */
/* Forward declaration; the definition follows the algo header include. */
static int getcflags (lua_State *L, int pos);
#define ALG_GETCFLAGS(L,pos) getcflags(L, pos)
/* Match results are compared against GLib's FALSE / TRUE. */
#define ALG_NOMATCH(res) ((res) == FALSE)
#define ALG_ISMATCH(res) ((res) == TRUE)
/* Start/end offsets of capture 'n' in the last match, as reported by
 * getSubStartPos()/getSubEndPos(); a negative start marks an unset capture. */
#define ALG_SUBBEG(ud,n) getSubStartPos(ud,n)
#define ALG_SUBEND(ud,n) getSubEndPos(ud,n)
#define ALG_SUBLEN(ud,n) (ALG_SUBEND(ud,n) - ALG_SUBBEG(ud,n))
#define ALG_SUBVALID(ud,n) (ALG_SUBBEG(ud,n) >= 0)
/* Number of capture groups in the compiled pattern. */
#define ALG_NSUB(ud) ((int) g_regex_get_capture_count(ud->pr))
/* Push capture 'n' of 'text' as a Lua string. */
#define ALG_PUSHSUB(L,ud,text,n) \
lua_pushlstring (L, (text) + ALG_SUBBEG(ud,n), ALG_SUBLEN(ud,n))
/* Push capture 'n', or boolean false when the capture is not valid. */
#define ALG_PUSHSUB_OR_FALSE(L,ud,text,n) \
{ if ( ALG_SUBVALID(ud,n) ) { ALG_PUSHSUB (L,ud,text,n); } else { lua_pushboolean (L,0); } }
/* Push offsets for Lua: the start is shifted by +1, the end is pushed as-is. */
#define ALG_PUSHSTART(L,ud,offs,n) lua_pushinteger(L, (offs) + ALG_SUBBEG(ud,n) + 1)
#define ALG_PUSHEND(L,ud,offs,n) lua_pushinteger(L, (offs) + ALG_SUBEND(ud,n))
#define ALG_PUSHOFFSETS(L,ud,offs,n) \
(ALG_PUSHSTART(L,ud,offs,n), ALG_PUSHEND(L,ud,offs,n))
#define ALG_BASE(st) 0
#define ALG_PULL
/* we define ALG_USERETRY because GLib does expose PCRE's NOTEMPTY and ANCHORED flags */
/* It also selects the 'retry' variants of gmatch_exec() and gsub_exec() below. */
#define ALG_USERETRY
/* Mask of all the possible values for GRegexCompileFlags. */
/* getcflags() rejects numeric flag arguments that carry bits outside this mask. */
#define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS | \
G_REGEX_MULTILINE | \
G_REGEX_DOTALL | \
G_REGEX_EXTENDED | \
G_REGEX_ANCHORED | \
G_REGEX_DOLLAR_ENDONLY | \
G_REGEX_UNGREEDY | \
G_REGEX_RAW | \
G_REGEX_NO_AUTO_CAPTURE | \
G_REGEX_OPTIMIZE | \
G_REGEX_DUPNAMES | \
G_REGEX_NEWLINE_CR | \
G_REGEX_NEWLINE_LF | \
G_REGEX_NEWLINE_CRLF | \
G_REGEX_FIRSTLINE | \
G_REGEX_NEWLINE_ANYCRLF | \
G_REGEX_BSR_ANYCRLF | \
G_REGEX_JAVASCRIPT_COMPAT)
/* Mask of all the possible values for GRegexMatchFlags. */
/* check_eflags() rejects match-flag arguments that carry bits outside this mask. */
#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED | \
G_REGEX_MATCH_NOTBOL | \
G_REGEX_MATCH_NOTEOL | \
G_REGEX_MATCH_NOTEMPTY | \
G_REGEX_MATCH_PARTIAL | \
G_REGEX_MATCH_NEWLINE_CR | \
G_REGEX_MATCH_NEWLINE_LF | \
G_REGEX_MATCH_NEWLINE_CRLF | \
G_REGEX_MATCH_NEWLINE_ANY | \
G_REGEX_MATCH_NEWLINE_ANYCRLF | \
G_REGEX_MATCH_BSR_ANYCRLF | \
G_REGEX_MATCH_BSR_ANY | \
G_REGEX_MATCH_PARTIAL_SOFT | \
G_REGEX_MATCH_PARTIAL_HARD | \
G_REGEX_MATCH_NOTEMPTY_ATSTART)
/* Forward declaration; reads an optional match-flags argument at stack index
 * 'idx', using 'def' when the argument is absent. */
static int check_eflags(lua_State *L, const int idx, const int def);
#define ALG_GETEFLAGS(L,idx) check_eflags(L, idx, ALG_EFLAGS_DFLT)
/* Userdata payload of a regex object: the compiled pattern plus the state
 * left behind by the most recent compile/match call. */
typedef struct {
/* Compiled pattern from g_regex_new(); NULL when compilation failed. */
GRegex * pr;
/* Result of the latest match call; released by minfo_free(). */
GMatchInfo * match_info;
/* Error from the latest compile/match call; released by gerror_free(). */
GError * error; /* didn't want to put this here, but can't free it otherwise */
/* Set to 1 by Gregex_gc() once the members have been released. */
int freed;
} TGrgx;
/* Release the match info held by 'ud' and clear the pointer so that it is
 * never handed to GLib a second time. */
static void minfo_free(TGrgx* ud) {
  GMatchInfo *stale = ud->match_info;

  ud->match_info = NULL;
  g_match_info_free (stale);
}
/* Release the GError held by 'ud', if there is one, and clear the pointer. */
static void gerror_free(TGrgx* ud) {
  GError *stale = ud->error;

  ud->error = NULL;
  if (stale != NULL) {
    g_error_free (stale);
  }
}
/* Return the start offset of capture 'n' in the current match info.
 * The result stays at -1 when GLib does not store a position. */
static int getSubStartPos(TGrgx* ud, int n) {
  int begin = -1;

  (void) g_match_info_fetch_pos (ud->match_info, n, &begin, NULL);
  return begin;
}
/* Return the end offset of capture 'n' in the current match info.
 * The result stays at -1 when GLib does not store a position. */
static int getSubEndPos(TGrgx* ud, int n) {
  int finish = -1;

  (void) g_match_info_fetch_pos (ud->match_info, n, NULL, &finish);
  return finish;
}
/* Generic name for this binding's userdata struct (used by gmatch_exec). */
#define TUserdata TGrgx
/* TODO: handle named subpatterns somehow */
/* Disabled: named-subpattern support is not implemented for GLib. */
#if 0
static void do_named_subpatterns (lua_State *L, TGrgx *ud, const char *text);
# define DO_NAMED_SUBPATTERNS do_named_subpatterns
#endif
/* Included only after the ALG_* macros and TUserdata have been defined.
 * NOTE(review): presumably this header supplies check_ud, get_startoffset
 * and the algm_ and algf_ functions referenced below - confirm. */
#include "lrexlib_algo.h"
/* Functions
******************************************************************************
*/
/*
 * Read the compile-flags argument at stack index 'pos'.
 *   - absent or nil: the default flags (ALG_CFLAGS_DFLT)
 *   - number:        used as a bit mask; a Lua error is raised when it has
 *                    bits outside G_REGEX_COMPILE_MASK
 *   - string:        each of the letters i, m, s, x, U switches on one flag;
 *                    any other character is ignored
 *   - anything else: a Lua type error is raised
 */
static int getcflags (lua_State *L, int pos) {
  const int argtype = lua_type (L, pos);

  if (argtype == LUA_TNONE || argtype == LUA_TNIL) {
    return ALG_CFLAGS_DFLT;
  }

  if (argtype == LUA_TNUMBER) {
    const int bits = (int) lua_tointeger (L, pos);
    if (bits & ~G_REGEX_COMPILE_MASK) {
      return luaL_error (L, "GLib Regex compile flag is invalid");
    }
    return bits;
  }

  if (argtype == LUA_TSTRING) {
    const char *p;
    int bits = 0;

    for (p = lua_tostring (L, pos); *p != '\0'; ++p) {
      switch (*p) {
        case 'i': bits |= G_REGEX_CASELESS;  break;
        case 'm': bits |= G_REGEX_MULTILINE; break;
        case 's': bits |= G_REGEX_DOTALL;    break;
        case 'x': bits |= G_REGEX_EXTENDED;  break;
        case 'U': bits |= G_REGEX_UNGREEDY;  break;
        default:  break;  /* unknown letters are skipped */
      }
    }
    return bits;
  }

  return luaL_typerror (L, pos, "number or string");
}
/* Read the optional integer match-flags argument at stack index 'idx'
 * ('def' when absent) and raise a Lua error if it carries any bit outside
 * G_REGEX_MATCH_MASK. */
static int check_eflags(lua_State *L, const int idx, const int def) {
  const int flags = (int) luaL_optinteger (L, idx, def);
  const int unknown_bits = flags & ~G_REGEX_MATCH_MASK;

  if (unknown_bits == 0) {
    return flags;
  }
  return luaL_error (L, "GLib Regex match flag is invalid");
}
/* this function is used in algo.h as well */
/*
 * Raise a Lua error describing the GError recorded in 'ud'.
 *
 * When the error code has an entry in gregex_error_flags the message names
 * it as G_REGEX_<key>; otherwise the numeric code is reported. If no GError
 * is recorded at all, a generic message carrying 'errcode' is raised instead
 * of dereferencing a NULL pointer.
 *
 * Does not return normally: every path ends in luaL_error().
 */
static int generate_error (lua_State *L, const TGrgx *ud, int errcode) {
  const char *key;

  /* Guard: callers reach this when a result is neither TRUE nor FALSE, in
   * which case GLib may not have filled in an error. */
  if (ud->error == NULL)
    return luaL_error (L, "GLib Regex error (code %d)", errcode);

  key = get_flag_key (gregex_error_flags, ud->error->code);
  if (key)
    return luaL_error (L, "error G_REGEX_%s (%s)", key, ud->error->message);
  else
    return luaL_error (L, "GLib Regex error: %s (code %d)", ud->error->message, ud->error->code);
}
/*
 * Create a regex userdata on the Lua stack and compile argC->pattern into it.
 *
 * G_REGEX_RAW is always added to the requested compile flags. The userdata
 * receives its metatable before compilation is attempted. When 'pud' is not
 * NULL it receives the userdata pointer. Returns 1 (the userdata) on success
 * and raises a Lua error with GLib's message and code when compilation fails.
 */
static int compile_regex (lua_State *L, const TArgComp *argC, TGrgx **pud) {
  const GRegexCompileFlags cflags = (GRegexCompileFlags)(argC->cflags | G_REGEX_RAW);
  TGrgx *ud = (TGrgx*)lua_newuserdata (L, sizeof (TGrgx));

  memset (ud, 0, sizeof (TGrgx)); /* every member starts out as 0/NULL */
  lua_pushvalue (L, ALG_ENVIRONINDEX);
  lua_setmetatable (L, -2);

  ud->pr = g_regex_new (argC->pattern, cflags, (GRegexMatchFlags)0, &ud->error);

  if (pud != NULL) {
    *pud = ud;
  }
  if (ud->pr != NULL) {
    return 1;
  }
  return luaL_error (L, "%s (code: %d)", ud->error->message, ud->error->code);
}
/* method r:dfa_exec (s, [st], [ef]) */
/* Collect the arguments of dfa_exec: the regex object (arg 1), the subject
 * string (arg 2), the start offset (arg 3) and the match flags (arg 4). */
static void checkarg_dfa_exec (lua_State *L, TArgExec *argE, TGrgx **ud) {
  TGrgx *self = check_ud (L);

  *ud = self;
  argE->text = luaL_checklstring (L, 2, &argE->textlen);
  argE->startoffset = get_startoffset (L, 3, argE->textlen);
  argE->eflags = check_eflags (L, 4, ALG_EFLAGS_DFLT);
}
/* unlike PCRE, partial matching won't return the actual substrings/matches */
/*
 * Lua method r:dfa_exec (s, [st], [ef]).
 *
 * Runs g_regex_match_all_full() on the subject and returns:
 *   - on a match: the 1-based start position, a table holding the end
 *     position of every match found, and the number of matches (3 values);
 *   - on a partial match: boolean true (1 value);
 *   - otherwise: nil (1 value).
 */
static int Gregex_dfa_exec (lua_State *L)
{
  TArgExec argE;
  TGrgx *ud;
  gboolean res;

  checkarg_dfa_exec (L, &argE, &ud);

  /* The other exec helpers leave their GMatchInfo in ud->match_info until
   * the next call. Release it here, otherwise GLib overwrites the pointer
   * below and the old match info is leaked. */
  minfo_free (ud);
  gerror_free (ud);

  res = g_regex_match_all_full (ud->pr, argE.text, (int)argE.textlen,
    argE.startoffset, (GRegexMatchFlags)argE.eflags, &ud->match_info, &ud->error);

  if (ALG_ISMATCH (res)) {
    int i;
    int start_pos = -1;
    int end_pos = -1;
    int max = g_match_info_get_match_count (ud->match_info);

    g_match_info_fetch_pos (ud->match_info, 0, &start_pos, NULL);
    lua_pushinteger (L, start_pos + 1); /* 1-st return value */
    lua_newtable (L); /* 2-nd return value */
    for (i = 0; i < max; i++) {
      g_match_info_fetch_pos (ud->match_info, i, NULL, &end_pos);
      /* End offsets are pushed without +1, the same convention that
       * ALG_PUSHEND uses (only start offsets are shifted for Lua). */
      lua_pushinteger (L, end_pos);
      lua_rawseti (L, -2, i+1);
    }
    lua_pushinteger (L, max); /* 3-rd return value */
    minfo_free (ud);
    return 3;
  }
  else if (g_match_info_is_partial_match(ud->match_info)) {
    lua_pushboolean(L,1);
    minfo_free (ud);
    return 1;
  }
  else {
    minfo_free (ud);
    /* NOTE(review): GLib appears to signal match-time errors by returning
     * FALSE and filling ud->error, so such errors currently come back as nil
     * and the generate_error() branch looks unreachable - confirm before
     * changing the behavior. */
    if (ALG_NOMATCH (res)) {
      lua_pushnil (L);
      return 1;
    }
    else
      return generate_error (L, ud, 0);
  }
}
/* Run one match step for gmatch. Any previous match info and error held by
 * 'ud' are released first. In the ALG_USERETRY build a non-zero 'retry' adds
 * the NOTEMPTY and ANCHORED match flags to the caller's flags. */
#ifdef ALG_USERETRY
static int gmatch_exec (TUserdata *ud, TArgExec *argE, int retry) {
  int eflags = argE->eflags;

  if (retry) {
    eflags |= G_REGEX_MATCH_NOTEMPTY | G_REGEX_MATCH_ANCHORED;
  }
  gerror_free (ud);
  minfo_free (ud);
  return g_regex_match_full (ud->pr, argE->text, argE->textlen,
    argE->startoffset, (GRegexMatchFlags)eflags, &ud->match_info, &ud->error);
}
#else
static int gmatch_exec (TUserdata *ud, TArgExec *argE) {
  const GRegexMatchFlags mflags = (GRegexMatchFlags)argE->eflags;

  gerror_free (ud);
  minfo_free (ud);
  return g_regex_match_full (ud->pr, argE->text, argE->textlen,
    argE->startoffset, mflags, &ud->match_info, &ud->error);
}
#endif
/* Push the subject text of 'argE' onto the Lua stack as a string of
 * argE->textlen bytes. */
static void gmatch_pushsubject (lua_State *L, TArgExec *argE) {
  const char *subject = argE->text;
  const size_t subject_len = argE->textlen;

  lua_pushlstring (L, subject, subject_len);
}
/* Run a single match for find/match starting at argE->startoffset. Any
 * previous match info and error held by 'ud' are released first; the new
 * ones are stored back into 'ud'. */
static int findmatch_exec (TGrgx *ud, TArgExec *argE) {
  const GRegexMatchFlags mflags = (GRegexMatchFlags)argE->eflags;

  gerror_free (ud);
  minfo_free (ud);
  return g_regex_match_full (ud->pr, argE->text, argE->textlen,
    argE->startoffset, mflags, &ud->match_info, &ud->error);
}
/* Run one match step for gsub starting at offset 'st'. Any previous match
 * info and error held by 'ud' are released first. In the ALG_USERETRY build
 * a non-zero 'retry' adds the NOTEMPTY and ANCHORED match flags. */
#ifdef ALG_USERETRY
static int gsub_exec (TGrgx *ud, TArgExec *argE, int st, int retry) {
  GRegexMatchFlags mflags;

  if (retry) {
    mflags = (GRegexMatchFlags)(argE->eflags|G_REGEX_MATCH_NOTEMPTY|G_REGEX_MATCH_ANCHORED);
  } else {
    mflags = (GRegexMatchFlags)argE->eflags;
  }
  gerror_free (ud);
  minfo_free (ud);
  return g_regex_match_full (ud->pr, argE->text, argE->textlen,
    st, mflags, &ud->match_info, &ud->error);
}
#else
static int gsub_exec (TGrgx *ud, TArgExec *argE, int st) {
  const GRegexMatchFlags mflags = (GRegexMatchFlags)argE->eflags;

  gerror_free (ud);
  minfo_free (ud);
  return g_regex_match_full (ud->pr, argE->text, argE->textlen,
    st, mflags, &ud->match_info, &ud->error);
}
#endif
/* Run one match step for split starting at 'offset'. Any previous match
 * info and error held by 'ud' are released first; the new ones are stored
 * back into 'ud'. */
static int split_exec (TGrgx *ud, TArgExec *argE, int offset) {
  const GRegexMatchFlags mflags = (GRegexMatchFlags)argE->eflags;

  gerror_free (ud);
  minfo_free (ud);
  return g_regex_match_full (ud->pr, argE->text, argE->textlen, offset,
    mflags, &ud->match_info, &ud->error);
}
/*
 * __gc metamethod: release everything owned by the regex userdata.
 *
 * The 'freed' flag makes the function safe to run more than once (e.g. when
 * a script calls __gc by hand and the collector runs it again later).
 * Returns 0 (no Lua results).
 */
static int Gregex_gc (lua_State *L) {
  TGrgx *ud = check_ud (L);
  if (ud->freed == 0) { /* precaution against "manual" __gc calling */
    ud->freed = 1;
    if (ud->pr) {
      g_regex_unref (ud->pr);
      /* Do not keep a dangling pointer: the userdata itself stays reachable
       * after a manual __gc call, and a later method call must not hand a
       * released GRegex back to GLib. */
      ud->pr = NULL;
    }
    minfo_free (ud);
    gerror_free (ud);
  }
  return 0;
}
/* __tostring metamethod: pushes "<typename> (<address>)" for a live object
 * and "<typename> (deleted)" once Gregex_gc() has marked it as freed. */
static int Gregex_tostring (lua_State *L) {
  TGrgx *ud = check_ud (L);

  if (ud->freed != 0) {
    lua_pushfstring (L, "%s (deleted)", REX_TYPENAME);
    return 1;
  }
  lua_pushfstring (L, "%s (%p)", REX_TYPENAME, (void*)ud);
  return 1;
}
/* Push the GLib version as a "major.minor.micro" string, built from the
 * GLIB_*_VERSION macros. */
static int Gregex_version (lua_State *L) {
  const int major = GLIB_MAJOR_VERSION;
  const int minor = GLIB_MINOR_VERSION;
  const int micro = GLIB_MICRO_VERSION;

  lua_pushfstring (L, "%d.%d.%d", major, minor, micro);
  return 1;
}
/* Name/function pairs passed to alg_register() in REX_OPENLIB as the first
 * table: methods and metamethods of a regex object. The algm_ entries are
 * not defined in this file (NOTE(review): presumably they come from
 * lrexlib_algo.h - confirm). The list ends with a NULL sentinel. */
static const luaL_Reg r_methods[] = {
{ "exec", algm_exec },
{ "tfind", algm_tfind }, /* old name: match */
{ "find", algm_find },
{ "match", algm_match },
{ "dfa_exec", Gregex_dfa_exec },
{ "__gc", Gregex_gc },
{ "__tostring", Gregex_tostring },
{ NULL, NULL }
};
/* Name/function pairs passed to alg_register() in REX_OPENLIB as the second
 * table: library-level functions. The algf_ and Gregex_get_*flags entries
 * are not defined in this file; only Gregex_version is local. The list ends
 * with a NULL sentinel. */
static const luaL_Reg r_functions[] = {
{ "match", algf_match },
{ "find", algf_find },
{ "gmatch", algf_gmatch },
{ "gsub", algf_gsub },
{ "split", algf_split },
{ "new", algf_new },
{ "flags", Gregex_get_flags },
{ "compile_flags", Gregex_get_compile_flags },
{ "match_flags", Gregex_get_match_flags },
{ "version", Gregex_version },
{ NULL, NULL }
};
/* Open the library */
/* Entry point: hands the method and function tables to alg_register()
 * under the name "GLib Regex" and returns 1 to Lua.
 * NOTE(review): REX_API and REX_OPENLIB are macros defined outside this
 * file; the exported symbol name depends on them. */
REX_API int REX_OPENLIB (lua_State *L) {
alg_register(L, r_methods, r_functions, "GLib Regex");
return 1;
}