/* wireshark/epan/dfilter/grammar.lemon */

/* C prologue: headers and forward declarations needed by the C code in this grammar. */
%include {
#include "config.h"
#include <assert.h>
#include "dfilter-int.h"
#include "syntax-tree.h"
#include "sttype-range.h"
#include "sttype-test.h"
#include "sttype-function.h"
#include "sttype-set.h"
#include "drange.h"
#include "grammar.h"
/* On Windows builds, disable compiler warning 4671. */
#ifdef _WIN32
#pragma warning(disable:4671)
#endif
/* Forward declarations of the helpers used by the rule actions; their
 * definitions are in the %code sections further down in this file. */
static stnode_t *
new_function(dfwork_t *dfw, df_lval_t *lval);
static stnode_t *
new_test(dfwork_t *dfw, test_op_t op, df_lval_t *lval);
/* End of C code */
}
/* Parser Information */
%name Dfilter
%token_prefix TOKEN_
/* The extra parser argument; it is what the C code in this file refers to as "dfw". */
%extra_argument {dfwork_t *dfw}

/* Terminal and Non-Terminal types and destructors */

/* Every terminal carries a df_lval_t pointer. */
%token_type {df_lval_t*}
%token_destructor {
    /* dfw is not otherwise needed here; reference it explicitly
     * (presumably to avoid an unused-variable warning). */
    (void)dfw;
    /* NOTE(review): TRUE presumably also releases the token's value string;
     * confirm against df_lval_free(). */
    df_lval_free($$, TRUE);
}

/* Unless overridden below, a non-terminal is a syntax tree node. */
%default_type {stnode_t*}
%default_destructor {stnode_free($$);}

%type range_node {drange_node*}
%destructor range_node {drange_node_free($$);}

%type range_node_list {GSList*}
%destructor range_node_list {drange_node_free_list($$);}

%type function_params {GSList*}
%destructor function_params {st_funcparams_free($$);}

/* set_list and set_element are both GSList values released with the same destructor. */
%type set_list {GSList*}
%destructor set_list {set_nodelist_free($$);}

%type set_element {GSList*}
%destructor set_element {set_nodelist_free($$);}
/* Runs as soon as the parser hits a syntax error. Afterwards any
   "error" symbols are shifted, if possible. */
%syntax_error {
    if (TOKEN) {
        /* An actual token caused the error: report its text. */
        dfilter_fail(dfw, "\"%s\" was unexpected in this context.",
                     df_lval_value(TOKEN));
    } else {
        /* There is no offending token to report. */
        dfilter_fail(dfw, "Unexpected end of filter expression.");
    }
}
/* Invoked when a parse fails outright, to mark an error. This happens after
the above syntax_error code has run and after the parser has failed to
use error recovery (recovery means shifting an "error" symbol and then
successfully shifting 3 more symbols). */
%parse_failure {
/* Flag the failure on the work area. */
dfw->syntax_error = TRUE;
}
/* ----------------- The grammar -------------- */
/* Associativity */
/* NOTE(review): according to the Lemon documentation, operators declared on
 * later lines receive higher precedence. As written, TEST_OR is declared
 * after TEST_AND and would therefore bind tighter than TEST_AND, which is the
 * opposite of most programming languages. Confirm this is intended before
 * relying on it; reordering changes how existing filters are parsed. */
%left TEST_AND.
%left TEST_OR.
/* Comparison-style operators are declared non-associative.
 * NOTE(review): TEST_BITWISE_AND is not used by any rule visible in this
 * file (the bitwise rule uses BITWISE_AND); verify whether it is still needed. */
%nonassoc TEST_ALL_EQ TEST_ANY_EQ TEST_ALL_NE TEST_ANY_NE TEST_LT TEST_LE TEST_GT TEST_GE
TEST_CONTAINS TEST_MATCHES TEST_BITWISE_AND.
/* Negation is right-associative and declared last. */
%right TEST_NOT.
/* Top-level targets */

/* A filter consisting of one expression: that expression becomes the tree root. */
sentence ::= expr(ROOT).
{
    dfw->st_root = ROOT;
}

/* An empty filter: the tree root is set to NULL. */
sentence ::= .
{
    dfw->st_root = NULL;
}

/* An expression is either a relation test or a logical test. */
expr(OUT) ::= relation_test(REL).
{
    OUT = REL;
}

expr(OUT) ::= logical_test(LOGIC).
{
    OUT = LOGIC;
}
%code {
/*
 * Create a test node for operator 'op', passing the text of the operator
 * token 'lval' to stnode_new_test(), then release the token wrapper.
 *
 * Rules in this file also call this with lval == NULL (for tests that have
 * no operator token), so df_lval_value() and df_lval_free() are presumably
 * NULL-tolerant -- TODO confirm.
 *
 * 'dfw' is unused.
 */
static stnode_t *
new_test(dfwork_t *dfw _U_, test_op_t op, df_lval_t *lval)
{
    char *token_text = df_lval_value(lval);
    stnode_t *test_node = stnode_new_test(op, token_text);

    /* FALSE: presumably keeps the value string alive because the node
     * was just given it -- confirm against df_lval_free(). */
    df_lval_free(lval, FALSE);

    return test_node;
}
}
/* Logical tests */

/* expr AND expr */
logical_test(NODE) ::= expr(LHS) TEST_AND(TOK) expr(RHS).
{
    NODE = new_test(dfw, TEST_OP_AND, TOK);
    sttype_test_set2_args(NODE, LHS, RHS);
}

/* expr OR expr */
logical_test(NODE) ::= expr(LHS) TEST_OR(TOK) expr(RHS).
{
    NODE = new_test(dfw, TEST_OP_OR, TOK);
    sttype_test_set2_args(NODE, LHS, RHS);
}

/* NOT expr */
logical_test(NODE) ::= TEST_NOT(TOK) expr(ARG).
{
    NODE = new_test(dfw, TEST_OP_NOT, TOK);
    sttype_test_set1_args(NODE, ARG);
}

/* A bare entity becomes a TEST_OP_EXISTS test; there is no operator token. */
logical_test(NODE) ::= entity(ENT).
{
    NODE = new_test(dfw, TEST_OP_EXISTS, NULL);
    sttype_test_set1_args(NODE, ENT);
}

/* A bare bitwise term becomes a TEST_OP_NOTZERO test; there is no operator token. */
logical_test(NODE) ::= bitwise_term(BITS).
{
    NODE = new_test(dfw, TEST_OP_NOTZERO, NULL);
    sttype_test_set1_args(NODE, BITS);
}
/* Entities, or things that can be compared/tested/checked */

/* STRING token: the token text is passed as both arguments of the node constructor. */
atom(NODE) ::= STRING(TOK).
{
    char *text = df_lval_value(TOK);

    NODE = stnode_new_string(text, text);
    df_lval_free(TOK, FALSE);
}

/* CHARCONST token: built from the token's numeric value and its text. */
atom(NODE) ::= CHARCONST(TOK).
{
    NODE = stnode_new_charconst(df_lval_number(TOK), df_lval_value(TOK));
    df_lval_free(TOK, FALSE);
}

/* UNPARSED token: the token text is passed as both arguments of the node constructor. */
atom(NODE) ::= UNPARSED(TOK).
{
    char *text = df_lval_value(TOK);

    NODE = stnode_new_unparsed(text, text);
    df_lval_free(TOK, FALSE);
}
/*
 * Explicit literal. A value prefixed with ':' or enclosed in <,> is treated
 * as a literal value only, never as a protocol or field name. For example
 * ":fc" or "<fc>" always means 0xFC, never the protocol whose filter name
 * is "fc".
 */
atom(E) ::= LITERAL(S).
{
    E = stnode_new_literal(df_lval_value(S), df_lval_value(S));
    df_lval_free(S, FALSE);
}
/*
 * Explicit identifier. A value prefixed with a dot is always parsed as a
 * protocol or protocol field name, never as a literal value.
 */
atom(E) ::= IDENTIFIER(F).
{
    char *name = df_lval_value(F);
    header_field_info *hfinfo = dfilter_resolve_unparsed(dfw, name);
    if (hfinfo == NULL) {
        dfilter_fail(dfw, "\"%s\" is not a valid protocol or protocol field.", name);
    }
    /* When the lookup failed and dfilter_fail() returns, the node is still
     * created, with a NULL hfinfo. */
    E = stnode_new(STTYPE_FIELD, hfinfo, name);
    df_lval_free(F, FALSE);
}
/* Any atom is an entity. */
entity(E) ::= atom(A). { E = A; }

/*
 * Field reference. A reference is a constant that is filled in after
 * compilation from an existing protocol tree (usually the currently selected
 * frame), which allows comparing fields of the filtered frame with fields of
 * that other frame. The referenced name must resolve to a known protocol or
 * protocol field.
 */
entity(E) ::= REF_OPEN REFERENCE(F) REF_CLOSE.
{
    char *name = df_lval_value(F);
    header_field_info *hfinfo = dfilter_resolve_unparsed(dfw, name);
    if (hfinfo == NULL) {
        dfilter_fail(dfw, "\"%s\" is not a valid protocol or protocol field.", name);
    }
    /* When the lookup failed and dfilter_fail() returns, the node is still
     * created, with a NULL hfinfo. */
    E = stnode_new(STTYPE_REFERENCE, hfinfo, name);
    df_lval_free(F, FALSE);
}

/* Ranges and function calls are entities too. */
entity(E) ::= range(R). { E = R; }
entity(E) ::= function(F). { E = F; }
/* entity & entity: a bitwise node labelled with the operator token's text. */
bitwise_term(NODE) ::= entity(LHS) BITWISE_AND(TOK) entity(MASK).
{
    NODE = stnode_new(STTYPE_BITWISE, NULL, df_lval_value(TOK));
    sttype_test_set2(NODE, OP_BITWISE_AND, LHS, MASK);
    df_lval_free(TOK, FALSE);
}
/* Unary plus: the operand is passed through unchanged. */
arithmetic_term(NODE) ::= PLUS entity(ARG).
{
    NODE = ARG;
}

/* Unary minus: an arithmetic node with a single operand. */
arithmetic_term(NODE) ::= MINUS entity(ARG).
{
    NODE = stnode_new(STTYPE_ARITHMETIC, NULL, NULL);
    sttype_test_set1(NODE, OP_UNARY_MINUS, ARG);
}

/* Binary addition, labelled with the operator token's text. */
arithmetic_term(NODE) ::= entity(LHS) PLUS(TOK) entity(RHS).
{
    NODE = stnode_new(STTYPE_ARITHMETIC, NULL, df_lval_value(TOK));
    sttype_test_set2(NODE, OP_ADD, LHS, RHS);
    df_lval_free(TOK, FALSE);
}

/* Binary subtraction, labelled with the operator token's text. */
arithmetic_term(NODE) ::= entity(LHS) MINUS(TOK) entity(RHS).
{
    NODE = stnode_new(STTYPE_ARITHMETIC, NULL, df_lval_value(TOK));
    sttype_test_set2(NODE, OP_SUBTRACT, LHS, RHS);
    df_lval_free(TOK, FALSE);
}

/* A term is an entity, a bitwise term or an arithmetic term. */
term(OUT) ::= entity(ENT).
{
    OUT = ENT;
}

term(OUT) ::= bitwise_term(BITS).
{
    OUT = BITS;
}

term(OUT) ::= arithmetic_term(ARITH).
{
    OUT = ARITH;
}
/* Ranges */

/* entity[range_node, range_node, ...] */
range(NODE) ::= entity(ENT) LBRACKET range_node_list(LIST) RBRACKET.
{
    NODE = stnode_new(STTYPE_RANGE, NULL, NULL);
    sttype_range_set(NODE, ENT, LIST);

    /* Only the list itself is released here, not the
     * drange_nodes it holds. */
    g_slist_free(LIST);
}

/* First element of a range list. */
range_node_list(LIST) ::= range_node(ITEM).
{
    LIST = g_slist_append(NULL, ITEM);
}

/* Further elements are appended at the end of the list. */
range_node_list(LIST) ::= range_node_list(HEAD) COMMA range_node(ITEM).
{
    LIST = g_slist_append(HEAD, ITEM);
}

/* One RANGE token, converted from its text by drange_node_from_str(). */
range_node(ITEM) ::= RANGE(TOK).
{
    char *parse_error = NULL;

    ITEM = drange_node_from_str(df_lval_value(TOK), &parse_error);
    if (parse_error) {
        /* Report the converter's message, then free it. */
        dfilter_fail(dfw, "%s", parse_error);
        g_free(parse_error);
    }

    /* Unlike most rules, the token is freed with TRUE here. */
    df_lval_free(TOK, TRUE);
}
/* Relational tests */

/* Each comparison operator token is turned into a test node; the operands
 * are attached by the comparison_test rules. */
cmp_op(CMP) ::= TEST_ALL_EQ(TOK).   { CMP = new_test(dfw, TEST_OP_ALL_EQ, TOK); }
cmp_op(CMP) ::= TEST_ANY_EQ(TOK).   { CMP = new_test(dfw, TEST_OP_ANY_EQ, TOK); }
cmp_op(CMP) ::= TEST_ALL_NE(TOK).   { CMP = new_test(dfw, TEST_OP_ALL_NE, TOK); }
cmp_op(CMP) ::= TEST_ANY_NE(TOK).   { CMP = new_test(dfw, TEST_OP_ANY_NE, TOK); }
cmp_op(CMP) ::= TEST_GT(TOK).       { CMP = new_test(dfw, TEST_OP_GT, TOK); }
cmp_op(CMP) ::= TEST_GE(TOK).       { CMP = new_test(dfw, TEST_OP_GE, TOK); }
cmp_op(CMP) ::= TEST_LT(TOK).       { CMP = new_test(dfw, TEST_OP_LT, TOK); }
cmp_op(CMP) ::= TEST_LE(TOK).       { CMP = new_test(dfw, TEST_OP_LE, TOK); }

/* term <op> term: the operator's test node receives both operands and is the result. */
comparison_test(NODE) ::= term(LHS) cmp_op(CMP) term(RHS).
{
    NODE = CMP;
    sttype_test_set2_args(CMP, LHS, RHS);
}
/* 'a == b == c' or 'a < b <= c <= d < e' */
/* Chained comparison. The left term and its operator are combined with the
 * already-reduced comparison on the right: the result is a TEST_OP_AND node
 * whose operands are a new comparison (left term, operator, copy of a term
 * taken from the right-hand comparison) and the right-hand comparison itself. */
comparison_test(T) ::= term(E) cmp_op(O) comparison_test(R).
{
stnode_t *L, *F;
/* for now generate it like E O F TEST_OP_AND F P G, later it could be optimized
or semantically checked (to make a <= b >= c or a == b != c invalid)?
*/
/* Starting at the right-hand comparison, keep replacing the cursor with the
 * node that sttype_test_get() returns through its second output argument
 * (presumably the first operand of the test -- TODO confirm), until the
 * cursor is no longer a test node. */
F = R;
do {
ws_assert(F != NULL && stnode_type_id(F) == STTYPE_TEST);
sttype_test_get(F, NULL, &F, NULL);
} while (stnode_type_id(F) == STTYPE_TEST);
/* The operator's test node compares the left term with a duplicate of the
 * node found above; the original stays where it is in the right-hand tree. */
L = O;
sttype_test_set2_args(L, E, stnode_dup(F));
/* Join the new comparison and the right-hand comparison; this AND node has
 * no operator token, hence the NULL. */
T = stnode_new_test(TEST_OP_AND, NULL);
sttype_test_set2_args(T, L, R);
}
/* A comparison (possibly chained) is a relation test. */
relation_test(NODE) ::= comparison_test(CMP).
{
    NODE = CMP;
}

/* Does not chain like math comparisons. */
rel_binop(BINOP) ::= TEST_CONTAINS(TOK).    { BINOP = new_test(dfw, TEST_OP_CONTAINS, TOK); }
rel_binop(BINOP) ::= TEST_MATCHES(TOK).     { BINOP = new_test(dfw, TEST_OP_MATCHES, TOK); }

/* entity contains/matches entity */
relation_test(NODE) ::= entity(LHS) rel_binop(BINOP) entity(RHS).
{
    NODE = BINOP;
    sttype_test_set2_args(BINOP, LHS, RHS);
}

/* entity in { ... } */
relation_test(NODE) ::= entity(ENT) TEST_IN(INTOK) set(SET).
{
    NODE = new_test(dfw, TEST_OP_IN, INTOK);
    sttype_test_set2_args(NODE, ENT, SET);
}

/* entity not in { ... }: a membership test wrapped in a TEST_OP_NOT test. */
relation_test(NODE) ::= entity(ENT) TEST_NOT(NOTTOK) TEST_IN(INTOK) set(SET).
{
    stnode_t *membership = new_test(dfw, TEST_OP_IN, INTOK);

    sttype_test_set2_args(membership, ENT, SET);
    NODE = new_test(dfw, TEST_OP_NOT, NOTTOK);
    sttype_test_set1_args(NODE, membership);
}
/* A set is a brace-enclosed list; the list becomes the data of an STTYPE_SET node. */
set(S) ::= LBRACE set_list(L) RBRACE.
{
    S = stnode_new(STTYPE_SET, L, NULL);
}
/* A set list starts with the list fragment of its first element. */
set_list(LIST) ::= set_element(ELEM).
{
    LIST = g_slist_concat(NULL, ELEM);
}

/* Each further element's fragment is concatenated onto the list. */
set_list(LIST) ::= set_list(HEAD) COMMA set_element(ELEM).
{
    LIST = g_slist_concat(HEAD, ELEM);
}

/* A set entity is an atom, optionally preceded by a sign. */
set_entity(NODE) ::= atom(ATOM).
{
    NODE = ATOM;
}

/* Negated atom: an arithmetic node with a single operand. */
set_entity(NODE) ::= MINUS atom(ATOM).
{
    NODE = stnode_new(STTYPE_ARITHMETIC, NULL, NULL);
    sttype_test_set1(NODE, OP_UNARY_MINUS, ATOM);
}

/* A leading plus leaves the atom unchanged. */
set_entity(NODE) ::= PLUS atom(ATOM).
{
    NODE = ATOM;
}

/* Single value: a two-entry fragment whose second entry is NULL. */
set_element(PAIR) ::= set_entity(ONLY).
{
    PAIR = g_slist_append(NULL, ONLY);
    PAIR = g_slist_append(PAIR, NULL);
}

/* lower .. upper: a two-entry fragment holding both entities. */
set_element(PAIR) ::= set_entity(LOWER) DOTDOT set_entity(UPPER).
{
    PAIR = g_slist_append(NULL, LOWER);
    PAIR = g_slist_append(PAIR, UPPER);
}
/* Functions */
%code {
/*
 * Create a function node for the function named by token 'lval'.
 *
 * The name is looked up with df_func_lookup(). If no definition is found
 * the failure is reported through dfilter_fail(); if that call returns, the
 * node is still created, carrying a NULL definition. The token wrapper is
 * released before returning.
 */
static stnode_t *
new_function(dfwork_t *dfw, df_lval_t *lval)
{
    const char *func_name = df_lval_value(lval);
    df_func_def_t *func_def = df_func_lookup(func_name);
    stnode_t *func_node;

    if (func_def == NULL) {
        dfilter_fail(dfw, "Function '%s' does not exist", func_name);
    }

    func_node = stnode_new(STTYPE_FUNCTION, func_def, df_lval_value(lval));
    df_lval_free(lval, FALSE);

    return func_node;
}
}
/* Function call with a parameter list: name(param, param, ...) */
function(CALL) ::= UNPARSED(NAME) LPAREN function_params(ARGS) RPAREN.
{
    CALL = new_function(dfw, NAME);
    sttype_function_set_params(CALL, ARGS);
}

/* Function call with an empty parameter list: name() */
function(CALL) ::= UNPARSED(NAME) LPAREN RPAREN.
{
    CALL = new_function(dfw, NAME);
}

/* The first parameter starts a new list. */
function_params(ARGS) ::= entity(ARG).
{
    ARGS = g_slist_append(NULL, ARG);
}

/* Later parameters are appended at the end of the list. */
function_params(ARGS) ::= function_params(HEAD) COMMA entity(ARG).
{
    ARGS = g_slist_append(HEAD, ARG);
}
/* Parentheses only group: the result is the inner expression itself. */
expr(OUT) ::= LPAREN expr(INNER) RPAREN.
{
    OUT = INNER;
}