wireshark/epan/dfilter/scanner.l

%top {
/* Include this before everything else, for various large-file definitions */
#include "config.h"
#include <wireshark.h>

#include <stdlib.h>
#include <errno.h>

#include "dfilter-int.h"
#include "syntax-tree.h"
#include "grammar.h"
#include "dfunctions.h"
}

/*
 * We want a reentrant scanner.
 */
%option reentrant

/*
 * We don't use input, so don't generate code for it.
 */
%option noinput

/*
 * We don't use unput, so don't generate code for it.
 */
%option nounput

/*
 * We don't read interactively from the terminal.
 */
%option never-interactive

/*
 * Prefix scanner routines with "df_" rather than "yy", so this scanner
 * can coexist with other scanners.
 */
%option prefix="df_"

/*
 * We're reading from a string, so we don't need yywrap.
 */
%option noyywrap

/*
 * The type for the state we keep for a scanner.
 */
%option extra-type="df_scanner_state_t *"

/*
 * We have to override the memory allocators so that we don't get
 * "unused argument" warnings from the yyscanner argument (which
 * we don't use, as we have a global memory allocator).
 *
 * We provide, as macros, our own versions of the routines generated by Flex,
 * which just call malloc()/realloc()/free() (as the Flex versions do),
 * discarding the extra argument.
 */
%option noyyalloc
%option noyyrealloc
%option noyyfree

%{
/*
 * Wireshark - Network traffic analyzer
 * By Gerald Combs <gerald@wireshark.org>
 * Copyright 2001 Gerald Combs
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

/*
 * Disable diagnostics in the code generated by Flex.
 * (Re-enabled by DIAG_ON_FLEX after the rules section.)
 */
DIAG_OFF_FLEX

/*
 * Configuration macros defined ahead of lemonflex-head.inc.
 * NOTE(review): presumably these are consumed by that include, which is
 * not visible from this file - confirm there.  FLEX_YY_PREFIX mirrors the
 * "df_" prefix selected with %option prefix above.
 */
#define LVAL		df_lval
#define LVAL_TYPE	stnode_t*
#define LVAL_INIT_VAL	NULL
#define MODNAME		df
#define FLEX_YY_PREFIX	df_

#include <lemonflex-head.inc>

/*#undef YY_NO_UNPUT*/

/*
 * Token helpers, defined after the rules section.  Both initialise df_lval
 * with the given token text and return the token unchanged.  SIMPLE() is
 * shorthand for calling simple() with the text of the current match.
 */
static int set_lval_str(int token, const char *token_value);
static int simple(int token, const char *token_value);
#define SIMPLE(token) simple(token, yytext)

/*
 * Sleazy hack to suppress compiler warnings in yy_fatal_error().
 * The comma expression mentions yyscanner (cast to void) and then
 * evaluates to the exit status 2.
 */
#define YY_EXIT_FAILURE ((void)yyscanner, 2)

/*
 * Macros for the allocators, to discard the extra argument.
 *
 * Each one forwards to the matching C library routine and ignores the
 * yyscanner argument.  Every macro argument is parenthesised so that an
 * expression argument (e.g. "p + 1") is cast and passed as a whole rather
 * than having the cast bind to its first operand only.
 */
#define df_alloc(size, yyscanner)		(void *)malloc((size))
#define df_realloc(ptr, size, yyscanner)	(void *)realloc((char *)(ptr), (size))
#define df_free(ptr, yyscanner)			free((char *)(ptr))

%}

/*
 * Exclusive start conditions (%x): while one of them is active, only the
 * rules explicitly tagged with it can match.
 *
 *   RANGE_INT   - inside "[...]", an (optionally signed) integer comes next
 *   RANGE_PUNCT - inside "[...]", punctuation comes next
 *   DQUOTE      - inside a double-quoted string
 *   SQUOTE      - inside a single-quoted character constant
 */
%x RANGE_INT
%x RANGE_PUNCT
%x DQUOTE
%x SQUOTE

%%

[[:blank:]\n]+		{
	/*
	 * Whitespace only matters while the elements of a set are being
	 * scanned, where it acts as the element separator (commas would
	 * arguably have been the better choice from the start, but we are
	 * stuck with whitespace now).  Anywhere else the action falls
	 * through without returning, so the whitespace is simply dropped.
	 */
	if (yyextra->in_set) {
		return simple(TOKEN_WHITESPACE, NULL);
	}
}

	/* Single-character punctuation; the matched text is the token value. */
"("		return SIMPLE(TOKEN_LPAREN);
")"		return SIMPLE(TOKEN_RPAREN);
","		return SIMPLE(TOKEN_COMMA);

"{"[[:blank:]\n]*	{
	/*
	 * Opening brace of a set.  The pattern also swallows any whitespace
	 * that follows, so it is never reported as a separator; that is why
	 * the token value is the literal "{" rather than yytext.
	 */
	yyextra->in_set = TRUE;
	return simple(TOKEN_LBRACE, "{");
}

	/* Range operator inside a set, including any whitespace around it. */
[[:blank:]\n]*".."[[:blank:]\n]*	return simple(TOKEN_DOTDOT, "..");

[[:blank:]\n]*"}"	{
	/* Closing brace of a set, including any whitespace before it. */
	yyextra->in_set = FALSE;
	return simple(TOKEN_RBRACE, "}");
}

	/*
	 * Operators.  Symbolic and spelled-out forms that yield the same
	 * token share one action ("|" means "same action as the next rule");
	 * in every case the matched text becomes the token value.
	 */
"=="		|
"eq"		return SIMPLE(TOKEN_TEST_EQ);

	/* Both spellings of "not equal" are recorded as deprecated tokens
	 * before the token is returned. */
"!="		{
	add_deprecated_token(yyextra->dfw, "!=");
	return SIMPLE(TOKEN_TEST_NE);
}
"ne"		{
	add_deprecated_token(yyextra->dfw, "ne");
	return SIMPLE(TOKEN_TEST_NE);
}

">"		|
"gt"		return SIMPLE(TOKEN_TEST_GT);
">="		|
"ge"		return SIMPLE(TOKEN_TEST_GE);
"<"		|
"lt"		return SIMPLE(TOKEN_TEST_LT);
"<="		|
"le"		return SIMPLE(TOKEN_TEST_LE);

"bitwise_and"	|
"&"		return SIMPLE(TOKEN_TEST_BITWISE_AND);

"contains"	return SIMPLE(TOKEN_TEST_CONTAINS);

"~"		|
"matches"	return SIMPLE(TOKEN_TEST_MATCHES);

"!"		|
"not"		return SIMPLE(TOKEN_TEST_NOT);
"&&"		|
"and"		return SIMPLE(TOKEN_TEST_AND);
"||"		|
"or"		return SIMPLE(TOKEN_TEST_OR);

"in"		return SIMPLE(TOKEN_TEST_IN);

	/*
	 * The syntax for ranges must handle a negative number on either side
	 * of the hyphen, i.e. slice[-d-d] and slice[-d--d], e.g:
	 *   frame[-10-5] (minus ten to five)
	 *   frame[-10--5] (minus ten to minus five)
	 *
	 * This is done with two alternating start conditions: RANGE_INT, in
	 * which a leading '+' or '-' is part of the integer, and RANGE_PUNCT,
	 * in which '-' is the range separator.
	 */

"["					{
	/* Opening bracket: an integer is expected next. */
	BEGIN(RANGE_INT);
	return SIMPLE(TOKEN_LBRACKET);
}

<RANGE_INT>[+-]?[[:alnum:]]+		{
	/* Optionally signed run of alphanumeric characters.  The text is
	 * passed on as-is; no numeric validation happens in the scanner. */
	BEGIN(RANGE_PUNCT);
	return set_lval_str(TOKEN_INTEGER, yytext);
}

<RANGE_INT,RANGE_PUNCT>":"		{
	/* Accepted in both states; an integer is expected afterwards. */
	BEGIN(RANGE_INT);
	return SIMPLE(TOKEN_COLON);
}

<RANGE_PUNCT>"-"			{
	/* Only a separator directly after an integer; in RANGE_INT a '-'
	 * followed by alphanumerics is matched as a sign instead. */
	BEGIN(RANGE_INT);
	return SIMPLE(TOKEN_HYPHEN);
}

<RANGE_INT,RANGE_PUNCT>","		{
	/* Accepted in both states; an integer is expected afterwards. */
	BEGIN(RANGE_INT);
	return SIMPLE(TOKEN_COMMA);
}

<RANGE_INT,RANGE_PUNCT>"]"		{
	/* Closing bracket: leave range scanning. */
	BEGIN(INITIAL);
	return SIMPLE(TOKEN_RBRACKET);
}

	/*
	 * Error if none of the above while scanning a range (slice).
	 * The offending text is returned as TOKEN_UNPARSED and the two
	 * states keep alternating as they would for valid input.
	 */

<RANGE_PUNCT>[^:\-,\]]+		{
	/* Anything that is not one of the punctuation characters above. */
	BEGIN(RANGE_INT);
	return set_lval_str(TOKEN_UNPARSED, yytext);
}

<RANGE_INT>[+-]?[^[:alnum:]\]]+	{
	/* Optional sign followed by non-alphanumeric text (']' excluded). */
	BEGIN(RANGE_PUNCT);
	return set_lval_str(TOKEN_UNPARSED, yytext);
}

[rR]{0,1}\042			{
	/*
	 * Opening double quote of a string, optionally prefixed with 'r' or
	 * 'R' to request a raw string.
	 *
	 * The way strings are scanned follows the example in the section
	 * "Start Conditions" of the flex 2.5.4 manual, see
	 * http://www.gnu.org/software/flex/manual/html_node/flex_11.html
	 */
	GString *stale = yyextra->quoted_string;

	BEGIN(DQUOTE);

	/*
	 * A buffer may still be attached to the scanner state if an earlier
	 * filter failed to compile because of a missing end quote.  Release
	 * it; the pointer is overwritten right below, so there is no need
	 * to reset it to NULL first.
	 */
	if (stale != NULL) {
		g_string_free(stale, TRUE);
	}
	yyextra->quoted_string = g_string_new("");

	/*
	 * Raw strings work like Python's:
	 *   1) the only two escape sequences are \\ and \";
	 *   2) backslashes are preserved in the result;
	 *   3) double quotes inside the string must be escaped.
	 * As a consequence a raw string cannot end with an odd number of
	 * backslashes.
	 * Example: r"a\b\x12\"\\" is the string (including the implicit NUL
	 * terminator)
	 * {'a', '\\', 'b', '\\', 'x', '1', '2', '\\', '"', '\\', '\\', '\0'}
	 */
	yyextra->raw_string = (yytext[0] == 'r' || yytext[0] == 'R') ? TRUE : FALSE;
}

<DQUOTE><<EOF>>				{
	/*
	 * Unterminated string: the input ended before the closing quote.
	 *
	 * The way unclosed strings are handled follows the example in the
	 * section "End-of-file rules" of the flex 2.5.4 manual, see
	 * http://www.gnu.org/software/flex/manual/html_node/flex_13.html
	 */

	/*
	 * Release the partially collected string here, as the other failure
	 * paths of the string rules do, rather than leaving it attached to
	 * the scanner state.  The NULL check keeps g_string_free() from
	 * being called on a buffer that was already released.
	 */
	if (yyextra->quoted_string != NULL) {
		g_string_free(yyextra->quoted_string, TRUE);
		yyextra->quoted_string = NULL;
	}
	dfilter_fail(yyextra->dfw, "The final quote was missing from a quoted string.");
	return SCAN_FAILED;
}

<DQUOTE>\042			{
	/*
	 * Closing quote: emit the collected text as a TOKEN_STRING and go
	 * back to normal scanning.
	 */
	GString *finished = yyextra->quoted_string;
	int tok_id;

	BEGIN(INITIAL);
	yyextra->quoted_string = NULL;

	/* The token has to be built before the buffer backing it is freed. */
	tok_id = set_lval_str(TOKEN_STRING, finished->str);
	g_string_free(finished, TRUE);
	return tok_id;
}

<DQUOTE>\\[0-7]{1,3} {
	/* Octal escape sequence: a backslash and one to three octal digits. */
	if (!yyextra->raw_string) {
		/* Skip the backslash and decode the digits. */
		unsigned long code = strtoul(&yytext[1], NULL, 8);

		if (code == 0 || code > 0xff) {
			/* Not representable: drop the string and fail the scan. */
			g_string_free(yyextra->quoted_string, TRUE);
			yyextra->quoted_string = NULL;
			if (code == 0) {
				dfilter_fail(yyextra->dfw, "%s (NUL byte) cannot be used with a regular string.", yytext);
			}
			else {
				/* Three octal digits can encode values up to 0777. */
				dfilter_fail(yyextra->dfw, "%s is larger than 255.", yytext);
			}
			return SCAN_FAILED;
		}
		g_string_append_c(yyextra->quoted_string, (gchar) code);
	}
	else {
		/* Raw strings keep the escape sequence verbatim. */
		g_string_append(yyextra->quoted_string, yytext);
	}
}

<DQUOTE>\\x[[:xdigit:]]{1,2} {
	/* Hexadecimal escape sequence: "\x" and one or two hex digits. */
	if (!yyextra->raw_string) {
		/* Skip the "\x" prefix.  With at most two digits the value
		 * cannot exceed 0xff, so only NUL needs to be rejected. */
		unsigned long code = strtoul(&yytext[2], NULL, 16);

		if (code == 0) {
			g_string_free(yyextra->quoted_string, TRUE);
			yyextra->quoted_string = NULL;
			dfilter_fail(yyextra->dfw, "%s (NUL byte) cannot be used with a regular string.", yytext);
			return SCAN_FAILED;
		}
		g_string_append_c(yyextra->quoted_string, (gchar) code);
	}
	else {
		/* Raw strings keep the escape sequence verbatim. */
		g_string_append(yyextra->quoted_string, yytext);
	}
}


<DQUOTE>\\.				{
	/*
	 * Any other backslash escape.  In a regular string the backslash is
	 * dropped and the following character is taken literally; in a raw
	 * string both characters are kept.
	 */
	if (!yyextra->raw_string) {
		g_string_append_c(yyextra->quoted_string, yytext[1]);
	}
	else {
		g_string_append(yyextra->quoted_string, yytext);
	}
}

<DQUOTE>[^\\\042]+			{
	/* Run of characters that are neither a backslash nor a double quote. */
	g_string_append(yyextra->quoted_string, yytext);
}


\047				{
	/*
	 * Opening single quote of a character constant.
	 *
	 * The way quoted text is scanned follows the example in the section
	 * "Start Conditions" of the Flex manual, see
	 * http://flex.sourceforge.net/manual/Start-Conditions.html#Start-Conditions
	 */
	GString *stale = yyextra->quoted_string;

	BEGIN(SQUOTE);

	/*
	 * A buffer may still be attached to the scanner state if an earlier
	 * filter failed to compile because of a missing end quote.  Release
	 * it; the pointer is overwritten right below, so there is no need
	 * to reset it to NULL first.
	 */
	if (stale != NULL) {
		g_string_free(stale, TRUE);
	}

	/* Unlike strings, the collected text keeps its opening quote. */
	yyextra->quoted_string = g_string_new("'");
}

<SQUOTE><<EOF>>				{
	/*
	 * Unterminated character constant: the input ended before the
	 * closing quote.
	 *
	 * The way unclosed quotes are handled follows the example in the
	 * section "End-of-file rules" of the Flex manual, see
	 * http://flex.sourceforge.net/manual/EOF.html#EOF.html
	 */

	/*
	 * Release the partially collected text here, as the failure paths
	 * of the string escape rules do, rather than leaving it attached to
	 * the scanner state.  The NULL check keeps g_string_free() from
	 * being called on a buffer that was already released.
	 */
	if (yyextra->quoted_string != NULL) {
		g_string_free(yyextra->quoted_string, TRUE);
		yyextra->quoted_string = NULL;
	}
	dfilter_fail(yyextra->dfw, "The final quote was missing from a character constant.");
	return SCAN_FAILED;
}

<SQUOTE>\047			{
	/*
	 * Closing quote: append it, emit the collected text (quotes
	 * included) as a TOKEN_CHARCONST and go back to normal scanning.
	 */
	GString *finished = yyextra->quoted_string;
	int tok_id;

	BEGIN(INITIAL);
	yyextra->quoted_string = NULL;

	g_string_append_c(finished, '\'');
	/* The token has to be built before the buffer backing it is freed. */
	tok_id = set_lval_str(TOKEN_CHARCONST, finished->str);
	g_string_free(finished, TRUE);
	return tok_id;
}

<SQUOTE>\\.				|
<SQUOTE>[^\\\047]+			{
	/*
	 * Content of a character constant: either a backslash escape or a
	 * run of characters that are neither a backslash nor a single
	 * quote.  Both are collected verbatim; escapes are not decoded by
	 * the scanner.
	 */
	g_string_append(yyextra->quoted_string, yytext);
}


[-[:alnum:]_\.:]*\/[[:digit:]]+  {
	/*
	 * CIDR notation: an optional address-like prefix, a slash and a
	 * decimal number.  Passed on unparsed as a single token.
	 */
	int tok_id = set_lval_str(TOKEN_UNPARSED, yytext);

	return tok_id;
}

		([.][-+[:alnum:]_:]+)+[.]{0,2} |
[-+[:alnum:]_:]+([.][-+[:alnum:]_:]+)*[.]{0,2} {
	/* Is it a field name or some other value (float, integer, bytes, ...)? */

	/* Trailing dot is allowed for floats, but make sure that trailing ".."
	 * is interpreted as a token on its own. */
	if (strstr(yytext, "..")) {
		yyless(yyleng-2);
	}

	/* No match, so treat it as an unparsed string */
	return set_lval_str(TOKEN_UNPARSED, yytext);
}

. {
	/*
	 * Catch-all for the initial state: any single character (other than
	 * a newline) that no earlier rule matched is passed on unparsed.
	 */
	int tok_id = set_lval_str(TOKEN_UNPARSED, yytext);

	return tok_id;
}


%%

/*
 * Turn diagnostics back on, so we check the code that we've written.
 */
DIAG_ON_FLEX

static int
simple(int token, const char *token_value)
{
	switch (token) {
		case TOKEN_LPAREN:
		case TOKEN_RPAREN:
		case TOKEN_LBRACKET:
		case TOKEN_RBRACKET:
		case TOKEN_LBRACE:
		case TOKEN_RBRACE:
		case TOKEN_COLON:
		case TOKEN_COMMA:
		case TOKEN_DOTDOT:
		case TOKEN_HYPHEN:
		case TOKEN_WHITESPACE:
		case TOKEN_TEST_EQ:
		case TOKEN_TEST_NE:
		case TOKEN_TEST_GT:
		case TOKEN_TEST_GE:
		case TOKEN_TEST_LT:
		case TOKEN_TEST_LE:
		case TOKEN_TEST_BITWISE_AND:
		case TOKEN_TEST_CONTAINS:
		case TOKEN_TEST_MATCHES:
		case TOKEN_TEST_NOT:
		case TOKEN_TEST_AND:
		case TOKEN_TEST_OR:
		case TOKEN_TEST_IN:
			break;
		default:
			ws_assert_not_reached();
	}
	stnode_init(df_lval, STTYPE_UNINITIALIZED, NULL, token_value);
	return token;
}

/*
 * Prepares df_lval for a token whose value is a piece of text.
 *
 * token       - TOKEN_STRING, TOKEN_CHARCONST, TOKEN_UNPARSED or
 *               TOKEN_INTEGER; anything else trips ws_assert_not_reached().
 * token_value - the text; it is handed to stnode_init() both as the node
 *               data and as the token text.
 *               NOTE(review): callers free or reuse this buffer right after
 *               the call, so stnode_init() presumably copies it - confirm.
 *
 * Returns `token` unchanged.
 */
static int
set_lval_str(int token, const char *token_value)
{
	sttype_id_t	node_type;

	if (token == TOKEN_STRING) {
		node_type = STTYPE_STRING;
	}
	else if (token == TOKEN_CHARCONST) {
		node_type = STTYPE_CHARCONST;
	}
	else if (token == TOKEN_UNPARSED) {
		node_type = STTYPE_UNPARSED;
	}
	else if (token == TOKEN_INTEGER) {
		/* Not used in AST. */
		node_type = STTYPE_UNINITIALIZED;
	}
	else {
		ws_assert_not_reached();
	}
	stnode_init(df_lval, node_type, (gpointer)token_value, token_value);
	return token;
}