From 2b5fd572f7c60fb7043c44c4638216db0d7b2c7d Mon Sep 17 00:00:00 2001
From: John Thacker <johnthacker@gmail.com>
Date: Fri, 26 Jan 2024 20:54:21 -0500
Subject: [PATCH] File sets: Match both formats, compressed files

For file sets produced by multiple file captures, match both
the number before time and the newer time before number format.
Distinguish them in the return value, since files of different
formats are not part of the same set.

Also handle files with a compression suffix, as we can
produce that in captures. Since in multi file captures compression
is done when switching files, allow file sets to have a mixture of files
compressed and uncompressed. When doing a multi file capture and
compressing, the last file is not compressed.

Add information to the user guide and release notes

Related to #12371
---
 doc/release-notes.adoc               |   7 +
 docbook/wsug_src/wsug_io.adoc        |   9 +-
 fileset.c                            | 187 ++++++++++++++++-----------
 fileset.h                            |  21 ++-
 ui/qt/models/fileset_entry_model.cpp |   7 +-
 5 files changed, 148 insertions(+), 83 deletions(-)

diff --git a/doc/release-notes.adoc b/doc/release-notes.adoc
index 16aa24cbe0..d5f0bba4fd 100644
--- a/doc/release-notes.adoc
+++ b/doc/release-notes.adoc
@@ -122,6 +122,13 @@ The following features are new (or have been significantly updated) since versio
 
 * Truncated fields in the detail view are now displayed as "Field name […]: data" instead of "Field name [truncated]: data"
 
+* When capturing files in multiple file mode, a pattern that places the date and time
+  before the index number can be used (e.g., foo_20240714110102_00001.pcap instead of
+  foo_00001_20240714110102.pcap). This causes filenames to sort in chronological order
+  across file sets from different captures. The File Set dialog has been updated to
+  handle the new pattern, which tshark has been able to produce since
+  version 3.6.0.
+
 === Removed Features and Support
 
 * The tshark `-G` option with no argument is deprecated and will be removed in
diff --git a/docbook/wsug_src/wsug_io.adoc b/docbook/wsug_src/wsug_io.adoc
index df8fced7c1..5e3fe1dd98 100644
--- a/docbook/wsug_src/wsug_io.adoc
+++ b/docbook/wsug_src/wsug_io.adoc
@@ -703,10 +703,13 @@ some features to handle these file sets in a convenient way.
 
 .How does Wireshark detect the files of a file set?
 ****
-A filename in a file set uses the format Prefix_Number_DateTimeSuffix which
-might look something like `test_00001_20240714183910.pcap`. All files of a file
+A filename in a file set uses the format Prefix_Number_DateTimeSuffix (or,
+in Wireshark 4.4.0 and later, Prefix_DateTime_NumberSuffix) which might
+look something like `test_00001_20240714183910.pcap`. All files of a file
 set share the same prefix (e.g., “test”) and suffix (e.g., “.pcap”) and a
-varying middle part.
+varying middle part. Files may also carry an additional compression suffix
+(e.g., “.gz”) of a type that Wireshark can open; the compression suffix
+does not have to match for all files in a set.
 
 To find the files of a file set, Wireshark scans the directory where the
 currently loaded file resides and checks for files matching the filename pattern
diff --git a/fileset.c b/fileset.c
index f137de3b73..593d7d9f8a 100644
--- a/fileset.c
+++ b/fileset.c
@@ -23,6 +23,8 @@
 #include <wsutil/filesystem.h>
 #include <wsutil/ws_assert.h>
 
+#include <wiretap/wtap.h>
+
 #include <epan/strutil.h>
 
 #include "fileset.h"
@@ -58,104 +60,141 @@ static fileset set = { NULL, NULL};
 #endif /* _WIN32 */
 
 /* is this a probable file of a file set (does the naming pattern match)? */
-gboolean
-fileset_filename_match_pattern(const char *fname)
+/* (on a match, each non-NULL output gets a newly allocated string; suffix has no leading dot) */
+fileset_match_t
+fileset_filename_match_pattern(const char *fname, char **prefix, char **suffix, char **time)
 {
-    char        *pfx;
-    size_t       baselen;
-    size_t      minlen = strlen("_00001_20050418010750");
+    char        *sfx;
     char        *filename;
+    fileset_match_t ret = FILESET_NO_MATCH;
+    static const char *pattern = "(?P<prefix>.*)_\\d{5}_(?P<time>\\d{14})$";   /* number, then time */
+    static const char *pattern2 = "(?P<prefix>.*)_(?P<time>\\d{14})_\\d{5}$";  /* time, then number */
+    static GRegex *regex = NULL;
+    static GRegex *regex2 = NULL;
 
+    if (regex == NULL) {
+        GError *gerr = NULL;
+        regex = g_regex_new(pattern,
+                        (GRegexCompileFlags)(G_REGEX_OPTIMIZE | G_REGEX_ANCHORED),
+                        G_REGEX_MATCH_ANCHORED, &gerr);
+        if (gerr) {
+                ws_warning("failed to compile regex: %s", gerr->message);
+                g_error_free(gerr);
+                regex = NULL;
+                return ret;
+        }
+    }
+    if (regex2 == NULL) {
+        GError *gerr = NULL;
+        regex2 = g_regex_new(pattern2,
+                        (GRegexCompileFlags)(G_REGEX_OPTIMIZE | G_REGEX_ANCHORED),
+                        G_REGEX_MATCH_ANCHORED, &gerr);
+        if (gerr) {
+                ws_warning("failed to compile regex: %s", gerr->message);
+                g_error_free(gerr);
+                regex2 = NULL;
+                return ret;
+        }
+    }
 
     /* d:\dir1\test_00001_20050418010750.cap */
-    filename = g_strdup(get_basename(fname));
+    filename = g_path_get_basename(fname);
 
     /* test_00001_20050418010750.cap */
-    pfx = strrchr(filename, '.');
-    if(pfx == NULL) {  /* suffix is optional */
-        pfx = filename + strlen(filename);
-    }
-    /* test_00001_20050418010750 */
-    *pfx = '\0';
-
-    /* filename long enough? */
-    baselen = strlen(filename);
-    if(baselen < minlen) {
-        g_free(filename);
-        return FALSE;
-    }
-
-    /* there must be two underscores at special places */
-    if(filename[baselen-minlen] != '_' || filename[baselen-minlen+6] != '_') {
-        g_free(filename);
-        return FALSE;
-    }
-
-    /* replace the two underscores by digits */
-    filename[baselen-minlen] = '0';
-    filename[baselen-minlen+6] = '0';
-
-    /* we should have only digits now */
-    while(minlen--) {
-        baselen--;
-
-        if(!g_ascii_isdigit( filename[baselen])) {
-            g_free(filename);
-            return FALSE;
+    sfx = strrchr(filename, '.');
+    if (sfx != NULL) {
+        *sfx++ = '\0'; /* cut the name at the dot; sfx now points at the extension text */
+        GSList *compression_type_extensions = wtap_get_all_compression_type_extensions_list();
+        char *ext = g_ascii_strdown(sfx, -1);
+        for (GSList *compression_extension = compression_type_extensions;
+                compression_extension != NULL;
+                compression_extension = g_slist_next(compression_extension)) {
+            if (g_strcmp0(ext, (const char*)compression_extension->data) == 0) {
+                sfx = strrchr(filename, '.'); /* the real suffix, if any, precedes it */
+                if (sfx != NULL) {
+                    *sfx++ = '\0';
+                }
+                break;
+            }
         }
+        g_free(ext);
+        g_slist_free(compression_type_extensions);
+    }
+    if (sfx == NULL) { /* suffix is optional */
+        sfx = filename + strlen(filename);
+    }
+    /* test_00001_20050418010750 */
+
+    GMatchInfo *match_info;
+    g_regex_match(regex, filename, 0, &match_info);
+    if (g_match_info_matches(match_info)) {
+        if (prefix) {
+            *prefix = g_match_info_fetch_named(match_info, "prefix");
+        }
+        if (time) {
+            *time = g_match_info_fetch_named(match_info, "time");
+        }
+        if (suffix) {
+            *suffix = g_strdup(sfx);
+        }
+        ret = FILESET_NUM_TIME;
+    }
+    g_match_info_free(match_info);
+
+    if (ret == FILESET_NO_MATCH) {
+        g_regex_match(regex2, filename, 0, &match_info);
+        if (g_match_info_matches(match_info)) {
+            if (prefix) {
+                *prefix = g_match_info_fetch_named(match_info, "prefix");
+            }
+            if (time) {
+                *time = g_match_info_fetch_named(match_info, "time");
+            }
+            if (suffix) {
+                *suffix = g_strdup(sfx);
+            }
+            ret = FILESET_TIME_NUM;
+        }
+        g_match_info_free(match_info);
     }
 
     g_free(filename);
 
-    /* ok, seems to be good */
-    return TRUE;
+    return ret;
 }
 
 
-/* test, if both files could be in the same file set */
-/* (the filenames must already be in correct shape) */
+/* test if both files could be in the same file set (same pattern, prefix and suffix) */
+/* (fname2 must already be in correct shape; fname1 is checked here) */
 static gboolean
 fileset_is_file_in_set(const char *fname1, const char *fname2)
 {
-    char        *pfx1;
-    char        *pfx2;
-    char        *dup_f1;
-    char        *dup_f2;
-    size_t       minlen = strlen("_00001_20050418010750");
+    char        *pfx1 = NULL;   /* NULL keeps the g_free() calls below safe */
+    char        *pfx2 = NULL;   /* even when a name does not match */
+    char        *sfx1 = NULL;
+    char        *sfx2 = NULL;
+    fileset_match_t match1;
+    fileset_match_t match2;
+    gboolean    ret = FALSE;
 
+    match1 = fileset_filename_match_pattern(fname1, &pfx1, &sfx1, NULL);
+    if (match1 == FILESET_NO_MATCH) {
+        return FALSE;
+    }
 
+    match2 = fileset_filename_match_pattern(fname2, &pfx2, &sfx2, NULL);
     /* just to be sure ... */
-    ws_assert(fileset_filename_match_pattern(fname1));
-    ws_assert(fileset_filename_match_pattern(fname2));
-
-    dup_f1 = g_strdup(fname1);
-    dup_f2 = g_strdup(fname2);
-
-    pfx1 = strrchr(dup_f1, '.');
-    pfx2 = strrchr(dup_f2, '.');
-    /* suffix is optional */
-    if (!pfx1) pfx1 = dup_f1 + strlen(dup_f1);
-    if (!pfx2) pfx2 = dup_f2 + strlen(dup_f2);
-
-    /* the optional suffix (file extension) must be equal */
-    if(strcmp(pfx1, pfx2) != 0) {
-        g_free(dup_f1);
-        g_free(dup_f2);
-        return FALSE;
+    ws_assert(match2 != FILESET_NO_MATCH);
+    if (match1 == match2 && g_strcmp0(pfx1, pfx2) == 0 && g_strcmp0(sfx1, sfx2) == 0) {
+        ret = TRUE;
     }
 
-    *(pfx1-minlen) = '\0';
-    *(pfx2-minlen) = '\0';
+    g_free(pfx1);
+    g_free(pfx2);
+    g_free(sfx1);
+    g_free(sfx2);
 
-    if(strcmp(dup_f1, dup_f2) != 0) {
-        g_free(dup_f1);
-        g_free(dup_f2);
-        return FALSE;
-    }
-
-    g_free(dup_f1);
-    g_free(dup_f2);
-    return TRUE;
+    return ret;
 }
 
 /* GCompareFunc helper for g_list_find_custom() */
@@ -292,12 +331,12 @@ fileset_add_dir(const char *fname, void *window)
     dirname = g_string_append_c(dirname, G_DIR_SEPARATOR);
 
     /* is the current file probably a part of any fileset? */
-    if(fileset_filename_match_pattern(fname)) {
+    if(fileset_filename_match_pattern(fname, NULL, NULL, NULL)) {
         /* yes, go through the files in the directory and check if the file in question is part of the current file set */
         if ((dir = ws_dir_open(dirname->str, 0, NULL)) != NULL) {
             while ((file = ws_dir_read_name(dir)) != NULL) {
                 name = ws_dir_get_name(file);
-                if(fileset_filename_match_pattern(name) && fileset_is_file_in_set(name, get_basename(fname))) {
+                if(fileset_is_file_in_set(name, get_basename(fname))) {
                     fileset_add_file(dirname->str, name, strcmp(name, get_basename(fname))== 0 /* current */);
                 }
             } /* while */
diff --git a/fileset.h b/fileset.h
index 9155b43fb2..84c075d206 100644
--- a/fileset.h
+++ b/fileset.h
@@ -25,9 +25,26 @@ typedef struct _fileset_entry {
     gboolean current;        /* is this the currently loaded file? */
 } fileset_entry;
 
+typedef enum {
+    FILESET_NO_MATCH,   /* filename follows neither file set naming pattern */
+    FILESET_TIME_NUM,   /* prefix_YYYYMMDDHHMMSS_NNNNN (time before number) */
+    FILESET_NUM_TIME    /* prefix_NNNNN_YYYYMMDDHHMMSS (number before time) */
+} fileset_match_t;
 
-/* helper: is this a probable file of a file set (does the naming pattern match)? */
-extern gboolean fileset_filename_match_pattern(const char *fname);
+/* helper: is this a probable file of a file set (does the naming pattern match)?
+ * Possible naming patterns are prefix_NNNNN_YYYYMMDDHHMMSS.ext[.gz] and
+ * prefix_YYYYMMDDHHMMSS_NNNNN.ext[.gz], where any compression suffix
+ * supported by libwiretap is allowed. The validation is minimal; e.g., the
+ * time is only checked to see if all 14 characters are digits.
+ * Output strings are only set on a match; the caller frees them with g_free().
+ * @param[in] fname The filename to check for a naming pattern.
+ * @param[out] prefix If not NULL and the filename matches, the prefix
+ * @param[out] suffix If not NULL and the filename matches, the suffix
+ * (file extension) not including the compression suffix
+ * @param[out] time If not NULL and the filename matches, the time component
+ * @return The type of pattern match, or FILESET_NO_MATCH.
+ */
+extern fileset_match_t fileset_filename_match_pattern(const char *fname, char **prefix, char **suffix, char **time);
 
 extern void fileset_add_dir(const char *fname, void *window);
 
diff --git a/ui/qt/models/fileset_entry_model.cpp b/ui/qt/models/fileset_entry_model.cpp
index 8c9f504b04..86979e38c2 100644
--- a/ui/qt/models/fileset_entry_model.cpp
+++ b/ui/qt/models/fileset_entry_model.cpp
@@ -121,14 +121,13 @@ void FilesetEntryModel::clear()
 }
 
 QString FilesetEntryModel::nameToDate(const char *name) const {
+    char *date;
     QString dn;
 
-    if (!fileset_filename_match_pattern(name))
+    if (fileset_filename_match_pattern(name, NULL, NULL, &date) == FILESET_NO_MATCH)
         return NULL;
 
-    dn = name;
-    dn.remove(QRegularExpression(".*_"));
-    dn.truncate(14);
+    dn = gchar_free_to_qstring(date);
     dn.insert(4, '-');
     dn.insert(7, '-');
     dn.insert(10, ' ');