HTTP: Speed up chunked Transfer-Encoding on TCP

HTTP/1.1 chunked Transfer-Encoding doesn't have an overall length,
but requires scanning through variable length chunks to find the
end. If we determine that additional segments are needed, and
we have a sequence number (or other identifier) for the message,
store the position of the last chunk size found.

Use this to start scanning at that same offset when the next
segment arrives, reducing the algorithm for determining
whether we have the complete chunked message from O(N^2) to O(N),
which can be significant on captures with many chunks.

This does most of #14382, reducing the length of time to process
a file with two-pass tshark from over 8.5 seconds to under 3 seconds
on my machine. There is still some O(N^2) contribution from the
reassembly code itself with many small fragments (see #17311).

Other dissectors need some small changes to enable this for
HTTP over other transport layers. (TLS would be fairly easy and
is the other important case.)
This commit is contained in:
John Thacker 2022-07-17 17:10:43 -04:00 committed by A Wireshark GitLab Utility
parent 5e04463282
commit 13df9b0b64
6 changed files with 74 additions and 17 deletions

View File

@ -1021,6 +1021,8 @@ get_http_conversation_data(packet_info *pinfo, conversation_t **conversation)
if(!conv_data) {
/* Setup the conversation structure itself */
conv_data = wmem_new0(wmem_file_scope(), http_conv_t);
conv_data->chunk_offsets_fwd = wmem_map_new(wmem_file_scope(), g_int_hash, g_int_equal);
conv_data->chunk_offsets_rev = wmem_map_new(wmem_file_scope(), g_int_hash, g_int_equal);
conversation_add_proto_data(*conversation, proto_http,
conv_data);
@ -1092,7 +1094,8 @@ static http_info_value_t *stat_info;
static int
dissect_http_message(tvbuff_t *tvb, int offset, packet_info *pinfo,
proto_tree *tree, http_conv_t *conv_data,
const char* proto_tag, int proto, gboolean end_of_stream)
const char* proto_tag, int proto, gboolean end_of_stream,
const guint32* const seq)
{
proto_tree *http_tree = NULL;
proto_item *ti = NULL;
@ -1125,6 +1128,22 @@ dissect_http_message(tvbuff_t *tvb, int offset, packet_info *pinfo,
gboolean leading_crlf = FALSE;
http_message_info_t message_info;
wmem_map_t *header_value_map = wmem_map_new(wmem_packet_scope(), g_str_hash, g_str_equal);
int chunk_offset = 0;
wmem_map_t *chunk_map = NULL;
conversation_t *conversation;
conversation = find_or_create_conversation(pinfo);
if (cmp_address(&pinfo->src, conversation_key_addr1(conversation->key_ptr)) == 0 && pinfo->srcport == conversation_key_port1(conversation->key_ptr)) {
chunk_map = conv_data->chunk_offsets_fwd;
} else if (cmp_address(&pinfo->dst, conversation_key_addr1(conversation->key_ptr)) == 0 && pinfo->destport == conversation_key_port1(conversation->key_ptr)) {
chunk_map = conv_data->chunk_offsets_rev;
}
if (seq && chunk_map) {
chunk_offset = GPOINTER_TO_INT(wmem_map_lookup(chunk_map, seq));
/* Returns 0 when there is no entry in the map, as we want. */
}
reported_length = tvb_reported_length_remaining(tvb, offset);
if (reported_length < 1) {
@ -1190,7 +1209,7 @@ dissect_http_message(tvbuff_t *tvb, int offset, packet_info *pinfo,
* desegmentation if we're told to.
*/
if (!req_resp_hdrs_do_reassembly(tvb, offset, pinfo,
http_desegment_headers, http_desegment_body, FALSE)) {
http_desegment_headers, http_desegment_body, FALSE, &chunk_offset)) {
/*
* More data needed for desegmentation.
*/
@ -1252,10 +1271,13 @@ dissect_http_message(tvbuff_t *tvb, int offset, packet_info *pinfo,
}
}
if (!req_resp_hdrs_do_reassembly(tvb, offset, pinfo,
http_desegment_headers, try_desegment_body, http_type == HTTP_RESPONSE)) {
http_desegment_headers, try_desegment_body, http_type == HTTP_RESPONSE, &chunk_offset)) {
/*
* More data needed for desegmentation.
*/
if (seq && chunk_map && chunk_offset) {
wmem_map_insert(chunk_map, seq, GINT_TO_POINTER(chunk_offset));
}
return -1;
}
} else if (have_seen_http) {
@ -3696,7 +3718,7 @@ check_auth_kerberos(proto_item *hdr_item, tvbuff_t *tvb, packet_info *pinfo, con
static void
dissect_http_on_stream(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree,
http_conv_t *conv_data, gboolean end_of_stream)
http_conv_t *conv_data, gboolean end_of_stream, const guint32 *seq)
{
int offset = 0;
int len;
@ -3726,7 +3748,7 @@ dissect_http_on_stream(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree,
}
break;
}
len = dissect_http_message(tvb, offset, pinfo, tree, conv_data, "HTTP", proto_http, end_of_stream);
len = dissect_http_message(tvb, offset, pinfo, tree, conv_data, "HTTP", proto_http, end_of_stream, seq);
if (len == -1)
break;
offset += len;
@ -3781,7 +3803,8 @@ dissect_http_tcp(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, void* data
/* XXX - how to detect end-of-stream without tcpinfo */
end_of_stream = (tcpinfo && IS_TH_FIN(tcpinfo->flags));
dissect_http_on_stream(tvb, pinfo, tree, conv_data, end_of_stream);
dissect_http_on_stream(tvb, pinfo, tree, conv_data, end_of_stream, tcpinfo ? &tcpinfo->seq : NULL);
return tvb_captured_length(tvb);
}
@ -3824,8 +3847,13 @@ dissect_http_tls(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, void* data
/*
* XXX - we need to provide an end-of-stream indication.
* tls should also provide the byte offset inside the stream,
* similar to TCP sequence numbers. It already provides the
* app_handle to heuristic dissectors as the (void *)data,
* so we'd have to change it everywhere or pass it a different
* way (e.g., pinfo->pool proto data).
*/
dissect_http_on_stream(tvb, pinfo, tree, conv_data, FALSE);
dissect_http_on_stream(tvb, pinfo, tree, conv_data, FALSE, NULL);
return tvb_captured_length(tvb);
}
@ -3878,7 +3906,7 @@ dissect_http_sctp(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, void* dat
/*
* XXX - we need to provide an end-of-stream indication.
*/
dissect_http_on_stream(tvb, pinfo, tree, conv_data, FALSE);
dissect_http_on_stream(tvb, pinfo, tree, conv_data, FALSE, NULL);
return tvb_captured_length(tvb);
}
@ -3894,7 +3922,7 @@ dissect_http(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, void* data _U_
* XXX - what should be done about reassembly, pipelining, etc.
* here?
*/
dissect_http_on_stream(tvb, pinfo, tree, conv_data, FALSE);
dissect_http_on_stream(tvb, pinfo, tree, conv_data, FALSE, NULL);
return tvb_captured_length(tvb);
}
@ -3905,7 +3933,7 @@ dissect_ssdp(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, void* data _U_
http_conv_t *conv_data;
conv_data = get_http_conversation_data(pinfo, &conversation);
dissect_http_message(tvb, 0, pinfo, tree, conv_data, "SSDP", proto_ssdp, FALSE);
dissect_http_message(tvb, 0, pinfo, tree, conv_data, "SSDP", proto_ssdp, FALSE, NULL);
return tvb_captured_length(tvb);
}

View File

@ -64,6 +64,10 @@ typedef struct _http_conv_t {
gchar *request_uri;
gchar *full_uri;
/* Used to speed up desegmenting of chunked Transfer-Encoding. */
wmem_map_t *chunk_offsets_fwd;
wmem_map_t *chunk_offsets_rev;
/* Fields related to proxied/tunneled/Upgraded connections. */
guint32 startframe; /* First frame of proxied connection */
int startoffset; /* Offset within the frame where the new protocol begins. */

View File

@ -848,7 +848,7 @@ dissect_rtspmessage(tvbuff_t *tvb, int offset, packet_info *pinfo,
* assumes zero if missing.
*/
if (!req_resp_hdrs_do_reassembly(tvb, offset, pinfo,
rtsp_desegment_headers, rtsp_desegment_body, FALSE)) {
rtsp_desegment_headers, rtsp_desegment_body, FALSE, NULL)) {
/*
* More data needed for desegmentation.
*/

View File

@ -3497,7 +3497,7 @@ dissect_sip_common(tvbuff_t *tvb, int offset, int remaining_length, packet_info
* RFC 3261, Section 20.14. requires Content-Length for TCP.
*/
if (!req_resp_hdrs_do_reassembly(tvb, offset, pinfo,
sip_desegment_headers, sip_desegment_body, FALSE)) {
sip_desegment_headers, sip_desegment_body, FALSE, NULL)) {
/*
* More data needed for desegmentation.
*/

View File

@ -27,7 +27,7 @@
gboolean
req_resp_hdrs_do_reassembly(tvbuff_t *tvb, const int offset, packet_info *pinfo,
const gboolean desegment_headers, const gboolean desegment_body,
gboolean desegment_until_fin)
gboolean desegment_until_fin, int *last_chunk_offset)
{
gint next_offset = offset;
gint next_offset_sav;
@ -164,9 +164,13 @@ req_resp_hdrs_do_reassembly(tvbuff_t *tvb, const int offset, packet_info *pinfo,
} else if (g_ascii_strncasecmp( line, "Transfer-Encoding:", 18) == 0) {
/*
* Find out if this Transfer-Encoding is
* chunked. It should be, since there
* really aren't any other types, but
* RFC 2616 allows for them.
* chunked. It should be, since the
* other types aren't really used, but
* RFC 7230 defines some.
* (RFC 3261 says "chunked" MUST NOT be
* used for SIP, and RFCs 2326 and 7826
* say the same for RTSP, but handle it
* anyway.)
*/
gchar *p;
guint len;
@ -201,6 +205,10 @@ req_resp_hdrs_do_reassembly(tvbuff_t *tvb, const int offset, packet_info *pinfo,
* The above loop ends when we reached the end of the headers, so
* there should be content_length bytes after the 4 terminating bytes
* and next_offset points to after the end of the headers.
*
* XXX: If desegment_headers is FALSE but desegment_body is TRUE,
* then for HTTP Responses we will always set to DESEGMENT_UNTIL_FIN,
* which is probably not what we want.
*/
if (desegment_body) {
if (chunked_encoding) {
@ -216,6 +224,9 @@ req_resp_hdrs_do_reassembly(tvbuff_t *tvb, const int offset, packet_info *pinfo,
* a trailing header, or the start of a new response.
*/
gboolean done_chunking = FALSE;
if (last_chunk_offset != NULL && *last_chunk_offset) {
next_offset = offset + *last_chunk_offset;
}
while (!done_chunking) {
guint chunk_size = 0;
@ -248,6 +259,15 @@ req_resp_hdrs_do_reassembly(tvbuff_t *tvb, const int offset, packet_info *pinfo,
}
/* We have a line with the chunk size in it.*/
/* Save off the offset so we can skip this work next time.
* Use a relative offset, because we might call this
* with a different offset with a reassembled tvb.
*/
if (last_chunk_offset != NULL) {
*last_chunk_offset = next_offset - offset;
}
chunk_string = tvb_get_string_enc(pinfo->pool, tvb, next_offset,
linelen, ENC_ASCII);
c = chunk_string;

View File

@ -25,11 +25,16 @@
* @param desegment_until_fin When desegment_body is enabled and no
* Content-Length header is found, assume that all data following the headers
* are part of the body.
* @param[in,out] last_chunk_offset For the chunked Transfer-Encoding,
* the offset (relative to the initial tvb offset) of the last chunk size
* found. The result can be fed back into a future call in order to skip
* to a later chunk and reduce processing from O(N^2) to O(N). Use 0 for
* the initial call. Only set when chunked TE is found. May be NULL.
* @return TRUE if desegmentation is complete otherwise FALSE
*/
WS_DLL_PUBLIC gboolean
req_resp_hdrs_do_reassembly(tvbuff_t *tvb, const int offset, packet_info *pinfo,
const gboolean desegment_headers, const gboolean desegment_body,
gboolean desegment_until_fin);
gboolean desegment_until_fin, int *last_chunk_offset);
#endif