From 51e1381b235b3fad563f5ec7467ea4e001f2605b Mon Sep 17 00:00:00 2001 From: Gerald Combs Date: Fri, 8 Oct 2021 15:29:42 -0700 Subject: [PATCH] Tools: Quote some elements in html2text. Quote code spans with backticks and menuseq spans with double quotes. --- NEWS | 18 +++++++++++------- tools/html2text.py | 26 ++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/NEWS b/NEWS index 7e777425d7..fbb1e342c2 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -Wireshark 3.5.1 Release Notes +Wireshark 3.7.0 Release Notes This is an experimental release intended to test new features for Wireshark 3.6. @@ -123,6 +123,9 @@ Wireshark 3.5.1 Release Notes • The settings in the 'Import from Hex Dump' dialog is now stored in a profile import_hexdump.json file. + • Reload Lua plugins has been improved to properly support + FileHandler. + New File Format Decoding Support Vector Informatik Binary Log File (BLF) @@ -167,8 +170,9 @@ Wireshark 3.5.1 Release Notes Wireshark and TShark look in several different locations for preference files, plugins, SNMP MIBS, and RADIUS dictionaries. These - locations vary from platform to platform. You can use About → Folders - to find the default locations on your system. + locations vary from platform to platform. You can use "Help › About + Wireshark › Folders" or `tshark -G folders` to find the default + locations on your system. Getting Help @@ -185,7 +189,7 @@ Wireshark 3.5.1 Release Notes A complete FAQ is available on the Wireshark web site[8]. - Last updated 2021-10-03 09:05:36 UTC + Last updated 2021-10-08 21:37:06 UTC References @@ -193,9 +197,9 @@ Wireshark 3.5.1 Release Notes .html 2. https://www.wireshark.org/docs/wsug_html_chunked/_rtp.html#ChTelRt pPlayer - 3. https://www.wireshark.org/docs/wsug_html_chunked//ChAdvFollowStrea - mSection.html - 4. https://www.wireshark.org/download.html#thirdparty + 3. https://www.wireshark.org/docs/wsug_html_chunked/ChAdvFollowStream + Section.html + 4. 
https://www.wireshark.org/download.html 5. https://ask.wireshark.org/ 6. https://www.wireshark.org/lists/ 7. https://gitlab.com/wireshark/wireshark/-/issues diff --git a/tools/html2text.py b/tools/html2text.py index 84af66a3df..a8e6bffde5 100755 --- a/tools/html2text.py +++ b/tools/html2text.py @@ -44,6 +44,9 @@ class TextHTMLParser(HTMLParser): self.need_space = False # Whether to prevent word-wrapping the contents (for "pre" tag) self.skip_wrap = False + # Quoting + self.need_quote = False + self.quote_stack = [] # track list items self.list_item_prefix = None self.ordered_list_index = None @@ -89,6 +92,9 @@ class TextHTMLParser(HTMLParser): # terminated. if tag == 'br' or tag == 'li': self._commit_block('\n') + if tag == 'code': + self.need_quote = True + self.quote_stack.append('`') if tag == 'pre': self.skip_wrap = True if tag in ('ol', 'ul'): @@ -116,10 +122,22 @@ class TextHTMLParser(HTMLParser): self.href = href except IndexError: self.href = None + if tag == 'span': + try: + el_class = [attr[1] for attr in attrs if attr[0] == 'class'][0] + if 'menuseq' in el_class: + sys.stderr.write('menuseq\n') + self.need_quote = True + self.quote_stack.append('"') + except IndexError: + pass if tag in self.ignore_tags: self.ignore_level += 1 def handle_data(self, data): + quote = '' + if self.need_quote: + quote = self.quote_stack[-1] if self.ignore_level > 0: return elif self.skip_wrap: @@ -132,21 +150,25 @@ class TextHTMLParser(HTMLParser): # For normal text, fold multiple whitespace and strip # leading and trailing spaces for the whole block (but # keep spaces in the middle). 
- block = '' + block = quote if data.strip() and data[:1].isspace(): # Keep spaces in the middle self.need_space = True if self.need_space and data.strip() and self.text_block: - block = ' ' + block = ' ' + quote block += ' '.join(data.split()) self.need_space = data[-1:].isspace() self.text_block += block + self.need_quote = False def handle_endtag(self, tag): block_elements = 'p li ul pre ol h1 h2 h3 h4 h5 h6' #block_elements += ' dl dd dt' if tag in block_elements.split(): self._commit_block() + if tag in ('code', 'span'): + # XXX This span isn't guaranteed to match its opening. + self.text_block += self.quote_stack.pop() if tag in ('ol', 'ul'): self.list_indent_level -= 1 self.list_item_indent = " " * (self.list_indent_level - 1)