Tools: Quote some elements in html2text.

Quote <code> spans with backticks and <span class=menuseq> spans with
double quotes.
This commit is contained in:
Gerald Combs 2021-10-08 15:29:42 -07:00 committed by Wireshark GitLab Utility
parent ce0592514c
commit 51e1381b23
2 changed files with 35 additions and 9 deletions

18
NEWS
View File

@ -1,4 +1,4 @@
Wireshark 3.5.1 Release Notes Wireshark 3.7.0 Release Notes
This is an experimental release intended to test new features for This is an experimental release intended to test new features for
Wireshark 3.6. Wireshark 3.6.
@ -123,6 +123,9 @@ Wireshark 3.5.1 Release Notes
• The settings in the 'Import from Hex Dump' dialog is now stored • The settings in the 'Import from Hex Dump' dialog is now stored
in a profile import_hexdump.json file. in a profile import_hexdump.json file.
• Reload Lua plugins has been improved to properly support
FileHandler.
New File Format Decoding Support New File Format Decoding Support
Vector Informatik Binary Log File (BLF) Vector Informatik Binary Log File (BLF)
@ -167,8 +170,9 @@ Wireshark 3.5.1 Release Notes
Wireshark and TShark look in several different locations for Wireshark and TShark look in several different locations for
preference files, plugins, SNMP MIBS, and RADIUS dictionaries. These preference files, plugins, SNMP MIBS, and RADIUS dictionaries. These
locations vary from platform to platform. You can use About → Folders locations vary from platform to platform. You can use "Help About
to find the default locations on your system. Wireshark Folders" or `tshark -G folders` to find the default
locations on your system.
Getting Help Getting Help
@ -185,7 +189,7 @@ Wireshark 3.5.1 Release Notes
A complete FAQ is available on the Wireshark web site[8]. A complete FAQ is available on the Wireshark web site[8].
Last updated 2021-10-03 09:05:36 UTC Last updated 2021-10-08 21:37:06 UTC
References References
@ -193,9 +197,9 @@ Wireshark 3.5.1 Release Notes
.html .html
2. https://www.wireshark.org/docs/wsug_html_chunked/_rtp.html#ChTelRt 2. https://www.wireshark.org/docs/wsug_html_chunked/_rtp.html#ChTelRt
pPlayer pPlayer
3. https://www.wireshark.org/docs/wsug_html_chunked//ChAdvFollowStrea 3. https://www.wireshark.org/docs/wsug_html_chunked/ChAdvFollowStream
mSection.html Section.html
4. https://www.wireshark.org/download.html#thirdparty 4. https://www.wireshark.org/download.html
5. https://ask.wireshark.org/ 5. https://ask.wireshark.org/
6. https://www.wireshark.org/lists/ 6. https://www.wireshark.org/lists/
7. https://gitlab.com/wireshark/wireshark/-/issues 7. https://gitlab.com/wireshark/wireshark/-/issues

View File

@ -44,6 +44,9 @@ class TextHTMLParser(HTMLParser):
self.need_space = False self.need_space = False
# Whether to prevent word-wrapping the contents (for "pre" tag) # Whether to prevent word-wrapping the contents (for "pre" tag)
self.skip_wrap = False self.skip_wrap = False
# Quoting
self.need_quote = False
self.quote_stack = []
# track list items # track list items
self.list_item_prefix = None self.list_item_prefix = None
self.ordered_list_index = None self.ordered_list_index = None
@ -89,6 +92,9 @@ class TextHTMLParser(HTMLParser):
# terminated. # terminated.
if tag == 'br' or tag == 'li': if tag == 'br' or tag == 'li':
self._commit_block('\n') self._commit_block('\n')
if tag == 'code':
self.need_quote = True
self.quote_stack.append('`')
if tag == 'pre': if tag == 'pre':
self.skip_wrap = True self.skip_wrap = True
if tag in ('ol', 'ul'): if tag in ('ol', 'ul'):
@ -116,10 +122,22 @@ class TextHTMLParser(HTMLParser):
self.href = href self.href = href
except IndexError: except IndexError:
self.href = None self.href = None
if tag == 'span':
try:
el_class = [attr[1] for attr in attrs if attr[0] == 'class'][0]
if 'menuseq' in el_class:
sys.stderr.write('menuseq\n')
self.need_quote = True
self.quote_stack.append('"')
except IndexError:
pass
if tag in self.ignore_tags: if tag in self.ignore_tags:
self.ignore_level += 1 self.ignore_level += 1
def handle_data(self, data): def handle_data(self, data):
quote = ''
if self.need_quote:
quote = self.quote_stack[-1]
if self.ignore_level > 0: if self.ignore_level > 0:
return return
elif self.skip_wrap: elif self.skip_wrap:
@ -132,21 +150,25 @@ class TextHTMLParser(HTMLParser):
# For normal text, fold multiple whitespace and strip # For normal text, fold multiple whitespace and strip
# leading and trailing spaces for the whole block (but # leading and trailing spaces for the whole block (but
# keep spaces in the middle). # keep spaces in the middle).
block = '' block = quote
if data.strip() and data[:1].isspace(): if data.strip() and data[:1].isspace():
# Keep spaces in the middle # Keep spaces in the middle
self.need_space = True self.need_space = True
if self.need_space and data.strip() and self.text_block: if self.need_space and data.strip() and self.text_block:
block = ' ' block = ' ' + quote
block += ' '.join(data.split()) block += ' '.join(data.split())
self.need_space = data[-1:].isspace() self.need_space = data[-1:].isspace()
self.text_block += block self.text_block += block
self.need_quote = False
def handle_endtag(self, tag): def handle_endtag(self, tag):
block_elements = 'p li ul pre ol h1 h2 h3 h4 h5 h6' block_elements = 'p li ul pre ol h1 h2 h3 h4 h5 h6'
#block_elements += ' dl dd dt' #block_elements += ' dl dd dt'
if tag in block_elements.split(): if tag in block_elements.split():
self._commit_block() self._commit_block()
if tag in ('code', 'span'):
# XXX This span isn't guaranteed to match its opening.
self.text_block += self.quote_stack.pop()
if tag in ('ol', 'ul'): if tag in ('ol', 'ul'):
self.list_indent_level -= 1 self.list_indent_level -= 1
self.list_item_indent = " " * (self.list_indent_level - 1) self.list_item_indent = " " * (self.list_indent_level - 1)