2008-12-17 19:41:43 +00:00
|
|
|
#!/usr/bin/env python
|
Always use html2text.py for FAQ, improve output
A recent commit broke compilation with Python 3. The original author of
html2text.py is deceased and the fork has increased the number of files
for this "simple" helper.
The html2text.py script in this patch was rewritten and its output
matches with lynx (except for a few newlines around lists). This means
that indentation has been added for headings, paragraphs and lists.
Also, since it was written from scratch, a new license could be chosen
that matches Wireshark.
Since now the in-tree html2text.py script provides nicer output, remove
detection of the alternative programs (elinks, links). lynx/w3m is
somehow still necessary for asciidoc though.
(I also looked into reusing html2text.py for the release notes to
replace asciidoc, but the --format=html output produces different output
(HTML adds a ToC and section numbers). For now still require lynx for
release notes)
Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and
LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and file, writing
to file, pipe and tty. Tested with cmake (Ninja) and autotools on Arch
Linux x86_64. Test:
# For each $PATH per python version, execute (with varying LC_ALL)
help/faq.py -b | tools/html2text.py /dev/stdin | md5sum
help/faq.py -b | tools/html2text.py | md5sum
help/faq.py -b | tools/html2text.py
help/faq.py -b | tools/html2text.py >/dev/null
Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd
Reviewed-on: https://code.wireshark.org/review/7779
Petri-Dish: Peter Wu <peter@lekensteyn.nl>
Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org>
Reviewed-by: Anders Broman <a.broman58@gmail.com>
2015-03-21 10:57:01 +00:00
|
|
|
#
|
|
|
|
# html2text.py - converts HTML to text
|
|
|
|
#
|
|
|
|
# Wireshark - Network traffic analyzer
|
|
|
|
# By Gerald Combs <gerald@wireshark.org>
|
|
|
|
# Copyright 1998 Gerald Combs
|
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or
|
|
|
|
# modify it under the terms of the GNU General Public License
|
|
|
|
# as published by the Free Software Foundation; either version 2
|
|
|
|
# of the License, or (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program; if not, write to the Free Software
|
|
|
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
|
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
__author__ = "Peter Wu <peter@lekensteyn.nl>"
|
|
|
|
__copyright__ = "Copyright 2015, Peter Wu"
|
|
|
|
__license__ = "GPL (v2 or later)"
|
2008-12-17 19:49:18 +00:00
|
|
|
|
2008-12-17 19:41:43 +00:00
|
|
|
# TODO:
|
Always use html2text.py for FAQ, improve output
A recent commit broke compilation with Python 3. The original author of
html2text.py is deceased and the fork has increased the number of files
for this "simple" helper.
The html2text.py script in this patch was rewritten and its output
matches with lynx (except for a few newlines around lists). This means
that indentation has been added for headings, paragraphs and lists.
Also, since it was written from scratch, a new license could be chosen
that matches Wireshark.
Since now the in-tree html2text.py script provides nicer output, remove
detection of the alternative programs (elinks, links). lynx/w3m is
somehow still necessary for asciidoc though.
(I also looked into reusing html2text.py for the release notes to
replace asciidoc, but the --format=html output produces different output
(HTML adds a ToC and section numbers). For now still require lynx for
release notes)
Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and
LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and file, writing
to file, pipe and tty. Tested with cmake (Ninja) and autotools on Arch
Linux x86_64. Test:
# For each $PATH per python version, execute (with varying LC_ALL)
help/faq.py -b | tools/html2text.py /dev/stdin | md5sum
help/faq.py -b | tools/html2text.py | md5sum
help/faq.py -b | tools/html2text.py
help/faq.py -b | tools/html2text.py >/dev/null
Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd
Reviewed-on: https://code.wireshark.org/review/7779
Petri-Dish: Peter Wu <peter@lekensteyn.nl>
Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org>
Reviewed-by: Anders Broman <a.broman58@gmail.com>
2015-03-21 10:57:01 +00:00
|
|
|
# multiple list indentation levels
|
|
|
|
# maybe allow for ascii output instead of utf-8?
|
|
|
|
|
|
|
|
import sys
|
|
|
|
from textwrap import TextWrapper
|
|
|
|
try:
|
|
|
|
from HTMLParser import HTMLParser
|
|
|
|
from htmlentitydefs import name2codepoint
|
|
|
|
except: # Python 3
|
|
|
|
from html.parser import HTMLParser
|
|
|
|
from html.entities import name2codepoint
|
|
|
|
unichr = chr # for html entity handling
|
|
|
|
|
|
|
|
class TextHTMLParser(HTMLParser):
    """Converts a HTML document to text.

    Markup is supplied through ``feed()``.  Text is collected into blocks
    which are word-wrapped and appended to ``output_buffer`` whenever a block
    level element ends.  ``close()`` flushes the pending block and writes the
    complete buffer to standard output, encoded as UTF-8.
    """

    # Closing any of these tags ends the current text block.  Definition
    # lists (dl, dd, dt) are not treated as block elements.
    _BLOCK_ELEMENTS = frozenset('p li ul pre ol h1 h2 h3 h4 h5 h6'.split())

    def __init__(self):
        try:
            # Python 3.4
            HTMLParser.__init__(self, convert_charrefs=True)
        except TypeError:
            # The parser does not accept the keyword.  Only this specific
            # failure is caught so unrelated errors are not hidden.
            HTMLParser.__init__(self)
        # All text, concatenated
        self.output_buffer = ''
        # The current text block which is being constructed
        self.text_block = ''
        # Whether the previous element was terminated with whitespace
        self.need_space = False
        # Whether to prevent word-wrapping the contents (for "pre" tag)
        self.skip_wrap = False
        # track list items
        self.list_item_prefix = None
        self.ordered_list_index = None
        # Indentation (for heading and paragraphs)
        self.indent_levels = [0, 0]

    def _wrap_text(self, text):
        """Wraps text, but additionally indent list items.

        Returns the wrapped lines joined by newlines.  The first line starts
        with the list item prefix when one is active; continuation lines are
        indented to follow it.
        """
        initial_indent = indent = sum(self.indent_levels) * ' '
        if self.list_item_prefix:
            initial_indent += self.list_item_prefix
            indent += ' '
        wrapper = TextWrapper(width=66, break_on_hyphens=False,
                              initial_indent=initial_indent,
                              subsequent_indent=indent)
        return '\n'.join(wrapper.wrap(text))

    def _commit_block(self, newline='\n\n'):
        """Appends the pending text block to the output buffer.

        ``newline`` is the separator written after the block.  Nothing is
        written when the pending block is empty, but the block state is
        reset either way.
        """
        text = self.text_block
        if text:
            if not self.skip_wrap:
                text = self._wrap_text(text)
            self.output_buffer += text + newline
            self.text_block = ''
        self.need_space = False

    def handle_starttag(self, tag, attrs):
        """Updates block, list and indentation state for an opening tag."""
        # end a block of text on <br>, but also flush list items which are
        # not terminated.
        if tag == 'br' or tag == 'li':
            self._commit_block('\n')
        if tag == 'pre':
            self.skip_wrap = True
        # Following list items are numbered.
        if tag == 'ol':
            self.ordered_list_index = 1
        if tag == 'ul':
            self.list_item_prefix = ' * '
        if tag == 'li' and self.ordered_list_index:
            self.list_item_prefix = ' %d. ' % (self.ordered_list_index)
            self.ordered_list_index += 1
        # Headings h1..h6 set the first indentation level and reset the
        # paragraph level.
        if len(tag) == 2 and tag[0] == 'h' and '1' <= tag[1] <= '6':
            self.indent_levels = [int(tag[1]) - 1, 0]
        if tag == 'p':
            self.indent_levels[1] = 1

    def handle_data(self, data):
        """Adds character data to the pending text block."""
        if self.skip_wrap:
            block = data
        else:
            # For normal text, fold multiple whitespace and strip
            # leading and trailing spaces for the whole block (but
            # keep spaces in the middle).
            block = ''
            if data.strip() and data[:1].isspace():
                # Keep spaces in the middle
                self.need_space = True
            if self.need_space and data.strip() and self.text_block:
                block = ' '
            block += ' '.join(data.split())
            self.need_space = data[-1:].isspace()
        self.text_block += block

    def handle_endtag(self, tag):
        """Commits the pending block and resets state for a closing tag."""
        if tag in self._BLOCK_ELEMENTS:
            self._commit_block()
        if tag in ('ol', 'ul'):
            self.list_item_prefix = None
            self.ordered_list_index = None
        if tag == 'pre':
            self.skip_wrap = False

    def handle_charref(self, name):
        """Handles a numeric character reference (``&#65;`` or ``&#x41;``).

        ``name`` is the reference without ``&#`` and ``;``.  A leading ``x``
        or ``X`` marks a hexadecimal value.  A reference that cannot be
        converted to a character is emitted literally instead of aborting the
        conversion.
        """
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            text = '&#' + name + ';'
        self.handle_data(text)

    def handle_entityref(self, name):
        """Handles a named entity reference such as ``&amp;``.

        Names missing from ``name2codepoint`` are emitted literally as
        ``&name;`` rather than raising ``KeyError``.
        """
        try:
            text = unichr(name2codepoint[name])
        except KeyError:
            text = '&' + name + ';'
        self.handle_data(text)

    def close(self):
        """Finishes parsing and writes the converted text to stdout."""
        HTMLParser.close(self)
        self._commit_block()
        byte_output = self.output_buffer.encode('utf-8')
        # Write bytes through the underlying binary buffer when stdout has
        # one; otherwise write the bytes to stdout directly.
        if hasattr(sys.stdout, 'buffer'):
            sys.stdout.buffer.write(byte_output)
        else:
            sys.stdout.write(byte_output)
|
2008-12-17 19:41:43 +00:00
|
|
|
|
|
|
|
|
Always use html2text.py for FAQ, improve output
A recent commit broke compilation with Python 3. The original author of
html2text.py is deceased and the fork has increased the number of files
for this "simple" helper.
The html2text.py script in this patch was rewritten and its output
matches with lynx (except for a few newlines around lists). This means
that indentation has been added for headings, paragraphs and lists.
Also, since it was written from scratch, a new license could be chosen
that matches Wireshark.
Since now the in-tree html2text.py script provides nicer output, remove
detection of the alternative programs (elinks, links). lynx/w3m is
somehow still necessary for asciidoc though.
(I also looked into reusing html2text.py for the release notes to
replace asciidoc, but the --format=html output produces different output
(HTML adds a ToC and section numbers). For now still require lynx for
release notes)
Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and
LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and file, writing
to file, pipe and tty. Tested with cmake (Ninja) and autotools on Arch
Linux x86_64. Test:
# For each $PATH per python version, execute (with varying LC_ALL)
help/faq.py -b | tools/html2text.py /dev/stdin | md5sum
help/faq.py -b | tools/html2text.py | md5sum
help/faq.py -b | tools/html2text.py
help/faq.py -b | tools/html2text.py >/dev/null
Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd
Reviewed-on: https://code.wireshark.org/review/7779
Petri-Dish: Peter Wu <peter@lekensteyn.nl>
Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org>
Reviewed-by: Anders Broman <a.broman58@gmail.com>
2015-03-21 10:57:01 +00:00
|
|
|
def main():
    """Converts HTML from a file or from standard input to text.

    The first command line argument, when present, names the input file;
    otherwise standard input is read.  Input is decoded as UTF-8 in both
    cases and the result is written to standard output when the parser is
    closed.
    """
    # Imported locally so this function brings its own dependency.
    import io

    htmlparser = TextHTMLParser()
    if len(sys.argv) > 1:
        # io.open decodes the file as UTF-8 on Python 2 as well as Python 3,
        # so the parser always receives Unicode text rather than raw bytes.
        with io.open(sys.argv[1], encoding='utf-8') as f:
            for line in f:
                htmlparser.feed(line)
    else:
        f = sys.stdin
        if hasattr(f, 'buffer'):
            # Access raw (byte) buffer in Python 3 instead of decoded one
            f = f.buffer
        # Read stdin as a Unicode string
        htmlparser.feed(f.read().decode('utf-8'))
    htmlparser.close()
|
|
|
|
|
|
|
|
# Run the converter only when executed as a script; main() returns None, so
# the process exits with status 0 unless an exception propagates.
if __name__ == '__main__':
    sys.exit(main())
|