wireshark/tools/html2text.py

#!/usr/bin/env python
#
# html2text.py - converts HTML to text
#
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

from __future__ import unicode_literals

__author__      = "Peter Wu <peter@lekensteyn.nl>"
__copyright__   = "Copyright 2015, Peter Wu"
__license__     = "GPL (v2 or later)"

# TODO:
#   multiple list indentation levels
#   maybe allow for ascii output instead of utf-8?

import sys
from textwrap import TextWrapper
try:
    # Python 2 module names.
    from HTMLParser import HTMLParser
    from htmlentitydefs import name2codepoint
except ImportError:
    # Python 3: the modules were renamed. Only a failed import should select
    # this branch; anything else (e.g. KeyboardInterrupt) must propagate.
    from html.parser import HTMLParser
    from html.entities import name2codepoint
    unichr = chr # for html entity handling

class TextHTMLParser(HTMLParser):
    """Converts a HTML document to text.

    Character data is collected into a "text block". A block is flushed to
    the output buffer (word-wrapped and indented, unless inside <pre>) when
    a block-level element ends, on <br>, and at the start of each <li>.
    close() flushes the remaining text and writes the whole result to
    standard output as UTF-8.
    """

    # End tags which terminate the current text block.
    # (dl, dd and dt are not handled.)
    _BLOCK_ELEMENTS = frozenset('p li ul pre ol h1 h2 h3 h4 h5 h6'.split())

    # Substituted for character references that cannot be represented.
    _REPLACEMENT_CHAR = '\ufffd'

    def __init__(self):
        try:
            # Python 3.4 and newer: let the parser decode character
            # references and hand them to handle_data() directly.
            HTMLParser.__init__(self, convert_charrefs=True)
        except TypeError:
            # Older versions do not accept the convert_charrefs keyword;
            # handle_charref() and handle_entityref() are used instead.
            HTMLParser.__init__(self)
        # All text, concatenated
        self.output_buffer = ''
        # The current text block which is being constructed
        self.text_block = ''
        # Whether the previous element was terminated with whitespace
        self.need_space = False
        # Whether to prevent word-wrapping the contents (for "pre" tag)
        self.skip_wrap = False
        # track list items
        self.list_item_prefix = None
        self.ordered_list_index = None
        # Indentation (for heading and paragraphs)
        self.indent_levels = [0, 0]

    def _wrap_text(self, text):
        """Wraps text, but additionally indent list items.

        The first line of a list item carries the item prefix; continuation
        lines are indented by four extra spaces instead.
        """
        initial_indent = indent = sum(self.indent_levels) * ' '
        if self.list_item_prefix:
            initial_indent += self.list_item_prefix
            indent += '    '
        wrapper = TextWrapper(width=66, break_on_hyphens=False,
            initial_indent=initial_indent, subsequent_indent=indent)
        return '\n'.join(wrapper.wrap(text))

    def _commit_block(self, newline='\n\n'):
        """Appends the current text block (if any) to the output buffer.

        The block is wrapped unless wrapping is disabled, followed by
        `newline`, and then cleared. The pending-space flag is always reset.
        """
        text = self.text_block
        if text:
            if not self.skip_wrap:
                text = self._wrap_text(text)
            self.output_buffer += text + newline
            self.text_block = ''
        self.need_space = False

    def handle_starttag(self, tag, attrs):
        """Updates the list, indentation and wrapping state for a start tag."""
        # end a block of text on <br>, but also flush list items which are not
        # terminated.
        if tag == 'br' or tag == 'li':
            self._commit_block('\n')
        if tag == 'pre':
            self.skip_wrap = True
        # Following list items are numbered.
        if tag == 'ol':
            self.ordered_list_index = 1
        if tag == 'ul':
            self.list_item_prefix = '  * '
        if tag == 'li' and self.ordered_list_index:
            self.list_item_prefix =  ' %d. ' % (self.ordered_list_index)
            self.ordered_list_index += 1
        # Headings h1..h6 set the base indentation and reset the paragraph
        # indentation.
        if len(tag) == 2 and tag[0] == 'h' and '1' <= tag[1] <= '6':
            self.indent_levels = [int(tag[1]) - 1, 0]
        if tag == 'p':
            self.indent_levels[1] = 1

    def handle_data(self, data):
        """Appends character data to the current text block."""
        if self.skip_wrap:
            # Preformatted text is kept verbatim.
            block = data
        else:
            # For normal text, fold multiple whitespace and strip
            # leading and trailing spaces for the whole block (but
            # keep spaces in the middle).
            block = ''
            if data.strip() and data[:1].isspace():
                # Keep spaces in the middle
                self.need_space = True
            if self.need_space and data.strip() and self.text_block:
                block = ' '
            block += ' '.join(data.split())
            # Remember trailing whitespace so the next chunk is separated.
            self.need_space = data[-1:].isspace()
        self.text_block += block

    def handle_endtag(self, tag):
        """Flushes the text block for block elements and resets tag state."""
        if tag in self._BLOCK_ELEMENTS:
            self._commit_block()
        if tag in ('ol', 'ul'):
            self.list_item_prefix = None
            self.ordered_list_index = None
        if tag == 'pre':
            self.skip_wrap = False

    def _char_from_codepoint(self, codepoint):
        """Returns the character for `codepoint`, or U+FFFD if unavailable."""
        # Surrogates cannot be encoded as UTF-8 when the output is written.
        if 0xD800 <= codepoint <= 0xDFFF:
            return self._REPLACEMENT_CHAR
        try:
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # Outside the range supported by this Python build.
            return self._REPLACEMENT_CHAR

    def handle_charref(self, name):
        """Handles a numeric character reference such as &#65; or &#x41;.

        `name` is the reference without "&#" and ";"; hexadecimal references
        start with "x" or "X".
        """
        if name[:1] in ('x', 'X'):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        self.handle_data(self._char_from_codepoint(codepoint))

    def handle_entityref(self, name):
        """Handles a named entity reference such as &amp;.

        Unknown entity names are not an error; they are emitted as "&name;".
        """
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            self.handle_data('&' + name + ';')
        else:
            self.handle_data(self._char_from_codepoint(codepoint))

    def close(self):
        """Finishes parsing and writes the result to stdout as UTF-8."""
        HTMLParser.close(self)
        self._commit_block()
        byte_output = self.output_buffer.encode('utf-8')
        if hasattr(sys.stdout, 'buffer'):
            # Python 3: bytes must go to the underlying binary stream.
            sys.stdout.buffer.write(byte_output)
        else:
            sys.stdout.write(byte_output)

def main():
    """Converts the HTML file named by the first argument, or stdin, to text.

    The input is always decoded as UTF-8. The converted text is written to
    standard output when the parser is closed.
    """
    # io.open() decodes to Unicode on both Python 2 and Python 3, whereas the
    # built-in open() of Python 2 yields undecoded byte strings that break on
    # non-ASCII input once mixed with this module's unicode literals.
    import io

    htmlparser = TextHTMLParser()
    if len(sys.argv) > 1:
        with io.open(sys.argv[1], encoding='utf-8') as f:
            for line in f:
                htmlparser.feed(line)
    else:
        f = sys.stdin
        if hasattr(f, 'buffer'):
            # Access raw (byte) buffer in Python 3 instead of decoded one
            f = f.buffer
        # Read stdin as a Unicode string
        htmlparser.feed(f.read().decode('utf-8'))
    htmlparser.close()

# Run the converter only when executed as a script, and hand main()'s result
# to the interpreter as the exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
Add html2text.py 2.35 from http://www.aaronsw.com/2002/html2text/. svn path=/trunk/; revision=27039 2008-12-17 19:41:43 +00:00			`#!/usr/bin/env python`
Always use html2text.py for FAQ, improve output A recent commit broke compilation with Python 3. The original author of html2text.py is deceased and the fork has increased the number of files for this "simple" helper. The html2text.py script in this patch was rewritten and its output matches with lynx (except for a few newlines around lists). This means that indentation has been added for headings, paragraphs and lists. Also, since it was written from scratch, a new license could be chosen that matches Wireshark. Since now the in-tree html2text.py script provides nicer output, remove detection of the alternative programs (elinks, links). lynx/w3m is somehow still necessary for asciidoc though. (I also looked into reusing html2text.py for the release notes to replace asciidoc, but the --format=html output produces different output (HTML adds a ToC and section numbers). For now still require lynx for release notes) Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and file, writing to file, pipe and tty. Tested with cmake (Ninja) and autotools on Arch Linux x86_64. Test: # For each $PATH per python version, execute (with varying LC_ALL) help/faq.py -b \| tools/html2text.py /dev/stdin \| md5sum help/faq.py -b \| tools/html2text.py \| md5sum help/faq.py -b \| tools/html2text.py help/faq.py -b \| tools/html2text.py >/dev/null Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd Reviewed-on: https://code.wireshark.org/review/7779 Petri-Dish: Peter Wu <peter@lekensteyn.nl> Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org> Reviewed-by: Anders Broman <a.broman58@gmail.com> 2015-03-21 10:57:01 +00:00			`#`
			`# html2text.py - converts HTML to text`
			`#`
			`# Wireshark - Network traffic analyzer`
			`# By Gerald Combs <gerald@wireshark.org>`
			`# Copyright 1998 Gerald Combs`
			`#`
			`# This program is free software; you can redistribute it and/or`
			`# modify it under the terms of the GNU General Public License`
			`# as published by the Free Software Foundation; either version 2`
			`# of the License, or (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program; if not, write to the Free Software`
			`# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.`

			`from __future__ import unicode_literals`

			`__author__ = "Peter Wu <peter@lekensteyn.nl>"`
			`__copyright__ = "Copyright 2015, Peter Wu"`
			`__license__ = "GPL (v2 or later)"`
Update html2text.py to suit our needs. Add spaces in the faq.txt target so to fix a problem with OS X 10.4. Add html2text.py to the end of the faq.txt target. svn path=/trunk/; revision=27040 2008-12-17 19:49:18 +00:00
Add html2text.py 2.35 from http://www.aaronsw.com/2002/html2text/. svn path=/trunk/; revision=27039 2008-12-17 19:41:43 +00:00			`# TODO:`
Always use html2text.py for FAQ, improve output A recent commit broke compilation with Python 3. The original author of html2text.py is deceased and the fork has increased the number of files for this "simple" helper. The html2text.py script in this patch was rewritten and its output matches with lynx (except for a few newlines around lists). This means that indentation has been added for headings, paragraphs and lists. Also, since it was written from scratch, a new license could be chosen that matches Wireshark. Since now the in-tree html2text.py script provides nicer output, remove detection of the alternative programs (elinks, links). lynx/w3m is somehow still necessary for asciidoc though. (I also looked into reusing html2text.py for the release notes to replace asciidoc, but the --format=html output produces different output (HTML adds a ToC and section numbers). For now still require lynx for release notes) Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and file, writing to file, pipe and tty. Tested with cmake (Ninja) and autotools on Arch Linux x86_64. Test: # For each $PATH per python version, execute (with varying LC_ALL) help/faq.py -b \| tools/html2text.py /dev/stdin \| md5sum help/faq.py -b \| tools/html2text.py \| md5sum help/faq.py -b \| tools/html2text.py help/faq.py -b \| tools/html2text.py >/dev/null Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd Reviewed-on: https://code.wireshark.org/review/7779 Petri-Dish: Peter Wu <peter@lekensteyn.nl> Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org> Reviewed-by: Anders Broman <a.broman58@gmail.com> 2015-03-21 10:57:01 +00:00			`# multiple list indentation levels`
			`# maybe allow for ascii output instead of utf-8?`

			`import sys`
			`from textwrap import TextWrapper`
			`try:`
			`from HTMLParser import HTMLParser`
			`from htmlentitydefs import name2codepoint`
			`except: # Python 3`
			`from html.parser import HTMLParser`
			`from html.entities import name2codepoint`
			`unichr = chr # for html entity handling`

			`class TextHTMLParser(HTMLParser):`
			`"""Converts a HTML document to text."""`
			`def __init__(self):`
Add html2text.py 2.35 from http://www.aaronsw.com/2002/html2text/. svn path=/trunk/; revision=27039 2008-12-17 19:41:43 +00:00			`try:`
Always use html2text.py for FAQ, improve output A recent commit broke compilation with Python 3. The original author of html2text.py is deceased and the fork has increased the number of files for this "simple" helper. The html2text.py script in this patch was rewritten and its output matches with lynx (except for a few newlines around lists). This means that indentation has been added for headings, paragraphs and lists. Also, since it was written from scratch, a new license could be chosen that matches Wireshark. Since now the in-tree html2text.py script provides nicer output, remove detection of the alternative programs (elinks, links). lynx/w3m is somehow still necessary for asciidoc though. (I also looked into reusing html2text.py for the release notes to replace asciidoc, but the --format=html output produces different output (HTML adds a ToC and section numbers). For now still require lynx for release notes) Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and file, writing to file, pipe and tty. Tested with cmake (Ninja) and autotools on Arch Linux x86_64. Test: # For each $PATH per python version, execute (with varying LC_ALL) help/faq.py -b \| tools/html2text.py /dev/stdin \| md5sum help/faq.py -b \| tools/html2text.py \| md5sum help/faq.py -b \| tools/html2text.py help/faq.py -b \| tools/html2text.py >/dev/null Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd Reviewed-on: https://code.wireshark.org/review/7779 Petri-Dish: Peter Wu <peter@lekensteyn.nl> Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org> Reviewed-by: Anders Broman <a.broman58@gmail.com> 2015-03-21 10:57:01 +00:00			`# Python 3.4`
			`HTMLParser. __init__(self, convert_charrefs=True)`
			`except:`
			`HTMLParser. __init__(self)`
			`# All text, concatenated`
			`self.output_buffer = ''`
			`# The current text block which is being constructed`
			`self.text_block = ''`
			`# Whether the previous element was terminated with whitespace`
			`self.need_space = False`
			`# Whether to prevent word-wrapping the contents (for "pre" tag)`
			`self.skip_wrap = False`
			`# track list items`
			`self.list_item_prefix = None`
			`self.ordered_list_index = None`
			`# Indentation (for heading and paragraphs)`
			`self.indent_levels = [0, 0]`

			`def _wrap_text(self, text):`
			`"""Wraps text, but additionally indent list items."""`
			`initial_indent = indent = sum(self.indent_levels) * ' '`
			`if self.list_item_prefix:`
			`initial_indent += self.list_item_prefix`
			`indent += ' '`
			`wrapper = TextWrapper(width=66, break_on_hyphens=False,`
			`initial_indent=initial_indent, subsequent_indent=indent)`
			`return '\n'.join(wrapper.wrap(text))`

			`def _commit_block(self, newline='\n\n'):`
			`text = self.text_block`
			`if text:`
			`if not self.skip_wrap:`
			`text = self._wrap_text(text)`
			`self.output_buffer += text + newline`
			`self.text_block = ''`
			`self.need_space = False`

			`def handle_starttag(self, tag, attrs):`
			`# end a block of text on <br>, but also flush list items which are not`
			`# terminated.`
			`if tag == 'br' or tag == 'li':`
			`self._commit_block('\n')`
			`if tag == 'pre':`
			`self.skip_wrap = True`
			`# Following list items are numbered.`
			`if tag == 'ol':`
			`self.ordered_list_index = 1`
			`if tag == 'ul':`
			`self.list_item_prefix = ' * '`
			`if tag == 'li' and self.ordered_list_index:`
			`self.list_item_prefix = ' %d. ' % (self.ordered_list_index)`
			`self.ordered_list_index += 1`
			`if tag[0] == 'h' and len(tag) == 2 and \`
			`(tag[1] >= '1' and tag[1] <= '6'):`
			`self.indent_levels = [int(tag[1]) - 1, 0]`
			`if tag == 'p':`
			`self.indent_levels[1] = 1`
Add html2text.py 2.35 from http://www.aaronsw.com/2002/html2text/. svn path=/trunk/; revision=27039 2008-12-17 19:41:43 +00:00
			`def handle_data(self, data):`
Always use html2text.py for FAQ, improve output A recent commit broke compilation with Python 3. The original author of html2text.py is deceased and the fork has increased the number of files for this "simple" helper. The html2text.py script in this patch was rewritten and its output matches with lynx (except for a few newlines around lists). This means that indentation has been added for headings, paragraphs and lists. Also, since it was written from scratch, a new license could be chosen that matches Wireshark. Since now the in-tree html2text.py script provides nicer output, remove detection of the alternative programs (elinks, links). lynx/w3m is somehow still necessary for asciidoc though. (I also looked into reusing html2text.py for the release notes to replace asciidoc, but the --format=html output produces different output (HTML adds a ToC and section numbers). For now still require lynx for release notes) Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and file, writing to file, pipe and tty. Tested with cmake (Ninja) and autotools on Arch Linux x86_64. Test: # For each $PATH per python version, execute (with varying LC_ALL) help/faq.py -b \| tools/html2text.py /dev/stdin \| md5sum help/faq.py -b \| tools/html2text.py \| md5sum help/faq.py -b \| tools/html2text.py help/faq.py -b \| tools/html2text.py >/dev/null Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd Reviewed-on: https://code.wireshark.org/review/7779 Petri-Dish: Peter Wu <peter@lekensteyn.nl> Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org> Reviewed-by: Anders Broman <a.broman58@gmail.com> 2015-03-21 10:57:01 +00:00			`if self.skip_wrap:`
			`block = data`
			`else:`
			`# For normal text, fold multiple whitespace and strip`
			`# leading and trailing spaces for the whole block (but`
			`# keep spaces in the middle).`
			`block = ''`
			`if data.strip() and data[:1].isspace():`
			`# Keep spaces in the middle`
			`self.need_space = True`
			`if self.need_space and data.strip() and self.text_block:`
			`block = ' '`
			`block += ' '.join(data.split())`
			`self.need_space = data[-1:].isspace()`
			`self.text_block += block`

			`def handle_endtag(self, tag):`
			`block_elements = 'p li ul pre ol h1 h2 h3 h4 h5 h6'`
			`#block_elements += ' dl dd dt'`
			`if tag in block_elements.split():`
			`self._commit_block()`
			`if tag in ('ol', 'ul'):`
			`self.list_item_prefix = None`
			`self.ordered_list_index = None`
			`if tag == 'pre':`
			`self.skip_wrap = False`

			`def handle_charref(self, name):`
			`self.handle_data(unichr(int(name)))`

			`def handle_entityref(self, name):`
			`self.handle_data(unichr(name2codepoint[name]))`
Add html2text.py 2.35 from http://www.aaronsw.com/2002/html2text/. svn path=/trunk/; revision=27039 2008-12-17 19:41:43 +00:00
Always use html2text.py for FAQ, improve output A recent commit broke compilation with Python 3. The original author of html2text.py is deceased and the fork has increased the number of files for this "simple" helper. The html2text.py script in this patch was rewritten and its output matches with lynx (except for a few newlines around lists). This means that indentation has been added for headings, paragraphs and lists. Also, since it was written from scratch, a new license could be chosen that matches Wireshark. Since now the in-tree html2text.py script provides nicer output, remove detection of the alternative programs (elinks, links). lynx/w3m is somehow still necessary for asciidoc though. (I also looked into reusing html2text.py for the release notes to replace asciidoc, but the --format=html output produces different output (HTML adds a ToC and section numbers). For now still require lynx for release notes) Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and file, writing to file, pipe and tty. Tested with cmake (Ninja) and autotools on Arch Linux x86_64. Test: # For each $PATH per python version, execute (with varying LC_ALL) help/faq.py -b \| tools/html2text.py /dev/stdin \| md5sum help/faq.py -b \| tools/html2text.py \| md5sum help/faq.py -b \| tools/html2text.py help/faq.py -b \| tools/html2text.py >/dev/null Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd Reviewed-on: https://code.wireshark.org/review/7779 Petri-Dish: Peter Wu <peter@lekensteyn.nl> Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org> Reviewed-by: Anders Broman <a.broman58@gmail.com> 2015-03-21 10:57:01 +00:00			`def close(self):`
			`HTMLParser.close(self)`
			`self._commit_block()`
			`byte_output = self.output_buffer.encode('utf-8')`
			`if hasattr(sys.stdout, 'buffer'):`
			`sys.stdout.buffer.write(byte_output)`
			`else:`
			`sys.stdout.write(byte_output)`
Add html2text.py 2.35 from http://www.aaronsw.com/2002/html2text/. svn path=/trunk/; revision=27039 2008-12-17 19:41:43 +00:00

Always use html2text.py for FAQ, improve output A recent commit broke compilation with Python 3. The original author of html2text.py is deceased and the fork has increased the number of files for this "simple" helper. The html2text.py script in this patch was rewritten and its output matches with lynx (except for a few newlines around lists). This means that indentation has been added for headings, paragraphs and lists. Also, since it was written from scratch, a new license could be chosen that matches Wireshark. Since now the in-tree html2text.py script provides nicer output, remove detection of the alternative programs (elinks, links). lynx/w3m is somehow still necessary for asciidoc though. (I also looked into reusing html2text.py for the release notes to replace asciidoc, but the --format=html output produces different output (HTML adds a ToC and section numbers). For now still require lynx for release notes) Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and file, writing to file, pipe and tty. Tested with cmake (Ninja) and autotools on Arch Linux x86_64. Test: # For each $PATH per python version, execute (with varying LC_ALL) help/faq.py -b \| tools/html2text.py /dev/stdin \| md5sum help/faq.py -b \| tools/html2text.py \| md5sum help/faq.py -b \| tools/html2text.py help/faq.py -b \| tools/html2text.py >/dev/null Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd Reviewed-on: https://code.wireshark.org/review/7779 Petri-Dish: Peter Wu <peter@lekensteyn.nl> Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org> Reviewed-by: Anders Broman <a.broman58@gmail.com> 2015-03-21 10:57:01 +00:00			`def main():`
			`htmlparser = TextHTMLParser()`
			`if len(sys.argv) > 1:`
			`if sys.version_info[0] >= 3:`
			`# Python 3: read file as utf-8`
			`kwargs = { 'encoding': 'utf-8' }`
Add html2text.py 2.35 from http://www.aaronsw.com/2002/html2text/. svn path=/trunk/; revision=27039 2008-12-17 19:41:43 +00:00			`else:`
Always use html2text.py for FAQ, improve output A recent commit broke compilation with Python 3. The original author of html2text.py is deceased and the fork has increased the number of files for this "simple" helper. The html2text.py script in this patch was rewritten and its output matches with lynx (except for a few newlines around lists). This means that indentation has been added for headings, paragraphs and lists. Also, since it was written from scratch, a new license could be chosen that matches Wireshark. Since now the in-tree html2text.py script provides nicer output, remove detection of the alternative programs (elinks, links). lynx/w3m is somehow still necessary for asciidoc though. (I also looked into reusing html2text.py for the release notes to replace asciidoc, but the --format=html output produces different output (HTML adds a ToC and section numbers). For now still require lynx for release notes) Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and file, writing to file, pipe and tty. Tested with cmake (Ninja) and autotools on Arch Linux x86_64. Test: # For each $PATH per python version, execute (with varying LC_ALL) help/faq.py -b \| tools/html2text.py /dev/stdin \| md5sum help/faq.py -b \| tools/html2text.py \| md5sum help/faq.py -b \| tools/html2text.py help/faq.py -b \| tools/html2text.py >/dev/null Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd Reviewed-on: https://code.wireshark.org/review/7779 Petri-Dish: Peter Wu <peter@lekensteyn.nl> Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org> Reviewed-by: Anders Broman <a.broman58@gmail.com> 2015-03-21 10:57:01 +00:00			`kwargs = {}`
			`with open(sys.argv[1], **kwargs) as f:`
			`for line in f:`
			`htmlparser.feed(line)`
Add html2text.py 2.35 from http://www.aaronsw.com/2002/html2text/. svn path=/trunk/; revision=27039 2008-12-17 19:41:43 +00:00			`else:`
Always use html2text.py for FAQ, improve output A recent commit broke compilation with Python 3. The original author of html2text.py is deceased and the fork has increased the number of files for this "simple" helper. The html2text.py script in this patch was rewritten and its output matches with lynx (except for a few newlines around lists). This means that indentation has been added for headings, paragraphs and lists. Also, since it was written from scratch, a new license could be chosen that matches Wireshark. Since now the in-tree html2text.py script provides nicer output, remove detection of the alternative programs (elinks, links). lynx/w3m is somehow still necessary for asciidoc though. (I also looked into reusing html2text.py for the release notes to replace asciidoc, but the --format=html output produces different output (HTML adds a ToC and section numbers). For now still require lynx for release notes) Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and file, writing to file, pipe and tty. Tested with cmake (Ninja) and autotools on Arch Linux x86_64. Test: # For each $PATH per python version, execute (with varying LC_ALL) help/faq.py -b \| tools/html2text.py /dev/stdin \| md5sum help/faq.py -b \| tools/html2text.py \| md5sum help/faq.py -b \| tools/html2text.py help/faq.py -b \| tools/html2text.py >/dev/null Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd Reviewed-on: https://code.wireshark.org/review/7779 Petri-Dish: Peter Wu <peter@lekensteyn.nl> Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org> Reviewed-by: Anders Broman <a.broman58@gmail.com> 2015-03-21 10:57:01 +00:00			`f = sys.stdin`
			`if hasattr(f, 'buffer'):`
			`# Access raw (byte) buffer in Python 3 instead of decoded one`
			`f = f.buffer`
			`# Read stdin as as Unicode string`
			`htmlparser.feed(f.read().decode('utf-8'))`
			`htmlparser.close()`

			`if __name__ == '__main__':`
			`sys.exit(main())`