Always use html2text.py for FAQ, improve output

A recent commit broke compilation with Python 3. The original author of
html2text.py is deceased and the fork has increased the number of files
for this "simple" helper.

The html2text.py script in this patch was rewritten and its output
matches with lynx (except for a few newlines around lists). This means
that indentation has been added for headings, paragraphs and lists.
Also, since it was written from scratch, a new license could be chosen
that matches Wireshark.

Since now the in-tree html2text.py script provides nicer output, remove
detection of the alternative programs (elinks, links). lynx/w3m is
somehow still necessary for asciidoc though.

(I also looked into reusing html2text.py for the release notes to
replace asciidoc, but asciidoc's --format=html output differs: it adds
a ToC and section numbers. For now, lynx is still required for the
release notes.)

Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and
LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and file, writing
to file, pipe and tty. Tested with cmake (Ninja) and autotools on Arch
Linux x86_64. Test:

    # For each $PATH per python version, execute (with varying LC_ALL)
    help/faq.py -b | tools/html2text.py /dev/stdin | md5sum
    help/faq.py -b | tools/html2text.py | md5sum
    help/faq.py -b | tools/html2text.py
    help/faq.py -b | tools/html2text.py >/dev/null

Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd
Reviewed-on: https://code.wireshark.org/review/7779
Petri-Dish: Peter Wu <peter@lekensteyn.nl>
Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org>
Reviewed-by: Anders Broman <a.broman58@gmail.com>
This commit is contained in:
Peter Wu 2015-03-21 11:57:01 +01:00 committed by Anders Broman
parent 83b6338673
commit 68698db8cc
10 changed files with 168 additions and 566 deletions

View File

@@ -1416,10 +1416,9 @@ else()
endforeach() endforeach()
endif(WIN32) endif(WIN32)
add_custom_command(TARGET copy_data_files PRE_BUILD add_custom_command(TARGET copy_data_files PRE_BUILD
COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/help/faq.py > faq.tmp.html COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/help/faq.py -b > faq.tmp.html
COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/html2text.py COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/html2text.py
--width=72 --no-links faq.tmp.html faq.tmp.html > "${DATAFILE_DIR}/help/faq.txt"
> "${DATAFILE_DIR}/help/faq.txt"
COMMAND ${CMAKE_COMMAND} -E remove faq.tmp.html COMMAND ${CMAKE_COMMAND} -E remove faq.tmp.html
) )

View File

@@ -24,7 +24,6 @@ covered by other licenses that are not themselves directly compatible with the
GPLv2. This is OK, as only the tools themselves are licensed this way, the GPLv2. This is OK, as only the tools themselves are licensed this way, the
output of the tools is not considered a derived work, and so can be safely output of the tools is not considered a derived work, and so can be safely
licensed for Wireshark's use. An incomplete selection of these tools includes: licensed for Wireshark's use. An incomplete selection of these tools includes:
- the html2text utility (tools/html2text.py) is licensed under the GPLv3.
- the pidl utility (tools/pidl) is licensed under the GPLv3+. - the pidl utility (tools/pidl) is licensed under the GPLv3+.
Parts of Wireshark can be built and distributed as libraries. These Parts of Wireshark can be built and distributed as libraries. These

View File

@@ -1,6 +1,6 @@
# #
# - Find unix commands from cygwin # - Find unix commands from cygwin
# This module looks for some usual Unix commands. # This module looks for lynx (used by asciidoc)
# #
INCLUDE(FindCygwin) INCLUDE(FindCygwin)
@@ -8,9 +8,6 @@ INCLUDE(FindCygwin)
FIND_PROGRAM(LYNX_EXECUTABLE FIND_PROGRAM(LYNX_EXECUTABLE
NAMES NAMES
lynx lynx
elinks
links
true
PATHS PATHS
${CYGWIN_INSTALL_PATH}/bin ${CYGWIN_INSTALL_PATH}/bin
/bin /bin
@@ -23,18 +20,3 @@ INCLUDE(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(LYNX DEFAULT_MSG LYNX_EXECUTABLE) FIND_PACKAGE_HANDLE_STANDARD_ARGS(LYNX DEFAULT_MSG LYNX_EXECUTABLE)
MARK_AS_ADVANCED(LYNX_EXECUTABLE) MARK_AS_ADVANCED(LYNX_EXECUTABLE)
# Convert html to text
IF(LYNX_EXECUTABLE MATCHES lynx)
# (See Bug # 1446 for note re 'force-html' below)
set(HTML2TXT "lynx -dump -width=72 -nolist -stdin -force-html")
ELSEIF(LYNX_EXECUTABLE MATCHES elinks)
set(HTML2TXT "elinks -dump -dump-width 72")
ELSEIF(LYNX_EXECUTABLE MATCHES links)
set(HTML2TXT "links -dump -width 72")
ELSEIF(LYNX_EXECUTABLE MATCHES true)
set(HTML2TXT "true")
ELSE()
message(ERROR "Should never be reached - please report!")
ENDIF()
message(STATUS "html2text: ${HTML2TXT}")

View File

@@ -79,8 +79,7 @@ PROGRAM_FILES=$(PROGRAMFILES)
PROGRAM_FILES_W6432=$(PROGRAMW6432) PROGRAM_FILES_W6432=$(PROGRAMW6432)
# #
# Location of the "tools" directory. This affects HTML2TXT below and should # Location of the "tools" directory. This affects the path to textify.ps1
# be overridden by makefiles in any subdirectories that use HTML2TXT.
!IFNDEF TOOLS_DIR !IFNDEF TOOLS_DIR
TOOLS_DIR=tools TOOLS_DIR=tools
!ENDIF !ENDIF
@@ -1321,17 +1320,6 @@ FOP=$(WIRESHARK_LIB_DIR)\fop-1.0\fop.bat
# Additional options to fop. # Additional options to fop.
FOP_OPTS=-Xmx256m FOP_OPTS=-Xmx256m
# html to text converter for text version of release notes, e.g. elinks.
# This could also be "lynx", or "true" if neither elinks nor lynx is installed
# (cygwin: lynx works, elinks not available, links and true doesn't produce output)
#HTML2TXT=elinks -dump -dump-width 72
##HTML2TXT=links -dump -width 72 ## XXX: Fails: For links -dump requires 'url' (filename) arg.
#HTML2TXT=lynx -dump -width=72 -nolist -stdin
!IFNDEF HTML2TXT
HTML2TXT=$(PYTHON) $(TOOLS_DIR)\html2text.py --width=72 --no-links
!ENDIF
# the XSL processor (part of cygwin's libxslt package) # the XSL processor (part of cygwin's libxslt package)
XSLTPROC="xsltproc" XSLTPROC="xsltproc"

View File

@@ -760,23 +760,19 @@ AC_PATH_PROG(A2X, a2x)
AC_CHECK_PROG(HAVE_A2X, a2x, "yes", "no") AC_CHECK_PROG(HAVE_A2X, a2x, "yes", "no")
AM_CONDITIONAL(HAVE_A2X, test x$HAVE_A2X = xyes) AM_CONDITIONAL(HAVE_A2X, test x$HAVE_A2X = xyes)
# Want to control a tape drive? Use mt. Want to convert HTML to text?
# Uhhhhh... elinks? lynx? w3m? pandoc? html2text?
AC_PATH_PROG(ELINKS, elinks)
AC_CHECK_PROG(HAVE_ELINKS, elinks, "yes", "no")
AM_CONDITIONAL(HAVE_ELINKS, test x$HAVE_ELINKS = xyes)
# Check for fop (translate .fo to e.g. pdf) # Check for fop (translate .fo to e.g. pdf)
AC_PATH_PROG(FOP, fop) AC_PATH_PROG(FOP, fop)
AC_CHECK_PROG(HAVE_FOP, fop, "yes", "no") AC_CHECK_PROG(HAVE_FOP, fop, "yes", "no")
AM_CONDITIONAL(HAVE_FOP, test x$HAVE_FOP = xyes) AM_CONDITIONAL(HAVE_FOP, test x$HAVE_FOP = xyes)
# Check for lynx (html -> text) # TODO: HAVE_LYNX and HAVE_W3M are unused. Maybe require one of them
# to be found when a2x is enabled? Otherwise it will fail later...
# Check for lynx (asciidoc text format from html)
AC_PATH_PROG(LYNX, lynx) AC_PATH_PROG(LYNX, lynx)
AC_CHECK_PROG(HAVE_LYNX, lynx, "yes", "no") AC_CHECK_PROG(HAVE_LYNX, lynx, "yes", "no")
AM_CONDITIONAL(HAVE_LYNX, test x$HAVE_LYNX = xyes) AM_CONDITIONAL(HAVE_LYNX, test x$HAVE_LYNX = xyes)
# Check for w3m (html -> text) # Check for w3m (asciidoc text format from html)
AC_PATH_PROG(W3M, w3m) AC_PATH_PROG(W3M, w3m)
AC_CHECK_PROG(HAVE_W3M, w3m, "yes", "no") AC_CHECK_PROG(HAVE_W3M, w3m, "yes", "no")
AM_CONDITIONAL(HAVE_W3M, test x$HAVE_W3M = xyes) AM_CONDITIONAL(HAVE_W3M, test x$HAVE_W3M = xyes)

View File

@@ -18,26 +18,6 @@ A2X_TEXT_OPTS=
A2X_TEXT_OPTS+="--lynx" A2X_TEXT_OPTS+="--lynx"
#endif #endif
# html to text converter for text version of release notes, e.g. elinks.
# This could also be "lynx", or "true" if neither elinks nor lynx is installed
# (See Bug # 1446 for note re 'force-html' below)
# Sorry about the indenting, but that's what automake requires...
if HAVE_ELINKS
HTML2TXT=$(ELINKS) -dump -dump-width 72
## links: -dump requires 'url' argument (as opposed to elinks & lynx)
## (Rather than fixing things we'll just disable the use of links).
##else
##if HAVE_LINKS
##HTML2TXT=$(LINKS) -dump -width 72
else
if HAVE_LYNX
HTML2TXT=$(LYNX) -dump -width=72 -nolist -stdin -force-html
else
HTML2TXT="true"
endif
##endif
endif
############### YOU SHOULDN'T HAVE TO EDIT ANYTHING BELOW THIS LINE! ################ ############### YOU SHOULDN'T HAVE TO EDIT ANYTHING BELOW THIS LINE! ################
include Makefile.common include Makefile.common

View File

@@ -43,13 +43,8 @@ CLEANFILES = faq.txt
MAINTAINERCLEANFILES = \ MAINTAINERCLEANFILES = \
Makefile.in Makefile.in
# Try our best to convert the FAQ to text. # Convert the FAQ to text.
# The output of html2text.py isn't as pretty as elinks, links, or lynx. If that ever changes, we
# can use it exclusively.
faq.txt: $(srcdir)/faq.py faq.txt: $(srcdir)/faq.py
$(AM_V_GEN)$(srcdir)/faq.py >$@.tmp && \ $(AM_V_GEN)$(srcdir)/faq.py -b >$@.tmp && \
command -v elinks > /dev/null && elinks -dump -dump-width 72 -no-numbering -no-references < $@.tmp > $@ || \ $(srcdir)/../tools/html2text.py $@.tmp > $@ && \
command -v links > /dev/null && links -width 72 -html-numbered-links 0 -dump $@.tmp > $@ || \
command -v lynx > /dev/null && lynx -dump -width=72 -nolist -stdin -force-html < $@.tmp > $@ || \
$(srcdir)/../tools/html2text.py --width=72 --no-links $@.tmp > $@ && \
rm -f $@.tmp rm -f $@.tmp

View File

@@ -10,7 +10,7 @@ include ..\config.nmake
all: faq.txt all: faq.txt
faq.txt: faq.py faq.txt: faq.py
$(PYTHON) faq.py | $(HTML2TXT) > $@ $(PYTHON) faq.py -b | $(PYTHON) $(TOOLS_DIR)\html2text.py > $@
clean: clean:
rm -rf faq.txt rm -rf faq.txt

View File

@@ -203,9 +203,6 @@ PATH_SPECIFIC_WHITELISTED_LICENSES = {
'tools/pidl': [ 'tools/pidl': [
'UNKNOWN', 'UNKNOWN',
], ],
'tools/html2text.py': [
'UNKNOWN',
],
'tools/lemon': [ 'tools/lemon': [
'UNKNOWN', 'UNKNOWN',
], ],

View File

@@ -1,504 +1,170 @@
#!/usr/bin/env python #!/usr/bin/env python
"""html2text: Turn HTML into equivalent Markdown-structured text.""" #
__version__ = "2.35-Wireshark" # html2text.py - converts HTML to text
__author__ = "Aaron Swartz (me@aaronsw.com)" #
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." # Wireshark - Network traffic analyzer
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"] # By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# NOTE: from __future__ import unicode_literals
# This is a modified version of html2text.py from http://www.aaronsw.com/2002/html2text/
# Changes: __author__ = "Peter Wu <peter@lekensteyn.nl>"
# Options can now be configured from the command line. __copyright__ = "Copyright 2015, Peter Wu"
# SKIP_LINKS and INPUT_ENCODING options have been added. __license__ = "GPL (v2 or later)"
# The script now requires Python 2.3
# TODO: # TODO:
# Support decoded entities with unifiable. # multiple list indentation levels
# Relative URL resolution # maybe allow for ascii output instead of utf-8?
# Indent sections and lists similar to elinks/links/lynx
if not hasattr(__builtins__, 'True'): True, False = 1, 0 import sys
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types from textwrap import TextWrapper
import sgmllib try:
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') from HTMLParser import HTMLParser
from optparse import OptionParser from htmlentitydefs import name2codepoint
except: # Python 3
from html.parser import HTMLParser
from html.entities import name2codepoint
unichr = chr # for html entity handling
try: from textwrap import wrap class TextHTMLParser(HTMLParser):
except: pass """Converts a HTML document to text."""
def __init__(self):
oparser = OptionParser()
options = None
args = None
oparser.add_option(
"--force-unicode",
action="store_true",
dest="UNICODE_SNOB",
default=False,
help="Use Unicode characters instead of their ascii psuedo-replacements. [default: False]",
)
oparser.add_option(
"--links-after-paragraphs",
action="store_true",
dest="LINKS_EACH_PARAGRAPH",
default=False,
help="Put the links after each paragraph instead of at the end. [default: False]",
)
oparser.add_option(
"--width",
type="int",
dest="BODY_WIDTH",
default=78,
help="Wrap long lines at position. 0 for no wrapping. Requires Python 2.3. [default: 78 characters]",
)
oparser.add_option(
"--no-internal-links",
action="store_true",
dest="SKIP_INTERNAL_LINKS",
default=False,
help='''Don't show internal links (href="#local-anchor"). Corresponding link targets won't be visible in the plain text file anyway. [default: False]''',
)
oparser.add_option(
"--no-links",
action="store_true",
dest="SKIP_LINKS",
default=False,
help='''Don't show links. [default: False]''',
)
oparser.add_option(
"--input-encoding",
type="string",
dest="INPUT_ENCODING",
default='utf-8',
help='''Force the encoding of the input file. [default: utf-8]''',
)
### Entity Nonsense ###
def name2cp(k):
if k == 'apos': return ord("'")
if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
return htmlentitydefs.name2codepoint[k]
else:
k = htmlentitydefs.entitydefs[k]
if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
return ord(codecs.latin_1_decode(k)[0])
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
unifiable_n = {}
for k in unifiable.keys():
unifiable_n[name2cp(k)] = unifiable[k]
def charref(name):
global options
if name[0] in ['x','X']:
c = int(name[1:], 16)
else:
c = int(name)
if not options.UNICODE_SNOB and c in unifiable_n.keys():
return unifiable_n[c]
else:
return unichr(c)
def entityref(c):
global options
if not options.UNICODE_SNOB and c in unifiable.keys():
return unifiable[c]
else:
try: name2cp(c)
except KeyError: return "&" + c
else: return unichr(name2cp(c))
def replaceEntities(s):
s = s.group(1)
if s[0] == "#":
return charref(s[1:])
else: return entityref(s)
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(s):
return r_unescape.sub(replaceEntities, s)
def fixattrs(attrs):
# Fix bug in sgmllib.py
if not attrs: return attrs
newattrs = []
for attr in attrs:
newattrs.append((attr[0], unescape(attr[1])))
return newattrs
### End Entity Nonsense ###
def onlywhite(line):
"""Return true if the line does only consist of whitespace characters."""
for c in line:
if c is not ' ' and c is not ' ':
return c is ' '
return line
def optwrap(text):
"""Wrap all paragraphs in the provided text."""
global options
if not options.BODY_WIDTH:
return text
assert wrap, "Requires Python 2.3."
result = ''
newlines = 0
for para in text.split("\n"):
if len(para) > 0:
if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
for line in wrap(para, options.BODY_WIDTH):
result += line + "\n"
result += "\n"
newlines = 2
else:
if not onlywhite(para):
result += para + "\n"
newlines = 1
else:
if newlines < 2:
result += "\n"
newlines += 1
return result
def hn(tag):
if tag[0] == 'h' and len(tag) == 2:
try: try:
n = int(tag[1]) # Python 3.4
if n in range(1, 10): return n HTMLParser. __init__(self, convert_charrefs=True)
except ValueError: return 0 except:
HTMLParser. __init__(self)
# All text, concatenated
self.output_buffer = ''
# The current text block which is being constructed
self.text_block = ''
# Whether the previous element was terminated with whitespace
self.need_space = False
# Whether to prevent word-wrapping the contents (for "pre" tag)
self.skip_wrap = False
# track list items
self.list_item_prefix = None
self.ordered_list_index = None
# Indentation (for heading and paragraphs)
self.indent_levels = [0, 0]
class _html2text(sgmllib.SGMLParser): def _wrap_text(self, text):
def __init__(self, out=sys.stdout.write): """Wraps text, but additionally indent list items."""
sgmllib.SGMLParser.__init__(self) initial_indent = indent = sum(self.indent_levels) * ' '
if self.list_item_prefix:
if out is None: self.out = self.outtextf initial_indent += self.list_item_prefix
else: self.out = out indent += ' '
self.outtext = u'' wrapper = TextWrapper(width=66, break_on_hyphens=False,
self.quiet = 0 initial_indent=initial_indent, subsequent_indent=indent)
self.p_p = 0 return '\n'.join(wrapper.wrap(text))
self.outcount = 0
self.start = 1
self.space = 0
self.a = []
self.astack = []
self.acount = 0
self.list = []
self.blockquote = 0
self.pre = 0
self.startpre = 0
self.lastWasNL = 0
self.abbr_title = None # current abbreviation definition
self.abbr_data = None # last inner HTML (for abbr being defined)
self.abbr_list = {} # stack of abbreviations to write later
def outtextf(self, s):
self.outtext += s
def close(self):
sgmllib.SGMLParser.close(self)
self.pbr()
self.o('', 0, 'end')
return self.outtext
def handle_charref(self, c):
self.o(charref(c))
def handle_entityref(self, c): def _commit_block(self, newline='\n\n'):
self.o(entityref(c)) text = self.text_block
if text:
def unknown_starttag(self, tag, attrs): if not self.skip_wrap:
self.handle_tag(tag, attrs, 1) text = self._wrap_text(text)
self.output_buffer += text + newline
def unknown_endtag(self, tag): self.text_block = ''
self.handle_tag(tag, None, 0) self.need_space = False
def previousIndex(self, attrs):
""" returns the index of certain set of attributes (of a link) in the
self.a list
If the set of attributes is not found, returns None
"""
if not attrs.has_key('href'): return None
i = -1
for a in self.a:
i += 1
match = 0
if a.has_key('href') and a['href'] == attrs['href']:
if a.has_key('title') or attrs.has_key('title'):
if (a.has_key('title') and attrs.has_key('title') and
a['title'] == attrs['title']):
match = True
else:
match = True
if match: return i def handle_starttag(self, tag, attrs):
# end a block of text on <br>, but also flush list items which are not
def handle_tag(self, tag, attrs, start): # terminated.
global options if tag == 'br' or tag == 'li':
attrs = fixattrs(attrs) self._commit_block('\n')
if tag == 'pre':
if hn(tag): self.skip_wrap = True
self.p() # Following list items are numbered.
if start: self.o(hn(tag)*"#" + ' ') if tag == 'ol':
self.ordered_list_index = 1
if tag in ['p', 'div']: self.p() if tag == 'ul':
self.list_item_prefix = ' * '
if tag == "br" and start: self.o(" \n") if tag == 'li' and self.ordered_list_index:
self.list_item_prefix = ' %d. ' % (self.ordered_list_index)
if tag == "hr" and start: self.ordered_list_index += 1
self.p() if tag[0] == 'h' and len(tag) == 2 and \
self.o("* * *") (tag[1] >= '1' and tag[1] <= '6'):
self.p() self.indent_levels = [int(tag[1]) - 1, 0]
if tag == 'p':
if tag in ["head", "style", 'script']: self.indent_levels[1] = 1
if start: self.quiet += 1
else: self.quiet -= 1
if tag in ["body"]:
self.quiet = 0 # sites like 9rules.com never close <head>
if tag == "blockquote":
if start:
self.p(); self.o('> ', 0, 1); self.start = 1
self.blockquote += 1
else:
self.blockquote -= 1
self.p()
if tag in ['em', 'i', 'u']: self.o("_")
if tag in ['strong', 'b']: self.o("**")
if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
if tag == "abbr":
if start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
self.abbr_title = None
self.abbr_data = ''
if attrs.has_key('title'):
self.abbr_title = attrs['title']
else:
if self.abbr_title != None:
self.abbr_list[self.abbr_data] = self.abbr_title
self.abbr_title = None
self.abbr_data = ''
if tag == "a":
if start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
if attrs.has_key('href') and not (options.SKIP_LINKS or (options.SKIP_INTERNAL_LINKS and attrs['href'].startswith('#'))):
self.astack.append(attrs)
self.o("[")
else:
self.astack.append(None)
else:
if self.astack:
a = self.astack.pop()
if a:
i = self.previousIndex(a)
if i is not None:
a = self.a[i]
else:
self.acount += 1
a['count'] = self.acount
a['outcount'] = self.outcount
self.a.append(a)
self.o("][" + `a['count']` + "]")
if tag == "img" and start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
if attrs.has_key('src'):
attrs['href'] = attrs['src']
alt = attrs.get('alt', '')
i = self.previousIndex(attrs)
if i is not None:
attrs = self.a[i]
else:
self.acount += 1
attrs['count'] = self.acount
attrs['outcount'] = self.outcount
self.a.append(attrs)
self.o("![")
self.o(alt)
self.o("]["+`attrs['count']`+"]")
if tag == 'dl' and start: self.p()
if tag == 'dt' and not start: self.pbr()
if tag == 'dd' and start: self.o(' ')
if tag == 'dd' and not start: self.pbr()
if tag in ["ol", "ul"]:
if start:
self.list.append({'name':tag, 'num':0})
else:
if self.list: self.list.pop()
self.p()
if tag == 'li':
if start:
self.pbr()
if self.list: li = self.list[-1]
else: li = {'name':'ul', 'num':0}
self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
if li['name'] == "ul": self.o("* ")
elif li['name'] == "ol":
li['num'] += 1
self.o(`li['num']`+". ")
self.start = 1
else:
self.pbr()
if tag in ["table", "tr"] and start: self.p()
if tag == 'td': self.pbr()
if tag == "pre":
if start:
self.startpre = 1
self.pre = 1
else:
self.pre = 0
self.p()
def pbr(self):
if self.p_p == 0: self.p_p = 1
def p(self): self.p_p = 2
def o(self, data, puredata=0, force=0):
if self.abbr_data is not None: self.abbr_data += data
if not self.quiet:
if puredata and not self.pre:
data = re.sub('\s+', ' ', data)
if data and data[0] == ' ':
self.space = 1
data = data[1:]
if not data and not force: return
if self.startpre:
#self.out(" :") #TODO: not output when already one there
self.startpre = 0
bq = (">" * self.blockquote)
if not (force and data and data[0] == ">") and self.blockquote: bq += " "
if self.pre:
bq += " "
data = data.replace("\n", "\n"+bq)
if self.start:
self.space = 0
self.p_p = 0
self.start = 0
if force == 'end':
# It's the end.
self.p_p = 0
self.out("\n")
self.space = 0
if self.p_p:
self.out(('\n'+bq)*self.p_p)
self.space = 0
if self.space:
if not self.lastWasNL: self.out(' ')
self.space = 0
if self.a and ((self.p_p == 2 and options.LINKS_EACH_PARAGRAPH) or force == "end"):
if force == "end": self.out("\n")
newa = []
for link in self.a:
if self.outcount > link['outcount']:
self.out(" ["+`link['count']`+"]: " + link['href']) #TODO: base href
if link.has_key('title'): self.out(" ("+link['title']+")")
self.out("\n")
else:
newa.append(link)
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
self.a = newa
if self.abbr_list and force == "end":
for abbr, definition in self.abbr_list.items():
self.out(" *[" + abbr + "]: " + definition + "\n")
self.p_p = 0
self.out(data)
self.lastWasNL = data and data[-1] == '\n'
self.outcount += 1
def handle_data(self, data): def handle_data(self, data):
if r'\/script>' in data: self.quiet -= 1 if self.skip_wrap:
self.o(data, 1) block = data
def unknown_decl(self, data): pass
def wrapwrite(text): sys.stdout.write(text.encode('utf8'))
def html2text_file(html, out=wrapwrite):
global options, args, oparser
if options is None or args is None:
(options, args) = oparser.parse_args(None, None)
h = _html2text(out)
h.feed(html)
h.feed("")
return h.close()
def html2text(html):
return optwrap(html2text_file(html, None))
if __name__ == "__main__":
(options, args) = oparser.parse_args()
if len(args) > 0:
arg = args[0]
if arg.startswith('http://'):
j = urllib.urlopen(arg)
try:
from feedparser import _getCharacterEncoding as enc
except ImportError:
enc = lambda x, y: ('utf-8', 1)
text = j.read()
encoding = enc(j.headers, text)[0]
if encoding == 'us-ascii': encoding = 'utf-8'
data = text.decode(encoding)
else: else:
data = open(arg, 'r').read().decode(options.INPUT_ENCODING) # For normal text, fold multiple whitespace and strip
else: # leading and trailing spaces for the whole block (but
data = sys.stdin.read().decode(options.INPUT_ENCODING) # keep spaces in the middle).
wrapwrite(html2text(data)) block = ''
if data.strip() and data[:1].isspace():
# Keep spaces in the middle
self.need_space = True
if self.need_space and data.strip() and self.text_block:
block = ' '
block += ' '.join(data.split())
self.need_space = data[-1:].isspace()
self.text_block += block
def handle_endtag(self, tag):
block_elements = 'p li ul pre ol h1 h2 h3 h4 h5 h6'
#block_elements += ' dl dd dt'
if tag in block_elements.split():
self._commit_block()
if tag in ('ol', 'ul'):
self.list_item_prefix = None
self.ordered_list_index = None
if tag == 'pre':
self.skip_wrap = False
def handle_charref(self, name):
self.handle_data(unichr(int(name)))
def handle_entityref(self, name):
self.handle_data(unichr(name2codepoint[name]))
def close(self):
HTMLParser.close(self)
self._commit_block()
byte_output = self.output_buffer.encode('utf-8')
if hasattr(sys.stdout, 'buffer'):
sys.stdout.buffer.write(byte_output)
else:
sys.stdout.write(byte_output)
def main():
htmlparser = TextHTMLParser()
if len(sys.argv) > 1:
if sys.version_info[0] >= 3:
# Python 3: read file as utf-8
kwargs = { 'encoding': 'utf-8' }
else:
kwargs = {}
with open(sys.argv[1], **kwargs) as f:
for line in f:
htmlparser.feed(line)
else:
f = sys.stdin
if hasattr(f, 'buffer'):
# Access raw (byte) buffer in Python 3 instead of decoded one
f = f.buffer
# Read stdin as as Unicode string
htmlparser.feed(f.read().decode('utf-8'))
htmlparser.close()
if __name__ == '__main__':
sys.exit(main())