From 68698db8ccec8733929752c4b13da71dcad6213e Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Sat, 21 Mar 2015 11:57:01 +0100
Subject: [PATCH] Always use html2text.py for FAQ, improve output

A recent commit broke compilation with Python 3. The original author
of html2text.py is deceased and the fork has grown to several files
for this "simple" helper.

The html2text.py script in this patch was rewritten from scratch; its
output matches that of lynx (except for a few newlines around lists).
This means that indentation has been added for headings, paragraphs
and lists. Since the script was written from scratch, a new license
could also be chosen that matches Wireshark's.

Now that the in-tree html2text.py script provides nicer output, remove
detection of the alternative programs (elinks, links). lynx/w3m is
still necessary for asciidoc though.

(I also looked into reusing html2text.py for the release notes to
replace asciidoc, but the --format=html output differs: the HTML
version adds a ToC and section numbers. For now the release notes
still require lynx.)

Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and
LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and from a
file, and writing to a file, a pipe and a tty. Tested with cmake
(Ninja) and autotools on Arch Linux x86_64.

Test:
# For each $PATH per python version, execute (with varying LC_ALL)
help/faq.py -b | tools/html2text.py /dev/stdin | md5sum
help/faq.py -b | tools/html2text.py | md5sum
help/faq.py -b | tools/html2text.py
help/faq.py -b | tools/html2text.py >/dev/null

Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd
Reviewed-on: https://code.wireshark.org/review/7779
Petri-Dish: Peter Wu
Tested-by: Petri Dish Buildbot
Reviewed-by: Anders Broman
---
 CMakeLists.txt               |   5 +-
 COPYING                      |   1 -
 cmake/modules/FindLYNX.cmake |  20 +-
 config.nmake                 |  14 +-
 configure.ac                 |  12 +-
 docbook/Makefile.am          |  20 --
 help/Makefile.am             |  11 +-
 help/Makefile.nmake          |   2 +-
 tools/checklicenses.py       |   3 -
 tools/html2text.py           | 646 +++++++++--------------------------
 10 files changed, 168 insertions(+), 566 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 31ec4c9c2f..091a58d12b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1416,10 +1416,9 @@ else()
 	endforeach()
 endif(WIN32)
 add_custom_command(TARGET copy_data_files PRE_BUILD
-	COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/help/faq.py > faq.tmp.html
+	COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/help/faq.py -b > faq.tmp.html
 	COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/html2text.py
-		--width=72 --no-links faq.tmp.html
-		> "${DATAFILE_DIR}/help/faq.txt"
+		faq.tmp.html > "${DATAFILE_DIR}/help/faq.txt"
 	COMMAND ${CMAKE_COMMAND} -E remove faq.tmp.html
 )
 
diff --git a/COPYING b/COPYING
index ef930bb54d..f879096b68 100644
--- a/COPYING
+++ b/COPYING
@@ -24,7 +24,6 @@ covered by other licenses that are not themselves directly compatible with
 the GPLv2. This is OK, as only the tools themselves are licensed this way,
 the output of the tools is not considered a derived work, and so can be
 safely licensed for Wireshark's use. An incomplete selection of these
 tools includes:
- - the html2text utility (tools/html2text.py) is licensed under the GPLv3.
 - the pidl utility (tools/pidl) is licensed under the GPLv3+.
 Parts of Wireshark can be built and distributed as libraries. These
diff --git a/cmake/modules/FindLYNX.cmake b/cmake/modules/FindLYNX.cmake
index bc0b7b0bdc..683b574ae5 100644
--- a/cmake/modules/FindLYNX.cmake
+++ b/cmake/modules/FindLYNX.cmake
@@ -1,6 +1,6 @@
 #
 # - Find unix commands from cygwin
-# This module looks for some usual Unix commands.
+# This module looks for lynx (used by asciidoc)
 #
 
 INCLUDE(FindCygwin)
@@ -8,9 +8,6 @@ INCLUDE(FindCygwin)
 FIND_PROGRAM(LYNX_EXECUTABLE
   NAMES
     lynx
-    elinks
-    links
-    true
   PATHS
     ${CYGWIN_INSTALL_PATH}/bin
     /bin
@@ -23,18 +20,3 @@ INCLUDE(FindPackageHandleStandardArgs)
 FIND_PACKAGE_HANDLE_STANDARD_ARGS(LYNX DEFAULT_MSG LYNX_EXECUTABLE)
 
 MARK_AS_ADVANCED(LYNX_EXECUTABLE)
-
-# Convert html to text
-IF(LYNX_EXECUTABLE MATCHES lynx)
-  # (See Bug # 1446 for note re 'force-html' below)
-  set(HTML2TXT "lynx -dump -width=72 -nolist -stdin -force-html")
-ELSEIF(LYNX_EXECUTABLE MATCHES elinks)
-  set(HTML2TXT "elinks -dump -dump-width 72")
-ELSEIF(LYNX_EXECUTABLE MATCHES links)
-  set(HTML2TXT "links -dump -width 72")
-ELSEIF(LYNX_EXECUTABLE MATCHES true)
-  set(HTML2TXT "true")
-ELSE()
-  message(ERROR "Should never be reached - please report!")
-ENDIF()
-message(STATUS "html2text: ${HTML2TXT}")
diff --git a/config.nmake b/config.nmake
index 19c9723760..6f95af7b01 100644
--- a/config.nmake
+++ b/config.nmake
@@ -79,8 +79,7 @@ PROGRAM_FILES=$(PROGRAMFILES)
 PROGRAM_FILES_W6432=$(PROGRAMW6432)
 
 #
-# Location of the "tools" directory. This affects HTML2TXT below and should
-# be overridden by makefiles in any subdirectories that use HTML2TXT.
+# Location of the "tools" directory. This affects the path to textify.ps1
 !IFNDEF TOOLS_DIR
 TOOLS_DIR=tools
 !ENDIF
@@ -1321,17 +1320,6 @@
 FOP=$(WIRESHARK_LIB_DIR)\fop-1.0\fop.bat
 # Additional options to fop.
 FOP_OPTS=-Xmx256m
 
-# html to text converter for text version of release notes, e.g. elinks.
-# This could also be "lynx", or "true" if neither elinks nor lynx is installed
-# (cygwin: lynx works, elinks not available, links and true doesn't produce output)
-#HTML2TXT=elinks -dump -dump-width 72
-##HTML2TXT=links -dump -width 72   ## XXX: Fails: For links -dump requires 'url' (filename) arg.
-#HTML2TXT=lynx -dump -width=72 -nolist -stdin
-
-!IFNDEF HTML2TXT
-HTML2TXT=$(PYTHON) $(TOOLS_DIR)\html2text.py --width=72 --no-links
-!ENDIF
-
 # the XSL processor (part of cygwin's libxslt package)
 XSLTPROC="xsltproc"
diff --git a/configure.ac b/configure.ac
index 62fce43336..857e898089 100644
--- a/configure.ac
+++ b/configure.ac
@@ -760,23 +760,19 @@ AC_PATH_PROG(A2X, a2x)
 AC_CHECK_PROG(HAVE_A2X, a2x, "yes", "no")
 AM_CONDITIONAL(HAVE_A2X, test x$HAVE_A2X = xyes)
 
-# Want to control a tape drive? Use mt. Want to convert HTML to text?
-# Uhhhhh... elinks? lynx? w3m? pandoc? html2text?
-AC_PATH_PROG(ELINKS, elinks)
-AC_CHECK_PROG(HAVE_ELINKS, elinks, "yes", "no")
-AM_CONDITIONAL(HAVE_ELINKS, test x$HAVE_ELINKS = xyes)
-
 # Check for fop (translate .fo to e.g. pdf)
 AC_PATH_PROG(FOP, fop)
 AC_CHECK_PROG(HAVE_FOP, fop, "yes", "no")
 AM_CONDITIONAL(HAVE_FOP, test x$HAVE_FOP = xyes)
 
-# Check for lynx (html -> text)
+# TODO: HAVE_LYNX and HAVE_W3M are unused. Maybe require one of them
+# to be found when a2x is enabled? Otherwise it will fail later...
+# Check for lynx (asciidoc text format from html)
 AC_PATH_PROG(LYNX, lynx)
 AC_CHECK_PROG(HAVE_LYNX, lynx, "yes", "no")
 AM_CONDITIONAL(HAVE_LYNX, test x$HAVE_LYNX = xyes)
 
-# Check for w3m (html -> text)
+# Check for w3m (asciidoc text format from html)
 AC_PATH_PROG(W3M, w3m)
 AC_CHECK_PROG(HAVE_W3M, w3m, "yes", "no")
 AM_CONDITIONAL(HAVE_W3M, test x$HAVE_W3M = xyes)
diff --git a/docbook/Makefile.am b/docbook/Makefile.am
index fd0bbc6fba..e25fca4bf9 100644
--- a/docbook/Makefile.am
+++ b/docbook/Makefile.am
@@ -18,26 +18,6 @@ A2X_TEXT_OPTS=
 A2X_TEXT_OPTS+="--lynx"
 #endif
 
-# html to text converter for text version of release notes, e.g. elinks.
-# This could also be "lynx", or "true" if neither elinks nor lynx is installed
-# (See Bug # 1446 for note re 'force-html' below)
-# Sorry about the indenting, but that's what automake requires...
-if HAVE_ELINKS
-HTML2TXT=$(ELINKS) -dump -dump-width 72
-## links: -dump requires 'url' argument (as opposed to elinks & lynx)
-## (Rather than fixing things we'll just disable the use of links).
-##else
-##if HAVE_LINKS
-##HTML2TXT=$(LINKS) -dump -width 72
-else
-if HAVE_LYNX
-HTML2TXT=$(LYNX) -dump -width=72 -nolist -stdin -force-html
-else
-HTML2TXT="true"
-endif
-##endif
-endif
-
 ############### YOU SHOULDN'T HAVE TO EDIT ANYTHING BELOW THIS LINE! ################
 
 include Makefile.common
diff --git a/help/Makefile.am b/help/Makefile.am
index a420b9a9d9..ec4da12989 100644
--- a/help/Makefile.am
+++ b/help/Makefile.am
@@ -43,13 +43,8 @@ CLEANFILES = faq.txt
 MAINTAINERCLEANFILES = \
 	Makefile.in
 
-# Try our best to convert the FAQ to text.
-# The output of html2text.py isn't as pretty as elinks, links, or lynx. If that ever changes, we
-# can use it exclusively.
+# Convert the FAQ to text.
 faq.txt: $(srcdir)/faq.py
-	$(AM_V_GEN)$(srcdir)/faq.py >$@.tmp && \
-	command -v elinks > /dev/null && elinks -dump -dump-width 72 -no-numbering -no-references < $@.tmp > $@ || \
-	command -v links > /dev/null && links -width 72 -html-numbered-links 0 -dump $@.tmp > $@ || \
-	command -v lynx > /dev/null && lynx -dump -width=72 -nolist -stdin -force-html < $@.tmp > $@ || \
-	$(srcdir)/../tools/html2text.py --width=72 --no-links $@.tmp > $@ && \
+	$(AM_V_GEN)$(srcdir)/faq.py -b >$@.tmp && \
+	$(srcdir)/../tools/html2text.py $@.tmp > $@ && \
 	rm -f $@.tmp
diff --git a/help/Makefile.nmake b/help/Makefile.nmake
index 3eee3bcee7..b8f75540f8 100644
--- a/help/Makefile.nmake
+++ b/help/Makefile.nmake
@@ -10,7 +10,7 @@ include ..\config.nmake
 all: faq.txt
 
 faq.txt: faq.py
-	$(PYTHON) faq.py | $(HTML2TXT) > $@
+	$(PYTHON) faq.py -b | $(PYTHON) $(TOOLS_DIR)\html2text.py > $@
 
 clean:
 	rm -rf faq.txt
diff --git a/tools/checklicenses.py b/tools/checklicenses.py
index fe4d38d69f..bbad753e9d 100755
--- a/tools/checklicenses.py
+++ b/tools/checklicenses.py
@@ -203,9 +203,6 @@ PATH_SPECIFIC_WHITELISTED_LICENSES = {
     'tools/pidl': [
         'UNKNOWN',
     ],
-    'tools/html2text.py': [
-        'UNKNOWN',
-    ],
     'tools/lemon': [
         'UNKNOWN',
     ],
diff --git a/tools/html2text.py b/tools/html2text.py
index 9ae6c66fb6..54180fabcf 100755
--- a/tools/html2text.py
+++ b/tools/html2text.py
@@ -1,504 +1,170 @@
 #!/usr/bin/env python
-"""html2text: Turn HTML into equivalent Markdown-structured text."""
-__version__ = "2.35-Wireshark"
-__author__ = "Aaron Swartz (me@aaronsw.com)"
-__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
-__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]
+#
+# html2text.py - converts HTML to text
+#
+# Wireshark - Network traffic analyzer
+# By Gerald Combs <gerald@wireshark.org>
+# Copyright 1998 Gerald Combs
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
-# NOTE:
-# This is a modified version of html2text.py from http://www.aaronsw.com/2002/html2text/
-# Changes:
-#   Options can now be configured from the command line.
-#   SKIP_LINKS and INPUT_ENCODING options have been added.
-#   The script now requires Python 2.3
+from __future__ import unicode_literals
+
+__author__ = "Peter Wu <peter@lekensteyn.nl>"
+__copyright__ = "Copyright 2015, Peter Wu"
+__license__ = "GPL (v2 or later)"
 
 # TODO:
-#   Support decoded entities with unifiable.
-#   Relative URL resolution
-#   Indent sections and lists similar to elinks/links/lynx
+#   multiple list indentation levels
+#   maybe allow for ascii output instead of utf-8?
 
-if not hasattr(__builtins__, 'True'): True, False = 1, 0
-import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
-import sgmllib
-sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
-from optparse import OptionParser
+import sys
+from textwrap import TextWrapper
+try:
+    from HTMLParser import HTMLParser
+    from htmlentitydefs import name2codepoint
+except: # Python 3
+    from html.parser import HTMLParser
+    from html.entities import name2codepoint
+    unichr = chr # for html entity handling
 
-try: from textwrap import wrap
-except: pass
-
-oparser = OptionParser()
-options = None
-args = None
-
-oparser.add_option(
-    "--force-unicode",
-    action="store_true",
-    dest="UNICODE_SNOB",
-    default=False,
-    help="Use Unicode characters instead of their ascii psuedo-replacements. [default: False]",
-    )
-
-oparser.add_option(
-    "--links-after-paragraphs",
-    action="store_true",
-    dest="LINKS_EACH_PARAGRAPH",
-    default=False,
-    help="Put the links after each paragraph instead of at the end. [default: False]",
-    )
-
-oparser.add_option(
-    "--width",
-    type="int",
-    dest="BODY_WIDTH",
-    default=78,
-    help="Wrap long lines at position. 0 for no wrapping. Requires Python 2.3. [default: 78 characters]",
-    )
-
-oparser.add_option(
-    "--no-internal-links",
-    action="store_true",
-    dest="SKIP_INTERNAL_LINKS",
-    default=False,
-    help='''Don't show internal links (href="#local-anchor"). Corresponding link targets won't be visible in the plain text file anyway. [default: False]''',
-    )
-
-oparser.add_option(
-    "--no-links",
-    action="store_true",
-    dest="SKIP_LINKS",
-    default=False,
-    help='''Don't show links. [default: False]''',
-    )
-
-oparser.add_option(
-    "--input-encoding",
-    type="string",
-    dest="INPUT_ENCODING",
-    default='utf-8',
-    help='''Force the encoding of the input file. [default: utf-8]''',
-    )
-
-### Entity Nonsense ###
-
-def name2cp(k):
-    if k == 'apos': return ord("'")
-    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
-        return htmlentitydefs.name2codepoint[k]
-    else:
-        k = htmlentitydefs.entitydefs[k]
-        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
-        return ord(codecs.latin_1_decode(k)[0])
-
-unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
-'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
-'ndash':'-', 'oelig':'oe', 'aelig':'ae',
-'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
-'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
-'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
-'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
-'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
-
-unifiable_n = {}
-
-for k in unifiable.keys():
-    unifiable_n[name2cp(k)] = unifiable[k]
-
-def charref(name):
-    global options
-
-    if name[0] in ['x','X']:
-        c = int(name[1:], 16)
-    else:
-        c = int(name)
-
-    if not options.UNICODE_SNOB and c in unifiable_n.keys():
-        return unifiable_n[c]
-    else:
-        return unichr(c)
-
-def entityref(c):
-    global options
-
-    if not options.UNICODE_SNOB and c in unifiable.keys():
-        return unifiable[c]
-    else:
-        try: name2cp(c)
-        except KeyError: return "&" + c
-        else: return unichr(name2cp(c))
-
-def replaceEntities(s):
-    s = s.group(1)
-    if s[0] == "#":
-        return charref(s[1:])
-    else: return entityref(s)
-
-r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
-def unescape(s):
-    return r_unescape.sub(replaceEntities, s)
-
-def fixattrs(attrs):
-    # Fix bug in sgmllib.py
-    if not attrs: return attrs
-    newattrs = []
-    for attr in attrs:
-        newattrs.append((attr[0], unescape(attr[1])))
-    return newattrs
-
-### End Entity Nonsense ###
-
-def onlywhite(line):
-    """Return true if the line does only consist of whitespace characters."""
-    for c in line:
-        if c is not ' ' and c is not '  ':
-            return c is ' '
-    return line
-
-def optwrap(text):
-    """Wrap all paragraphs in the provided text."""
-    global options
-    if not options.BODY_WIDTH:
-        return text
-
-    assert wrap, "Requires Python 2.3."
-    result = ''
-    newlines = 0
-    for para in text.split("\n"):
-        if len(para) > 0:
-            if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
-                for line in wrap(para, options.BODY_WIDTH):
-                    result += line + "\n"
-                result += "\n"
-                newlines = 2
-            else:
-                if not onlywhite(para):
-                    result += para + "\n"
-                    newlines = 1
-        else:
-            if newlines < 2:
-                result += "\n"
-                newlines += 1
-    return result
-
-def hn(tag):
-    if tag[0] == 'h' and len(tag) == 2:
+class TextHTMLParser(HTMLParser):
+    """Converts a HTML document to text."""
+    def __init__(self):
         try:
-            n = int(tag[1])
-            if n in range(1, 10): return n
-        except ValueError: return 0
+            # Python 3.4
+            HTMLParser.__init__(self, convert_charrefs=True)
+        except:
+            HTMLParser.__init__(self)
+        # All text, concatenated
+        self.output_buffer = ''
+        # The current text block which is being constructed
+        self.text_block = ''
+        # Whether the previous element was terminated with whitespace
+        self.need_space = False
+        # Whether to prevent word-wrapping the contents (for "pre" tag)
+        self.skip_wrap = False
+        # track list items
+        self.list_item_prefix = None
+        self.ordered_list_index = None
+        # Indentation (for heading and paragraphs)
+        self.indent_levels = [0, 0]
 
-class _html2text(sgmllib.SGMLParser):
-    def __init__(self, out=sys.stdout.write):
-        sgmllib.SGMLParser.__init__(self)
-
-        if out is None: self.out = self.outtextf
-        else: self.out = out
-        self.outtext = u''
-        self.quiet = 0
-        self.p_p = 0
-        self.outcount = 0
-        self.start = 1
-        self.space = 0
-        self.a = []
-        self.astack = []
-        self.acount = 0
-        self.list = []
-        self.blockquote = 0
-        self.pre = 0
-        self.startpre = 0
-        self.lastWasNL = 0
-        self.abbr_title = None # current abbreviation definition
-        self.abbr_data = None # last inner HTML (for abbr being defined)
-        self.abbr_list = {} # stack of abbreviations to write later
-
-    def outtextf(self, s):
-        self.outtext += s
-
-    def close(self):
-        sgmllib.SGMLParser.close(self)
-
-        self.pbr()
-        self.o('', 0, 'end')
-
-        return self.outtext
-
-    def handle_charref(self, c):
-        self.o(charref(c))
+    def _wrap_text(self, text):
+        """Wraps text, but additionally indent list items."""
+        initial_indent = indent = sum(self.indent_levels) * ' '
+        if self.list_item_prefix:
+            initial_indent += self.list_item_prefix
+            indent += '    '
+        wrapper = TextWrapper(width=66, break_on_hyphens=False,
+            initial_indent=initial_indent, subsequent_indent=indent)
+        return '\n'.join(wrapper.wrap(text))
 
-    def handle_entityref(self, c):
-        self.o(entityref(c))
-
-    def unknown_starttag(self, tag, attrs):
-        self.handle_tag(tag, attrs, 1)
-
-    def unknown_endtag(self, tag):
-        self.handle_tag(tag, None, 0)
-
-    def previousIndex(self, attrs):
-        """ returns the index of certain set of attributes (of a link) in the
-            self.a list
-
-            If the set of attributes is not found, returns None
-        """
-        if not attrs.has_key('href'): return None
-
-        i = -1
-        for a in self.a:
-            i += 1
-            match = 0
-
-            if a.has_key('href') and a['href'] == attrs['href']:
-                if a.has_key('title') or attrs.has_key('title'):
-                    if (a.has_key('title') and attrs.has_key('title') and
-                        a['title'] == attrs['title']):
-                        match = True
-                else:
-                    match = True
-
-            if match: return i
+    def _commit_block(self, newline='\n\n'):
+        text = self.text_block
+        if text:
+            if not self.skip_wrap:
+                text = self._wrap_text(text)
+            self.output_buffer += text + newline
+            self.text_block = ''
+        self.need_space = False
 
-    def handle_tag(self, tag, attrs, start):
-        global options
-        attrs = fixattrs(attrs)
-
-        if hn(tag):
-            self.p()
-            if start: self.o(hn(tag)*"#" + ' ')
-
-        if tag in ['p', 'div']: self.p()
-
-        if tag == "br" and start: self.o("  \n")
-
-        if tag == "hr" and start:
-            self.p()
-            self.o("* * *")
-            self.p()
-
-        if tag in ["head", "style", 'script']:
-            if start: self.quiet += 1
-            else: self.quiet -= 1
-
-        if tag in ["body"]:
-            self.quiet = 0 # sites like 9rules.com never close <head>
-
-        if tag == "blockquote":
-            if start:
-                self.p(); self.o('> ', 0, 1); self.start = 1
-                self.blockquote += 1
-            else:
-                self.blockquote -= 1
-                self.p()
-
-        if tag in ['em', 'i', 'u']: self.o("_")
-        if tag in ['strong', 'b']: self.o("**")
-        if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
-        if tag == "abbr":
-            if start:
-                attrsD = {}
-                for (x, y) in attrs: attrsD[x] = y
-                attrs = attrsD
-
-                self.abbr_title = None
-                self.abbr_data = ''
-                if attrs.has_key('title'):
-                    self.abbr_title = attrs['title']
-            else:
-                if self.abbr_title != None:
-                    self.abbr_list[self.abbr_data] = self.abbr_title
-                    self.abbr_title = None
-                self.abbr_data = ''
-
-        if tag == "a":
-            if start:
-                attrsD = {}
-                for (x, y) in attrs: attrsD[x] = y
-                attrs = attrsD
-                if attrs.has_key('href') and not (options.SKIP_LINKS or (options.SKIP_INTERNAL_LINKS and attrs['href'].startswith('#'))):
-                    self.astack.append(attrs)
-                    self.o("[")
-                else:
-                    self.astack.append(None)
-            else:
-                if self.astack:
-                    a = self.astack.pop()
-                    if a:
-                        i = self.previousIndex(a)
-                        if i is not None:
-                            a = self.a[i]
-                        else:
-                            self.acount += 1
-                            a['count'] = self.acount
-                            a['outcount'] = self.outcount
-                            self.a.append(a)
-                        self.o("][" + `a['count']` + "]")
-
-        if tag == "img" and start:
-            attrsD = {}
-            for (x, y) in attrs: attrsD[x] = y
-            attrs = attrsD
-            if attrs.has_key('src'):
-                attrs['href'] = attrs['src']
-                alt = attrs.get('alt', '')
-                i = self.previousIndex(attrs)
-                if i is not None:
-                    attrs = self.a[i]
-                else:
-                    self.acount += 1
-                    attrs['count'] = self.acount
-                    attrs['outcount'] = self.outcount
-                    self.a.append(attrs)
-                self.o("![")
-                self.o(alt)
-                self.o("]["+`attrs['count']`+"]")
-
-        if tag == 'dl' and start: self.p()
-        if tag == 'dt' and not start: self.pbr()
-        if tag == 'dd' and start: self.o('    ')
-        if tag == 'dd' and not start: self.pbr()
-
-        if tag in ["ol", "ul"]:
-            if start:
-                self.list.append({'name':tag, 'num':0})
-            else:
-                if self.list: self.list.pop()
-
-            self.p()
-
-        if tag == 'li':
-            if start:
-                self.pbr()
-                if self.list: li = self.list[-1]
-                else: li = {'name':'ul', 'num':0}
-                self.o("  "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
-                if li['name'] == "ul": self.o("* ")
-                elif li['name'] == "ol":
-                    li['num'] += 1
-                    self.o(`li['num']`+". ")
-                self.start = 1
-            else:
-                self.pbr()
-
-        if tag in ["table", "tr"] and start: self.p()
-        if tag == 'td': self.pbr()
-
-        if tag == "pre":
-            if start:
-                self.startpre = 1
-                self.pre = 1
-            else:
-                self.pre = 0
-            self.p()
-
-    def pbr(self):
-        if self.p_p == 0: self.p_p = 1
-
-    def p(self): self.p_p = 2
-
-    def o(self, data, puredata=0, force=0):
-        if self.abbr_data is not None: self.abbr_data += data
-
-        if not self.quiet:
-            if puredata and not self.pre:
-                data = re.sub('\s+', ' ', data)
-                if data and data[0] == ' ':
-                    self.space = 1
-                    data = data[1:]
-            if not data and not force: return
-
-            if self.startpre:
-                #self.out(" :") #TODO: not output when already one there
-                self.startpre = 0
-
-            bq = (">" * self.blockquote)
-            if not (force and data and data[0] == ">") and self.blockquote: bq += " "
-
-            if self.pre:
-                bq += "    "
-                data = data.replace("\n", "\n"+bq)
-
-            if self.start:
-                self.space = 0
-                self.p_p = 0
-                self.start = 0
-
-            if force == 'end':
-                # It's the end.
-                self.p_p = 0
-                self.out("\n")
-                self.space = 0
-
-            if self.p_p:
-                self.out(('\n'+bq)*self.p_p)
-                self.space = 0
-
-            if self.space:
-                if not self.lastWasNL: self.out(' ')
-                self.space = 0
-
-            if self.a and ((self.p_p == 2 and options.LINKS_EACH_PARAGRAPH) or force == "end"):
-                if force == "end": self.out("\n")
-
-                newa = []
-                for link in self.a:
-                    if self.outcount > link['outcount']:
-                        self.out("   ["+`link['count']`+"]: " + link['href']) #TODO: base href
-                        if link.has_key('title'): self.out(" ("+link['title']+")")
-                        self.out("\n")
-                    else:
-                        newa.append(link)
-
-                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
-
-                self.a = newa
-
-            if self.abbr_list and force == "end":
-                for abbr, definition in self.abbr_list.items():
-                    self.out("  *[" + abbr + "]: " + definition + "\n")
-
-            self.p_p = 0
-            self.out(data)
-            self.lastWasNL = data and data[-1] == '\n'
-            self.outcount += 1
+    def handle_starttag(self, tag, attrs):
+        # end a block of text on <br>, but also flush list items which are not
+        # terminated.
+        if tag == 'br' or tag == 'li':
+            self._commit_block('\n')
+        if tag == 'pre':
+            self.skip_wrap = True
+        # Following list items are numbered.
+        if tag == 'ol':
+            self.ordered_list_index = 1
+        if tag == 'ul':
+            self.list_item_prefix = '  * '
+        if tag == 'li' and self.ordered_list_index:
+            self.list_item_prefix = ' %d. ' % (self.ordered_list_index)
+            self.ordered_list_index += 1
+        if tag[0] == 'h' and len(tag) == 2 and \
+           (tag[1] >= '1' and tag[1] <= '6'):
+            self.indent_levels = [int(tag[1]) - 1, 0]
+        if tag == 'p':
+            self.indent_levels[1] = 1
 
     def handle_data(self, data):
-        if r'\/script>' in data: self.quiet -= 1
-        self.o(data, 1)
-
-    def unknown_decl(self, data): pass
-
-def wrapwrite(text): sys.stdout.write(text.encode('utf8'))
-
-def html2text_file(html, out=wrapwrite):
-    global options, args, oparser
-    if options is None or args is None:
-        (options, args) = oparser.parse_args(None, None)
-
-    h = _html2text(out)
-    h.feed(html)
-    h.feed("")
-    return h.close()
-
-def html2text(html):
-    return optwrap(html2text_file(html, None))
-
-if __name__ == "__main__":
-    (options, args) = oparser.parse_args()
-    if len(args) > 0:
-        arg = args[0]
-        if arg.startswith('http://'):
-            j = urllib.urlopen(arg)
-            try:
-                from feedparser import _getCharacterEncoding as enc
-            except ImportError:
-                enc = lambda x, y: ('utf-8', 1)
-            text = j.read()
-            encoding = enc(j.headers, text)[0]
-            if encoding == 'us-ascii': encoding = 'utf-8'
-            data = text.decode(encoding)
-
+        if self.skip_wrap:
+            block = data
         else:
-            data = open(arg, 'r').read().decode(options.INPUT_ENCODING)
-    else:
-        data = sys.stdin.read().decode(options.INPUT_ENCODING)
-    wrapwrite(html2text(data))
+            # For normal text, fold multiple whitespace and strip
+            # leading and trailing spaces for the whole block (but
+            # keep spaces in the middle).
+            block = ''
+            if data.strip() and data[:1].isspace():
+                # Keep spaces in the middle
+                self.need_space = True
+            if self.need_space and data.strip() and self.text_block:
+                block = ' '
+            block += ' '.join(data.split())
+            self.need_space = data[-1:].isspace()
+        self.text_block += block
 
+    def handle_endtag(self, tag):
+        block_elements = 'p li ul pre ol h1 h2 h3 h4 h5 h6'
+        #block_elements += ' dl dd dt'
+        if tag in block_elements.split():
+            self._commit_block()
+        if tag in ('ol', 'ul'):
+            self.list_item_prefix = None
+            self.ordered_list_index = None
+        if tag == 'pre':
+            self.skip_wrap = False
+
+    def handle_charref(self, name):
+        self.handle_data(unichr(int(name)))
+
+    def handle_entityref(self, name):
+        self.handle_data(unichr(name2codepoint[name]))
+
+    def close(self):
+        HTMLParser.close(self)
+        self._commit_block()
+        byte_output = self.output_buffer.encode('utf-8')
+        if hasattr(sys.stdout, 'buffer'):
+            sys.stdout.buffer.write(byte_output)
+        else:
+            sys.stdout.write(byte_output)
+
+
+def main():
+    htmlparser = TextHTMLParser()
+    if len(sys.argv) > 1:
+        if sys.version_info[0] >= 3:
+            # Python 3: read file as utf-8
+            kwargs = { 'encoding': 'utf-8' }
+        else:
+            kwargs = {}
+        with open(sys.argv[1], **kwargs) as f:
+            for line in f:
+                htmlparser.feed(line)
+    else:
+        f = sys.stdin
+        if hasattr(f, 'buffer'):
+            # Access raw (byte) buffer in Python 3 instead of decoded one
+            f = f.buffer
+        # Read stdin as a Unicode string
+        htmlparser.feed(f.read().decode('utf-8'))
+    htmlparser.close()

+if __name__ == '__main__':
+    sys.exit(main())
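
For reference, a minimal sketch of driving the rewritten parser from
Python instead of through the command line. This is hypothetical usage,
not part of the patch: it assumes tools/ is on sys.path so that
html2text is importable, and note that close() writes the converted
text to stdout as UTF-8 rather than returning it.

    # Hypothetical usage sketch (not part of the patch above).
    # Assumes tools/ is on sys.path so html2text is importable.
    from html2text import TextHTMLParser

    parser = TextHTMLParser()
    parser.feed('<h1>FAQ</h1><p>Indented under the <b>heading</b>.</p>'
                '<ul><li>wrapped at 66 columns, prefixed as a list item</li></ul>')
    parser.close()  # flushes the last block and writes UTF-8 text to stdout

The committed script is normally used as a filter, as in the tests
above: help/faq.py -b | tools/html2text.py > faq.txt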