Always use html2text.py for FAQ, improve output

A recent commit broke compilation with Python 3. The original author of
html2text.py is deceased and the fork has increased the number of files
for this "simple" helper.

The html2text.py script in this patch was rewritten and its output
matches that of lynx (except for a few newlines around lists). This
means that indentation has been added for headings, paragraphs and
lists. Also, since it was written from scratch, a new license could be
chosen that matches Wireshark's.

Since the in-tree html2text.py script now provides nicer output, remove
detection of the alternative programs (elinks, links). lynx or w3m is
somehow still necessary for asciidoc, though.

(I also looked into reusing html2text.py for the release notes to
replace asciidoc, but the --format=html output produces a different
result (the HTML adds a ToC and section numbers). For now, still require
lynx for the release notes.)

Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and
LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and file, writing
to file, pipe and tty. Tested with cmake (Ninja) and autotools on Arch
Linux x86_64. Test:

    # For each $PATH per python version, execute (with varying LC_ALL)
    help/faq.py -b | tools/html2text.py /dev/stdin | md5sum
    help/faq.py -b | tools/html2text.py | md5sum
    help/faq.py -b | tools/html2text.py
    help/faq.py -b | tools/html2text.py >/dev/null

Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd
Reviewed-on: https://code.wireshark.org/review/7779
Petri-Dish: Peter Wu <peter@lekensteyn.nl>
Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org>
Reviewed-by: Anders Broman <a.broman58@gmail.com>
daniel/osmux
Peter Wu 8 years ago committed by Anders Broman
parent 83b6338673
commit 68698db8cc
  1. 5
      CMakeLists.txt
  2. 1
      COPYING
  3. 20
      cmake/modules/FindLYNX.cmake
  4. 14
      config.nmake
  5. 12
      configure.ac
  6. 20
      docbook/Makefile.am
  7. 11
      help/Makefile.am
  8. 2
      help/Makefile.nmake
  9. 3
      tools/checklicenses.py
  10. 652
      tools/html2text.py

@ -1416,10 +1416,9 @@ else()
endforeach()
endif(WIN32)
add_custom_command(TARGET copy_data_files PRE_BUILD
COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/help/faq.py > faq.tmp.html
COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/help/faq.py -b > faq.tmp.html
COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/html2text.py
--width=72 --no-links faq.tmp.html
> "${DATAFILE_DIR}/help/faq.txt"
faq.tmp.html > "${DATAFILE_DIR}/help/faq.txt"
COMMAND ${CMAKE_COMMAND} -E remove faq.tmp.html
)

@ -24,7 +24,6 @@ covered by other licenses that are not themselves directly compatible with the
GPLv2. This is OK, as only the tools themselves are licensed this way, the
output of the tools is not considered a derived work, and so can be safely
licensed for Wireshark's use. An incomplete selection of these tools includes:
- the html2text utility (tools/html2text.py) is licensed under the GPLv3.
- the pidl utility (tools/pidl) is licensed under the GPLv3+.
Parts of Wireshark can be built and distributed as libraries. These

@ -1,6 +1,6 @@
#
# - Find unix commands from cygwin
# This module looks for some usual Unix commands.
# This module looks for lynx (used by asciidoc)
#
INCLUDE(FindCygwin)
@ -8,9 +8,6 @@ INCLUDE(FindCygwin)
FIND_PROGRAM(LYNX_EXECUTABLE
NAMES
lynx
elinks
links
true
PATHS
${CYGWIN_INSTALL_PATH}/bin
/bin
@ -23,18 +20,3 @@ INCLUDE(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(LYNX DEFAULT_MSG LYNX_EXECUTABLE)
MARK_AS_ADVANCED(LYNX_EXECUTABLE)
# Convert html to text
IF(LYNX_EXECUTABLE MATCHES lynx)
# (See Bug # 1446 for note re 'force-html' below)
set(HTML2TXT "lynx -dump -width=72 -nolist -stdin -force-html")
ELSEIF(LYNX_EXECUTABLE MATCHES elinks)
set(HTML2TXT "elinks -dump -dump-width 72")
ELSEIF(LYNX_EXECUTABLE MATCHES links)
set(HTML2TXT "links -dump -width 72")
ELSEIF(LYNX_EXECUTABLE MATCHES true)
set(HTML2TXT "true")
ELSE()
message(ERROR "Should never be reached - please report!")
ENDIF()
message(STATUS "html2text: ${HTML2TXT}")

@ -79,8 +79,7 @@ PROGRAM_FILES=$(PROGRAMFILES)
PROGRAM_FILES_W6432=$(PROGRAMW6432)
#
# Location of the "tools" directory. This affects HTML2TXT below and should
# be overridden by makefiles in any subdirectories that use HTML2TXT.
# Location of the "tools" directory. This affects the path to textify.ps1
!IFNDEF TOOLS_DIR
TOOLS_DIR=tools
!ENDIF
@ -1321,17 +1320,6 @@ FOP=$(WIRESHARK_LIB_DIR)\fop-1.0\fop.bat
# Additional options to fop.
FOP_OPTS=-Xmx256m
# html to text converter for text version of release notes, e.g. elinks.
# This could also be "lynx", or "true" if neither elinks nor lynx is installed
# (cygwin: lynx works, elinks not available, links and true doesn't produce output)
#HTML2TXT=elinks -dump -dump-width 72
##HTML2TXT=links -dump -width 72 ## XXX: Fails: For links -dump requires 'url' (filename) arg.
#HTML2TXT=lynx -dump -width=72 -nolist -stdin
!IFNDEF HTML2TXT
HTML2TXT=$(PYTHON) $(TOOLS_DIR)\html2text.py --width=72 --no-links
!ENDIF
# the XSL processor (part of cygwin's libxslt package)
XSLTPROC="xsltproc"

@ -760,23 +760,19 @@ AC_PATH_PROG(A2X, a2x)
AC_CHECK_PROG(HAVE_A2X, a2x, "yes", "no")
AM_CONDITIONAL(HAVE_A2X, test x$HAVE_A2X = xyes)
# Want to control a tape drive? Use mt. Want to convert HTML to text?
# Uhhhhh... elinks? lynx? w3m? pandoc? html2text?
AC_PATH_PROG(ELINKS, elinks)
AC_CHECK_PROG(HAVE_ELINKS, elinks, "yes", "no")
AM_CONDITIONAL(HAVE_ELINKS, test x$HAVE_ELINKS = xyes)
# Check for fop (translate .fo to e.g. pdf)
AC_PATH_PROG(FOP, fop)
AC_CHECK_PROG(HAVE_FOP, fop, "yes", "no")
AM_CONDITIONAL(HAVE_FOP, test x$HAVE_FOP = xyes)
# Check for lynx (html -> text)
# TODO: HAVE_LYNX and HAVE_W3M are unused. Maybe require one of them
# to be found when a2x is enabled? Otherwise it will fail later...
# Check for lynx (asciidoc text format from html)
AC_PATH_PROG(LYNX, lynx)
AC_CHECK_PROG(HAVE_LYNX, lynx, "yes", "no")
AM_CONDITIONAL(HAVE_LYNX, test x$HAVE_LYNX = xyes)
# Check for w3m (html -> text)
# Check for w3m (asciidoc text format from html)
AC_PATH_PROG(W3M, w3m)
AC_CHECK_PROG(HAVE_W3M, w3m, "yes", "no")
AM_CONDITIONAL(HAVE_W3M, test x$HAVE_W3M = xyes)

@ -18,26 +18,6 @@ A2X_TEXT_OPTS=
A2X_TEXT_OPTS+="--lynx"
#endif
# html to text converter for text version of release notes, e.g. elinks.
# This could also be "lynx", or "true" if neither elinks nor lynx is installed
# (See Bug # 1446 for note re 'force-html' below)
# Sorry about the indenting, but that's what automake requires...
if HAVE_ELINKS
HTML2TXT=$(ELINKS) -dump -dump-width 72
## links: -dump requires 'url' argument (as opposed to elinks & lynx)
## (Rather than fixing things we'll just disable the use of links).
##else
##if HAVE_LINKS
##HTML2TXT=$(LINKS) -dump -width 72
else
if HAVE_LYNX
HTML2TXT=$(LYNX) -dump -width=72 -nolist -stdin -force-html
else
HTML2TXT="true"
endif
##endif
endif
############### YOU SHOULDN'T HAVE TO EDIT ANYTHING BELOW THIS LINE! ################
include Makefile.common

@ -43,13 +43,8 @@ CLEANFILES = faq.txt
MAINTAINERCLEANFILES = \
Makefile.in
# Try our best to convert the FAQ to text.
# The output of html2text.py isn't as pretty as elinks, links, or lynx. If that ever changes, we
# can use it exclusively.
# Convert the FAQ to text.
faq.txt: $(srcdir)/faq.py
$(AM_V_GEN)$(srcdir)/faq.py >$@.tmp && \
command -v elinks > /dev/null && elinks -dump -dump-width 72 -no-numbering -no-references < $@.tmp > $@ || \
command -v links > /dev/null && links -width 72 -html-numbered-links 0 -dump $@.tmp > $@ || \
command -v lynx > /dev/null && lynx -dump -width=72 -nolist -stdin -force-html < $@.tmp > $@ || \
$(srcdir)/../tools/html2text.py --width=72 --no-links $@.tmp > $@ && \
$(AM_V_GEN)$(srcdir)/faq.py -b >$@.tmp && \
$(srcdir)/../tools/html2text.py $@.tmp > $@ && \
rm -f $@.tmp

@ -10,7 +10,7 @@ include ..\config.nmake
all: faq.txt
faq.txt: faq.py
$(PYTHON) faq.py | $(HTML2TXT) > $@
$(PYTHON) faq.py -b | $(PYTHON) $(TOOLS_DIR)\html2text.py > $@
clean:
rm -rf faq.txt

@ -203,9 +203,6 @@ PATH_SPECIFIC_WHITELISTED_LICENSES = {
'tools/pidl': [
'UNKNOWN',
],
'tools/html2text.py': [
'UNKNOWN',
],
'tools/lemon': [
'UNKNOWN',
],

@ -1,504 +1,170 @@
#!/usr/bin/env python
"""html2text: Turn HTML into equivalent Markdown-structured text."""
__version__ = "2.35-Wireshark"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]
# NOTE:
# This is a modified version of html2text.py from http://www.aaronsw.com/2002/html2text/
# Changes:
# Options can now be configured from the command line.
# SKIP_LINKS and INPUT_ENCODING options have been added.
# The script now requires Python 2.3
#
# html2text.py - converts HTML to text
#
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
from __future__ import unicode_literals
__author__ = "Peter Wu <peter@lekensteyn.nl>"
__copyright__ = "Copyright 2015, Peter Wu"
__license__ = "GPL (v2 or later)"
# TODO:
# Support decoded entities with unifiable.
# Relative URL resolution
# Indent sections and lists similar to elinks/links/lynx
if not hasattr(__builtins__, 'True'): True, False = 1, 0
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
import sgmllib
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
from optparse import OptionParser
try: from textwrap import wrap
except: pass
oparser = OptionParser()
options = None
args = None
oparser.add_option(
"--force-unicode",
action="store_true",
dest="UNICODE_SNOB",
default=False,
help="Use Unicode characters instead of their ascii psuedo-replacements. [default: False]",
)
oparser.add_option(
"--links-after-paragraphs",
action="store_true",
dest="LINKS_EACH_PARAGRAPH",
default=False,
help="Put the links after each paragraph instead of at the end. [default: False]",
)
oparser.add_option(
"--width",
type="int",
dest="BODY_WIDTH",
default=78,
help="Wrap long lines at position. 0 for no wrapping. Requires Python 2.3. [default: 78 characters]",
)
oparser.add_option(
"--no-internal-links",
action="store_true",
dest="SKIP_INTERNAL_LINKS",
default=False,
help='''Don't show internal links (href="#local-anchor"). Corresponding link targets won't be visible in the plain text file anyway. [default: False]''',
)
oparser.add_option(
"--no-links",
action="store_true",
dest="SKIP_LINKS",
default=False,
help='''Don't show links. [default: False]''',
)
oparser.add_option(
"--input-encoding",
type="string",
dest="INPUT_ENCODING",
default='utf-8',
help='''Force the encoding of the input file. [default: utf-8]''',
)
### Entity Nonsense ###
def name2cp(k):
if k == 'apos': return ord("'")
if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
return htmlentitydefs.name2codepoint[k]
else:
k = htmlentitydefs.entitydefs[k]
if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
return ord(codecs.latin_1_decode(k)[0])
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
unifiable_n = {}
for k in unifiable.keys():
unifiable_n[name2cp(k)] = unifiable[k]
def charref(name):
global options
if name[0] in ['x','X']:
c = int(name[1:], 16)
else:
c = int(name)
if not options.UNICODE_SNOB and c in unifiable_n.keys():
return unifiable_n[c]
else:
return unichr(c)
def entityref(c):
global options
if not options.UNICODE_SNOB and c in unifiable.keys():
return unifiable[c]
else:
try: name2cp(c)
except KeyError: return "&" + c
else: return unichr(name2cp(c))
def replaceEntities(s):
s = s.group(1)
if s[0] == "#":
return charref(s[1:])
else: return entityref(s)
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(s):
return r_unescape.sub(replaceEntities, s)
def fixattrs(attrs):
# Fix bug in sgmllib.py
if not attrs: return attrs
newattrs = []
for attr in attrs:
newattrs.append((attr[0], unescape(attr[1])))
return newattrs
### End Entity Nonsense ###
def onlywhite(line):
"""Return true if the line does only consist of whitespace characters."""
for c in line:
if c is not ' ' and c is not ' ':
return c is ' '
return line
def optwrap(text):
"""Wrap all paragraphs in the provided text."""
global options
if not options.BODY_WIDTH:
return text
assert wrap, "Requires Python 2.3."
result = ''
newlines = 0
for para in text.split("\n"):
if len(para) > 0:
if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
for line in wrap(para, options.BODY_WIDTH):
result += line + "\n"
result += "\n"
newlines = 2
else:
if not onlywhite(para):
result += para + "\n"
newlines = 1
else:
if newlines < 2:
result += "\n"
newlines += 1
return result
def hn(tag):
if tag[0] == 'h' and len(tag) == 2:
# multiple list indentation levels
# maybe allow for ascii output instead of utf-8?
import sys
from textwrap import TextWrapper
try:
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
except: # Python 3
from html.parser import HTMLParser
from html.entities import name2codepoint
unichr = chr # for html entity handling
class TextHTMLParser(HTMLParser):
"""Converts a HTML document to text."""
def __init__(self):
try:
n = int(tag[1])
if n in range(1, 10): return n
except ValueError: return 0
class _html2text(sgmllib.SGMLParser):
def __init__(self, out=sys.stdout.write):
sgmllib.SGMLParser.__init__(self)
if out is None: self.out = self.outtextf
else: self.out = out
self.outtext = u''
self.quiet = 0
self.p_p = 0
self.outcount = 0
self.start = 1
self.space = 0
self.a = []
self.astack = []
self.acount = 0
self.list = []
self.blockquote = 0
self.pre = 0
self.startpre = 0
self.lastWasNL = 0
self.abbr_title = None # current abbreviation definition
self.abbr_data = None # last inner HTML (for abbr being defined)
self.abbr_list = {} # stack of abbreviations to write later
def outtextf(self, s):
self.outtext += s
def close(self):
sgmllib.SGMLParser.close(self)
self.pbr()
self.o('', 0, 'end')
return self.outtext
def handle_charref(self, c):
self.o(charref(c))
def handle_entityref(self, c):
self.o(entityref(c))
def unknown_starttag(self, tag, attrs):
self.handle_tag(tag, attrs, 1)
def unknown_endtag(self, tag):
self.handle_tag(tag, None, 0)
def previousIndex(self, attrs):
""" returns the index of certain set of attributes (of a link) in the
self.a list
If the set of attributes is not found, returns None
"""
if not attrs.has_key('href'): return None
i = -1
for a in self.a:
i += 1
match = 0
if a.has_key('href') and a['href'] == attrs['href']:
if a.has_key('title') or attrs.has_key('title'):
if (a.has_key('title') and attrs.has_key('title') and
a['title'] == attrs['title']):
match = True
else:
match = True
if match: return i
def handle_tag(self, tag, attrs, start):
global options
attrs = fixattrs(attrs)
if hn(tag):
self.p()
if start: self.o(hn(tag)*"#" + ' ')
if tag in ['p', 'div']: self.p()
if tag == "br" and start: self.o(" \n")
if tag == "hr" and start:
self.p()
self.o("* * *")
self.p()
if tag in ["head", "style", 'script']:
if start: self.quiet += 1
else: self.quiet -= 1
if tag in ["body"]:
self.quiet = 0 # sites like 9rules.com never close <head>
if tag == "blockquote":
if start:
self.p(); self.o('> ', 0, 1); self.start = 1
self.blockquote += 1
else:
self.blockquote -= 1
self.p()
if tag in ['em', 'i', 'u']: self.o("_")
if tag in ['strong', 'b']: self.o("**")
if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
if tag == "abbr":
if start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
self.abbr_title = None
self.abbr_data = ''
if attrs.has_key('title'):
self.abbr_title = attrs['title']
else:
if self.abbr_title != None:
self.abbr_list[self.abbr_data] = self.abbr_title
self.abbr_title = None
self.abbr_data = ''
if tag == "a":
if start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
if attrs.has_key('href') and not (options.SKIP_LINKS or (options.SKIP_INTERNAL_LINKS and attrs['href'].startswith('#'))):
self.astack.append(attrs)
self.o("[")
else:
self.astack.append(None)
else:
if self.astack:
a = self.astack.pop()
if a:
i = self.previousIndex(a)
if i is not None:
a = self.a[i]
else:
self.acount += 1
a['count'] = self.acount
a['outcount'] = self.outcount
self.a.append(a)
self.o("][" + `a['count']` + "]")
if tag == "img" and start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
if attrs.has_key('src'):
attrs['href'] = attrs['src']
alt = attrs.get('alt', '')
i = self.previousIndex(attrs)
if i is not None:
attrs = self.a[i]
else:
self.acount += 1
attrs['count'] = self.acount
attrs['outcount'] = self.outcount
self.a.append(attrs)
self.o("![")
self.o(alt)
self.o("]["+`attrs['count']`+"]")
if tag == 'dl' and start: self.p()
if tag == 'dt' and not start: self.pbr()
if tag == 'dd' and start: self.o(' ')
if tag == 'dd' and not start: self.pbr()
if tag in ["ol", "ul"]:
if start:
self.list.append({'name':tag, 'num':0})
else:
if self.list: self.list.pop()
self.p()
if tag == 'li':
if start:
self.pbr()
if self.list: li = self.list[-1]
else: li = {'name':'ul', 'num':0}
self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
if li['name'] == "ul": self.o("* ")
elif li['name'] == "ol":
li['num'] += 1
self.o(`li['num']`+". ")
self.start = 1
else:
self.pbr()
if tag in ["table", "tr"] and start: self.p()
if tag == 'td': self.pbr()
if tag == "pre":
if start:
self.startpre = 1
self.pre = 1
else:
self.pre = 0
self.p()
def pbr(self):
if self.p_p == 0: self.p_p = 1
def p(self): self.p_p = 2
def o(self, data, puredata=0, force=0):
if self.abbr_data is not None: self.abbr_data += data
if not self.quiet:
if puredata and not self.pre:
data = re.sub('\s+', ' ', data)
if data and data[0] == ' ':
self.space = 1
data = data[1:]
if not data and not force: return
if self.startpre:
#self.out(" :") #TODO: not output when already one there
self.startpre = 0
bq = (">" * self.blockquote)
if not (force and data and data[0] == ">") and self.blockquote: bq += " "
if self.pre:
bq += " "
data = data.replace("\n", "\n"+bq)
if self.start:
self.space = 0
self.p_p = 0
self.start = 0
if force == 'end':
# It's the end.
self.p_p = 0
self.out("\n")
self.space = 0
if self.p_p:
self.out(('\n'+bq)*self.p_p)
self.space = 0
if self.space:
if not self.lastWasNL: self.out(' ')
self.space = 0
if self.a and ((self.p_p == 2 and options.LINKS_EACH_PARAGRAPH) or force == "end"):
if force == "end": self.out("\n")
newa = []
for link in self.a:
if self.outcount > link['outcount']:
self.out(" ["+`link['count']`+"]: " + link['href']) #TODO: base href
if link.has_key('title'): self.out(" ("+link['title']+")")
self.out("\n")
else:
newa.append(link)
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
self.a = newa
if self.abbr_list and force == "end":
for abbr, definition in self.abbr_list.items():
self.out(" *[" + abbr + "]: " + definition + "\n")
self.p_p = 0
self.out(data)
self.lastWasNL = data and data[-1] == '\n'
self.outcount += 1
# Python 3.4
HTMLParser. __init__(self, convert_charrefs=True)
except:
HTMLParser. __init__(self)
# All text, concatenated
self.output_buffer = ''
# The current text block which is being constructed
self.text_block = ''
# Whether the previous element was terminated with whitespace
self.need_space = False
# Whether to prevent word-wrapping the contents (for "pre" tag)
self.skip_wrap = False
# track list items
self.list_item_prefix = None
self.ordered_list_index = None
# Indentation (for heading and paragraphs)
self.indent_levels = [0, 0]
def _wrap_text(self, text):
"""Wraps text, but additionally indent list items."""
initial_indent = indent = sum(self.indent_levels) * ' '
if self.list_item_prefix:
initial_indent += self.list_item_prefix
indent += ' '
wrapper = TextWrapper(width=66, break_on_hyphens=False,
initial_indent=initial_indent, subsequent_indent=indent)
return '\n'.join(wrapper.wrap(text))
def _commit_block(self, newline='\n\n'):
text = self.text_block
if text:
if not self.skip_wrap:
text = self._wrap_text(text)
self.output_buffer += text + newline
self.text_block = ''
self.need_space = False
def handle_starttag(self, tag, attrs):
# end a block of text on <br>, but also flush list items which are not
# terminated.
if tag == 'br' or tag == 'li':
self._commit_block('\n')
if tag == 'pre':
self.skip_wrap = True
# Following list items are numbered.
if tag == 'ol':
self.ordered_list_index = 1
if tag == 'ul':
self.list_item_prefix = ' * '
if tag == 'li' and self.ordered_list_index:
self.list_item_prefix = ' %d. ' % (self.ordered_list_index)
self.ordered_list_index += 1
if tag[0] == 'h' and len(tag) == 2 and \
(tag[1] >= '1' and tag[1] <= '6'):
self.indent_levels = [int(tag[1]) - 1, 0]
if tag == 'p':
self.indent_levels[1] = 1
def handle_data(self, data):
if r'\/script>' in data: self.quiet -= 1
self.o(data, 1)
def unknown_decl(self, data): pass
def wrapwrite(text): sys.stdout.write(text.encode('utf8'))
def html2text_file(html, out=wrapwrite):
global options, args, oparser
if options is None or args is None:
(options, args) = oparser.parse_args(None, None)
h = _html2text(out)
h.feed(html)
h.feed("")
return h.close()
if self.skip_wrap:
block = data
else:
# For normal text, fold multiple whitespace and strip
# leading and trailing spaces for the whole block (but
# keep spaces in the middle).
block = ''
if data.strip() and data[:1].isspace():
# Keep spaces in the middle
self.need_space = True
if self.need_space and data.strip() and self.text_block:
block = ' '
block += ' '.join(data.split())
self.need_space = data[-1:].isspace()
self.text_block += block
def handle_endtag(self, tag):
block_elements = 'p li ul pre ol h1 h2 h3 h4 h5 h6'
#block_elements += ' dl dd dt'
if tag in block_elements.split():
self._commit_block()
if tag in ('ol', 'ul'):
self.list_item_prefix = None
self.ordered_list_index = None
if tag == 'pre':
self.skip_wrap = False
def handle_charref(self, name):
self.handle_data(unichr(int(name)))
def handle_entityref(self, name):
self.handle_data(unichr(name2codepoint[name]))
def html2text(html):
return optwrap(html2text_file(html, None))
def close(self):
HTMLParser.close(self)
self._commit_block()
byte_output = self.output_buffer.encode('utf-8')
if hasattr(sys.stdout, 'buffer'):
sys.stdout.buffer.write(byte_output)
else:
sys.stdout.write(byte_output)
if __name__ == "__main__":
(options, args) = oparser.parse_args()
if len(args) > 0:
arg = args[0]
if arg.startswith('http://'):
j = urllib.urlopen(arg)
try:
from feedparser import _getCharacterEncoding as enc
except ImportError:
enc = lambda x, y: ('utf-8', 1)
text = j.read()
encoding = enc(j.headers, text)[0]
if encoding == 'us-ascii': encoding = 'utf-8'
data = text.decode(encoding)
def main():
htmlparser = TextHTMLParser()
if len(sys.argv) > 1:
if sys.version_info[0] >= 3:
# Python 3: read file as utf-8
kwargs = { 'encoding': 'utf-8' }
else:
data = open(arg, 'r').read().decode(options.INPUT_ENCODING)
kwargs = {}
with open(sys.argv[1], **kwargs) as f:
for line in f:
htmlparser.feed(line)
else:
data = sys.stdin.read().decode(options.INPUT_ENCODING)
wrapwrite(html2text(data))
f = sys.stdin
if hasattr(f, 'buffer'):
# Access raw (byte) buffer in Python 3 instead of decoded one
f = f.buffer
# Read stdin as a Unicode string
htmlparser.feed(f.read().decode('utf-8'))
htmlparser.close()
if __name__ == '__main__':
sys.exit(main())

Loading…
Cancel
Save