Always use html2text.py for FAQ, improve output

A recent commit broke compilation with Python 3. The original author of
html2text.py is deceased and the fork has increased the number of files
for this "simple" helper.

The html2text.py script in this patch was rewritten and its output
matches with lynx (except for a few newlines around lists). This means
that indentation has been added for headings, paragraphs and lists.
Also, since it was written from scratch, a new license could be chosen
that matches Wireshark.

Since now the in-tree html2text.py script provides nicer output, remove
detection of the alternative programs (elinks, links). lynx/w3m is
somehow still necessary for asciidoc though.

(I also looked into reusing html2text.py for the release notes to
replace asciidoc, but asciidoc's --format=html output differs: it adds
a ToC and section numbers. For now, lynx is still required for the
release notes.)

Tested with Python 2.6.6, 2.7.9, 3.2.6 and 3.4.3 under LC_ALL=C and
LC_ALL=en_US.UTF-8 on Linux. Tested reading from stdin and file, writing
to file, pipe and tty. Tested with cmake (Ninja) and autotools on Arch
Linux x86_64. Test:

    # For each $PATH per python version, execute (with varying LC_ALL)
    help/faq.py -b | tools/html2text.py /dev/stdin | md5sum
    help/faq.py -b | tools/html2text.py | md5sum
    help/faq.py -b | tools/html2text.py
    help/faq.py -b | tools/html2text.py >/dev/null

Change-Id: I6409450a3e6c8b010ca082251f9db7358b0cc2fd
Reviewed-on: https://code.wireshark.org/review/7779
Petri-Dish: Peter Wu <peter@lekensteyn.nl>
Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org>
Reviewed-by: Anders Broman <a.broman58@gmail.com>
This commit is contained in:
Peter Wu 2015-03-21 11:57:01 +01:00 committed by Anders Broman
parent 83b6338673
commit 68698db8cc
10 changed files with 168 additions and 566 deletions

View File

@@ -1416,10 +1416,9 @@ else()
endforeach() endforeach()
endif(WIN32) endif(WIN32)
add_custom_command(TARGET copy_data_files PRE_BUILD add_custom_command(TARGET copy_data_files PRE_BUILD
COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/help/faq.py > faq.tmp.html COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/help/faq.py -b > faq.tmp.html
COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/html2text.py COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/html2text.py
--width=72 --no-links faq.tmp.html faq.tmp.html > "${DATAFILE_DIR}/help/faq.txt"
> "${DATAFILE_DIR}/help/faq.txt"
COMMAND ${CMAKE_COMMAND} -E remove faq.tmp.html COMMAND ${CMAKE_COMMAND} -E remove faq.tmp.html
) )

View File

@@ -24,7 +24,6 @@ covered by other licenses that are not themselves directly compatible with the
GPLv2. This is OK, as only the tools themselves are licensed this way, the GPLv2. This is OK, as only the tools themselves are licensed this way, the
output of the tools is not considered a derived work, and so can be safely output of the tools is not considered a derived work, and so can be safely
licensed for Wireshark's use. An incomplete selection of these tools includes: licensed for Wireshark's use. An incomplete selection of these tools includes:
- the html2text utility (tools/html2text.py) is licensed under the GPLv3.
- the pidl utility (tools/pidl) is licensed under the GPLv3+. - the pidl utility (tools/pidl) is licensed under the GPLv3+.
Parts of Wireshark can be built and distributed as libraries. These Parts of Wireshark can be built and distributed as libraries. These

View File

@@ -1,6 +1,6 @@
# #
# - Find unix commands from cygwin # - Find unix commands from cygwin
# This module looks for some usual Unix commands. # This module looks for lynx (used by asciidoc)
# #
INCLUDE(FindCygwin) INCLUDE(FindCygwin)
@@ -8,9 +8,6 @@ INCLUDE(FindCygwin)
FIND_PROGRAM(LYNX_EXECUTABLE FIND_PROGRAM(LYNX_EXECUTABLE
NAMES NAMES
lynx lynx
elinks
links
true
PATHS PATHS
${CYGWIN_INSTALL_PATH}/bin ${CYGWIN_INSTALL_PATH}/bin
/bin /bin
@@ -23,18 +20,3 @@ INCLUDE(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(LYNX DEFAULT_MSG LYNX_EXECUTABLE) FIND_PACKAGE_HANDLE_STANDARD_ARGS(LYNX DEFAULT_MSG LYNX_EXECUTABLE)
MARK_AS_ADVANCED(LYNX_EXECUTABLE) MARK_AS_ADVANCED(LYNX_EXECUTABLE)
# Convert html to text
IF(LYNX_EXECUTABLE MATCHES lynx)
# (See Bug # 1446 for note re 'force-html' below)
set(HTML2TXT "lynx -dump -width=72 -nolist -stdin -force-html")
ELSEIF(LYNX_EXECUTABLE MATCHES elinks)
set(HTML2TXT "elinks -dump -dump-width 72")
ELSEIF(LYNX_EXECUTABLE MATCHES links)
set(HTML2TXT "links -dump -width 72")
ELSEIF(LYNX_EXECUTABLE MATCHES true)
set(HTML2TXT "true")
ELSE()
message(ERROR "Should never be reached - please report!")
ENDIF()
message(STATUS "html2text: ${HTML2TXT}")

View File

@@ -79,8 +79,7 @@ PROGRAM_FILES=$(PROGRAMFILES)
PROGRAM_FILES_W6432=$(PROGRAMW6432) PROGRAM_FILES_W6432=$(PROGRAMW6432)
# #
# Location of the "tools" directory. This affects HTML2TXT below and should # Location of the "tools" directory. This affects the path to textify.ps1
# be overridden by makefiles in any subdirectories that use HTML2TXT.
!IFNDEF TOOLS_DIR !IFNDEF TOOLS_DIR
TOOLS_DIR=tools TOOLS_DIR=tools
!ENDIF !ENDIF
@@ -1321,17 +1320,6 @@ FOP=$(WIRESHARK_LIB_DIR)\fop-1.0\fop.bat
# Additional options to fop. # Additional options to fop.
FOP_OPTS=-Xmx256m FOP_OPTS=-Xmx256m
# html to text converter for text version of release notes, e.g. elinks.
# This could also be "lynx", or "true" if neither elinks nor lynx is installed
# (cygwin: lynx works, elinks not available, links and true doesn't produce output)
#HTML2TXT=elinks -dump -dump-width 72
##HTML2TXT=links -dump -width 72 ## XXX: Fails: For links -dump requires 'url' (filename) arg.
#HTML2TXT=lynx -dump -width=72 -nolist -stdin
!IFNDEF HTML2TXT
HTML2TXT=$(PYTHON) $(TOOLS_DIR)\html2text.py --width=72 --no-links
!ENDIF
# the XSL processor (part of cygwin's libxslt package) # the XSL processor (part of cygwin's libxslt package)
XSLTPROC="xsltproc" XSLTPROC="xsltproc"

View File

@@ -760,23 +760,19 @@ AC_PATH_PROG(A2X, a2x)
AC_CHECK_PROG(HAVE_A2X, a2x, "yes", "no") AC_CHECK_PROG(HAVE_A2X, a2x, "yes", "no")
AM_CONDITIONAL(HAVE_A2X, test x$HAVE_A2X = xyes) AM_CONDITIONAL(HAVE_A2X, test x$HAVE_A2X = xyes)
# Want to control a tape drive? Use mt. Want to convert HTML to text?
# Uhhhhh... elinks? lynx? w3m? pandoc? html2text?
AC_PATH_PROG(ELINKS, elinks)
AC_CHECK_PROG(HAVE_ELINKS, elinks, "yes", "no")
AM_CONDITIONAL(HAVE_ELINKS, test x$HAVE_ELINKS = xyes)
# Check for fop (translate .fo to e.g. pdf) # Check for fop (translate .fo to e.g. pdf)
AC_PATH_PROG(FOP, fop) AC_PATH_PROG(FOP, fop)
AC_CHECK_PROG(HAVE_FOP, fop, "yes", "no") AC_CHECK_PROG(HAVE_FOP, fop, "yes", "no")
AM_CONDITIONAL(HAVE_FOP, test x$HAVE_FOP = xyes) AM_CONDITIONAL(HAVE_FOP, test x$HAVE_FOP = xyes)
# Check for lynx (html -> text) # TODO: HAVE_LYNX and HAVE_W3M are unused. Maybe require one of them
# to be found when a2x is enabled? Otherwise it will fail later...
# Check for lynx (asciidoc text format from html)
AC_PATH_PROG(LYNX, lynx) AC_PATH_PROG(LYNX, lynx)
AC_CHECK_PROG(HAVE_LYNX, lynx, "yes", "no") AC_CHECK_PROG(HAVE_LYNX, lynx, "yes", "no")
AM_CONDITIONAL(HAVE_LYNX, test x$HAVE_LYNX = xyes) AM_CONDITIONAL(HAVE_LYNX, test x$HAVE_LYNX = xyes)
# Check for w3m (html -> text) # Check for w3m (asciidoc text format from html)
AC_PATH_PROG(W3M, w3m) AC_PATH_PROG(W3M, w3m)
AC_CHECK_PROG(HAVE_W3M, w3m, "yes", "no") AC_CHECK_PROG(HAVE_W3M, w3m, "yes", "no")
AM_CONDITIONAL(HAVE_W3M, test x$HAVE_W3M = xyes) AM_CONDITIONAL(HAVE_W3M, test x$HAVE_W3M = xyes)

View File

@@ -18,26 +18,6 @@ A2X_TEXT_OPTS=
A2X_TEXT_OPTS+="--lynx" A2X_TEXT_OPTS+="--lynx"
#endif #endif
# html to text converter for text version of release notes, e.g. elinks.
# This could also be "lynx", or "true" if neither elinks nor lynx is installed
# (See Bug # 1446 for note re 'force-html' below)
# Sorry about the indenting, but that's what automake requires...
if HAVE_ELINKS
HTML2TXT=$(ELINKS) -dump -dump-width 72
## links: -dump requires 'url' argument (as opposed to elinks & lynx)
## (Rather than fixing things we'll just disable the use of links).
##else
##if HAVE_LINKS
##HTML2TXT=$(LINKS) -dump -width 72
else
if HAVE_LYNX
HTML2TXT=$(LYNX) -dump -width=72 -nolist -stdin -force-html
else
HTML2TXT="true"
endif
##endif
endif
############### YOU SHOULDN'T HAVE TO EDIT ANYTHING BELOW THIS LINE! ################ ############### YOU SHOULDN'T HAVE TO EDIT ANYTHING BELOW THIS LINE! ################
include Makefile.common include Makefile.common

View File

@@ -43,13 +43,8 @@ CLEANFILES = faq.txt
MAINTAINERCLEANFILES = \ MAINTAINERCLEANFILES = \
Makefile.in Makefile.in
# Try our best to convert the FAQ to text. # Convert the FAQ to text.
# The output of html2text.py isn't as pretty as elinks, links, or lynx. If that ever changes, we
# can use it exclusively.
faq.txt: $(srcdir)/faq.py faq.txt: $(srcdir)/faq.py
$(AM_V_GEN)$(srcdir)/faq.py >$@.tmp && \ $(AM_V_GEN)$(srcdir)/faq.py -b >$@.tmp && \
command -v elinks > /dev/null && elinks -dump -dump-width 72 -no-numbering -no-references < $@.tmp > $@ || \ $(srcdir)/../tools/html2text.py $@.tmp > $@ && \
command -v links > /dev/null && links -width 72 -html-numbered-links 0 -dump $@.tmp > $@ || \
command -v lynx > /dev/null && lynx -dump -width=72 -nolist -stdin -force-html < $@.tmp > $@ || \
$(srcdir)/../tools/html2text.py --width=72 --no-links $@.tmp > $@ && \
rm -f $@.tmp rm -f $@.tmp

View File

@@ -10,7 +10,7 @@ include ..\config.nmake
all: faq.txt all: faq.txt
faq.txt: faq.py faq.txt: faq.py
$(PYTHON) faq.py | $(HTML2TXT) > $@ $(PYTHON) faq.py -b | $(PYTHON) $(TOOLS_DIR)\html2text.py > $@
clean: clean:
rm -rf faq.txt rm -rf faq.txt

View File

@@ -203,9 +203,6 @@ PATH_SPECIFIC_WHITELISTED_LICENSES = {
'tools/pidl': [ 'tools/pidl': [
'UNKNOWN', 'UNKNOWN',
], ],
'tools/html2text.py': [
'UNKNOWN',
],
'tools/lemon': [ 'tools/lemon': [
'UNKNOWN', 'UNKNOWN',
], ],

View File

@@ -1,504 +1,170 @@
#!/usr/bin/env python #!/usr/bin/env python
"""html2text: Turn HTML into equivalent Markdown-structured text.""" #
__version__ = "2.35-Wireshark" # html2text.py - converts HTML to text
__author__ = "Aaron Swartz (me@aaronsw.com)" #
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." # Wireshark - Network traffic analyzer
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"] # By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# NOTE: from __future__ import unicode_literals
# This is a modified version of html2text.py from http://www.aaronsw.com/2002/html2text/
# Changes: __author__ = "Peter Wu <peter@lekensteyn.nl>"
# Options can now be configured from the command line. __copyright__ = "Copyright 2015, Peter Wu"
# SKIP_LINKS and INPUT_ENCODING options have been added. __license__ = "GPL (v2 or later)"
# The script now requires Python 2.3
# TODO: # TODO:
# Support decoded entities with unifiable. # multiple list indentation levels
# Relative URL resolution # maybe allow for ascii output instead of utf-8?
# Indent sections and lists similar to elinks/links/lynx
if not hasattr(__builtins__, 'True'): True, False = 1, 0 import sys
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types from textwrap import TextWrapper
import sgmllib try:
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') from HTMLParser import HTMLParser
from optparse import OptionParser from htmlentitydefs import name2codepoint
except: # Python 3
from html.parser import HTMLParser
from html.entities import name2codepoint
unichr = chr # for html entity handling
try: from textwrap import wrap class TextHTMLParser(HTMLParser):
except: pass """Converts a HTML document to text."""
def __init__(self):
oparser = OptionParser()
options = None
args = None
oparser.add_option(
"--force-unicode",
action="store_true",
dest="UNICODE_SNOB",
default=False,
help="Use Unicode characters instead of their ascii psuedo-replacements. [default: False]",
)
oparser.add_option(
"--links-after-paragraphs",
action="store_true",
dest="LINKS_EACH_PARAGRAPH",
default=False,
help="Put the links after each paragraph instead of at the end. [default: False]",
)
oparser.add_option(
"--width",
type="int",
dest="BODY_WIDTH",
default=78,
help="Wrap long lines at position. 0 for no wrapping. Requires Python 2.3. [default: 78 characters]",
)
oparser.add_option(
"--no-internal-links",
action="store_true",
dest="SKIP_INTERNAL_LINKS",
default=False,
help='''Don't show internal links (href="#local-anchor"). Corresponding link targets won't be visible in the plain text file anyway. [default: False]''',
)
oparser.add_option(
"--no-links",
action="store_true",
dest="SKIP_LINKS",
default=False,
help='''Don't show links. [default: False]''',
)
oparser.add_option(
"--input-encoding",
type="string",
dest="INPUT_ENCODING",
default='utf-8',
help='''Force the encoding of the input file. [default: utf-8]''',
)
### Entity Nonsense ###
def name2cp(k):
if k == 'apos': return ord("'")
if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
return htmlentitydefs.name2codepoint[k]
else:
k = htmlentitydefs.entitydefs[k]
if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
return ord(codecs.latin_1_decode(k)[0])
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
unifiable_n = {}
for k in unifiable.keys():
unifiable_n[name2cp(k)] = unifiable[k]
def charref(name):
global options
if name[0] in ['x','X']:
c = int(name[1:], 16)
else:
c = int(name)
if not options.UNICODE_SNOB and c in unifiable_n.keys():
return unifiable_n[c]
else:
return unichr(c)
def entityref(c):
global options
if not options.UNICODE_SNOB and c in unifiable.keys():
return unifiable[c]
else:
try: name2cp(c)
except KeyError: return "&" + c
else: return unichr(name2cp(c))
def replaceEntities(s):
s = s.group(1)
if s[0] == "#":
return charref(s[1:])
else: return entityref(s)
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(s):
return r_unescape.sub(replaceEntities, s)
def fixattrs(attrs):
# Fix bug in sgmllib.py
if not attrs: return attrs
newattrs = []
for attr in attrs:
newattrs.append((attr[0], unescape(attr[1])))
return newattrs
### End Entity Nonsense ###
def onlywhite(line):
"""Return true if the line does only consist of whitespace characters."""
for c in line:
if c is not ' ' and c is not ' ':
return c is ' '
return line
def optwrap(text):
"""Wrap all paragraphs in the provided text."""
global options
if not options.BODY_WIDTH:
return text
assert wrap, "Requires Python 2.3."
result = ''
newlines = 0
for para in text.split("\n"):
if len(para) > 0:
if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
for line in wrap(para, options.BODY_WIDTH):
result += line + "\n"
result += "\n"
newlines = 2
else:
if not onlywhite(para):
result += para + "\n"
newlines = 1
else:
if newlines < 2:
result += "\n"
newlines += 1
return result
def hn(tag):
if tag[0] == 'h' and len(tag) == 2:
try: try:
n = int(tag[1]) # Python 3.4
if n in range(1, 10): return n HTMLParser. __init__(self, convert_charrefs=True)
except ValueError: return 0 except:
HTMLParser. __init__(self)
# All text, concatenated
self.output_buffer = ''
# The current text block which is being constructed
self.text_block = ''
# Whether the previous element was terminated with whitespace
self.need_space = False
# Whether to prevent word-wrapping the contents (for "pre" tag)
self.skip_wrap = False
# track list items
self.list_item_prefix = None
self.ordered_list_index = None
# Indentation (for heading and paragraphs)
self.indent_levels = [0, 0]
class _html2text(sgmllib.SGMLParser): def _wrap_text(self, text):
def __init__(self, out=sys.stdout.write): """Wraps text, but additionally indent list items."""
sgmllib.SGMLParser.__init__(self) initial_indent = indent = sum(self.indent_levels) * ' '
if self.list_item_prefix:
if out is None: self.out = self.outtextf initial_indent += self.list_item_prefix
else: self.out = out indent += ' '
self.outtext = u'' wrapper = TextWrapper(width=66, break_on_hyphens=False,
self.quiet = 0 initial_indent=initial_indent, subsequent_indent=indent)
self.p_p = 0 return '\n'.join(wrapper.wrap(text))
self.outcount = 0
self.start = 1
self.space = 0
self.a = []
self.astack = []
self.acount = 0
self.list = []
self.blockquote = 0
self.pre = 0
self.startpre = 0
self.lastWasNL = 0
self.abbr_title = None # current abbreviation definition
self.abbr_data = None # last inner HTML (for abbr being defined)
self.abbr_list = {} # stack of abbreviations to write later
def outtextf(self, s):
self.outtext += s
def close(self):
sgmllib.SGMLParser.close(self)
self.pbr()
self.o('', 0, 'end')
return self.outtext
def handle_charref(self, c):
self.o(charref(c))
def handle_entityref(self, c): def _commit_block(self, newline='\n\n'):
self.o(entityref(c)) text = self.text_block
if text:
def unknown_starttag(self, tag, attrs): if not self.skip_wrap:
self.handle_tag(tag, attrs, 1) text = self._wrap_text(text)
self.output_buffer += text + newline
def unknown_endtag(self, tag): self.text_block = ''
self.handle_tag(tag, None, 0) self.need_space = False
def previousIndex(self, attrs):
""" returns the index of certain set of attributes (of a link) in the
self.a list
If the set of attributes is not found, returns None
"""
if not attrs.has_key('href'): return None
i = -1
for a in self.a:
i += 1
match = 0
if a.has_key('href') and a['href'] == attrs['href']:
if a.has_key('title') or attrs.has_key('title'):
if (a.has_key('title') and attrs.has_key('title') and
a['title'] == attrs['title']):
match = True
else:
match = True
if match: return i def handle_starttag(self, tag, attrs):
# end a block of text on <br>, but also flush list items which are not
def handle_tag(self, tag, attrs, start): # terminated.
global options if tag == 'br' or tag == 'li':
attrs = fixattrs(attrs) self._commit_block('\n')
if tag == 'pre':
if hn(tag): self.skip_wrap = True
self.p() # Following list items are numbered.
if start: self.o(hn(tag)*"#" + ' ') if tag == 'ol':
self.ordered_list_index = 1
if tag in ['p', 'div']: self.p() if tag == 'ul':
self.list_item_prefix = ' * '
if tag == "br" and start: self.o(" \n") if tag == 'li' and self.ordered_list_index:
self.list_item_prefix = ' %d. ' % (self.ordered_list_index)
if tag == "hr" and start: self.ordered_list_index += 1
self.p() if tag[0] == 'h' and len(tag) == 2 and \
self.o("* * *") (tag[1] >= '1' and tag[1] <= '6'):
self.p() self.indent_levels = [int(tag[1]) - 1, 0]
if tag == 'p':
if tag in ["head", "style", 'script']: self.indent_levels[1] = 1
if start: self.quiet += 1
else: self.quiet -= 1
if tag in ["body"]:
self.quiet = 0 # sites like 9rules.com never close <head>
if tag == "blockquote":
if start:
self.p(); self.o('> ', 0, 1); self.start = 1
self.blockquote += 1
else:
self.blockquote -= 1
self.p()
if tag in ['em', 'i', 'u']: self.o("_")
if tag in ['strong', 'b']: self.o("**")
if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
if tag == "abbr":
if start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
self.abbr_title = None
self.abbr_data = ''
if attrs.has_key('title'):
self.abbr_title = attrs['title']
else:
if self.abbr_title != None:
self.abbr_list[self.abbr_data] = self.abbr_title
self.abbr_title = None
self.abbr_data = ''
if tag == "a":
if start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
if attrs.has_key('href') and not (options.SKIP_LINKS or (options.SKIP_INTERNAL_LINKS and attrs['href'].startswith('#'))):
self.astack.append(attrs)
self.o("[")
else:
self.astack.append(None)
else:
if self.astack:
a = self.astack.pop()
if a:
i = self.previousIndex(a)
if i is not None:
a = self.a[i]
else:
self.acount += 1
a['count'] = self.acount
a['outcount'] = self.outcount
self.a.append(a)
self.o("][" + `a['count']` + "]")
if tag == "img" and start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
if attrs.has_key('src'):
attrs['href'] = attrs['src']
alt = attrs.get('alt', '')
i = self.previousIndex(attrs)
if i is not None:
attrs = self.a[i]
else:
self.acount += 1
attrs['count'] = self.acount
attrs['outcount'] = self.outcount
self.a.append(attrs)
self.o("![")
self.o(alt)
self.o("]["+`attrs['count']`+"]")
if tag == 'dl' and start: self.p()
if tag == 'dt' and not start: self.pbr()
if tag == 'dd' and start: self.o(' ')
if tag == 'dd' and not start: self.pbr()
if tag in ["ol", "ul"]:
if start:
self.list.append({'name':tag, 'num':0})
else:
if self.list: self.list.pop()
self.p()
if tag == 'li':
if start:
self.pbr()
if self.list: li = self.list[-1]
else: li = {'name':'ul', 'num':0}
self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
if li['name'] == "ul": self.o("* ")
elif li['name'] == "ol":
li['num'] += 1
self.o(`li['num']`+". ")
self.start = 1
else:
self.pbr()
if tag in ["table", "tr"] and start: self.p()
if tag == 'td': self.pbr()
if tag == "pre":
if start:
self.startpre = 1
self.pre = 1
else:
self.pre = 0
self.p()
def pbr(self):
if self.p_p == 0: self.p_p = 1
def p(self): self.p_p = 2
def o(self, data, puredata=0, force=0):
if self.abbr_data is not None: self.abbr_data += data
if not self.quiet:
if puredata and not self.pre:
data = re.sub('\s+', ' ', data)
if data and data[0] == ' ':
self.space = 1
data = data[1:]
if not data and not force: return
if self.startpre:
#self.out(" :") #TODO: not output when already one there
self.startpre = 0
bq = (">" * self.blockquote)
if not (force and data and data[0] == ">") and self.blockquote: bq += " "
if self.pre:
bq += " "
data = data.replace("\n", "\n"+bq)
if self.start:
self.space = 0
self.p_p = 0
self.start = 0
if force == 'end':
# It's the end.
self.p_p = 0
self.out("\n")
self.space = 0
if self.p_p:
self.out(('\n'+bq)*self.p_p)
self.space = 0
if self.space:
if not self.lastWasNL: self.out(' ')
self.space = 0
if self.a and ((self.p_p == 2 and options.LINKS_EACH_PARAGRAPH) or force == "end"):
if force == "end": self.out("\n")
newa = []
for link in self.a:
if self.outcount > link['outcount']:
self.out(" ["+`link['count']`+"]: " + link['href']) #TODO: base href
if link.has_key('title'): self.out(" ("+link['title']+")")
self.out("\n")
else:
newa.append(link)
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
self.a = newa
if self.abbr_list and force == "end":
for abbr, definition in self.abbr_list.items():
self.out(" *[" + abbr + "]: " + definition + "\n")
self.p_p = 0
self.out(data)
self.lastWasNL = data and data[-1] == '\n'
self.outcount += 1
def handle_data(self, data): def handle_data(self, data):
if r'\/script>' in data: self.quiet -= 1 if self.skip_wrap:
self.o(data, 1) block = data
def unknown_decl(self, data): pass
def wrapwrite(text): sys.stdout.write(text.encode('utf8'))
def html2text_file(html, out=wrapwrite):
global options, args, oparser
if options is None or args is None:
(options, args) = oparser.parse_args(None, None)
h = _html2text(out)
h.feed(html)
h.feed("")
return h.close()
def html2text(html):
return optwrap(html2text_file(html, None))
if __name__ == "__main__":
(options, args) = oparser.parse_args()
if len(args) > 0:
arg = args[0]
if arg.startswith('http://'):
j = urllib.urlopen(arg)
try:
from feedparser import _getCharacterEncoding as enc
except ImportError:
enc = lambda x, y: ('utf-8', 1)
text = j.read()
encoding = enc(j.headers, text)[0]
if encoding == 'us-ascii': encoding = 'utf-8'
data = text.decode(encoding)
else: else:
data = open(arg, 'r').read().decode(options.INPUT_ENCODING) # For normal text, fold multiple whitespace and strip
else: # leading and trailing spaces for the whole block (but
data = sys.stdin.read().decode(options.INPUT_ENCODING) # keep spaces in the middle).
wrapwrite(html2text(data)) block = ''
if data.strip() and data[:1].isspace():
# Keep spaces in the middle
self.need_space = True
if self.need_space and data.strip() and self.text_block:
block = ' '
block += ' '.join(data.split())
self.need_space = data[-1:].isspace()
self.text_block += block
def handle_endtag(self, tag):
block_elements = 'p li ul pre ol h1 h2 h3 h4 h5 h6'
#block_elements += ' dl dd dt'
if tag in block_elements.split():
self._commit_block()
if tag in ('ol', 'ul'):
self.list_item_prefix = None
self.ordered_list_index = None
if tag == 'pre':
self.skip_wrap = False
def handle_charref(self, name):
self.handle_data(unichr(int(name)))
def handle_entityref(self, name):
self.handle_data(unichr(name2codepoint[name]))
def close(self):
HTMLParser.close(self)
self._commit_block()
byte_output = self.output_buffer.encode('utf-8')
if hasattr(sys.stdout, 'buffer'):
sys.stdout.buffer.write(byte_output)
else:
sys.stdout.write(byte_output)
def main():
htmlparser = TextHTMLParser()
if len(sys.argv) > 1:
if sys.version_info[0] >= 3:
# Python 3: read file as utf-8
kwargs = { 'encoding': 'utf-8' }
else:
kwargs = {}
with open(sys.argv[1], **kwargs) as f:
for line in f:
htmlparser.feed(line)
else:
f = sys.stdin
if hasattr(f, 'buffer'):
# Access raw (byte) buffer in Python 3 instead of decoded one
f = f.buffer
# Read stdin as as Unicode string
htmlparser.feed(f.read().decode('utf-8'))
htmlparser.close()
if __name__ == '__main__':
sys.exit(main())