updates

2017-07-28 19:37:39 -04:00
parent 60e4685ce8
commit 3cc42f282f
1046 changed files with 162392 additions and 9 deletions
@@ -0,0 +1,391 @@
+"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
+
+Examples
+
+This program extracts all links from a document.  It will print one
+line for each link, containing the URL and the textual description
+between the <A>...</A> tags:
+
+import pullparser, sys
+f = file(sys.argv[1])
+p = pullparser.PullParser(f)
+for token in p.tags("a"):
+    if token.type == "endtag": continue
+    url = dict(token.attrs).get("href", "-")
+    text = p.get_compressed_text(endat=("endtag", "a"))
+    print "%s\t%s" % (url, text)
+
+This program extracts the <TITLE> from the document:
+
+import pullparser, sys
+f = file(sys.argv[1])
+p = pullparser.PullParser(f)
+if p.get_tag("title"):
+    title = p.get_compressed_text()
+    print "Title: %s" % title
+
+
+Copyright 2003-2006 John J. Lee <jjl@pobox.com>
+Copyright 1998-2001 Gisle Aas (original libwww-perl code)
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses.
+
+"""
+
+import re, htmlentitydefs
+import _sgmllib_copy as sgmllib
+import HTMLParser
+from xml.sax import saxutils
+
+from _html import unescape, unescape_charref
+
+
+class NoMoreTokensError(Exception): pass
+
+class Token:
+    """Represents an HTML tag, declaration, processing instruction etc.
+
+    Behaves as both a tuple-like object (ie. iterable) and has attributes
+    .type, .data and .attrs.
+
+    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
+    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
+    True
+    >>> (t.type, t.data) == ("starttag", "a")
+    True
+    >>> t.attrs == [("href", "http://www.python.org/")]
+    True
+
+    Public attributes
+
+    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
+     "data", "comment", "decl", "pi", after the corresponding methods of
+     HTMLParser.HTMLParser
+    data: For a tag, the tag name; otherwise, the relevant data carried by the
+     tag, as a string
+    attrs: list of (name, value) pairs representing HTML attributes
+     (or None if token does not represent an opening tag)
+
+    """
+    def __init__(self, type, data, attrs=None):
+        self.type = type
+        self.data = data
+        self.attrs = attrs
+    def __iter__(self):
+        return iter((self.type, self.data, self.attrs))
+    def __eq__(self, other):
+        type, data, attrs = other
+        if (self.type == type and
+            self.data == data and
+            self.attrs == attrs):
+            return True
+        else:
+            return False
+    def __ne__(self, other): return not self.__eq__(other)
+    def __repr__(self):
+        args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
+        return self.__class__.__name__+"(%s)" % args
+
+    def __str__(self):
+        """
+        >>> print Token("starttag", "br")
+        <br>
+        >>> print Token("starttag", "a",
+        ...     [("href", "http://www.python.org/"), ("alt", '"foo"')])
+        <a href="http://www.python.org/" alt='"foo"'>
+        >>> print Token("startendtag", "br")
+        <br />
+        >>> print Token("startendtag", "br", [("spam", "eggs")])
+        <br spam="eggs" />
+        >>> print Token("endtag", "p")
+        </p>
+        >>> print Token("charref", "38")
+        &#38;
+        >>> print Token("entityref", "amp")
+        &amp;
+        >>> print Token("data", "foo\\nbar")
+        foo
+        bar
+        >>> print Token("comment", "Life is a bowl\\nof cherries.")
+        <!--Life is a bowl
+        of cherries.-->
+        >>> print Token("decl", "decl")
+        <!decl>
+        >>> print Token("pi", "pi")
+        <?pi>
+        """
+        if self.attrs is not None:
+            attrs = "".join([" %s=%s" % (k, saxutils.quoteattr(v)) for
+                             k, v in self.attrs])
+        else:
+            attrs = ""
+        if self.type == "starttag":
+            return "<%s%s>" % (self.data, attrs)
+        elif self.type == "startendtag":
+            return "<%s%s />" % (self.data, attrs)
+        elif self.type == "endtag":
+            return "</%s>" % self.data
+        elif self.type == "charref":
+            return "&#%s;" % self.data
+        elif self.type == "entityref":
+            return "&%s;" % self.data
+        elif self.type == "data":
+            return self.data
+        elif self.type == "comment":
+            return "<!--%s-->" % self.data
+        elif self.type == "decl":
+            return "<!%s>" % self.data
+        elif self.type == "pi":
+            return "<?%s>" % self.data
+        assert False
+
+
+def iter_until_exception(fn, exception, *args, **kwds):
+    while 1:
+        try:
+            yield fn(*args, **kwds)
+        except exception:
+            raise StopIteration
+
+
+class _AbstractParser:
+    chunk = 1024
+    compress_re = re.compile(r"\s+")
+    def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
+                 encoding="ascii", entitydefs=None):
+        """
+        fh: file-like object (only a .read() method is required) from which to
+         read HTML to be parsed
+        textify: mapping used by .get_text() and .get_compressed_text() methods
+         to represent opening tags as text
+        encoding: encoding used to encode numeric character references by
+         .get_text() and .get_compressed_text() ("ascii" by default)
+
+        entitydefs: mapping like {"amp": "&", ...} containing HTML entity
+         definitions (a sensible default is used).  This is used to unescape
+         entities in .get_text() (and .get_compressed_text()) and attribute
+         values.  If the encoding can not represent the character, the entity
+         reference is left unescaped.  Note that entity references (both
+         numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
+         unescaped in attribute values and the return value of .get_text(), but
+         not in data outside of tags.  Instead, entity references outside of
+         tags are represented as tokens.  This is a bit odd, it's true :-/
+
+        If the element name of an opening tag matches a key in the textify
+        mapping then that tag is converted to text.  The corresponding value is
+        used to specify which tag attribute to obtain the text from.  textify
+        maps from element names to either:
+
+          - an HTML attribute name, in which case the HTML attribute value is
+            used as its text value along with the element name in square
+            brackets (e.g. "alt text goes here[IMG]", or, if the alt attribute
+            were missing, just "[IMG]")
+          - a callable object (e.g. a function) which takes a Token and returns
+            the string to be used as its text value
+
+        If textify has no key for an element name, nothing is substituted for
+        the opening tag.
+
+        Public attributes:
+
+        encoding and textify: see above
+
+        """
+        self._fh = fh
+        self._tokenstack = []  # FIFO
+        self.textify = textify
+        self.encoding = encoding
+        if entitydefs is None:
+            entitydefs = htmlentitydefs.name2codepoint
+        self._entitydefs = entitydefs
+
+    def __iter__(self): return self
+
+    def tags(self, *names):
+        return iter_until_exception(self.get_tag, NoMoreTokensError, *names)
+
+    def tokens(self, *tokentypes):
+        return iter_until_exception(self.get_token, NoMoreTokensError,
+                                    *tokentypes)
+
+    def next(self):
+        try:
+            return self.get_token()
+        except NoMoreTokensError:
+            raise StopIteration()
+
+    def get_token(self, *tokentypes):
+        """Pop the next Token object from the stack of parsed tokens.
+
+        If arguments are given, they are taken to be token types in which the
+        caller is interested: tokens representing other elements will be
+        skipped.  Element names must be given in lower case.
+
+        Raises NoMoreTokensError.
+
+        """
+        while 1:
+            while self._tokenstack:
+                token = self._tokenstack.pop(0)
+                if tokentypes:
+                    if token.type in tokentypes:
+                        return token
+                else:
+                    return token
+            data = self._fh.read(self.chunk)
+            if not data:
+                raise NoMoreTokensError()
+            self.feed(data)
+
+    def unget_token(self, token):
+        """Push a Token back onto the stack."""
+        self._tokenstack.insert(0, token)
+
+    def get_tag(self, *names):
+        """Return the next Token that represents an opening or closing tag.
+
+        If arguments are given, they are taken to be element names in which the
+        caller is interested: tags representing other elements will be skipped.
+        Element names must be given in lower case.
+
+        Raises NoMoreTokensError.
+
+        """
+        while 1:
+            tok = self.get_token()
+            if tok.type not in ["starttag", "endtag", "startendtag"]:
+                continue
+            if names:
+                if tok.data in names:
+                    return tok
+            else:
+                return tok
+
+    def get_text(self, endat=None):
+        """Get some text.
+
+        endat: stop reading text at this tag (the tag is included in the
+         returned text); endtag is a tuple (type, name) where type is
+         "starttag", "endtag" or "startendtag", and name is the element name of
+         the tag (element names must be given in lower case)
+
+        If endat is not given, .get_text() will stop at the next opening or
+        closing tag, or when there are no more tokens (no exception is raised).
+        Note that .get_text() includes the text representation (if any) of the
+        opening tag, but pushes the opening tag back onto the stack.  As a
+        result, if you want to call .get_text() again, you need to call
+        .get_tag() first (unless you want an empty string returned when you
+        next call .get_text()).
+
+        Entity references are translated using the value of the entitydefs
+        constructor argument (a mapping from names to characters like that
+        provided by the standard module htmlentitydefs).  Named entity
+        references that are not in this mapping are left unchanged.
+
+        The textify attribute is used to translate opening tags into text: see
+        the class docstring.
+
+        """
+        text = []
+        tok = None
+        while 1:
+            try:
+                tok = self.get_token()
+            except NoMoreTokensError:
+                # unget last token (not the one we just failed to get)
+                if tok: self.unget_token(tok)
+                break
+            if tok.type == "data":
+                text.append(tok.data)
+            elif tok.type == "entityref":
+                t = unescape("&%s;"%tok.data, self._entitydefs, self.encoding)
+                text.append(t)
+            elif tok.type == "charref":
+                t = unescape_charref(tok.data, self.encoding)
+                text.append(t)
+            elif tok.type in ["starttag", "endtag", "startendtag"]:
+                tag_name = tok.data
+                if tok.type in ["starttag", "startendtag"]:
+                    alt = self.textify.get(tag_name)
+                    if alt is not None:
+                        if callable(alt):
+                            text.append(alt(tok))
+                        elif tok.attrs is not None:
+                            for k, v in tok.attrs:
+                                if k == alt:
+                                    text.append(v)
+                            text.append("[%s]" % tag_name.upper())
+                if endat is None or endat == (tok.type, tag_name):
+                    self.unget_token(tok)
+                    break
+        return "".join(text)
+
+    def get_compressed_text(self, *args, **kwds):
+        """
+        As .get_text(), but collapses each group of contiguous whitespace to a
+        single space character, and removes all initial and trailing
+        whitespace.
+
+        """
+        text = self.get_text(*args, **kwds)
+        text = text.strip()
+        return self.compress_re.sub(" ", text)
+
+    def handle_startendtag(self, tag, attrs):
+        self._tokenstack.append(Token("startendtag", tag, attrs))
+    def handle_starttag(self, tag, attrs):
+        self._tokenstack.append(Token("starttag", tag, attrs))
+    def handle_endtag(self, tag):
+        self._tokenstack.append(Token("endtag", tag))
+    def handle_charref(self, name):
+        self._tokenstack.append(Token("charref", name))
+    def handle_entityref(self, name):
+        self._tokenstack.append(Token("entityref", name))
+    def handle_data(self, data):
+        self._tokenstack.append(Token("data", data))
+    def handle_comment(self, data):
+        self._tokenstack.append(Token("comment", data))
+    def handle_decl(self, decl):
+        self._tokenstack.append(Token("decl", decl))
+    def unknown_decl(self, data):
+        # XXX should this call self.error instead?
+        #self.error("unknown declaration: " + `data`)
+        self._tokenstack.append(Token("decl", data))
+    def handle_pi(self, data):
+        self._tokenstack.append(Token("pi", data))
+
+    def unescape_attr(self, name):
+        return unescape(name, self._entitydefs, self.encoding)
+    def unescape_attrs(self, attrs):
+        escaped_attrs = []
+        for key, val in attrs:
+            escaped_attrs.append((key, self.unescape_attr(val)))
+        return escaped_attrs
+
+class PullParser(_AbstractParser, HTMLParser.HTMLParser):
+    def __init__(self, *args, **kwds):
+        HTMLParser.HTMLParser.__init__(self)
+        _AbstractParser.__init__(self, *args, **kwds)
+    def unescape(self, name):
+        # Use the entitydefs passed into constructor, not
+        # HTMLParser.HTMLParser's entitydefs.
+        return self.unescape_attr(name)
+
+class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
+    def __init__(self, *args, **kwds):
+        sgmllib.SGMLParser.__init__(self)
+        _AbstractParser.__init__(self, *args, **kwds)
+    def unknown_starttag(self, tag, attrs):
+        attrs = self.unescape_attrs(attrs)
+        self._tokenstack.append(Token("starttag", tag, attrs))
+    def unknown_endtag(self, tag):
+        self._tokenstack.append(Token("endtag", tag))
+
+
+def _test():
+   import doctest, _pullparser
+   return doctest.testmod(_pullparser)
+
+if __name__ == "__main__":
+   _test()