Attachment #218823 for bug #250380




# $FreeBSD$

PORTNAME=	feedparser
PORTVERSION=	6.0.1
CATEGORIES=	textproc python
MASTER_SITES=	CHEESESHOP
PKGNAMEPREFIX=	${PYTHON_PKGNAMEPREFIX}

MAINTAINER=	sbz@FreeBSD.org
COMMENT=	Universal feed parser written in Python

LICENSE=	BSD2CLAUSE PSFL
LICENSE_COMB=	multi
LICENSE_FILE=	${WRKSRC}/LICENSE

USES=		python:3.6+
USE_PYTHON=	distutils autoplist

NO_ARCH=	yes

OPTIONS_DEFINE=	DOCS

PORTDOCS=	NEWS README.rst

# sgmllib.py comes from files/ and is staged by post-install below rather
# than by the distutils install, so it is listed in the plist explicitly.
PLIST_FILES+=	${PYTHON_SITELIBDIR}/sgmllib.py

# Stage the bundled sgmllib.py next to the installed package.  INSTALL_DATA
# is used instead of a plain copy so the staged file gets the standard data
# file mode; PYTHON_SITELIBDIR is appended to STAGEDIR without an extra "/".
post-install:
	${INSTALL_DATA} ${FILESDIR}/sgmllib.py ${STAGEDIR}${PYTHON_SITELIBDIR}/sgmllib.py

# NOTE(review): only the documentation directory is created in the lines
# visible here; no command copies the PORTDOCS files -- confirm that the
# install step exists.
post-install-DOCS-on:
	${MKDIR} ${STAGEDIR}${DOCSDIR}

Lines 1-3 Link Here

(-)textproc/py-feedparser/distinfo (-3 / +3 lines)
1	TIMESTAMP = 1464128973	1	TIMESTAMP = 1602900188
2	SHA256 (feedparser-5.2.1.tar.bz2) = ce875495c90ebd74b179855449040003a1beb40cd13d5f037a0654251e260b02	2	SHA256 (feedparser-6.0.1.tar.gz) = 6ca88edcaa43f428345968df903a87f020843eda5e28d7ea24a612158d61e74c
3	SIZE (feedparser-5.2.1.tar.bz2) = 192328	3	SIZE (feedparser-6.0.1.tar.gz) = 284620




"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.

import _markupbase
import re

# Public names exported by "from sgmllib import *".
__all__ = ["SGMLParser", "SGMLParseError"]

# Regular expressions used for parsing

# Finds the next character that may start markup: '&' or '<'.
interesting = re.compile('[&<]')
# Matches the beginning of a construct that may still be incomplete: an
# entity or character reference without its terminator, or the opening of
# a start tag, end tag or declaration without its closing bracket.  A bare
# '&' or '<' also matches, because every part after it is optional.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# '&name' followed by exactly one terminating character that is not a
# letter or digit; group 1 is the entity name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# '&#digits' followed by exactly one non-digit terminator; group 1 is the
# decimal number as text.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter (a named start tag) or by '>' (the '<>' shorthand).
starttagopen = re.compile('<[>a-zA-Z]')
# Opening part of the '<tag/data/' shorthand: '<name/'.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# Complete '<tag/data/' shorthand; group 1 is the tag name, group 2 the data.
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminator of a processing instruction.
piclose = re.compile('>')
# Either bracket; used to find where a start or end tag stops.
endbracket = re.compile('[<>]')
# A tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: optional leading whitespace, the name (group 1), and an
# optional '= value' part (group 2) whose value (group 3) is single-quoted,
# double-quoted, or an unquoted run of the listed characters.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')


class SGMLParseError(RuntimeError):
    """Signals a parse failure; a RuntimeError subclass so that generic
    RuntimeError handlers also catch it."""


# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.

class SGMLParser(_markupbase.ParserBase):
    """Incremental, event-driven parser for the SGML subset used by HTML.

    Text handed to feed() is appended to self.rawdata and scanned by
    goahead().  Markup is reported through handler methods: a start tag
    <foo> is dispatched to start_foo() or do_foo() when the instance
    defines one, otherwise to unknown_starttag(); end tags, character
    data, references, comments, declarations and processing instructions
    have their own handle_*/unknown_* hooks, all of which do nothing by
    default.
    """

    # Matches an entity reference (group 1) or a decimal character
    # reference (group 2) with an optional trailing ';' (group 3).  It is
    # used by parse_starttag() to expand references inside attribute
    # values -- derived classes may override.
    entity_or_charref = re.compile('&(?:'
      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
      ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance.

        verbose -- when true, report_unbalanced() prints diagnostics.
        """
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        # Raw text of the most recent start tag (see get_starttag_text()).
        self.__starttag_text = None
        # Text fed so far that has not been consumed by goahead().
        self.rawdata = ''
        # Names of the open tags that were dispatched to a start_* handler.
        self.stack = []
        # Name of the last named start tag; reused for the '<>' shorthand.
        self.lasttag = '???'
        # Set by setnomoretags(): treat everything that follows as data.
        self.nomoretags = 0
        # Set by setliteral(): suppress markup until the next end tag.
        self.literal = 0
        _markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Any positional arguments are accepted and ignored.

        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """

        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError carrying the given message."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        # i is the index of the first character not yet consumed.
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything left in the buffer is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Emit the plain text that precedes the next '&' or '<'.
            match = interesting.search(rawdata, i)
            if match: j = match.start()
            else: j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode a start tag is not markup; pass
                        # the '<' through as data and keep scanning.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    # A negative result means the tag is not complete yet;
                    # stop and wait for more data.
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    i = k
                    # An end tag always leaves literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    k = self.parse_comment(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi() returns a length, not an index, hence
                    # the addition below.
                    k = self.parse_pi(i)
                    if k < 0: break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0: break
                    i = k
                    continue
                # Any other '<' falls through to the 'incomplete' check.
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one terminator character; unless
                    # it was ';', back up so it is scanned again.
                    if rawdata[i-1] != ';': i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    # Same terminator handling as for character references.
                    if rawdata[i-1] != ';': i = i-1
                    continue
                # An '&' that is not a complete reference falls through.
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                # Safeguard: pass the single character through as data.
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            # The construct was followed by more text without becoming
            # valid markup, so report it as ordinary data.
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            # At EOF whatever is left over is reported as data.
            self.handle_data(rawdata[i:n])
            i = n
        # Keep only the unconsumed tail for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        """Parse the '<?...>' construct starting at index i.

        Calls handle_pi() with the text between '<?' and the first '>'.
        Returns the number of characters consumed, or -1 when no '>' has
        been seen yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the text recorded for the most recent start tag, or None."""
        return self.__starttag_text

    # Internal -- handle starttag; return the index just past the tag
    # (an absolute position in rawdata), or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            # Record the original-case '<tag/' text once the handlers ran.
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        # Collect attributes until the bracket is reached or the text no
        # longer looks like an attribute.
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match: break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # An attribute without '=value' gets its own name as value.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                    attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                # Expand entity and character references in the value.
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # Consume a closing '>'; a '<' that ended the tag is left in place
        # so that it is scanned as the start of the next construct.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- convert entity or character reference
    def _convert_ref(self, match):
        """Replacement callback for entity_or_charref in attribute values.

        A character reference is replaced by convert_charref()'s result,
        an entity reference that ends in ';' by convert_entityref()'s
        result; in both cases the original text is kept when the
        conversion yields nothing.  An entity reference without ';' is
        returned unchanged.
        """
        if match.group(2):
            return self.convert_charref(match.group(2)) or \
                '&#%s%s' % match.groups()[1:]
        elif match.group(3):
            return self.convert_entityref(match.group(1)) or \
                '&%s;' % match.group(1)
        else:
            return '&%s' % match.group(1)

    # Internal -- parse endtag; return the index just past the tag, or
    # -1 if no closing bracket has been seen yet
    def parse_endtag(self, i):
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # The tag name is whatever lies between '</' and the bracket.
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        # Look for a start_<tag> handler first, then a do_<tag> handler.
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            # Only tags with a start_<tag> handler are tracked on the stack.
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        if not tag:
            # An empty name (as in '</>') closes the most recent open tag.
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                # The tag is not open: it is unbalanced when an end_<tag>
                # handler exists, unknown otherwise.
                try:
                    method = getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Find the innermost (last) open occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag: found = i
        # Close every tag from the top of the stack down to 'found'.
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        """Call the start_*/do_* handler with the attribute list."""
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        """Call the end_* handler; it takes no arguments."""
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack:', self.stack)

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        Returns None when name is not an integer or lies outside 0..127.
        """
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127:
            return
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the one-character string for the given code point."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method; one can tailor the
        results by setting up the self.entitydefs mapping appropriately.

        Returns None when name is not in the mapping.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        else:
            return

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            self.handle_data(replacement)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    # (tags without a start_*/do_*/end_* handler, and references that
    # convert_charref()/convert_entityref() could not convert)
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass


class TestSGMLParser(SGMLParser):
    """Diagnostic parser that prints every parse event to standard output.

    Character data is collected in ``self.testdata`` and printed in
    chunks; every markup event prints the pending data first, so the
    output follows document order.
    """

    def __init__(self, verbose=0):
        # Set up the pending-data buffer, then the base parser.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer text; print it once its repr() is 70 characters or more."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) < 70:
            return
        self.flush()

    def flush(self):
        """Print and clear the buffered character data, if there is any."""
        pending = self.testdata
        if not pending:
            return
        self.testdata = ""
        print('data:', repr(pending))

    def handle_comment(self, data):
        """Print a comment, eliding the middle when its repr() is long."""
        self.flush()
        shown = repr(data)
        if len(shown) > 68:
            shown = shown[:32] + '...' + shown[-32:]
        print('comment:', shown)

    def unknown_starttag(self, tag, attrs):
        """Print a start tag followed by its attributes on one line."""
        self.flush()
        if attrs:
            print('start tag: <' + tag, end=' ')
            for key, val in attrs:
                print(key + '=' + '"' + val + '"', end=' ')
            print('>')
        else:
            print('start tag: <' + tag + '>')

    def unknown_endtag(self, tag):
        """Print an end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Report an entity reference that could not be converted."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Report a character reference that could not be converted."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Report an unknown declaration."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print whatever data is still buffered."""
        SGMLParser.close(self)
        self.flush()


def test(args = None):
    """Parse one file and feed it to a parser one character at a time.

    args -- command-line style argument list; defaults to sys.argv[1:].
            A leading '-s' selects the silent SGMLParser instead of
            TestSGMLParser.  The next argument is the file to read
            ('-' means standard input); without it 'test.html' is used.

    If the file cannot be opened, the error is printed and the process
    exits with status 1.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        file = args[0]
    else:
        file = 'test.html'

    if file == '-':
        # Standard input is read but never closed.
        data = sys.stdin.read()
    else:
        try:
            f = open(file, 'r')
        except IOError as msg:
            print(file, ":", msg)
            sys.exit(1)
        # The context manager closes the file even when read() raises.
        with f:
            data = f.read()

    x = klass()
    # Feeding single characters exercises the incremental parsing path.
    for c in data:
        x.feed(c)
    x.close()

if __name__ == '__main__':
    test()

Return to bug 250380

Lines 2-8 Link Here

(-)textproc/py-feedparser/Makefile (-3 / +9 lines)
2	# $FreeBSD$	2	# $FreeBSD$
3		3
4	PORTNAME= feedparser	4	PORTNAME= feedparser
5	PORTVERSION= 5.2.1	5	PORTVERSION= 6.0.1
6	CATEGORIES= textproc python	6	CATEGORIES= textproc python
7	MASTER_SITES= CHEESESHOP	7	MASTER_SITES= CHEESESHOP
8	PKGNAMEPREFIX= ${PYTHON_PKGNAMEPREFIX}	8	PKGNAMEPREFIX= ${PYTHON_PKGNAMEPREFIX}
Lines 10-19 Link Here
10	MAINTAINER= sbz@FreeBSD.org	10	MAINTAINER= sbz@FreeBSD.org
11	COMMENT= Universal feed parser written in Python	11	COMMENT= Universal feed parser written in Python
12		12
13	LICENSE= BSD2CLAUSE	13	LICENSE= BSD2CLAUSE PSFL
		14	LICENSE_COMB= multi
14	LICENSE_FILE= ${WRKSRC}/LICENSE	15	LICENSE_FILE= ${WRKSRC}/LICENSE
15		16
16	USES= python tar:bzip2	17	USES= python:3.6+
17	USE_PYTHON= distutils autoplist	18	USE_PYTHON= distutils autoplist
18		19
19	NO_ARCH= yes	20	NO_ARCH= yes
Lines 21-26 Link Here
21	OPTIONS_DEFINE= DOCS	22	OPTIONS_DEFINE= DOCS
22		23
23	PORTDOCS= NEWS README.rst	24	PORTDOCS= NEWS README.rst
		25
		26	PLIST_FILES+= ${PYTHON_SITELIBDIR}/sgmllib.py
		27
		28	post-install:
		29	${CP} ${FILESDIR}/sgmllib.py ${STAGEDIR}/${PYTHON_SITELIBDIR}/sgmllib.py
24		30
25	post-install-DOCS-on:	31	post-install-DOCS-on:
26	${MKDIR} ${STAGEDIR}${DOCSDIR}	32	${MKDIR} ${STAGEDIR}${DOCSDIR}

Line 0 Link Here

(-)textproc/py-feedparser/files/sgmllib.py (+547 lines)
		1	"""A parser for SGML, using the derived class as a static DTD."""
		2
		3	# XXX This only supports those SGML features used by HTML.
		4
		5	# XXX There should be a way to distinguish between PCDATA (parsed
		6	# character data -- the normal case), RCDATA (replaceable character
		7	# data -- only char and entity references and end tags are special)
		8	# and CDATA (character data -- only end tags are special). RCDATA is
		9	# not supported at all.
		10
		11	import _markupbase
		12	import re
		13
		14	__all__ = ["SGMLParser", "SGMLParseError"]
		15
		16	# Regular expressions used for parsing
		17
		18	interesting = re.compile('[&<]')
		19	incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]\|#[0-9])?\|'
		20	'<([a-zA-Z][^<>]*\|'
		21	'/([a-zA-Z][^<>]*)?\|'
		22	'![^<>]*)?')
		23
		24	entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
		25	charref = re.compile('&#([0-9]+)[^0-9]')
		26
		27	starttagopen = re.compile('<[>a-zA-Z]')
		28	shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
		29	shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9])/([^/])/')
		30	piclose = re.compile('>')
		31	endbracket = re.compile('[<>]')
		32	tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
		33	attrfind = re.compile(
		34	r'\s([a-zA-Z_][-:.a-zA-Z_0-9])(\s=\s'
		35	r'(\'[^\']\'\|"[^"]"\|[][\-a-zA-Z0-9./,:;+%?!&$\(\)_#=~\'"@]))?')
		36
		37
		38	class SGMLParseError(RuntimeError):
		39	"""Exception raised for all parse errors."""
		40	pass
		41
		42
		43	# SGML parser base class -- find tags and call handler functions.
		44	# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
		45	# The dtd is defined by deriving a class which defines methods
		46	# with special names to handle tags: start_foo and end_foo to handle
		47	# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
		48	# (Tags are converted to lower case for this purpose.) The data
		49	# between tags is passed to the parser by calling self.handle_data()
		50	# with some data as argument (the data may be split up in arbitrary
		51	# chunks). Entity references are passed by calling
		52	# self.handle_entityref() with the entity reference as argument.
		53
		54	class SGMLParser(_markupbase.ParserBase):
		55	# Definition of entities -- derived classes may override
		56	entity_or_charref = re.compile('&(?:'
		57	'([a-zA-Z][-.a-zA-Z0-9]*)\|#([0-9]+)'
		58	')(;?)')
		59
		60	def __init__(self, verbose=0):
		61	"""Initialize and reset this instance."""
		62	self.verbose = verbose
		63	self.reset()
		64
65	def reset(self):
66	"""Reset this instance. Loses all unprocessed data."""
67	self.__starttag_text = None
68	self.rawdata = ''
69	self.stack = []
70	self.lasttag = '???'
71	self.nomoretags = 0
72	self.literal = 0
73	_markupbase.ParserBase.reset(self)
74
75	def setnomoretags(self):
76	"""Enter literal mode (CDATA) till EOF.
77
78	Intended for derived classes only.
79	"""
80	self.nomoretags = self.literal = 1
81
82	def setliteral(self, *args):
83	"""Enter literal mode (CDATA).
84
85	Intended for derived classes only.
86	"""
87	self.literal = 1
88
89	def feed(self, data):
90	"""Feed some data to the parser.
91
92	Call this as often as you want, with as little or as much text
93	as you want (may include '\n'). (This just saves the text,
94	all the processing is done by goahead().)
95	"""
96
97	self.rawdata = self.rawdata + data
98	self.goahead(0)
99
100	def close(self):
101	"""Handle the remaining data."""
102	self.goahead(1)
103
104	def error(self, message):
105	raise SGMLParseError(message)
106
107	# Internal -- handle data as far as reasonable. May leave state
108	# and data to be processed by a subsequent call. If 'end' is
109	# true, force handling all data as if followed by EOF marker.
110	def goahead(self, end):
111	rawdata = self.rawdata
112	i = 0
113	n = len(rawdata)
114	while i < n:
115	if self.nomoretags:
116	self.handle_data(rawdata[i:n])
117	i = n
118	break
119	match = interesting.search(rawdata, i)
120	if match: j = match.start()
121	else: j = n
122	if i < j:
123	self.handle_data(rawdata[i:j])
124	i = j
125	if i == n: break
126	if rawdata[i] == '<':
127	if starttagopen.match(rawdata, i):
128	if self.literal:
129	self.handle_data(rawdata[i])
130	i = i+1
131	continue
132	k = self.parse_starttag(i)
133	if k < 0: break
134	i = k
135	continue
136	if rawdata.startswith("</", i):
137	k = self.parse_endtag(i)
138	if k < 0: break
139	i = k
140	self.literal = 0
141	continue
142	if self.literal:
143	if n > (i + 1):
144	self.handle_data("<")
145	i = i+1
146	else:
147	# incomplete
148	break
149	continue
150	if rawdata.startswith("<!--", i):
151	# Strictly speaking, a comment is --.*--
152	# within a declaration tag <!...>.
153	# This should be removed,
154	# and comments handled only in parse_declaration.
155	k = self.parse_comment(i)
156	if k < 0: break
157	i = k
158	continue
159	if rawdata.startswith("<?", i):
160	k = self.parse_pi(i)
161	if k < 0: break
162	i = i+k
163	continue
164	if rawdata.startswith("<!", i):
165	# This is some sort of declaration; in "HTML as
166	# deployed," this should only be the document type
167	# declaration ("<!DOCTYPE html...>").
168	k = self.parse_declaration(i)
169	if k < 0: break
170	i = k
171	continue
172	elif rawdata[i] == '&':
173	if self.literal:
174	self.handle_data(rawdata[i])
175	i = i+1
176	continue
177	match = charref.match(rawdata, i)
178	if match:
179	name = match.group(1)
180	self.handle_charref(name)
181	i = match.end(0)
182	if rawdata[i-1] != ';': i = i-1
183	continue
184	match = entityref.match(rawdata, i)
185	if match:
186	name = match.group(1)
187	self.handle_entityref(name)
188	i = match.end(0)
189	if rawdata[i-1] != ';': i = i-1
190	continue
191	else:
192	self.error('neither < nor & ??')
193	# We get here only if incomplete matches but
194	# nothing else
195	match = incomplete.match(rawdata, i)
196	if not match:
197	self.handle_data(rawdata[i])
198	i = i+1
199	continue
200	j = match.end(0)
201	if j == n:
202	break # Really incomplete
203	self.handle_data(rawdata[i:j])
204	i = j
205	# end while
206	if end and i < n:
207	self.handle_data(rawdata[i:n])
208	i = n
209	self.rawdata = rawdata[i:]
210	# XXX if end: check for empty stack
211
212	# Extensions for the DOCTYPE scanner:
213	_decl_otherchars = '='
214
215	# Internal -- parse processing instr, return length or -1 if not terminated
216	def parse_pi(self, i):
217	rawdata = self.rawdata
218	if rawdata[i:i+2] != '<?':
219	self.error('unexpected call to parse_pi()')
220	match = piclose.search(rawdata, i+2)
221	if not match:
222	return -1
223	j = match.start(0)
224	self.handle_pi(rawdata[i+2: j])
225	j = match.end(0)
226	return j-i
227
228	def get_starttag_text(self):
229	return self.__starttag_text
230
231	# Internal -- handle starttag, return length or -1 if not terminated
232	def parse_starttag(self, i):
233	self.__starttag_text = None
234	start_pos = i
235	rawdata = self.rawdata
236	if shorttagopen.match(rawdata, i):
237	# SGML shorthand: <tag/data/ == <tag>data</tag>
238	# XXX Can data contain &... (entity or char refs)?
239	# XXX Can data contain < or > (tag characters)?
240	# XXX Can there be whitespace before the first /?
241	match = shorttag.match(rawdata, i)
242	if not match:
243	return -1
244	tag, data = match.group(1, 2)
245	self.__starttag_text = '<%s/' % tag
246	tag = tag.lower()
247	k = match.end(0)
248	self.finish_shorttag(tag, data)
249	self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
250	return k
251	# XXX The following should skip matching quotes (' or ")
252	# As a shortcut way to exit, this isn't so bad, but shouldn't
253	# be used to locate the actual end of the start tag since the
254	# < or > characters may be embedded in an attribute value.
255	match = endbracket.search(rawdata, i+1)
256	if not match:
257	return -1
258	j = match.start(0)
259	# Now parse the data between i+1 and j into a tag and attrs
260	attrs = []
261	if rawdata[i:i+2] == '<>':
262	# SGML shorthand: <> == <last open tag seen>
263	k = j
264	tag = self.lasttag
265	else:
266	match = tagfind.match(rawdata, i+1)
267	if not match:
268	self.error('unexpected call to parse_starttag')
269	k = match.end(0)
270	tag = rawdata[i+1:k].lower()
271	self.lasttag = tag
272	while k < j:
273	match = attrfind.match(rawdata, k)
274	if not match: break
275	attrname, rest, attrvalue = match.group(1, 2, 3)
276	if not rest:
277	attrvalue = attrname
278	else:
279	if (attrvalue[:1] == "'" == attrvalue[-1:] or
280	attrvalue[:1] == '"' == attrvalue[-1:]):
281	# strip quotes
282	attrvalue = attrvalue[1:-1]
283	attrvalue = self.entity_or_charref.sub(
284	self._convert_ref, attrvalue)
285	attrs.append((attrname.lower(), attrvalue))
286	k = match.end(0)
287	if rawdata[j] == '>':
288	j = j+1
289	self.__starttag_text = rawdata[start_pos:j]
290	self.finish_starttag(tag, attrs)
291	return j
292
293	# Internal -- convert entity or character reference
294	def _convert_ref(self, match):
295	if match.group(2):
296	return self.convert_charref(match.group(2)) or \
297	'&#%s%s' % match.groups()[1:]
298	elif match.group(3):
299	return self.convert_entityref(match.group(1)) or \
300	'&%s;' % match.group(1)
301	else:
302	return '&%s' % match.group(1)
303
304	# Internal -- parse endtag
305	def parse_endtag(self, i):
306	rawdata = self.rawdata
307	match = endbracket.search(rawdata, i+1)
308	if not match:
309	return -1
310	j = match.start(0)
311	tag = rawdata[i+2:j].strip().lower()
312	if rawdata[j] == '>':
313	j = j+1
314	self.finish_endtag(tag)
315	return j
316
317	# Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
318	def finish_shorttag(self, tag, data):
319	self.finish_starttag(tag, [])
320	self.handle_data(data)
321	self.finish_endtag(tag)
322
323	# Internal -- finish processing of start tag
324	# Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
325	def finish_starttag(self, tag, attrs):
326	try:
327	method = getattr(self, 'start_' + tag)
328	except AttributeError:
329	try:
330	method = getattr(self, 'do_' + tag)
331	except AttributeError:
332	self.unknown_starttag(tag, attrs)
333	return -1
334	else:
335	self.handle_starttag(tag, method, attrs)
336	return 0
337	else:
338	self.stack.append(tag)
339	self.handle_starttag(tag, method, attrs)
340	return 1
341
342	# Internal -- finish processing of end tag
343	def finish_endtag(self, tag):
344	if not tag:
345	found = len(self.stack) - 1
346	if found < 0:
347	self.unknown_endtag(tag)
348	return
349	else:
350	if tag not in self.stack:
351	try:
352	method = getattr(self, 'end_' + tag)
353	except AttributeError:
354	self.unknown_endtag(tag)
355	else:
356	self.report_unbalanced(tag)
357	return
358	found = len(self.stack)
359	for i in range(found):
360	if self.stack[i] == tag: found = i
361	while len(self.stack) > found:
362	tag = self.stack[-1]
363	try:
364	method = getattr(self, 'end_' + tag)
365	except AttributeError:
366	method = None
367	if method:
368	self.handle_endtag(tag, method)
369	else:
370	self.unknown_endtag(tag)
371	del self.stack[-1]
372
373	# Overridable -- handle start tag
374	def handle_starttag(self, tag, method, attrs):
375	method(attrs)
376
377	# Overridable -- handle end tag
378	def handle_endtag(self, tag, method):
379	method()
380
381	# Example -- report an unbalanced </...> tag.
382	def report_unbalanced(self, tag):
383	if self.verbose:
384	print('*** Unbalanced </' + tag + '>')
385	print('*** Stack:', self.stack)
386
387	def convert_charref(self, name):
388	"""Convert character reference, may be overridden."""
389	try:
390	n = int(name)
391	except ValueError:
392	return
393	if not 0 <= n <= 127:
394	return
395	return self.convert_codepoint(n)
396
397	def convert_codepoint(self, codepoint):
398	return chr(codepoint)
399
400	def handle_charref(self, name):
401	"""Handle character reference, no need to override."""
402	replacement = self.convert_charref(name)
403	if replacement is None:
404	self.unknown_charref(name)
405	else:
406	self.handle_data(replacement)
407
408	# Definition of entities -- derived classes may override
409	entitydefs = \
410	{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
411
412	def convert_entityref(self, name):
413	"""Convert entity references.
414
415	As an alternative to overriding this method; one can tailor the
416	results by setting up the self.entitydefs mapping appropriately.
417	"""
418	table = self.entitydefs
419	if name in table:
420	return table[name]
421	else:
422	return
423
424	def handle_entityref(self, name):
425	"""Handle entity references, no need to override."""
426	replacement = self.convert_entityref(name)
427	if replacement is None:
428	self.unknown_entityref(name)
429	else:
430	self.handle_data(replacement)
431
432	# Example -- handle data, should be overridden
433	def handle_data(self, data):
434	pass
435
436	# Example -- handle comment, could be overridden
437	def handle_comment(self, data):
438	pass
439
440	# Example -- handle declaration, could be overridden
441	def handle_decl(self, decl):
442	pass
443
444	# Example -- handle processing instruction, could be overridden
445	def handle_pi(self, data):
446	pass
447
448	# To be overridden -- handlers for unknown objects
449	def unknown_starttag(self, tag, attrs): pass
450	def unknown_endtag(self, tag): pass
451	def unknown_charref(self, ref): pass
452	def unknown_entityref(self, ref): pass
453
454
455	class TestSGMLParser(SGMLParser):
456
457	def __init__(self, verbose=0):
458	self.testdata = ""
459	SGMLParser.__init__(self, verbose)
460
461	def handle_data(self, data):
462	self.testdata = self.testdata + data
463	if len(repr(self.testdata)) >= 70:
464	self.flush()
465
466	def flush(self):
467	data = self.testdata
468	if data:
469	self.testdata = ""
470	print('data:', repr(data))
471
472	def handle_comment(self, data):
473	self.flush()
474	r = repr(data)
475	if len(r) > 68:
476	r = r[:32] + '...' + r[-32:]
477	print('comment:', r)
478
479	def unknown_starttag(self, tag, attrs):
480	self.flush()
481	if not attrs:
482	print('start tag: <' + tag + '>')
483	else:
484	print('start tag: <' + tag, end=' ')
485	for name, value in attrs:
486	print(name + '=' + '"' + value + '"', end=' ')
487	print('>')
488
489	def unknown_endtag(self, tag):
490	self.flush()
491	print('end tag: </' + tag + '>')
492
493	def unknown_entityref(self, ref):
494	self.flush()
495	print('*** unknown entity ref: &' + ref + ';')
496
497	def unknown_charref(self, ref):
498	self.flush()
499	print('*** unknown char ref: &#' + ref + ';')
500
501	def unknown_decl(self, data):
502	self.flush()
503	print('*** unknown decl: [' + data + ']')
504
505	def close(self):
506	SGMLParser.close(self)
507	self.flush()
508
509
510	def test(args = None):
511	import sys
512
513	if args is None:
514	args = sys.argv[1:]
515
516	if args and args[0] == '-s':
517	args = args[1:]
518	klass = SGMLParser
519	else:
520	klass = TestSGMLParser
521
522	if args:
523	file = args[0]
524	else:
525	file = 'test.html'
526
527	if file == '-':
528	f = sys.stdin
529	else:
530	try:
531	f = open(file, 'r')
532	except IOError as msg:
533	print(file, ":", msg)
534	sys.exit(1)
535
536	data = f.read()
537	if f is not sys.stdin:
538	f.close()
539
540	x = klass()
541	for c in data:
542	x.feed(c)
543	x.close()
544
545
546	if __name__ == '__main__':
547	test()