FreeBSD Bugzilla – Attachment 219162 Details for
Bug 250380
textproc/py-feedparser: fails to import with lang/python39
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Help
|
New Account
|
Log In
Remember
[x]
|
Forgot Password
Login:
[x]
[patch]
Patch to 6.0.1, Ship 2to3 sgmllib (ported from Py27)
feedparser (1).patch (text/plain), 19.85 KB, created by
James French
on 2020-10-28 02:12:30 UTC
(
hide
)
Description:
Patch to 6.0.1, Ship 2to3 sgmllib (ported from Py27)
Filename:
MIME Type:
Creator:
James French
Created:
2020-10-28 02:12:30 UTC
Size:
19.85 KB
patch
obsolete
>diff --git a/textproc/py-feedparser/Makefile b/textproc/py-feedparser/Makefile >index 377b82b7fa96..fd75dfded815 100644 >--- a/textproc/py-feedparser/Makefile >+++ b/textproc/py-feedparser/Makefile >@@ -2,7 +2,7 @@ > # $FreeBSD$ > > PORTNAME= feedparser >-PORTVERSION= 5.2.1 >+PORTVERSION= 6.0.1 > CATEGORIES= textproc python > MASTER_SITES= CHEESESHOP > PKGNAMEPREFIX= ${PYTHON_PKGNAMEPREFIX} >@@ -10,10 +10,11 @@ PKGNAMEPREFIX= ${PYTHON_PKGNAMEPREFIX} > MAINTAINER= sbz@FreeBSD.org > COMMENT= Universal feed parser written in Python > >-LICENSE= BSD2CLAUSE >+LICENSE= BSD2CLAUSE PSFL >+LICENSE_COMB= multi > LICENSE_FILE= ${WRKSRC}/LICENSE > >-USES= python tar:bzip2 >+USES= python:3.6+ > USE_PYTHON= distutils autoplist > > NO_ARCH= yes >@@ -22,6 +23,11 @@ OPTIONS_DEFINE= DOCS > > PORTDOCS= NEWS README.rst > >+PLIST_FILES+= ${PYTHON_SITELIBDIR}/sgmllib.py >+ >+post-install: >+ ${INSTALL_DATA} ${FILESDIR}/sgmllib.py ${STAGEDIR}/${PYTHON_SITELIBDIR}/sgmllib.py >+ > post-install-DOCS-on: > ${MKDIR} ${STAGEDIR}${DOCSDIR} > ${INSTALL_DATA} ${PORTDOCS:S,^,${WRKSRC}/,} ${STAGEDIR}${DOCSDIR} >diff --git a/textproc/py-feedparser/distinfo b/textproc/py-feedparser/distinfo >index 740e0aeea36a..2e31aaba0979 100644 >--- a/textproc/py-feedparser/distinfo >+++ b/textproc/py-feedparser/distinfo >@@ -1,3 +1,3 @@ >-TIMESTAMP = 1464128973 >-SHA256 (feedparser-5.2.1.tar.bz2) = ce875495c90ebd74b179855449040003a1beb40cd13d5f037a0654251e260b02 >-SIZE (feedparser-5.2.1.tar.bz2) = 192328 >+TIMESTAMP = 1602900188 >+SHA256 (feedparser-6.0.1.tar.gz) = 6ca88edcaa43f428345968df903a87f020843eda5e28d7ea24a612158d61e74c >+SIZE (feedparser-6.0.1.tar.gz) = 284620 >diff --git a/textproc/py-feedparser/feedparser.patch b/textproc/py-feedparser/feedparser.patch >new file mode 100644 >index 000000000000..e69de29bb2d1 >diff --git a/textproc/py-feedparser/files/sgmllib.py b/textproc/py-feedparser/files/sgmllib.py >new file mode 100644 >index 000000000000..88a02a307f40 >--- /dev/null >+++ b/textproc/py-feedparser/files/sgmllib.py >@@ -0,0 +1,547 @@ >+"""A parser for SGML, using the derived class as a static DTD.""" >+ >+# XXX This only supports those SGML features used by HTML. >+ >+# XXX There should be a way to distinguish between PCDATA (parsed >+# character data -- the normal case), RCDATA (replaceable character >+# data -- only char and entity references and end tags are special) >+# and CDATA (character data -- only end tags are special). RCDATA is >+# not supported at all. >+ >+import _markupbase >+import re >+ >+__all__ = ["SGMLParser", "SGMLParseError"] >+ >+# Regular expressions used for parsing >+ >+interesting = re.compile('[&<]') >+incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' >+ '<([a-zA-Z][^<>]*|' >+ '/([a-zA-Z][^<>]*)?|' >+ '![^<>]*)?') >+ >+entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') >+charref = re.compile('&#([0-9]+)[^0-9]') >+ >+starttagopen = re.compile('<[>a-zA-Z]') >+shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') >+shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') >+piclose = re.compile('>') >+endbracket = re.compile('[<>]') >+tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') >+attrfind = re.compile( >+ r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' >+ r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') >+ >+ >+class SGMLParseError(RuntimeError): >+ """Exception raised for all parse errors.""" >+ pass >+ >+ >+# SGML parser base class -- find tags and call handler functions. >+# Usage: p = SGMLParser(); p.feed(data); ...; p.close(). >+# The dtd is defined by deriving a class which defines methods >+# with special names to handle tags: start_foo and end_foo to handle >+# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself. >+# (Tags are converted to lower case for this purpose.) The data >+# between tags is passed to the parser by calling self.handle_data() >+# with some data as argument (the data may be split up in arbitrary >+# chunks). Entity references are passed by calling >+# self.handle_entityref() with the entity reference as argument. >+ >+class SGMLParser(_markupbase.ParserBase): >+ # Definition of entities -- derived classes may override >+ entity_or_charref = re.compile('&(?:' >+ '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)' >+ ')(;?)') >+ >+ def __init__(self, verbose=0): >+ """Initialize and reset this instance.""" >+ self.verbose = verbose >+ self.reset() >+ >+ def reset(self): >+ """Reset this instance. Loses all unprocessed data.""" >+ self.__starttag_text = None >+ self.rawdata = '' >+ self.stack = [] >+ self.lasttag = '???' >+ self.nomoretags = 0 >+ self.literal = 0 >+ _markupbase.ParserBase.reset(self) >+ >+ def setnomoretags(self): >+ """Enter literal mode (CDATA) till EOF. >+ >+ Intended for derived classes only. >+ """ >+ self.nomoretags = self.literal = 1 >+ >+ def setliteral(self, *args): >+ """Enter literal mode (CDATA). >+ >+ Intended for derived classes only. >+ """ >+ self.literal = 1 >+ >+ def feed(self, data): >+ """Feed some data to the parser. >+ >+ Call this as often as you want, with as little or as much text >+ as you want (may include '\n'). (This just saves the text, >+ all the processing is done by goahead().) >+ """ >+ >+ self.rawdata = self.rawdata + data >+ self.goahead(0) >+ >+ def close(self): >+ """Handle the remaining data.""" >+ self.goahead(1) >+ >+ def error(self, message): >+ raise SGMLParseError(message) >+ >+ # Internal -- handle data as far as reasonable. May leave state >+ # and data to be processed by a subsequent call. If 'end' is >+ # true, force handling all data as if followed by EOF marker. >+ def goahead(self, end): >+ rawdata = self.rawdata >+ i = 0 >+ n = len(rawdata) >+ while i < n: >+ if self.nomoretags: >+ self.handle_data(rawdata[i:n]) >+ i = n >+ break >+ match = interesting.search(rawdata, i) >+ if match: j = match.start() >+ else: j = n >+ if i < j: >+ self.handle_data(rawdata[i:j]) >+ i = j >+ if i == n: break >+ if rawdata[i] == '<': >+ if starttagopen.match(rawdata, i): >+ if self.literal: >+ self.handle_data(rawdata[i]) >+ i = i+1 >+ continue >+ k = self.parse_starttag(i) >+ if k < 0: break >+ i = k >+ continue >+ if rawdata.startswith("</", i): >+ k = self.parse_endtag(i) >+ if k < 0: break >+ i = k >+ self.literal = 0 >+ continue >+ if self.literal: >+ if n > (i + 1): >+ self.handle_data("<") >+ i = i+1 >+ else: >+ # incomplete >+ break >+ continue >+ if rawdata.startswith("<!--", i): >+ # Strictly speaking, a comment is --.*-- >+ # within a declaration tag <!...>. >+ # This should be removed, >+ # and comments handled only in parse_declaration. >+ k = self.parse_comment(i) >+ if k < 0: break >+ i = k >+ continue >+ if rawdata.startswith("<?", i): >+ k = self.parse_pi(i) >+ if k < 0: break >+ i = i+k >+ continue >+ if rawdata.startswith("<!", i): >+ # This is some sort of declaration; in "HTML as >+ # deployed," this should only be the document type >+ # declaration ("<!DOCTYPE html...>"). >+ k = self.parse_declaration(i) >+ if k < 0: break >+ i = k >+ continue >+ elif rawdata[i] == '&': >+ if self.literal: >+ self.handle_data(rawdata[i]) >+ i = i+1 >+ continue >+ match = charref.match(rawdata, i) >+ if match: >+ name = match.group(1) >+ self.handle_charref(name) >+ i = match.end(0) >+ if rawdata[i-1] != ';': i = i-1 >+ continue >+ match = entityref.match(rawdata, i) >+ if match: >+ name = match.group(1) >+ self.handle_entityref(name) >+ i = match.end(0) >+ if rawdata[i-1] != ';': i = i-1 >+ continue >+ else: >+ self.error('neither < nor & ??') >+ # We get here only if incomplete matches but >+ # nothing else >+ match = incomplete.match(rawdata, i) >+ if not match: >+ self.handle_data(rawdata[i]) >+ i = i+1 >+ continue >+ j = match.end(0) >+ if j == n: >+ break # Really incomplete >+ self.handle_data(rawdata[i:j]) >+ i = j >+ # end while >+ if end and i < n: >+ self.handle_data(rawdata[i:n]) >+ i = n >+ self.rawdata = rawdata[i:] >+ # XXX if end: check for empty stack >+ >+ # Extensions for the DOCTYPE scanner: >+ _decl_otherchars = '=' >+ >+ # Internal -- parse processing instr, return length or -1 if not terminated >+ def parse_pi(self, i): >+ rawdata = self.rawdata >+ if rawdata[i:i+2] != '<?': >+ self.error('unexpected call to parse_pi()') >+ match = piclose.search(rawdata, i+2) >+ if not match: >+ return -1 >+ j = match.start(0) >+ self.handle_pi(rawdata[i+2: j]) >+ j = match.end(0) >+ return j-i >+ >+ def get_starttag_text(self): >+ return self.__starttag_text >+ >+ # Internal -- handle starttag, return length or -1 if not terminated >+ def parse_starttag(self, i): >+ self.__starttag_text = None >+ start_pos = i >+ rawdata = self.rawdata >+ if shorttagopen.match(rawdata, i): >+ # SGML shorthand: <tag/data/ == <tag>data</tag> >+ # XXX Can data contain &... (entity or char refs)? >+ # XXX Can data contain < or > (tag characters)? >+ # XXX Can there be whitespace before the first /? >+ match = shorttag.match(rawdata, i) >+ if not match: >+ return -1 >+ tag, data = match.group(1, 2) >+ self.__starttag_text = '<%s/' % tag >+ tag = tag.lower() >+ k = match.end(0) >+ self.finish_shorttag(tag, data) >+ self.__starttag_text = rawdata[start_pos:match.end(1) + 1] >+ return k >+ # XXX The following should skip matching quotes (' or ") >+ # As a shortcut way to exit, this isn't so bad, but shouldn't >+ # be used to locate the actual end of the start tag since the >+ # < or > characters may be embedded in an attribute value. >+ match = endbracket.search(rawdata, i+1) >+ if not match: >+ return -1 >+ j = match.start(0) >+ # Now parse the data between i+1 and j into a tag and attrs >+ attrs = [] >+ if rawdata[i:i+2] == '<>': >+ # SGML shorthand: <> == <last open tag seen> >+ k = j >+ tag = self.lasttag >+ else: >+ match = tagfind.match(rawdata, i+1) >+ if not match: >+ self.error('unexpected call to parse_starttag') >+ k = match.end(0) >+ tag = rawdata[i+1:k].lower() >+ self.lasttag = tag >+ while k < j: >+ match = attrfind.match(rawdata, k) >+ if not match: break >+ attrname, rest, attrvalue = match.group(1, 2, 3) >+ if not rest: >+ attrvalue = attrname >+ else: >+ if (attrvalue[:1] == "'" == attrvalue[-1:] or >+ attrvalue[:1] == '"' == attrvalue[-1:]): >+ # strip quotes >+ attrvalue = attrvalue[1:-1] >+ attrvalue = self.entity_or_charref.sub( >+ self._convert_ref, attrvalue) >+ attrs.append((attrname.lower(), attrvalue)) >+ k = match.end(0) >+ if rawdata[j] == '>': >+ j = j+1 >+ self.__starttag_text = rawdata[start_pos:j] >+ self.finish_starttag(tag, attrs) >+ return j >+ >+ # Internal -- convert entity or character reference >+ def _convert_ref(self, match): >+ if match.group(2): >+ return self.convert_charref(match.group(2)) or \ >+ '&#%s%s' % match.groups()[1:] >+ elif match.group(3): >+ return self.convert_entityref(match.group(1)) or \ >+ '&%s;' % match.group(1) >+ else: >+ return '&%s' % match.group(1) >+ >+ # Internal -- parse endtag >+ def parse_endtag(self, i): >+ rawdata = self.rawdata >+ match = endbracket.search(rawdata, i+1) >+ if not match: >+ return -1 >+ j = match.start(0) >+ tag = rawdata[i+2:j].strip().lower() >+ if rawdata[j] == '>': >+ j = j+1 >+ self.finish_endtag(tag) >+ return j >+ >+ # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>) >+ def finish_shorttag(self, tag, data): >+ self.finish_starttag(tag, []) >+ self.handle_data(data) >+ self.finish_endtag(tag) >+ >+ # Internal -- finish processing of start tag >+ # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag >+ def finish_starttag(self, tag, attrs): >+ try: >+ method = getattr(self, 'start_' + tag) >+ except AttributeError: >+ try: >+ method = getattr(self, 'do_' + tag) >+ except AttributeError: >+ self.unknown_starttag(tag, attrs) >+ return -1 >+ else: >+ self.handle_starttag(tag, method, attrs) >+ return 0 >+ else: >+ self.stack.append(tag) >+ self.handle_starttag(tag, method, attrs) >+ return 1 >+ >+ # Internal -- finish processing of end tag >+ def finish_endtag(self, tag): >+ if not tag: >+ found = len(self.stack) - 1 >+ if found < 0: >+ self.unknown_endtag(tag) >+ return >+ else: >+ if tag not in self.stack: >+ try: >+ method = getattr(self, 'end_' + tag) >+ except AttributeError: >+ self.unknown_endtag(tag) >+ else: >+ self.report_unbalanced(tag) >+ return >+ found = len(self.stack) >+ for i in range(found): >+ if self.stack[i] == tag: found = i >+ while len(self.stack) > found: >+ tag = self.stack[-1] >+ try: >+ method = getattr(self, 'end_' + tag) >+ except AttributeError: >+ method = None >+ if method: >+ self.handle_endtag(tag, method) >+ else: >+ self.unknown_endtag(tag) >+ del self.stack[-1] >+ >+ # Overridable -- handle start tag >+ def handle_starttag(self, tag, method, attrs): >+ method(attrs) >+ >+ # Overridable -- handle end tag >+ def handle_endtag(self, tag, method): >+ method() >+ >+ # Example -- report an unbalanced </...> tag. >+ def report_unbalanced(self, tag): >+ if self.verbose: >+ print('*** Unbalanced </' + tag + '>') >+ print('*** Stack:', self.stack) >+ >+ def convert_charref(self, name): >+ """Convert character reference, may be overridden.""" >+ try: >+ n = int(name) >+ except ValueError: >+ return >+ if not 0 <= n <= 127: >+ return >+ return self.convert_codepoint(n) >+ >+ def convert_codepoint(self, codepoint): >+ return chr(codepoint) >+ >+ def handle_charref(self, name): >+ """Handle character reference, no need to override.""" >+ replacement = self.convert_charref(name) >+ if replacement is None: >+ self.unknown_charref(name) >+ else: >+ self.handle_data(replacement) >+ >+ # Definition of entities -- derived classes may override >+ entitydefs = \ >+ {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} >+ >+ def convert_entityref(self, name): >+ """Convert entity references. >+ >+ As an alternative to overriding this method; one can tailor the >+ results by setting up the self.entitydefs mapping appropriately. >+ """ >+ table = self.entitydefs >+ if name in table: >+ return table[name] >+ else: >+ return >+ >+ def handle_entityref(self, name): >+ """Handle entity references, no need to override.""" >+ replacement = self.convert_entityref(name) >+ if replacement is None: >+ self.unknown_entityref(name) >+ else: >+ self.handle_data(replacement) >+ >+ # Example -- handle data, should be overridden >+ def handle_data(self, data): >+ pass >+ >+ # Example -- handle comment, could be overridden >+ def handle_comment(self, data): >+ pass >+ >+ # Example -- handle declaration, could be overridden >+ def handle_decl(self, decl): >+ pass >+ >+ # Example -- handle processing instruction, could be overridden >+ def handle_pi(self, data): >+ pass >+ >+ # To be overridden -- handlers for unknown objects >+ def unknown_starttag(self, tag, attrs): pass >+ def unknown_endtag(self, tag): pass >+ def unknown_charref(self, ref): pass >+ def unknown_entityref(self, ref): pass >+ >+ >+class TestSGMLParser(SGMLParser): >+ >+ def __init__(self, verbose=0): >+ self.testdata = "" >+ SGMLParser.__init__(self, verbose) >+ >+ def handle_data(self, data): >+ self.testdata = self.testdata + data >+ if len(repr(self.testdata)) >= 70: >+ self.flush() >+ >+ def flush(self): >+ data = self.testdata >+ if data: >+ self.testdata = "" >+ print('data:', repr(data)) >+ >+ def handle_comment(self, data): >+ self.flush() >+ r = repr(data) >+ if len(r) > 68: >+ r = r[:32] + '...' + r[-32:] >+ print('comment:', r) >+ >+ def unknown_starttag(self, tag, attrs): >+ self.flush() >+ if not attrs: >+ print('start tag: <' + tag + '>') >+ else: >+ print('start tag: <' + tag, end=' ') >+ for name, value in attrs: >+ print(name + '=' + '"' + value + '"', end=' ') >+ print('>') >+ >+ def unknown_endtag(self, tag): >+ self.flush() >+ print('end tag: </' + tag + '>') >+ >+ def unknown_entityref(self, ref): >+ self.flush() >+ print('*** unknown entity ref: &' + ref + ';') >+ >+ def unknown_charref(self, ref): >+ self.flush() >+ print('*** unknown char ref: &#' + ref + ';') >+ >+ def unknown_decl(self, data): >+ self.flush() >+ print('*** unknown decl: [' + data + ']') >+ >+ def close(self): >+ SGMLParser.close(self) >+ self.flush() >+ >+ >+def test(args = None): >+ import sys >+ >+ if args is None: >+ args = sys.argv[1:] >+ >+ if args and args[0] == '-s': >+ args = args[1:] >+ klass = SGMLParser >+ else: >+ klass = TestSGMLParser >+ >+ if args: >+ file = args[0] >+ else: >+ file = 'test.html' >+ >+ if file == '-': >+ f = sys.stdin >+ else: >+ try: >+ f = open(file, 'r') >+ except IOError as msg: >+ print(file, ":", msg) >+ sys.exit(1) >+ >+ data = f.read() >+ if f is not sys.stdin: >+ f.close() >+ >+ x = klass() >+ for c in data: >+ x.feed(c) >+ x.close() >+ >+ >+if __name__ == '__main__': >+ test()
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 250380
:
218819
|
218820
|
218823
|
218824
|
219157
| 219162 |
224091
|
224133