View | Details | Raw Unified | Return to bug 250380 | Differences between
and this patch

Collapse All | Expand All

(-)textproc/py-feedparser/Makefile (-3 / +9 lines)
Lines 2-8 Link Here
2
# $FreeBSD$
2
# $FreeBSD$
3
3
4
PORTNAME=	feedparser
4
PORTNAME=	feedparser
5
PORTVERSION=	5.2.1
5
PORTVERSION=	6.0.1
6
CATEGORIES=	textproc python
6
CATEGORIES=	textproc python
7
MASTER_SITES=	CHEESESHOP
7
MASTER_SITES=	CHEESESHOP
8
PKGNAMEPREFIX=	${PYTHON_PKGNAMEPREFIX}
8
PKGNAMEPREFIX=	${PYTHON_PKGNAMEPREFIX}
Lines 10-19 Link Here
10
MAINTAINER=	sbz@FreeBSD.org
10
MAINTAINER=	sbz@FreeBSD.org
11
COMMENT=	Universal feed parser written in Python
11
COMMENT=	Universal feed parser written in Python
12
12
13
LICENSE=	BSD2CLAUSE
13
LICENSE=	BSD2CLAUSE PSFL
14
LICENSE_COMB=	multi
14
LICENSE_FILE=	${WRKSRC}/LICENSE
15
LICENSE_FILE=	${WRKSRC}/LICENSE
15
16
16
USES=		python tar:bzip2
17
USES=		python:3.6+
17
USE_PYTHON=	distutils autoplist
18
USE_PYTHON=	distutils autoplist
18
19
19
NO_ARCH=	yes
20
NO_ARCH=	yes
Lines 21-26 Link Here
21
OPTIONS_DEFINE=	DOCS
22
OPTIONS_DEFINE=	DOCS
22
23
23
PORTDOCS=	NEWS README.rst
24
PORTDOCS=	NEWS README.rst
25
26
PLIST_FILES+=	${PYTHON_SITELIBDIR}/sgmllib.py
27
28
post-install:
29
	${CP} ${FILESDIR}/sgmllib.py ${STAGEDIR}/${PYTHON_SITELIBDIR}/sgmllib.py
24
30
25
post-install-DOCS-on:
31
post-install-DOCS-on:
26
	${MKDIR} ${STAGEDIR}${DOCSDIR}
32
	${MKDIR} ${STAGEDIR}${DOCSDIR}
(-)textproc/py-feedparser/distinfo (-3 / +3 lines)
Lines 1-3 Link Here
1
TIMESTAMP = 1464128973
1
TIMESTAMP = 1602900188
2
SHA256 (feedparser-5.2.1.tar.bz2) = ce875495c90ebd74b179855449040003a1beb40cd13d5f037a0654251e260b02
2
SHA256 (feedparser-6.0.1.tar.gz) = 6ca88edcaa43f428345968df903a87f020843eda5e28d7ea24a612158d61e74c
3
SIZE (feedparser-5.2.1.tar.bz2) = 192328
3
SIZE (feedparser-6.0.1.tar.gz) = 284620
(-)textproc/py-feedparser/files/sgmllib.py (+547 lines)
Line 0 Link Here
1
"""A parser for SGML, using the derived class as a static DTD."""
2
3
# XXX This only supports those SGML features used by HTML.
4
5
# XXX There should be a way to distinguish between PCDATA (parsed
6
# character data -- the normal case), RCDATA (replaceable character
7
# data -- only char and entity references and end tags are special)
8
# and CDATA (character data -- only end tags are special).  RCDATA is
9
# not supported at all.
10
11
import _markupbase
12
import re
13
14
__all__ = ["SGMLParser", "SGMLParseError"]
15
16
# Regular expressions used for parsing

# Either character that ends a run of plain character data.
interesting = re.compile('[&<]')
# A '&' or '<' optionally followed by the beginning of a reference, tag,
# end tag or declaration; goahead() uses it to decide whether the tail of
# the buffer might still be completed by more input.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# '&name' followed by one non-alphanumeric character; group 1 is the name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# '&#digits' followed by one non-digit character; group 1 is the digits.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by '>' or a letter.
starttagopen = re.compile('<[>a-zA-Z]')
# '<name/' -- the opening of the SGML short-tag form.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# '<name/data/' -- group 1 is the name, group 2 the data.
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminator searched for by parse_pi().
piclose = re.compile('>')
# The next '<' or '>'; used to find where a start or end tag stops.
endbracket = re.compile('[<>]')
# A tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# An attribute: group 1 is the name, group 2 the optional '= value' part,
# group 3 the value (single-quoted, double-quoted or unquoted).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
36
37
38
class SGMLParseError(RuntimeError):
    """Error type raised by SGMLParser.error() for every parse failure."""
41
42
43
# SGML parser base class -- find tags and call handler functions.
44
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
45
# The dtd is defined by deriving a class which defines methods
46
# with special names to handle tags: start_foo and end_foo to handle
47
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
48
# (Tags are converted to lower case for this purpose.)  The data
49
# between tags is passed to the parser by calling self.handle_data()
50
# with some data as argument (the data may be split up in arbitrary
51
# chunks).  Entity references are passed by calling
52
# self.handle_entityref() with the entity reference as argument.
53
54
class SGMLParser(_markupbase.ParserBase):
55
    # Pattern for entity and character references inside attribute values -- derived classes may override
56
    entity_or_charref = re.compile('&(?:'
57
      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
58
      ')(;?)')
59
60
    def __init__(self, verbose=0):
61
        """Initialize and reset this instance."""
62
        self.verbose = verbose
63
        self.reset()
64
65
    def reset(self):
66
        """Reset this instance. Loses all unprocessed data."""
67
        self.__starttag_text = None
68
        self.rawdata = ''
69
        self.stack = []
70
        self.lasttag = '???'
71
        self.nomoretags = 0
72
        self.literal = 0
73
        _markupbase.ParserBase.reset(self)
74
75
    def setnomoretags(self):
76
        """Enter literal mode (CDATA) till EOF.
77
78
        Intended for derived classes only.
79
        """
80
        self.nomoretags = self.literal = 1
81
82
    def setliteral(self, *args):
83
        """Enter literal mode (CDATA).
84
85
        Intended for derived classes only.
86
        """
87
        self.literal = 1
88
89
    def feed(self, data):
90
        """Feed some data to the parser.
91
92
        Call this as often as you want, with as little or as much text
93
        as you want (may include '\n').  (This just saves the text,
94
        all the processing is done by goahead().)
95
        """
96
97
        self.rawdata = self.rawdata + data
98
        self.goahead(0)
99
100
    def close(self):
101
        """Handle the remaining data."""
102
        self.goahead(1)
103
104
    def error(self, message):
        """Report a parse failure by raising SGMLParseError(message)."""
        raise SGMLParseError(message)
106
107
    # Internal -- handle data as far as reasonable.  May leave state
108
    # and data to be processed by a subsequent call.  If 'end' is
109
    # true, force handling all data as if followed by EOF marker.
110
    def goahead(self, end):
111
        rawdata = self.rawdata
112
        i = 0
113
        n = len(rawdata)
114
        while i < n:
115
            if self.nomoretags:
116
                self.handle_data(rawdata[i:n])
117
                i = n
118
                break
119
            match = interesting.search(rawdata, i)
120
            if match: j = match.start()
121
            else: j = n
122
            if i < j:
123
                self.handle_data(rawdata[i:j])
124
            i = j
125
            if i == n: break
126
            if rawdata[i] == '<':
127
                if starttagopen.match(rawdata, i):
128
                    if self.literal:
129
                        self.handle_data(rawdata[i])
130
                        i = i+1
131
                        continue
132
                    k = self.parse_starttag(i)
133
                    if k < 0: break
134
                    i = k
135
                    continue
136
                if rawdata.startswith("</", i):
137
                    k = self.parse_endtag(i)
138
                    if k < 0: break
139
                    i = k
140
                    self.literal = 0
141
                    continue
142
                if self.literal:
143
                    if n > (i + 1):
144
                        self.handle_data("<")
145
                        i = i+1
146
                    else:
147
                        # incomplete
148
                        break
149
                    continue
150
                if rawdata.startswith("<!--", i):
151
                        # Strictly speaking, a comment is --.*--
152
                        # within a declaration tag <!...>.
153
                        # This should be removed,
154
                        # and comments handled only in parse_declaration.
155
                    k = self.parse_comment(i)
156
                    if k < 0: break
157
                    i = k
158
                    continue
159
                if rawdata.startswith("<?", i):
160
                    k = self.parse_pi(i)
161
                    if k < 0: break
162
                    i = i+k
163
                    continue
164
                if rawdata.startswith("<!", i):
165
                    # This is some sort of declaration; in "HTML as
166
                    # deployed," this should only be the document type
167
                    # declaration ("<!DOCTYPE html...>").
168
                    k = self.parse_declaration(i)
169
                    if k < 0: break
170
                    i = k
171
                    continue
172
            elif rawdata[i] == '&':
173
                if self.literal:
174
                    self.handle_data(rawdata[i])
175
                    i = i+1
176
                    continue
177
                match = charref.match(rawdata, i)
178
                if match:
179
                    name = match.group(1)
180
                    self.handle_charref(name)
181
                    i = match.end(0)
182
                    if rawdata[i-1] != ';': i = i-1
183
                    continue
184
                match = entityref.match(rawdata, i)
185
                if match:
186
                    name = match.group(1)
187
                    self.handle_entityref(name)
188
                    i = match.end(0)
189
                    if rawdata[i-1] != ';': i = i-1
190
                    continue
191
            else:
192
                self.error('neither < nor & ??')
193
            # We get here only if incomplete matches but
194
            # nothing else
195
            match = incomplete.match(rawdata, i)
196
            if not match:
197
                self.handle_data(rawdata[i])
198
                i = i+1
199
                continue
200
            j = match.end(0)
201
            if j == n:
202
                break # Really incomplete
203
            self.handle_data(rawdata[i:j])
204
            i = j
205
        # end while
206
        if end and i < n:
207
            self.handle_data(rawdata[i:n])
208
            i = n
209
        self.rawdata = rawdata[i:]
210
        # XXX if end: check for empty stack
211
212
    # Extensions for the DOCTYPE scanner:
213
    _decl_otherchars = '='
214
215
    # Internal -- parse processing instr, return length or -1 if not terminated
216
    def parse_pi(self, i):
        """Parse the processing instruction that starts at index *i*.

        The text between '<?' and the next '>' is passed to handle_pi().
        Returns the number of characters consumed (not an index), or -1
        when no closing '>' has been buffered yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        closing = piclose.search(rawdata, i+2)
        if closing is None:
            return -1
        self.handle_pi(rawdata[i+2: closing.start(0)])
        return closing.end(0) - i
227
228
    def get_starttag_text(self):
229
        return self.__starttag_text
230
231
    # Internal -- handle starttag, return index just past it or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag that begins at index *i* of self.rawdata.

        Handles the short form '<tag/data/', the empty tag '<>' (which
        repeats self.lasttag) and ordinary tags with attributes.  The tag
        is dispatched through finish_shorttag() or finish_starttag().

        Returns the index at which parsing should continue, or -1 when the
        tag is not complete in the buffer.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            short = shorttag.match(rawdata, i)
            if not short:
                return -1
            tag, data = short.group(1, 2)
            # Visible through get_starttag_text() while handlers run.
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = short.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:short.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        bracket = endbracket.search(rawdata, i+1)
        if not bracket:
            return -1
        j = bracket.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            tagname = tagfind.match(rawdata, i+1)
            if not tagname:
                self.error('unexpected call to parse_starttag')
            k = tagname.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            attr = attrfind.match(rawdata, k)
            if not attr:
                break
            attrname, rest, attrvalue = attr.group(1, 2, 3)
            if not rest:
                # A bare attribute gets its own name as value.
                attrvalue = attrname
            else:
                quote = attrvalue[:1]
                if quote in ("'", '"') and quote == attrvalue[-1:]:
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = attr.end(0)
        if rawdata[j] == '>':
            j += 1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j
292
293
    # Internal -- convert entity or character reference
294
    def _convert_ref(self, match):
295
        if match.group(2):
296
            return self.convert_charref(match.group(2)) or \
297
                '&#%s%s' % match.groups()[1:]
298
        elif match.group(3):
299
            return self.convert_entityref(match.group(1)) or \
300
                '&%s;' % match.group(1)
301
        else:
302
            return '&%s' % match.group(1)
303
304
    # Internal -- parse endtag
305
    def parse_endtag(self, i):
        """Parse the end tag that begins at index *i* of self.rawdata.

        The stripped, lower-cased tag name is passed to finish_endtag().
        Returns the index at which parsing should continue, or -1 when no
        '<' or '>' follows yet.
        """
        rawdata = self.rawdata
        bracket = endbracket.search(rawdata, i+1)
        if not bracket:
            return -1
        stop = bracket.start(0)
        tag = rawdata[i+2:stop].strip().lower()
        # Step over a closing '>', but leave a '<' for the next round.
        resume = stop + 1 if rawdata[stop] == '>' else stop
        self.finish_endtag(tag)
        return resume
316
317
    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
318
    def finish_shorttag(self, tag, data):
319
        self.finish_starttag(tag, [])
320
        self.handle_data(data)
321
        self.finish_endtag(tag)
322
323
    # Internal -- finish processing of start tag
324
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
325
    def finish_starttag(self, tag, attrs):
326
        try:
327
            method = getattr(self, 'start_' + tag)
328
        except AttributeError:
329
            try:
330
                method = getattr(self, 'do_' + tag)
331
            except AttributeError:
332
                self.unknown_starttag(tag, attrs)
333
                return -1
334
            else:
335
                self.handle_starttag(tag, method, attrs)
336
                return 0
337
        else:
338
            self.stack.append(tag)
339
            self.handle_starttag(tag, method, attrs)
340
            return 1
341
342
    # Internal -- finish processing of end tag
343
    def finish_endtag(self, tag):
344
        if not tag:
345
            found = len(self.stack) - 1
346
            if found < 0:
347
                self.unknown_endtag(tag)
348
                return
349
        else:
350
            if tag not in self.stack:
351
                try:
352
                    method = getattr(self, 'end_' + tag)
353
                except AttributeError:
354
                    self.unknown_endtag(tag)
355
                else:
356
                    self.report_unbalanced(tag)
357
                return
358
            found = len(self.stack)
359
            for i in range(found):
360
                if self.stack[i] == tag: found = i
361
        while len(self.stack) > found:
362
            tag = self.stack[-1]
363
            try:
364
                method = getattr(self, 'end_' + tag)
365
            except AttributeError:
366
                method = None
367
            if method:
368
                self.handle_endtag(tag, method)
369
            else:
370
                self.unknown_endtag(tag)
371
            del self.stack[-1]
372
373
    # Overridable -- handle start tag
374
    def handle_starttag(self, tag, method, attrs):
375
        method(attrs)
376
377
    # Overridable -- handle end tag
378
    def handle_endtag(self, tag, method):
379
        method()
380
381
    # Example -- report an unbalanced </...> tag.
382
    def report_unbalanced(self, tag):
383
        if self.verbose:
384
            print('*** Unbalanced </' + tag + '>')
385
            print('*** Stack:', self.stack)
386
387
    def convert_charref(self, name):
388
        """Convert character reference, may be overridden."""
389
        try:
390
            n = int(name)
391
        except ValueError:
392
            return
393
        if not 0 <= n <= 127:
394
            return
395
        return self.convert_codepoint(n)
396
397
    def convert_codepoint(self, codepoint):
398
        return chr(codepoint)
399
400
    def handle_charref(self, name):
401
        """Handle character reference, no need to override."""
402
        replacement = self.convert_charref(name)
403
        if replacement is None:
404
            self.unknown_charref(name)
405
        else:
406
            self.handle_data(replacement)
407
408
    # Definition of entities -- derived classes may override
409
    entitydefs = \
410
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
411
412
    def convert_entityref(self, name):
413
        """Convert entity references.
414
415
        As an alternative to overriding this method; one can tailor the
416
        results by setting up the self.entitydefs mapping appropriately.
417
        """
418
        table = self.entitydefs
419
        if name in table:
420
            return table[name]
421
        else:
422
            return
423
424
    def handle_entityref(self, name):
425
        """Handle entity references, no need to override."""
426
        replacement = self.convert_entityref(name)
427
        if replacement is None:
428
            self.unknown_entityref(name)
429
        else:
430
            self.handle_data(replacement)
431
432
    # Example -- handle data, should be overridden
433
    def handle_data(self, data):
434
        pass
435
436
    # Example -- handle comment, could be overridden
437
    def handle_comment(self, data):
438
        pass
439
440
    # Example -- handle declaration, could be overridden
441
    def handle_decl(self, decl):
442
        pass
443
444
    # Example -- handle processing instruction, could be overridden
445
    def handle_pi(self, data):
446
        pass
447
448
    # To be overridden -- handlers for unknown objects
449
    def unknown_starttag(self, tag, attrs): pass
450
    def unknown_endtag(self, tag): pass
451
    def unknown_charref(self, ref): pass
452
    def unknown_entityref(self, ref): pass
453
454
455
class TestSGMLParser(SGMLParser):
    """SGMLParser that prints every event it receives; used by test().

    Character data is collected in self.testdata and printed in one piece
    when another event arrives, when its repr reaches 70 characters, or
    on close().
    """

    def __init__(self, verbose=0):
        # Must exist before the base initialiser runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer *data*, flushing once the buffer's repr is long enough."""
        self.testdata += data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if there is any."""
        pending = self.testdata
        if not pending:
            return
        self.testdata = ""
        print('data:', repr(pending))

    def handle_comment(self, data):
        """Print a comment, abbreviating the middle of long ones."""
        self.flush()
        shown = repr(data)
        if len(shown) > 68:
            shown = shown[:32] + '...' + shown[-32:]
        print('comment:', shown)

    def unknown_starttag(self, tag, attrs):
        """Print a start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
            return
        print('start tag: <' + tag, end=' ')
        for name, value in attrs:
            print(name + '=' + '"' + value + '"', end=' ')
        print('>')

    def unknown_endtag(self, tag):
        """Print an end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print an entity reference that could not be converted."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a character reference that could not be converted."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print a declaration that was not understood."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print any data still buffered."""
        SGMLParser.close(self)
        self.flush()
508
509
510
def test(args = None):
    """Parse a file one character at a time as a simple self-test.

    args -- argument list; defaults to sys.argv[1:].  A leading '-s'
            selects the silent SGMLParser instead of TestSGMLParser.
            The next argument is the file to read ('test.html' when
            absent); '-' means standard input.

    Prints a message and exits with status 1 when the file cannot be
    opened.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        file = args[0]
    else:
        file = 'test.html'

    if file == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(file, 'r')
        except IOError as msg:
            print(file, ":", msg)
            sys.exit(1)
        # Close the file even when read() raises.
        with f:
            data = f.read()

    x = klass()
    # Feed single characters to exercise the incremental parsing paths.
    for c in data:
        x.feed(c)
    x.close()
544
545
546
if __name__ == '__main__':
547
    test()

Return to bug 250380