View | Details | Raw Unified | Return to bug 265768 | Differences between
and this patch

Collapse All | Expand All

(-)b/textproc/py-textract/Makefile (+81 lines)
Added Link Here
1
PORTNAME=	textract
2
PORTVERSION=	1.6.5
3
CATEGORIES=	textproc python
4
MASTER_SITES=	CHEESESHOP
5
PKGNAMEPREFIX=	${PYTHON_PKGNAMEPREFIX}
6
7
MAINTAINER=	DtxdF@riseup.net
8
COMMENT=	Extract text from any document
9
10
LICENSE=	MIT
11
LICENSE_FILE=	${WRKSRC}/LICENSE
12
13
# Unconditional runtime dependencies. Every version floor uses ">=" so that
# the minimum version itself satisfies the dependency.
RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}argcomplete>=1.10.0:devel/py-argcomplete@${PY_FLAVOR} \
14
		${PYTHON_PKGNAMEPREFIX}chardet>=3:textproc/py-chardet@${PY_FLAVOR} \
15
		${PYTHON_PKGNAMEPREFIX}six>=1.12.0:devel/py-six@${PY_FLAVOR}
16
17
USES=	python:3.8+
18
USE_PYTHON=	autoplist distutils
19
20
OPTIONS_DEFINE=	ANTIWORD BEAUTIFULSOUP DOCX2TXT MSG LIBXML2 \
21
		LIBXSLT PPTX PS SPREADSHEET UNRTF
22
OPTIONS_DEFAULT=	ANTIWORD BEAUTIFULSOUP DOCX2TXT FFMPEG FLAC JPEG_TURBO \
23
			LAME LIBXML2 LIBXSLT MSG PDFTOTEXT PPTX PS SOX \
24
			SPEECH_RECOGNITION SPREADSHEET TESSERACT UNRTF
25
26
ANTIWORD_DESC=	DOC document support
27
DOCX2TXT_DESC=	DOCX document support
28
BEAUTIFULSOUP_DESC=	HTML parsing library
29
TESSERACT_DESC=	Commercial quality open source OCR engine
30
JPEG_TURBO_DESC=	SIMD-accelerated JPEG codec
31
SOX_DESC=	Command-line audio processing tool
32
SPEECH_RECOGNITION_DESC=	Python library for performing speech recognition
33
POCKETSPHINX_DESC=	Interface to CMU Sphinxbase and Pocketsphinx
34
SPREADSHEET_DESC=	XLS and XLSX spreadsheet support
35
UNRTF_DESC=	RTF document support
36
PDFTOTEXT_DESC=	Extract text from a PDF document
37
PDFMINER_DESC=	PDF parser and analyzer
38
MSG_DESC=	MS Outlook MSG file format support
39
PPTX_DESC=	MS PowerPoint PPTX presentations support
# PS is listed in OPTIONS_DEFINE and has PS_RUN_DEPENDS, but had no
# description in this file; give it one so the options dialog is not blank.
PS_DESC=	PostScript document support
40
LIBXML2_DESC=	Python interface for XML parser library
41
LIBXSLT_DESC=	XML stylesheet transformation library
42
43
# DOC
44
ANTIWORD_RUN_DEPENDS=	antiword>0:textproc/antiword
45
# DOCX
46
DOCX2TXT_RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}docx2txt>=0.8:textproc/py-docx2txt@${PY_FLAVOR}
47
# HTML, EPUB, etc.
48
BEAUTIFULSOUP_RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}beautifulsoup>=4.8.0:www/py-beautifulsoup@${PY_FLAVOR}
49
# OCR
50
TESSERACT_RUN_DEPENDS=	tesseract>0:graphics/tesseract
51
JPEG_TURBO_RUN_DEPENDS=	jpeg-turbo>0:graphics/jpeg-turbo
52
# AUDIO
53
SOX_RUN_DEPENDS=	sox>0:audio/sox
54
FFMPEG_RUN_DEPENDS=	ffmpeg>0:multimedia/ffmpeg
55
FLAC_RUN_DEPENDS=	flac>0:audio/flac
56
LAME_RUN_DEPENDS=	lame>0:audio/lame
57
SPEECH_RECOGNITION_RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}SpeechRecognition>=3.8.1:audio/py-speechrecognition@${PY_FLAVOR}
58
POCKETSPHINX_RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}pocketsphinx>0:audio/py-pocketsphinx@${PY_FLAVOR}
59
# XLS and XLSX
60
SPREADSHEET_RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}xlrd>=1.2.0:textproc/py-xlrd@${PY_FLAVOR}
61
# RTF
62
UNRTF_RUN_DEPENDS=	unrtf>0:textproc/unrtf
63
# PDF
64
PDFTOTEXT_RUN_DEPENDS=	poppler-utils>0:graphics/poppler-utils
65
PDFMINER_RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}pdfminer.six>=20191110:textproc/py-pdfminer.six@${PY_FLAVOR}
66
# PS
67
PS_RUN_DEPENDS=	pstotext>0:print/pstotext
68
# MSG
69
MSG_RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}extract-msg>=0.29:textproc/py-extract-msg@${PY_FLAVOR}
70
# PPTX
71
PPTX_RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}python-pptx>=0.6.18:textproc/py-python-pptx@${PY_FLAVOR}
72
# MISC
73
LIBXML2_RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}libxml2>0:textproc/py-libxml2@${PY_FLAVOR}
74
LIBXSLT_RUN_DEPENDS=	libxslt>=1.1.15:textproc/libxslt
75
76
# Only groups that actually have members are listed. RTF was dropped: no
# OPTIONS_GROUP_RTF is defined and UNRTF is already a standalone option in
# OPTIONS_DEFINE, so an RTF group would be empty.
OPTIONS_GROUP=	AUDIO OCR PDF
77
OPTIONS_GROUP_AUDIO=	FFMPEG FLAC LAME POCKETSPHINX SOX SPEECH_RECOGNITION
78
OPTIONS_GROUP_OCR=	JPEG_TURBO TESSERACT
79
OPTIONS_GROUP_PDF=	PDFMINER PDFTOTEXT
80
81
.include <bsd.port.mk>
(-)b/textproc/py-textract/distinfo (+3 lines)
Added Link Here
1
TIMESTAMP = 1659835075
2
SHA256 (textract-1.6.5.tar.gz) = 68f0f09056885821e6c43d8538987518daa94057c306679f2857cc5ee66ad850
3
SIZE (textract-1.6.5.tar.gz) = 17871
(-)b/textproc/py-textract/pkg-descr (-1 / +5 lines)
Added Link Here
0
- 
1
textract provides a single interface for extracting content embedded
2
from Word documents, PowerPoint presentations, PDFs and much more,
3
which can be used for further textual analysis and visualization.
4
5
WWW: https://github.com/deanmalmgren/textract

Return to bug 265768