Skip to content

Latest commit

 

History

History
112 lines (68 loc) · 2.57 KB

README.md

File metadata and controls

112 lines (68 loc) · 2.57 KB

DF Extract Lib

PyPI version License

Requirements

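Python 3.10+ with asyncio (the `extract()` calls in the examples below are awaited, so run them inside an `async` function, e.g. via `asyncio.run(...)`)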

Installation

# Using pip
$ python -m pip install df-extract

# Manual install
$ python -m pip install .

1. To extract content from PDF

from df_extract.pdf import ExtractPDF


path = "/home/test/ABC.pdf"

extract_pdf = ExtractPDF(file_path=path)

# By default, output as text
await extract_pdf.extract()  # Output will be located `/home/test/ABC.pdf.txt`

# Output as json
await extract_pdf.extract(as_json=True)  # Output will be located `/home/test/ABC.pdf.json`

You can change the output directory by simply passing the `output_dir` parameter

from df_extract.pdf import ExtractPDF


path = "/home/test/ABC.pdf"

extract_pdf = ExtractPDF(file_path=path, output_dir="/home/test/output")
await extract_pdf.extract()

Extract content from PDF with image data

This requires the `easyocr` package to be installed

from df_extract.base import ImageExtract
from df_extract.pdf import ExtractPDF


path = "/home/test/ABC.pdf"

image_extract = ImageExtract(model_download_enabled=True)
extract_pdf = ExtractPDF(file_path=path, image_extract=image_extract)
await extract_pdf.extract()

2. To extract content from PPT and PPTx

from df_extract.pptx import ExtractPPTx


path = "/home/test/DEF.pptx"

extract_pptx = ExtractPPTx(file_path=path)

# By default, output as text
await extract_pptx.extract()  # Output will be located `/home/test/DEF.pptx.txt`

# Output as json
await extract_pptx.extract(as_json=True)  # Output will be located `/home/test/DEF.pptx.json`

3. To extract content from Doc and Docx

from df_extract.docx import ExtractDocx


path = "/home/test/GHI.docx"

extract_docx = ExtractDocx(file_path=path)

# By default, output as text
await extract_docx.extract()  # Output will be located `/home/test/GHI.docx.txt`

# Output as json
await extract_docx.extract(as_json=True)  # Output will be located `/home/test/GHI.docx.json`

4. To extract content from PNG, JPEG and JPG

from df_extract.image import ExtractImage


path = "/home/test/JKL.png"

extract_png = ExtractImage(file_path=path)

# By default, output as text
await extract_png.extract()  # Output will be located `/home/test/JKL.png.txt`

# Output as json
await extract_png.extract(as_json=True)  # Output will be located `/home/test/JKL.png.json`