-
Notifications
You must be signed in to change notification settings - Fork 5.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add Docling reader and node parser (#16406)
- Loading branch information
Showing
27 changed files
with
1,645 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
392 changes: 392 additions & 0 deletions
392
docs/docs/examples/data_connectors/DoclingReaderDemo.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
153 changes: 153 additions & 0 deletions
153
llama-index-integrations/node_parser/llama-index-node-parser-docling/.gitignore
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
llama_index/_static | ||
.DS_Store | ||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
bin/ | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
etc/ | ||
include/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
share/ | ||
var/ | ||
wheels/ | ||
pip-wheel-metadata/ | ||
share/python-wheels/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
MANIFEST | ||
|
||
# PyInstaller | ||
# Usually these files are written by a python script from a template | ||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
*.manifest | ||
*.spec | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.nox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*.cover | ||
*.py,cover | ||
.hypothesis/ | ||
.pytest_cache/ | ||
.ruff_cache | ||
|
||
# Translations | ||
*.mo | ||
*.pot | ||
|
||
# Django stuff: | ||
*.log | ||
local_settings.py | ||
db.sqlite3 | ||
db.sqlite3-journal | ||
|
||
# Flask stuff: | ||
instance/ | ||
.webassets-cache | ||
|
||
# Scrapy stuff: | ||
.scrapy | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
||
# PyBuilder | ||
target/ | ||
|
||
# Jupyter Notebook | ||
.ipynb_checkpoints | ||
notebooks/ | ||
|
||
# IPython | ||
profile_default/ | ||
ipython_config.py | ||
|
||
# pyenv | ||
.python-version | ||
|
||
# pipenv | ||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. | ||
# However, in case of collaboration, if having platform-specific dependencies or dependencies | ||
# having no cross-platform support, pipenv may install dependencies that don't work, or not | ||
# install all needed dependencies. | ||
#Pipfile.lock | ||
|
||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow | ||
__pypackages__/ | ||
|
||
# Celery stuff | ||
celerybeat-schedule | ||
celerybeat.pid | ||
|
||
# SageMath parsed files | ||
*.sage.py | ||
|
||
# Environments | ||
.env | ||
.venv | ||
env/ | ||
venv/ | ||
ENV/ | ||
env.bak/ | ||
venv.bak/ | ||
pyvenv.cfg | ||
|
||
# Spyder project settings | ||
.spyderproject | ||
.spyproject | ||
|
||
# Rope project settings | ||
.ropeproject | ||
|
||
# mkdocs documentation | ||
/site | ||
|
||
# mypy | ||
.mypy_cache/ | ||
.dmypy.json | ||
dmypy.json | ||
|
||
# Pyre type checker | ||
.pyre/ | ||
|
||
# Jetbrains | ||
.idea | ||
modules/ | ||
*.swp | ||
|
||
# VsCode | ||
.vscode | ||
|
||
# pipenv | ||
Pipfile | ||
Pipfile.lock | ||
|
||
# pyright | ||
pyrightconfig.json |
3 changes: 3 additions & 0 deletions
3
llama-index-integrations/node_parser/llama-index-node-parser-docling/BUILD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
poetry_requirements( | ||
name="poetry", | ||
) |
17 changes: 17 additions & 0 deletions
17
llama-index-integrations/node_parser/llama-index-node-parser-docling/Makefile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
GIT_ROOT ?= $(shell git rev-parse --show-toplevel) | ||
|
||
help: ## Show all Makefile targets. | ||
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' | ||
|
||
format: ## Run code autoformatters (black). | ||
pre-commit install | ||
git ls-files | xargs pre-commit run black --files | ||
|
||
lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy | ||
pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files | ||
|
||
test: ## Run tests via pytest. | ||
pytest tests | ||
|
||
watch-docs: ## Build and watch documentation. | ||
sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ |
35 changes: 35 additions & 0 deletions
35
llama-index-integrations/node_parser/llama-index-node-parser-docling/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Docling Node Parser | ||
|
||
## Overview | ||
|
||
Docling Node Parser parses [Docling](https://github.com/DS4SD/docling) JSON output into LlamaIndex nodes with rich metadata, for use in downstream pipelines such as RAG and QA. | ||
|
||
## Installation | ||
|
||
```console | ||
pip install llama-index-node-parser-docling | ||
``` | ||
|
||
## Usage | ||
|
||
Docling Node Parser parses LlamaIndex documents containing JSON-serialized Docling format, as created by a Docling Reader. | ||
|
||
Basic usage looks like this: | ||
|
||
```python | ||
# docs = ... # e.g. created using Docling Reader in JSON mode | ||
|
||
from llama_index.node_parser.docling import DoclingNodeParser | ||
|
||
node_parser = DoclingNodeParser() | ||
nodes = node_parser.get_nodes_from_documents(documents=docs) | ||
print(f"{nodes[6].text[:70]}...") | ||
# > Docling provides an easy code interface to convert PDF documents from ... | ||
|
||
print(nodes[6].metadata) | ||
# > {'dl_doc_hash': '556ad9e23b...', | ||
# > 'path': '#/main-text/22', | ||
# > 'heading': '2 Getting Started', | ||
# > 'page': 2, | ||
# > 'bbox': [107.40, 456.93, 504.20, 499.65]} | ||
``` |
1 change: 1 addition & 0 deletions
1
...rations/node_parser/llama-index-node-parser-docling/llama_index/node_parser/docling/BUILD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
python_sources() |
4 changes: 4 additions & 0 deletions
4
...s/node_parser/llama-index-node-parser-docling/llama_index/node_parser/docling/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from llama_index.node_parser.docling.base import DoclingNodeParser | ||
|
||
|
||
__all__ = ["DoclingNodeParser"] |
81 changes: 81 additions & 0 deletions
81
...tions/node_parser/llama-index-node-parser-docling/llama_index/node_parser/docling/base.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
from typing import Any, Iterable, Sequence | ||
|
||
from llama_index.core.schema import Document as LIDocument | ||
from llama_index.core.node_parser import NodeParser | ||
|
||
from docling_core.transforms.chunker import BaseChunker, HierarchicalChunker | ||
from docling_core.types import Document as DLDocument | ||
from llama_index.core import Document as LIDocument | ||
from llama_index.core.node_parser import NodeParser | ||
from llama_index.core.node_parser.node_utils import IdFuncCallable, default_id_func | ||
from llama_index.core.schema import ( | ||
BaseNode, | ||
NodeRelationship, | ||
RelatedNodeType, | ||
TextNode, | ||
) | ||
from llama_index.core.utils import get_tqdm_iterable | ||
|
||
_NODE_TEXT_KEY = "text" | ||
|
||
|
||
class DoclingNodeParser(NodeParser):
    """Docling format node parser.

    Splits the JSON format of `DoclingReader` into nodes corresponding
    to respective document elements from Docling's data model
    (paragraphs, headings, tables etc.).

    Args:
        chunker (BaseChunker, optional): The chunker to use. Defaults to `HierarchicalChunker(heading_as_metadata=True)`.
        doc_meta_keys_allowed (set[str], optional): The Document metadata keys allowed to be included for embedding and LLM input. Defaults to `set()`.
        node_meta_keys_allowed (set[str], optional): The Node metadata keys allowed to be included for embedding and LLM input. Defaults to `{"heading"}`.
    """

    chunker: BaseChunker = HierarchicalChunker(heading_as_metadata=True)
    doc_meta_keys_allowed: set[str] = set()
    node_meta_keys_allowed: set[str] = {"heading"}

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> list[BaseNode]:
        """Turn Docling-JSON documents into one `TextNode` per chunk.

        Each input node is validated as a LlamaIndex `Document` whose content
        is parsed as the JSON serialization of a Docling document. The Docling
        document is passed to `self.chunker`, and every chunk it yields becomes
        a `TextNode` that links back to its source document.

        Args:
            nodes (Sequence[BaseNode]): Input documents holding Docling JSON.
            show_progress (bool, optional): Whether to wrap the iteration in a
                progress bar. Defaults to `False`.
            **kwargs (Any): Accepted for interface compatibility; not used here.

        Returns:
            list[BaseNode]: The created nodes, in document order and, within a
            document, in the order the chunker yields them.
        """
        id_func: IdFuncCallable = self.id_func or default_id_func
        nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(
            items=nodes, show_progress=show_progress, desc="Parsing nodes"
        )
        all_nodes: list[BaseNode] = []
        for input_node in nodes_with_progress:
            li_doc = LIDocument.model_validate(input_node)
            dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())

            # Document-level keys to hide from embedding/LLM input. They only
            # depend on the document, so compute them once per document rather
            # than once per chunk. Only relevant when metadata is included.
            excl_doc_meta_keys: list[str] = (
                [k for k in li_doc.metadata if k not in self.doc_meta_keys_allowed]
                if self.include_metadata
                else []
            )

            chunk_iter = self.chunker.chunk(dl_doc=dl_doc)
            for i, chunk in enumerate(chunk_iter):
                rels: dict[NodeRelationship, RelatedNodeType] = {
                    NodeRelationship.SOURCE: li_doc.as_related_node_info(),
                }
                # `exclude` must be a set (or dict) of field names; a bare
                # string is not a supported value, so wrap the key in a set.
                metadata = chunk.model_dump(
                    exclude={_NODE_TEXT_KEY},
                    exclude_none=True,
                )
                # by default we exclude all meta keys from embedding/LLM — unless allowed
                excl_meta_keys = excl_doc_meta_keys + [
                    k for k in metadata if k not in self.node_meta_keys_allowed
                ]
                node = TextNode(
                    id_=id_func(i=i, doc=li_doc),
                    text=chunk.text,
                    excluded_embed_metadata_keys=excl_meta_keys,
                    excluded_llm_metadata_keys=excl_meta_keys,
                    relationships=rels,
                )
                node.metadata = metadata
                all_nodes.append(node)
        return all_nodes
59 changes: 59 additions & 0 deletions
59
llama-index-integrations/node_parser/llama-index-node-parser-docling/pyproject.toml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
[build-system] | ||
build-backend = "poetry.core.masonry.api" | ||
requires = ["poetry-core"] | ||
|
||
[tool.codespell] | ||
check-filenames = true | ||
check-hidden = true | ||
# Feel free to un-skip examples, and experimental, you will just need to | ||
# work through many typos (--write-changes and --interactive will help) | ||
skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" | ||
|
||
[tool.llamahub] | ||
contains_example = false | ||
import_path = "llama_index.node_parser.docling" | ||
|
||
[tool.llamahub.class_authors] | ||
DoclingNodeParser = "vagenas" | ||
|
||
[tool.mypy] | ||
disallow_untyped_defs = true | ||
# Remove venv skip when integrated with pre-commit | ||
exclude = ["_static", "build", "examples", "notebooks", "venv"] | ||
ignore_missing_imports = true | ||
python_version = "3.10" | ||
|
||
[tool.poetry] | ||
authors = ["Panos Vagenas <[email protected]>"] | ||
description = "llama-index node_parser docling integration" | ||
license = "MIT" | ||
name = "llama-index-node-parser-docling" | ||
packages = [{include = "llama_index/"}] | ||
readme = "README.md" | ||
version = "0.1.0" | ||
|
||
[tool.poetry.dependencies] | ||
python = "^3.10" | ||
llama-index-core = "^0.11.0" | ||
docling-core = "^1.7.1" | ||
|
||
[tool.poetry.group.dev] | ||
|
||
[tool.poetry.group.dev.dependencies] | ||
black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"} | ||
codespell = {extras = ["toml"], version = ">=v2.2.6"} | ||
ipython = "8.10.0" | ||
jupyter = "^1.0.0" | ||
mypy = "0.991" | ||
pre-commit = "3.2.0" | ||
pylint = "2.15.10" | ||
pytest = "7.2.1" | ||
pytest-mock = "3.11.1" | ||
ruff = "0.0.292" | ||
tree-sitter-languages = "^1.8.0" | ||
types-Deprecated = ">=0.1.0" | ||
types-PyYAML = "^6.0.12.12" | ||
types-protobuf = "^4.24.0.4" | ||
types-redis = "4.5.5.0" | ||
types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991 | ||
types-setuptools = "67.1.0.0" |
3 changes: 3 additions & 0 deletions
3
llama-index-integrations/node_parser/llama-index-node-parser-docling/tests/BUILD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
python_tests( | ||
interpreter_constraints=["==3.10.*", "==3.11.*"] | ||
) |
Empty file.
18 changes: 18 additions & 0 deletions
18
...index-integrations/node_parser/llama-index-node-parser-docling/tests/data/inp_li_doc.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
{ | ||
"id_": "129210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3", | ||
"embedding": null, | ||
"metadata": { | ||
"dl_doc_hash": "129210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3" | ||
}, | ||
"excluded_embed_metadata_keys": ["dl_doc_hash"], | ||
"excluded_llm_metadata_keys": ["dl_doc_hash"], | ||
"relationships": {}, | ||
"text": "{\"_name\":\"\",\"type\":\"pdf-document\",\"description\":{\"title\":null,\"abstract\":null,\"authors\":null,\"affiliations\":null,\"subjects\":null,\"keywords\":null,\"publication_date\":null,\"languages\":null,\"license\":null,\"publishers\":null,\"url_refs\":null,\"references\":null,\"publication\":null,\"reference_count\":null,\"citation_count\":null,\"citation_date\":null,\"advanced\":null,\"analytics\":null,\"logs\":[],\"collection\":null,\"acquisition\":null},\"file-info\":{\"filename\":\"\",\"filename-prov\":null,\"document-hash\":\"129210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3\",\"#-pages\":null,\"collection-name\":null,\"description\":null,\"page-hashes\":null},\"main-text\":[{\"text\":\"A duckling is a young duck in downy plumage[1] or baby duck,[2] but in the food trade a young domestic duck which has just reached adult size and bulk and its meat is still fully tender, is sometimes labelled as a duckling.\",\"type\":\"paragraph\",\"name\":\"text\",\"font\":null,\"prov\":[{\"bbox\":[1.0,2.0,3.0,4.0],\"page\":1,\"span\":[0,1],\"__ref_s3_data\":null}]},{\"text\":\"A male is called a drake and the female is called a duck, or in ornithology a hen.\",\"type\":\"paragraph\",\"name\":\"text\",\"font\":null,\"prov\":[{\"bbox\":[1.0,2.0,3.0,4.0],\"page\":1,\"span\":[0,2],\"__ref_s3_data\":null}]}],\"figures\":null,\"tables\":null,\"bitmaps\":null,\"equations\":null,\"footnotes\":null,\"page-dimensions\":null,\"page-footers\":null,\"page-headers\":null,\"_s3_data\":null,\"identifiers\":null}", | ||
"mimetype": "text/plain", | ||
"start_char_idx": null, | ||
"end_char_idx": null, | ||
"text_template": "{metadata_str}\n\n{content}", | ||
"metadata_template": "{key}: {value}", | ||
"metadata_seperator": "\n", | ||
"class_name": "Document" | ||
} |
Oops, something went wrong.