Skip to content

Commit

Permalink
feat: add Docling reader and node parser (#16406)
Browse files Browse the repository at this point in the history
  • Loading branch information
vagenas authored Oct 8, 2024
1 parent 2b1dc7d commit 0b19dea
Show file tree
Hide file tree
Showing 27 changed files with 1,645 additions and 2 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,11 @@ jobs:
root="$file"
# Keep going up the directory tree until we find a directory containing a marker file
# (e.g., 'pyproject.toml' for python projects)
while [[ ! -f "$root/pyproject.toml" && "$root" != "." && "$root" != "/" ]]; do
while [[ ! -f "$root/pyproject.toml" && "$root" != "." && "$root" != "/" && "$root" != "./" ]]; do
root=$(dirname "$root")
done
if [[ ! "$FILTER_PATTERNS" =~ "$root" ]]; then
if [[ "$root" != "." && "$root" != "/" && "$root" != "./" && ! "$FILTER_PATTERNS" =~ "$root" ]]; then
FILTER_PATTERNS="${FILTER_PATTERNS}'${root}',"
CHANGED_ROOTS="${CHANGED_ROOTS} ${root}/::"
fi
Expand Down
392 changes: 392 additions & 0 deletions docs/docs/examples/data_connectors/DoclingReaderDemo.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
llama_index/_static
.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
bin/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
etc/
include/
lib/
lib64/
parts/
sdist/
share/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
.ruff_cache

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints
notebooks/

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
pyvenv.cfg

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Jetbrains
.idea
modules/
*.swp

# VsCode
.vscode

# pipenv
Pipfile
Pipfile.lock

# pyright
pyrightconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
poetry_requirements(
name="poetry",
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
GIT_ROOT ?= $(shell git rev-parse --show-toplevel)

# All targets below are commands, not files — declare them phony so that a
# file or directory with the same name (e.g. `test`) can never shadow them.
.PHONY: format help lint test watch-docs

help: ## Show all Makefile targets.
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'

format: ## Run code autoformatters (black).
	pre-commit install
	git ls-files | xargs pre-commit run black --files

lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy
	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files

test: ## Run tests via pytest.
	pytest tests

watch-docs: ## Build and watch documentation.
	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Docling Node Parser

## Overview

Docling Node Parser parses [Docling](https://github.com/DS4SD/docling) JSON output into LlamaIndex nodes with rich metadata, for use in downstream pipelines such as RAG / QA.

## Installation

```console
pip install llama-index-node-parser-docling
```

## Usage

Docling Node Parser parses LlamaIndex documents containing JSON-serialized Docling format, as created by a Docling Reader.

Basic usage looks like this:

```python
# docs = ... # e.g. created using Docling Reader in JSON mode

from llama_index.node_parser.docling import DoclingNodeParser

node_parser = DoclingNodeParser()
nodes = node_parser.get_nodes_from_documents(documents=docs)
print(f"{nodes[6].text[:70]}...")
# > Docling provides an easy code interface to convert PDF documents from ...

print(nodes[6].metadata)
# > {'dl_doc_hash': '556ad9e23b...',
# > 'path': '#/main-text/22',
# > 'heading': '2 Getting Started',
# > 'page': 2,
# > 'bbox': [107.40, 456.93, 504.20, 499.65]}
```
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""Docling node parser integration.

Re-exports :class:`DoclingNodeParser` as the package's public API.
"""

from llama_index.node_parser.docling.base import DoclingNodeParser


__all__ = ["DoclingNodeParser"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from typing import Any, Iterable, Sequence

from llama_index.core.schema import Document as LIDocument
from llama_index.core.node_parser import NodeParser

from docling_core.transforms.chunker import BaseChunker, HierarchicalChunker
from docling_core.types import Document as DLDocument
from llama_index.core import Document as LIDocument
from llama_index.core.node_parser import NodeParser
from llama_index.core.node_parser.node_utils import IdFuncCallable, default_id_func
from llama_index.core.schema import (
BaseNode,
NodeRelationship,
RelatedNodeType,
TextNode,
)
from llama_index.core.utils import get_tqdm_iterable

_NODE_TEXT_KEY = "text"


class DoclingNodeParser(NodeParser):
    """Docling format node parser.

    Splits the JSON format of `DoclingReader` into nodes corresponding
    to respective document elements from Docling's data model
    (paragraphs, headings, tables etc.).

    Args:
        chunker (BaseChunker, optional): The chunker to use. Defaults to `HierarchicalChunker(heading_as_metadata=True)`.
        doc_meta_keys_allowed (set[str], optional): The Document metadata keys allowed to be included for embedding and LLM input. Defaults to `set()`.
        node_meta_keys_allowed (set[str], optional): The Node metadata keys allowed to be included for embedding and LLM input. Defaults to `{"heading"}`.
    """

    chunker: BaseChunker = HierarchicalChunker(heading_as_metadata=True)
    doc_meta_keys_allowed: set[str] = set()
    node_meta_keys_allowed: set[str] = {"heading"}

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> list[BaseNode]:
        """Parse each input document into one `TextNode` per Docling chunk.

        Args:
            nodes: Input documents whose content is a JSON-serialized Docling
                `Document` (as produced by `DoclingReader` in JSON mode).
            show_progress: Whether to display a tqdm progress bar.
            **kwargs: Ignored; accepted for interface compatibility.

        Returns:
            One `TextNode` per chunk, carrying the chunk's metadata and a
            SOURCE relationship back to the originating document.
        """
        id_func: IdFuncCallable = self.id_func or default_id_func
        nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(
            items=nodes, show_progress=show_progress, desc="Parsing nodes"
        )
        all_nodes: list[BaseNode] = []
        for input_node in nodes_with_progress:
            li_doc = LIDocument.model_validate(input_node)
            # The document's text content is the JSON serialization of a
            # Docling Document; re-hydrate it for chunking.
            dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())
            chunk_iter = self.chunker.chunk(dl_doc=dl_doc)
            for i, chunk in enumerate(chunk_iter):
                rels: dict[NodeRelationship, RelatedNodeType] = {
                    NodeRelationship.SOURCE: li_doc.as_related_node_info(),
                }
                # `model_dump(exclude=...)` expects a set of field names; a
                # bare string would be treated as an iterable of characters
                # and the "text" field would not actually be excluded.
                metadata = chunk.model_dump(
                    exclude={_NODE_TEXT_KEY},
                    exclude_none=True,
                )
                # by default we exclude all meta keys from embedding/LLM — unless allowed
                excl_meta_keys = [
                    k for k in metadata if k not in self.node_meta_keys_allowed
                ]
                if self.include_metadata:
                    excl_meta_keys = [
                        k
                        for k in li_doc.metadata
                        if k not in self.doc_meta_keys_allowed
                    ] + excl_meta_keys
                node = TextNode(
                    id_=id_func(i=i, doc=li_doc),
                    text=chunk.text,
                    excluded_embed_metadata_keys=excl_meta_keys,
                    excluded_llm_metadata_keys=excl_meta_keys,
                    relationships=rels,
                )
                node.metadata = metadata
                all_nodes.append(node)
        return all_nodes
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
[build-system]
build-backend = "poetry.core.masonry.api"
requires = ["poetry-core"]

[tool.codespell]
check-filenames = true
check-hidden = true
# Feel free to un-skip examples, and experimental, you will just need to
# work through many typos (--write-changes and --interactive will help)
skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"

[tool.llamahub]
contains_example = false
import_path = "llama_index.node_parser.docling"

[tool.llamahub.class_authors]
DoclingNodeParser = "vagenas"

[tool.mypy]
disallow_untyped_defs = true
# Remove venv skip when integrated with pre-commit
exclude = ["_static", "build", "examples", "notebooks", "venv"]
ignore_missing_imports = true
python_version = "3.10"

[tool.poetry]
authors = ["Panos Vagenas <[email protected]>"]
description = "llama-index node_parser docling integration"
license = "MIT"
name = "llama-index-node-parser-docling"
packages = [{include = "llama_index/"}]
readme = "README.md"
version = "0.1.0"

[tool.poetry.dependencies]
python = "^3.10"
llama-index-core = "^0.11.0"
docling-core = "^1.7.1"

[tool.poetry.group.dev]

[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"}
codespell = {extras = ["toml"], version = ">=v2.2.6"}
ipython = "8.10.0"
jupyter = "^1.0.0"
mypy = "0.991"
pre-commit = "3.2.0"
pylint = "2.15.10"
pytest = "7.2.1"
pytest-mock = "3.11.1"
ruff = "0.0.292"
tree-sitter-languages = "^1.8.0"
types-Deprecated = ">=0.1.0"
types-PyYAML = "^6.0.12.12"
types-protobuf = "^4.24.0.4"
types-redis = "4.5.5.0"
types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991
types-setuptools = "67.1.0.0"
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
python_tests(
interpreter_constraints=["==3.10.*", "==3.11.*"]
)
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"id_": "129210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3",
"embedding": null,
"metadata": {
"dl_doc_hash": "129210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3"
},
"excluded_embed_metadata_keys": ["dl_doc_hash"],
"excluded_llm_metadata_keys": ["dl_doc_hash"],
"relationships": {},
"text": "{\"_name\":\"\",\"type\":\"pdf-document\",\"description\":{\"title\":null,\"abstract\":null,\"authors\":null,\"affiliations\":null,\"subjects\":null,\"keywords\":null,\"publication_date\":null,\"languages\":null,\"license\":null,\"publishers\":null,\"url_refs\":null,\"references\":null,\"publication\":null,\"reference_count\":null,\"citation_count\":null,\"citation_date\":null,\"advanced\":null,\"analytics\":null,\"logs\":[],\"collection\":null,\"acquisition\":null},\"file-info\":{\"filename\":\"\",\"filename-prov\":null,\"document-hash\":\"129210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3\",\"#-pages\":null,\"collection-name\":null,\"description\":null,\"page-hashes\":null},\"main-text\":[{\"text\":\"A duckling is a young duck in downy plumage[1] or baby duck,[2] but in the food trade a young domestic duck which has just reached adult size and bulk and its meat is still fully tender, is sometimes labelled as a duckling.\",\"type\":\"paragraph\",\"name\":\"text\",\"font\":null,\"prov\":[{\"bbox\":[1.0,2.0,3.0,4.0],\"page\":1,\"span\":[0,1],\"__ref_s3_data\":null}]},{\"text\":\"A male is called a drake and the female is called a duck, or in ornithology a hen.\",\"type\":\"paragraph\",\"name\":\"text\",\"font\":null,\"prov\":[{\"bbox\":[1.0,2.0,3.0,4.0],\"page\":1,\"span\":[0,2],\"__ref_s3_data\":null}]}],\"figures\":null,\"tables\":null,\"bitmaps\":null,\"equations\":null,\"footnotes\":null,\"page-dimensions\":null,\"page-footers\":null,\"page-headers\":null,\"_s3_data\":null,\"identifiers\":null}",
"mimetype": "text/plain",
"start_char_idx": null,
"end_char_idx": null,
"text_template": "{metadata_str}\n\n{content}",
"metadata_template": "{key}: {value}",
"metadata_seperator": "\n",
"class_name": "Document"
}
Loading

0 comments on commit 0b19dea

Please sign in to comment.