Examples
Specific Extraction
Examples of extracting specific element types from a document with Docling for IBM watsonx.
These examples require the Python SDK, because they rely on the DoclingDocument object, which groups a document's elements by category (e.g., tables, figures, headings). If you call the API endpoint directly, you will need to extract these items from the response yourself.
Headings/Titles
You can extract only the headings (titles and section headers) from a document:
from pathlib import Path
from docling.service_client import DoclingServiceClient
from docling_core.types.doc.labels import DocItemLabel
import os
# Connection settings are read from the environment.
SERVICE_URL = os.getenv("DOCLING_SERVICE_URL")
API_KEY = os.getenv("DOCLING_API_KEY")

# Fail fast with a clear message instead of handing None to the client.
if not SERVICE_URL or not API_KEY:
    raise ValueError("DOCLING_SERVICE_URL and DOCLING_API_KEY environment variables must be set")

with DoclingServiceClient(url=SERVICE_URL, api_key=API_KEY) as client:
    result = client.convert(
        source=Path("path/to/doc.pdf")
    )
    doc = result.document

# Map each heading label to the tag printed in the combined listing below.
HEADING_TAGS = {
    DocItemLabel.TITLE: "TITLE",
    DocItemLabel.SECTION_HEADER: "SECTION",
}

# Filter doc.texts once, then split the headings by label.
headings = [item for item in doc.texts if item.label in HEADING_TAGS]
titles = [item for item in headings if item.label == DocItemLabel.TITLE]
section_headers = [item for item in headings if item.label == DocItemLabel.SECTION_HEADER]

print(f"Found {len(titles)} title(s) and {len(section_headers)} section header(s)\n")

# Print titles, then section headers, each as a numbered list.
# A group with no items is skipped entirely (no banner is printed).
for banner, items in (("TITLES:", titles), ("SECTION HEADERS:", section_headers)):
    if not items:
        continue
    print("=" * 80)
    print(banner)
    print("=" * 80)
    for i, heading in enumerate(items, 1):
        print(f"{i}. {heading.text}")
    print("\n")

# Alternative: print all headings together, in the order they appear in doc.texts
print("=" * 80)
print("ALL HEADINGS IN DOCUMENT ORDER:")
print("=" * 80)
for item in headings:
    print(f"[{HEADING_TAGS[item.label]}] {item.text}")

Tables
After converting a PDF, you can iterate through just the tables and print each one as Markdown:
from pathlib import Path
from docling.service_client import DoclingServiceClient
import os
import pandas as pd
# Connection settings are read from the environment.
SERVICE_URL = os.getenv("DOCLING_SERVICE_URL")
API_KEY = os.getenv("DOCLING_API_KEY")

# Both settings are required; stop early if either is missing or empty.
if not (SERVICE_URL and API_KEY):
    raise ValueError("DOCLING_SERVICE_URL and DOCLING_API_KEY environment variables must be set")

with DoclingServiceClient(url=SERVICE_URL, api_key=API_KEY) as client:
    result = client.convert(source=Path("path/to/doc.pdf"))
    # The converted document exposes its tables as doc.tables
    doc = result.document

table_count = len(doc.tables)
print(f"Found {table_count} tables in the document\n")

# Walk the tables, numbering them from 1
for index, table in enumerate(doc.tables, start=1):
    print(f"Table {index}:")
    print("=" * 80)

    # Convert the table to a pandas DataFrame and print it as Markdown
    frame: pd.DataFrame = table.export_to_dataframe(doc=doc)
    print(frame.to_markdown())
    print("\n")

    # The table can also be exported to HTML or Markdown directly:
    # html_output = table.export_to_html(doc=doc)
    # markdown_output = table.export_to_markdown(doc=doc)

    print("-" * 80)
    print("\n")
Figures
You can also extract figures from the document, including any text labels embedded in the images:
from pathlib import Path
from docling.service_client import DoclingServiceClient
from docling_core.types.doc.document import TextItem
import os
# Connection settings are read from the environment.
SERVICE_URL = os.getenv("DOCLING_SERVICE_URL")
API_KEY = os.getenv("DOCLING_API_KEY")

# Fail fast with a clear message instead of handing None to the client.
if not SERVICE_URL or not API_KEY:
    raise ValueError("DOCLING_SERVICE_URL and DOCLING_API_KEY environment variables must be set")

# Limits for the per-figure text preview printed below.
MAX_TEXT_ITEMS = 3    # how many text items to show per figure
MAX_TEXT_CHARS = 100  # how many characters to show per text item

with DoclingServiceClient(url=SERVICE_URL, api_key=API_KEY) as client:
    result = client.convert(
        source=Path("path/to/doc.pdf")
    )
    # Access figures/pictures from the document
    doc = result.document

print(f"Found {len(doc.pictures)} figure(s) in the document\n")

# Iterate through all figures/pictures, numbering them from 1
for i, picture in enumerate(doc.pictures, 1):
    print(f"Figure {i}:")
    print("=" * 80)

    # Print caption using the caption_text method
    caption = picture.caption_text(doc)
    print(f"Caption: {caption}" if caption else "Caption: (no caption)")

    # Print provenance information (page number, bounding box) when present
    for prov in getattr(picture, "prov", None) or []:
        if hasattr(prov, "page_no"):
            print(f"Page: {prov.page_no}")
        if hasattr(prov, "bbox"):
            bbox = prov.bbox
            print(f"Bounding Box: ({bbox.l:.2f}, {bbox.t:.2f}, {bbox.r:.2f}, {bbox.b:.2f})")

    # Print image information if available
    image = getattr(picture, "image", None)
    if image:
        if hasattr(image, "mimetype"):
            print(f"Image format: {image.mimetype}")
        if hasattr(image, "size"):
            print(f"Image size: {image.size}")

    # Collect the text of every TextItem found while iterating from the picture
    text_items = [
        item.text
        for item, _level in doc.iterate_items(root=picture, traverse_pictures=True)
        if isinstance(item, TextItem)
    ]
    if text_items:
        print(f"Text elements in figure: {len(text_items)}")
        for text in text_items[:MAX_TEXT_ITEMS]:
            # Only mark the text as truncated when it actually was.
            suffix = "..." if len(text) > MAX_TEXT_CHARS else ""
            print(f" - {text[:MAX_TEXT_CHARS]}{suffix}")

    # Uncomment to display the image:
    # picture.get_image(doc).show()

    print("-" * 80)
    print("\n")