Programmatic Examples¶
Overview¶
This guide provides complete, ready-to-run Python examples for common document processing scenarios using the docling-graph API.
All examples are executed with `uv run python`.
Quick Reference¶
| Example | Use Case | Backend |
|---|---|---|
| Simple Invoice | Basic extraction | LLM (Remote) |
| Local Processing | Offline processing | LLM (Local) |
| VLM Form Extraction | Image forms | VLM (Local) |
| Rheology Research | Complex documents | LLM (Remote) |
| Batch Processing | Multiple documents | Any |
| Error Handling | Production code | Any |
| Flask Integration | Web application | Any |
| Jupyter Notebook | Interactive analysis | Any |
Example 1: Simple Invoice Extraction¶
Use Case: Extract structured data from an invoice using remote LLM.
File: examples/simple_billing_document.py
"""
Simple invoice extraction using remote LLM.
"""
import os
from pathlib import Path
from docling_graph import run_pipeline, PipelineConfig
# Set API key
os.environ["MISTRAL_API_KEY"] = "your-api-key"
# Configure pipeline
config = PipelineConfig(
source="documents/invoice.pdf",
template="templates.billing_document.BillingDocument",
backend="llm",
inference="remote",
provider_override="mistral",
model_override="mistral-small-latest"
)
# Run pipeline
print("Processing invoice...")
context = run_pipeline(config)
graph = context.knowledge_graph
print(f"✅ Complete! Extracted {graph.number_of_nodes()} nodes")
Run:

```bash
uv run python examples/simple_billing_document.py
```
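The returned context exposes the graph through the networkx API (as the `number_of_nodes()` call above shows), so standard exports work directly. A minimal sketch that persists the result for later inspection, assuming node and edge attribute values are primitives (which GraphML requires):

```python
# Sketch: persist the extracted graph for later inspection.
# Assumes attribute values are primitives, as GraphML requires.
from pathlib import Path

import networkx as nx

Path("outputs").mkdir(exist_ok=True)
nx.write_graphml(graph, "outputs/invoice_graph.graphml")
```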
Example 2: Local Processing with Ollama¶
Use Case: Process documents locally without API costs.
File: examples/local_ollama.py
"""
Local document processing using Ollama.
"""
from docling_graph import run_pipeline, PipelineConfig
# Ensure Ollama is running:
# ollama serve
# ollama pull llama3:8b
config = PipelineConfig(
source="documents/research.pdf",
template="docs.examples.templates.rheology_research.ScholarlyRheologyPaper",
backend="llm",
inference="local",
provider_override="ollama",
model_override="llama3:8b",
processing_mode="many-to-one",
use_chunking=True,
llm_consolidation=False # Faster
)
print("Processing with Ollama...")
try:
run_pipeline(config)
print("✅ Complete!")
except Exception as e:
print(f"❌ Error: {e}")
print("Hint: Is Ollama running? (ollama serve)")
Run:

```bash
uv run python examples/local_ollama.py
```
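To fail fast instead of waiting for the pipeline to error out, you can add a small preflight check against Ollama's default endpoint. A sketch, assuming Ollama serves on its default port 11434:

```python
# Sketch: preflight check that Ollama is reachable before running the pipeline.
# Assumes the default Ollama endpoint; adjust the URL if you serve elsewhere.
import urllib.error
import urllib.request

def ollama_is_up(url: str = "http://localhost:11434") -> bool:
    try:
        with urllib.request.urlopen(url, timeout=2) as resp:
            return resp.status == 200
    except (urllib.error.URLError, OSError):
        return False

if not ollama_is_up():
    raise SystemExit("Ollama is not reachable - start it with `ollama serve`.")
```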
Example 3: VLM Form Extraction¶
Use Case: Extract data from image forms using vision model.
File: examples/vlm_form.py
"""
VLM extraction from image forms.
"""
from docling_graph import run_pipeline, PipelineConfig
config = PipelineConfig(
source="documents/id_card.jpg",
template="templates.id_card.IDCard",
backend="vlm",
inference="local", # VLM only supports local
processing_mode="one-to-one",
docling_config="vision"
)
print("Extracting from image...")
context = run_pipeline(config)
print("✅ Complete!")
# Display results
graph = context.knowledge_graph
print(f"\nExtracted {graph.number_of_nodes()} nodes")
for node_id, node_data in list(graph.nodes(data=True))[:5]:
print(f" - {node_id}: {node_data}")
Run:

```bash
uv run python examples/vlm_form.py
```
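To keep the extracted fields around for downstream use, the node view can be dumped straight to JSON. A sketch, assuming the node attribute values are JSON-serializable:

```python
# Sketch: dump extracted nodes to JSON for downstream use.
# Assumes node attribute values are JSON-serializable.
import json

records = [{"id": n, **attrs} for n, attrs in graph.nodes(data=True)]
with open("id_card_nodes.json", "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)
```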
Example 4: Rheology Research with Consolidation¶
Use Case: High-accuracy extraction from complex documents.
File: examples/research_consolidation.py
"""
Rheology research extraction with LLM consolidation.
"""
import os
from docling_graph import run_pipeline, PipelineConfig
os.environ["MISTRAL_API_KEY"] = "your-api-key"
config = PipelineConfig(
source="documents/research_paper.pdf",
template="docs.examples.templates.rheology_research.ScholarlyRheologyPaper",
backend="llm",
inference="remote",
provider_override="mistral",
model_override="mistral-large-latest",
processing_mode="many-to-one",
use_chunking=True,
llm_consolidation=True, # Higher accuracy
docling_config="vision" # Better for complex layouts
)
print("Processing rheology research (this may take a few minutes)...")
context = run_pipeline(config)
print("✅ Complete!")
# Analyze results
graph = context.knowledge_graph
print(f"\nGraph Statistics:")
print(f" Nodes: {graph.number_of_nodes()}")
print(f" Edges: {graph.number_of_edges()}")
Run:

```bash
uv run python examples/research_consolidation.py
```
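Beyond raw counts, it is often useful to break the graph down by node type. A small sketch, assuming each node carries a `type` attribute (as the notebook example below also relies on):

```python
# Sketch: break node counts down by type.
# Assumes each node has a "type" attribute, as in Example 8.
from collections import Counter

type_counts = Counter(attrs.get("type", "unknown") for _, attrs in graph.nodes(data=True))
for node_type, count in type_counts.most_common():
    print(f"  {node_type}: {count}")
```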
Example 5: Batch Processing¶
Use Case: Process multiple documents with progress tracking.
File: examples/batch_process.py
"""
Batch process multiple documents.
"""
from pathlib import Path
from docling_graph import run_pipeline, PipelineConfig
from tqdm import tqdm
def process_batch(input_dir: str, template: str):
"""Process all PDFs in a directory."""
documents = list(Path(input_dir).glob("*.pdf"))
results = {"success": [], "failed": []}
print(f"Processing {len(documents)} documents...")
for doc in tqdm(documents, desc="Processing"):
try:
config = PipelineConfig(
source=str(doc),
template=template
)
run_pipeline(config)
results["success"].append(doc.name)
except Exception as e:
results["failed"].append((doc.name, str(e)))
tqdm.write(f"❌ {doc.name}: {e}")
# Summary
print(f"\n{'='*50}")
print(f"Completed: {len(results['success'])} succeeded")
print(f"Failed: {len(results['failed'])}")
if results["failed"]:
print("\nFailed documents:")
for name, error in results["failed"]:
print(f" - {name}: {error}")
return results
if __name__ == "__main__":
results = process_batch(
input_dir="documents/invoices",
template="templates.billing_document.BillingDocument"
)
Run:

```bash
uv run python examples/batch_process.py
```
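For large batches where each document is independent, the sequential loop above can be parallelized. A sketch using a process pool, assuming your backend and any rate limits tolerate concurrent runs (the worker count is an illustrative choice, and each worker pays its own model-loading cost):

```python
# Sketch: parallel variant of process_batch.
# Assumes pipeline runs are independent and the backend tolerates
# concurrent requests; max_workers is an illustrative value.
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

from docling_graph import run_pipeline, PipelineConfig

def _process_one(doc: Path, template: str) -> str:
    run_pipeline(PipelineConfig(source=str(doc), template=template))
    return doc.name

def process_batch_parallel(input_dir: str, template: str, max_workers: int = 4):
    documents = list(Path(input_dir).glob("*.pdf"))
    results = {"success": [], "failed": []}
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(_process_one, doc, template): doc for doc in documents}
        for future in as_completed(futures):
            doc = futures[future]
            try:
                results["success"].append(future.result())
            except Exception as e:
                results["failed"].append((doc.name, str(e)))
    return results
```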
Example 6: Robust Error Handling¶
Use Case: Production-ready code with comprehensive error handling.
File: examples/robust_processing.py
"""
Production-ready document processing with error handling.
"""
import logging
from pathlib import Path
from typing import Optional
from docling_graph import run_pipeline, PipelineConfig
from docling_graph.exceptions import (
ConfigurationError,
ExtractionError,
PipelineError,
DoclingGraphError
)
# Setup logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
def process_document(
source: str,
template: str,
max_retries: int = 3
) -> bool:
"""
Process document with retry logic and error handling.
Args:
source: Path to source document
template: Pydantic template path
max_retries: Maximum retry attempts
Returns:
True if successful, False otherwise
"""
for attempt in range(1, max_retries + 1):
try:
logger.info(f"Processing {source} (attempt {attempt}/{max_retries})")
config = PipelineConfig(
source=source,
template=template
)
run_pipeline(config)
logger.info(f"✅ Successfully processed: {source}")
return True
except ConfigurationError as e:
logger.error(f"Configuration error: {e.message}")
if e.details:
logger.error(f"Details: {e.details}")
return False # Don't retry configuration errors
except ExtractionError as e:
logger.error(f"Extraction failed: {e.message}")
if attempt < max_retries:
logger.info(f"Retrying... ({attempt}/{max_retries})")
continue
return False
except PipelineError as e:
logger.error(f"Pipeline error: {e.message}")
if attempt < max_retries:
logger.info(f"Retrying... ({attempt}/{max_retries})")
continue
return False
except DoclingGraphError as e:
logger.error(f"Docling-graph error: {e.message}")
return False
except Exception as e:
logger.exception(f"Unexpected error: {e}")
return False
return False
if __name__ == "__main__":
# Process single document
success = process_document(
source="documents/invoice.pdf",
template="templates.billing_document.BillingDocument"
)
if success:
print("Processing completed successfully")
else:
print("Processing failed")
exit(1)
Run:

```bash
uv run python examples/robust_processing.py
```
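The retry loop above retries immediately. For transient failures such as rate limits or network blips, an exponential backoff between attempts is usually kinder to the backend. A sketch of the delay logic (the base delay, cap, and jitter range are illustrative values):

```python
# Sketch: exponential backoff between retries.
# Base delay, cap, and jitter range are illustrative values.
import random
import time

def sleep_with_backoff(attempt: int, base: float = 2.0, cap: float = 60.0) -> None:
    """Sleep before the given (1-indexed) retry attempt, exponentially longer each time."""
    delay = min(cap, base * (2 ** (attempt - 1))) * random.uniform(0.5, 1.5)
    time.sleep(delay)

# In process_document, call this just before `continue`:
#     sleep_with_backoff(attempt)
```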
Example 7: Flask API Integration¶
Use Case: Web API for document processing.
File: examples/flask_api.py
"""
Flask API for document processing.
"""
from flask import Flask, request, jsonify, send_file
from werkzeug.utils import secure_filename
from pathlib import Path
import uuid
import os
from docling_graph import run_pipeline, PipelineConfig
from docling_graph.exceptions import DoclingGraphError
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'temp'
app.config['OUTPUT_FOLDER'] = 'outputs'
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max
# Ensure directories exist
Path(app.config['UPLOAD_FOLDER']).mkdir(exist_ok=True)
Path(app.config['OUTPUT_FOLDER']).mkdir(exist_ok=True)
@app.route('/health', methods=['GET'])
def health():
"""Health check endpoint."""
return jsonify({"status": "healthy"})
@app.route('/process', methods=['POST'])
def process_document():
"""Process uploaded document."""
# Validate request
if 'document' not in request.files:
return jsonify({"error": "No document provided"}), 400
file = request.files['document']
if file.filename == '':
return jsonify({"error": "Empty filename"}), 400
template = request.form.get('template', 'templates.billing_document.BillingDocument')
# Save file
job_id = str(uuid.uuid4())
filename = secure_filename(file.filename)
temp_path = Path(app.config['UPLOAD_FOLDER']) / f"{job_id}_{filename}"
file.save(temp_path)
try:
# Process document
config = PipelineConfig(
source=str(temp_path),
template=template
)
context = run_pipeline(config)
return jsonify({
"status": "success",
"job_id": job_id,
"model": context.pydantic_model.model_dump()
})
except DoclingGraphError as e:
return jsonify({
"status": "error",
"message": e.message,
"details": e.details
}), 500
except Exception as e:
return jsonify({
"status": "error",
"message": str(e)
}), 500
finally:
# Cleanup temp file
temp_path.unlink(missing_ok=True)
if __name__ == '__main__':
app.run(debug=True, port=5000)
Run:

```bash
uv run python examples/flask_api.py
```
Test:
```bash
# Upload and process a document
curl -X POST http://localhost:5000/process \
  -F "document=@invoice.pdf" \
  -F "template=templates.billing_document.BillingDocument"

# Download results
curl -O http://localhost:5000/download/{job_id}/nodes.csv
```
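Note that the download URL above assumes a `/download` route that the example app does not define. A minimal sketch of one, added to `examples/flask_api.py`, assuming the pipeline writes per-job exports under `OUTPUT_FOLDER/<job_id>/` (adjust the path to wherever your configuration actually writes outputs):

```python
# Sketch: a /download route to pair with the curl call above.
# Assumes exports land under OUTPUT_FOLDER/<job_id>/; adjust to match
# where your pipeline configuration actually writes outputs.
from flask import abort, send_file
from werkzeug.utils import secure_filename

@app.route("/download/<job_id>/<filename>", methods=["GET"])
def download(job_id: str, filename: str):
    # secure_filename guards both URL components against path traversal
    file_path = (
        Path(app.config["OUTPUT_FOLDER"])
        / secure_filename(job_id)
        / secure_filename(filename)
    )
    if not file_path.is_file():
        abort(404)
    return send_file(file_path, as_attachment=True)
```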
Example 8: Jupyter Notebook Analysis¶
Use Case: Interactive document analysis in Jupyter.
File: examples/notebook_analysis.ipynb
```python
# Cell 1: Setup
from docling_graph import run_pipeline, PipelineConfig
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

# Cell 2: Process document
config = PipelineConfig(
    source="documents/research.pdf",
    template="docs.examples.templates.rheology_research.ScholarlyRheologyPaper",
)

print("Processing document...")
context = run_pipeline(config)
print("✅ Complete!")

# Cell 3: Load results
graph = context.knowledge_graph
nodes = pd.DataFrame([{"id": node_id, **attrs} for node_id, attrs in graph.nodes(data=True)])
edges = pd.DataFrame(
    [{"source": u, "target": v, **attrs} for u, v, attrs in graph.edges(data=True)]
)
print(f"Nodes: {len(nodes)}")
print(f"Edges: {len(edges)}")

# Cell 4: Analyze node types
node_counts = nodes["type"].value_counts()
print("\nNode Type Distribution:")
print(node_counts)

# Visualize
plt.figure(figsize=(10, 6))
node_counts.plot(kind="bar")
plt.title("Node Type Distribution")
plt.xlabel("Node Type")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Cell 5: Analyze relationships
edge_counts = edges["type"].value_counts()
print("\nRelationship Distribution:")
print(edge_counts)

# Visualize
plt.figure(figsize=(10, 6))
edge_counts.plot(kind="bar", color="coral")
plt.title("Relationship Type Distribution")
plt.xlabel("Relationship Type")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Cell 6: Network analysis
import networkx as nx

# Rebuild a directed graph from the edge table
G = nx.DiGraph()
for _, edge in edges.iterrows():
    G.add_edge(edge["source"], edge["target"], type=edge["type"])

print("\nNetwork Statistics:")
print(f"  Nodes: {G.number_of_nodes()}")
print(f"  Edges: {G.number_of_edges()}")
print(f"  Density: {nx.density(G):.3f}")
print(f"  Is connected: {nx.is_weakly_connected(G)}")

# Cell 7: Visualize network
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, k=0.5, iterations=50)
nx.draw(
    G,
    pos,
    node_color="lightblue",
    node_size=500,
    with_labels=True,
    font_size=8,
    arrows=True,
    edge_color="gray",
    alpha=0.7,
)
plt.title("Knowledge Graph Visualization")
plt.tight_layout()
plt.show()
```
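To surface the most connected entities, a further cell can rank nodes by degree centrality. A sketch; any other networkx centrality measure slots in the same way:

```python
# Cell 8: Rank nodes by degree centrality (sketch; other networkx
# centrality measures can be substituted the same way)
centrality = nx.degree_centrality(G)
top_nodes = sorted(centrality.items(), key=lambda kv: kv[1], reverse=True)[:10]

print("Most connected nodes:")
for node, score in top_nodes:
    print(f"  {node}: {score:.3f}")
```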
Run:

```bash
uv run jupyter notebook examples/notebook_analysis.ipynb
```
Best Practices¶
👍 Use Environment Variables for Secrets¶
```python
# ✅ Good - read the key from the environment and fail fast if missing
import os

api_key = os.getenv("MISTRAL_API_KEY")
if not api_key:
    raise RuntimeError("MISTRAL_API_KEY is not set")

# ❌ Avoid - hardcoded secrets
os.environ["MISTRAL_API_KEY"] = "sk-1234..."  # Don't commit!
```
👍 Handle Errors Gracefully¶
```python
# ✅ Good - specific error handling
from docling_graph.exceptions import ExtractionError

try:
    run_pipeline(config)
except ExtractionError as e:
    logger.error(f"Extraction failed: {e.message}")
    # Implement a fallback here

# ❌ Avoid - silent failures
try:
    run_pipeline(config)
except:
    pass
```
Next Steps¶
- Batch Processing → Advanced batch patterns
- Examples → Real-world examples
- Advanced Topics → Custom backends