Graph Analysis¶
Overview¶
Graph analysis helps you understand the structure, quality, and characteristics of your knowledge graphs through metrics, statistics, and validation.
In this guide:

- Graph metrics
- Quality checks
- Connectivity analysis
- Performance optimization
- Validation techniques
Graph Metrics¶
Basic Metrics¶
from docling_graph.core.converters import GraphConverter
# Create a graph from extracted Pydantic models
converter = GraphConverter()
graph, metadata = converter.pydantic_list_to_graph(models)  # `models`: list of extracted Pydantic instances
# Basic metrics
print(f"Nodes: {metadata.node_count}")
print(f"Edges: {metadata.edge_count}")
print(f"Density: {metadata.density:.3f}")
print(f"Avg degree: {metadata.avg_degree:.2f}")
Node Metrics¶
Node Count by Type¶
# Node type distribution
for node_type, count in metadata.node_types.items():
    percentage = (count / metadata.node_count) * 100
    print(f"{node_type}: {count} ({percentage:.1f}%)")
Node Degree¶
import networkx as nx
# Calculate node degrees
degrees = dict(graph.degree())
# Statistics
avg_degree = sum(degrees.values()) / len(degrees)
max_degree = max(degrees.values())
min_degree = min(degrees.values())
print(f"Average degree: {avg_degree:.2f}")
print(f"Max degree: {max_degree}")
print(f"Min degree: {min_degree}")
# Find high-degree nodes (hubs)
hubs = [(node, deg) for node, deg in degrees.items() if deg > avg_degree * 2]
print(f"Hub nodes: {len(hubs)}")
Edge Metrics¶
Edge Count by Type¶
# Edge type distribution
for edge_type, count in metadata.edge_types.items():
    percentage = (count / metadata.edge_count) * 100
    print(f"{edge_type}: {count} ({percentage:.1f}%)")
Edge Density¶
# Graph density (actual edges / possible edges)
density = metadata.density

if density < 0.1:
    print("Sparse graph")
elif density < 0.5:
    print("Medium density graph")
else:
    print("Dense graph")
Connectivity Analysis¶
Connected Components¶
import networkx as nx
# Find connected components
components = list(nx.weakly_connected_components(graph))
print(f"Connected components: {len(components)}")
print(f"Largest component: {len(max(components, key=len))} nodes")
# Check if graph is connected
is_connected = nx.is_weakly_connected(graph)
print(f"Graph is connected: {is_connected}")
Isolated Nodes¶
# Find isolated nodes (no connections)
isolated = [node for node, degree in graph.degree() if degree == 0]
print(f"Isolated nodes: {len(isolated)}")

if isolated:
    print("Warning: Graph has isolated nodes")
    for node in isolated[:5]:
        print(f" - {node}")
Quality Checks¶
Validation¶
from docling_graph.core.utils import validate_graph_structure

try:
    validate_graph_structure(graph, raise_on_error=True)
    print("✅ Graph structure valid")
except ValueError as e:
    print(f"❌ Validation failed: {e}")
Completeness Check¶
def check_completeness(graph, metadata):
    """Check graph completeness."""
    issues = []

    # Check for nodes
    if metadata.node_count == 0:
        issues.append("No nodes in graph")

    # Check for edges
    if metadata.edge_count == 0:
        issues.append("No edges in graph")

    # Check for isolated nodes
    isolated = [n for n, d in graph.degree() if d == 0]
    if isolated:
        issues.append(f"{len(isolated)} isolated nodes")

    # Check node attributes
    nodes_without_label = [
        n for n, data in graph.nodes(data=True)
        if 'label' not in data
    ]
    if nodes_without_label:
        issues.append(f"{len(nodes_without_label)} nodes without labels")

    return issues

# Run check
issues = check_completeness(graph, metadata)

if issues:
    print("Graph issues found:")
    for issue in issues:
        print(f" - {issue}")
else:
    print("✅ Graph is complete")
Complete Examples¶
📍 Comprehensive Analysis¶
from docling_graph.core.converters import GraphConverter
import networkx as nx

# Create graph
converter = GraphConverter()
graph, metadata = converter.pydantic_list_to_graph(models)

print("=== Graph Analysis ===\n")

# Basic metrics
print("Basic Metrics:")
print(f" Nodes: {metadata.node_count}")
print(f" Edges: {metadata.edge_count}")
print(f" Density: {metadata.density:.3f}")
print(f" Avg degree: {metadata.avg_degree:.2f}\n")

# Node types
print("Node Types:")
for node_type, count in sorted(metadata.node_types.items(), key=lambda x: x[1], reverse=True):
    percentage = (count / metadata.node_count) * 100
    print(f" {node_type}: {count} ({percentage:.1f}%)")

# Edge types
print("\nEdge Types:")
for edge_type, count in sorted(metadata.edge_types.items(), key=lambda x: x[1], reverse=True):
    percentage = (count / metadata.edge_count) * 100
    print(f" {edge_type}: {count} ({percentage:.1f}%)")

# Connectivity
print("\nConnectivity:")
components = list(nx.weakly_connected_components(graph))
print(f" Connected components: {len(components)}")
print(f" Largest component: {len(max(components, key=len))} nodes")

# Quality
print("\nQuality:")
isolated = [n for n, d in graph.degree() if d == 0]
print(f" Isolated nodes: {len(isolated)}")
print(f" Graph is connected: {nx.is_weakly_connected(graph)}")
📍 Batch Analysis¶
from docling_graph import run_pipeline, PipelineConfig
from pathlib import Path
import json
import pandas as pd

# Analyze multiple documents
results = []

for pdf_file in Path("documents").glob("*.pdf"):
    # Process document
    output_dir = f"analysis/{pdf_file.stem}"
    config = PipelineConfig(
        source=str(pdf_file),
        template="templates.BillingDocument",
        output_dir=output_dir
    )
    run_pipeline(config)

    # Load statistics
    with open(f"{output_dir}/graph_stats.json") as f:
        stats = json.load(f)

    results.append({
        "document": pdf_file.name,
        "nodes": stats["node_count"],
        "edges": stats["edge_count"],
        "density": stats["density"],
        "avg_degree": stats["avg_degree"]
    })

# Create summary
df = pd.DataFrame(results)
print("\n=== Batch Analysis Summary ===")
print(df.describe())

# Export
df.to_csv("batch_analysis.csv", index=False)
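From here it is straightforward to flag outliers across the batch, for example documents whose density falls well below the batch average (the 50% threshold is an arbitrary illustration):

# Flag unusually sparse documents (threshold is an example value)
threshold = df["density"].mean() * 0.5
sparse_docs = df[df["density"] < threshold]
if not sparse_docs.empty:
    print("\nDocuments to review:")
    print(sparse_docs[["document", "density"]])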
📍 Quality Report¶
from docling_graph.core.converters import GraphConverter
import networkx as nx

def generate_quality_report(graph, metadata):
    """Generate comprehensive quality report."""
    report = {
        "basic_metrics": {
            "nodes": metadata.node_count,
            "edges": metadata.edge_count,
            "density": metadata.density,
            "avg_degree": metadata.avg_degree
        },
        "quality_checks": {},
        "warnings": []
    }

    # Check 1: Empty graph
    if metadata.node_count == 0:
        report["warnings"].append("Graph is empty")
        return report

    # Check 2: Isolated nodes
    isolated = [n for n, d in graph.degree() if d == 0]
    report["quality_checks"]["isolated_nodes"] = len(isolated)
    if isolated:
        report["warnings"].append(f"{len(isolated)} isolated nodes found")

    # Check 3: Connectivity
    is_connected = nx.is_weakly_connected(graph)
    report["quality_checks"]["is_connected"] = is_connected
    if not is_connected:
        components = list(nx.weakly_connected_components(graph))
        report["warnings"].append(f"Graph has {len(components)} disconnected components")

    # Check 4: Node attributes
    nodes_without_label = sum(1 for _, data in graph.nodes(data=True) if 'label' not in data)
    report["quality_checks"]["nodes_without_label"] = nodes_without_label
    if nodes_without_label > 0:
        report["warnings"].append(f"{nodes_without_label} nodes missing labels")

    # Check 5: Self-loops
    self_loops = list(nx.selfloop_edges(graph))
    report["quality_checks"]["self_loops"] = len(self_loops)
    if self_loops:
        report["warnings"].append(f"{len(self_loops)} self-loops found")

    return report

# Generate report
converter = GraphConverter()
graph, metadata = converter.pydantic_list_to_graph(models)
report = generate_quality_report(graph, metadata)

# Print report
print("=== Quality Report ===\n")
print("Basic Metrics:")
for key, value in report["basic_metrics"].items():
    print(f" {key}: {value}")

print("\nQuality Checks:")
for key, value in report["quality_checks"].items():
    print(f" {key}: {value}")

if report["warnings"]:
    print("\nWarnings:")
    for warning in report["warnings"]:
        print(f" ⚠ {warning}")
else:
    print("\n✅ No quality issues found")
Advanced Analysis¶
Centrality Measures¶
import networkx as nx

# Degree centrality
degree_centrality = nx.degree_centrality(graph)
top_nodes = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:5]

print("Top 5 nodes by degree centrality:")
for node, centrality in top_nodes:
    print(f" {node}: {centrality:.3f}")

# Betweenness centrality (on the undirected view)
undirected = graph.to_undirected()
betweenness = nx.betweenness_centrality(undirected)
top_between = sorted(betweenness.items(), key=lambda x: x[1], reverse=True)[:5]

print("\nTop 5 nodes by betweenness centrality:")
for node, centrality in top_between:
    print(f" {node}: {centrality:.3f}")
Path Analysis¶
import networkx as nx

# Both metrics require a connected graph
if nx.is_weakly_connected(graph):
    undirected = graph.to_undirected()

    # Average shortest path length
    avg_path_length = nx.average_shortest_path_length(undirected)
    print(f"Average shortest path length: {avg_path_length:.2f}")

    # Diameter (longest shortest path)
    diameter = nx.diameter(undirected)
    print(f"Graph diameter: {diameter}")
Performance Optimization¶
Graph Size Optimization¶
# Estimate graph size. Note: sys.getsizeof() is shallow and would only
# measure the graph object itself, so serialize for a rough deep estimate.
import pickle

graph_size = len(pickle.dumps(graph))
print(f"Graph size: {graph_size / 1024:.2f} KB")

# Optimize by removing unnecessary attributes
def optimize_graph(graph):
    """Drop non-essential node attributes whose value is None."""
    essential = ['id', 'label', 'type']
    for node, data in graph.nodes(data=True):
        to_remove = [k for k in data.keys() if k not in essential and data[k] is None]
        for key in to_remove:
            del data[key]
    return graph

optimized = optimize_graph(graph.copy())
optimized_size = len(pickle.dumps(optimized))
print(f"Optimized size: {optimized_size / 1024:.2f} KB")
print(f"Reduction: {(1 - optimized_size / graph_size) * 100:.1f}%")
Best Practices¶
👍 Always Validate¶
# ✅ Good - Validate after creation
from docling_graph.core.utils import validate_graph_structure

try:
    validate_graph_structure(graph, raise_on_error=True)
except ValueError as e:
    print(f"Validation failed: {e}")
👍 Check Statistics¶
# ✅ Good - Review statistics
if metadata.node_count == 0:
    print("Warning: Empty graph")

if metadata.edge_count == 0:
    print("Warning: No relationships")

if metadata.density < 0.01:
    print("Warning: Very sparse graph")
👍 Monitor Quality¶
# ✅ Good - Regular quality checks
isolated = [n for n, d in graph.degree() if d == 0]
if len(isolated) > metadata.node_count * 0.1:
    print(f"Warning: {len(isolated)} isolated nodes (>10%)")
Troubleshooting¶
🐛 Low Density¶
Solution:
# Ensure relationships are defined in your Pydantic models: nested
# model fields are what become edges in the graph.
from typing import List
from pydantic import BaseModel

class BillingDocument(BaseModel):
    issued_by: Organization     # Creates an edge
    line_items: List[LineItem]  # Creates one edge per item
🐛 Many Isolated Nodes¶
Solution:
# Enable auto cleanup
converter = GraphConverter(auto_cleanup=True)
graph, metadata = converter.pydantic_list_to_graph(models)
# Or manually remove isolated nodes
isolated = [n for n, d in graph.degree() if d == 0]
graph.remove_nodes_from(isolated)
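After cleanup, you can confirm that no isolates remain:

import networkx as nx

# Confirm the cleanup removed every isolated node
assert nx.number_of_isolates(graph) == 0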
🐛 Disconnected Components¶
Solution:
# Find largest component
import networkx as nx
components = list(nx.weakly_connected_components(graph))
largest = max(components, key=len)
# Extract largest component
subgraph = graph.subgraph(largest).copy()
print(f"Largest component: {len(subgraph.nodes())} nodes")
Next Steps¶
Now that you understand graph analysis:
- CLI Guide → Use command-line tools
- API Reference → Programmatic access
- Examples → Real-world examples