Graph Analysis

Overview

Graph analysis helps you understand the structure, quality, and characteristics of your knowledge graphs through metrics, statistics, and validation.

In this guide:

- Graph metrics
- Quality checks
- Connectivity analysis
- Performance optimization
- Validation techniques


Graph Metrics

Basic Metrics

from docling_graph.core.converters import GraphConverter

# Create graph ("models" is your list of extracted Pydantic instances)
converter = GraphConverter()
graph, metadata = converter.pydantic_list_to_graph(models)

# Basic metrics
print(f"Nodes: {metadata.node_count}")
print(f"Edges: {metadata.edge_count}")
print(f"Density: {metadata.density:.3f}")
print(f"Avg degree: {metadata.avg_degree:.2f}")

Node Metrics

Node Count by Type

# Node type distribution
for node_type, count in metadata.node_types.items():
    percentage = (count / metadata.node_count) * 100
    print(f"{node_type}: {count} ({percentage:.1f}%)")

Node Degree

import networkx as nx

# Calculate node degrees
degrees = dict(graph.degree())

# Statistics
avg_degree = sum(degrees.values()) / len(degrees)
max_degree = max(degrees.values())
min_degree = min(degrees.values())

print(f"Average degree: {avg_degree:.2f}")
print(f"Max degree: {max_degree}")
print(f"Min degree: {min_degree}")

# Find high-degree nodes (hubs)
hubs = [(node, deg) for node, deg in degrees.items() if deg > avg_degree * 2]
print(f"Hub nodes: {len(hubs)}")

Edge Metrics

Edge Count by Type

# Edge type distribution
for edge_type, count in metadata.edge_types.items():
    percentage = (count / metadata.edge_count) * 100
    print(f"{edge_type}: {count} ({percentage:.1f}%)")

Edge Density

# Graph density (actual edges / possible edges)
density = metadata.density

if density < 0.1:
    print("Sparse graph")
elif density < 0.5:
    print("Medium density graph")
else:
    print("Dense graph")

Connectivity Analysis

Connected Components

import networkx as nx

# Find connected components
components = list(nx.weakly_connected_components(graph))

print(f"Connected components: {len(components)}")
print(f"Largest component: {len(max(components, key=len))} nodes")

# Check if graph is connected
is_connected = nx.is_weakly_connected(graph)
print(f"Graph is connected: {is_connected}")

Isolated Nodes

# Find isolated nodes (no connections)
isolated = [node for node, degree in graph.degree() if degree == 0]

print(f"Isolated nodes: {len(isolated)}")
if isolated:
    print("Warning: Graph has isolated nodes")
    for node in isolated[:5]:
        print(f"  - {node}")

Quality Checks

Validation

from docling_graph.core.utils import validate_graph_structure

try:
    validate_graph_structure(graph, raise_on_error=True)
    print("✅ Graph structure valid")
except ValueError as e:
    print(f"❌ Validation failed: {e}")

Completeness Check

def check_completeness(graph, metadata):
    """Check graph completeness."""
    issues = []

    # Check for nodes
    if metadata.node_count == 0:
        issues.append("No nodes in graph")

    # Check for edges
    if metadata.edge_count == 0:
        issues.append("No edges in graph")

    # Check for isolated nodes
    isolated = [n for n, d in graph.degree() if d == 0]
    if isolated:
        issues.append(f"{len(isolated)} isolated nodes")

    # Check node attributes
    nodes_without_label = [
        n for n, data in graph.nodes(data=True)
        if 'label' not in data
    ]
    if nodes_without_label:
        issues.append(f"{len(nodes_without_label)} nodes without labels")

    return issues

# Run check
issues = check_completeness(graph, metadata)
if issues:
    print("Graph issues found:")
    for issue in issues:
        print(f"  - {issue}")
else:
    print("✅ Graph is complete")

Complete Examples

📍 Comprehensive Analysis

from docling_graph.core.converters import GraphConverter
import networkx as nx

# Create graph
converter = GraphConverter()
graph, metadata = converter.pydantic_list_to_graph(models)

print("=== Graph Analysis ===\n")

# Basic metrics
print("Basic Metrics:")
print(f"  Nodes: {metadata.node_count}")
print(f"  Edges: {metadata.edge_count}")
print(f"  Density: {metadata.density:.3f}")
print(f"  Avg degree: {metadata.avg_degree:.2f}\n")

# Node types
print("Node Types:")
for node_type, count in sorted(metadata.node_types.items(), key=lambda x: x[1], reverse=True):
    percentage = (count / metadata.node_count) * 100
    print(f"  {node_type}: {count} ({percentage:.1f}%)")

# Edge types
print("\nEdge Types:")
for edge_type, count in sorted(metadata.edge_types.items(), key=lambda x: x[1], reverse=True):
    percentage = (count / metadata.edge_count) * 100
    print(f"  {edge_type}: {count} ({percentage:.1f}%)")

# Connectivity
print("\nConnectivity:")
components = list(nx.weakly_connected_components(graph))
print(f"  Connected components: {len(components)}")
print(f"  Largest component: {len(max(components, key=len))} nodes")

# Quality
print("\nQuality:")
isolated = [n for n, d in graph.degree() if d == 0]
print(f"  Isolated nodes: {len(isolated)}")
print(f"  Graph is connected: {nx.is_weakly_connected(graph)}")

📍 Batch Analysis

from docling_graph import run_pipeline, PipelineConfig
from pathlib import Path
import json
import pandas as pd

# Analyze multiple documents
results = []

for pdf_file in Path("documents").glob("*.pdf"):
    # Process document
    output_dir = f"analysis/{pdf_file.stem}"

    config = PipelineConfig(
        source=str(pdf_file),
        template="templates.BillingDocument",
        output_dir=output_dir
    )

    run_pipeline(config)

    # Load statistics
    with open(f"{output_dir}/graph_stats.json") as f:
        stats = json.load(f)

    results.append({
        "document": pdf_file.name,
        "nodes": stats["node_count"],
        "edges": stats["edge_count"],
        "density": stats["density"],
        "avg_degree": stats["avg_degree"]
    })

# Create summary
df = pd.DataFrame(results)
print("\n=== Batch Analysis Summary ===")
print(df.describe())

# Export
df.to_csv("batch_analysis.csv", index=False)
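With the summary DataFrame in hand, you can also flag documents that fall outside expected ranges. The thresholds below are illustrative, not library defaults:

# Flag potentially problematic documents (thresholds are illustrative)
sparse = df[df["density"] < 0.01]
empty = df[df["nodes"] == 0]

if not sparse.empty:
    print("Very sparse graphs:")
    print(sparse[["document", "density"]].to_string(index=False))
if not empty.empty:
    print("Empty graphs:", empty["document"].tolist())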

📍 Quality Report

from docling_graph.core.converters import GraphConverter
import networkx as nx

def generate_quality_report(graph, metadata):
    """Generate comprehensive quality report."""

    report = {
        "basic_metrics": {
            "nodes": metadata.node_count,
            "edges": metadata.edge_count,
            "density": metadata.density,
            "avg_degree": metadata.avg_degree
        },
        "quality_checks": {},
        "warnings": []
    }

    # Check 1: Empty graph
    if metadata.node_count == 0:
        report["warnings"].append("Graph is empty")
        return report

    # Check 2: Isolated nodes
    isolated = [n for n, d in graph.degree() if d == 0]
    report["quality_checks"]["isolated_nodes"] = len(isolated)
    if isolated:
        report["warnings"].append(f"{len(isolated)} isolated nodes found")

    # Check 3: Connectivity
    is_connected = nx.is_weakly_connected(graph)
    report["quality_checks"]["is_connected"] = is_connected
    if not is_connected:
        components = list(nx.weakly_connected_components(graph))
        report["warnings"].append(f"Graph has {len(components)} disconnected components")

    # Check 4: Node attributes
    nodes_without_label = sum(1 for _, data in graph.nodes(data=True) if 'label' not in data)
    report["quality_checks"]["nodes_without_label"] = nodes_without_label
    if nodes_without_label > 0:
        report["warnings"].append(f"{nodes_without_label} nodes missing labels")

    # Check 5: Self-loops
    self_loops = list(nx.selfloop_edges(graph))
    report["quality_checks"]["self_loops"] = len(self_loops)
    if self_loops:
        report["warnings"].append(f"{len(self_loops)} self-loops found")

    return report

# Generate report
converter = GraphConverter()
graph, metadata = converter.pydantic_list_to_graph(models)

report = generate_quality_report(graph, metadata)

# Print report
print("=== Quality Report ===\n")
print("Basic Metrics:")
for key, value in report["basic_metrics"].items():
    print(f"  {key}: {value}")

print("\nQuality Checks:")
for key, value in report["quality_checks"].items():
    print(f"  {key}: {value}")

if report["warnings"]:
    print("\nWarnings:")
    for warning in report["warnings"]:
        print(f"  ⚠ {warning}")
else:
    print("\n✅ No quality issues found")

Advanced Analysis

Centrality Measures

import networkx as nx

# Degree centrality
degree_centrality = nx.degree_centrality(graph)
top_nodes = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:5]

print("Top 5 nodes by degree centrality:")
for node, centrality in top_nodes:
    print(f"  {node}: {centrality:.3f}")

# Betweenness centrality (for undirected view)
undirected = graph.to_undirected()
betweenness = nx.betweenness_centrality(undirected)
top_between = sorted(betweenness.items(), key=lambda x: x[1], reverse=True)[:5]

print("\nTop 5 nodes by betweenness centrality:")
for node, centrality in top_between:
    print(f"  {node}: {centrality:.3f}")

Path Analysis

import networkx as nx

# Path metrics are only defined for connected graphs
if nx.is_weakly_connected(graph):
    undirected = graph.to_undirected()

    # Average shortest path length
    avg_path_length = nx.average_shortest_path_length(undirected)
    print(f"Average shortest path length: {avg_path_length:.2f}")

    # Diameter (longest shortest path)
    diameter = nx.diameter(undirected)
    print(f"Graph diameter: {diameter}")

Performance Optimization

Graph Size Optimization

# Check graph size. sys.getsizeof() is shallow -- it measures only the
# top-level object, not the node/edge attribute dicts -- so use the
# serialized size as a rough proxy for the memory footprint.
import pickle

graph_size = len(pickle.dumps(graph))
print(f"Graph size (serialized): {graph_size / 1024:.2f} KB")

# Optimize by removing unnecessary attributes
def optimize_graph(graph):
    """Drop non-essential node attributes whose value is None."""
    essential = ['id', 'label', 'type']
    for node, data in graph.nodes(data=True):
        to_remove = [k for k in data if k not in essential and data[k] is None]
        for key in to_remove:
            del data[key]
    return graph

optimized = optimize_graph(graph.copy())
optimized_size = len(pickle.dumps(optimized))
print(f"Optimized size: {optimized_size / 1024:.2f} KB")
print(f"Reduction: {(1 - optimized_size / graph_size) * 100:.1f}%")

Best Practices

👍 Always Validate

# ✅ Good - Validate after creation
from docling_graph.core.utils import validate_graph_structure

try:
    validate_graph_structure(graph, raise_on_error=True)
except ValueError as e:
    print(f"Validation failed: {e}")

👍 Check Statistics

# ✅ Good - Review statistics
if metadata.node_count == 0:
    print("Warning: Empty graph")

if metadata.edge_count == 0:
    print("Warning: No relationships")

if metadata.density < 0.01:
    print("Warning: Very sparse graph")

👍 Monitor Quality

# ✅ Good - Regular quality checks
isolated = [n for n, d in graph.degree() if d == 0]
if len(isolated) > metadata.node_count * 0.1:
    print(f"Warning: {len(isolated)} isolated nodes (>10%)")

Troubleshooting

🐛 Low Density

Solution:

# Low density usually means entities are not linked. Relationships must be
# defined as fields on your Pydantic models so the converter can create edges.
from typing import List

from pydantic import BaseModel

# Organization and LineItem are your own entity models
class BillingDocument(BaseModel):
    issued_by: Organization      # Creates an edge to an Organization node
    line_items: List[LineItem]   # Creates one edge per line item

🐛 Many Isolated Nodes

Solution:

# Enable auto cleanup
converter = GraphConverter(auto_cleanup=True)
graph, metadata = converter.pydantic_list_to_graph(models)

# Or manually remove isolated nodes
isolated = [n for n, d in graph.degree() if d == 0]
graph.remove_nodes_from(isolated)

🐛 Disconnected Components

Solution:

# Find largest component
import networkx as nx

components = list(nx.weakly_connected_components(graph))
largest = max(components, key=len)

# Extract largest component
subgraph = graph.subgraph(largest).copy()
print(f"Largest component: {len(subgraph.nodes())} nodes")


Next Steps

Now that you understand graph analysis:

  1. CLI Guide → Use command-line tools
  2. API Reference → Programmatic access
  3. Examples → Real-world examples