# Source code for qbiocode.data_generation.make_blobs

"""
Generate synthetic blob (Gaussian cluster) datasets.

This module creates multiple configurations of blob datasets with varying
numbers of samples, features, centers, and cluster standard deviations,
useful for testing clustering and classification algorithms.
"""

from sklearn.datasets import make_blobs
import pandas as pd
import numpy as np
import json
import itertools
import os



# [docs]
def _json_default(value):
    """Convert NumPy scalars/arrays to built-in types for ``json.dump``.

    Only invoked by the JSON encoder for objects it cannot serialize itself,
    so plain ``int``/``float``/``list`` metadata is written exactly as before.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def generate_blobs_datasets(
    n_samples,
    n_features,
    centers,
    cluster_std,
    save_path=None,
    random_state=42,
):
    """
    Generate multiple blob (Gaussian cluster) datasets with varying parameters.

    One dataset is produced with ``sklearn.datasets.make_blobs`` for every
    combination (Cartesian product) of the supplied sample sizes, feature
    counts, center counts and cluster standard deviations.

    Parameters
    ----------
    n_samples : list of int
        List of sample sizes to generate for each configuration.
        Example: [100, 200, 300]
    n_features : list of int
        List of feature dimensions to generate.
        Example: [2, 4, 8]
    centers : list of int
        List of numbers of cluster centers (classes).
        Example: [2, 3, 4]
    cluster_std : list of float
        List of standard deviations of the clusters.
        Example: [0.5, 1.0, 1.5, 2.0]
    save_path : str, optional
        Directory path to save generated datasets. The directory is created
        if it does not exist. If None, datasets are not saved.
        Default: None
    random_state : int, optional
        Random seed for reproducibility. The same value is passed to every
        configuration.
        Default: 42

    Returns
    -------
    dict
        Dictionary containing generated datasets with keys as configuration
        strings of the form
        ``'n_samples_{n}_n_features_{f}_centers_{c}_cluster_std_{s}'``
        and values as tuples of (X, y) where:
        - X : pd.DataFrame, shape (n_samples, n_features)
            Feature matrix with columns ``feature_0 .. feature_{f-1}``
        - y : pd.Series, shape (n_samples,)
            Target labels, named ``'target'``
        The dictionary is empty if any of the parameter lists is empty.

    Notes
    -----
    - Generates all combinations of input parameters
    - When ``save_path`` is given, three files are written per configuration:
      ``{key}_X.csv``, ``{key}_y.csv`` and ``{key}_config.json``
    - NumPy scalar parameters (e.g. from ``np.arange``) are converted to
      built-in numbers when the JSON metadata is written
    - Blobs are well-separated when cluster_std is small relative to center
      distances

    Examples
    --------
    >>> from qbiocode.data_generation import generate_blobs_datasets
    >>>
    >>> # Generate simple blob datasets
    >>> datasets = generate_blobs_datasets(
    ...     n_samples=[100, 200],
    ...     n_features=[2, 4],
    ...     centers=[2, 3],
    ...     cluster_std=[1.0, 1.5]
    ... )
    >>>
    >>> # Access a specific configuration
    >>> X, y = datasets['n_samples_100_n_features_2_centers_2_cluster_std_1.0']
    >>> print(f"Shape: {X.shape}, Classes: {y.nunique()}")
    Shape: (100, 2), Classes: 2

    >>> # Save datasets to disk
    >>> datasets = generate_blobs_datasets(
    ...     n_samples=[100],
    ...     n_features=[2],
    ...     centers=[3],
    ...     cluster_std=[1.0],
    ...     save_path='./data/blobs'
    ... )

    See Also
    --------
    generate_circles_datasets : Generate concentric circles
    generate_moons_datasets : Generate interleaving half-circles
    generate_classification_datasets : Generate high-dimensional classification data

    References
    ----------
    .. [1] Pedregosa et al., "Scikit-learn: Machine Learning in Python",
           JMLR 12, pp. 2825-2830, 2011.
    """
    dataset_config = {}

    # The output directory does not depend on the configuration, so create it
    # once up front instead of on every loop iteration.
    if save_path is not None:
        os.makedirs(save_path, exist_ok=True)

    # Iterate lazily over every combination of the parameter grids.
    for n_samp, n_feat, n_cent, c_std in itertools.product(
        n_samples, n_features, centers, cluster_std
    ):
        # Generate dataset
        X, y = make_blobs(
            n_samples=n_samp,
            n_features=n_feat,
            centers=n_cent,
            cluster_std=c_std,
            random_state=random_state,
            shuffle=True
        )

        # Convert to pandas for consistency with other QBioCode functions
        X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(n_feat)])
        y_series = pd.Series(y, name='target')

        # Create configuration key (also used as the file-name prefix)
        config_key = f'n_samples_{n_samp}_n_features_{n_feat}_centers_{n_cent}_cluster_std_{c_std}'

        # Store in dictionary
        dataset_config[config_key] = (X_df, y_series)

        if save_path is None:
            continue

        # Save features
        X_df.to_csv(
            os.path.join(save_path, f'{config_key}_X.csv'),
            index=False
        )

        # Save labels
        y_series.to_csv(
            os.path.join(save_path, f'{config_key}_y.csv'),
            index=False,
            header=True
        )

        # Save configuration metadata
        metadata = {
            'n_samples': n_samp,
            'n_features': n_feat,
            'centers': n_cent,
            'cluster_std': c_std,
            'random_state': random_state,
            'dataset_type': 'blobs'
        }

        config_file = os.path.join(save_path, f'{config_key}_config.json')
        with open(config_file, 'w', encoding='utf-8') as f:
            # ``default`` lets NumPy scalars/arrays in the grid be serialized
            # instead of raising TypeError after the CSVs were written.
            json.dump(metadata, f, indent=2, default=_json_default)

    return dataset_config



# Parameter grids used by generate_default_blobs_datasets. Their Cartesian
# product yields 3 * 3 * 3 * 4 = 108 dataset configurations.
DEFAULT_N_SAMPLES = [100, 200, 300]  # rows per dataset
DEFAULT_N_FEATURES = [2, 4, 8]  # feature columns per dataset
DEFAULT_CENTERS = [2, 3, 4]  # number of blobs (classes)
DEFAULT_CLUSTER_STD = [0.5, 1.0, 1.5, 2.0]  # per-blob standard deviation



# [docs]
def generate_default_blobs_datasets(save_path=None, random_state=42):
    """
    Generate blob datasets over the module's default parameter grids.

    Thin wrapper around ``generate_blobs_datasets`` that supplies
    ``DEFAULT_N_SAMPLES``, ``DEFAULT_N_FEATURES``, ``DEFAULT_CENTERS`` and
    ``DEFAULT_CLUSTER_STD`` as the parameter lists.

    Parameters
    ----------
    save_path : str, optional
        Directory in which to write the generated datasets; forwarded
        unchanged. If None, nothing is written to disk.
        Default: None
    random_state : int, optional
        Random seed for reproducibility; forwarded unchanged.
        Default: 42

    Returns
    -------
    dict
        The dictionary returned by ``generate_blobs_datasets`` for the
        default grids.

    Examples
    --------
    >>> from qbiocode.data_generation import generate_default_blobs_datasets
    >>> datasets = generate_default_blobs_datasets()
    >>> print(f"Generated {len(datasets)} dataset configurations")
    """
    # Collect the default grids under the keyword names the generator expects.
    default_grid = {
        'n_samples': DEFAULT_N_SAMPLES,
        'n_features': DEFAULT_N_FEATURES,
        'centers': DEFAULT_CENTERS,
        'cluster_std': DEFAULT_CLUSTER_STD,
    }
    datasets = generate_blobs_datasets(
        save_path=save_path,
        random_state=random_state,
        **default_grid,
    )
    return datasets


# Made with Bob