Source code for qbiocode.data_generation.make_blobs

"""
Generate synthetic blob (Gaussian cluster) datasets.

This module creates multiple configurations of blob datasets with varying
numbers of samples, features, centers, and cluster standard deviations,
useful for testing clustering and classification algorithms.
"""

from sklearn.datasets import make_blobs
import pandas as pd
import numpy as np
import json
import itertools
import os


def _to_native(value):
    """Return *value* as a built-in Python scalar if it is a NumPy scalar."""
    # json cannot serialize NumPy scalars such as np.int64; .item() yields
    # the equivalent built-in int/float/bool. Other values pass through.
    return value.item() if isinstance(value, np.generic) else value


def _save_blob_dataset(save_path, config_key, X_df, y_series, metadata):
    """Write one configuration's features, labels and metadata to *save_path*."""
    # exist_ok=True makes this safe to repeat for every configuration.
    os.makedirs(save_path, exist_ok=True)

    # Features and labels go to separate CSV files without the index column.
    X_df.to_csv(os.path.join(save_path, f'{config_key}_X.csv'), index=False)
    y_series.to_csv(
        os.path.join(save_path, f'{config_key}_y.csv'),
        index=False,
        header=True,
    )

    # Parameters used for this dataset, stored next to the data.
    config_file = os.path.join(save_path, f'{config_key}_config.json')
    with open(config_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)


def generate_blobs_datasets(
    n_samples,
    n_features,
    centers,
    cluster_std,
    save_path=None,
    random_state=42,
):
    """
    Generate multiple blob (Gaussian cluster) datasets with varying parameters.

    Creates a series of synthetic datasets consisting of isotropic Gaussian
    blobs for clustering and classification tasks. Each configuration varies
    the number of samples, features, cluster centers, and cluster spread.

    Parameters
    ----------
    n_samples : list of int
        List of sample sizes to generate for each configuration.
        Example: [100, 200, 300]
    n_features : list of int
        List of feature dimensions to generate.
        Example: [2, 4, 8]
    centers : list of int
        List of numbers of cluster centers (classes).
        Example: [2, 3, 4]
    cluster_std : list of float
        List of standard deviations of the clusters.
        Example: [0.5, 1.0, 1.5, 2.0]
    save_path : str, optional
        Directory path to save generated datasets. If None, datasets are
        not saved. Default: None
    random_state : int, optional
        Random seed for reproducibility. The same seed is passed to every
        configuration. Default: 42

    Returns
    -------
    dict
        Dictionary containing generated datasets with keys as configuration
        strings and values as tuples of (X, y) where:

        - X : pd.DataFrame, shape (n_samples, n_features)
            Feature matrix with columns ``feature_0 .. feature_{n-1}``
        - y : pd.Series, shape (n_samples,)
            Target labels, named ``target``

    Notes
    -----
    - Generates all combinations of input parameters
    - Each blob is an isotropic Gaussian distribution
    - Useful for testing classification and clustering algorithms
    - Blobs are well-separated when cluster_std is small relative to
      center distances
    - When ``save_path`` is given, each configuration writes three files:
      ``<key>_X.csv``, ``<key>_y.csv`` and ``<key>_config.json``
    - NumPy scalar parameters (e.g. values taken from ``np.arange``) are
      converted to built-in Python numbers before the JSON metadata is
      written, so saving does not fail on them

    Examples
    --------
    >>> from qbiocode.data_generation import generate_blobs_datasets
    >>>
    >>> # Generate simple blob datasets
    >>> datasets = generate_blobs_datasets(
    ...     n_samples=[100, 200],
    ...     n_features=[2, 4],
    ...     centers=[2, 3],
    ...     cluster_std=[1.0, 1.5]
    ... )
    >>>
    >>> # Access a specific configuration
    >>> X, y = datasets['n_samples_100_n_features_2_centers_2_cluster_std_1.0']
    >>> print(f"Shape: {X.shape}, Classes: {y.nunique()}")
    Shape: (100, 2), Classes: 2

    >>> # Save datasets to disk
    >>> datasets = generate_blobs_datasets(
    ...     n_samples=[100],
    ...     n_features=[2],
    ...     centers=[3],
    ...     cluster_std=[1.0],
    ...     save_path='./data/blobs'
    ... )

    See Also
    --------
    generate_circles_datasets : Generate concentric circles
    generate_moons_datasets : Generate interleaving half-circles
    generate_classification_datasets : Generate high-dimensional classification data

    References
    ----------
    .. [1] Pedregosa et al., "Scikit-learn: Machine Learning in Python",
       JMLR 12, pp. 2825-2830, 2011.
    """
    dataset_config = {}

    # Walk the full Cartesian product of the four parameter lists.
    for n_samp, n_feat, n_cent, c_std in itertools.product(
        n_samples, n_features, centers, cluster_std
    ):
        X, y = make_blobs(
            n_samples=n_samp,
            n_features=n_feat,
            centers=n_cent,
            cluster_std=c_std,
            random_state=random_state,
            shuffle=True,
        )

        # Convert to pandas for consistency with other QBioCode functions
        X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(n_feat)])
        y_series = pd.Series(y, name='target')

        # The key encodes the parameters and doubles as the file-name stem.
        config_key = (
            f'n_samples_{n_samp}_n_features_{n_feat}'
            f'_centers_{n_cent}_cluster_std_{c_std}'
        )
        dataset_config[config_key] = (X_df, y_series)

        if save_path is not None:
            metadata = {
                'n_samples': _to_native(n_samp),
                'n_features': _to_native(n_feat),
                'centers': _to_native(n_cent),
                'cluster_std': _to_native(c_std),
                'random_state': _to_native(random_state),
                'dataset_type': 'blobs',
            }
            _save_blob_dataset(save_path, config_key, X_df, y_series, metadata)

    return dataset_config
# Default parameter configurations for quick generation.
# Each list is one axis of the parameter grid; the full grid is their
# Cartesian product (3 * 3 * 3 * 4 = 108 configurations).
DEFAULT_N_SAMPLES = [100, 200, 300]  # sample sizes
DEFAULT_N_FEATURES = [2, 4, 8]  # feature dimensions
DEFAULT_CENTERS = [2, 3, 4]  # number of cluster centers (classes)
DEFAULT_CLUSTER_STD = [0.5, 1.0, 1.5, 2.0]  # cluster standard deviations
def generate_default_blobs_datasets(save_path=None, random_state=42):
    """
    Build the standard collection of blob datasets.

    Thin convenience wrapper around ``generate_blobs_datasets`` that supplies
    the module-level ``DEFAULT_*`` parameter lists, so callers only choose
    where (or whether) to save and which seed to use.

    Parameters
    ----------
    save_path : str, optional
        Directory in which to save the generated datasets. If None, nothing
        is written to disk. Default: None
    random_state : int, optional
        Random seed for reproducibility. Default: 42

    Returns
    -------
    dict
        Dictionary containing generated datasets, exactly as returned by
        ``generate_blobs_datasets``.

    Examples
    --------
    >>> from qbiocode.data_generation import generate_default_blobs_datasets
    >>> datasets = generate_default_blobs_datasets()
    >>> print(f"Generated {len(datasets)} dataset configurations")
    """
    # Gather the predefined grid first, then forward it together with the
    # caller-controlled options.
    default_grid = {
        'n_samples': DEFAULT_N_SAMPLES,
        'n_features': DEFAULT_N_FEATURES,
        'centers': DEFAULT_CENTERS,
        'cluster_std': DEFAULT_CLUSTER_STD,
    }
    datasets = generate_blobs_datasets(
        **default_grid,
        save_path=save_path,
        random_state=random_state,
    )
    return datasets
# Made with Bob