# Source code for qbiocode.utils.find_duplicates

"""
File duplicate detection utilities for identifying identical files in directories.

This module provides functions to find duplicate files based on content comparison,
useful for cleaning up redundant configuration files or identifying duplicate datasets.
"""

import os
import itertools
from typing import List, Tuple, Optional


def find_duplicate_files(
    directory: str,
    file_pattern: Optional[str] = None,
    ignore_empty_lines: bool = True,
    case_sensitive: bool = True,
    verbose: bool = False,
) -> List[Tuple[str, str]]:
    """
    Find files with identical content in a directory.

    Scans the specified directory for files and compares their content
    line by line. Files whose (optionally normalized) lines are equal as
    multisets — i.e. identical content regardless of line order — are
    reported as duplicates, even if they have different names.

    This is particularly useful for:
    - Finding duplicate configuration files (e.g., YAML, JSON)
    - Identifying redundant experiment configurations
    - Cleaning up duplicate datasets before batch processing
    - Validating file uniqueness in automated workflows

    Parameters
    ----------
    directory : str
        Path to the directory to search for duplicate files.
    file_pattern : str, optional
        File extension or pattern to filter (e.g., '.yaml', '.csv', '.txt').
        If None, all files are compared. Default is None.
    ignore_empty_lines : bool, optional
        If True, empty lines are ignored during comparison. Default is True.
    case_sensitive : bool, optional
        If True, comparison is case-sensitive. Default is True.
    verbose : bool, optional
        If True, print progress information during comparison. Default is False.

    Returns
    -------
    List[Tuple[str, str]]
        List of tuples, where each tuple contains paths of two duplicate
        files. Returns empty list if no duplicates are found.

    Raises
    ------
    FileNotFoundError
        If the specified directory does not exist.
    NotADirectoryError
        If the specified path is not a directory.

    Examples
    --------
    Find all duplicate files in a directory:

    >>> duplicates = find_duplicate_files("configs/")
    >>> if duplicates:
    ...     print(f"Found {len(duplicates)} duplicate pairs")

    Find duplicate YAML configuration files:

    >>> duplicates = find_duplicate_files(
    ...     "configs/qml_gridsearch/",
    ...     file_pattern='.yaml',
    ...     verbose=True
    ... )
    >>> for file1, file2 in duplicates:
    ...     print(f"Duplicate: {file1} == {file2}")

    Case-insensitive comparison:

    >>> duplicates = find_duplicate_files(
    ...     "data/",
    ...     file_pattern='.txt',
    ...     case_sensitive=False
    ... )

    Notes
    -----
    - Files are compared line by line after sorting (order-independent).
    - Each file is read exactly once; files are grouped by a canonical
      content key, so total I/O is linear in the number of files rather
      than quadratic as a naive all-pairs comparison would be.
    - The canonical keys of all files are held in memory simultaneously,
      so very large directories of very large files may use significant
      memory.
    - Binary files are not supported; files that cannot be decoded as
      UTF-8 (or cannot be read due to permissions) are skipped, with a
      warning when ``verbose`` is True.
    - Only the top level of ``directory`` is scanned (no recursion);
      hidden files are included; symbolic links to files are followed.

    See Also
    --------
    find_string_in_files : Search for specific strings across multiple files
    checkpoint_restart : Resume interrupted batch processing jobs
    """
    # Validate input directory
    if not os.path.exists(directory):
        raise FileNotFoundError(f"Directory not found: {directory}")
    if not os.path.isdir(directory):
        raise NotADirectoryError(f"Path is not a directory: {directory}")

    # Collect files to compare (top level only, applying the optional filter)
    files = []
    for entry in os.scandir(directory):
        if entry.is_file():
            if file_pattern is None or entry.name.endswith(file_pattern):
                files.append(entry.path)

    if verbose:
        print(f"Comparing {len(files)} files in {directory}")
        if file_pattern:
            print(f"Filtering by pattern: {file_pattern}")

    # Read each file ONCE and reduce it to a canonical, hashable key.
    # (A naive all-pairs comparison would re-read and re-normalize every
    # file O(n) times; grouping by key yields the identical pair set with
    # a single read per file.)
    groups: dict = {}
    for path in files:
        try:
            with open(path, 'r', encoding='utf-8') as fh:
                lines = fh.readlines()
        except (UnicodeDecodeError, PermissionError) as e:
            # Unreadable files are simply excluded from comparison, so no
            # pair involving them is ever reported.
            if verbose:
                print(f"  Warning: Could not read {path}: {e}")
            continue

        # Normalize per the requested comparison options
        if ignore_empty_lines:
            lines = [line for line in lines if line.strip()]
        if not case_sensitive:
            lines = [line.lower() for line in lines]

        # Sorted tuple => order-independent, hashable canonical form
        key = tuple(sorted(lines))
        groups.setdefault(key, []).append(path)

    # Every pair of files within a group is a duplicate pair
    duplicates: List[Tuple[str, str]] = []
    for paths in groups.values():
        if len(paths) < 2:
            continue
        for file1, file2 in itertools.combinations(paths, 2):
            duplicates.append((file1, file2))
            if verbose:
                print(f"  Duplicate found: {os.path.basename(file1)} == {os.path.basename(file2)}")

    if verbose:
        print(f"\nFound {len(duplicates)} duplicate file pairs")

    return duplicates