Speed Up Large Git Repositories

David Childs

Optimize Git performance for large repositories using shallow clones, partial clones, sparse checkouts, and advanced configuration strategies.

Large Git repositories can bring development to a crawl. After optimizing repositories with millions of commits and hundreds of gigabytes of history, I've learned that Git performance is about smart configuration, strategic workflows, and understanding Git's internals. Here's how to keep Git fast at any scale.

Understanding Git Performance

Performance Analysis Framework

# git_performance_analyzer.py
import subprocess
import time
import os
from pathlib import Path
from typing import Dict, List
import psutil

class GitPerformanceAnalyzer:
    def __init__(self, repo_path: str = "."):
        self.repo_path = Path(repo_path)
        self.metrics = {}
        
    def benchmark_operations(self) -> Dict:
        """Benchmark common Git operations"""
        
        operations = {
            'status': ['git', 'status'],
            'log': ['git', 'log', '--oneline', '-100'],
            'diff': ['git', 'diff'],
            'branch_list': ['git', 'branch', '-a'],
            'fetch': ['git', 'fetch', '--dry-run'],
            'blame': ['git', 'blame', 'README.md']
        }
        
        results = {}
        
        for op_name, command in operations.items():
            start_time = time.time()
            # RSS of this Python process, not of the git child process, so the
            # memory delta below is only a rough proxy for what git itself used
            start_memory = psutil.Process().memory_info().rss
            
            try:
                subprocess.run(
                    command,
                    cwd=self.repo_path,
                    capture_output=True,
                    timeout=30
                )
                
                elapsed = time.time() - start_time
                memory_used = psutil.Process().memory_info().rss - start_memory
                
                results[op_name] = {
                    'time_seconds': round(elapsed, 3),
                    'memory_mb': round(memory_used / (1024 * 1024), 2),
                    'status': 'success'
                }
            except subprocess.TimeoutExpired:
                results[op_name] = {
                    'time_seconds': 30,
                    'status': 'timeout'
                }
            except Exception as e:
                results[op_name] = {
                    'status': 'error',
                    'error': str(e)
                }
        
        return results
    
    def analyze_repository_size(self) -> Dict:
        """Analyze repository size and structure"""
        
        # Get repository size
        git_dir = self.repo_path / '.git'
        
        size_info = {
            'total_size_mb': 0,
            'objects_size_mb': 0,
            'pack_files': 0,
            'loose_objects': 0,
            'index_size_mb': 0
        }
        
        # Calculate .git directory size, tracking the objects/ subtree separately
        for root, dirs, files in os.walk(git_dir):
            for file in files:
                filepath = Path(root) / file
                file_mb = filepath.stat().st_size / (1024 * 1024)
                size_info['total_size_mb'] += file_mb
                if 'objects' in Path(root).parts:
                    size_info['objects_size_mb'] += file_mb
        
        # Count pack files
        pack_dir = git_dir / 'objects' / 'pack'
        if pack_dir.exists():
            size_info['pack_files'] = len(list(pack_dir.glob('*.pack')))
        
        # Count loose objects
        objects_dir = git_dir / 'objects'
        for subdir in objects_dir.iterdir():
            if subdir.is_dir() and len(subdir.name) == 2:  # Object subdirectories
                size_info['loose_objects'] += len(list(subdir.iterdir()))
        
        # Index size
        index_file = git_dir / 'index'
        if index_file.exists():
            size_info['index_size_mb'] = index_file.stat().st_size / (1024 * 1024)
        
        # Get commit count
        commit_count = subprocess.run(
            ['git', 'rev-list', '--all', '--count'],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        ).stdout.strip()
        
        size_info['commit_count'] = int(commit_count) if commit_count else 0
        
        # Get branch count
        branch_count = subprocess.run(
            ['git', 'branch', '-a', '--no-column'],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        ).stdout.count('\n')
        
        size_info['branch_count'] = branch_count
        
        return size_info
    
    def identify_large_files(self, top_n: int = 10) -> List[Dict]:
        """Identify largest files in repository history"""
        
        # List every object reachable from any ref, then size them with a single
        # cat-file --batch-check call instead of one subprocess per object
        rev_list = subprocess.run(
            ['git', 'rev-list', '--objects', '--all'],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        
        sha_to_path = {}
        for line in rev_list.stdout.splitlines():
            if ' ' in line:
                sha, path = line.split(' ', 1)
                sha_to_path[sha] = path
        
        batch = subprocess.run(
            ['git', 'cat-file', '--batch-check'],
            cwd=self.repo_path,
            input='\n'.join(sha_to_path) + '\n',
            capture_output=True,
            text=True
        )
        
        # Keep the largest blob recorded for each path in history
        objects = {}
        for line in batch.stdout.splitlines():
            parts = line.split()
            if len(parts) == 3 and parts[1] == 'blob':
                sha, _, size = parts
                path = sha_to_path[sha]
                objects[path] = max(objects.get(path, 0), int(size))
        
        # Sort by size
        sorted_objects = sorted(objects.items(), key=lambda x: x[1], reverse=True)
        
        return [
            {
                'path': path,
                'size_mb': round(size / (1024 * 1024), 2)
            }
            for path, size in sorted_objects[:top_n]
        ]
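
A quick way to put the analyzer to work is to run it before and after the optimizations in the rest of this article, so the improvement can be quantified. A minimal usage sketch, assuming the class above is saved as git_performance_analyzer.py:

# run_analysis.py: usage sketch for the analyzer above
from git_performance_analyzer import GitPerformanceAnalyzer

analyzer = GitPerformanceAnalyzer(".")

# Benchmark everyday operations, slowest first
benchmarks = analyzer.benchmark_operations()
for op, result in sorted(
    benchmarks.items(),
    key=lambda item: item[1].get('time_seconds', 0),
    reverse=True,
):
    print(f"{op:12s} {result}")

# Repository shape: size, pack/loose object counts, commits, branches
print(analyzer.analyze_repository_size())

# The ten largest blobs anywhere in history: prime candidates for Git LFS
# or removal with git-filter-repo
for entry in analyzer.identify_large_files(top_n=10):
    print(f"{entry['size_mb']:>10.2f} MB  {entry['path']}")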

Clone Optimization

Smart Clone Strategies

#!/bin/bash
# smart_clone.sh

# Shallow clone with specific depth
shallow_clone() {
    REPO_URL=$1
    DEPTH=${2:-1}
    
    echo "Performing shallow clone with depth $DEPTH..."
    git clone --depth $DEPTH $REPO_URL
    
    cd $(basename $REPO_URL .git)
    
    # Convert to full clone if needed later
    # git fetch --unshallow
}

# Partial clone (Git 2.19+)
partial_clone() {
    REPO_URL=$1
    
    echo "Performing partial clone..."
    
    # Clone without blobs
    git clone --filter=blob:none $REPO_URL
    
    # Or clone without large files
    # git clone --filter=blob:limit=1m $REPO_URL
    
    # Or clone without any trees
    # git clone --filter=tree:0 $REPO_URL
}

# Sparse checkout clone
sparse_clone() {
    REPO_URL=$1
    shift
    PATHS=$@
    
    echo "Performing sparse clone..."
    
    # Clone with no checkout
    git clone --no-checkout $REPO_URL
    cd $(basename $REPO_URL .git)
    
    # Initialize sparse checkout
    git sparse-checkout init --cone
    
    # Set paths to include
    for path in $PATHS; do
        git sparse-checkout add $path
    done
    
    # Checkout files
    git checkout
}

# Optimized clone for CI/CD
ci_optimized_clone() {
    REPO_URL=$1
    BRANCH=${2:-main}
    
    echo "Optimized CI clone..."
    
    # Single branch, shallow clone
    git clone \
        --single-branch \
        --branch $BRANCH \
        --depth 1 \
        --filter=blob:none \
        --no-tags \
        $REPO_URL
    
    cd $(basename $REPO_URL .git)
    
    # Disable automatic GC
    git config gc.auto 0
    
    # Disable fsmonitor
    git config core.fsmonitor false
}

# Clone with reference repository
reference_clone() {
    REPO_URL=$1
    REFERENCE_REPO=$2
    
    echo "Clone using reference repository..."
    
    git clone --reference $REFERENCE_REPO $REPO_URL
    
    # This shares objects with the reference repo, saving space and time.
    # The reference repo must remain available afterwards; add --dissociate
    # to copy the borrowed objects into the new clone once it finishes.
}

Advanced Clone Configuration

# clone_optimizer.py
import os
import subprocess

class CloneOptimizer:
    def __init__(self):
        self.config = {}
        
    def optimize_for_size(self, repo_url: str, target_dir: str):
        """Clone optimized for minimal size"""
        
        commands = [
            # Initial clone
            ['git', 'clone', '--filter=blob:none', '--single-branch', 
             '--depth=1', repo_url, target_dir],
        ]
        
        for cmd in commands:
            subprocess.run(cmd)
        
        # Configure for performance
        os.chdir(target_dir)
        
        configs = [
            ['core.preloadindex', 'true'],
            ['core.fscache', 'true'],
            ['gc.auto', '0'],
            ['fetch.prune', 'true'],
            ['fetch.pruneTags', 'true']
        ]
        
        for key, value in configs:
            subprocess.run(['git', 'config', key, value])
    
    def progressive_clone(self, repo_url: str, target_dir: str):
        """Progressive clone - start minimal, expand as needed"""
        
        # Stage 1: Minimal clone
        subprocess.run([
            'git', 'clone', '--filter=tree:0', '--single-branch',
            '--depth=1', repo_url, target_dir
        ])
        
        os.chdir(target_dir)
        
        # Stage 2: Fetch recent history
        subprocess.run(['git', 'fetch', '--depth=100'])
        
        # Stage 3: Enable sparse checkout so specific paths can be added on demand
        subprocess.run(['git', 'sparse-checkout', 'init', '--cone'])
        
        def fetch_path(path: str):
            subprocess.run(['git', 'sparse-checkout', 'add', path])
        
        return fetch_path
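
Because progressive_clone returns the fetch_path helper, widening the checkout later is a single call. A short usage sketch; the repository URL and paths are placeholders:

# progressive_clone_usage.py: illustrative only, URL and paths are placeholders
from clone_optimizer import CloneOptimizer

optimizer = CloneOptimizer()

# Stages 1 and 2: minimal treeless, single-branch clone plus recent history
fetch_path = optimizer.progressive_clone(
    "https://github.com/example/huge-repo.git",  # placeholder URL
    "huge-repo",
)

# Stage 3: pull in only the directories this task actually touches
fetch_path("services/billing")   # placeholder paths
fetch_path("libs/shared")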

Repository Optimization

Git GC and Maintenance

#!/bin/bash
# repo_maintenance.sh

# Aggressive garbage collection
aggressive_gc() {
    echo "Running aggressive garbage collection..."
    
    # Remove all reflogs
    git reflog expire --expire=now --all
    
    # Aggressive GC
    git gc --aggressive --prune=now
    
    # Repack with optimal settings
    git repack -a -d -f --depth=250 --window=250
    
    # Remove untracked AND ignored files from the working tree.
    # Destructive: only run this on a disposable checkout.
    git clean -fdx
}

# Incremental optimization
incremental_optimize() {
    echo "Running incremental optimization..."
    
    # Prune old objects
    git prune --expire=2.weeks.ago
    
    # Optimize repository
    git gc --auto
    
    # Update index
    git update-index --refresh
    
    # Verify integrity
    git fsck --full
}

# Pack optimization
optimize_packs() {
    echo "Optimizing pack files..."
    
    # Consolidate everything into a single pack and write a bitmap index
    # for faster object counting (bitmaps require a single pack)
    git repack -Adb
    
    # Write commit graph for faster traversal
    git commit-graph write --reachable
    
    # Write multi-pack-index
    git multi-pack-index write
}

# Remove large files from history
remove_large_files() {
    FILE_PATTERN=$1
    
    echo "Removing $FILE_PATTERN from history..."
    
    # Using git-filter-repo (recommended); --path-glob handles patterns like '*.zip'
    if command -v git-filter-repo &> /dev/null; then
        git filter-repo --path-glob "$FILE_PATTERN" --invert-paths
    else
        # Fallback to filter-branch (slower, deprecated)
        git filter-branch --force --index-filter \
            "git rm --cached --ignore-unmatch '$FILE_PATTERN'" \
            --prune-empty --tag-name-filter cat -- --all
    fi
    
    # Clean up
    rm -rf .git/refs/original/
    git reflog expire --expire=now --all
    git gc --prune=now --aggressive
}

Performance Configuration

# performance_config.py
import subprocess

class GitPerformanceConfig:
    def __init__(self, repo_path: str = "."):
        self.repo_path = repo_path
        
    def apply_performance_configs(self):
        """Apply comprehensive performance configurations"""
        
        configs = {
            # Core performance
            'core.preloadindex': 'true',
            'core.fscache': 'true',
            'core.multipackindex': 'true',
            'core.commitGraph': 'true',
            'core.untrackedCache': 'true',
            'core.fsmonitor': 'true',
            
            # Index
            'index.threads': 'true',
            'index.version': '4',
            
            # Pack settings
            'pack.useBitmaps': 'true',
            'pack.writeBitmapHashCache': 'true',
            'pack.threads': '0',  # Use all CPU cores
            
            # Fetch optimizations
            'fetch.negotiationAlgorithm': 'skipping',
            'fetch.writeCommitGraph': 'true',
            
            # Diff settings
            'diff.algorithm': 'histogram',
            'diff.renames': 'copies',
            
            # Merge settings
            'merge.renames': 'true',
            'merge.stat': 'false',
            
            # GC settings
            'gc.auto': '256',
            'gc.autopacklimit': '10',
            'gc.writeCommitGraph': 'true',
            
            # Protocol
            'protocol.version': '2',
            
            # Feature settings
            'feature.manyFiles': 'true',
            'feature.experimental': 'true'
        }
        
        for key, value in configs.items():
            subprocess.run(
                ['git', 'config', key, value],
                cwd=self.repo_path
            )
        
        print(f"Applied {len(configs)} performance configurations")
    
    def optimize_for_platform(self):
        """Platform-specific optimizations"""
        
        import platform
        
        system = platform.system()
        
        if system == 'Windows':
            configs = {
                'core.fscache': 'true',
                'core.longpaths': 'true',
                'core.symlinks': 'false'
            }
        elif system == 'Darwin':  # macOS
            configs = {
                'core.precomposeunicode': 'true',
                'core.ignorecase': 'true'
            }
        else:  # Linux
            configs = {
                'core.preloadindex': 'true'
            }
        
        for key, value in configs.items():
            subprocess.run(['git', 'config', key, value], cwd=self.repo_path)
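
Applying both the general and the platform-specific settings is then a two-call affair. A small sketch, assuming the class above is saved as performance_config.py:

# tune_repo.py: minimal usage sketch for GitPerformanceConfig
import subprocess
from performance_config import GitPerformanceConfig

config = GitPerformanceConfig(".")
config.apply_performance_configs()   # commit graph, fsmonitor, pack and fetch tuning
config.optimize_for_platform()       # Windows/macOS/Linux specific tweaks

# Spot-check that a few of the key settings took effect
for key in ('core.commitGraph', 'core.fsmonitor', 'feature.manyFiles'):
    value = subprocess.run(
        ['git', 'config', '--get', key],
        capture_output=True, text=True
    ).stdout.strip()
    print(f"{key} = {value or '(unset)'}")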

Sparse Checkout Optimization

Advanced Sparse Checkout

# sparse_checkout.py
import subprocess
from pathlib import Path
from typing import List

class SparseCheckoutManager:
    def __init__(self, repo_path: str = "."):
        self.repo_path = Path(repo_path)
        self.sparse_file = self.repo_path / '.git' / 'info' / 'sparse-checkout'
        
    def enable_cone_mode(self):
        """Enable cone mode for better performance"""
        
        subprocess.run(
            ['git', 'sparse-checkout', 'init', '--cone'],
            cwd=self.repo_path
        )
        
    def optimize_for_monorepo(self, team_paths: List[str]):
        """Optimize sparse checkout for monorepo"""
        
        # Enable sparse checkout
        self.enable_cone_mode()
        
        # Add team-specific paths
        for path in team_paths:
            subprocess.run(
                ['git', 'sparse-checkout', 'add', path],
                cwd=self.repo_path
            )
        
        # Add common paths
        common_paths = [
            'docs',
            'scripts',
            '.github',
            'README.md'
        ]
        
        for path in common_paths:
            if (self.repo_path / path).exists():
                subprocess.run(
                    ['git', 'sparse-checkout', 'add', path],
                    cwd=self.repo_path
                )
    
    def dynamic_sparse_checkout(self):
        """Dynamically adjust sparse checkout based on usage"""
        
        # Track file access patterns
        accessed_files = self.track_accessed_files()
        
        # Determine paths to include
        paths_to_include = set()
        for file in accessed_files:
            # Add parent directory
            paths_to_include.add(str(Path(file).parent))
        
        # Update sparse checkout via the porcelain command rather than writing
        # .git/info/sparse-checkout directly, so cone-mode patterns stay consistent
        subprocess.run(
            ['git', 'sparse-checkout', 'set', *sorted(paths_to_include)],
            cwd=self.repo_path
        )
    
    def track_accessed_files(self) -> List[str]:
        """Track which files are being accessed"""
        
        # This would integrate with filesystem monitoring
        # For demonstration, we'll check recently modified files
        
        result = subprocess.run(
            ['git', 'log', '--name-only', '--pretty=format:', '-100'],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        
        files = []
        for line in result.stdout.strip().split('\n'):
            if line and not line.startswith('commit'):
                files.append(line)
        
        return files
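
In a monorepo, each team can then narrow its working tree to the directories it owns. A short usage sketch; the team paths are placeholders:

# team_checkout.py: illustrative sketch, team paths are placeholders
from sparse_checkout import SparseCheckoutManager

manager = SparseCheckoutManager(".")

# Check out only the payments team's code, plus the common paths
# (docs, scripts, .github, README.md) added by optimize_for_monorepo
manager.optimize_for_monorepo([
    "services/payments",       # placeholder paths
    "libs/payments-client",
])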

Working with Large Files

Large File Handling

#!/bin/bash
# large_file_handling.sh

# Split large files for Git
split_large_file() {
    FILE=$1
    CHUNK_SIZE=${2:-100M}
    
    echo "Splitting $FILE into $CHUNK_SIZE chunks..."
    
    # Split file
    split -b $CHUNK_SIZE $FILE $FILE.part.
    
    # Create manifest
    cat > $FILE.manifest << EOF
#!/bin/bash
# Reconstruct $FILE
cat $FILE.part.* > $FILE
rm $FILE.part.*
EOF
    
    chmod +x $FILE.manifest
    
    # Remove original
    rm $FILE
    
    # Add parts to git
    git add $FILE.part.* $FILE.manifest
}

# Use git-annex for large files
setup_git_annex() {
    echo "Setting up git-annex..."
    
    # Initialize annex
    git annex init
    
    # Configure for large files
    git config annex.largefiles "largerthan=100mb"
    
    # Add files
    git annex add .
    git commit -m "Add files to annex"
    
    # Sync with remotes
    git annex sync
}

# Lazy loading of large files with Git LFS
setup_lazy_loading() {
    # Install the LFS hooks, but keep the smudge filter from downloading
    # content at checkout time
    git lfs install --skip-smudge
    
    # Track large binaries with LFS
    git lfs track "*.bin"
    git add .gitattributes
    
    # Later, pull only the files you actually need:
    # git lfs pull --include="path/to/needed/file.bin"
}

Network Optimization

Network Performance Tuning

# network_optimization.py
import subprocess
from pathlib import Path

class NetworkOptimizer:
    def __init__(self):
        self.configs = {}
        
    def optimize_fetch_performance(self):
        """Optimize fetch operations"""
        
        configs = {
            # Use protocol v2 for better performance
            'protocol.version': '2',
            
            # Parallel fetching
            'fetch.parallel': '0',  # Auto-detect
            
            # Negotiation optimization
            'fetch.negotiationAlgorithm': 'skipping',
            
            # Output optimization
            'fetch.output': 'compact',
            
            # Prune automatically
            'fetch.prune': 'true',
            'fetch.pruneTags': 'true',
            
            # Write commit graph
            'fetch.writeCommitGraph': 'true'
        }
        
        for key, value in configs.items():
            subprocess.run(['git', 'config', '--global', key, value])
    
    def setup_cdn_mirror(self, cdn_url: str):
        """Setup CDN mirror for faster cloning"""
        
        # Configure URL rewriting
        subprocess.run([
            'git', 'config', '--global',
            f'url.{cdn_url}.insteadOf',
            'https://github.com/'
        ])
        
    def configure_ssh_optimization(self):
        """Optimize SSH connections"""
        
        ssh_config = """
Host github.com
    ControlMaster auto
    ControlPath ~/.ssh/control-%C
    ControlPersist 600
    Compression yes
    TCPKeepAlive yes
    ServerAliveInterval 60
"""
        
        ssh_config_path = Path.home() / '.ssh' / 'config'
        
        # Append only if a multiplexed github.com block isn't already present
        existing = ssh_config_path.read_text() if ssh_config_path.exists() else ''
        if 'ControlMaster auto' not in existing:
            with open(ssh_config_path, 'a') as f:
                f.write(ssh_config)
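
Applying the network settings is again a couple of calls. A small sketch; the mirror URL is a placeholder:

# network_usage.py: usage sketch for NetworkOptimizer
from network_optimization import NetworkOptimizer

optimizer = NetworkOptimizer()
optimizer.optimize_fetch_performance()    # protocol v2, parallel fetch, pruning
optimizer.configure_ssh_optimization()    # persistent, compressed SSH connections

# Optional: route GitHub clones through an internal mirror (placeholder URL)
# optimizer.setup_cdn_mirror("https://git-mirror.internal.example.com/")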

Monitoring and Metrics

Performance Monitoring

# performance_monitor.py
import time
import psutil
from contextlib import contextmanager

class GitPerformanceMonitor:
    def __init__(self):
        self.metrics = []
        
    @contextmanager
    def measure_operation(self, operation_name: str):
        """Context manager to measure operation performance"""
        
        start_time = time.time()
        start_memory = psutil.Process().memory_info().rss
        psutil.cpu_percent()  # reset the counter so the next call covers this operation
        
        yield
        
        elapsed = time.time() - start_time
        memory_used = psutil.Process().memory_info().rss - start_memory
        # cpu_percent() with no interval reports average system CPU since the last call
        cpu_used = psutil.cpu_percent()
        
        metric = {
            'operation': operation_name,
            'time_seconds': elapsed,
            'memory_mb': memory_used / (1024 * 1024),
            'cpu_percent': cpu_used,
            'timestamp': time.time()
        }
        
        self.metrics.append(metric)
        
        # Alert if slow
        if elapsed > 5:
            print(f"⚠️  Slow operation: {operation_name} took {elapsed:.2f}s")
    
    def generate_report(self) -> str:
        """Generate performance report"""
        
        if not self.metrics:
            return "No metrics collected"
        
        report = ["# Git Performance Report\n"]
        
        # Find slowest operations
        slowest = sorted(self.metrics, key=lambda x: x['time_seconds'], reverse=True)[:5]
        
        report.append("## Slowest Operations")
        for op in slowest:
            report.append(f"- {op['operation']}: {op['time_seconds']:.2f}s")
        
        # Memory usage
        high_memory = sorted(self.metrics, key=lambda x: x['memory_mb'], reverse=True)[:5]
        
        report.append("\n## Highest Memory Usage")
        for op in high_memory:
            report.append(f"- {op['operation']}: {op['memory_mb']:.2f}MB")
        
        return '\n'.join(report)
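
Wrapping real Git invocations in measure_operation builds up a running record of where the time goes. A minimal usage sketch:

# monitor_usage.py: usage sketch for GitPerformanceMonitor
import subprocess
from performance_monitor import GitPerformanceMonitor

monitor = GitPerformanceMonitor()

for name, cmd in {
    'status': ['git', 'status'],
    'fetch': ['git', 'fetch', '--dry-run'],
    'log': ['git', 'log', '--oneline', '-200'],
}.items():
    with monitor.measure_operation(name):
        subprocess.run(cmd, capture_output=True)

print(monitor.generate_report())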

Best Practices Checklist

  • Use shallow clones when full history isn't needed
  • Enable partial clone for large repositories
  • Configure sparse checkout for monorepos
  • Run regular garbage collection
  • Use commit graphs for faster operations
  • Enable filesystem monitor
  • Configure appropriate pack settings
  • Use protocol version 2
  • Implement Git LFS for binary files
  • Monitor repository growth
  • Optimize network settings
  • Use reference repositories for CI/CD
  • Clean up unnecessary branches
  • Configure platform-specific optimizations
  • Benchmark performance regularly (a quick-start sketch follows this list)
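
To turn the checklist into something runnable, here is a hedged quick-start sketch for a fresh clone; the repository URL, branch, and paths are placeholders, and the settings mirror the sections above:

# quick_start.py: hedged sketch tying the checklist together for a new clone
import subprocess

REPO_URL = "https://github.com/example/huge-repo.git"   # placeholder
TARGET = "huge-repo"

# Blobless, single-branch clone: full commit history, blobs fetched on demand
subprocess.run([
    'git', 'clone', '--filter=blob:none', '--single-branch',
    '--branch', 'main', REPO_URL, TARGET
])

# Performance configuration from the sections above
for key, value in {
    'core.commitGraph': 'true',
    'core.untrackedCache': 'true',
    'core.fsmonitor': 'true',
    'fetch.prune': 'true',
    'protocol.version': '2',
    'feature.manyFiles': 'true',
}.items():
    subprocess.run(['git', 'config', key, value], cwd=TARGET)

# Narrow the working tree to the paths a team needs (placeholders)
subprocess.run(['git', 'sparse-checkout', 'init', '--cone'], cwd=TARGET)
subprocess.run(['git', 'sparse-checkout', 'set', 'services/payments', 'docs'], cwd=TARGET)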

Conclusion

Git performance optimization is crucial for maintaining developer productivity as repositories grow. By implementing these techniques—from smart cloning strategies to advanced configuration tuning—you can keep Git responsive even with massive repositories. The key is understanding your specific bottlenecks and applying the right optimizations. Remember, a fast repository is a productive repository.


David Childs

Consulting Systems Engineer with over 10 years of experience building scalable infrastructure and helping organizations optimize their technology stack.
