Manage Large Files Efficiently with Git LFS

David Childs

Master Git LFS (Large File Storage) for handling binary assets, media files, and large datasets while keeping your repository fast and efficient.

Git wasn't designed for large files, but modern projects need them. After managing repositories with gigabytes of assets, from game development to machine learning datasets, I've learned that Git LFS is essential for keeping repositories manageable. Here's your complete guide to mastering large file management in Git.

Understanding Git LFS
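
Git LFS keeps your repository small by replacing each large file with a tiny text pointer; the actual content is uploaded to a separate LFS store and downloaded on demand. A pointer committed in place of a binary asset looks like this (the values follow the LFS pointer spec and are illustrative):

version https://git-lfs.github.com/spec/v1
oid sha256:4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17b2b4f
size 12345

Because Git itself stores only these few bytes per file, clones and history operations stay fast no matter how large the tracked assets grow.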

LFS Architecture and Setup

# lfs_manager.py
import subprocess
import os
from pathlib import Path
from typing import List, Dict

class GitLFSManager:
    def __init__(self, repo_path: str = "."):
        self.repo_path = Path(repo_path)
        self.lfs_track_file = self.repo_path / ".gitattributes"
        
    def install_lfs(self):
        """Install and initialize Git LFS"""
        
        # Install the LFS filters and hooks for this repository;
        # --local keeps the settings out of the user's global config
        subprocess.run(["git", "lfs", "install", "--local"], cwd=self.repo_path)
        
        print("Git LFS installed and initialized")
        
    def track_patterns(self, patterns: List[str]):
        """Track file patterns with LFS"""
        
        for pattern in patterns:
            subprocess.run(
                ["git", "lfs", "track", pattern],
                cwd=self.repo_path
            )
            print(f"Tracking pattern: {pattern}")
        
        # Add .gitattributes to git
        subprocess.run(["git", "add", ".gitattributes"], cwd=self.repo_path)
        
    def analyze_repository(self) -> Dict:
        """Analyze repository for large file candidates"""
        
        large_files = []
        threshold_mb = 10  # Files larger than 10MB
        
        # Walk the working tree, pruning the .git directory in place
        for root, dirs, files in os.walk(self.repo_path):
            dirs[:] = [d for d in dirs if d != '.git']
            
            for file in files:
                filepath = Path(root) / file
                
                try:
                    size_mb = filepath.stat().st_size / (1024 * 1024)
                    
                    if size_mb > threshold_mb:
                        large_files.append({
                            'path': str(filepath.relative_to(self.repo_path)),
                            'size_mb': round(size_mb, 2),
                            'extension': filepath.suffix
                        })
                except OSError:
                    continue
        
        # Group by extension
        extensions = {}
        for file in large_files:
            ext = file['extension']
            if ext not in extensions:
                extensions[ext] = {'count': 0, 'total_size_mb': 0}
            extensions[ext]['count'] += 1
            extensions[ext]['total_size_mb'] += file['size_mb']
        
        return {
            'large_files': sorted(large_files, key=lambda x: x['size_mb'], reverse=True),
            'by_extension': extensions,
            'total_large_files': len(large_files),
            'total_size_mb': sum(f['size_mb'] for f in large_files)
        }
    
    def migrate_to_lfs(self, file_pattern: str):
        """Migrate existing files to LFS in a new commit (history is
        not rewritten; use `git lfs migrate import` for that)"""
        
        print(f"Migrating {file_pattern} to LFS...")
        
        # Track pattern
        subprocess.run(["git", "lfs", "track", file_pattern], cwd=self.repo_path)
        
        # Find and add matching files
        result = subprocess.run(
            ["git", "ls-files", file_pattern],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        
        files = result.stdout.strip().split('\n')
        
        if files and files[0]:
            # Remove from Git cache
            subprocess.run(["git", "rm", "--cached"] + files, cwd=self.repo_path)
            
            # Re-add (will use LFS now)
            subprocess.run(["git", "add"] + files, cwd=self.repo_path)
            
            print(f"Migrated {len(files)} files to LFS")
        
        # Commit changes
        subprocess.run(
            ["git", "commit", "-m", f"Migrate {file_pattern} to Git LFS"],
            cwd=self.repo_path
        )
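
Here's a minimal sketch of how these pieces fit together; the repository path and patterns are illustrative, and in practice you'd review the analysis report before deciding what to track:

# Example usage of GitLFSManager (path and patterns are illustrative)
manager = GitLFSManager("/path/to/repo")
manager.install_lfs()

# Survey the repository before deciding what to track
report = manager.analyze_repository()
for entry in report['large_files'][:10]:
    print(f"{entry['size_mb']:>8.2f} MB  {entry['path']}")

# Track the heaviest patterns, then migrate files already in Git
manager.track_patterns(["*.psd", "*.mp4"])
manager.migrate_to_lfs("*.psd")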

Advanced LFS Configuration

#!/bin/bash
# lfs_configuration.sh

# Configure LFS for optimal performance
configure_lfs_performance() {
    # Set concurrent transfers
    git config lfs.concurrenttransfers 8
    
    # Set transfer batch size
    git config lfs.batch true
    git config lfs.transfer.maxretries 10
    
    # Relocate LFS storage (can be shared across local clones)
    git config lfs.storage "$HOME/.git-lfs-storage"
    
    # Set custom LFS server (if using)
    # git config lfs.url "https://lfs.company.com"
    
    # Configure authentication
    git config lfs.https://github.com/.access "basic"
    
    echo "LFS performance configuration complete"
}

# Setup comprehensive LFS tracking
setup_lfs_tracking() {
    echo "Setting up comprehensive LFS tracking..."
    
    # Images
    git lfs track "*.jpg"
    git lfs track "*.jpeg"
    git lfs track "*.png"
    git lfs track "*.gif"
    git lfs track "*.bmp"
    git lfs track "*.tiff"
    git lfs track "*.svg"
    git lfs track "*.webp"
    
    # Videos
    git lfs track "*.mp4"
    git lfs track "*.avi"
    git lfs track "*.mov"
    git lfs track "*.wmv"
    git lfs track "*.flv"
    git lfs track "*.webm"
    git lfs track "*.mkv"
    
    # Audio
    git lfs track "*.mp3"
    git lfs track "*.wav"
    git lfs track "*.flac"
    git lfs track "*.aac"
    git lfs track "*.ogg"
    
    # Archives
    git lfs track "*.zip"
    git lfs track "*.tar"
    git lfs track "*.tar.gz"
    git lfs track "*.rar"
    git lfs track "*.7z"
    
    # Documents
    git lfs track "*.pdf"
    git lfs track "*.doc"
    git lfs track "*.docx"
    git lfs track "*.ppt"
    git lfs track "*.pptx"
    git lfs track "*.xls"
    git lfs track "*.xlsx"
    
    # Design files
    git lfs track "*.psd"
    git lfs track "*.ai"
    git lfs track "*.sketch"
    git lfs track "*.fig"
    git lfs track "*.xd"
    
    # 3D models
    git lfs track "*.obj"
    git lfs track "*.fbx"
    git lfs track "*.dae"
    git lfs track "*.3ds"
    git lfs track "*.blend"
    
    # Data files
    git lfs track "*.csv"
    git lfs track "*.json" --filename="*.large.json"
    git lfs track "*.sql"
    git lfs track "*.db"
    git lfs track "*.sqlite"
    
    # Machine Learning
    git lfs track "*.h5"
    git lfs track "*.pkl"
    git lfs track "*.model"
    git lfs track "*.weights"
    
    git add .gitattributes
    git commit -m "Configure Git LFS tracking"
}
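
# Tip: running `git lfs track` with no arguments prints the patterns
# currently tracked in .gitattributes, which makes it easy to audit
# the setup above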

# LFS migration helper
migrate_repo_to_lfs() {
    REPO_URL=$1
    
    echo "Migrating repository to use LFS..."
    
    # Clone a bare mirror of the repository; note there is no working
    # tree, so working-tree commands like `git lfs track` won't work here
    git clone --mirror "$REPO_URL" repo-migration
    cd repo-migration
    
    # Install LFS
    git lfs install
    
    # Migrate history; `migrate import` writes the .gitattributes
    # entries into each rewritten commit, so no tracking setup is needed
    git lfs migrate import --include="*.jpg,*.png,*.pdf" --everything
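    # NOTE: --everything rewrites all branches and tags, so commit IDs
    # change; collaborators must re-clone after the final mirror push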
    
    # Clean up
    git reflog expire --expire=now --all
    git gc --prune=now --aggressive
    
    # Push to new repository
    echo "Push to new repository with: git push --mirror NEW_REPO_URL"
}

LFS Workflow Management

Smart LFS Operations

# lfs_workflow.py
import subprocess
import json
from typing import List, Dict, Optional

class LFSWorkflow:
    def __init__(self, repo_path: str = "."):
        self.repo_path = repo_path
        
    def selective_fetch(self, patterns: Optional[List[str]] = None,
                        recent_only: bool = False):
        """Selectively fetch LFS files"""
        
        if patterns:
            # Fetch only specific patterns
            for pattern in patterns:
                subprocess.run(
                    ["git", "lfs", "fetch", "--include", pattern],
                    cwd=self.repo_path
                )
        elif recent_only:
            # Fetch only recent files
            subprocess.run(
                ["git", "lfs", "fetch", "--recent"],
                cwd=self.repo_path
            )
        else:
            # Fetch all
            subprocess.run(
                ["git", "lfs", "fetch"],
                cwd=self.repo_path
            )
        
        # Checkout to update working directory
        subprocess.run(["git", "lfs", "checkout"], cwd=self.repo_path)
    
    def prune_lfs_cache(self, days_old: int = 7):
        """Prune old LFS cache files"""
        
        # `git lfs prune` has no --days flag; the retention window is
        # controlled by the lfs.pruneoffsetdays config value
        subprocess.run(
            ["git", "config", "lfs.pruneoffsetdays", str(days_old)],
            cwd=self.repo_path
        )
        
        # Prune, verifying objects still exist on the remote first
        subprocess.run(
            ["git", "lfs", "prune", "--verify-remote"],
            cwd=self.repo_path
        )
    
    def create_lfs_lock(self, filepath: str):
        """Lock LFS file for exclusive editing"""
        
        result = subprocess.run(
            ["git", "lfs", "lock", filepath],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        
        if result.returncode == 0:
            print(f"Locked: {filepath}")
            return True
        else:
            print(f"Failed to lock: {result.stderr}")
            return False
    
    def list_locks(self) -> List[Dict]:
        """List all LFS locks"""
        
        result = subprocess.run(
            ["git", "lfs", "locks", "--json"],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        
        if result.returncode == 0:
            locks = json.loads(result.stdout)
            return locks
        return []
    
    def unlock_file(self, filepath: str, force: bool = False):
        """Unlock LFS file"""
        
        cmd = ["git", "lfs", "unlock", filepath]
        if force:
            cmd.append("--force")
        
        subprocess.run(cmd, cwd=self.repo_path)
    
    def bandwidth_optimization(self):
        """Optimize LFS bandwidth usage"""
        
        # Use batch API
        subprocess.run(
            ["git", "config", "lfs.batch", "true"],
            cwd=self.repo_path
        )
        
        # Limit concurrent transfers
        subprocess.run(
            ["git", "config", "lfs.concurrenttransfers", "3"],
            cwd=self.repo_path
        )
        
        # Enable compression
        subprocess.run(
            ["git", "config", "lfs.transfer.enablehttpcompression", "true"],
            cwd=self.repo_path
        )
        
        print("Bandwidth optimization configured")
    
    def verify_lfs_files(self) -> Dict:
        """Verify integrity of LFS files"""
        
        result = subprocess.run(
            ["git", "lfs", "fsck"],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        
        issues = []
        if "missing" in result.stdout.lower():
            issues.append("Missing LFS objects detected")
        if "corrupt" in result.stdout.lower():
            issues.append("Corrupt LFS objects detected")
        
        return {
            'status': 'ok' if not issues else 'issues',
            'issues': issues,
            'output': result.stdout
        }
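
A typical day-to-day flow with this class might look like the sketch below; the patterns and retention window are illustrative:

# Example: fetch only image assets, tune transfers, then clean up
workflow = LFSWorkflow("/path/to/repo")
workflow.bandwidth_optimization()
workflow.selective_fetch(patterns=["*.png", "*.jpg"])
workflow.prune_lfs_cache(days_old=14)

# Verify integrity after pruning
report = workflow.verify_lfs_files()
print(report['status'], report['issues'])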

LFS Storage Optimization

# lfs_storage.py
import os
import hashlib
from pathlib import Path
from typing import Dict

class LFSStorageOptimizer:
    def __init__(self):
        self.lfs_objects_dir = Path(".git/lfs/objects")
        
    def analyze_storage(self) -> Dict:
        """Analyze LFS storage usage"""
        
        if not self.lfs_objects_dir.exists():
            return {'error': 'No LFS objects directory found'}
        
        total_size = 0
        file_count = 0
        size_distribution = {
            'small': 0,    # < 1MB
            'medium': 0,   # 1-10MB
            'large': 0,    # 10-100MB
            'huge': 0      # > 100MB
        }
        
        for obj_file in self.lfs_objects_dir.rglob('*'):
            if obj_file.is_file():
                size = obj_file.stat().st_size
                total_size += size
                file_count += 1
                
                size_mb = size / (1024 * 1024)
                if size_mb < 1:
                    size_distribution['small'] += 1
                elif size_mb < 10:
                    size_distribution['medium'] += 1
                elif size_mb < 100:
                    size_distribution['large'] += 1
                else:
                    size_distribution['huge'] += 1
        
        return {
            'total_size_gb': round(total_size / (1024**3), 2),
            'file_count': file_count,
            'average_size_mb': round((total_size / file_count) / (1024**2), 2) if file_count > 0 else 0,
            'distribution': size_distribution
        }
    
    def deduplicate_storage(self):
        """Deduplicate LFS storage using hardlinks"""
        
        hash_map = {}
        space_saved = 0
        
        for obj_file in self.lfs_objects_dir.rglob('*'):
            if obj_file.is_file():
                # Calculate file hash
                file_hash = self.calculate_file_hash(obj_file)
                
                if file_hash in hash_map:
                    # Duplicate found
                    original = hash_map[file_hash]
                    
                    # Replace with hardlink
                    size = obj_file.stat().st_size
                    obj_file.unlink()
                    os.link(original, obj_file)
                    
                    space_saved += size
                else:
                    hash_map[file_hash] = obj_file
        
        print(f"Space saved through deduplication: {space_saved / (1024**2):.2f} MB")
        return space_saved
    
    def calculate_file_hash(self, filepath: Path) -> str:
        """Calculate SHA256 hash of file"""
        
        sha256_hash = hashlib.sha256()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                sha256_hash.update(chunk)
        return sha256_hash.hexdigest()
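
One caveat worth knowing: within a single store, LFS objects are already content-addressed (the filename is the SHA256 OID), so duplicates cannot occur; the hardlink deduplication above only pays off when several local clones point at the same storage directory. A quick report before any cleanup might look like this:

# Example usage: summarize local LFS storage (run from the repo root)
optimizer = LFSStorageOptimizer()
stats = optimizer.analyze_storage()

if 'error' not in stats:
    print(f"{stats['file_count']} objects, {stats['total_size_gb']} GB total")
    print(f"Distribution: {stats['distribution']}")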

Advanced LFS Scenarios

Multi-Repository LFS Sharing

#!/bin/bash
# lfs_sharing.sh

# Setup shared LFS storage
setup_shared_lfs_storage() {
    SHARED_STORAGE="/path/to/shared/lfs/storage"
    
    # Create shared storage directory
    mkdir -p "$SHARED_STORAGE"
    
    # Point this repository at the shared root; LFS keeps its objects
    # under $SHARED_STORAGE/objects, so no symlink is needed
    git config lfs.storage "$SHARED_STORAGE"
    
    # Move any existing objects into the shared location
    if [ -d ".git/lfs/objects" ]; then
        mkdir -p "$SHARED_STORAGE/objects"
        cp -Rn .git/lfs/objects/. "$SHARED_STORAGE/objects/" 2>/dev/null
        rm -rf .git/lfs/objects
    fi
    
    echo "Configured shared LFS storage at: $SHARED_STORAGE"
}

# Clone with minimal LFS data
sparse_lfs_clone() {
    REPO_URL=$1
    
    # Clone without LFS files
    GIT_LFS_SKIP_SMUDGE=1 git clone $REPO_URL
    
    cd $(basename $REPO_URL .git)
    
    # Configure sparse checkout
    git config core.sparseCheckout true
    
    # Define what to include
    cat > .git/info/sparse-checkout << EOF
/*
!*.psd
!*.ai
!*.mov
EOF

    # Apply the sparse-checkout patterns to the working tree
    git read-tree -mu HEAD
    
    # Fetch only needed LFS files
    git lfs fetch --include="*.png,*.jpg"
    git lfs checkout
    
    echo "Sparse LFS clone complete"
}

# Batch process LFS files
batch_process_lfs() {
    PROCESS_COMMAND=$1
    
    # Get all LFS tracked files
    git lfs ls-files | cut -d' ' -f3 | while read file; do
        echo "Processing: $file"
        
        # Pull specific file
        git lfs pull --include="$file"
        
        # Process file
        $PROCESS_COMMAND "$file"
        
        # Optional: drop the local object after processing; LFS stores
        # objects sharded as .git/lfs/objects/<oid:0:2>/<oid:2:2>/<oid>
        oid=$(git lfs ls-files --long | grep -F "$file" | cut -d' ' -f1)
        rm -f ".git/lfs/objects/${oid:0:2}/${oid:2:2}/${oid}"
    done
}

LFS Server Management

# lfs_server.py
import subprocess
import requests
from typing import Dict, List

class LFSServerManager:
    def __init__(self, server_url: str, auth_token: str):
        # server_url should be the repository's LFS endpoint, which by
        # convention ends in /info/lfs
        self.server_url = server_url
        self.headers = {
            'Authorization': f'Bearer {auth_token}',
            'Accept': 'application/vnd.git-lfs+json',
            'Content-Type': 'application/vnd.git-lfs+json'
        }
    
    def get_server_info(self) -> Dict:
        """Probe the server's locks endpoint as a connectivity and
        authentication check (the LFS API has no dedicated info endpoint)"""
        
        response = requests.get(
            f"{self.server_url}/locks",
            headers=self.headers
        )
        
        if response.status_code == 200:
            return response.json()
        return {'error': f'Server returned {response.status_code}'}
    
    def query_objects(self, objects: List[Dict]) -> List[Dict]:
        """Query the batch API for download actions on the given objects.
        Each entry needs an 'oid' and 'size'; the LFS API has no listing
        endpoint, so callers must already know which OIDs to ask about."""
        
        response = requests.post(
            f"{self.server_url}/objects/batch",
            headers=self.headers,
            json={
                'operation': 'download',
                'transfers': ['basic'],
                'objects': objects
            }
        )
        
        if response.status_code == 200:
            return response.json().get('objects', [])
        return []
    
    def upload_object(self, filepath: str, oid: str, size: int):
        """Upload object to LFS server"""
        
        # Request upload
        response = requests.post(
            f"{self.server_url}/objects/batch",
            headers=self.headers,
            json={
                'operation': 'upload',
                'transfers': ['basic'],
                'objects': [{
                    'oid': oid,
                    'size': size
                }]
            }
        )
        
        if response.status_code == 200:
            upload_info = response.json()['objects'][0]
            
            # No 'actions' entry means the server already has the object
            if 'actions' not in upload_info:
                return True
            
            # Perform upload
            with open(filepath, 'rb') as f:
                upload_response = requests.put(
                    upload_info['actions']['upload']['href'],
                    headers=upload_info['actions']['upload'].get('header', {}),
                    data=f
                )
                
                return upload_response.status_code in (200, 201)
        
        return False
    
    def configure_custom_server(self, repo_path: str):
        """Configure repository to use custom LFS server"""
        
        subprocess.run(
            ["git", "config", "lfs.url", self.server_url],
            cwd=repo_path
        )
        
        # Configure authentication
        subprocess.run(
            ["git", "config", "lfs.cachecredentials", "true"],
            cwd=repo_path
        )
        
        print(f"Configured LFS server: {self.server_url}")

Troubleshooting and Recovery

LFS Troubleshooting Tools

# lfs_troubleshoot.py
import subprocess
from pathlib import Path
from typing import Dict, List

class LFSTroubleshooter:
    def __init__(self, repo_path: str = "."):
        self.repo_path = repo_path
    
    def diagnose_issues(self) -> Dict:
        """Comprehensive LFS diagnosis"""
        
        issues = []
        warnings = []
        
        # Check LFS installation
        result = subprocess.run(
            ["git", "lfs", "version"],
            capture_output=True,
            text=True
        )
        
        if result.returncode != 0:
            issues.append("Git LFS not installed")
            return {'issues': issues}
        
        # Check hooks
        hooks_path = Path(self.repo_path) / ".git" / "hooks"
        if not (hooks_path / "pre-push").exists():
            warnings.append("LFS hooks not installed")
        
        # Check tracking
        gitattributes = Path(self.repo_path) / ".gitattributes"
        if not gitattributes.exists():
            warnings.append("No .gitattributes file")
        
        # Check for missing objects
        result = subprocess.run(
            ["git", "lfs", "fsck"],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        
        if "missing" in result.stdout.lower():
            issues.append("Missing LFS objects")
        
        # Check for pointers whose content has not been downloaded:
        # in `git lfs ls-files` output, '*' marks objects present
        # locally while '-' marks pointers whose content is missing
        result = subprocess.run(
            ["git", "lfs", "ls-files"],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        
        for line in result.stdout.split('\n'):
            if line and ' - ' in line:
                warnings.append(f"Content not downloaded: {line}")
        
        # Check storage
        storage_info = self.check_storage_health()
        
        return {
            'issues': issues,
            'warnings': warnings,
            'storage': storage_info,
            'recommendations': self.get_recommendations(issues, warnings)
        }
    
    def get_recommendations(self, issues: List[str], warnings: List[str]) -> List[str]:
        """Map detected problems to suggested next steps"""
        
        recommendations = []
        if "Missing LFS objects" in issues:
            recommendations.append("Run recover_missing_objects() or 'git lfs fetch --all'")
        if "LFS hooks not installed" in warnings:
            recommendations.append("Run 'git lfs install' inside the repository")
        if "No .gitattributes file" in warnings:
            recommendations.append("Define tracking patterns with 'git lfs track'")
        return recommendations
    
    def check_storage_health(self) -> Dict:
        """Check LFS storage health"""
        
        lfs_dir = Path(self.repo_path) / ".git" / "lfs"
        
        if not lfs_dir.exists():
            return {'status': 'not initialized'}
        
        # Check disk space
        import shutil
        total, used, free = shutil.disk_usage(lfs_dir)
        
        return {
            'total_gb': total // (2**30),
            'used_gb': used // (2**30),
            'free_gb': free // (2**30),
            'usage_percent': (used / total) * 100
        }
    
    def fix_pointer_files(self):
        """Re-add files that should be LFS pointers but were committed
        as regular blobs"""
        
        # Find all LFS-tracked files by name
        result = subprocess.run(
            ["git", "lfs", "ls-files", "-n"],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        
        for filepath in result.stdout.strip().split('\n'):
            if filepath:
                # Inspect the blob as stored in Git; the working-tree
                # copy is smudged to real content and would always fail
                # a pointer check
                blob = subprocess.run(
                    ["git", "cat-file", "blob", f"HEAD:{filepath}"],
                    cwd=self.repo_path,
                    capture_output=True
                )
                
                check_result = subprocess.run(
                    ["git", "lfs", "pointer", "--check", "--stdin"],
                    cwd=self.repo_path,
                    input=blob.stdout,
                    capture_output=True
                )
                
                if check_result.returncode != 0:
                    print(f"Fixing pointer: {filepath}")
                    
                    # Re-add so the clean filter converts it to a pointer
                    subprocess.run(
                        ["git", "rm", "--cached", filepath],
                        cwd=self.repo_path
                    )
                    subprocess.run(
                        ["git", "add", filepath],
                        cwd=self.repo_path
                    )
    
    def recover_missing_objects(self):
        """Attempt to recover missing LFS objects"""
        
        # Get list of missing objects
        result = subprocess.run(
            ["git", "lfs", "fsck"],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        
        missing_oids = []
        for line in result.stdout.split('\n'):
            if "missing" in line.lower():
                # Extract OID
                parts = line.split()
                for part in parts:
                    if len(part) == 64:  # SHA256 length
                        missing_oids.append(part)
        
        if missing_oids:
            print(f"Attempting to recover {len(missing_oids)} objects")
            
            # `--include` filters by path, not OID, so fetch every object
            # reachable from all refs and let LFS fill in the gaps
            subprocess.run(
                ["git", "lfs", "fetch", "--all"],
                cwd=self.repo_path
            )
            
            # Try every configured remote as a fallback
            remotes = subprocess.run(
                ["git", "remote"],
                cwd=self.repo_path,
                capture_output=True,
                text=True
            ).stdout.strip().split('\n')
            
            for remote in remotes:
                if remote:
                    subprocess.run(
                        ["git", "lfs", "fetch", remote, "--all"],
                        cwd=self.repo_path
                    )
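
A diagnostic session with this class might look like the following sketch; the repository path is illustrative:

# Example: report problems before attempting repairs
shooter = LFSTroubleshooter("/path/to/repo")
report = shooter.diagnose_issues()

for issue in report.get('issues', []):
    print(f"ISSUE: {issue}")
for warning in report.get('warnings', []):
    print(f"WARNING: {warning}")

if "Missing LFS objects" in report.get('issues', []):
    shooter.recover_missing_objects()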

Performance Optimization

LFS Performance Tuning

#!/bin/bash
# lfs_performance.sh

# Optimize LFS for large-scale operations
optimize_lfs_performance() {
    echo "Optimizing LFS performance..."
    
    # Increase transfer limits
    git config lfs.transfer.maxretries 10
    git config lfs.transfer.maxverifies 10
    
    # Configure connection pooling
    git config http.maxRequests 100
    git config http.minSessions 10
    
    # Enable keepalive
    git config http.keepAlive true
    
    # Set timeout values
    git config lfs.dialtimeout 30
    git config lfs.keepalive 60
    git config lfs.activitytimeout 120
    
    # Configure chunk size for uploads
    git config lfs.transfer.maxchunksize 104857600  # 100MB
    
    # Enable progress meter
    git config lfs.progress true
    
    echo "Performance optimization complete"
}

# Parallel LFS operations
parallel_lfs_fetch() {
    # Get list of LFS files
    git lfs ls-files -n > lfs_files.txt
    
    # Split into chunks for parallel processing
    split -l 100 lfs_files.txt lfs_chunk_
    
    # Process chunks in parallel, one batched fetch per chunk
    for chunk in lfs_chunk_*; do
        (
            # Join the chunk into a single comma-separated include list
            # so each job issues one batched fetch instead of one per file
            includes=$(paste -sd, "$chunk")
            git lfs fetch --include="$includes"
        ) &
    done
    
    # Wait for all background jobs
    wait
    
    # Clean up
    rm lfs_files.txt lfs_chunk_*
    
    echo "Parallel fetch complete"
}

Best Practices Checklist

  • Track binary files with LFS from the start of the project
  • Set appropriate file size thresholds
  • Configure .gitattributes properly and commit it
  • Use selective fetch on large projects
  • Lock binary assets that cannot be merged
  • Prune the local cache regularly
  • Monitor storage usage
  • Configure bandwidth optimization
  • Set up CI/CD with LFS support
  • Document LFS workflows
  • Train the team on LFS usage
  • Back up LFS objects separately
  • Use shallow clones where possible
  • Verify file integrity regularly
  • Plan storage capacity ahead of growth

Conclusion

Git LFS extends Git beyond source code, making it practical to version binary assets and large datasets alongside your code. Implemented properly, it keeps clones and fetches fast no matter how heavy those assets become. The key is understanding when and how to use LFS, optimizing it for your specific workflow, and maintaining good storage hygiene. Master these techniques, and large files will never slow your repository down again.
