Master Git LFS (Large File Storage) for handling binary assets, media files, and large datasets while keeping your repository fast and efficient.
Git wasn't designed for large files, but modern projects need them. After managing repositories with gigabytes of assets, from game development to machine learning datasets, I've learned that Git LFS is essential for keeping repositories manageable. Here's your complete guide to mastering large file management in Git.
Understanding Git LFS
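Git LFS works by keeping large content out of your Git history entirely. Instead of committing the binary itself, Git commits a small text pointer, and the real content lives in a separate LFS object store that is uploaded and downloaded on demand through Git's clean/smudge filters. As a rough illustration (the oid and size below are placeholder values), the committed content of an LFS-tracked file looks like this:

version https://git-lfs.github.com/spec/v1
oid sha256:4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393
size 12345

Because only these few lines live in history, clones and fetches stay fast no matter how large the tracked assets grow.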
LFS Architecture and Setup
# lfs_manager.py
import subprocess
import os
from pathlib import Path
from typing import List, Dict, Optional
import hashlib
import json
class GitLFSManager:
    def __init__(self, repo_path: str = "."):
        self.repo_path = Path(repo_path)
        self.lfs_track_file = self.repo_path / ".gitattributes"

    def install_lfs(self):
        """Install and initialize Git LFS"""
        # Install LFS hooks globally, then initialize for this repository
        subprocess.run(["git", "lfs", "install"], cwd=self.repo_path)
        subprocess.run(["git", "lfs", "install", "--local"], cwd=self.repo_path)
        print("Git LFS installed and initialized")

    def track_patterns(self, patterns: List[str]):
        """Track file patterns with LFS"""
        for pattern in patterns:
            subprocess.run(
                ["git", "lfs", "track", pattern],
                cwd=self.repo_path
            )
            print(f"Tracking pattern: {pattern}")

        # Stage .gitattributes so the tracking rules are committed
        subprocess.run(["git", "add", ".gitattributes"], cwd=self.repo_path)

    def analyze_repository(self) -> Dict:
        """Analyze repository for large file candidates"""
        large_files = []
        threshold_mb = 10  # Files larger than 10MB

        # Find all files in repository
        for root, dirs, files in os.walk(self.repo_path):
            # Skip the .git directory
            if '.git' in Path(root).parts:
                continue
            for file in files:
                filepath = Path(root) / file
                try:
                    size_mb = filepath.stat().st_size / (1024 * 1024)
                except OSError:
                    continue
                if size_mb > threshold_mb:
                    large_files.append({
                        'path': str(filepath.relative_to(self.repo_path)),
                        'size_mb': round(size_mb, 2),
                        'extension': filepath.suffix
                    })

        # Group by extension
        extensions = {}
        for file in large_files:
            ext = file['extension']
            if ext not in extensions:
                extensions[ext] = {'count': 0, 'total_size_mb': 0}
            extensions[ext]['count'] += 1
            extensions[ext]['total_size_mb'] += file['size_mb']

        return {
            'large_files': sorted(large_files, key=lambda x: x['size_mb'], reverse=True),
            'by_extension': extensions,
            'total_large_files': len(large_files),
            'total_size_mb': sum(f['size_mb'] for f in large_files)
        }

    def migrate_to_lfs(self, file_pattern: str):
        """Migrate existing files to LFS"""
        print(f"Migrating {file_pattern} to LFS...")

        # Track pattern
        subprocess.run(["git", "lfs", "track", file_pattern], cwd=self.repo_path)

        # Find matching files already under version control
        result = subprocess.run(
            ["git", "ls-files", file_pattern],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        files = result.stdout.strip().split('\n')

        if files and files[0]:
            # Remove from the index, then re-add so the LFS clean filter takes over
            subprocess.run(["git", "rm", "--cached"] + files, cwd=self.repo_path)
            subprocess.run(["git", "add", ".gitattributes"] + files, cwd=self.repo_path)
            print(f"Migrated {len(files)} files to LFS")

            # Commit changes
            subprocess.run(
                ["git", "commit", "-m", f"Migrate {file_pattern} to Git LFS"],
                cwd=self.repo_path
            )
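A minimal usage sketch of the manager above, assuming it is saved as lfs_manager.py; the patterns and repository path are illustrative:

from lfs_manager import GitLFSManager

manager = GitLFSManager(".")
manager.install_lfs()

# See what is actually bloating the repository before tracking anything
report = manager.analyze_repository()
for entry in report['large_files'][:5]:
    print(f"{entry['size_mb']:>8} MB  {entry['path']}")

# Track the offending patterns, then migrate files that are already committed
manager.track_patterns(["*.psd", "*.mp4"])
manager.migrate_to_lfs("*.psd")

Running the analysis first keeps you from blindly tracking extensions that are not actually causing bloat.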
Advanced LFS Configuration
#!/bin/bash
# lfs_configuration.sh
# Configure LFS for optimal performance
configure_lfs_performance() {
    # Set concurrent transfers
    git config lfs.concurrenttransfers 8

    # Use the batch API and retry failed transfers
    git config lfs.batch true
    git config lfs.transfer.maxretries 10

    # Configure cache location
    git config lfs.storage "$HOME/.git-lfs-storage"

    # Set custom LFS server (if using)
    # git config lfs.url "https://lfs.company.com"

    # Configure authentication
    git config lfs.https://github.com/.access "basic"

    echo "LFS performance configuration complete"
}

# Setup comprehensive LFS tracking
setup_lfs_tracking() {
    echo "Setting up comprehensive LFS tracking..."

    # Images
    git lfs track "*.jpg"
    git lfs track "*.jpeg"
    git lfs track "*.png"
    git lfs track "*.gif"
    git lfs track "*.bmp"
    git lfs track "*.tiff"
    git lfs track "*.svg"
    git lfs track "*.webp"

    # Videos
    git lfs track "*.mp4"
    git lfs track "*.avi"
    git lfs track "*.mov"
    git lfs track "*.wmv"
    git lfs track "*.flv"
    git lfs track "*.webm"
    git lfs track "*.mkv"

    # Audio
    git lfs track "*.mp3"
    git lfs track "*.wav"
    git lfs track "*.flac"
    git lfs track "*.aac"
    git lfs track "*.ogg"

    # Archives
    git lfs track "*.zip"
    git lfs track "*.tar"
    git lfs track "*.tar.gz"
    git lfs track "*.rar"
    git lfs track "*.7z"

    # Documents
    git lfs track "*.pdf"
    git lfs track "*.doc"
    git lfs track "*.docx"
    git lfs track "*.ppt"
    git lfs track "*.pptx"
    git lfs track "*.xls"
    git lfs track "*.xlsx"

    # Design files
    git lfs track "*.psd"
    git lfs track "*.ai"
    git lfs track "*.sketch"
    git lfs track "*.fig"
    git lfs track "*.xd"

    # 3D models
    git lfs track "*.obj"
    git lfs track "*.fbx"
    git lfs track "*.dae"
    git lfs track "*.3ds"
    git lfs track "*.blend"

    # Data files
    git lfs track "*.csv"
    git lfs track "*.large.json"   # route only designated large JSON exports to LFS
    git lfs track "*.sql"
    git lfs track "*.db"
    git lfs track "*.sqlite"

    # Machine Learning
    git lfs track "*.h5"
    git lfs track "*.pkl"
    git lfs track "*.model"
    git lfs track "*.weights"

    git add .gitattributes
    git commit -m "Configure Git LFS tracking"
}
# LFS migration helper
migrate_repo_to_lfs() {
    REPO_URL=$1
    echo "Migrating repository to use LFS..."

    # Clone repository as a bare mirror, suitable for history rewriting
    git clone --mirror "$REPO_URL" repo-migration
    cd repo-migration

    # Install LFS
    git lfs install

    # Migrate history; git lfs migrate import rewrites the matching
    # .gitattributes entries as part of the rewrite, so no working tree is needed
    git lfs migrate import --include="*.jpg,*.png,*.pdf" --everything

    # Clean up
    git reflog expire --expire=now --all
    git gc --prune=now --aggressive

    # Push to new repository
    echo "Push to new repository with: git push --mirror NEW_REPO_URL"
}
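For reference, each git lfs track call above appends a rule to .gitattributes; the committed entries look like this (shown for two of the patterns above):

*.psd filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text

The filter attributes route the file through LFS on add and checkout, and -text prevents Git from attempting line-ending conversion on binary content.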
LFS Workflow Management
Smart LFS Operations
# lfs_workflow.py
import subprocess
import json
from datetime import datetime
from typing import List, Dict, Optional
class LFSWorkflow:
    def __init__(self, repo_path: str = "."):
        self.repo_path = repo_path

    def selective_fetch(self, patterns: Optional[List[str]] = None,
                        recent_only: bool = False):
        """Selectively fetch LFS files"""
        if patterns:
            # Fetch only specific patterns
            for pattern in patterns:
                subprocess.run(
                    ["git", "lfs", "fetch", "--include", pattern],
                    cwd=self.repo_path
                )
        elif recent_only:
            # Fetch only objects referenced by recent commits and branches
            subprocess.run(
                ["git", "lfs", "fetch", "--recent"],
                cwd=self.repo_path
            )
        else:
            # Fetch all
            subprocess.run(
                ["git", "lfs", "fetch"],
                cwd=self.repo_path
            )

        # Checkout to update working directory
        subprocess.run(["git", "lfs", "checkout"], cwd=self.repo_path)

    def prune_lfs_cache(self, days_old: int = 7):
        """Prune old LFS cache files"""
        # Keep objects referenced within the retention window
        subprocess.run(
            ["git", "config", "lfs.pruneoffsetdays", str(days_old)],
            cwd=self.repo_path
        )
        # Prune, verifying objects exist on the remote before deleting them locally
        subprocess.run(
            ["git", "lfs", "prune", "--verify-remote"],
            cwd=self.repo_path
        )

    def create_lfs_lock(self, filepath: str):
        """Lock LFS file for exclusive editing"""
        result = subprocess.run(
            ["git", "lfs", "lock", filepath],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        if result.returncode == 0:
            print(f"Locked: {filepath}")
            return True
        else:
            print(f"Failed to lock: {result.stderr}")
            return False

    def list_locks(self) -> List[Dict]:
        """List all LFS locks"""
        result = subprocess.run(
            ["git", "lfs", "locks", "--json"],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        if result.returncode == 0:
            return json.loads(result.stdout)
        return []

    def unlock_file(self, filepath: str, force: bool = False):
        """Unlock LFS file"""
        cmd = ["git", "lfs", "unlock", filepath]
        if force:
            cmd.append("--force")
        subprocess.run(cmd, cwd=self.repo_path)

    def bandwidth_optimization(self):
        """Optimize LFS bandwidth usage"""
        # Use batch API
        subprocess.run(
            ["git", "config", "lfs.batch", "true"],
            cwd=self.repo_path
        )
        # Limit concurrent transfers
        subprocess.run(
            ["git", "config", "lfs.concurrenttransfers", "3"],
            cwd=self.repo_path
        )
        # Enable compression
        subprocess.run(
            ["git", "config", "lfs.transfer.enablehttpcompression", "true"],
            cwd=self.repo_path
        )
        print("Bandwidth optimization configured")

    def verify_lfs_files(self) -> Dict:
        """Verify integrity of LFS files"""
        result = subprocess.run(
            ["git", "lfs", "fsck"],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        issues = []
        if "missing" in result.stdout.lower():
            issues.append("Missing LFS objects detected")
        if "corrupt" in result.stdout.lower():
            issues.append("Corrupt LFS objects detected")

        return {
            'status': 'ok' if not issues else 'issues',
            'issues': issues,
            'output': result.stdout
        }
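If you always want partial fetches rather than calling selective_fetch() by hand, git-lfs also honors the persistent lfs.fetchinclude and lfs.fetchexclude settings. A small sketch, where the specific paths and extensions are illustrative for a machine that should never auto-download raw design files:

import subprocess

# Only auto-fetch texture assets; never auto-fetch raw design or scene files
subprocess.run(["git", "config", "lfs.fetchinclude", "assets/textures/*"], check=True)
subprocess.run(["git", "config", "lfs.fetchexclude", "*.psd,*.blend"], check=True)

Once set, every plain git pull or git lfs fetch in that clone respects the include/exclude lists without extra flags.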
LFS Storage Optimization
# lfs_storage.py
import os
import hashlib
from pathlib import Path
from typing import Dict

class LFSStorageOptimizer:
    def __init__(self):
        self.lfs_objects_dir = Path(".git/lfs/objects")

    def analyze_storage(self) -> Dict:
        """Analyze LFS storage usage"""
        if not self.lfs_objects_dir.exists():
            return {'error': 'No LFS objects directory found'}

        total_size = 0
        file_count = 0
        size_distribution = {
            'small': 0,   # < 1MB
            'medium': 0,  # 1-10MB
            'large': 0,   # 10-100MB
            'huge': 0     # > 100MB
        }

        for obj_file in self.lfs_objects_dir.rglob('*'):
            if obj_file.is_file():
                size = obj_file.stat().st_size
                total_size += size
                file_count += 1

                size_mb = size / (1024 * 1024)
                if size_mb < 1:
                    size_distribution['small'] += 1
                elif size_mb < 10:
                    size_distribution['medium'] += 1
                elif size_mb < 100:
                    size_distribution['large'] += 1
                else:
                    size_distribution['huge'] += 1

        return {
            'total_size_gb': round(total_size / (1024**3), 2),
            'file_count': file_count,
            'average_size_mb': round((total_size / file_count) / (1024**2), 2) if file_count > 0 else 0,
            'distribution': size_distribution
        }

    def deduplicate_storage(self):
        """Deduplicate LFS storage using hardlinks"""
        hash_map = {}
        space_saved = 0

        for obj_file in self.lfs_objects_dir.rglob('*'):
            if obj_file.is_file():
                # Calculate file hash
                file_hash = self.calculate_file_hash(obj_file)
                if file_hash in hash_map:
                    # Duplicate found: replace it with a hardlink to the original
                    original = hash_map[file_hash]
                    size = obj_file.stat().st_size
                    obj_file.unlink()
                    os.link(original, obj_file)
                    space_saved += size
                else:
                    hash_map[file_hash] = obj_file

        print(f"Space saved through deduplication: {space_saved / (1024**2):.2f} MB")
        return space_saved

    def calculate_file_hash(self, filepath: Path) -> str:
        """Calculate SHA256 hash of file"""
        sha256_hash = hashlib.sha256()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                sha256_hash.update(chunk)
        return sha256_hash.hexdigest()
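A quick way to exercise the optimizer above, assuming it is saved as lfs_storage.py and run from the repository root; the 1 GB threshold is an arbitrary example:

from lfs_storage import LFSStorageOptimizer

optimizer = LFSStorageOptimizer()
report = optimizer.analyze_storage()
print(f"{report.get('file_count', 0)} objects, {report.get('total_size_gb', 0)} GB in local LFS storage")

# Only bother hardlinking duplicates when the store is non-trivial
if report.get('total_size_gb', 0) > 1:
    optimizer.deduplicate_storage()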
Advanced LFS Scenarios
Multi-Repository LFS Sharing
#!/bin/bash
# lfs_sharing.sh
# Setup shared LFS storage
setup_shared_lfs_storage() {
    SHARED_STORAGE="/path/to/shared/lfs/storage"

    # Create shared storage directory
    mkdir -p "$SHARED_STORAGE/objects"

    # Configure repository to use shared storage
    git config lfs.storage "$SHARED_STORAGE"

    # Move any existing objects into the shared location and leave a symlink behind
    if [ -d ".git/lfs/objects" ]; then
        mv .git/lfs/objects/* "$SHARED_STORAGE/objects/" 2>/dev/null
        rmdir .git/lfs/objects
        ln -s "$SHARED_STORAGE/objects" .git/lfs/objects
    fi

    echo "Configured shared LFS storage at: $SHARED_STORAGE"
}

# Clone with minimal LFS data
sparse_lfs_clone() {
    REPO_URL=$1

    # Clone without downloading LFS content
    GIT_LFS_SKIP_SMUDGE=1 git clone "$REPO_URL"
    cd "$(basename "$REPO_URL" .git)"

    # Configure sparse checkout
    git config core.sparseCheckout true

    # Define what to include
    cat > .git/info/sparse-checkout << EOF
/*
!*.psd
!*.ai
!*.mov
EOF

    # Apply the sparse-checkout patterns to the working tree
    git read-tree -mu HEAD

    # Fetch only needed LFS files
    git lfs fetch --include="*.png,*.jpg"
    git lfs checkout

    echo "Sparse LFS clone complete"
}

# Batch process LFS files
batch_process_lfs() {
    PROCESS_COMMAND=$1

    # Get all LFS tracked files (names only, to avoid parsing the oid column)
    git lfs ls-files --name-only | while read -r file; do
        echo "Processing: $file"

        # Pull specific file
        git lfs pull --include="$file"

        # Process file
        $PROCESS_COMMAND "$file"

        # Optional: remove the object from the local cache after processing
        # (objects are stored sharded as .git/lfs/objects/xx/yy/<oid>)
        oid=$(git lfs pointer --file="$file" | grep '^oid' | cut -d: -f2)
        rm -f ".git/lfs/objects/${oid:0:2}/${oid:2:2}/${oid}"
    done
}
LFS Server Management
# lfs_server.py
import subprocess
import requests
from typing import Dict, List
class LFSServerManager:
    def __init__(self, server_url: str, auth_token: str):
        self.server_url = server_url
        self.headers = {
            'Authorization': f'Bearer {auth_token}',
            'Accept': 'application/vnd.git-lfs+json',
            'Content-Type': 'application/vnd.git-lfs+json'
        }

    def get_server_info(self) -> Dict:
        """Get LFS server information"""
        response = requests.get(
            f"{self.server_url}/info/lfs",
            headers=self.headers
        )
        if response.status_code == 200:
            return response.json()
        return {'error': f'Server returned {response.status_code}'}

    def list_objects(self, limit: int = 100) -> List[Dict]:
        """List objects on LFS server"""
        response = requests.post(
            f"{self.server_url}/objects/batch",
            headers=self.headers,
            json={
                'operation': 'download',
                'transfers': ['basic'],
                'objects': [],
                'limit': limit
            }
        )
        if response.status_code == 200:
            return response.json().get('objects', [])
        return []

    def upload_object(self, filepath: str, oid: str, size: int):
        """Upload object to LFS server"""
        # Ask the batch API where to upload
        response = requests.post(
            f"{self.server_url}/objects/batch",
            headers=self.headers,
            json={
                'operation': 'upload',
                'transfers': ['basic'],
                'objects': [{
                    'oid': oid,
                    'size': size
                }]
            }
        )
        if response.status_code == 200:
            upload_info = response.json()['objects'][0]

            # Perform the upload against the URL returned by the server
            with open(filepath, 'rb') as f:
                upload_response = requests.put(
                    upload_info['actions']['upload']['href'],
                    headers=upload_info['actions']['upload'].get('header', {}),
                    data=f
                )
            return upload_response.status_code == 200
        return False

    def configure_custom_server(self, repo_path: str):
        """Configure repository to use custom LFS server"""
        subprocess.run(
            ["git", "config", "lfs.url", self.server_url],
            cwd=repo_path
        )
        # Cache credentials so transfers don't re-prompt
        subprocess.run(
            ["git", "config", "lfs.cachecredentials", "true"],
            cwd=repo_path
        )
        print(f"Configured LFS server: {self.server_url}")
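A usage sketch for the server manager above. The server URL and the LFS_TOKEN environment variable are hypothetical; in practice the oid and size would come from the pointer file of the object you want to push:

import os
import hashlib

# Hypothetical server endpoint and token
server = LFSServerManager("https://lfs.example.com/org/repo", os.environ["LFS_TOKEN"])

path = "assets/model.fbx"
with open(path, "rb") as f:
    data = f.read()
oid = hashlib.sha256(data).hexdigest()  # LFS object IDs are SHA-256 digests of the content

if server.upload_object(path, oid, len(data)):
    print("Upload complete")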
Troubleshooting and Recovery
LFS Troubleshooting Tools
# lfs_troubleshoot.py
import subprocess
import shutil
from pathlib import Path
from typing import Dict, List

class LFSTroubleshooter:
    def __init__(self, repo_path: str = "."):
        self.repo_path = repo_path

    def diagnose_issues(self) -> Dict:
        """Comprehensive LFS diagnosis"""
        issues = []
        warnings = []

        # Check LFS installation
        result = subprocess.run(
            ["git", "lfs", "version"],
            capture_output=True,
            text=True
        )
        if result.returncode != 0:
            issues.append("Git LFS not installed")
            return {'issues': issues}

        # Check hooks
        hooks_path = Path(self.repo_path) / ".git" / "hooks"
        if not (hooks_path / "pre-push").exists():
            warnings.append("LFS hooks not installed")

        # Check tracking
        gitattributes = Path(self.repo_path) / ".gitattributes"
        if not gitattributes.exists():
            warnings.append("No .gitattributes file")

        # Check for missing objects
        result = subprocess.run(
            ["git", "lfs", "fsck"],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        if "missing" in result.stdout.lower():
            issues.append("Missing LFS objects")

        # Check which tracked files have no local content yet
        # (git lfs ls-files marks downloaded objects with '*' and bare pointers with '-')
        result = subprocess.run(
            ["git", "lfs", "ls-files"],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        for line in result.stdout.split('\n'):
            if line and ' - ' in line:
                warnings.append(f"LFS content not downloaded: {line}")

        # Check storage
        storage_info = self.check_storage_health()

        return {
            'issues': issues,
            'warnings': warnings,
            'storage': storage_info,
            'recommendations': self.get_recommendations(issues, warnings)
        }

    def get_recommendations(self, issues: List[str], warnings: List[str]) -> List[str]:
        """Derive simple next steps from the detected issues and warnings"""
        recommendations = []
        if "Missing LFS objects" in issues:
            recommendations.append("Run recover_missing_objects() or 'git lfs fetch --all'")
        if "LFS hooks not installed" in warnings:
            recommendations.append("Run 'git lfs install --local' to reinstall the hooks")
        if "No .gitattributes file" in warnings:
            recommendations.append("Track large file patterns with 'git lfs track'")
        return recommendations

    def check_storage_health(self) -> Dict:
        """Check LFS storage health"""
        lfs_dir = Path(self.repo_path) / ".git" / "lfs"
        if not lfs_dir.exists():
            return {'status': 'not initialized'}

        # Check disk space on the volume holding LFS storage
        total, used, free = shutil.disk_usage(lfs_dir)
        return {
            'total_gb': total // (2**30),
            'used_gb': used // (2**30),
            'free_gb': free // (2**30),
            'usage_percent': (used / total) * 100
        }

    def fix_pointer_files(self):
        """Fix corrupted pointer files"""
        # Find all LFS-tracked files
        result = subprocess.run(
            ["git", "lfs", "ls-files", "-n"],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        for filepath in result.stdout.strip().split('\n'):
            if filepath:
                # Check if pointer is valid
                check_result = subprocess.run(
                    ["git", "lfs", "pointer", "--check", "--file", filepath],
                    cwd=self.repo_path,
                    capture_output=True
                )
                if check_result.returncode != 0:
                    print(f"Fixing pointer: {filepath}")
                    # Remove from the index and re-add so the clean filter rebuilds the pointer
                    subprocess.run(
                        ["git", "rm", "--cached", filepath],
                        cwd=self.repo_path
                    )
                    subprocess.run(
                        ["git", "add", filepath],
                        cwd=self.repo_path
                    )

    def recover_missing_objects(self):
        """Attempt to recover missing LFS objects"""
        # Get list of missing objects
        result = subprocess.run(
            ["git", "lfs", "fsck"],
            cwd=self.repo_path,
            capture_output=True,
            text=True
        )
        missing_oids = []
        for line in result.stdout.split('\n'):
            if "missing" in line.lower():
                # Extract OID (a 64-character SHA-256 hex digest)
                for part in line.split():
                    if len(part) == 64:
                        missing_oids.append(part)

        if missing_oids:
            print(f"Missing {len(missing_oids)} objects; attempting recovery from all remotes")
            # Fetch all objects for all refs from every configured remote
            remotes = subprocess.run(
                ["git", "remote"],
                cwd=self.repo_path,
                capture_output=True,
                text=True
            ).stdout.strip().split('\n')
            for remote in remotes:
                subprocess.run(
                    ["git", "lfs", "fetch", remote, "--all"],
                    cwd=self.repo_path
                )
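Tying the troubleshooter together, a minimal sketch of a diagnosis run against the current repository:

troubleshooter = LFSTroubleshooter(".")
report = troubleshooter.diagnose_issues()

for issue in report.get('issues', []):
    print(f"ISSUE: {issue}")
for warning in report.get('warnings', []):
    print(f"warning: {warning}")
for step in report.get('recommendations', []):
    print(f"next: {step}")

# Try an automated recovery pass only when objects are actually missing
if "Missing LFS objects" in report.get('issues', []):
    troubleshooter.recover_missing_objects()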
Performance Optimization
LFS Performance Tuning
#!/bin/bash
# lfs_performance.sh
# Optimize LFS for large-scale operations
optimize_lfs_performance() {
    echo "Optimizing LFS performance..."

    # Increase transfer retry and verification limits
    git config lfs.transfer.maxretries 10
    git config lfs.transfer.maxverifies 10

    # Configure connection pooling
    git config http.maxRequests 100
    git config http.minSessions 10

    # Enable keepalive
    git config http.keepAlive true

    # Set timeout values (seconds)
    git config lfs.dialtimeout 30
    git config lfs.keepalive 60
    git config lfs.activitytimeout 120

    # Configure chunk size for uploads
    git config lfs.transfer.maxchunksize 104857600  # 100MB

    # Enable progress meter
    git config lfs.progress true

    echo "Performance optimization complete"
}

# Parallel LFS operations
parallel_lfs_fetch() {
    # Get list of LFS files
    git lfs ls-files -n > lfs_files.txt

    # Split into chunks for parallel processing
    split -l 100 lfs_files.txt lfs_chunk_

    # Process chunks in parallel
    for chunk in lfs_chunk_*; do
        (
            while read -r file; do
                git lfs fetch --include="$file"
            done < "$chunk"
        ) &
    done

    # Wait for all background jobs
    wait

    # Clean up
    rm lfs_files.txt lfs_chunk_*

    echo "Parallel fetch complete"
}
Best Practices Checklist
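Drawing the techniques above together, a short checklist worth running through on any repository that carries large files:

- Track binary patterns with git lfs track before their first commit, and always commit .gitattributes alongside them.
- Audit the repository for large files regularly, and use git lfs migrate import when big binaries have already slipped into plain Git history.
- Use git lfs lock for unmergeable binaries (PSDs, 3D scenes) so two people never edit the same asset at once.
- Fetch selectively (--include/--exclude, GIT_LFS_SKIP_SMUDGE, lfs.fetchinclude/lfs.fetchexclude) on CI runners and machines that don't need every asset.
- Prune the local LFS cache periodically with --verify-remote, and run git lfs fsck before trusting a prune.
- Monitor LFS storage and bandwidth quotas on your host, and tune concurrent transfers, retries, and timeouts for your network.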
Conclusion
Git LFS extends Git beyond source code, letting you version binary assets, media, and datasets alongside your code without dragging down clones and fetches. Implemented well, it lets you version everything from code to assets while keeping the repository fast. The keys are understanding when and how to use LFS, optimizing it for your specific workflow, and maintaining good storage hygiene. Master these techniques, and large files will never slow down your repository again.