Learn practical FinOps strategies to optimize cloud costs, including resource right-sizing, spot instances, and automated cost management.
Last year, our AWS bill was growing faster than our user base. After implementing a comprehensive FinOps strategy, we reduced costs by 60% while actually improving performance. Here's the playbook that saved us $400K annually.
The FinOps Framework
FinOps isn't just about cutting costs—it's about:
- Visibility: Understanding where money goes
- Optimization: Right-sizing and efficiency
- Accountability: Team ownership of costs
- Automation: Continuous optimization
Cost Visibility and Analysis
Setting Up Cost Tracking
# cost_analyzer.py
import boto3
import pandas as pd
from datetime import datetime, timedelta

def analyze_costs():
    ce = boto3.client('ce')

    # Daily cost breakdown by service and Environment tag
    # (the tag must be activated as a cost allocation tag in the Billing console)
    response = ce.get_cost_and_usage(
        TimePeriod={
            'Start': (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'),
            'End': datetime.now().strftime('%Y-%m-%d')
        },
        Granularity='DAILY',
        Metrics=['UnblendedCost'],
        GroupBy=[
            {'Type': 'DIMENSION', 'Key': 'SERVICE'},
            {'Type': 'TAG', 'Key': 'Environment'}
        ]
    )

    # Flatten the nested response: one row per (day, service, environment)
    rows = []
    for day in response['ResultsByTime']:
        for group in day['Groups']:
            rows.append({
                'date': day['TimePeriod']['Start'],
                'service': group['Keys'][0],
                # tag keys come back as 'Environment$<value>'
                'environment': group['Keys'][1].split('$', 1)[-1],
                'cost': float(group['Metrics']['UnblendedCost']['Amount'])
            })
    return pd.DataFrame(rows)
def identify_waste():
    ec2 = boto3.client('ec2')
    cloudwatch = boto3.client('cloudwatch')

    # Only look at running instances; stopped ones have no recent metrics
    instances = ec2.describe_instances(
        Filters=[{'Name': 'instance-state-name', 'Values': ['running']}]
    )
    waste_report = []
    for reservation in instances['Reservations']:
        for instance in reservation['Instances']:
            # Check average CPU utilization over the past week
            metrics = cloudwatch.get_metric_statistics(
                Namespace='AWS/EC2',
                MetricName='CPUUtilization',
                Dimensions=[{'Name': 'InstanceId', 'Value': instance['InstanceId']}],
                StartTime=datetime.now() - timedelta(days=7),
                EndTime=datetime.now(),
                Period=3600,
                Statistics=['Average']
            )
            datapoints = metrics['Datapoints']
            if not datapoints:
                continue  # no data yet (e.g. recently launched), skip
            avg_cpu = sum(point['Average'] for point in datapoints) / len(datapoints)
            if avg_cpu < 10:
                waste_report.append({
                    'InstanceId': instance['InstanceId'],
                    'Type': instance['InstanceType'],
                    'AvgCPU': avg_cpu,
                    'Recommendation': 'Consider downsizing or terminating'
                })
    return waste_report
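To turn these two functions into a daily habit, we wired them into a tiny report script. A minimal sketch, assuming the file above is importable and AWS credentials are configured:
# Example usage: print the top services by spend and the low-utilization instances
if __name__ == '__main__':
    costs = analyze_costs()
    print(costs.groupby('service')['cost'].sum().sort_values(ascending=False).head(10))

    for item in identify_waste():
        print(f"{item['InstanceId']} ({item['Type']}): "
              f"avg CPU {item['AvgCPU']:.1f}% - {item['Recommendation']}")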
Tagging Strategy
# Consistent tagging for cost allocation
locals {
  common_tags = {
    Environment = var.environment
    Project     = var.project_name
    Owner       = var.team_owner
    CostCenter  = var.cost_center
    ManagedBy   = "Terraform"
    # Note: timestamp() changes on every plan; add ignore_changes on tags
    # (or set this from a variable) to avoid perpetual diffs.
    CreatedDate = timestamp()
  }
}

resource "aws_instance" "app" {
  ami           = data.aws_ami.ubuntu.id
  instance_type = var.instance_type

  tags = merge(
    local.common_tags,
    {
      Name = "${var.project_name}-app-${var.environment}"
      Type = "Application"
    }
  )
}
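Tags only help if they are actually applied (and activated as cost allocation tags in the Billing console). A small audit script catches drift; this is a sketch, and the REQUIRED_TAGS set simply mirrors the common_tags above:
# tag_audit.py - flag EC2 instances missing required cost-allocation tags
import boto3

REQUIRED_TAGS = {'Environment', 'Project', 'Owner', 'CostCenter'}

def find_untagged_instances():
    ec2 = boto3.client('ec2')
    untagged = []
    for page in ec2.get_paginator('describe_instances').paginate():
        for reservation in page['Reservations']:
            for instance in reservation['Instances']:
                tag_keys = {t['Key'] for t in instance.get('Tags', [])}
                missing = REQUIRED_TAGS - tag_keys
                if missing:
                    untagged.append({
                        'InstanceId': instance['InstanceId'],
                        'MissingTags': sorted(missing)
                    })
    return untagged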
Right-Sizing Resources
Automated Right-Sizing
# right_sizing.py
import boto3

def recommend_instance_size(instance_id):
    ec2 = boto3.client('ec2')
    cloudwatch = boto3.client('cloudwatch')

    # Get current instance details
    instance = ec2.describe_instances(InstanceIds=[instance_id])
    current_type = instance['Reservations'][0]['Instances'][0]['InstanceType']

    # get_metric_stats() is a small helper (not shown) that returns a list of
    # datapoint values for the metric over the lookback window. Note that
    # MemoryUtilization is only published if the CloudWatch agent is installed.
    cpu_metrics = get_metric_stats(cloudwatch, instance_id, 'CPUUtilization')
    memory_metrics = get_metric_stats(cloudwatch, instance_id, 'MemoryUtilization')
    network_metrics = get_metric_stats(cloudwatch, instance_id, 'NetworkIn')

    # Instance sizing matrix: vCPUs, memory (GiB), approximate on-demand $/hour (us-east-1)
    instance_matrix = {
        't3.micro':  {'cpu': 2, 'memory': 1,  'network': 'Low',      'cost': 0.0104},
        't3.small':  {'cpu': 2, 'memory': 2,  'network': 'Low',      'cost': 0.0208},
        't3.medium': {'cpu': 2, 'memory': 4,  'network': 'Low',      'cost': 0.0416},
        't3.large':  {'cpu': 2, 'memory': 8,  'network': 'Moderate', 'cost': 0.0832},
        't3.xlarge': {'cpu': 4, 'memory': 16, 'network': 'Moderate', 'cost': 0.1664},
    }
    if current_type not in instance_matrix or not cpu_metrics or not memory_metrics:
        return {'current': current_type, 'recommended': None, 'monthly_savings': 0}

    # Convert peak utilization (a percentage) into absolute demand, then add 20% headroom
    current_specs = instance_matrix[current_type]
    required_cpu = current_specs['cpu'] * max(cpu_metrics) / 100 * 1.2
    required_memory = current_specs['memory'] * max(memory_metrics) / 100 * 1.2

    # Recommendation logic: cheapest type that still covers peak demand
    recommended = None
    for instance_type, specs in instance_matrix.items():
        if specs['cpu'] >= required_cpu and specs['memory'] >= required_memory:
            if not recommended or specs['cost'] < instance_matrix[recommended]['cost']:
                recommended = instance_type

    if not recommended:
        return {'current': current_type, 'recommended': None, 'monthly_savings': 0}

    return {
        'current': current_type,
        'recommended': recommended,
        'monthly_savings': (instance_matrix[current_type]['cost'] -
                            instance_matrix[recommended]['cost']) * 730
    }
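Running the function across the whole fleet gives a prioritized savings list. A sketch, assuming recommend_instance_size from above (instances outside the t3 matrix are simply skipped):
# Example: right-size every running instance and print potential savings
def rightsizing_report():
    ec2 = boto3.client('ec2')
    total_savings = 0
    paginator = ec2.get_paginator('describe_instances')
    for page in paginator.paginate(Filters=[{'Name': 'instance-state-name', 'Values': ['running']}]):
        for reservation in page['Reservations']:
            for instance in reservation['Instances']:
                result = recommend_instance_size(instance['InstanceId'])
                if result['recommended'] and result['recommended'] != result['current']:
                    print(f"{instance['InstanceId']}: {result['current']} -> "
                          f"{result['recommended']} (${result['monthly_savings']:.2f}/month)")
                    total_savings += result['monthly_savings']
    print(f"Estimated total monthly savings: ${total_savings:.2f}")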
Kubernetes Resource Optimization
# Vertical Pod Autoscaler
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: app-vpa
spec:
  targetRef:
    apiVersion: "apps/v1"
    kind: Deployment
    name: app-deployment
  updatePolicy:
    updateMode: "Auto"
  resourcePolicy:
    containerPolicies:
      - containerName: app
        minAllowed:
          cpu: 100m
          memory: 128Mi
        maxAllowed:
          cpu: 2
          memory: 2Gi
---
# Horizontal Pod Autoscaler
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: app-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: app-deployment
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
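One caveat: a VPA in "Auto" mode and an HPA scaling on the same CPU and memory metrics will fight each other, so we started with the VPA in recommendation-only mode and compared its targets against our requests before letting it act. A rough sketch using kubectl's JSON output (the VPA name and namespace match the manifest above):
# vpa_check.py - print the VPA's recommended targets per container
import json
import subprocess

def get_vpa_recommendation(vpa_name='app-vpa', namespace='default'):
    out = subprocess.check_output(
        ['kubectl', 'get', 'vpa', vpa_name, '-n', namespace, '-o', 'json']
    )
    vpa = json.loads(out)
    recs = vpa.get('status', {}).get('recommendation', {}).get('containerRecommendations', [])
    for rec in recs:
        print(f"{rec['containerName']}: target CPU {rec['target']['cpu']}, "
              f"target memory {rec['target']['memory']}")

if __name__ == '__main__':
    get_vpa_recommendation()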
Spot Instance Strategy
Spot Fleet Configuration
resource "aws_spot_fleet_request" "workers" {
iam_fleet_role = aws_iam_role.fleet.arn
target_capacity = 10
valid_until = "2025-12-31T23:59:59Z"
launch_specification {
instance_type = "t3.large"
ami = data.aws_ami.ubuntu.id
spot_price = "0.03"
user_data = base64encode(<<-EOF
#!/bin/bash
# Configure spot instance termination handler
aws s3 cp s3://my-bucket/spot-termination-handler.sh /usr/local/bin/
chmod +x /usr/local/bin/spot-termination-handler.sh
nohup /usr/local/bin/spot-termination-handler.sh &
EOF
)
}
launch_specification {
instance_type = "t3a.large"
ami = data.aws_ami.ubuntu.id
spot_price = "0.03"
}
launch_specification {
instance_type = "t2.large"
ami = data.aws_ami.ubuntu.id
spot_price = "0.03"
}
}
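Before committing to bid prices, it's worth checking what spot actually costs for the chosen instance types. A sketch using the spot price history API; the on-demand rates are hard-coded us-east-1 estimates, not an API lookup:
# spot_savings.py - compare recent spot prices against on-demand
import boto3
from datetime import datetime, timedelta

ON_DEMAND = {'t3.large': 0.0832, 't3a.large': 0.0752, 't2.large': 0.0928}

def spot_savings(instance_types=('t3.large', 't3a.large', 't2.large')):
    ec2 = boto3.client('ec2')
    history = ec2.describe_spot_price_history(
        InstanceTypes=list(instance_types),
        ProductDescriptions=['Linux/UNIX'],
        StartTime=datetime.utcnow() - timedelta(days=1)
    )
    # Keep only the newest price per (instance type, availability zone)
    latest = {}
    for record in history['SpotPriceHistory']:
        key = (record['InstanceType'], record['AvailabilityZone'])
        if key not in latest or record['Timestamp'] > latest[key]['Timestamp']:
            latest[key] = record
    for (itype, az), record in sorted(latest.items()):
        spot = float(record['SpotPrice'])
        od = ON_DEMAND.get(itype)
        if od:
            print(f"{itype} {az}: spot ${spot:.4f}/hr vs on-demand ${od:.4f}/hr "
                  f"({(1 - spot / od) * 100:.0f}% cheaper)")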
Spot Termination Handler
#!/usr/bin/env python3
# spot-termination-handler.py
import requests
import subprocess
import time

def check_spot_termination():
    """Check the instance metadata service for a spot termination notice."""
    try:
        # Returns 200 with an action document only when termination is scheduled
        # (on IMDSv2-only instances a session token must be added to this request)
        response = requests.get(
            'http://169.254.169.254/latest/meta-data/spot/instance-action',
            timeout=1
        )
        if response.status_code == 200:
            return True
    except requests.RequestException:
        pass
    return False

def graceful_shutdown():
    """Perform graceful shutdown tasks before the two-minute warning expires."""
    # Drain the Kubernetes node (replace 'node-name' with this node's actual name)
    subprocess.run(['kubectl', 'drain', '--ignore-daemonsets',
                    '--delete-emptydir-data', 'node-name'])
    # Save application state
    subprocess.run(['./save-state.sh'])
    # Notify the team (send_alert is an application-specific helper, not shown)
    send_alert("Spot instance terminating, workload migrated")

while True:
    if check_spot_termination():
        graceful_shutdown()
        break
    time.sleep(5)
Reserved Instances and Savings Plans
Optimization Script
def optimize_reservations():
    ce = boto3.client('ce')

    # Get RI purchase recommendations (Cost Explorer expects the full service
    # name string rather than the short "EC2" alias)
    recommendations = ce.get_reservation_purchase_recommendation(
        Service='Amazon Elastic Compute Cloud - Compute',
        PaymentOption='ALL_UPFRONT',
        TermInYears='ONE_YEAR',
        LookbackPeriodInDays='THIRTY_DAYS'
    )

    savings_report = []
    for rec in recommendations['Recommendations']:
        for detail in rec['RecommendationDetails']:
            savings = float(detail['EstimatedMonthlySavingsAmount'])
            if savings <= 0:
                continue
            savings_report.append({
                'InstanceType': detail['InstanceDetails']['EC2InstanceDetails']['InstanceType'],
                'RecommendedCount': detail['RecommendedNumberOfInstancesToPurchase'],
                'EstimatedMonthlySavings': savings,
                'UpfrontCost': detail['UpfrontCost'],
                'ROI_Months': float(detail['UpfrontCost']) / savings
            })
    return sorted(savings_report, key=lambda x: x['ROI_Months'])
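Savings Plans often cover more of the bill than instance-scoped RIs, and Cost Explorer exposes a parallel recommendation API. A sketch along the same lines as the RI function above:
# Compute Savings Plans recommendation (a sketch; parameters mirror the RI call above)
def optimize_savings_plans():
    ce = boto3.client('ce')
    response = ce.get_savings_plans_purchase_recommendation(
        SavingsPlansType='COMPUTE_SP',
        TermInYears='ONE_YEAR',
        PaymentOption='NO_UPFRONT',
        LookbackPeriodInDays='THIRTY_DAYS'
    )
    recommendation = response.get('SavingsPlansPurchaseRecommendation', {})
    summary = recommendation.get('SavingsPlansPurchaseRecommendationSummary', {})
    return {
        'HourlyCommitment': summary.get('HourlyCommitmentToPurchase'),
        'EstimatedMonthlySavings': summary.get('EstimatedMonthlySavingsAmount'),
        'EstimatedSavingsPercentage': summary.get('EstimatedSavingsPercentage')
    }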
Storage Optimization
S3 Lifecycle Policies
{
  "Rules": [
    {
      "ID": "ArchiveOldLogs",
      "Status": "Enabled",
      "Filter": {"Prefix": ""},
      "Transitions": [
        {"Days": 30, "StorageClass": "STANDARD_IA"},
        {"Days": 90, "StorageClass": "GLACIER"},
        {"Days": 365, "StorageClass": "DEEP_ARCHIVE"}
      ],
      "Expiration": {"Days": 2555}
    }
  ]
}
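We apply the same policy to every log bucket from code rather than clicking through the console. A minimal sketch, assuming the JSON above is saved as lifecycle.json; the bucket names are placeholders:
# apply_lifecycle.py - apply the lifecycle policy above to several buckets
import json
import boto3

def apply_lifecycle(buckets, policy_file='lifecycle.json'):
    s3 = boto3.client('s3')
    with open(policy_file) as f:
        rules = json.load(f)
    for bucket in buckets:
        s3.put_bucket_lifecycle_configuration(
            Bucket=bucket,
            LifecycleConfiguration=rules
        )

apply_lifecycle(['my-app-logs', 'my-batch-output'])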
EBS Optimization
def optimize_ebs_volumes():
    ec2 = boto3.client('ec2')
    volumes = ec2.describe_volumes()

    recommendations = []
    for volume in volumes['Volumes']:
        # Unattached volumes cost money but serve no workload
        if not volume['Attachments']:
            recommendations.append({
                'VolumeId': volume['VolumeId'],
                'Size': volume['Size'],
                'Action': 'Delete unattached volume',
                'MonthlySavings': volume['Size'] * 0.10  # ~$0.10/GB-month for gp2 (us-east-1)
            })
        # gp3 offers the same baseline performance as gp2 at roughly 20% lower cost
        elif volume['VolumeType'] == 'gp2':
            gp3_cost = volume['Size'] * 0.08  # ~$0.08/GB-month
            gp2_cost = volume['Size'] * 0.10  # ~$0.10/GB-month
            recommendations.append({
                'VolumeId': volume['VolumeId'],
                'Size': volume['Size'],
                'Action': 'Convert to gp3',
                'MonthlySavings': gp2_cost - gp3_cost
            })
    return recommendations
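Acting on the gp2 findings is a single modify_volume call per volume, which migrates to gp3 in place with no detach or downtime. A sketch that feeds in the volumes flagged above:
# Convert gp2 volumes to gp3 in place
def convert_to_gp3(volume_ids):
    ec2 = boto3.client('ec2')
    for volume_id in volume_ids:
        ec2.modify_volume(VolumeId=volume_id, VolumeType='gp3')
        print(f"Started gp3 migration for {volume_id}")

# Example: feed it the gp2 volumes flagged by optimize_ebs_volumes()
gp2_volumes = [r['VolumeId'] for r in optimize_ebs_volumes() if r['Action'] == 'Convert to gp3']
convert_to_gp3(gp2_volumes)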
Database Optimization
RDS Cost Reduction
# Use Aurora Serverless for variable workloads
resource "aws_rds_cluster" "aurora_serverless" {
  engine_mode = "serverless"
  engine      = "aurora-mysql"

  scaling_configuration {
    auto_pause               = true
    min_capacity             = 1
    max_capacity             = 4
    seconds_until_auto_pause = 300
  }

  # Schedule backups during the off-peak window
  backup_window                = "03:00-04:00"
  preferred_maintenance_window = "sun:04:00-sun:05:00"
}

# Use read replicas for read-heavy workloads
resource "aws_db_instance" "read_replica" {
  replicate_source_db = aws_db_instance.main.identifier
  instance_class      = "db.t3.micro"

  # Use Multi-AZ only for critical databases
  multi_az = false
}
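Aurora Serverless only helps for databases that actually idle, so it's worth finding them first. A sketch that flags RDS instances with zero connections over the past week (the window and threshold are assumptions):
# idle_rds.py - flag RDS instances with no connections over the past week
import boto3
from datetime import datetime, timedelta

def find_idle_databases():
    rds = boto3.client('rds')
    cloudwatch = boto3.client('cloudwatch')
    idle = []
    for db in rds.describe_db_instances()['DBInstances']:
        stats = cloudwatch.get_metric_statistics(
            Namespace='AWS/RDS',
            MetricName='DatabaseConnections',
            Dimensions=[{'Name': 'DBInstanceIdentifier', 'Value': db['DBInstanceIdentifier']}],
            StartTime=datetime.utcnow() - timedelta(days=7),
            EndTime=datetime.utcnow(),
            Period=3600,
            Statistics=['Maximum']
        )
        peak = max((p['Maximum'] for p in stats['Datapoints']), default=0)
        if peak == 0:
            idle.append(db['DBInstanceIdentifier'])
    return idle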
Network Cost Optimization
NAT Gateway Alternatives
# Use NAT instances for non-critical workloads
apiVersion: v1
kind: ConfigMap
metadata:
  name: nat-instance-setup
data:
  setup.sh: |
    #!/bin/bash
    # Enable IP forwarding
    echo 1 > /proc/sys/net/ipv4/ip_forward
    # Set up iptables for NAT
    iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE
    iptables -A FORWARD -i eth1 -j ACCEPT
    # Save iptables rules
    iptables-save > /etc/iptables/rules.v4
VPC Endpoint Usage
# Reduce data transfer costs with VPC endpoints
resource "aws_vpc_endpoint" "s3" {
  vpc_id          = aws_vpc.main.id
  service_name    = "com.amazonaws.${var.region}.s3"
  route_table_ids = [aws_route_table.private.id]
}

resource "aws_vpc_endpoint" "dynamodb" {
  vpc_id          = aws_vpc.main.id
  service_name    = "com.amazonaws.${var.region}.dynamodb"
  route_table_ids = [aws_route_table.private.id]
}
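To see how much data transfer is really costing (and to measure the effect of adding endpoints), group month-to-date spend by usage type and pull out the transfer and NAT line items. A sketch:
# data_transfer_costs.py - month-to-date spend on data transfer and NAT usage types
import boto3
from datetime import datetime

def data_transfer_costs():
    ce = boto3.client('ce')
    response = ce.get_cost_and_usage(
        TimePeriod={
            'Start': datetime.now().replace(day=1).strftime('%Y-%m-%d'),
            'End': datetime.now().strftime('%Y-%m-%d')
        },
        Granularity='MONTHLY',
        Metrics=['UnblendedCost'],
        GroupBy=[{'Type': 'DIMENSION', 'Key': 'USAGE_TYPE'}]
    )
    transfer = []
    for group in response['ResultsByTime'][0]['Groups']:
        usage_type = group['Keys'][0]
        cost = float(group['Metrics']['UnblendedCost']['Amount'])
        # Data transfer and NAT gateway usage types contain these substrings
        if 'DataTransfer' in usage_type or 'NatGateway' in usage_type:
            transfer.append((usage_type, cost))
    return sorted(transfer, key=lambda x: x[1], reverse=True)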
Automated Cost Controls
Budget Alerts
def setup_budget_alerts():
    budgets = boto3.client('budgets')
    # Budgets are account-level, so look up the current account ID
    account_id = boto3.client('sts').get_caller_identity()['Account']

    budgets.create_budget(
        AccountId=account_id,
        Budget={
            'BudgetName': 'Monthly-Infrastructure',
            'BudgetLimit': {
                'Amount': '10000',
                'Unit': 'USD'
            },
            'TimeUnit': 'MONTHLY',
            'BudgetType': 'COST'
        },
        NotificationsWithSubscribers=[
            {
                'Notification': {
                    'NotificationType': 'ACTUAL',
                    'ComparisonOperator': 'GREATER_THAN',
                    'Threshold': 80.0,
                    'ThresholdType': 'PERCENTAGE'
                },
                'Subscribers': [
                    {'SubscriptionType': 'SNS', 'Address': 'arn:aws:sns:us-east-1:123456789:cost-alerts'}
                ]
            }
        ]
    )
Automated Cleanup
# Lambda function for resource cleanup
import boto3
from botocore.exceptions import ClientError
from datetime import datetime, timedelta

def lambda_handler(event, context):
    ec2 = boto3.client('ec2')

    # Stop instances tagged for auto-stop
    instances = ec2.describe_instances(
        Filters=[
            {'Name': 'tag:AutoStop', 'Values': ['true']},
            {'Name': 'instance-state-name', 'Values': ['running']}
        ]
    )
    instance_ids = []
    for reservation in instances['Reservations']:
        for instance in reservation['Instances']:
            instance_ids.append(instance['InstanceId'])
    if instance_ids:
        ec2.stop_instances(InstanceIds=instance_ids)

    # Delete snapshots older than 30 days; snapshots still backing an AMI
    # will raise an error, so catch and skip those
    snapshots = ec2.describe_snapshots(OwnerIds=['self'])
    cutoff_date = datetime.now() - timedelta(days=30)
    deleted = 0
    for snapshot in snapshots['Snapshots']:
        if snapshot['StartTime'].replace(tzinfo=None) < cutoff_date:
            try:
                ec2.delete_snapshot(SnapshotId=snapshot['SnapshotId'])
                deleted += 1
            except ClientError:
                continue

    return {
        'stopped_instances': len(instance_ids),
        'deleted_snapshots': deleted
    }
Container Cost Optimization
Fargate vs EC2 Analysis
def compare_fargate_ec2(cpu, memory, hours_per_month):
    # Fargate pricing (us-east-1, approximate)
    fargate_cpu_price = 0.04048      # per vCPU-hour
    fargate_memory_price = 0.004445  # per GB-hour
    fargate_cost = (cpu * fargate_cpu_price + memory * fargate_memory_price) * hours_per_month

    # EC2 equivalent (t3.medium: 2 vCPU, 4 GB, ~$0.0416/hour on-demand)
    ec2_hourly = 0.0416
    ec2_cost = ec2_hourly * 730  # full month

    # How many containers of this size fit on one t3.medium (at least one)
    containers_per_instance = max(1, int(min(2 / cpu, 4 / memory)))
    ec2_effective_cost = ec2_cost / containers_per_instance

    return {
        'fargate_monthly': fargate_cost,
        'ec2_monthly': ec2_effective_cost,
        'recommendation': 'Fargate' if fargate_cost < ec2_effective_cost else 'EC2',
        'monthly_savings': abs(fargate_cost - ec2_effective_cost)
    }
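For example, an always-on service needing 0.5 vCPU and 1 GB:
# Example: a 0.5 vCPU / 1 GB service running 24x7 (730 hours/month)
result = compare_fargate_ec2(cpu=0.5, memory=1, hours_per_month=730)
print(f"Fargate: ${result['fargate_monthly']:.2f}/month, "
      f"EC2: ${result['ec2_monthly']:.2f}/month -> {result['recommendation']}")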
Monitoring and Reporting
Cost Dashboard
# Generate cost report
def generate_cost_report():
    ce = boto3.client('ce')

    # Get month-to-date costs by service
    current_month = ce.get_cost_and_usage(
        TimePeriod={
            'Start': datetime.now().replace(day=1).strftime('%Y-%m-%d'),
            'End': datetime.now().strftime('%Y-%m-%d')
        },
        Granularity='MONTHLY',
        Metrics=['UnblendedCost'],
        GroupBy=[{'Type': 'DIMENSION', 'Key': 'SERVICE'}]
    )

    # Get a 30-day forecast
    forecast = ce.get_cost_forecast(
        TimePeriod={
            'Start': datetime.now().strftime('%Y-%m-%d'),
            'End': (datetime.now() + timedelta(days=30)).strftime('%Y-%m-%d')
        },
        Metric='UNBLENDED_COST',
        Granularity='MONTHLY'
    )

    # Assemble the report; with GroupBy set, the per-period Total comes back
    # empty, so sum the groups for the headline number
    groups = current_month['ResultsByTime'][0]['Groups']
    report = {
        'current_month_spend': sum(float(g['Metrics']['UnblendedCost']['Amount']) for g in groups),
        'forecast_next_month': forecast['Total']['Amount'],
        'top_services': sorted(
            groups,
            key=lambda x: float(x['Metrics']['UnblendedCost']['Amount']),
            reverse=True
        )[:5]
    }
    return report
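A report nobody reads changes nothing, so we push the summary into Slack every morning. A sketch using an incoming webhook (the URL is a placeholder):
# post_report.py - push the daily cost summary to a Slack incoming webhook
import requests

def post_report_to_slack(report, webhook_url='https://hooks.slack.com/services/XXX/YYY/ZZZ'):
    top = "\n".join(
        f"- {g['Keys'][0]}: ${float(g['Metrics']['UnblendedCost']['Amount']):.2f}"
        for g in report['top_services']
    )
    message = (
        f"*Month-to-date spend:* ${float(report['current_month_spend']):.2f}\n"
        f"*Forecast (next 30 days):* ${float(report['forecast_next_month']):.2f}\n"
        f"*Top services:*\n{top}"
    )
    requests.post(webhook_url, json={'text': message}, timeout=5)

post_report_to_slack(generate_cost_report())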
Team Accountability
Cost Allocation Tags
# Kubernetes labels for cost tracking
apiVersion: apps/v1
kind: Deployment
metadata:
  name: api-service
  labels:
    team: platform
    cost-center: engineering
    environment: production
spec:
  template:
    metadata:
      labels:
        team: platform
        cost-center: engineering
Results and Lessons Learned
After implementing these strategies:
- Compute costs: Reduced by 45% through right-sizing and spot instances
- Storage costs: Reduced by 70% with lifecycle policies
- Network costs: Reduced by 30% with VPC endpoints
- Database costs: Reduced by 50% with Aurora Serverless
- Overall savings: 60% reduction in monthly AWS bill
Key lessons:
- Automate everything: Manual optimization doesn't scale
- Make costs visible: Teams optimize what they can see
- Start with quick wins: Unattached volumes, oversized instances
- Continuous optimization: Set up automated monitoring and adjustment
- Cultural change: Make cost awareness part of engineering culture
Conclusion
Cloud cost optimization is an ongoing journey, not a destination. Start with visibility, implement quick wins, then build automation for continuous optimization. The investment in FinOps practices pays for itself many times over, freeing up budget for innovation while maintaining or improving performance.
David Childs
Consulting Systems Engineer with over 10 years of experience building scalable infrastructure and helping organizations optimize their technology stack.