Performance Profiling and Memory Optimization in Python
Measure before you optimize. Python provides both built-in and third-party tools to pinpoint exactly where time and memory are spent — preventing premature optimization that complicates code without real benefit.
The golden rule
"Premature optimization is the root of all evil." — Donald Knuth
The correct flow:
- Measure with a profiler → identify the real bottleneck
- Optimize only what matters (typically 10% of code uses 90% of time)
- Verify that the optimization actually improved performance
timeit — measure small code snippets
import timeit

# timeit repeats a snippet many times and reports the total wall-clock
# time, averaging out scheduler and cache noise.

# Command-line interface
# python -m timeit "[i**2 for i in range(1000)]"

# Programmatic API
result = timeit.timeit(
    stmt='[i**2 for i in range(1000)]',
    number=10_000,
)
# Fixed mojibake in the output string: "Β΅s" -> "µs"
print(f"10k runs: {result:.3f}s ({result/10_000*1e6:.2f} µs each)")

# Compare two implementations of the same computation
list_comp = timeit.timeit('[i**2 for i in range(1000)]', number=10_000)
generator = timeit.timeit('list(i**2 for i in range(1000))', number=10_000)
print(f"List comprehension: {list_comp:.3f}s")
print(f"Generator: {generator:.3f}s")

# In Jupyter/IPython:
# %timeit [i**2 for i in range(1000)]
cProfile — CPU profiling
import cProfile
import pstats
import io
def slow_function():
    """Deliberately heavy workload used as a profiling target.

    Accumulates the same inner sum 100,000 times; the nested generator
    expression makes the per-call cost stand out in profiler output.
    """
    total = 0
    for _ in range(100_000):
        total += sum(j ** 2 for j in range(100))
    return total
# Method 1: explicit Profile object — enable/disable around the code of interest
prof = cProfile.Profile()
prof.enable()
slow_function()
prof.disable()

# Capture the report into a string instead of writing straight to stdout
buf = io.StringIO()
pstats.Stats(prof, stream=buf).sort_stats('cumulative').print_stats(20)
print(buf.getvalue())

# Method 2: Profile as a context manager (Python 3.8+)
with cProfile.Profile() as pr:
    slow_function()
pstats.Stats(pr).sort_stats('cumulative').print_stats(10)

# Method 3: command line (most convenient)
# python -m cProfile -s cumulative my_script.py | head -30
line_profiler — line-by-line CPU profiling
pip install line_profiler
from line_profiler import LineProfiler
def compute_stats(data: list) -> dict:
    """Return mean, median, and population variance of *data*.

    Kept deliberately line-by-line simple so each statement shows up
    cleanly in line_profiler output. Notes: raises ZeroDivisionError on
    an empty list; for even-length input the *upper* middle element is
    returned as the median.
    """
    n = len(data)
    mean = sum(data) / n
    median = sorted(data)[n // 2]
    variance = sum((value - mean) ** 2 for value in data) / n
    return {'mean': mean, 'median': median, 'variance': variance}
# Register the target function, then bracket the call with enable/disable
lp = LineProfiler()
lp.add_function(compute_stats)
lp.enable()
compute_stats(list(range(100_000)))
lp.disable()
lp.print_stats()

# Or use the CLI with @profile decorator:
# kernprof -l -v my_script.py
memory_profiler — line-by-line memory usage
pip install memory_profiler
from memory_profiler import profile
@profile
def load_large_data():
    """Contrast list vs. generator memory cost (see profiler output)."""
    # Method 1: list — materializes every element in RAM at once
    data_list = [i * 1.5 for i in range(1_000_000)]  # kept only for the demo
    # Method 2: generator — yields values lazily, near-constant memory
    data_gen = (i * 1.5 for i in range(1_000_000))
    return sum(data_gen)

load_large_data()

# Run with:
# python -m memory_profiler my_script.py
# Typical output:
# Line #  Mem usage   Increment  Line Contents
# =====================================================
# 11      127.4 MiB   +77.4 MiB  data_list = [i*1.5 for i in range(1_000_000)]
# 14       93.2 MiB   -34.2 MiB  data_gen = (i*1.5 for i in range(1_000_000))
tracemalloc — track memory allocations
import tracemalloc

# Keep up to 10 stack frames per allocation so statistics can show the
# full call path to each allocation site, not just the top frame.
tracemalloc.start(10)

# Code to analyze
data = [{'id': i, 'value': i * 1.5, 'name': f'item_{i}'} for i in range(100_000)]

snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics('lineno')  # group allocations by source line
for stat in top_stats[:10]:
    print(stat)

# Compare two snapshots to detect memory leaks
tracemalloc.clear_traces()
snap1 = tracemalloc.take_snapshot()
# Suspected leaky code
cache = {}
for i in range(10_000):
    cache[i] = [0] * 100
snap2 = tracemalloc.take_snapshot()
# size_diff is in bytes; output arrow was mojibake ("β") — fixed to "→"
for stat in snap2.compare_to(snap1, 'lineno')[:5]:
    print(f"+{stat.size_diff/1024:.1f} KB → {stat}")
tracemalloc.stop()
Memory optimization with slots
import sys
class PointNormal:
    """3-D point with a regular per-instance __dict__."""
    def __init__(self, x, y, z):
        self.x = x
        self.y = y
        self.z = z


class PointSlots:
    """3-D point whose attributes live in fixed slots — no __dict__."""
    __slots__ = ('x', 'y', 'z')

    def __init__(self, x, y, z):
        self.x = x
        self.y = y
        self.z = z


p1 = PointNormal(1.0, 2.0, 3.0)
p2 = PointSlots(1.0, 2.0, 3.0)
print(f"Normal: {sys.getsizeof(p1)} bytes + {sys.getsizeof(p1.__dict__)} (dict)")
print(f"__slots__: {sys.getsizeof(p2)} bytes (no dict)")
# Typical saving: 40-50% per instance — critical with millions of objects
Garbage collection
import gc
# Python combines reference counting with a generational cycle detector:
# reference counting alone can never free objects that point at each other.
class Node:
    """Minimal linked-list node used to build a reference cycle."""
    def __init__(self, value):
        self.value = value
        self.next = None


a = Node(1)
b = Node(2)
a.next = b
b.next = a  # cycle: a → b → a
del a, b  # refcounts never hit zero — only the cycle collector reclaims these

print(f"Gen 0 objects: {gc.get_count()[0]}")
gc.collect()  # force cycle collection
print(f"After gc.collect: {gc.get_count()[0]}")

# Detect leaked objects (collectable garbage is reported and retained)
gc.set_debug(gc.DEBUG_LEAK)
gc.collect()

# Callback invoked around every collection; 'stop' fires when a pass ends
def on_gc(phase, info):
    if phase == 'stop':
        print(f"GC: collected {info['collected']} objects")

gc.callbacks.append(on_gc)
Proven optimization techniques
# 1. List comprehensions over explicit loops
squares = [x**2 for x in range(10_000)]  # runs inside C internals — fast
# The equivalent loop pays Python-level overhead on every append:
squares = []
for x in range(10_000):
    squares.append(x**2)

# 2. Local variable lookups vs attribute lookups
import math
sqrt = math.sqrt  # bind once — skips the module-dict lookup on each call
results = [sqrt(x) for x in range(10_000)]

# 3. str.join over += for string building
parts = ['Hello', ' ', 'world', '!']
result = ''.join(parts)  # a single allocation instead of repeated copies

# 4. Sets for O(1) membership testing
numbers_list = list(range(1_000_000))
numbers_set = set(range(1_000_000))
999_999 in numbers_list  # O(n) — may scan up to 1M elements
999_999 in numbers_set  # O(1) — one hash lookup

# 5. defaultdict and Counter over manual if-else
from collections import defaultdict, Counter
text = "the quick brown fox jumps over the lazy dog"
# Manual version — slow and verbose
freq = {}
for word in text.split():
    if word not in freq:
        freq[word] = 1
    else:
        freq[word] += 1
# Counter — fast and expressive
freq = Counter(text.split())
print(freq.most_common(3))

# 6. Generators over lists for large sequences
# Materializes ~8 MB:
big_list = [x**2 for x in range(1_000_000)]
# Occupies ~120 bytes:
big_gen = (x**2 for x in range(1_000_000))
total = sum(big_gen)  # stream the values without holding them all
Numba and Cython — accelerating numeric code
# Numba β JIT compilation for NumPy-heavy loops (pip install numba)
from numba import jit
import numpy as np


@jit(nopython=True, parallel=True)
def sum_squares_numba(arr):
    """Sum of squares over a 1-D float array, JIT-compiled to native code.

    NOTE(review): parallel=True has no effect without numba.prange in the
    loop — confirm whether parallel execution was actually intended here.
    """
    acc = 0.0
    for idx in range(len(arr)):
        acc += arr[idx] * arr[idx]
    return acc


arr = np.random.rand(10_000_000)
# First call compiles (slow), subsequent calls run as native C
result = sum_squares_numba(arr)

# Cython — mixed Python/C (requires compilation)
# rapido.pyx
"""
import numpy as np
cimport numpy as np
def sum_squares_cython(double[:] arr) -> double:
    cdef double total = 0.0
    cdef int i
    for i in range(len(arr)):
        total += arr[i] * arr[i]
    return total
"""
# Compile: python setup.py build_ext --inplace
Best practices
- Always measure before optimizing — use `cProfile` for CPU bottlenecks and `tracemalloc` for memory issues. Never guess.
- Generators over lists for large sequences: `(x for x in data)` vs `[x for x in data]` can be the difference between 1 MB and 100 MB of RAM.
- `__slots__` in classes with millions of instances — eliminates the per-object `__dict__` (40-50% memory savings).
- Sets for frequent membership tests — `in set` is O(1) vs O(n) in lists; the crossover is usually around 5-10 elements.
- `collections.Counter` and `defaultdict` are faster and more readable than equivalent manual patterns.
- Numba `@jit(nopython=True)` for numeric loops — can achieve 100× speedup without rewriting in C, just adding a decorator.