Performance Profiling and Memory Optimization in Python
Measure before you optimize. Python provides both built-in and third-party tools to pinpoint exactly where time and memory are spent — preventing premature optimization that complicates code without real benefit.
The golden rule
"Premature optimization is the root of all evil." — Donald Knuth
The correct flow:
- Measure with a profiler → identify the real bottleneck
- Optimize only what matters (typically 10% of code uses 90% of time)
- Verify that the optimization actually improved performance
timeit — measure small code snippets
import timeit

# timeit repeats a snippet many times and reports the total wall-clock
# time, averaging out scheduler and cache noise.

# Command-line interface
# python -m timeit "[i**2 for i in range(1000)]"

# Programmatic API
result = timeit.timeit(
    stmt='[i**2 for i in range(1000)]',
    number=10_000,
)
# Fixed mojibake in the output string: "Β΅s" -> "µs"
print(f"10k runs: {result:.3f}s ({result/10_000*1e6:.2f} µs each)")

# Compare two implementations of the same computation
list_comp = timeit.timeit('[i**2 for i in range(1000)]', number=10_000)
generator = timeit.timeit('list(i**2 for i in range(1000))', number=10_000)
print(f"List comprehension: {list_comp:.3f}s")
print(f"Generator: {generator:.3f}s")

# In Jupyter/IPython:
# %timeit [i**2 for i in range(1000)]
cProfile — CPU profiling
import cProfile
import pstats
import io
def slow_function():
    """Deliberately heavy workload used as a profiling target.

    Accumulates the same inner sum 100,000 times; the nested generator
    expression makes the per-call cost stand out in profiler output.
    """
    total = 0
    for _ in range(100_000):
        total += sum(j ** 2 for j in range(100))
    return total
# Method 1: explicit Profile object — enable/disable around the code of interest
prof = cProfile.Profile()
prof.enable()
slow_function()
prof.disable()

# Capture the report into a string instead of writing straight to stdout
buf = io.StringIO()
pstats.Stats(prof, stream=buf).sort_stats('cumulative').print_stats(20)
print(buf.getvalue())

# Method 2: Profile as a context manager (Python 3.8+)
with cProfile.Profile() as pr:
    slow_function()
pstats.Stats(pr).sort_stats('cumulative').print_stats(10)

# Method 3: command line (most convenient)
# python -m cProfile -s cumulative my_script.py | head -30
line_profiler — line-by-line CPU profiling
pip install line_profiler
from line_profiler import LineProfiler
def compute_stats(data: list) -> dict:
    """Return mean, median, and population variance of *data*.

    Kept deliberately line-by-line simple so each statement shows up
    cleanly in line_profiler output. Notes: raises ZeroDivisionError on
    an empty list; for even-length input the *upper* middle element is
    returned as the median.
    """
    n = len(data)
    mean = sum(data) / n
    median = sorted(data)[n // 2]
    variance = sum((value - mean) ** 2 for value in data) / n
    return {'mean': mean, 'median': median, 'variance': variance}
# Register the target function, then bracket the call with enable/disable
lp = LineProfiler()
lp.add_function(compute_stats)
lp.enable()
compute_stats(list(range(100_000)))
lp.disable()
lp.print_stats()

# Or use the CLI with @profile decorator:
# kernprof -l -v my_script.py
memory_profiler — line-by-line memory usage
pip install memory_profiler
from memory_profiler import profile
@profile
def load_large_data():
    """Contrast list vs. generator memory cost (see profiler output)."""
    # Method 1: list — materializes every element in RAM at once
    data_list = [i * 1.5 for i in range(1_000_000)]  # kept only for the demo
    # Method 2: generator — yields values lazily, near-constant memory
    data_gen = (i * 1.5 for i in range(1_000_000))
    return sum(data_gen)

load_large_data()

# Run with:
# python -m memory_profiler my_script.py
# Typical output:
# Line #  Mem usage   Increment  Line Contents
# =====================================================
# 11      127.4 MiB   +77.4 MiB  data_list = [i*1.5 for i in range(1_000_000)]
# 14       93.2 MiB   -34.2 MiB  data_gen = (i*1.5 for i in range(1_000_000))
tracemalloc — track memory allocations
import tracemalloc

# Keep up to 10 stack frames per allocation so statistics can show the
# full call path to each allocation site, not just the top frame.
tracemalloc.start(10)

# Code to analyze
data = [{'id': i, 'value': i * 1.5, 'name': f'item_{i}'} for i in range(100_000)]

snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics('lineno')  # group allocations by source line
for stat in top_stats[:10]:
    print(stat)

# Compare two snapshots to detect memory leaks
tracemalloc.clear_traces()
snap1 = tracemalloc.take_snapshot()
# Suspected leaky code
cache = {}
for i in range(10_000):
    cache[i] = [0] * 100
snap2 = tracemalloc.take_snapshot()
# size_diff is in bytes; output arrow was mojibake ("β") — fixed to "→"
for stat in snap2.compare_to(snap1, 'lineno')[:5]:
    print(f"+{stat.size_diff/1024:.1f} KB → {stat}")
tracemalloc.stop()
Memory optimization with slots
import sys
class PointNormal:
    """3-D point with a regular per-instance __dict__."""
    def __init__(self, x, y, z):
        self.x = x
        self.y = y
        self.z = z


class PointSlots:
    """3-D point whose attributes live in fixed slots — no __dict__."""
    __slots__ = ('x', 'y', 'z')

    def __init__(self, x, y, z):
        self.x = x
        self.y = y
        self.z = z


p1 = PointNormal(1.0, 2.0, 3.0)
p2 = PointSlots(1.0, 2.0, 3.0)
print(f"Normal: {sys.getsizeof(p1)} bytes + {sys.getsizeof(p1.__dict__)} (dict)")
print(f"__slots__: {sys.getsizeof(p2)} bytes (no dict)")
# Typical saving: 40-50% per instance — critical with millions of objects
Garbage collection
import gc
# Python combines reference counting with a generational cycle detector:
# reference counting alone can never free objects that point at each other.
class Node:
    """Minimal linked-list node used to build a reference cycle."""
    def __init__(self, value):
        self.value = value
        self.next = None


a = Node(1)
b = Node(2)
a.next = b
b.next = a  # cycle: a → b → a
del a, b  # refcounts never hit zero — only the cycle collector reclaims these

print(f"Gen 0 objects: {gc.get_count()[0]}")
gc.collect()  # force cycle collection
print(f"After gc.collect: {gc.get_count()[0]}")

# Detect leaked objects (collectable garbage is reported and retained)
gc.set_debug(gc.DEBUG_LEAK)
gc.collect()

# Callback invoked around every collection; 'stop' fires when a pass ends
def on_gc(phase, info):
    if phase == 'stop':
        print(f"GC: collected {info['collected']} objects")

gc.callbacks.append(on_gc)
Proven optimization techniques
# 1. List comprehensions over explicit loops
squares = [x**2 for x in range(10_000)]  # runs inside C internals — fast
# The equivalent loop pays Python-level overhead on every append:
squares = []
for x in range(10_000):
    squares.append(x**2)

# 2. Local variable lookups vs attribute lookups
import math
sqrt = math.sqrt  # bind once — skips the module-dict lookup on each call
results = [sqrt(x) for x in range(10_000)]

# 3. str.join over += for string building
parts = ['Hello', ' ', 'world', '!']
result = ''.join(parts)  # a single allocation instead of repeated copies

# 4. Sets for O(1) membership testing
numbers_list = list(range(1_000_000))
numbers_set = set(range(1_000_000))
999_999 in numbers_list  # O(n) — may scan up to 1M elements
999_999 in numbers_set  # O(1) — one hash lookup

# 5. defaultdict and Counter over manual if-else
from collections import defaultdict, Counter
text = "the quick brown fox jumps over the lazy dog"
# Manual version — slow and verbose
freq = {}
for word in text.split():
    if word not in freq:
        freq[word] = 1
    else:
        freq[word] += 1
# Counter — fast and expressive
freq = Counter(text.split())
print(freq.most_common(3))

# 6. Generators over lists for large sequences
# Materializes ~8 MB:
big_list = [x**2 for x in range(1_000_000)]
# Occupies ~120 bytes:
big_gen = (x**2 for x in range(1_000_000))
total = sum(big_gen)  # stream the values without holding them all
Numba and Cython — accelerating numeric code
# Numba β JIT compilation for NumPy-heavy loops (pip install numba)
from numba import jit
import numpy as np


@jit(nopython=True, parallel=True)
def sum_squares_numba(arr):
    """Sum of squares over a 1-D float array, JIT-compiled to native code.

    NOTE(review): parallel=True has no effect without numba.prange in the
    loop — confirm whether parallel execution was actually intended here.
    """
    acc = 0.0
    for idx in range(len(arr)):
        acc += arr[idx] * arr[idx]
    return acc


arr = np.random.rand(10_000_000)
# First call compiles (slow), subsequent calls run as native C
result = sum_squares_numba(arr)

# Cython — mixed Python/C (requires compilation)
# rapido.pyx
"""
import numpy as np
cimport numpy as np
def sum_squares_cython(double[:] arr) -> double:
    cdef double total = 0.0
    cdef int i
    for i in range(len(arr)):
        total += arr[i] * arr[i]
    return total
"""
# Compile: python setup.py build_ext --inplace
Best practices
- Always measure before optimizing — use `cProfile` for CPU bottlenecks and `tracemalloc` for memory issues. Never guess.
- Generators over lists for large sequences: `(x for x in data)` vs `[x for x in data]` can be the difference between 1 MB and 100 MB of RAM.
- `__slots__` in classes with millions of instances — eliminates the per-object `__dict__` (40-50% memory savings).
- Sets for frequent membership tests — `in set` is O(1) vs O(n) in lists; the crossover is usually around 5-10 elements.
- `collections.Counter` and `defaultdict` are faster and more readable than equivalent manual patterns.
- Numba `@jit(nopython=True)` for numeric loops — can achieve 100× speedup without rewriting in C, just adding a decorator.