NumPy: Arrays and Scientific Computing in Python

NumPy is the foundational library for scientific computing in Python. It provides high-performance multidimensional arrays and vectorized mathematical functions that run 10–100× faster than pure Python lists.

Installation and basics

pip install numpy

import numpy as np

# Create arrays from lists
arr_1d = np.array([1, 2, 3, 4, 5])
arr_2d = np.array([[1, 2, 3], [4, 5, 6]])

print(arr_1d.shape)   # (5,)
print(arr_2d.shape)   # (2, 3)
print(arr_2d.ndim)    # 2
print(arr_2d.size)    # 6
print(arr_2d.dtype)   # int64

Creating special arrays

# Zeros, ones, and identity
zeros = np.zeros((3, 4))           # 3×4 float64 zeros
ones  = np.ones((2, 3), dtype=int) # 2×3 integer ones
eye   = np.eye(3)                  # 3×3 identity matrix
diag  = np.diag([1, 2, 3])         # diagonal matrix

# Ranges
arange   = np.arange(0, 10, 2)     # [0, 2, 4, 6, 8]
linspace = np.linspace(0, 1, 5)    # [0.0, 0.25, 0.5, 0.75, 1.0]
logspace = np.logspace(0, 3, 4)    # [1, 10, 100, 1000]

# Modern reproducible random numbers
rng      = np.random.default_rng(seed=42)
rand_f   = rng.random((3, 3))          # floats [0, 1)
rand_i   = rng.integers(0, 100, (5,))  # integers [0, 100)
normal   = rng.standard_normal((100,)) # normal distribution

Data types (dtypes)

float32_arr = np.array([1.5, 2.5], dtype=np.float32)  # 4 bytes/element
int16_arr   = np.array([100, 200], dtype=np.int16)     # 2 bytes/element
bool_arr    = np.array([0, 1, 0],  dtype=bool)

# Common dtypes
# np.float64  → double precision (default)
# np.float32  → single precision (ML workloads)
# np.int64    → 64-bit integer (default)
# np.uint8    → unsigned 8-bit (images)
# np.complex128 → 128-bit complex

# Convert dtype
converted = float32_arr.astype(np.float64)

Indexing and slicing

arr = np.arange(20).reshape(4, 5)

# Basic indexing
print(arr[1, 3])      # 8
print(arr[2])         # row 2: [10, 11, 12, 13, 14]

# Slicing
print(arr[1:3, 2:4])  # sub-matrix rows 1-2, cols 2-3
print(arr[:, 0])       # entire first column
print(arr[::2])        # every other row

# Boolean indexing
mask = arr > 10
print(arr[mask])       # all elements > 10

# Fancy indexing
rows = np.array([0, 2])
cols = np.array([1, 3])
print(arr[rows, cols]) # [arr[0,1], arr[2,3]] = [1, 13]

# Slicing returns a VIEW — modifying it changes the original
view = arr[1:3]
view[:] = 0            # also modifies arr!

# To copy explicitly
copy = arr[1:3].copy()

Reshaping and shape manipulation

arr = np.arange(24)

mat_4x6  = arr.reshape(4, 6)
tensor   = arr.reshape(2, 3, 4)   # 3D
flat     = mat_4x6.flatten()      # copy, always 1D
ravel    = mat_4x6.ravel()        # view when possible

# -1 infers the dimension automatically
auto = arr.reshape(-1, 6)         # (4, 6)

# Transpose
T         = mat_4x6.T             # (6, 4)
perm      = np.transpose(tensor, (2, 0, 1))

# Add/remove dimensions
col       = arr[:5].reshape(-1, 1)            # (5,) → (5, 1)
expanded  = np.expand_dims(arr[:5], axis=0)   # (5,) → (1, 5)
squeezed  = np.squeeze(expanded)              # (1, 5) → (5,)

# Stack and concatenate
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
np.hstack([a, b])               # [1, 2, 3, 4, 5, 6]
np.vstack([a, b])               # [[1,2,3],[4,5,6]]
np.stack([a, b], axis=0)        # [[1,2,3],[4,5,6]]

Broadcasting

Broadcasting lets you perform operations between arrays of different shapes.

# Rules:
# 1. If arrays have different ndim, pad with 1s on the left
# 2. Sizes must match or one of them must be 1

mat = np.ones((3, 4))
result = mat + 10                   # scalar broadcasts to (3,4)

row = np.array([1, 2, 3, 4])       # shape (4,)
col = np.array([[10], [20], [30]])  # shape (3, 1)
result = row + col
# [[11, 12, 13, 14],
#  [21, 22, 23, 24],
#  [31, 32, 33, 34]]

# Practical: normalize each row of a matrix
matrix   = rng.random((100, 5))
row_mean = matrix.mean(axis=1, keepdims=True)  # (100, 1)
row_std  = matrix.std(axis=1, keepdims=True)   # (100, 1)
normed   = (matrix - row_mean) / row_std       # (100, 5) — broadcasting

Vectorized math operations

a = np.array([1.0, 4.0, 9.0, 16.0])

# Universal functions (ufuncs) — element-wise
np.sqrt(a)                         # [1., 2., 3., 4.]
np.exp(a)
np.log(a)
np.sin(a)
np.abs(np.array([-1, -2, 3]))      # [1, 2, 3]

# Array operations
b = np.array([2.0, 3.0, 4.0, 5.0])
np.dot(a, b)                       # dot product = sum(a*b)

# Reductions
a.sum()
a.mean()
a.std()
a.min(), a.max()
a.argmin(), a.argmax()             # indices of min/max

# Axis-wise reductions
mat = np.array([[1,2,3],[4,5,6]])
mat.sum(axis=0)                    # [5, 7, 9] — column sums
mat.sum(axis=1)                    # [6, 15]   — row sums
mat.cumsum()                       # cumulative sum, flattened

# Element-wise utilities
np.maximum(a, b)                   # element-wise max
np.clip(a, 2.0, 10.0)             # clamp to [2, 10]

Linear algebra

from numpy.linalg import det, inv, solve, eig, svd, norm, lstsq

A = np.array([[3, 1], [1, 2]], dtype=float)
b = np.array([9, 8], dtype=float)

det(A)                             # 5.0
A_inv = inv(A)
x = solve(A, b)                    # solve Ax = b → [2., 3.]
print(np.allclose(A @ x, b))      # True

# Matrix product
C = rng.random((3, 3))
D = rng.random((3, 3))
product = C @ D                    # preferred over np.matmul

# Eigenvalues / eigenvectors
values, vectors = eig(A)

# Singular Value Decomposition
M       = rng.random((4, 3))
U, S, Vt = svd(M, full_matrices=False)
# M ≈ U @ np.diag(S) @ Vt

# Norms
norm(x)                            # L2 (Euclidean)
norm(A, ord='fro')                 # Frobenius norm

# Least squares (linear regression)
X      = np.column_stack([np.ones(10), np.arange(10)])
y      = 2 * np.arange(10) + 1 + rng.standard_normal(10)
coeffs, _, _, _ = lstsq(X, y, rcond=None)

Fast Fourier Transform (FFT)

t      = np.linspace(0, 1, 1000, endpoint=False)  # 1 s, 1000 samples
signal = np.sin(2 * np.pi * 50 * t) + 0.5 * np.sin(2 * np.pi * 120 * t)

# FFT
fft_vals  = np.fft.fft(signal)
freqs     = np.fft.fftfreq(len(signal), d=1/1000)  # Hz
magnitude = np.abs(fft_vals)

# Keep only positive frequencies
pos  = freqs > 0
top5 = freqs[pos][np.argsort(magnitude[pos])[-5:]]
print(top5)  # approx. [50., 120.]

# 2D FFT (image processing)
from numpy.fft import fft2, fftshift
image    = rng.random((256, 256))
spectrum = fftshift(fft2(image))   # centered spectrum

Performance and memory

import time

n       = 10_000_000
py_list = list(range(n))
np_arr  = np.arange(n, dtype=np.float64)

t0 = time.perf_counter()
_ = sum(x**2 for x in py_list)
t1 = time.perf_counter()
_ = (np_arr**2).sum()
t2 = time.perf_counter()

print(f"Pure Python: {t1-t0:.3f}s")
print(f"NumPy:       {t2-t1:.4f}s")  # typically 50–100× faster

# Memory-efficient dtypes
# float64: 8 bytes/element (default)
# float32: 4 bytes/element (good for ML)
# int16:   2 bytes/element (values ≤ 32,767)

# Read-only arrays
arr = np.arange(10)
arr.flags.writeable = False

# Saving and loading
np.save('array.npy', np_arr)
loaded = np.load('array.npy')

np.savez('arrays.npz', x=np_arr, y=np_arr[:100])
data = np.load('arrays.npz')
print(data['x'].shape, data['y'].shape)

Practical example: signal analysis pipeline

import numpy as np

def analyze_signal(raw_data: np.ndarray, fs: float) -> dict:
    """
    Analyze a signal: remove DC offset, apply Hann window, compute spectrum.

    Args:
        raw_data: 1D array of samples
        fs: sampling rate in Hz
    Returns:
        dict with statistics and dominant frequencies
    """
    # 1. Remove DC component (mean)
    data = raw_data - raw_data.mean()

    # 2. Apply Hann window to reduce spectral leakage
    window = np.hanning(len(data))
    windowed = data * window

    # 3. FFT — rfft only computes positive frequencies
    N         = len(data)
    fft_vals  = np.fft.rfft(windowed)
    magnitudes = np.abs(fft_vals) * 2 / N
    freqs     = np.fft.rfftfreq(N, d=1/fs)

    # 4. Top-5 dominant frequencies
    top5_idx = np.argsort(magnitudes)[-5:][::-1]

    return {
        'mean':              raw_data.mean(),
        'std':               raw_data.std(),
        'rms':               np.sqrt((data**2).mean()),
        'peak_positive':     data.max(),
        'peak_negative':     data.min(),
        'dominant_freqs_hz': freqs[top5_idx].tolist(),
        'dominant_amplitudes': magnitudes[top5_idx].tolist(),
    }

# Usage
rng  = np.random.default_rng(42)
fs   = 1000.0   # 1 kHz
t    = np.arange(0, 5, 1/fs)   # 5 seconds

signal = (
    3.0 * np.sin(2 * np.pi * 60  * t) +
    1.5 * np.sin(2 * np.pi * 180 * t) +
    0.3 * rng.standard_normal(len(t))
)

result = analyze_signal(signal, fs)
for k, v in result.items():
    print(f"{k}: {v}")

Best practices

Avoid Python loops over NumPy arrays — use vectorized operations and ufuncs. For unavoidable loops, consider numba.jit for a free speed boost.
Understand views vs copies — slices return views; mutating them modifies the original. Use .copy() when you need isolation.
Use np.random.default_rng(seed) instead of the legacy np.random.seed() — thread-safe and reproducible across NumPy versions.
Specify dtype explicitly when memory matters: float32 for neural network weights, uint8 for 8-bit image data.
Prefer @ over np.dot for matrix multiplication — more readable and consistent with PyTorch/TensorFlow.
Profile before optimizing — use %timeit in Jupyter or time.perf_counter() to locate real bottlenecks.

Related conversions

Frequent conversions across the catalogue:

NumPy: Arrays and Scientific Computing in Python

NumPy: Arrays and Scientific Computing in Python

Installation and basics

Creating special arrays

Data types (dtypes)

Indexing and slicing

Reshaping and shape manipulation

Broadcasting

Vectorized math operations

Linear algebra

Fast Fourier Transform (FFT)

Performance and memory

Practical example: signal analysis pipeline

Best practices

Related conversions

Frequently Asked Questions